It's not necessary to do rounding for alloca operations when the requested
alignment is equal to the stack alignment.


git-svn-id: https://llvm.org/svn/llvm-project/llvm/trunk@40004 91177308-0d34-0410-b5e6-96231b3b80d8
diff --git a/lib/Transforms/Scalar/ADCE.cpp b/lib/Transforms/Scalar/ADCE.cpp
new file mode 100644
index 0000000..4968fc9
--- /dev/null
+++ b/lib/Transforms/Scalar/ADCE.cpp
@@ -0,0 +1,497 @@
+//===- ADCE.cpp - Code to perform aggressive dead code elimination --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements "aggressive" dead code elimination.  ADCE is DCE where
+// values are assumed to be dead until proven otherwise.  This is similar to
+// SCCP, except applied to the liveness of values.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "adce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumBlockRemoved, "Number of basic blocks removed");
+STATISTIC(NumInstRemoved , "Number of instructions removed");
+STATISTIC(NumCallRemoved , "Number of calls and invokes removed");
+
+namespace {
+//===----------------------------------------------------------------------===//
+// ADCE Class
+//
+// This class does all of the work of Aggressive Dead Code Elimination.
+// Its public interface consists of a constructor and a doADCE() method.
+//
+class VISIBILITY_HIDDEN ADCE : public FunctionPass {
+  Function *Func;                       // The function that we are working on
+  std::vector<Instruction*> WorkList;   // Instructions that just became live
+  std::set<Instruction*>    LiveSet;    // The set of live instructions
+
+  //===--------------------------------------------------------------------===//
+  // The public interface for this class
+  //
+public:
+  static char ID; // Pass identification, replacement for typeid
+  ADCE() : FunctionPass((intptr_t)&ID) {}
+
+  // Execute the Aggressive Dead Code Elimination Algorithm
+  //
+  virtual bool runOnFunction(Function &F) {
+    Func = &F;
+    bool Changed = doADCE();
+    assert(WorkList.empty());
+    LiveSet.clear();
+    return Changed;
+  }
+  // getAnalysisUsage - We require post dominance frontiers (aka Control
+  // Dependence Graph)
+  virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+    // We require that all function nodes are unified, because otherwise code
+    // can be marked live that wouldn't necessarily be otherwise.
+    AU.addRequired<UnifyFunctionExitNodes>();
+    AU.addRequired<AliasAnalysis>();
+    AU.addRequired<PostDominatorTree>();
+    AU.addRequired<PostDominanceFrontier>();
+  }
+
+
+  //===--------------------------------------------------------------------===//
+  // The implementation of this class
+  //
+private:
+  // doADCE() - Run the Aggressive Dead Code Elimination algorithm, returning
+  // true if the function was modified.
+  //
+  bool doADCE();
+
+  // markBlockAlive - Mark BB and the branches it is control dependent on live.
+  void markBlockAlive(BasicBlock *BB);
+
+
+  // deleteDeadInstructionsInLiveBlock - Loop over all of the instructions in
+  // the specified basic block, deleting ones that are dead according to
+  // LiveSet.
+  bool deleteDeadInstructionsInLiveBlock(BasicBlock *BB);
+
+  TerminatorInst *convertToUnconditionalBranch(TerminatorInst *TI);
+
+  inline void markInstructionLive(Instruction *I) {
+    if (!LiveSet.insert(I).second) return;  // Already known to be live.
+    DOUT << "Insn Live: " << *I;
+    WorkList.push_back(I);
+  }
+
+  inline void markTerminatorLive(const BasicBlock *BB) {
+    DOUT << "Terminator Live: " << *BB->getTerminator();
+    markInstructionLive(const_cast<TerminatorInst*>(BB->getTerminator()));
+  }
+};
+
+  char ADCE::ID = 0;
+  RegisterPass<ADCE> X("adce", "Aggressive Dead Code Elimination");
+} // End of anonymous namespace
+
+FunctionPass *llvm::createAggressiveDCEPass() { return new ADCE(); } // Factory
+
+void ADCE::markBlockAlive(BasicBlock *BB) {
+  // Mark the basic block as being newly ALIVE... and mark all branches that
+  // this block is control dependent on as being alive also...
+  //
+  PostDominanceFrontier &CDG = getAnalysis<PostDominanceFrontier>();
+
+  PostDominanceFrontier::const_iterator It = CDG.find(BB);
+  if (It != CDG.end()) {
+    // Get the blocks that this node is control dependent on...
+    const PostDominanceFrontier::DomSetType &CDB = It->second;
+    for (PostDominanceFrontier::DomSetType::const_iterator I =
+           CDB.begin(), E = CDB.end(); I != E; ++I)
+      markTerminatorLive(*I);   // Mark all their terminators as live
+  }
+
+  // If this basic block is live, and it ends in an unconditional branch, then
+  // the branch is alive as well...
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+    if (BI->isUnconditional())
+      markTerminatorLive(BB);  // The branch is the block's only exit.
+}
+
+// deleteDeadInstructionsInLiveBlock - Loop over all of the non-terminator
+// instructions in the basic block, deleting ones dead according to LiveSet.
+bool ADCE::deleteDeadInstructionsInLiveBlock(BasicBlock *BB) {
+  bool Changed = false;
+  for (BasicBlock::iterator II = BB->begin(), E = --BB->end(); II != E; ) {
+    Instruction *I = II++;              // Note: E excludes the terminator.
+    if (!LiveSet.count(I)) {              // Is this instruction alive?
+      if (!I->use_empty())
+        I->replaceAllUsesWith(UndefValue::get(I->getType()));
+
+      // Nope... remove the instruction from its basic block...
+      if (isa<CallInst>(I))
+        ++NumCallRemoved;
+      else
+        ++NumInstRemoved;
+      BB->getInstList().erase(I);
+      Changed = true;
+    }
+  }
+  return Changed;
+}
+
+
+/// convertToUnconditionalBranch - Transform this conditional terminator
+/// instruction into an unconditional branch because we don't care which of the
+/// successors it goes to.  This eliminates a use of the condition as well.
+///
+TerminatorInst *ADCE::convertToUnconditionalBranch(TerminatorInst *TI) {
+  BranchInst *NB = new BranchInst(TI->getSuccessor(0), TI);  // Insert before TI
+  BasicBlock *BB = TI->getParent();
+
+  // Remove entries from PHI nodes to avoid confusing ourself later...
+  for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)  // Succ 0 kept
+    TI->getSuccessor(i)->removePredecessor(BB);
+
+  // Delete the old branch itself...
+  BB->getInstList().erase(TI);
+  return NB;
+}
+
+
+// doADCE() - Run the Aggressive Dead Code Elimination algorithm, returning
+// true if the function was modified.
+//
+bool ADCE::doADCE() {
+  bool MadeChanges = false;
+
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+
+
+  // Iterate over all invokes in the function, turning invokes into calls if
+  // they cannot throw.
+  for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; ++BB)
+    if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator()))
+      if (Function *F = II->getCalledFunction())
+        if (AA.onlyReadsMemory(F)) {
+          // The function cannot unwind.  Convert it to a call with a branch
+          // after it to the normal destination.
+          SmallVector<Value*, 8> Args(II->op_begin()+3, II->op_end());
+          CallInst *NewCall = new CallInst(F, &Args[0], Args.size(), "", II);
+          NewCall->takeName(II);
+          NewCall->setCallingConv(II->getCallingConv());
+          II->replaceAllUsesWith(NewCall);
+          new BranchInst(II->getNormalDest(), II);
+
+          // Update PHI nodes in the unwind destination
+          II->getUnwindDest()->removePredecessor(BB);
+          BB->getInstList().erase(II);
+          MadeChanges = true;  // The IR was mutated; record the change.
+          if (NewCall->use_empty()) {
+            BB->getInstList().erase(NewCall);
+            ++NumCallRemoved;
+          }
+        }
+
+  // Iterate over all of the instructions in the function, eliminating trivially
+  // dead instructions, and marking instructions live that are known to be
+  // needed.  Perform the walk in depth first order so that we avoid marking any
+  // instructions live in basic blocks that are unreachable.  These blocks will
+  // be eliminated later, along with the instructions inside.
+  //
+  std::set<BasicBlock*> ReachableBBs;
+  for (df_ext_iterator<BasicBlock*>
+         BBI = df_ext_begin(&Func->front(), ReachableBBs),
+         BBE = df_ext_end(&Func->front(), ReachableBBs); BBI != BBE; ++BBI) {
+    BasicBlock *BB = *BBI;
+    for (BasicBlock::iterator II = BB->begin(), EI = BB->end(); II != EI; ) {
+      Instruction *I = II++;
+      if (CallInst *CI = dyn_cast<CallInst>(I)) {
+        Function *F = CI->getCalledFunction();
+        if (F && AA.onlyReadsMemory(F)) {
+          if (CI->use_empty()) {
+            BB->getInstList().erase(CI);
+            ++NumCallRemoved; MadeChanges = true;
+          }
+        } else {
+          markInstructionLive(I);
+        }
+      } else if (I->mayWriteToMemory() || isa<ReturnInst>(I) ||
+                 isa<UnwindInst>(I) || isa<UnreachableInst>(I)) {
+        // FIXME: Unreachable instructions should not be marked intrinsically
+        // live here.
+        markInstructionLive(I);
+      } else if (isInstructionTriviallyDead(I)) {
+        // Remove the instruction from its basic block...
+        BB->getInstList().erase(I);
+        ++NumInstRemoved; MadeChanges = true;
+      }
+    }
+  }
+
+  // Check to ensure we have an exit node for this CFG.  If we don't, we won't
+  // have any post-dominance information, thus we cannot perform our
+  // transformations safely.
+  //
+  PostDominatorTree &DT = getAnalysis<PostDominatorTree>();
+  if (DT[&Func->getEntryBlock()] == 0) {
+    WorkList.clear();
+    return MadeChanges;
+  }
+
+  // Scan the function marking blocks without post-dominance information as
+  // live.  Blocks without post-dominance information occur when there is an
+  // infinite loop in the program.  Because the infinite loop could contain a
+  // function which unwinds, exits or has side-effects, we don't want to delete
+  // the infinite loop or those blocks leading up to it.
+  for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I)
+    if (DT[I] == 0 && ReachableBBs.count(I))
+      for (pred_iterator PI = pred_begin(I), E = pred_end(I); PI != E; ++PI)
+        markInstructionLive((*PI)->getTerminator());
+
+  DOUT << "Processing work list\n";
+
+  // AliveBlocks - Set of basic blocks that we know have instructions that are
+  // alive in them...
+  //
+  std::set<BasicBlock*> AliveBlocks;
+
+  // Process the work list of instructions that just became live... if they
+  // became live, then that means that all of their operands are necessary as
+  // well... make them live as well.
+  //
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back(); // Get an instruction that became live...
+    WorkList.pop_back();
+
+    BasicBlock *BB = I->getParent();
+    if (!ReachableBBs.count(BB)) continue;
+    if (AliveBlocks.insert(BB).second)     // Basic block not alive yet.
+      markBlockAlive(BB);             // Make it so now!
+
+    // PHI nodes are a special case, because the incoming values are actually
+    // defined in the predecessor nodes of this block, meaning that the PHI
+    // makes the predecessors alive.
+    //
+    if (PHINode *PN = dyn_cast<PHINode>(I)) {
+      for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+        // If the incoming edge is clearly dead, it won't have control
+        // dependence information.  Do not mark it live.
+        BasicBlock *PredBB = PN->getIncomingBlock(i);
+        if (ReachableBBs.count(PredBB)) {
+          // FIXME: This should mark the control dependent edge as live, not
+          // necessarily the predecessor itself!
+          if (AliveBlocks.insert(PredBB).second)
+            markBlockAlive(PN->getIncomingBlock(i));   // Block is newly ALIVE!
+          if (Instruction *Op = dyn_cast<Instruction>(PN->getIncomingValue(i)))
+            markInstructionLive(Op);
+        }
+      }
+    } else {
+      // Loop over all of the operands of the live instruction, making sure that
+      // they are known to be alive as well.
+      //
+      for (unsigned op = 0, End = I->getNumOperands(); op != End; ++op)
+        if (Instruction *Operand = dyn_cast<Instruction>(I->getOperand(op)))
+          markInstructionLive(Operand);
+    }
+  }
+
+  DEBUG(
+    DOUT << "Current Function: X = Live\n";
+    for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I){
+      DOUT << I->getName() << ":\t"
+           << (AliveBlocks.count(I) ? "LIVE\n" : "DEAD\n");
+      for (BasicBlock::iterator BI = I->begin(), BE = I->end(); BI != BE; ++BI){
+        if (LiveSet.count(BI)) DOUT << "X ";
+        DOUT << *BI;
+      }
+    });
+
+  // All blocks being live is a common case, handle it specially.
+  if (AliveBlocks.size() == Func->size()) {  // No dead blocks?
+    for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I) {
+      // Loop over all of the instructions in the function deleting instructions
+      // to drop their references.
+      MadeChanges |= deleteDeadInstructionsInLiveBlock(I);
+
+      // Check to make sure the terminator instruction is live.  If it isn't,
+      // this means that the condition that it branches on (we know it is not an
+      // unconditional branch), is not needed to make the decision of where to
+      // go to, because all outgoing edges go to the same place.  We must remove
+      // the use of the condition (because it's probably dead), so we convert
+      // the terminator to an unconditional branch.
+      //
+      TerminatorInst *TI = I->getTerminator();
+      if (!LiveSet.count(TI))
+        { convertToUnconditionalBranch(TI); MadeChanges = true; }
+    }
+
+    return MadeChanges;
+  }
+
+
+  // If the entry node is dead, insert a new entry node to eliminate the entry
+  // node as a special case.
+  //
+  if (!AliveBlocks.count(&Func->front())) {
+    BasicBlock *NewEntry = new BasicBlock();
+    new BranchInst(&Func->front(), NewEntry);
+    Func->getBasicBlockList().push_front(NewEntry);
+    AliveBlocks.insert(NewEntry);    // This block is always alive!
+    LiveSet.insert(NewEntry->getTerminator());  // The branch is live
+  }
+
+  // Loop over all of the alive blocks in the function.  If any successor
+  // blocks are not alive, we adjust the outgoing branches to branch to the
+  // first live postdominator of the live block, adjusting any PHI nodes in
+  // the block to reflect this.
+  //
+  for (Function::iterator I = Func->begin(), E = Func->end(); I != E; ++I)
+    if (AliveBlocks.count(I)) {
+      BasicBlock *BB = I;
+      TerminatorInst *TI = BB->getTerminator();
+
+      // If the terminator instruction is not alive, but the block it is in
+      // IS alive, this means that this terminator is a conditional branch on
+      // a condition that doesn't matter.  Make it an unconditional branch to
+      // ONE of the successors.  This has the side effect of dropping a use of
+      // the conditional value, which may also be dead.
+      if (!LiveSet.count(TI))
+        TI = convertToUnconditionalBranch(TI);
+
+      // Loop over all of the successors, looking for ones that are not alive.
+      // We cannot save the number of successors in the terminator instruction
+      // here because we may remove them if we don't have a postdominator.
+      //
+      for (unsigned i = 0; i != TI->getNumSuccessors(); ++i)
+        if (!AliveBlocks.count(TI->getSuccessor(i))) {
+          // Scan up the postdominator tree, looking for the first
+          // postdominator that is alive, and the last postdominator that is
+          // dead...
+          //
+          DomTreeNode *LastNode = DT[TI->getSuccessor(i)];
+          DomTreeNode *NextNode = 0;
+
+          if (LastNode) {
+            // Guard against a null idom before dereferencing: the dead
+            // successor's node may have no immediate postdominator at all.
+            NextNode = LastNode->getIDom();
+            while (NextNode && !AliveBlocks.count(NextNode->getBlock())) {
+              LastNode = NextNode;
+              NextNode = NextNode->getIDom();
+            }
+            if (NextNode == 0)
+              LastNode = 0;
+          }
+
+          // There is a special case here... if there IS no post-dominator for
+          // the block we have nowhere to point our branch to.  Instead, convert
+          // it to a return.  This can only happen if the code branched into an
+          // infinite loop.  Note that this may not be desirable, because we
+          // _are_ altering the behavior of the code.  This is a well known
+          // drawback of ADCE, so in the future if we choose to revisit the
+          // decision, this is where it should be.
+          //
+          if (LastNode == 0) {        // No postdominator!
+            if (!isa<InvokeInst>(TI)) {
+              // Call RemoveSuccessor to transmogrify the terminator instruction
+              // to not contain the outgoing branch, or to create a new
+              // terminator if the form fundamentally changes (i.e.,
+              // unconditional branch to return).  Note that this will change a
+              // branch into an infinite loop into a return instruction!
+              //
+              RemoveSuccessor(TI, i);
+
+              // RemoveSuccessor may replace TI... make sure we have a fresh
+              // pointer.
+              //
+              TI = BB->getTerminator();
+
+              // Rescan this successor...
+              --i;
+            } else {
+              // NOTE(review): invoke terminators are deliberately left alone.
+            }
+          } else {
+            // Get the basic blocks that we need...
+            BasicBlock *LastDead = LastNode->getBlock();
+            BasicBlock *NextAlive = NextNode->getBlock();
+
+            // Make the conditional branch now go to the next alive block...
+            TI->getSuccessor(i)->removePredecessor(BB);
+            TI->setSuccessor(i, NextAlive);
+
+            // If there are PHI nodes in NextAlive, we need to add entries to
+            // the PHI nodes for the new incoming edge.  The incoming values
+            // should be identical to the incoming values for LastDead.
+            //
+            for (BasicBlock::iterator II = NextAlive->begin();
+                 isa<PHINode>(II); ++II) {
+              PHINode *PN = cast<PHINode>(II);
+              if (LiveSet.count(PN)) {  // Only modify live phi nodes
+                // Get the incoming value for LastDead...
+                int OldIdx = PN->getBasicBlockIndex(LastDead);
+                assert(OldIdx != -1 &&"LastDead is not a pred of NextAlive!");
+                Value *InVal = PN->getIncomingValue(OldIdx);
+
+                // Add an incoming value for BB now...
+                PN->addIncoming(InVal, BB);
+              }
+            }
+          }
+        }
+
+      // Now loop over all of the instructions in the basic block, deleting
+      // dead instructions.  This is so that the next sweep over the program
+      // can safely delete dead instructions without other dead instructions
+      // still referring to them.
+      //
+      MadeChanges |= deleteDeadInstructionsInLiveBlock(BB);
+    }
+
+  // Loop over all of the basic blocks in the function, dropping references of
+  // the dead basic blocks.  We must do this after the previous step to avoid
+  // dropping references to PHIs which still have entries...
+  //
+  std::vector<BasicBlock*> DeadBlocks;
+  for (Function::iterator BB = Func->begin(), E = Func->end(); BB != E; ++BB)
+    if (!AliveBlocks.count(BB)) {
+      // Remove PHI node entries for this block in live successor blocks.
+      for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+        if (!SI->empty() && isa<PHINode>(SI->front()) && AliveBlocks.count(*SI))
+          (*SI)->removePredecessor(BB);
+
+      BB->dropAllReferences();
+      MadeChanges = true;
+      DeadBlocks.push_back(BB);
+    }
+
+  NumBlockRemoved += DeadBlocks.size();
+
+  // Now loop through all of the blocks and delete the dead ones.  We can safely
+  // do this now because we know that there are no references to dead blocks
+  // (because they have dropped all of their references).
+  for (std::vector<BasicBlock*>::iterator I = DeadBlocks.begin(),
+         E = DeadBlocks.end(); I != E; ++I)
+    Func->getBasicBlockList().erase(*I);
+
+  return MadeChanges;
+}
diff --git a/lib/Transforms/Scalar/BasicBlockPlacement.cpp b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
new file mode 100644
index 0000000..7521ea3
--- /dev/null
+++ b/lib/Transforms/Scalar/BasicBlockPlacement.cpp
@@ -0,0 +1,148 @@
+//===-- BasicBlockPlacement.cpp - Basic Block Code Layout optimization ----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a very simple profile guided basic block placement
+// algorithm.  The idea is to put frequently executed blocks together at the
+// start of the function, and hopefully increase the number of fall-through
+// conditional branches.  If there is no profile information for a particular
+// function, this pass basically orders blocks in depth-first order
+//
+// The algorithm implemented here is basically "Algo1" from "Profile Guided Code
+// Positioning" by Pettis and Hansen, except that it uses basic block counts
+// instead of edge counts.  This should be improved in many ways, but is very
+// simple for now.
+//
+// Basically we "place" the entry block, then loop over all successors in a DFO,
+// placing the most frequently executed successor until we run out of blocks.  I
+// told you this was _extremely_ simplistic. :) This is also much slower than it
+// could be.  When it becomes important, this pass will be rewritten to use a
+// better algorithm, and then we can worry about efficiency.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "block-placement"
+#include "llvm/Analysis/ProfileInfo.h"
+#include "llvm/Function.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Scalar.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumMoved, "Number of basic blocks moved");
+
+namespace {
+  struct VISIBILITY_HIDDEN BlockPlacement : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    BlockPlacement() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<ProfileInfo>();
+      //AU.addPreserved<ProfileInfo>();  // Does this work?
+    }
+  private:
+    /// PI - The profile information that is guiding us.
+    ///
+    ProfileInfo *PI;
+
+    /// NumMovedBlocks - Every time we move a block, increment this counter.
+    ///
+    unsigned NumMovedBlocks;
+
+    /// PlacedBlocks - Every time we place a block, remember it so we don't get
+    /// into infinite loops.
+    std::set<BasicBlock*> PlacedBlocks;
+
+    /// InsertPos - This an iterator to the next place we want to insert a
+    /// block.
+    Function::iterator InsertPos;
+
+    /// PlaceBlocks - Recursively place the specified blocks and any unplaced
+    /// successors.
+    void PlaceBlocks(BasicBlock *BB);
+  };
+
+  char BlockPlacement::ID = 0;
+  RegisterPass<BlockPlacement> X("block-placement",
+                                 "Profile Guided Basic Block Placement");
+}
+
+FunctionPass *llvm::createBlockPlacementPass() { return new BlockPlacement(); } // Factory
+
+bool BlockPlacement::runOnFunction(Function &F) {
+  PI = &getAnalysis<ProfileInfo>();
+
+  NumMovedBlocks = 0;
+  InsertPos = F.begin();
+
+  // Recursively place all blocks, starting from the entry block.
+  PlaceBlocks(F.begin());
+
+  PlacedBlocks.clear();
+  NumMoved += NumMovedBlocks;
+  return NumMovedBlocks != 0;  // Report whether any block actually moved.
+}
+
+
+/// PlaceBlocks - Recursively place the specified blocks and any unplaced
+/// successors.
+void BlockPlacement::PlaceBlocks(BasicBlock *BB) {
+  assert(!PlacedBlocks.count(BB) && "Already placed this block!");
+  PlacedBlocks.insert(BB);
+
+  // Place the specified block.
+  if (&*InsertPos != BB) {
+    // Use splice to move the block into the right place.  This avoids having to
+    // remove the block from the function then re-add it, which causes a bunch
+    // of symbol table traffic that is entirely pointless.
+    Function::BasicBlockListType &Blocks = BB->getParent()->getBasicBlockList();
+    Blocks.splice(InsertPos, Blocks, BB);
+
+    ++NumMovedBlocks;
+  } else {
+    // This block is already in the right place, we don't have to do anything.
+    ++InsertPos;  // Just advance past it.
+  }
+
+  // Keep placing successors until we run out of ones to place.  Note that this
+  // loop is very inefficient (N^2) for blocks with many successors, like switch
+  // statements.  FIXME!
+  while (1) {
+    // Okay, now place any unplaced successors.
+    succ_iterator SI = succ_begin(BB), E = succ_end(BB);
+
+    // Scan for the first unplaced successor.
+    for (; SI != E && PlacedBlocks.count(*SI); ++SI)
+      /*empty*/;
+    if (SI == E) return;  // No more successors to place.
+
+    unsigned MaxExecutionCount = PI->getExecutionCount(*SI);
+    BasicBlock *MaxSuccessor = *SI;
+
+    // Scan for more frequently executed successors
+    for (; SI != E; ++SI)
+      if (!PlacedBlocks.count(*SI)) {
+        unsigned Count = PI->getExecutionCount(*SI);
+        if (Count > MaxExecutionCount ||
+            // Prefer to not disturb the code.
+            (Count == MaxExecutionCount && *SI == &*InsertPos)) {
+          MaxExecutionCount = Count;
+          MaxSuccessor = *SI;
+        }
+      }
+
+    // Now that we picked the maximally executed successor, place it.
+    PlaceBlocks(MaxSuccessor);
+  }
+}
diff --git a/lib/Transforms/Scalar/CodeGenPrepare.cpp b/lib/Transforms/Scalar/CodeGenPrepare.cpp
new file mode 100644
index 0000000..2969df3
--- /dev/null
+++ b/lib/Transforms/Scalar/CodeGenPrepare.cpp
@@ -0,0 +1,988 @@
+//===- CodeGenPrepare.cpp - Prepare a function for code generation --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Chris Lattner and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass munges the code in the input function to better prepare it for
+// SelectionDAG-based code generation.  This works around limitations in its
+// basic-block-at-a-time approach.  It should eventually be removed.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "codegenprepare"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetAsmInfo.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Target/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+using namespace llvm;
+
namespace {
  /// CodeGenPrepare - Function pass that rewrites the IR into a form that is
  /// friendlier to SelectionDAG-based instruction selection (see file header).
  class VISIBILITY_HIDDEN CodeGenPrepare : public FunctionPass {
    /// TLI - Keep a pointer of a TargetLowering to consult for determining
    /// transformation profitability.  May be null (the constructor's default).
    const TargetLowering *TLI;
  public:
    static char ID; // Pass identification, replacement for typeid
    CodeGenPrepare(const TargetLowering *tli = 0) : FunctionPass((intptr_t)&ID),
      TLI(tli) {}
    bool runOnFunction(Function &F);
    
  private:
    // Merge away blocks containing only PHI nodes and an uncond branch.
    bool EliminateMostlyEmptyBlocks(Function &F);
    // True if merging BB into DestBB preserves all PHI semantics.
    bool CanMergeBlocks(const BasicBlock *BB, const BasicBlock *DestBB) const;
    // Perform the merge validated by CanMergeBlocks.
    void EliminateMostlyEmptyBlock(BasicBlock *BB);
    // Per-block driver for the local transformations.
    bool OptimizeBlock(BasicBlock &BB);
    // Sink the addressing computation feeding a load/store into its block.
    bool OptimizeLoadStoreInst(Instruction *I, Value *Addr,
                               const Type *AccessTy,
                               DenseMap<Value*,Value*> &SunkAddrs);
  };
}
+
char CodeGenPrepare::ID = 0;
// Register the pass so it is available as -codegenprepare.
static RegisterPass<CodeGenPrepare> X("codegenprepare",
                                      "Optimize for code generation");

/// createCodeGenPreparePass - Public factory for the pass.  TLI may be null
/// when no target lowering information is available.
FunctionPass *llvm::createCodeGenPreparePass(const TargetLowering *TLI) {
  return new CodeGenPrepare(TLI);
}
+
+
+bool CodeGenPrepare::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+  
+  // First pass, eliminate blocks that contain only PHI nodes and an
+  // unconditional branch.
+  EverMadeChange |= EliminateMostlyEmptyBlocks(F);
+  
+  bool MadeChange = true;
+  while (MadeChange) {
+    MadeChange = false;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      MadeChange |= OptimizeBlock(*BB);
+    EverMadeChange |= MadeChange;
+  }
+  return EverMadeChange;
+}
+
/// EliminateMostlyEmptyBlocks - eliminate blocks that contain only PHI nodes
/// and an unconditional branch.  Passes before isel (e.g. LSR/loopsimplify)
/// often split edges in ways that are non-optimal for isel.  Start by
/// eliminating these blocks so we can split them the way we want them.
/// Returns true if any block was removed.
bool CodeGenPrepare::EliminateMostlyEmptyBlocks(Function &F) {
  bool MadeChange = false;
  // Note that this intentionally skips the entry block.
  for (Function::iterator I = ++F.begin(), E = F.end(); I != E; ) {
    // Advance the iterator now: BB may be erased from the function below.
    BasicBlock *BB = I++;

    // If this block doesn't end with an uncond branch, ignore it.
    BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator());
    if (!BI || !BI->isUnconditional())
      continue;
    
    // If the instruction before the branch isn't a phi node, then other stuff
    // is happening here.
    BasicBlock::iterator BBI = BI;
    if (BBI != BB->begin()) {
      --BBI;
      if (!isa<PHINode>(BBI)) continue;
    }
    
    // Do not break infinite loops.
    BasicBlock *DestBB = BI->getSuccessor(0);
    if (DestBB == BB)
      continue;
    
    // Only merge when the PHI semantics of DestBB are provably preserved.
    if (!CanMergeBlocks(BB, DestBB))
      continue;
    
    EliminateMostlyEmptyBlock(BB);
    MadeChange = true;
  }
  return MadeChange;
}
+
+/// CanMergeBlocks - Return true if we can merge BB into DestBB if there is a
+/// single uncond branch between them, and BB contains no other non-phi
+/// instructions.
+bool CodeGenPrepare::CanMergeBlocks(const BasicBlock *BB,
+                                    const BasicBlock *DestBB) const {
+  // We only want to eliminate blocks whose phi nodes are used by phi nodes in
+  // the successor.  If there are more complex condition (e.g. preheaders),
+  // don't mess around with them.
+  BasicBlock::const_iterator BBI = BB->begin();
+  while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+    for (Value::use_const_iterator UI = PN->use_begin(), E = PN->use_end();
+         UI != E; ++UI) {
+      const Instruction *User = cast<Instruction>(*UI);
+      if (User->getParent() != DestBB || !isa<PHINode>(User))
+        return false;
+      // If User is inside DestBB block and it is a PHINode then check 
+      // incoming value. If incoming value is not from BB then this is 
+      // a complex condition (e.g. preheaders) we want to avoid here.
+      if (User->getParent() == DestBB) {
+        if (const PHINode *UPN = dyn_cast<PHINode>(User))
+          for (unsigned I = 0, E = UPN->getNumIncomingValues(); I != E; ++I) {
+            Instruction *Insn = dyn_cast<Instruction>(UPN->getIncomingValue(I));
+            if (Insn && Insn->getParent() == BB &&
+                Insn->getParent() != UPN->getIncomingBlock(I))
+              return false;
+          }
+      }
+    }
+  }
+  
+  // If BB and DestBB contain any common predecessors, then the phi nodes in BB
+  // and DestBB may have conflicting incoming values for the block.  If so, we
+  // can't merge the block.
+  const PHINode *DestBBPN = dyn_cast<PHINode>(DestBB->begin());
+  if (!DestBBPN) return true;  // no conflict.
+  
+  // Collect the preds of BB.
+  SmallPtrSet<BasicBlock*, 16> BBPreds;
+  if (const PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
+    // It is faster to get preds from a PHI than with pred_iterator.
+    for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
+      BBPreds.insert(BBPN->getIncomingBlock(i));
+  } else {
+    BBPreds.insert(pred_begin(BB), pred_end(BB));
+  }
+  
+  // Walk the preds of DestBB.
+  for (unsigned i = 0, e = DestBBPN->getNumIncomingValues(); i != e; ++i) {
+    BasicBlock *Pred = DestBBPN->getIncomingBlock(i);
+    if (BBPreds.count(Pred)) {   // Common predecessor?
+      BBI = DestBB->begin();
+      while (const PHINode *PN = dyn_cast<PHINode>(BBI++)) {
+        const Value *V1 = PN->getIncomingValueForBlock(Pred);
+        const Value *V2 = PN->getIncomingValueForBlock(BB);
+        
+        // If V2 is a phi node in BB, look up what the mapped value will be.
+        if (const PHINode *V2PN = dyn_cast<PHINode>(V2))
+          if (V2PN->getParent() == BB)
+            V2 = V2PN->getIncomingValueForBlock(Pred);
+        
+        // If there is a conflict, bail out.
+        if (V1 != V2) return false;
+      }
+    }
+  }
+
+  return true;
+}
+
+
/// EliminateMostlyEmptyBlock - Eliminate a basic block that has only phi's and
/// an unconditional branch in it.  Callers must have validated the merge with
/// CanMergeBlocks first.
void CodeGenPrepare::EliminateMostlyEmptyBlock(BasicBlock *BB) {
  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
  BasicBlock *DestBB = BI->getSuccessor(0);
  
  DOUT << "MERGING MOSTLY EMPTY BLOCKS - BEFORE:\n" << *BB << *DestBB;
  
  // If the destination block has a single pred, then this is a trivial edge,
  // just collapse it.
  if (DestBB->getSinglePredecessor()) {
    // If DestBB has single-entry PHI nodes, fold them.
    while (PHINode *PN = dyn_cast<PHINode>(DestBB->begin())) {
      PN->replaceAllUsesWith(PN->getIncomingValue(0));
      PN->eraseFromParent();
    }
    
    // Splice all the PHI nodes from BB over to DestBB.
    DestBB->getInstList().splice(DestBB->begin(), BB->getInstList(),
                                 BB->begin(), BI);
    
    // Anything that branched to BB now branches to DestBB.
    BB->replaceAllUsesWith(DestBB);
    
    // Nuke BB.
    BB->eraseFromParent();
    
    DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
    return;
  }
  
  // Otherwise, we have multiple predecessors of BB.  Update the PHIs in DestBB
  // to handle the new incoming edges it is about to have.
  PHINode *PN;
  for (BasicBlock::iterator BBI = DestBB->begin();
       (PN = dyn_cast<PHINode>(BBI)); ++BBI) {
    // Remove the incoming value for BB, and remember it.
    Value *InVal = PN->removeIncomingValue(BB, false);
    
    // Two options: either the InVal is a phi node defined in BB or it is some
    // value that dominates BB.
    PHINode *InValPhi = dyn_cast<PHINode>(InVal);
    if (InValPhi && InValPhi->getParent() == BB) {
      // Add all of the input values of the input PHI as inputs of this phi.
      for (unsigned i = 0, e = InValPhi->getNumIncomingValues(); i != e; ++i)
        PN->addIncoming(InValPhi->getIncomingValue(i),
                        InValPhi->getIncomingBlock(i));
    } else {
      // Otherwise, add one instance of the dominating value for each edge that
      // we will be adding.
      if (PHINode *BBPN = dyn_cast<PHINode>(BB->begin())) {
        // Use BB's first PHI for the pred list: faster than pred_iterator.
        for (unsigned i = 0, e = BBPN->getNumIncomingValues(); i != e; ++i)
          PN->addIncoming(InVal, BBPN->getIncomingBlock(i));
      } else {
        for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI)
          PN->addIncoming(InVal, *PI);
      }
    }
  }
  
  // The PHIs are now updated, change everything that refers to BB to use
  // DestBB and remove BB.
  BB->replaceAllUsesWith(DestBB);
  BB->eraseFromParent();
  
  DOUT << "AFTER:\n" << *DestBB << "\n\n\n";
}
+
+
/// SplitEdgeNicely - Split the critical edge from TI to its specified
/// successor if it will improve codegen.  We only do this if the successor has
/// phi nodes (otherwise critical edges are ok).  If there is already another
/// predecessor of the succ that is empty (and thus has no phi nodes), use it
/// instead of introducing a new block.
static void SplitEdgeNicely(TerminatorInst *TI, unsigned SuccNum, Pass *P) {
  BasicBlock *TIBB = TI->getParent();
  BasicBlock *Dest = TI->getSuccessor(SuccNum);
  assert(isa<PHINode>(Dest->begin()) &&
         "This should only be called if Dest has a PHI!");
  
  /// TIPHIValues - This array is lazily computed to determine the values of
  /// PHIs in Dest that TI would provide.
  std::vector<Value*> TIPHIValues;
  
  // Check to see if Dest has any blocks that can be used as a split edge for
  // this terminator.
  for (pred_iterator PI = pred_begin(Dest), E = pred_end(Dest); PI != E; ++PI) {
    BasicBlock *Pred = *PI;
    // To be usable, the pred has to end with an uncond branch to the dest.
    BranchInst *PredBr = dyn_cast<BranchInst>(Pred->getTerminator());
    if (!PredBr || !PredBr->isUnconditional() ||
        // Must be empty other than the branch.
        &Pred->front() != PredBr ||
        // Cannot be the entry block; its label does not get emitted.
        Pred == &(Dest->getParent()->getEntryBlock()))
      continue;
    
    // Finally, since we know that Dest has phi nodes in it, we have to make
    // sure that jumping to Pred will have the same effect as going to Dest in
    // terms of PHI values.
    PHINode *PN;
    unsigned PHINo = 0;
    bool FoundMatch = true;
    for (BasicBlock::iterator I = Dest->begin();
         (PN = dyn_cast<PHINode>(I)); ++I, ++PHINo) {
      // Lazily record the value TI's block contributes for this PHI.
      if (PHINo == TIPHIValues.size())
        TIPHIValues.push_back(PN->getIncomingValueForBlock(TIBB));
      
      // If the PHI entry doesn't work, we can't use this pred.
      if (TIPHIValues[PHINo] != PN->getIncomingValueForBlock(Pred)) {
        FoundMatch = false;
        break;
      }
    }
    
    // If we found a workable predecessor, change TI to branch to Succ.
    if (FoundMatch) {
      Dest->removePredecessor(TIBB);
      TI->setSuccessor(SuccNum, Pred);
      return;
    }
  }
  
  // No reusable predecessor was found; split the edge with a fresh block.
  SplitCriticalEdge(TI, SuccNum, P, true);
}
+
/// OptimizeNoopCopyExpression - If the specified cast instruction is a noop
/// copy (e.g. it's casting from one pointer type to another, int->uint, or
/// int->sbyte on PPC), sink it into user blocks to reduce the number of virtual
/// registers that must be created and coalesced.
///
/// Return true if any changes are made.
static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI){
  // Get the target-level value types on both sides of the cast.
  MVT::ValueType SrcVT = TLI.getValueType(CI->getOperand(0)->getType());
  MVT::ValueType DstVT = TLI.getValueType(CI->getType());
  
  // An int<->fp conversion is never a noop copy.
  if (MVT::isInteger(SrcVT) != MVT::isInteger(DstVT))
    return false;
  
  // If this is an extension, it will be a zero or sign extension, which
  // isn't a noop.
  if (SrcVT < DstVT) return false;
  
  // If these values will be promoted, find out what they will be promoted
  // to.  This helps us consider truncates on PPC as noop copies when they
  // are.
  if (TLI.getTypeAction(SrcVT) == TargetLowering::Promote)
    SrcVT = TLI.getTypeToTransformTo(SrcVT);
  if (TLI.getTypeAction(DstVT) == TargetLowering::Promote)
    DstVT = TLI.getTypeToTransformTo(DstVT);
  
  // If, after promotion, these are the same types, this is a noop copy.
  if (SrcVT != DstVT)
    return false;
  
  BasicBlock *DefBB = CI->getParent();
  
  /// InsertedCasts - Only insert a cast in each block once.
  DenseMap<BasicBlock*, CastInst*> InsertedCasts;
  
  bool MadeChange = false;
  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); 
       UI != E; ) {
    Use &TheUse = UI.getUse();
    Instruction *User = cast<Instruction>(*UI);
    
    // Figure out which BB this cast is used in.  For PHI's this is the
    // appropriate predecessor block.
    BasicBlock *UserBB = User->getParent();
    if (PHINode *PN = dyn_cast<PHINode>(User)) {
      // A PHI use list interleaves (value, block) operands; divide by two to
      // map the operand number to the incoming-edge index.
      unsigned OpVal = UI.getOperandNo()/2;
      UserBB = PN->getIncomingBlock(OpVal);
    }
    
    // Preincrement use iterator so we don't invalidate it.
    ++UI;
    
    // If this user is in the same block as the cast, don't change the cast.
    if (UserBB == DefBB) continue;
    
    // If we have already inserted a cast into this block, use it.
    CastInst *&InsertedCast = InsertedCasts[UserBB];

    if (!InsertedCast) {
      // Insert the copy after any PHI nodes at the top of the user's block.
      BasicBlock::iterator InsertPt = UserBB->begin();
      while (isa<PHINode>(InsertPt)) ++InsertPt;
      
      InsertedCast = 
        CastInst::create(CI->getOpcode(), CI->getOperand(0), CI->getType(), "", 
                         InsertPt);
      MadeChange = true;
    }
    
    // Replace a use of the cast with a use of the new cast.
    TheUse = InsertedCast;
  }
  
  // If we removed all uses, nuke the cast.
  if (CI->use_empty())
    CI->eraseFromParent();
  
  return MadeChange;
}
+
+/// OptimizeCmpExpression - sink the given CmpInst into user blocks to reduce 
+/// the number of virtual registers that must be created and coalesced.  This is
+/// a clear win except on targets with multiple condition code registers (powerPC),
+/// where it might lose; some adjustment may be wanted there.
+///
+/// Return true if any changes are made.
+static bool OptimizeCmpExpression(CmpInst *CI){
+
+  BasicBlock *DefBB = CI->getParent();
+  
+  /// InsertedCmp - Only insert a cmp in each block once.
+  DenseMap<BasicBlock*, CmpInst*> InsertedCmps;
+  
+  bool MadeChange = false;
+  for (Value::use_iterator UI = CI->use_begin(), E = CI->use_end(); 
+       UI != E; ) {
+    Use &TheUse = UI.getUse();
+    Instruction *User = cast<Instruction>(*UI);
+    
+    // Preincrement use iterator so we don't invalidate it.
+    ++UI;
+    
+    // Don't bother for PHI nodes.
+    if (isa<PHINode>(User))
+      continue;
+
+    // Figure out which BB this cmp is used in.
+    BasicBlock *UserBB = User->getParent();
+    
+    // If this user is in the same block as the cmp, don't change the cmp.
+    if (UserBB == DefBB) continue;
+    
+    // If we have already inserted a cmp into this block, use it.
+    CmpInst *&InsertedCmp = InsertedCmps[UserBB];
+
+    if (!InsertedCmp) {
+      BasicBlock::iterator InsertPt = UserBB->begin();
+      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      
+      InsertedCmp = 
+        CmpInst::create(CI->getOpcode(), CI->getPredicate(), CI->getOperand(0), 
+                        CI->getOperand(1), "", InsertPt);
+      MadeChange = true;
+    }
+    
+    // Replace a use of the cmp with a use of the new cmp.
+    TheUse = InsertedCmp;
+  }
+  
+  // If we removed all uses, nuke the cmp.
+  if (CI->use_empty())
+    CI->eraseFromParent();
+  
+  return MadeChange;
+}
+
+/// EraseDeadInstructions - Erase any dead instructions
+static void EraseDeadInstructions(Value *V) {
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I || !I->use_empty()) return;
+  
+  SmallPtrSet<Instruction*, 16> Insts;
+  Insts.insert(I);
+  
+  while (!Insts.empty()) {
+    I = *Insts.begin();
+    Insts.erase(I);
+    if (isInstructionTriviallyDead(I)) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *U = dyn_cast<Instruction>(I->getOperand(i)))
+          Insts.insert(U);
+      I->eraseFromParent();
+    }
+  }
+}
+
+
/// ExtAddrMode - This is an extended version of TargetLowering::AddrMode which
/// holds actual Value*'s for register values.
struct ExtAddrMode : public TargetLowering::AddrMode {
  Value *BaseReg;    // Register supplying the base address; null if unused.
  Value *ScaledReg;  // Register multiplied by Scale; null if unused.
  ExtAddrMode() : BaseReg(0), ScaledReg(0) {}
  void dump() const; // Print this addressing mode to cerr (debugging aid).
};
+
+static std::ostream &operator<<(std::ostream &OS, const ExtAddrMode &AM) {
+  bool NeedPlus = false;
+  OS << "[";
+  if (AM.BaseGV)
+    OS << (NeedPlus ? " + " : "")
+       << "GV:%" << AM.BaseGV->getName(), NeedPlus = true;
+  
+  if (AM.BaseOffs)
+    OS << (NeedPlus ? " + " : "") << AM.BaseOffs, NeedPlus = true;
+  
+  if (AM.BaseReg)
+    OS << (NeedPlus ? " + " : "")
+       << "Base:%" << AM.BaseReg->getName(), NeedPlus = true;
+  if (AM.Scale)
+    OS << (NeedPlus ? " + " : "")
+       << AM.Scale << "*%" << AM.ScaledReg->getName(), NeedPlus = true;
+  
+  return OS << "]";
+}
+
/// dump - Debugging helper: print this addressing mode to cerr.
void ExtAddrMode::dump() const {
  cerr << *this << "\n";
}
+
// Forward declaration: TryMatchingScaledValue is mutually recursive with
// FindMaximalLegalAddressingMode below.
static bool TryMatchingScaledValue(Value *ScaleReg, int64_t Scale,
                                   const Type *AccessTy, ExtAddrMode &AddrMode,
                                   SmallVector<Instruction*, 16> &AddrModeInsts,
                                   const TargetLowering &TLI, unsigned Depth);
+  
+/// FindMaximalLegalAddressingMode - If we can, try to merge the computation of
+/// Addr into the specified addressing mode.  If Addr can't be added to AddrMode
+/// this returns false.  This assumes that Addr is either a pointer type or
+/// intptr_t for the target.
+static bool FindMaximalLegalAddressingMode(Value *Addr, const Type *AccessTy,
+                                           ExtAddrMode &AddrMode,
+                                   SmallVector<Instruction*, 16> &AddrModeInsts,
+                                           const TargetLowering &TLI,
+                                           unsigned Depth) {
+  
+  // If this is a global variable, fold it into the addressing mode if possible.
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
+    if (AddrMode.BaseGV == 0) {
+      AddrMode.BaseGV = GV;
+      if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+        return true;
+      AddrMode.BaseGV = 0;
+    }
+  } else if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
+    AddrMode.BaseOffs += CI->getSExtValue();
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy))
+      return true;
+    AddrMode.BaseOffs -= CI->getSExtValue();
+  } else if (isa<ConstantPointerNull>(Addr)) {
+    return true;
+  }
+  
+  // Look through constant exprs and instructions.
+  unsigned Opcode = ~0U;
+  User *AddrInst = 0;
+  if (Instruction *I = dyn_cast<Instruction>(Addr)) {
+    Opcode = I->getOpcode();
+    AddrInst = I;
+  } else if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Addr)) {
+    Opcode = CE->getOpcode();
+    AddrInst = CE;
+  }
+
+  // Limit recursion to avoid exponential behavior.
+  if (Depth == 5) { AddrInst = 0; Opcode = ~0U; }
+
+  // If this is really an instruction, add it to our list of related
+  // instructions.
+  if (Instruction *I = dyn_cast_or_null<Instruction>(AddrInst))
+    AddrModeInsts.push_back(I);
+
+  switch (Opcode) {
+  case Instruction::PtrToInt:
+    // PtrToInt is always a noop, as we know that the int type is pointer sized.
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth))
+      return true;
+    break;
+  case Instruction::IntToPtr:
+    // This inttoptr is a no-op if the integer type is pointer sized.
+    if (TLI.getValueType(AddrInst->getOperand(0)->getType()) ==
+        TLI.getPointerTy()) {
+      if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                         AddrMode, AddrModeInsts, TLI, Depth))
+        return true;
+    }
+    break;
+  case Instruction::Add: {
+    // Check to see if we can merge in the RHS then the LHS.  If so, we win.
+    ExtAddrMode BackupAddrMode = AddrMode;
+    unsigned OldSize = AddrModeInsts.size();
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(1), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1) &&
+        FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1))
+      return true;
+
+    // Restore the old addr mode info.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    
+    // Otherwise this was over-aggressive.  Try merging in the LHS then the RHS.
+    if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1) &&
+        FindMaximalLegalAddressingMode(AddrInst->getOperand(1), AccessTy,
+                                       AddrMode, AddrModeInsts, TLI, Depth+1))
+      return true;
+    
+    // Otherwise we definitely can't merge the ADD in.
+    AddrMode = BackupAddrMode;
+    AddrModeInsts.resize(OldSize);
+    break;    
+  }
+  case Instruction::Or: {
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) break;
+    // TODO: We can handle "Or Val, Imm" iff this OR is equivalent to an ADD.
+    break;
+  }
+  case Instruction::Mul:
+  case Instruction::Shl: {
+    // Can only handle X*C and X << C, and can only handle this when the scale
+    // field is available.
+    ConstantInt *RHS = dyn_cast<ConstantInt>(AddrInst->getOperand(1));
+    if (!RHS) break;
+    int64_t Scale = RHS->getSExtValue();
+    if (Opcode == Instruction::Shl)
+      Scale = 1 << Scale;
+    
+    if (TryMatchingScaledValue(AddrInst->getOperand(0), Scale, AccessTy,
+                               AddrMode, AddrModeInsts, TLI, Depth))
+      return true;
+    break;
+  }
+  case Instruction::GetElementPtr: {
+    // Scan the GEP.  We check it if it contains constant offsets and at most
+    // one variable offset.
+    int VariableOperand = -1;
+    unsigned VariableScale = 0;
+    
+    int64_t ConstantOffset = 0;
+    const TargetData *TD = TLI.getTargetData();
+    gep_type_iterator GTI = gep_type_begin(AddrInst);
+    for (unsigned i = 1, e = AddrInst->getNumOperands(); i != e; ++i, ++GTI) {
+      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
+        const StructLayout *SL = TD->getStructLayout(STy);
+        unsigned Idx =
+          cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
+        ConstantOffset += SL->getElementOffset(Idx);
+      } else {
+        uint64_t TypeSize = TD->getTypeSize(GTI.getIndexedType());
+        if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+          ConstantOffset += CI->getSExtValue()*TypeSize;
+        } else if (TypeSize) {  // Scales of zero don't do anything.
+          // We only allow one variable index at the moment.
+          if (VariableOperand != -1) {
+            VariableOperand = -2;
+            break;
+          }
+          
+          // Remember the variable index.
+          VariableOperand = i;
+          VariableScale = TypeSize;
+        }
+      }
+    }
+
+    // If the GEP had multiple variable indices, punt.
+    if (VariableOperand == -2)
+      break;
+
+    // A common case is for the GEP to only do a constant offset.  In this case,
+    // just add it to the disp field and check validity.
+    if (VariableOperand == -1) {
+      AddrMode.BaseOffs += ConstantOffset;
+      if (ConstantOffset == 0 || TLI.isLegalAddressingMode(AddrMode, AccessTy)){
+        // Check to see if we can fold the base pointer in too.
+        if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                           AddrMode, AddrModeInsts, TLI,
+                                           Depth+1))
+          return true;
+      }
+      AddrMode.BaseOffs -= ConstantOffset;
+    } else {
+      // Check that this has no base reg yet.  If so, we won't have a place to
+      // put the base of the GEP (assuming it is not a null ptr).
+      bool SetBaseReg = false;
+      if (AddrMode.HasBaseReg) {
+        if (!isa<ConstantPointerNull>(AddrInst->getOperand(0)))
+          break;
+      } else {
+        AddrMode.HasBaseReg = true;
+        AddrMode.BaseReg = AddrInst->getOperand(0);
+        SetBaseReg = true;
+      }
+      
+      // See if the scale amount is valid for this target.
+      AddrMode.BaseOffs += ConstantOffset;
+      if (TryMatchingScaledValue(AddrInst->getOperand(VariableOperand),
+                                 VariableScale, AccessTy, AddrMode, 
+                                 AddrModeInsts, TLI, Depth)) {
+        if (!SetBaseReg) return true;
+
+        // If this match succeeded, we know that we can form an address with the
+        // GepBase as the basereg.  See if we can match *more*.
+        AddrMode.HasBaseReg = false;
+        AddrMode.BaseReg = 0;
+        if (FindMaximalLegalAddressingMode(AddrInst->getOperand(0), AccessTy,
+                                           AddrMode, AddrModeInsts, TLI,
+                                           Depth+1))
+          return true;
+        // Strange, shouldn't happen.  Restore the base reg and succeed the easy
+        // way.        
+        AddrMode.HasBaseReg = true;
+        AddrMode.BaseReg = AddrInst->getOperand(0);
+        return true;
+      }
+      
+      AddrMode.BaseOffs -= ConstantOffset;
+      if (SetBaseReg) {
+        AddrMode.HasBaseReg = false;
+        AddrMode.BaseReg = 0;
+      }
+    }
+    break;    
+  }
+  }
+  
+  if (Instruction *I = dyn_cast_or_null<Instruction>(AddrInst)) {
+    assert(AddrModeInsts.back() == I && "Stack imbalance");
+    AddrModeInsts.pop_back();
+  }
+  
+  // Worse case, the target should support [reg] addressing modes. :)
+  if (!AddrMode.HasBaseReg) {
+    AddrMode.HasBaseReg = true;
+    // Still check for legality in case the target supports [imm] but not [i+r].
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
+      AddrMode.BaseReg = Addr;
+      return true;
+    }
+    AddrMode.HasBaseReg = false;
+  }
+  
+  // If the base register is already taken, see if we can do [r+r].
+  if (AddrMode.Scale == 0) {
+    AddrMode.Scale = 1;
+    if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
+      AddrMode.ScaledReg = Addr;
+      return true;
+    }
+    AddrMode.Scale = 0;
+  }
+  // Couldn't match.
+  return false;
+}
+
/// TryMatchingScaledValue - Try adding ScaleReg*Scale to the specified
/// addressing mode.  Return true if this addr mode is legal for the target,
/// false if not.  On failure AddrMode is restored to its incoming state.
static bool TryMatchingScaledValue(Value *ScaleReg, int64_t Scale,
                                   const Type *AccessTy, ExtAddrMode &AddrMode,
                                   SmallVector<Instruction*, 16> &AddrModeInsts,
                                   const TargetLowering &TLI, unsigned Depth) {
  // If we already have a scale of this value, we can add to it, otherwise, we
  // need an available scale field.
  if (AddrMode.Scale != 0 && AddrMode.ScaledReg != ScaleReg)
    return false;

  // Snapshot the mode so the (x+c)*scale refinement below can start over.
  ExtAddrMode InputAddrMode = AddrMode;
  
  // Add scale to turn X*4+X*3 -> X*7.  This could also do things like
  // [A+B + A*7] -> [B+A*8].
  AddrMode.Scale += Scale;
  AddrMode.ScaledReg = ScaleReg;
  
  if (TLI.isLegalAddressingMode(AddrMode, AccessTy)) {
    // Okay, we decided that we can add ScaleReg+Scale to AddrMode.  Check now
    // to see if ScaleReg is actually X+C.  If so, we can turn this into adding
    // X*Scale + C*Scale to addr mode.
    BinaryOperator *BinOp = dyn_cast<BinaryOperator>(ScaleReg);
    if (BinOp && BinOp->getOpcode() == Instruction::Add &&
        isa<ConstantInt>(BinOp->getOperand(1)) && InputAddrMode.ScaledReg ==0) {
      
      InputAddrMode.Scale = Scale;
      InputAddrMode.ScaledReg = BinOp->getOperand(0);
      InputAddrMode.BaseOffs += 
        cast<ConstantInt>(BinOp->getOperand(1))->getSExtValue()*Scale;
      if (TLI.isLegalAddressingMode(InputAddrMode, AccessTy)) {
        // The refined mode is legal too; commit it and record the add.
        AddrModeInsts.push_back(BinOp);
        AddrMode = InputAddrMode;
        return true;
      }
    }

    // Otherwise, not (x+c)*scale, just return what we have.
    return true;
  }
  
  // Otherwise, back this attempt out.
  AddrMode.Scale -= Scale;
  if (AddrMode.Scale == 0) AddrMode.ScaledReg = 0;
  
  return false;
}
+
+
+/// IsNonLocalValue - Return true if the specified values are defined in a
+/// different basic block than BB.
+static bool IsNonLocalValue(Value *V, BasicBlock *BB) {
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    return I->getParent() != BB;
+  return false;
+}
+
+/// OptimizeLoadStoreInst - Load and Store Instructions often have
+/// addressing modes that can do significant amounts of computation.  As such,
+/// instruction selection will try to get the load or store to do as much
+/// computation as possible for the program.  The problem is that isel can only
+/// see within a single block.  As such, we sink as much legal addressing mode
+/// stuff into the block as possible.
+///
+/// LdStInst is the memory operation being optimized, Addr is its address
+/// operand, AccessTy is the type being loaded/stored, and SunkAddrs caches
+/// address computations already sunk into this block for reuse.  Returns true
+/// if the IR was changed.
+bool CodeGenPrepare::OptimizeLoadStoreInst(Instruction *LdStInst, Value *Addr,
+                                           const Type *AccessTy,
+                                           DenseMap<Value*,Value*> &SunkAddrs) {
+  // Figure out what addressing mode will be built up for this operation.
+  SmallVector<Instruction*, 16> AddrModeInsts;
+  ExtAddrMode AddrMode;
+  bool Success = FindMaximalLegalAddressingMode(Addr, AccessTy, AddrMode,
+                                                AddrModeInsts, *TLI, 0);
+  // Self-assignment keeps release builds (where assert vanishes) free of an
+  // unused-variable warning.
+  Success = Success; assert(Success && "Couldn't select *anything*?");
+  
+  // Check to see if any of the instructions subsumed by this addr mode are
+  // non-local to I's BB.
+  bool AnyNonLocal = false;
+  for (unsigned i = 0, e = AddrModeInsts.size(); i != e; ++i) {
+    if (IsNonLocalValue(AddrModeInsts[i], LdStInst->getParent())) {
+      AnyNonLocal = true;
+      break;
+    }
+  }
+  
+  // If all the instructions matched are already in this BB, don't do anything.
+  if (!AnyNonLocal) {
+    DEBUG(cerr << "CGP: Found      local addrmode: " << AddrMode << "\n");
+    return false;
+  }
+  
+  // Insert this computation right after this user.  Since our caller is
+  // scanning from the top of the BB to the bottom, reuses of the expr are
+  // guaranteed to happen later.
+  BasicBlock::iterator InsertPt = LdStInst;
+  
+  // Now that we determined the addressing expression we want to use and know
+  // that we have to sink it into this block.  Check to see if we have already
+  // done this for some other load/store instr in this block.  If so, reuse the
+  // computation.
+  Value *&SunkAddr = SunkAddrs[Addr];
+  if (SunkAddr) {
+    DEBUG(cerr << "CGP: Reusing nonlocal addrmode: " << AddrMode << "\n");
+    if (SunkAddr->getType() != Addr->getType())
+      SunkAddr = new BitCastInst(SunkAddr, Addr->getType(), "tmp", InsertPt);
+  } else {
+    DEBUG(cerr << "CGP: SINKING nonlocal addrmode: " << AddrMode << "\n");
+    const Type *IntPtrTy = TLI->getTargetData()->getIntPtrType();
+    
+    Value *Result = 0;
+    // Start with the scale value.  Normalize the scaled register to the
+    // pointer-sized integer type before multiplying.
+    if (AddrMode.Scale) {
+      Value *V = AddrMode.ScaledReg;
+      if (V->getType() == IntPtrTy) {
+        // done.
+      } else if (isa<PointerType>(V->getType())) {
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else if (cast<IntegerType>(IntPtrTy)->getBitWidth() <
+                 cast<IntegerType>(V->getType())->getBitWidth()) {
+        V = new TruncInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      } else {
+        V = new SExtInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      }
+      if (AddrMode.Scale != 1)
+        V = BinaryOperator::createMul(V, ConstantInt::get(IntPtrTy,
+                                                          AddrMode.Scale),
+                                      "sunkaddr", InsertPt);
+      Result = V;
+    }
+
+    // Add in the base register.
+    if (AddrMode.BaseReg) {
+      Value *V = AddrMode.BaseReg;
+      if (V->getType() != IntPtrTy)
+        V = new PtrToIntInst(V, IntPtrTy, "sunkaddr", InsertPt);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+    
+    // Add in the BaseGV if present.
+    if (AddrMode.BaseGV) {
+      Value *V = new PtrToIntInst(AddrMode.BaseGV, IntPtrTy, "sunkaddr",
+                                  InsertPt);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+    
+    // Add in the Base Offset if present.
+    if (AddrMode.BaseOffs) {
+      Value *V = ConstantInt::get(IntPtrTy, AddrMode.BaseOffs);
+      if (Result)
+        Result = BinaryOperator::createAdd(Result, V, "sunkaddr", InsertPt);
+      else
+        Result = V;
+    }
+
+    if (Result == 0)
+      SunkAddr = Constant::getNullValue(Addr->getType());
+    else
+      SunkAddr = new IntToPtrInst(Result, Addr->getType(), "sunkaddr",InsertPt);
+  }
+  
+  LdStInst->replaceUsesOfWith(Addr, SunkAddr);
+  
+  // If the old address expression is now unused, delete its instruction tree.
+  if (Addr->use_empty())
+    EraseDeadInstructions(Addr);
+  return true;
+}
+
+// In this pass we look for GEP and cast instructions that are used
+// across basic blocks and rewrite them to improve basic-block-at-a-time
+// selection.  Returns true if any change was made to BB.
+bool CodeGenPrepare::OptimizeBlock(BasicBlock &BB) {
+  bool MadeChange = false;
+  
+  // Split all critical edges where the dest block has a PHI and where the phi
+  // has shared immediate operands.
+  TerminatorInst *BBTI = BB.getTerminator();
+  if (BBTI->getNumSuccessors() > 1) {
+    for (unsigned i = 0, e = BBTI->getNumSuccessors(); i != e; ++i)
+      if (isa<PHINode>(BBTI->getSuccessor(i)->begin()) &&
+          isCriticalEdge(BBTI, i, true))
+        SplitEdgeNicely(BBTI, i, this);
+  }
+  
+  
+  // Keep track of non-local addresses that have been sunk into this block.
+  // This allows us to avoid inserting duplicate code for blocks with multiple
+  // load/stores of the same address.
+  DenseMap<Value*, Value*> SunkAddrs;
+  
+  for (BasicBlock::iterator BBI = BB.begin(), E = BB.end(); BBI != E; ) {
+    // Advance the iterator before processing I: I may be erased below.
+    Instruction *I = BBI++;
+    
+    if (CastInst *CI = dyn_cast<CastInst>(I)) {
+      // If the source of the cast is a constant, then this should have
+      // already been constant folded.  The only reason NOT to constant fold
+      // it is if something (e.g. LSR) was careful to place the constant
+      // evaluation in a block other than the one that uses it (e.g. to hoist
+      // the address of globals out of a loop).  If this is the case, we don't
+      // want to forward-subst the cast.
+      if (isa<Constant>(CI->getOperand(0)))
+        continue;
+      
+      if (TLI)
+        MadeChange |= OptimizeNoopCopyExpression(CI, *TLI);
+    } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
+      MadeChange |= OptimizeCmpExpression(CI);
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+      if (TLI)
+        MadeChange |= OptimizeLoadStoreInst(I, I->getOperand(0), LI->getType(),
+                                            SunkAddrs);
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
+      // For a store, the address is operand 1 and the stored value's type
+      // describes the memory access.
+      if (TLI)
+        MadeChange |= OptimizeLoadStoreInst(I, SI->getOperand(1),
+                                            SI->getOperand(0)->getType(),
+                                            SunkAddrs);
+    } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+      if (GEPI->hasAllZeroIndices()) {
+        /// The GEP operand must be a pointer, so must its result -> BitCast
+        Instruction *NC = new BitCastInst(GEPI->getOperand(0), GEPI->getType(), 
+                                          GEPI->getName(), GEPI);
+        GEPI->replaceAllUsesWith(NC);
+        GEPI->eraseFromParent();
+        MadeChange = true;
+        BBI = NC;
+      }
+    } else if (CallInst *CI = dyn_cast<CallInst>(I)) {
+      // If we found an inline asm expression, and if the target knows how to
+      // lower it to normal LLVM code, do so now.
+      if (TLI && isa<InlineAsm>(CI->getCalledValue()))
+        if (const TargetAsmInfo *TAI = 
+            TLI->getTargetMachine().getTargetAsmInfo()) {
+          // Expansion may rewrite arbitrary instructions; rescan the block.
+          if (TAI->ExpandInlineAsm(CI))
+            BBI = BB.begin();
+        }
+    }
+  }
+    
+  return MadeChange;
+}
+
diff --git a/lib/Transforms/Scalar/CondPropagate.cpp b/lib/Transforms/Scalar/CondPropagate.cpp
new file mode 100644
index 0000000..d4c583f
--- /dev/null
+++ b/lib/Transforms/Scalar/CondPropagate.cpp
@@ -0,0 +1,219 @@
+//===-- CondPropagate.cpp - Propagate Conditional Expressions -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass propagates information about conditional expressions through the
+// program, allowing it to eliminate conditional branches in some cases.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "condprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Streams.h"
+using namespace llvm;
+
+STATISTIC(NumBrThread, "Number of CFG edges threaded through branches");
+STATISTIC(NumSwThread, "Number of CFG edges threaded through switches");
+
+namespace {
+  /// CondProp - Function pass that threads CFG edges through conditional
+  /// branches/switches whose condition is a block-local PHI of constants.
+  struct VISIBILITY_HIDDEN CondProp : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    CondProp() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // Critical edges must already be split so RevectorBlockTo can safely
+      // retarget an unconditional branch.
+      AU.addRequiredID(BreakCriticalEdgesID);
+      //AU.addRequired<DominanceFrontier>();
+    }
+
+  private:
+    bool MadeChange;  // Set by the Simplify* helpers during each sweep.
+    void SimplifyBlock(BasicBlock *BB);
+    void SimplifyPredecessors(BranchInst *BI);
+    void SimplifyPredecessors(SwitchInst *SI);
+    void RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB);
+  };
+  
+  char CondProp::ID = 0;
+  RegisterPass<CondProp> X("condprop", "Conditional Propagation");
+}
+
+/// createCondPropagationPass - Public factory returning a freshly allocated
+/// CondProp pass instance.
+FunctionPass *llvm::createCondPropagationPass() {
+  return new CondProp;
+}
+
+/// runOnFunction - Repeatedly sweep the function's blocks until a sweep makes
+/// no change, and report whether ANY sweep modified the IR.
+bool CondProp::runOnFunction(Function &F) {
+  bool EverMadeChange = false;
+
+  // While we are simplifying blocks, keep iterating to a fixed point.
+  do {
+    MadeChange = false;
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      SimplifyBlock(BB);
+    // Accumulate with |=: the final iteration necessarily ends with
+    // MadeChange == false (that is the loop-exit condition), so a plain
+    // assignment here would always return false to the pass manager, even
+    // when earlier iterations changed the function.
+    EverMadeChange |= MadeChange;
+  } while (MadeChange);
+  return EverMadeChange;
+}
+
+/// SimplifyBlock - Apply the available simplifications to BB: thread
+/// predecessors through a phi-driven conditional terminator, constant-fold
+/// the terminator, and merge BB with a uniquely-reached successor.
+void CondProp::SimplifyBlock(BasicBlock *BB) {
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
+    // If this is a conditional branch based on a phi node that is defined in
+    // this block, see if we can simplify predecessors of this block.
+    if (BI->isConditional() && isa<PHINode>(BI->getCondition()) &&
+        cast<PHINode>(BI->getCondition())->getParent() == BB)
+      SimplifyPredecessors(BI);
+
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
+    if (isa<PHINode>(SI->getCondition()) &&
+        cast<PHINode>(SI->getCondition())->getParent() == BB)
+      SimplifyPredecessors(SI);
+  }
+
+  // If possible, simplify the terminator of this block.
+  if (ConstantFoldTerminator(BB))
+    MadeChange = true;
+
+  // If this block ends with an unconditional branch and the only successor has
+  // only this block as a predecessor, merge the two blocks together.
+  if (BranchInst *BI = dyn_cast<BranchInst>(BB->getTerminator()))
+    if (BI->isUnconditional() && BI->getSuccessor(0)->getSinglePredecessor() &&
+        BB != BI->getSuccessor(0)) {
+      BasicBlock *Succ = BI->getSuccessor(0);
+      
+      // If Succ has any PHI nodes, they are all single-entry PHI's.
+      while (PHINode *PN = dyn_cast<PHINode>(Succ->begin())) {
+        assert(PN->getNumIncomingValues() == 1 &&
+               "PHI doesn't match parent block");
+        PN->replaceAllUsesWith(PN->getIncomingValue(0));
+        PN->eraseFromParent();
+      }
+      
+      // Remove BI.
+      BI->eraseFromParent();
+
+      // Move over all of the instructions.
+      BB->getInstList().splice(BB->end(), Succ->getInstList());
+
+      // Any phi nodes that had entries for Succ now have entries from BB.
+      Succ->replaceAllUsesWith(BB);
+
+      // Succ is now dead, but we cannot delete it without potentially
+      // invalidating iterators elsewhere.  Just insert an unreachable
+      // instruction in it.
+      new UnreachableInst(Succ);
+      MadeChange = true;
+    }
+}
+
+// SimplifyPredecessors(branches) - We know that BI is a conditional branch
+// based on a PHI node defined in this block.  If the phi node contains constant
+// operands, then the blocks corresponding to those operands can be modified to
+// jump directly to the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(BranchInst *BI) {
+  // TODO: We currently only handle the most trivial case, where the PHI node
+  // has one use (the branch), and is the only instruction besides the branch
+  // in the block.
+  PHINode *PN = cast<PHINode>(BI->getCondition());
+  if (!PN->hasOneUse()) return;
+
+  BasicBlock *BB = BI->getParent();
+  if (&*BB->begin() != PN || &*next(BB->begin()) != BI)
+    return;
+
+  // Ok, we have this really simple case, walk the PHI operands, looking for
+  // constants.  Walk from the end to remove operands from the end when
+  // possible, and to avoid invalidating "i".
+  for (unsigned i = PN->getNumIncomingValues(); i != 0; --i)
+    if (ConstantInt *CB = dyn_cast<ConstantInt>(PN->getIncomingValue(i-1))) {
+      // If we have a constant, forward the edge from its current to its
+      // ultimate destination.  isZero() yields 1 for a false condition, which
+      // selects successor 1 (the false edge); otherwise successor 0 is taken.
+      bool PHIGone = PN->getNumIncomingValues() == 2;
+      RevectorBlockTo(PN->getIncomingBlock(i-1),
+                      BI->getSuccessor(CB->isZero()));
+      ++NumBrThread;
+
+      // If there were two predecessors before this simplification, the PHI node
+      // will be deleted.  Don't iterate through it the last time.
+      if (PHIGone) return;
+    }
+}
+
+// SimplifyPredecessors(switch) - We know that SI is switch based on a PHI node
+// defined in this block.  If the phi node contains constant operands, then the
+// blocks corresponding to those operands can be modified to jump directly to
+// the destination instead of going through this block.
+void CondProp::SimplifyPredecessors(SwitchInst *SI) {
+  // TODO: We currently only handle the most trivial case, where the PHI node
+  // has one use (the branch), and is the only instruction besides the branch
+  // in the block.
+  PHINode *PN = cast<PHINode>(SI->getCondition());
+  if (!PN->hasOneUse()) return;
+
+  BasicBlock *BB = SI->getParent();
+  if (&*BB->begin() != PN || &*next(BB->begin()) != SI)
+    return;
+
+  // NOTE(review): RemovedPreds is written below but never read in this
+  // function — it looks vestigial; confirm before removing.
+  bool RemovedPreds = false;
+
+  // Ok, we have this really simple case, walk the PHI operands, looking for
+  // constants.  Walk from the end to remove operands from the end when
+  // possible, and to avoid invalidating "i".
+  for (unsigned i = PN->getNumIncomingValues(); i != 0; --i)
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(PN->getIncomingValue(i-1))) {
+      // If we have a constant, forward the edge from its current to its
+      // ultimate destination: the successor of the case that matches CI.
+      bool PHIGone = PN->getNumIncomingValues() == 2;
+      unsigned DestCase = SI->findCaseValue(CI);
+      RevectorBlockTo(PN->getIncomingBlock(i-1),
+                      SI->getSuccessor(DestCase));
+      ++NumSwThread;
+      RemovedPreds = true;
+
+      // If there were two predecessors before this simplification, the PHI node
+      // will be deleted.  Don't iterate through it the last time.
+      if (PHIGone) return;
+    }
+}
+
+
+// RevectorBlockTo - Revector the unconditional branch at the end of FromBB to
+// the ToBB block, which is one of the successors of its current successor.
+void CondProp::RevectorBlockTo(BasicBlock *FromBB, BasicBlock *ToBB) {
+  BranchInst *FromBr = cast<BranchInst>(FromBB->getTerminator());
+  assert(FromBr->isUnconditional() && "FromBB should end with uncond br!");
+
+  // Get the old block we are threading through.
+  BasicBlock *OldSucc = FromBr->getSuccessor(0);
+
+  // OldSucc had multiple successors. If ToBB has multiple predecessors, then 
+  // the edge between them would be critical, which we already took care of.
+  // If ToBB has single operand PHI node then take care of it here.
+  while (PHINode *PN = dyn_cast<PHINode>(ToBB->begin())) {
+    assert(PN->getNumIncomingValues() == 1 && "Critical Edge Found!");    
+    PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    PN->eraseFromParent();
+  }
+
+  // Update PHI nodes in OldSucc to know that FromBB no longer branches to it.
+  OldSucc->removePredecessor(FromBB);
+
+  // Change FromBr to branch to the new destination.
+  FromBr->setSuccessor(0, ToBB);
+
+  // Record that this sweep changed the IR so the driver iterates again.
+  MadeChange = true;
+}
diff --git a/lib/Transforms/Scalar/ConstantProp.cpp b/lib/Transforms/Scalar/ConstantProp.cpp
new file mode 100644
index 0000000..3308e33
--- /dev/null
+++ b/lib/Transforms/Scalar/ConstantProp.cpp
@@ -0,0 +1,90 @@
+//===- ConstantProp.cpp - Code to perform Simple Constant Propagation -----===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements constant propagation and merging:
+//
+// Specifically, this:
+//   * Converts instructions like "add int 1, 2" into 3
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DIE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "constprop"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Constant.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstKilled, "Number of instructions killed");
+
+namespace {
+  /// ConstantPropagation - Worklist-driven pass that folds instructions whose
+  /// operands are all constants and replaces their uses with the result.
+  struct VISIBILITY_HIDDEN ConstantPropagation : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    ConstantPropagation() : FunctionPass((intptr_t)&ID) {}
+
+    bool runOnFunction(Function &F);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      // Folding individual instructions never adds or removes blocks/edges.
+      AU.setPreservesCFG();
+    }
+  };
+
+  char ConstantPropagation::ID = 0;
+  RegisterPass<ConstantPropagation> X("constprop",
+                                      "Simple constant propagation");
+}
+
+/// createConstantPropagationPass - Public factory returning a freshly
+/// allocated ConstantPropagation pass instance.
+FunctionPass *llvm::createConstantPropagationPass() {
+  return new ConstantPropagation;
+}
+
+
+bool ConstantPropagation::runOnFunction(Function &F) {
+  // Initialize the worklist to all of the instructions ready to process...
+  // (std::set guarantees each instruction appears at most once).
+  std::set<Instruction*> WorkList;
+  for(inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i) {
+      WorkList.insert(&*i);
+  }
+  bool Changed = false;
+
+  while (!WorkList.empty()) {
+    Instruction *I = *WorkList.begin();
+    WorkList.erase(WorkList.begin());    // Get an element from the worklist...
+
+    if (!I->use_empty())                 // Don't muck with dead instructions...
+      if (Constant *C = ConstantFoldInstruction(I)) {
+        // Add all of the users of this instruction to the worklist, they might
+        // be constant propagatable now...
+        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
+             UI != UE; ++UI)
+          WorkList.insert(cast<Instruction>(*UI));
+
+        // Replace all of the uses of a variable with uses of the constant.
+        I->replaceAllUsesWith(C);
+
+        // Remove the dead instruction from the worklist BEFORE deleting it,
+        // so the set never holds a dangling pointer.
+        WorkList.erase(I);
+        I->getParent()->getInstList().erase(I);
+
+        // We made a change to the function...
+        Changed = true;
+        ++NumInstKilled;
+      }
+  }
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/CorrelatedExprs.cpp b/lib/Transforms/Scalar/CorrelatedExprs.cpp
new file mode 100644
index 0000000..655f9eb
--- /dev/null
+++ b/lib/Transforms/Scalar/CorrelatedExprs.cpp
@@ -0,0 +1,1487 @@
+//===- CorrelatedExprs.cpp - Pass to detect and eliminate c.e.'s ----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Correlated Expression Elimination propagates information from conditional
+// branches to blocks dominated by destinations of the branch.  It propagates
+// information from the condition check itself into the body of the branch,
+// allowing transformations like these for example:
+//
+//  if (i == 7)
+//    ... 4*i;  // constant propagation
+//
+//  M = i+1; N = j+1;
+//  if (i == j)
+//    X = M-N;  // = M-M == 0;
+//
+// This is called Correlated Expression Elimination because we eliminate or
+// simplify expressions that are correlated with the direction of a branch.  In
+// this way we use static information to give us some information about the
+// dynamic value of a variable.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "cee"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumCmpRemoved, "Number of cmp instruction eliminated");
+STATISTIC(NumOperandsCann, "Number of operands canonicalized");
+STATISTIC(BranchRevectors, "Number of branches revectored");
+
+namespace {
+  class ValueInfo;
+  /// Relation - A single known relationship between the owning value and Val.
+  /// Rel holds a SetCC/ICmp predicate opcode, or Instruction::Add to encode
+  /// "no relationship information known yet".
+  class VISIBILITY_HIDDEN Relation {
+    Value *Val;          // Relation to what value?
+    unsigned Rel;        // SetCC or ICmp relation, or Add if no information
+  public:
+    Relation(Value *V) : Val(V), Rel(Instruction::Add) {}
+    // Ordered by Val pointer so Relationships vectors stay sorted/searchable.
+    bool operator<(const Relation &R) const { return Val < R.Val; }
+    Value *getValue() const { return Val; }
+    unsigned getRelation() const { return Rel; }
+
+    // contradicts - Return true if the relationship specified by the operand
+    // contradicts already known information.
+    //
+    bool contradicts(unsigned Rel, const ValueInfo &VI) const;
+
+    // incorporate - Incorporate information in the argument into this relation
+    // entry.  This assumes that the information doesn't contradict itself.  If
+    // any new information is gained, true is returned, otherwise false is
+    // returned to indicate that nothing was updated.
+    //
+    bool incorporate(unsigned Rel, ValueInfo &VI);
+
+    // KnownResult - Whether or not this condition determines the result of a
+    // setcc or icmp in the program.  False & True are intentionally 0 & 1 
+    // so we can convert to bool by casting after checking for unknown.
+    //
+    enum KnownResult { KnownFalse = 0, KnownTrue = 1, Unknown = 2 };
+
+    // getImpliedResult - If this relationship between two values implies that
+    // the specified relationship is true or false, return that.  If we cannot
+    // determine the result required, return Unknown.
+    //
+    KnownResult getImpliedResult(unsigned Rel) const;
+
+    // print - Output this relation to the specified stream
+    void print(std::ostream &OS) const;
+    void dump() const;
+  };
+
+
+  // ValueInfo - One instance of this record exists for every value with
+  // relationships between other values.  It keeps track of all of the
+  // relationships to other values in the program (specified with Relation) that
+  // are known to be valid in a region.
+  //
+  class VISIBILITY_HIDDEN ValueInfo {
+    // Relationships - this value is known to have the specified relationships
+    // to other values.  There can only be one entry per value, and this list is
+    // kept sorted by the Val field.
+    std::vector<Relation> Relationships;
+
+    // If information about this value is known or propagated from constant
+    // expressions, this range contains the possible values this value may hold.
+    ConstantRange Bounds;
+
+    // If we find that this value is equal to another value that has a lower
+    // rank, this value is used as its replacement.
+    //
+    Value *Replacement;
+  public:
+    // Non-integer types get a default 32-bit range for Bounds.
+    ValueInfo(const Type *Ty)
+      : Bounds(Ty->isInteger() ? cast<IntegerType>(Ty)->getBitWidth()  : 32), 
+               Replacement(0) {}
+
+    // getBounds() - Return the constant bounds of the value...
+    const ConstantRange &getBounds() const { return Bounds; }
+    ConstantRange &getBounds() { return Bounds; }
+
+    const std::vector<Relation> &getRelationships() { return Relationships; }
+
+    // getReplacement - Return the value this value is to be replaced with if it
+    // exists, otherwise return null.
+    //
+    Value *getReplacement() const { return Replacement; }
+
+    // setReplacement - Used by the replacement calculation pass to figure out
+    // what to replace this value with, if anything.
+    //
+    void setReplacement(Value *Repl) { Replacement = Repl; }
+
+    // getRelation - return the relationship entry for the specified value.
+    // This can invalidate references to other Relations, so use it carefully.
+    //
+    Relation &getRelation(Value *V) {
+      // Binary search for V's entry...
+      std::vector<Relation>::iterator I =
+        std::lower_bound(Relationships.begin(), Relationships.end(),
+                         Relation(V));
+
+      // If we found the entry, return it...
+      if (I != Relationships.end() && I->getValue() == V)
+        return *I;
+
+      // Insert and return the new relationship...
+      return *Relationships.insert(I, V);
+    }
+
+    // requestRelation - Read-only lookup: returns null (and never inserts)
+    // when no relationship entry exists for V.
+    const Relation *requestRelation(Value *V) const {
+      // Binary search for V's entry...
+      std::vector<Relation>::const_iterator I =
+        std::lower_bound(Relationships.begin(), Relationships.end(),
+                         Relation(V));
+      if (I != Relationships.end() && I->getValue() == V)
+        return &*I;
+      return 0;
+    }
+
+    // print - Output information about this value relation...
+    void print(std::ostream &OS, Value *V) const;
+    void dump() const;
+  };
+
+  // RegionInfo - Keeps track of all of the value relationships for a region.  A
+  // region is the area dominated by a basic block.  RegionInfo's keep track of
+  // the RegionInfo for their dominator, because anything known in a dominator
+  // is known to be true in a dominated block as well.
+  //
+  class VISIBILITY_HIDDEN RegionInfo {
+    BasicBlock *BB;
+
+    // ValueMap - Tracks the ValueInformation known for this region
+    typedef std::map<Value*, ValueInfo> ValueMapTy;
+    ValueMapTy ValueMap;
+  public:
+    RegionInfo(BasicBlock *bb) : BB(bb) {}
+
+    // getEntryBlock - Return the block that dominates all of the members of
+    // this region.
+    BasicBlock *getEntryBlock() const { return BB; }
+
+    // empty - return true if this region has no information known about it.
+    bool empty() const { return ValueMap.empty(); }
+
+    // Assignment copies only the value information, not the entry block:
+    // used to seed a dominated block's region from its dominator's.
+    const RegionInfo &operator=(const RegionInfo &RI) {
+      ValueMap = RI.ValueMap;
+      return *this;
+    }
+
+    // print - Output information about this region...
+    void print(std::ostream &OS) const;
+    void dump() const;
+
+    // Allow external access.
+    typedef ValueMapTy::iterator iterator;
+    iterator begin() { return ValueMap.begin(); }
+    iterator end() { return ValueMap.end(); }
+
+    // getValueInfo - Return (creating a default entry if needed) the
+    // ValueInfo record for V in this region.
+    ValueInfo &getValueInfo(Value *V) {
+      ValueMapTy::iterator I = ValueMap.lower_bound(V);
+      if (I != ValueMap.end() && I->first == V) return I->second;
+      return ValueMap.insert(I, std::make_pair(V, V->getType()))->second;
+    }
+
+    // requestValueInfo - Read-only lookup: null if V has no entry.
+    const ValueInfo *requestValueInfo(Value *V) const {
+      ValueMapTy::const_iterator I = ValueMap.find(V);
+      if (I != ValueMap.end()) return &I->second;
+      return 0;
+    }
+
+    /// removeValueInfo - Remove anything known about V from our records.  This
+    /// works whether or not we know anything about V.
+    ///
+    void removeValueInfo(Value *V) {
+      ValueMap.erase(V);
+    }
+  };
+
+  /// CEE - Correlated Expression Elimination
+  class VISIBILITY_HIDDEN CEE : public FunctionPass {
+    std::map<Value*, unsigned> RankMap;
+    std::map<BasicBlock*, RegionInfo> RegionInfoMap;
+    DominatorTree *DT;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    CEE() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    // This pass rewrites the function, so it only declares the analyses it
+    // depends on; it does not claim to preserve anything.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequiredID(BreakCriticalEdgesID);
+    };
+
+    // print - Implement the standard print form to print out analysis
+    // information.
+    virtual void print(std::ostream &O, const Module *M) const;
+
+  private:
+    // getRegionInfo - Return (creating an empty entry if needed) the
+    // RegionInfo for the region dominated by BB.
+    RegionInfo &getRegionInfo(BasicBlock *BB) {
+      std::map<BasicBlock*, RegionInfo>::iterator I
+        = RegionInfoMap.lower_bound(BB);
+      if (I != RegionInfoMap.end() && I->first == BB) return I->second;
+      return RegionInfoMap.insert(I, std::make_pair(BB, BB))->second;
+    }
+
+    void BuildRankMap(Function &F);
+    // getRank - Constants rank 0; unranked globals also default to 0.
+    unsigned getRank(Value *V) const {
+      if (isa<Constant>(V)) return 0;
+      std::map<Value*, unsigned>::const_iterator I = RankMap.find(V);
+      if (I != RankMap.end()) return I->second;
+      return 0; // Must be some other global thing
+    }
+
+    bool TransformRegion(BasicBlock *BB, std::set<BasicBlock*> &VisitedBlocks);
+
+    bool ForwardCorrelatedEdgeDestination(TerminatorInst *TI, unsigned SuccNo,
+                                          RegionInfo &RI);
+
+    void ForwardSuccessorTo(TerminatorInst *TI, unsigned Succ, BasicBlock *D,
+                            RegionInfo &RI);
+    void ReplaceUsesOfValueInRegion(Value *Orig, Value *New,
+                                    BasicBlock *RegionDominator);
+    void CalculateRegionExitBlocks(BasicBlock *BB, BasicBlock *OldSucc,
+                                   std::vector<BasicBlock*> &RegionExitBlocks);
+    void InsertRegionExitMerges(PHINode *NewPHI, Instruction *OldVal,
+                             const std::vector<BasicBlock*> &RegionExitBlocks);
+
+    void PropagateBranchInfo(BranchInst *BI);
+    void PropagateSwitchInfo(SwitchInst *SI);
+    void PropagateEquality(Value *Op0, Value *Op1, RegionInfo &RI);
+    void PropagateRelation(unsigned Opcode, Value *Op0,
+                           Value *Op1, RegionInfo &RI);
+    void UpdateUsersOfValue(Value *V, RegionInfo &RI);
+    void IncorporateInstruction(Instruction *Inst, RegionInfo &RI);
+    void ComputeReplacements(RegionInfo &RI);
+
+    // getCmpResult - Given a icmp instruction, determine if the result is
+    // determined by facts we already know about the region under analysis.
+    // Return KnownTrue, KnownFalse, or UnKnown based on what we can determine.
+    Relation::KnownResult getCmpResult(CmpInst *ICI, const RegionInfo &RI);
+
+    bool SimplifyBasicBlock(BasicBlock &BB, const RegionInfo &RI);
+    bool SimplifyInstruction(Instruction *Inst, const RegionInfo &RI);
+  };
+  
+  char CEE::ID = 0;
+  RegisterPass<CEE> X("cee", "Correlated Expression Elimination");
+}
+
+/// createCorrelatedExpressionEliminationPass - Public factory returning a
+/// freshly allocated CEE pass instance.
+FunctionPass *llvm::createCorrelatedExpressionEliminationPass() {
+  return new CEE;
+}
+
+
+bool CEE::runOnFunction(Function &F) {
+  // Build a rank map for the function...
+  BuildRankMap(F);
+
+  // Traverse the dominator tree, computing information for each node in the
+  // tree.  Note that our traversal will not even touch unreachable basic
+  // blocks.
+  DT = &getAnalysis<DominatorTree>();
+
+  std::set<BasicBlock*> VisitedBlocks;
+  bool Changed = TransformRegion(&F.getEntryBlock(), VisitedBlocks);
+
+  // Drop per-function state so nothing leaks into the next function.
+  RegionInfoMap.clear();
+  RankMap.clear();
+  return Changed;
+}
+
+// TransformRegion - Transform the region starting with BB according to the
+// calculated region information for the block.  Transforming the region
+// involves analyzing any information this block provides to successors,
+// propagating the information to successors, and finally transforming
+// successors.
+//
+// This method processes the function in depth first order, which guarantees
+// that we process the immediate dominator of a block before the block itself.
+// Because we are passing information from immediate dominators down to
+// dominatees, we obviously have to process the information source before the
+// information consumer.
+//
bool CEE::TransformRegion(BasicBlock *BB, std::set<BasicBlock*> &VisitedBlocks){
  // Prevent infinite recursion...
  if (VisitedBlocks.count(BB)) return false;
  VisitedBlocks.insert(BB);

  // Get the computed region information for this block...
  RegionInfo &RI = getRegionInfo(BB);

  // Compute the replacement information for this block...
  ComputeReplacements(RI);

  // If debugging, print computed region information...
  DEBUG(RI.print(*cerr.stream()));

  // Simplify the contents of this block...
  bool Changed = SimplifyBasicBlock(*BB, RI);

  // Get the terminator of this basic block...
  TerminatorInst *TI = BB->getTerminator();

  // Loop over all of the blocks that this block is the immediate dominator for.
  // Because all information known in this region is also known in all of the
  // blocks that are dominated by this one, we can safely propagate the
  // information down now.
  //
  DomTreeNode *BBDom = DT->getNode(BB);
  if (!RI.empty()) {     // Time opt: only propagate if we can change something
    for (std::vector<DomTreeNode*>::iterator DI = BBDom->begin(),
           E = BBDom->end(); DI != E; ++DI) {
      BasicBlock *ChildBB = (*DI)->getBlock();
      // Each dominator-tree child starts out with a *copy* of this block's
      // region info; its own facts are layered on top later.
      assert(RegionInfoMap.find(ChildBB) == RegionInfoMap.end() &&
             "RegionInfo should be calculated in dominanace order!");
      getRegionInfo(ChildBB) = RI;
    }
  }

  // Now that all of our successors have information if they deserve it,
  // propagate any information our terminator instruction finds to our
  // successors.
  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
    if (BI->isConditional())
      PropagateBranchInfo(BI);
  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
    PropagateSwitchInfo(SI);
  }

  // If this is a branch to a block outside our region that simply performs
  // another conditional branch, one whose outcome is known inside of this
  // region, then vector this outgoing edge directly to the known destination.
  //
  // NOTE: the inner while loop intentionally repeats on the same successor
  // index, because a forwarded edge may land on another forwardable block.
  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
    while (ForwardCorrelatedEdgeDestination(TI, i, RI)) {
      ++BranchRevectors;
      Changed = true;
    }

  // Now that all of our successors have information, recursively process them.
  for (std::vector<DomTreeNode*>::iterator DI = BBDom->begin(),
         E = BBDom->end(); DI != E; ++DI) {
    BasicBlock *ChildBB = (*DI)->getBlock();
    Changed |= TransformRegion(ChildBB, VisitedBlocks);
  }

  return Changed;
}
+
// isBlockSimpleEnough - Check to see if the block is simple enough for us to
// revector the conditional branch at the bottom of the block over it.
//
+static bool isBlockSimpleEnough(BasicBlock *BB) {
+  assert(isa<BranchInst>(BB->getTerminator()));
+  BranchInst *BI = cast<BranchInst>(BB->getTerminator());
+  assert(BI->isConditional());
+
+  // Check the common case first: empty block, or block with just a setcc.
+  if (BB->size() == 1 ||
+      (BB->size() == 2 && &BB->front() == BI->getCondition() &&
+       BI->getCondition()->hasOneUse()))
+    return true;
+
+  // Check the more complex case now...
+  BasicBlock::iterator I = BB->begin();
+
+  // FIXME: This should be reenabled once the regression with SIM is fixed!
+#if 0
+  // PHI Nodes are ok, just skip over them...
+  while (isa<PHINode>(*I)) ++I;
+#endif
+
+  // Accept the setcc instruction...
+  if (&*I == BI->getCondition())
+    ++I;
+
+  // Nothing else is acceptable here yet.  We must not revector... unless we are
+  // at the terminator instruction.
+  if (&*I == BI)
+    return true;
+
+  return false;
+}
+
+
// ForwardCorrelatedEdgeDestination - Try to skip over successor #SuccNo of
// terminator TI.  Returns true (after revectoring the edge) when the
// successor is a simple out-of-region block ending in a conditional branch
// whose outcome is already implied by the facts in RI.
//
bool CEE::ForwardCorrelatedEdgeDestination(TerminatorInst *TI, unsigned SuccNo,
                                           RegionInfo &RI) {
  // If this successor is a simple block not in the current region, which
  // contains only a conditional branch, we decide if the outcome of the branch
  // can be determined from information inside of the region.  Instead of going
  // to this block, we can instead go to the destination we know is the right
  // target.
  //

  // Check to see if we dominate the block. If so, this block will get the
  // condition turned to a constant anyway.
  //
  //if (EF->dominates(RI.getEntryBlock(), BB))
  // return 0;

  BasicBlock *BB = TI->getParent();

  // Get the destination block of this edge...
  BasicBlock *OldSucc = TI->getSuccessor(SuccNo);

  // Make sure that the block ends with a conditional branch and is simple
  // enough for use to be able to revector over.
  BranchInst *BI = dyn_cast<BranchInst>(OldSucc->getTerminator());
  if (BI == 0 || !BI->isConditional() || !isBlockSimpleEnough(OldSucc))
    return false;

  // We can only forward the branch over the block if the block ends with a
  // cmp we can determine the outcome for.
  //
  // FIXME: we can make this more generic.  Code below already handles more
  // generic case.
  if (!isa<CmpInst>(BI->getCondition()))
    return false;

  // Make a new RegionInfo structure so that we can simulate the effect of the
  // PHI nodes in the block we are skipping over...
  //
  RegionInfo NewRI(RI);

  // Remove value information for all of the values we are simulating... to make
  // sure we don't have any stale information.
  for (BasicBlock::iterator I = OldSucc->begin(), E = OldSucc->end(); I!=E; ++I)
    if (I->getType() != Type::VoidTy)
      NewRI.removeValueInfo(I);

  // Put the newly discovered information into the RegionInfo...
  // Simulate each instruction of OldSucc: PHIs collapse to the value flowing
  // in from BB, and the comparison (if decidable) collapses to a constant.
  for (BasicBlock::iterator I = OldSucc->begin(), E = OldSucc->end(); I!=E; ++I)
    if (PHINode *PN = dyn_cast<PHINode>(I)) {
      int OpNum = PN->getBasicBlockIndex(BB);
      assert(OpNum != -1 && "PHI doesn't have incoming edge for predecessor!?");
      PropagateEquality(PN, PN->getIncomingValue(OpNum), NewRI);
    } else if (CmpInst *CI = dyn_cast<CmpInst>(I)) {
      Relation::KnownResult Res = getCmpResult(CI, NewRI);
      if (Res == Relation::Unknown) return false;
      PropagateEquality(CI, ConstantInt::get(Type::Int1Ty, Res), NewRI);
    } else {
      assert(isa<BranchInst>(*I) && "Unexpected instruction type!");
    }

  // Compute the facts implied by what we have discovered...
  ComputeReplacements(NewRI);

  // If the simulated region pins the branch condition down to a non-global
  // constant, we know exactly which way OldSucc's branch would go.
  ValueInfo &PredicateVI = NewRI.getValueInfo(BI->getCondition());
  if (PredicateVI.getReplacement() &&
      isa<Constant>(PredicateVI.getReplacement()) &&
      !isa<GlobalValue>(PredicateVI.getReplacement())) {
    ConstantInt *CB = cast<ConstantInt>(PredicateVI.getReplacement());

    // Forward to the successor that corresponds to the branch we will take.
    // (!CB->getZExtValue() maps true->successor 0, false->successor 1.)
    ForwardSuccessorTo(TI, SuccNo, 
                       BI->getSuccessor(!CB->getZExtValue()), NewRI);
    return true;
  }

  return false;
}
+
+static Value *getReplacementOrValue(Value *V, RegionInfo &RI) {
+  if (const ValueInfo *VI = RI.requestValueInfo(V))
+    if (Value *Repl = VI->getReplacement())
+      return Repl;
+  return V;
+}
+
+/// ForwardSuccessorTo - We have found that we can forward successor # 'SuccNo'
+/// of Terminator 'TI' to the 'Dest' BasicBlock.  This method performs the
+/// mechanics of updating SSA information and revectoring the branch.
+///
void CEE::ForwardSuccessorTo(TerminatorInst *TI, unsigned SuccNo,
                             BasicBlock *Dest, RegionInfo &RI) {
  // If there are any PHI nodes in the Dest BB, we must duplicate the entry
  // in the PHI node for the old successor to now include an entry from the
  // current basic block.
  //
  BasicBlock *OldSucc = TI->getSuccessor(SuccNo);
  BasicBlock *BB = TI->getParent();

  DOUT << "Forwarding branch in basic block %" << BB->getName()
       << " from block %" << OldSucc->getName() << " to block %"
       << Dest->getName() << "\n"
       << "Before forwarding: " << *BB->getParent();

  // Because we know that there cannot be critical edges in the flow graph, and
  // that OldSucc has multiple outgoing edges, this means that Dest cannot have
  // multiple incoming edges.
  //
#ifndef NDEBUG
  pred_iterator DPI = pred_begin(Dest); ++DPI;
  assert(DPI == pred_end(Dest) && "Critical edge found!!");
#endif

  // Loop over any PHI nodes in the destination, eliminating them, because they
  // may only have one input.
  //
  while (PHINode *PN = dyn_cast<PHINode>(&Dest->front())) {
    assert(PN->getNumIncomingValues() == 1 && "Crit edge found!");
    // Eliminate the PHI node
    PN->replaceAllUsesWith(PN->getIncomingValue(0));
    Dest->getInstList().erase(PN);
  }

  // If there are values defined in the "OldSucc" basic block, we need to insert
  // PHI nodes in the regions we are dealing with to emulate them.  This can
  // insert dead phi nodes, but it is more trouble to see if they are used than
  // to just blindly insert them.
  //
  if (DT->dominates(OldSucc, Dest)) {
    // RegionExitBlocks - Find all of the blocks that are not dominated by Dest,
    // but have predecessors that are.  Additionally, prune down the set to only
    // include blocks that are dominated by OldSucc as well.
    //
    std::vector<BasicBlock*> RegionExitBlocks;
    CalculateRegionExitBlocks(Dest, OldSucc, RegionExitBlocks);

    for (BasicBlock::iterator I = OldSucc->begin(), E = OldSucc->end();
         I != E; ++I)
      if (I->getType() != Type::VoidTy) {
        // Create and insert the PHI node into the top of Dest.
        PHINode *NewPN = new PHINode(I->getType(), I->getName()+".fw_merge",
                                     Dest->begin());
        // There is definitely an edge from OldSucc... add the edge now
        NewPN->addIncoming(I, OldSucc);

        // There is also an edge from BB now, add the edge with the calculated
        // value from the RI.
        NewPN->addIncoming(getReplacementOrValue(I, RI), BB);

        // Make everything in the Dest region use the new PHI node now...
        ReplaceUsesOfValueInRegion(I, NewPN, Dest);

        // Make sure that exits out of the region dominated by NewPN get PHI
        // nodes that merge the values as appropriate.
        InsertRegionExitMerges(NewPN, I, RegionExitBlocks);
      }
  }

  // If there were PHI nodes in OldSucc, we need to remove the entry for this
  // edge from the PHI node, and we need to replace any references to the PHI
  // node with a new value.
  //
  // NOTE: after any erase below, I is reset to OldSucc->begin() because
  // ReplaceUsesOfValueInRegion/erase may have restructured the PHI list;
  // restarting the scan is the safe way to keep the iterator valid.
  for (BasicBlock::iterator I = OldSucc->begin(); isa<PHINode>(I); ) {
    PHINode *PN = cast<PHINode>(I);

    // Get the value flowing across the old edge and remove the PHI node entry
    // for this edge: we are about to remove the edge!  Don't remove the PHI
    // node yet though if this is the last edge into it.
    Value *EdgeValue = PN->removeIncomingValue(BB, false);

    // Make sure that anything that used to use PN now refers to EdgeValue
    ReplaceUsesOfValueInRegion(PN, EdgeValue, Dest);

    // If there is only one value left coming into the PHI node, replace the PHI
    // node itself with the one incoming value left.
    //
    if (PN->getNumIncomingValues() == 1) {
      assert(PN->getNumIncomingValues() == 1);
      PN->replaceAllUsesWith(PN->getIncomingValue(0));
      PN->getParent()->getInstList().erase(PN);
      I = OldSucc->begin();
    } else if (PN->getNumIncomingValues() == 0) {  // Nuke the PHI
      // If we removed the last incoming value to this PHI, nuke the PHI node
      // now.
      PN->replaceAllUsesWith(Constant::getNullValue(PN->getType()));
      PN->getParent()->getInstList().erase(PN);
      I = OldSucc->begin();
    } else {
      ++I;  // Otherwise, move on to the next PHI node
    }
  }

  // Actually revector the branch now...
  TI->setSuccessor(SuccNo, Dest);

  // If we just introduced a critical edge in the flow graph, make sure to break
  // it right away...
  SplitCriticalEdge(TI, SuccNo, this);

  // Make sure that we don't introduce critical edges from oldsucc now!
  for (unsigned i = 0, e = OldSucc->getTerminator()->getNumSuccessors();
       i != e; ++i)
    SplitCriticalEdge(OldSucc->getTerminator(), i, this);

  // Since we invalidated the CFG, recalculate the dominator set so that it is
  // useful for later processing!
  // FIXME: This is much worse than it really should be!
  //EF->recalculate();

  DOUT << "After forwarding: " << *BB->getParent();
}
+
+/// ReplaceUsesOfValueInRegion - This method replaces all uses of Orig with uses
+/// of New.  It only affects instructions that are defined in basic blocks that
+/// are dominated by Head.
+///
+void CEE::ReplaceUsesOfValueInRegion(Value *Orig, Value *New,
+                                     BasicBlock *RegionDominator) {
+  assert(Orig != New && "Cannot replace value with itself");
+  std::vector<Instruction*> InstsToChange;
+  std::vector<PHINode*>     PHIsToChange;
+  InstsToChange.reserve(Orig->getNumUses());
+
+  // Loop over instructions adding them to InstsToChange vector, this allows us
+  // an easy way to avoid invalidating the use_iterator at a bad time.
+  for (Value::use_iterator I = Orig->use_begin(), E = Orig->use_end();
+       I != E; ++I)
+    if (Instruction *User = dyn_cast<Instruction>(*I))
+      if (DT->dominates(RegionDominator, User->getParent()))
+        InstsToChange.push_back(User);
+      else if (PHINode *PN = dyn_cast<PHINode>(User)) {
+        PHIsToChange.push_back(PN);
+      }
+
+  // PHIsToChange contains PHI nodes that use Orig that do not live in blocks
+  // dominated by orig.  If the block the value flows in from is dominated by
+  // RegionDominator, then we rewrite the PHI
+  for (unsigned i = 0, e = PHIsToChange.size(); i != e; ++i) {
+    PHINode *PN = PHIsToChange[i];
+    for (unsigned j = 0, e = PN->getNumIncomingValues(); j != e; ++j)
+      if (PN->getIncomingValue(j) == Orig &&
+          DT->dominates(RegionDominator, PN->getIncomingBlock(j)))
+        PN->setIncomingValue(j, New);
+  }
+
+  // Loop over the InstsToChange list, replacing all uses of Orig with uses of
+  // New.  This list contains all of the instructions in our region that use
+  // Orig.
+  for (unsigned i = 0, e = InstsToChange.size(); i != e; ++i)
+    if (PHINode *PN = dyn_cast<PHINode>(InstsToChange[i])) {
+      // PHINodes must be handled carefully.  If the PHI node itself is in the
+      // region, we have to make sure to only do the replacement for incoming
+      // values that correspond to basic blocks in the region.
+      for (unsigned j = 0, e = PN->getNumIncomingValues(); j != e; ++j)
+        if (PN->getIncomingValue(j) == Orig &&
+            DT->dominates(RegionDominator, PN->getIncomingBlock(j)))
+          PN->setIncomingValue(j, New);
+
+    } else {
+      InstsToChange[i]->replaceUsesOfWith(Orig, New);
+    }
+}
+
+static void CalcRegionExitBlocks(BasicBlock *Header, BasicBlock *BB,
+                                 std::set<BasicBlock*> &Visited,
+                                 DominatorTree &DT,
+                                 std::vector<BasicBlock*> &RegionExitBlocks) {
+  if (Visited.count(BB)) return;
+  Visited.insert(BB);
+
+  if (DT.dominates(Header, BB)) {  // Block in the region, recursively traverse
+    for (succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+      CalcRegionExitBlocks(Header, *I, Visited, DT, RegionExitBlocks);
+  } else {
+    // Header does not dominate this block, but we have a predecessor that does
+    // dominate us.  Add ourself to the list.
+    RegionExitBlocks.push_back(BB);
+  }
+}
+
+/// CalculateRegionExitBlocks - Find all of the blocks that are not dominated by
+/// BB, but have predecessors that are.  Additionally, prune down the set to
+/// only include blocks that are dominated by OldSucc as well.
+///
+void CEE::CalculateRegionExitBlocks(BasicBlock *BB, BasicBlock *OldSucc,
+                                    std::vector<BasicBlock*> &RegionExitBlocks){
+  std::set<BasicBlock*> Visited;  // Don't infinite loop
+
+  // Recursively calculate blocks we are interested in...
+  CalcRegionExitBlocks(BB, BB, Visited, *DT, RegionExitBlocks);
+
+  // Filter out blocks that are not dominated by OldSucc...
+  for (unsigned i = 0; i != RegionExitBlocks.size(); ) {
+    if (DT->dominates(OldSucc, RegionExitBlocks[i]))
+      ++i;  // Block is ok, keep it.
+    else {
+      // Move to end of list...
+      std::swap(RegionExitBlocks[i], RegionExitBlocks.back());
+      RegionExitBlocks.pop_back();        // Nuke the end
+    }
+  }
+}
+
+void CEE::InsertRegionExitMerges(PHINode *BBVal, Instruction *OldVal,
+                             const std::vector<BasicBlock*> &RegionExitBlocks) {
+  assert(BBVal->getType() == OldVal->getType() && "Should be derived values!");
+  BasicBlock *BB = BBVal->getParent();
+
+  // Loop over all of the blocks we have to place PHIs in, doing it.
+  for (unsigned i = 0, e = RegionExitBlocks.size(); i != e; ++i) {
+    BasicBlock *FBlock = RegionExitBlocks[i];  // Block on the frontier
+
+    // Create the new PHI node
+    PHINode *NewPN = new PHINode(BBVal->getType(),
+                                 OldVal->getName()+".fw_frontier",
+                                 FBlock->begin());
+
+    // Add an incoming value for every predecessor of the block...
+    for (pred_iterator PI = pred_begin(FBlock), PE = pred_end(FBlock);
+         PI != PE; ++PI) {
+      // If the incoming edge is from the region dominated by BB, use BBVal,
+      // otherwise use OldVal.
+      NewPN->addIncoming(DT->dominates(BB, *PI) ? BBVal : OldVal, *PI);
+    }
+
+    // Now make everyone dominated by this block use this new value!
+    ReplaceUsesOfValueInRegion(OldVal, NewPN, FBlock);
+  }
+}
+
+
+
+// BuildRankMap - This method builds the rank map data structure which gives
+// each instruction/value in the function a value based on how early it appears
+// in the function.  We give constants and globals rank 0, arguments are
+// numbered starting at one, and instructions are numbered in reverse post-order
+// from where the arguments leave off.  This gives instructions in loops higher
+// values than instructions not in loops.
+//
+void CEE::BuildRankMap(Function &F) {
+  unsigned Rank = 1;  // Skip rank zero.
+
+  // Number the arguments...
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    RankMap[I] = Rank++;
+
+  // Number the instructions in reverse post order...
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+         E = RPOT.end(); I != E; ++I)
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
+         BBI != E; ++BBI)
+      if (BBI->getType() != Type::VoidTy)
+        RankMap[BBI] = Rank++;
+}
+
+
+// PropagateBranchInfo - When this method is invoked, we need to propagate
+// information derived from the branch condition into the true and false
+// branches of BI.  Since we know that there aren't any critical edges in the
+// flow graph, this can proceed unconditionally.
+//
+void CEE::PropagateBranchInfo(BranchInst *BI) {
+  assert(BI->isConditional() && "Must be a conditional branch!");
+
+  // Propagate information into the true block...
+  //
+  PropagateEquality(BI->getCondition(), ConstantInt::getTrue(),
+                    getRegionInfo(BI->getSuccessor(0)));
+
+  // Propagate information into the false block...
+  //
+  PropagateEquality(BI->getCondition(), ConstantInt::getFalse(),
+                    getRegionInfo(BI->getSuccessor(1)));
+}
+
+
+// PropagateSwitchInfo - We need to propagate the value tested by the
+// switch statement through each case block.
+//
+void CEE::PropagateSwitchInfo(SwitchInst *SI) {
+  // Propagate information down each of our non-default case labels.  We
+  // don't yet propagate information down the default label, because a
+  // potentially large number of inequality constraints provide less
+  // benefit per unit work than a single equality constraint.
+  //
+  Value *cond = SI->getCondition();
+  for (unsigned i = 1; i < SI->getNumSuccessors(); ++i)
+    PropagateEquality(cond, SI->getSuccessorValue(i),
+                      getRegionInfo(SI->getSuccessor(i)));
+}
+
+
+// PropagateEquality - If we discover that two values are equal to each other in
+// a specified region, propagate this knowledge recursively.
+//
+void CEE::PropagateEquality(Value *Op0, Value *Op1, RegionInfo &RI) {
+  if (Op0 == Op1) return;  // Gee whiz. Are these really equal each other?
+
+  if (isa<Constant>(Op0))  // Make sure the constant is always Op1
+    std::swap(Op0, Op1);
+
+  // Make sure we don't already know these are equal, to avoid infinite loops...
+  ValueInfo &VI = RI.getValueInfo(Op0);
+
+  // Get information about the known relationship between Op0 & Op1
+  Relation &KnownRelation = VI.getRelation(Op1);
+
+  // If we already know they're equal, don't reprocess...
+  if (KnownRelation.getRelation() == FCmpInst::FCMP_OEQ ||
+      KnownRelation.getRelation() == ICmpInst::ICMP_EQ)
+    return;
+
+  // If this is boolean, check to see if one of the operands is a constant.  If
+  // it's a constant, then see if the other one is one of a setcc instruction,
+  // an AND, OR, or XOR instruction.
+  //
+  ConstantInt *CB = dyn_cast<ConstantInt>(Op1);
+  if (CB && Op1->getType() == Type::Int1Ty) {
+    if (Instruction *Inst = dyn_cast<Instruction>(Op0)) {
+      // If we know that this instruction is an AND instruction, and the 
+      // result is true, this means that both operands to the OR are known 
+      // to be true as well.
+      //
+      if (CB->getZExtValue() && Inst->getOpcode() == Instruction::And) {
+        PropagateEquality(Inst->getOperand(0), CB, RI);
+        PropagateEquality(Inst->getOperand(1), CB, RI);
+      }
+
+      // If we know that this instruction is an OR instruction, and the result
+      // is false, this means that both operands to the OR are know to be 
+      // false as well.
+      //
+      if (!CB->getZExtValue() && Inst->getOpcode() == Instruction::Or) {
+        PropagateEquality(Inst->getOperand(0), CB, RI);
+        PropagateEquality(Inst->getOperand(1), CB, RI);
+      }
+
+      // If we know that this instruction is a NOT instruction, we know that 
+      // the operand is known to be the inverse of whatever the current 
+      // value is.
+      //
+      if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Inst))
+        if (BinaryOperator::isNot(BOp))
+          PropagateEquality(BinaryOperator::getNotArgument(BOp),
+                            ConstantInt::get(Type::Int1Ty, 
+                                             !CB->getZExtValue()), RI);
+
+      // If we know the value of a FCmp instruction, propagate the information
+      // about the relation into this region as well.
+      //
+      if (FCmpInst *FCI = dyn_cast<FCmpInst>(Inst)) {
+        if (CB->getZExtValue()) {  // If we know the condition is true...
+          // Propagate info about the LHS to the RHS & RHS to LHS
+          PropagateRelation(FCI->getPredicate(), FCI->getOperand(0),
+                            FCI->getOperand(1), RI);
+          PropagateRelation(FCI->getSwappedPredicate(),
+                            FCI->getOperand(1), FCI->getOperand(0), RI);
+
+        } else {               // If we know the condition is false...
+          // We know the opposite of the condition is true...
+          FCmpInst::Predicate C = FCI->getInversePredicate();
+
+          PropagateRelation(C, FCI->getOperand(0), FCI->getOperand(1), RI);
+          PropagateRelation(FCmpInst::getSwappedPredicate(C),
+                            FCI->getOperand(1), FCI->getOperand(0), RI);
+        }
+      }
+    
+      // If we know the value of a ICmp instruction, propagate the information
+      // about the relation into this region as well.
+      //
+      if (ICmpInst *ICI = dyn_cast<ICmpInst>(Inst)) {
+        if (CB->getZExtValue()) { // If we know the condition is true...
+          // Propagate info about the LHS to the RHS & RHS to LHS
+          PropagateRelation(ICI->getPredicate(), ICI->getOperand(0),
+                            ICI->getOperand(1), RI);
+          PropagateRelation(ICI->getSwappedPredicate(), ICI->getOperand(1),
+                            ICI->getOperand(1), RI);
+
+        } else {               // If we know the condition is false ...
+          // We know the opposite of the condition is true...
+          ICmpInst::Predicate C = ICI->getInversePredicate();
+
+          PropagateRelation(C, ICI->getOperand(0), ICI->getOperand(1), RI);
+          PropagateRelation(ICmpInst::getSwappedPredicate(C),
+                            ICI->getOperand(1), ICI->getOperand(0), RI);
+        }
+      }
+    }
+  }
+
+  // Propagate information about Op0 to Op1 & visa versa
+  PropagateRelation(ICmpInst::ICMP_EQ, Op0, Op1, RI);
+  PropagateRelation(ICmpInst::ICMP_EQ, Op1, Op0, RI);
+  PropagateRelation(FCmpInst::FCMP_OEQ, Op0, Op1, RI);
+  PropagateRelation(FCmpInst::FCMP_OEQ, Op1, Op0, RI);
+}
+
+
+// PropagateRelation - We know that the specified relation is true in all of the
+// blocks in the specified region.  Propagate the information about Op0 and
+// anything derived from it into this region.
+//
+void CEE::PropagateRelation(unsigned Opcode, Value *Op0,
+                            Value *Op1, RegionInfo &RI) {
+  assert(Op0->getType() == Op1->getType() && "Equal types expected!");
+
+  // Constants are already pretty well understood.  We will apply information
+  // about the constant to Op1 in another call to PropagateRelation.
+  //
+  if (isa<Constant>(Op0)) return;
+
+  // Get the region information for this block to update...
+  ValueInfo &VI = RI.getValueInfo(Op0);
+
+  // Get information about the known relationship between Op0 & Op1
+  Relation &Op1R = VI.getRelation(Op1);
+
+  // Quick bailout for common case if we are reprocessing an instruction...
+  if (Op1R.getRelation() == Opcode)
+    return;
+
+  // If we already have information that contradicts the current information we
+  // are propagating, ignore this info.  Something bad must have happened!
+  //
+  if (Op1R.contradicts(Opcode, VI)) {
+    Op1R.contradicts(Opcode, VI);
+    cerr << "Contradiction found for opcode: "
+         << ((isa<ICmpInst>(Op0)||isa<ICmpInst>(Op1)) ? 
+                  Instruction::getOpcodeName(Instruction::ICmp) :
+                  Instruction::getOpcodeName(Opcode))
+         << "\n";
+    Op1R.print(*cerr.stream());
+    return;
+  }
+
+  // If the information propagated is new, then we want process the uses of this
+  // instruction to propagate the information down to them.
+  //
+  if (Op1R.incorporate(Opcode, VI))
+    UpdateUsersOfValue(Op0, RI);
+}
+
+
+// UpdateUsersOfValue - The information about V in this region has been updated.
+// Propagate this to all consumers of the value.
+//
+void CEE::UpdateUsersOfValue(Value *V, RegionInfo &RI) {
+  for (Value::use_iterator I = V->use_begin(), E = V->use_end();
+       I != E; ++I)
+    if (Instruction *Inst = dyn_cast<Instruction>(*I)) {
+      // If this is an instruction using a value that we know something about,
+      // try to propagate information to the value produced by the
+      // instruction.  We can only do this if it is an instruction we can
+      // propagate information for (a setcc for example), and we only WANT to
+      // do this if the instruction dominates this region.
+      //
+      // If the instruction doesn't dominate this region, then it cannot be
+      // used in this region and we don't care about it.  If the instruction
+      // is IN this region, then we will simplify the instruction before we
+      // get to uses of it anyway, so there is no reason to bother with it
+      // here.  This check is also effectively checking to make sure that Inst
+      // is in the same function as our region (in case V is a global f.e.).
+      //
+      if (DT->properlyDominates(Inst->getParent(), RI.getEntryBlock()))
+        IncorporateInstruction(Inst, RI);
+    }
+}
+
+// IncorporateInstruction - We just updated the information about one of the
+// operands to the specified instruction.  Update the information about the
+// value produced by this instruction
+//
+void CEE::IncorporateInstruction(Instruction *Inst, RegionInfo &RI) {
+  if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
+    // See if we can figure out a result for this instruction...
+    Relation::KnownResult Result = getCmpResult(CI, RI);
+    if (Result != Relation::Unknown) {
+      PropagateEquality(CI, ConstantInt::get(Type::Int1Ty, Result != 0), RI);
+    }
+  }
+}
+
+
+// ComputeReplacements - Some values are known to be equal to other values in a
+// region.  For example if there is a comparison of equality between a variable
+// X and a constant C, we can replace all uses of X with C in the region we are
+// interested in.  We generalize this replacement to replace variables with
+// other variables if they are equal and there is a variable with lower rank
+// than the current one.  This offers a canonicalizing property that exposes
+// more redundancies for later transformations to take advantage of.
+//
+void CEE::ComputeReplacements(RegionInfo &RI) {
+  // Loop over all of the values in the region info map...
+  for (RegionInfo::iterator I = RI.begin(), E = RI.end(); I != E; ++I) {
+    ValueInfo &VI = I->second;
+
+    // If we know that this value is a particular constant, set Replacement to
+    // the constant...
+    Value *Replacement = 0;
+    const APInt * Rplcmnt = VI.getBounds().getSingleElement();
+    if (Rplcmnt)
+      Replacement = ConstantInt::get(*Rplcmnt);
+
+    // If this value is not known to be some constant, figure out the lowest
+    // rank value that it is known to be equal to (if anything).
+    //
+    if (Replacement == 0) {
+      // Find out if there are any equality relationships with values of lower
+      // rank than VI itself...
+      unsigned MinRank = getRank(I->first);
+
+      // Loop over the relationships known about Op0.
+      const std::vector<Relation> &Relationships = VI.getRelationships();
+      for (unsigned i = 0, e = Relationships.size(); i != e; ++i)
+        if (Relationships[i].getRelation() == FCmpInst::FCMP_OEQ) {
+          unsigned R = getRank(Relationships[i].getValue());
+          if (R < MinRank) {
+            MinRank = R;
+            Replacement = Relationships[i].getValue();
+          }
+        }
+        else if (Relationships[i].getRelation() == ICmpInst::ICMP_EQ) {
+          unsigned R = getRank(Relationships[i].getValue());
+          if (R < MinRank) {
+            MinRank = R;
+            Replacement = Relationships[i].getValue();
+          }
+        }
+    }
+
+    // If we found something to replace this value with, keep track of it.
+    if (Replacement)
+      VI.setReplacement(Replacement);
+  }
+}
+
+// SimplifyBasicBlock - Given information about values in region RI, simplify
+// the instructions in the specified basic block.
+//
bool CEE::SimplifyBasicBlock(BasicBlock &BB, const RegionInfo &RI) {
  bool Changed = false;
  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
    // Advance the iterator BEFORE processing Inst: the body below may erase
    // Inst from the block, which would invalidate an iterator still pointing
    // at it.
    Instruction *Inst = I++;

    // Convert instruction arguments to canonical forms...
    Changed |= SimplifyInstruction(Inst, RI);

    if (CmpInst *CI = dyn_cast<CmpInst>(Inst)) {
      // Try to simplify a setcc instruction based on inherited information
      Relation::KnownResult Result = getCmpResult(CI, RI);
      if (Result != Relation::Unknown) {
        DEBUG(cerr << "Replacing icmp with " << Result
                   << " constant: " << *CI);

        // Replace the comparison with the known boolean constant, then
        // delete it.
        CI->replaceAllUsesWith(ConstantInt::get(Type::Int1Ty, (bool)Result));
        // The instruction is now dead, remove it from the program.
        CI->getParent()->getInstList().erase(CI);
        ++NumCmpRemoved;
        Changed = true;
      }
    }
  }

  return Changed;
}
+
+// SimplifyInstruction - Inspect the operands of the instruction, converting
+// them to their canonical form if possible.  This takes care of, for example,
+// replacing a value 'X' with a constant 'C' if the instruction in question is
+// dominated by a true seteq 'X', 'C'.
+//
+// Returns true if any operand was replaced.
+//
+bool CEE::SimplifyInstruction(Instruction *I, const RegionInfo &RI) {
+  bool Changed = false;
+
+  // For each operand, ask the region whether it has a recorded canonical
+  // replacement for that value.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (const ValueInfo *VI = RI.requestValueInfo(I->getOperand(i)))
+      if (Value *Repl = VI->getReplacement()) {
+        // If we know of a replacement with lower rank than Op0, make the
+        // replacement now.
+        DOUT << "In Inst: " << *I << "  Replacing operand #" << i
+             << " with " << *Repl << "\n";
+        I->setOperand(i, Repl);
+        Changed = true;
+        ++NumOperandsCann;
+      }
+
+  return Changed;
+}
+
+// getCmpResult - Try to simplify a cmp instruction based on information
+// inherited from a dominating icmp instruction.  V is one of the operands to
+// the icmp instruction, and VI is the set of information known about it.  We
+// take two cases into consideration here.  If the comparison is against a
+// constant value, we can use the constant range to see if the comparison is
+// possible to succeed.  If it is not a comparison against a constant, we check
+// to see if there is a known relationship between the two values.  If so, we
+// may be able to eliminate the check.
+//
+Relation::KnownResult CEE::getCmpResult(CmpInst *CI,
+                                        const RegionInfo &RI) {
+  Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
+  // unsigned short so it can hold either an ICmpInst or FCmpInst predicate.
+  unsigned short predicate = CI->getPredicate();
+
+  if (isa<Constant>(Op0)) {
+    if (isa<Constant>(Op1)) {
+      if (Constant *Result = ConstantFoldInstruction(CI)) {
+        // Wow, this is easy, directly eliminate the ICmpInst.
+        DEBUG(cerr << "Replacing cmp with constant fold: " << *CI);
+        return cast<ConstantInt>(Result)->getZExtValue()
+          ? Relation::KnownTrue : Relation::KnownFalse;
+      }
+    } else {
+      // We want to swap this instruction so that operand #0 is the constant.
+      // The predicate must be swapped along with the operands so the
+      // comparison keeps the same meaning.
+      std::swap(Op0, Op1);
+      if (isa<ICmpInst>(CI))
+        predicate = cast<ICmpInst>(CI)->getSwappedPredicate();
+      else
+        predicate = cast<FCmpInst>(CI)->getSwappedPredicate();
+    }
+  }
+
+  // Try to figure out what the result of this comparison will be...
+  Relation::KnownResult Result = Relation::Unknown;
+
+  // We have to know something about the relationship to prove anything...
+  if (const ValueInfo *Op0VI = RI.requestValueInfo(Op0)) {
+
+    // At this point, we know that if we have a constant argument that it is in
+    // Op1.  Check to see if we know anything about comparing value with a
+    // constant, and if we can use this info to fold the icmp.
+    //
+    if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+      // Check to see if we already know the result of this comparison...
+      ICmpInst::Predicate ipred = ICmpInst::Predicate(predicate);
+      ConstantRange R = ICmpInst::makeConstantRange(ipred, C->getValue());
+      ConstantRange Int = R.intersectWith(Op0VI->getBounds());
+
+      // If the intersection of the two ranges is empty, then the condition
+      // could never be true!
+      //
+      if (Int.isEmptySet()) {
+        Result = Relation::KnownFalse;
+
+      // Otherwise, if VI.getBounds() (the possible values) is a subset of R
+      // (the allowed values) then we know that the condition must always be
+      // true!
+      //
+      } else if (Int == Op0VI->getBounds()) {
+        Result = Relation::KnownTrue;
+      }
+    } else {
+      // If we are here, we know that the second argument is not a constant
+      // integral.  See if we know anything about Op0 & Op1 that allows us to
+      // fold this anyway.
+      //
+      // Do we have value information about Op0 and a relation to Op1?
+      if (const Relation *Op2R = Op0VI->requestRelation(Op1))
+        Result = Op2R->getImpliedResult(predicate);
+    }
+  }
+  return Result;
+}
+
+//===----------------------------------------------------------------------===//
+//  Relation Implementation
+//===----------------------------------------------------------------------===//
+
+// contradicts - Return true if the relationship specified by the operand
+// contradicts already known information.
+//
+// Op is an ICmpInst or FCmpInst predicate proposed for Val; Rel is the
+// relation already recorded.
+//
+bool Relation::contradicts(unsigned Op,
+                           const ValueInfo &VI) const {
+  assert (Op != Instruction::Add && "Invalid relation argument!");
+
+  // If this is a relationship with a constant, make sure that this relationship
+  // does not contradict properties known about the bounds of the constant.
+  //
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Val))
+    if (Op >= ICmpInst::FIRST_ICMP_PREDICATE && 
+        Op <= ICmpInst::LAST_ICMP_PREDICATE) {
+      ICmpInst::Predicate ipred = ICmpInst::Predicate(Op);
+      // An empty intersection means no value could satisfy both the proposed
+      // predicate and the known bounds, i.e. a contradiction.
+      if (ICmpInst::makeConstantRange(ipred, C->getValue())
+                    .intersectWith(VI.getBounds()).isEmptySet())
+        return true;
+    }
+
+  // Otherwise, check the proposed predicate against the known relation.
+  switch (Rel) {
+  default: assert(0 && "Unknown Relationship code!");
+  case Instruction::Add: return false;  // Nothing known, nothing contradicts
+  case ICmpInst::ICMP_EQ:
+    return Op == ICmpInst::ICMP_ULT || Op == ICmpInst::ICMP_SLT ||
+           Op == ICmpInst::ICMP_UGT || Op == ICmpInst::ICMP_SGT ||
+           Op == ICmpInst::ICMP_NE;
+  case ICmpInst::ICMP_NE:  return Op == ICmpInst::ICMP_EQ;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE: return Op == ICmpInst::ICMP_UGT ||
+                                  Op == ICmpInst::ICMP_SGT;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE: return Op == ICmpInst::ICMP_ULT ||
+                                  Op == ICmpInst::ICMP_SLT;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    return Op == ICmpInst::ICMP_EQ  || Op == ICmpInst::ICMP_UGT ||
+           Op == ICmpInst::ICMP_SGT || Op == ICmpInst::ICMP_UGE ||
+           Op == ICmpInst::ICMP_SGE;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    return Op == ICmpInst::ICMP_EQ  || Op == ICmpInst::ICMP_ULT ||
+           Op == ICmpInst::ICMP_SLT || Op == ICmpInst::ICMP_ULE ||
+           Op == ICmpInst::ICMP_SLE;
+  case FCmpInst::FCMP_OEQ:
+    return Op == FCmpInst::FCMP_OLT || Op == FCmpInst::FCMP_OGT ||
+           Op == FCmpInst::FCMP_ONE;
+  case FCmpInst::FCMP_ONE: return Op == FCmpInst::FCMP_OEQ;
+  case FCmpInst::FCMP_OLE: return Op == FCmpInst::FCMP_OGT;
+  case FCmpInst::FCMP_OGE: return Op == FCmpInst::FCMP_OLT;
+  case FCmpInst::FCMP_OLT:
+    return Op == FCmpInst::FCMP_OEQ || Op == FCmpInst::FCMP_OGT ||
+           Op == FCmpInst::FCMP_OGE;
+  case FCmpInst::FCMP_OGT:
+    return Op == FCmpInst::FCMP_OEQ || Op == FCmpInst::FCMP_OLT ||
+           Op == FCmpInst::FCMP_OLE;
+  }
+}
+
+// incorporate - Incorporate information in the argument into this relation
+// entry.  This assumes that the information doesn't contradict itself.  If any
+// new information is gained, true is returned, otherwise false is returned to
+// indicate that nothing was updated.
+//
+bool Relation::incorporate(unsigned Op, ValueInfo &VI) {
+  assert(!contradicts(Op, VI) &&
+         "Cannot incorporate contradictory information!");
+
+  // If this is a relationship with a constant, make sure that we update the
+  // range that is possible for the value to have...
+  //
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Val))
+    if (Op >= ICmpInst::FIRST_ICMP_PREDICATE &&
+        Op <= ICmpInst::LAST_ICMP_PREDICATE) {
+      ICmpInst::Predicate ipred = ICmpInst::Predicate(Op);
+      VI.getBounds() =
+        ICmpInst::makeConstantRange(ipred, C->getValue())
+                  .intersectWith(VI.getBounds());
+    }
+
+  switch (Rel) {
+  default: assert(0 && "Unknown prior value!");
+  case Instruction::Add:   Rel = Op; return true;
+  case ICmpInst::ICMP_EQ:
+  case ICmpInst::ICMP_NE:
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT: return false;  // Nothing is more precise
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+    // EQ/ULT/SLT are strictly stronger than <=; NE sharpens <= into <.
+    if (Op == ICmpInst::ICMP_EQ  || Op == ICmpInst::ICMP_ULT ||
+        Op == ICmpInst::ICMP_SLT) {
+      Rel = Op;
+      return true;
+    } else if (Op == ICmpInst::ICMP_NE) {
+      Rel = Rel == ICmpInst::ICMP_ULE ? ICmpInst::ICMP_ULT :
+            ICmpInst::ICMP_SLT;
+      return true;
+    }
+    return false;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+    // EQ/UGT/SGT are strictly stronger than >=; NE sharpens >= into >.
+    // (The UGT test previously lacked "Op ==", so the condition was always
+    // true and Rel could be clobbered with an unrelated predicate.)
+    if (Op == ICmpInst::ICMP_EQ  || Op == ICmpInst::ICMP_UGT ||
+        Op == ICmpInst::ICMP_SGT) {
+      Rel = Op;
+      return true;
+    } else if (Op == ICmpInst::ICMP_NE) {
+      Rel = Rel == ICmpInst::ICMP_UGE ? ICmpInst::ICMP_UGT :
+            ICmpInst::ICMP_SGT;
+      return true;
+    }
+    return false;
+  case FCmpInst::FCMP_OEQ: return false;  // Nothing is more precise
+  case FCmpInst::FCMP_ONE: return false;  // Nothing is more precise
+  case FCmpInst::FCMP_OLT: return false;  // Nothing is more precise
+  case FCmpInst::FCMP_OGT: return false;  // Nothing is more precise
+  case FCmpInst::FCMP_OLE:
+    // OEQ/OLT are strictly stronger than <=; ONE sharpens <= into <.
+    if (Op == FCmpInst::FCMP_OEQ || Op == FCmpInst::FCMP_OLT) {
+      Rel = Op;
+      return true;
+    } else if (Op == FCmpInst::FCMP_ONE) {
+      Rel = FCmpInst::FCMP_OLT;
+      return true;
+    }
+    return false;
+  case FCmpInst::FCMP_OGE:
+    // OEQ/OGT are strictly stronger than >=; ONE sharpens >= into >.
+    // (A stray "return Op == FCMP_OLT;" previously made this refinement
+    // unreachable; OLT vs OGE is rejected by the contradicts() assert above.)
+    if (Op == FCmpInst::FCMP_OEQ || Op == FCmpInst::FCMP_OGT) {
+      Rel = Op;
+      return true;
+    } else if (Op == FCmpInst::FCMP_ONE) {
+      Rel = FCmpInst::FCMP_OGT;
+      return true;
+    }
+    return false;
+  }
+}
+
+// getImpliedResult - If this relationship between two values implies that
+// the specified relationship is true or false, return that.  If we cannot
+// determine the result required, return Unknown.
+//
+Relation::KnownResult
+Relation::getImpliedResult(unsigned Op) const {
+  // An identical predicate is trivially true; the inverse predicate is
+  // trivially false.
+  if (Rel == Op) return KnownTrue;
+  if (Op >= ICmpInst::FIRST_ICMP_PREDICATE && 
+      Op <= ICmpInst::LAST_ICMP_PREDICATE) {
+    if (Rel == unsigned(ICmpInst::getInversePredicate(ICmpInst::Predicate(Op))))
+      return KnownFalse;
+  } else if (Op <= FCmpInst::LAST_FCMP_PREDICATE) {
+    if (Rel == unsigned(FCmpInst::getInversePredicate(FCmpInst::Predicate(Op))))
+    return KnownFalse;
+  }
+
+  // Otherwise see which weaker predicates the known relation implies or
+  // rules out.
+  switch (Rel) {
+  default: assert(0 && "Unknown prior value!");
+  case ICmpInst::ICMP_EQ:
+    if (Op == ICmpInst::ICMP_ULE || Op == ICmpInst::ICMP_SLE || 
+        Op == ICmpInst::ICMP_UGE || Op == ICmpInst::ICMP_SGE) return KnownTrue;
+    if (Op == ICmpInst::ICMP_ULT || Op == ICmpInst::ICMP_SLT || 
+        Op == ICmpInst::ICMP_UGT || Op == ICmpInst::ICMP_SGT) return KnownFalse;
+    break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+    if (Op == ICmpInst::ICMP_ULE || Op == ICmpInst::ICMP_SLE ||
+        Op == ICmpInst::ICMP_NE) return KnownTrue;
+    if (Op == ICmpInst::ICMP_EQ) return KnownFalse;
+    break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+    if (Op == ICmpInst::ICMP_UGE || Op == ICmpInst::ICMP_SGE ||
+        Op == ICmpInst::ICMP_NE) return KnownTrue;
+    if (Op == ICmpInst::ICMP_EQ) return KnownFalse;
+    break;
+  case FCmpInst::FCMP_OEQ:
+    if (Op == FCmpInst::FCMP_OLE || Op == FCmpInst::FCMP_OGE) return KnownTrue;
+    if (Op == FCmpInst::FCMP_OLT || Op == FCmpInst::FCMP_OGT) return KnownFalse;
+    break;
+  case FCmpInst::FCMP_OLT:
+    if (Op == FCmpInst::FCMP_ONE || Op == FCmpInst::FCMP_OLE) return KnownTrue;
+    if (Op == FCmpInst::FCMP_OEQ) return KnownFalse;
+    break;
+  case FCmpInst::FCMP_OGT:
+    if (Op == FCmpInst::FCMP_ONE || Op == FCmpInst::FCMP_OGE) return KnownTrue;
+    if (Op == FCmpInst::FCMP_OEQ) return KnownFalse;
+    break;
+  // These relations imply nothing beyond the equal/inverse checks above.
+  case ICmpInst::ICMP_NE:
+  case ICmpInst::ICMP_SLE:
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+  case FCmpInst::FCMP_ONE:
+  case FCmpInst::FCMP_OLE:
+  case FCmpInst::FCMP_OGE:
+  case FCmpInst::FCMP_FALSE:
+  case FCmpInst::FCMP_ORD:
+  case FCmpInst::FCMP_UNO:
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_UGT:
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_ULT:
+  case FCmpInst::FCMP_ULE:
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_TRUE:
+    break;
+  }
+  return Unknown;
+}
+
+
+//===----------------------------------------------------------------------===//
+// Printing Support...
+//===----------------------------------------------------------------------===//
+
+// print - Implement the standard print form to print out analysis information.
+// M is unused; it is part of the standard print interface.
+void CEE::print(std::ostream &O, const Module *M) const {
+  O << "\nPrinting Correlated Expression Info:\n";
+  // Dump the region info for every basic block we have data for.
+  for (std::map<BasicBlock*, RegionInfo>::const_iterator I =
+         RegionInfoMap.begin(), E = RegionInfoMap.end(); I != E; ++I)
+    I->second.print(O);
+}
+
+// print - Output information about this region...
+void RegionInfo::print(std::ostream &OS) const {
+  // Nothing interesting to print for regions with no value information.
+  if (ValueMap.empty()) return;
+
+  OS << " RegionInfo for basic block: " << BB->getName() << "\n";
+  for (std::map<Value*, ValueInfo>::const_iterator
+         I = ValueMap.begin(), E = ValueMap.end(); I != E; ++I)
+    I->second.print(OS, I->first);
+  OS << "\n";
+}
+
+// print - Output information about this value relation...
+// V may be null (as from dump()), in which case the header line is skipped.
+void ValueInfo::print(std::ostream &OS, Value *V) const {
+  // Nothing interesting to print if no relationships were recorded.
+  if (Relationships.empty()) return;
+
+  if (V) {
+    OS << "  ValueInfo for: ";
+    WriteAsOperand(OS, V);
+  }
+  OS << "\n    Bounds = " << Bounds << "\n";
+  if (Replacement) {
+    OS << "    Replacement = ";
+    WriteAsOperand(OS, Replacement);
+    OS << "\n";
+  }
+  for (unsigned i = 0, e = Relationships.size(); i != e; ++i)
+    Relationships[i].print(OS);
+}
+
+// print - Output this relation to the specified stream
+void Relation::print(std::ostream &OS) const {
+  OS << "    is ";
+  // Map the predicate to a human-readable operator; signedness and
+  // ordered/unordered distinctions are collapsed for display purposes.
+  switch (Rel) {
+  default:           OS << "*UNKNOWN*"; break;
+  case ICmpInst::ICMP_EQ:
+  case FCmpInst::FCMP_ORD:
+  case FCmpInst::FCMP_UEQ:
+  case FCmpInst::FCMP_OEQ: OS << "== "; break;
+  case ICmpInst::ICMP_NE:
+  case FCmpInst::FCMP_UNO:
+  case FCmpInst::FCMP_UNE:
+  case FCmpInst::FCMP_ONE: OS << "!= "; break;
+  case ICmpInst::ICMP_ULT:
+  case ICmpInst::ICMP_SLT:
+  case FCmpInst::FCMP_ULT:
+  case FCmpInst::FCMP_OLT: OS << "< "; break;
+  case ICmpInst::ICMP_UGT:
+  case ICmpInst::ICMP_SGT:
+  case FCmpInst::FCMP_UGT:
+  case FCmpInst::FCMP_OGT: OS << "> "; break;
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+  case FCmpInst::FCMP_ULE:
+  case FCmpInst::FCMP_OLE: OS << "<= "; break;
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+  case FCmpInst::FCMP_UGE:
+  case FCmpInst::FCMP_OGE: OS << ">= "; break;
+  }
+
+  WriteAsOperand(OS, Val);
+  OS << "\n";
+}
+
+// Don't inline these methods or else we won't be able to call them from GDB!
+// Debugging aids: print the object to stderr via the shared cerr stream.
+void Relation::dump() const { print(*cerr.stream()); }
+void ValueInfo::dump() const { print(*cerr.stream(), 0); }
+void RegionInfo::dump() const { print(*cerr.stream()); }
diff --git a/lib/Transforms/Scalar/DCE.cpp b/lib/Transforms/Scalar/DCE.cpp
new file mode 100644
index 0000000..163c2b0
--- /dev/null
+++ b/lib/Transforms/Scalar/DCE.cpp
@@ -0,0 +1,130 @@
+//===- DCE.cpp - Code to perform dead code elimination --------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead inst elimination and dead code elimination.
+//
+// Dead Inst Elimination performs a single pass over the function removing
+// instructions that are obviously dead.  Dead Code Elimination is similar, but
+// it rechecks instructions that were used by removed instructions to see if
+// they are newly dead.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Instruction.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <set>
+using namespace llvm;
+
+STATISTIC(DIEEliminated, "Number of insts removed by DIE pass");
+STATISTIC(DCEEliminated, "Number of insts removed");
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadInstElimination pass implementation
+  //
+  // A single pass over each basic block that deletes trivially dead
+  // instructions; unlike DCE below, it does not revisit the operands of
+  // removed instructions.
+  //
+  struct VISIBILITY_HIDDEN DeadInstElimination : public BasicBlockPass {
+    static char ID; // Pass identification, replacement for typeid
+    DeadInstElimination() : BasicBlockPass(intptr_t(&ID)) {}
+    virtual bool runOnBasicBlock(BasicBlock &BB) {
+      bool Changed = false;
+      // When dceInstruction deletes the instruction it must leave DI valid;
+      // note DI is only advanced here on the non-deleting path.
+      for (BasicBlock::iterator DI = BB.begin(); DI != BB.end(); )
+        if (dceInstruction(DI)) {
+          Changed = true;
+          ++DIEEliminated;
+        } else
+          ++DI;
+      return Changed;
+    }
+
+    // Only instructions are deleted, so the CFG shape is preserved.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+
+  char DeadInstElimination::ID = 0;
+  RegisterPass<DeadInstElimination> X("die", "Dead Instruction Elimination");
+}
+
+// createDeadInstEliminationPass - Public factory for the "die" pass.
+Pass *llvm::createDeadInstEliminationPass() {
+  return new DeadInstElimination();
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  // DeadCodeElimination pass implementation
+  //
+  // Worklist-driven DCE: when an instruction is removed, its operands are
+  // reconsidered, so chains of dead computation are deleted.
+  //
+  // VISIBILITY_HIDDEN added for consistency with the other passes in this
+  // patch (DeadInstElimination, DSE, FDSE); indentation normalized.
+  struct VISIBILITY_HIDDEN DCE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DCE() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+    // Only instructions are deleted, so the CFG shape is preserved.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+
+  char DCE::ID = 0;
+  RegisterPass<DCE> Y("dce", "Dead Code Elimination");
+}
+
+// runOnFunction - Seed a worklist with every instruction, then repeatedly
+// delete trivially dead instructions, re-queuing their instruction operands
+// since they may become dead in turn.  Returns true if anything was removed.
+bool DCE::runOnFunction(Function &F) {
+  // Start out with all of the instructions in the worklist...
+  std::vector<Instruction*> WorkList;
+  for (inst_iterator i = inst_begin(F), e = inst_end(F); i != e; ++i)
+    WorkList.push_back(&*i);
+
+  // Loop over the worklist finding instructions that are dead.  If they are
+  // dead make them drop all of their uses, making other instructions
+  // potentially dead, and work until the worklist is empty.
+  //
+  bool MadeChange = false;
+  while (!WorkList.empty()) {
+    Instruction *I = WorkList.back();
+    WorkList.pop_back();
+
+    if (isInstructionTriviallyDead(I)) {       // If the instruction is dead.
+      // Loop over all of the values that the instruction uses, if there are
+      // instructions being used, add them to the worklist, because they might
+      // go dead after this one is removed.
+      //
+      for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI)
+        if (Instruction *Used = dyn_cast<Instruction>(*OI))
+          WorkList.push_back(Used);
+
+      // Remove the instruction.
+      I->eraseFromParent();
+
+      // Purge any other copies of the now-dangling pointer from the worklist.
+      // Use erase's return value here: the old code decremented the stale end
+      // and element iterators after erase(), which vector::erase invalidates.
+      for (std::vector<Instruction*>::iterator WI = WorkList.begin();
+           WI != WorkList.end(); )
+        if (*WI == I)
+          WI = WorkList.erase(WI);
+        else
+          ++WI;
+
+      MadeChange = true;
+      ++DCEEliminated;
+    }
+  }
+  return MadeChange;
+}
+
+// createDeadCodeEliminationPass - Public factory for the "dce" pass.
+FunctionPass *llvm::createDeadCodeEliminationPass() {
+  return new DCE();
+}
+
diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp
new file mode 100644
index 0000000..665d538
--- /dev/null
+++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -0,0 +1,179 @@
+//===- DeadStoreElimination.cpp - Dead Store Elimination ------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal.  Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "dse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumStores, "Number of stores deleted");
+STATISTIC(NumOther , "Number of other instrs removed");
+
+namespace {
+  struct VISIBILITY_HIDDEN DSE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    DSE() : FunctionPass((intptr_t)&ID) {}
+
+    // Run dead-store elimination independently over each basic block.
+    virtual bool runOnFunction(Function &F) {
+      bool Changed = false;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        Changed |= runOnBasicBlock(*I);
+      return Changed;
+    }
+
+    bool runOnBasicBlock(BasicBlock &BB);
+
+    // DeleteDeadInstructionChains - Delete I if it is trivially dead, queuing
+    // its instruction operands in DeadInsts for later consideration.
+    void DeleteDeadInstructionChains(Instruction *I,
+                                     SetVector<Instruction*> &DeadInsts);
+
+    // getAnalysisUsage - We require TargetData (for store sizes) and alias
+    // analysis; only instructions are deleted, so the CFG is preserved.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<TargetData>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+    }
+  };
+  char DSE::ID = 0;
+  RegisterPass<DSE> X("dse", "Dead Store Elimination");
+}
+
+// createDeadStoreEliminationPass - Public factory for the "dse" pass.
+FunctionPass *llvm::createDeadStoreEliminationPass() { return new DSE(); }
+
+// runOnBasicBlock - Scan BB backwards, tracking in KillLocs the memory
+// locations that are guaranteed to be overwritten (or freed) before any
+// later read.  A store into such a location is dead and is deleted.
+bool DSE::runOnBasicBlock(BasicBlock &BB) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  AliasSetTracker KillLocs(AA);
+
+  // If this block ends in a return, unwind, unreachable, and eventually
+  // tailcall, then all allocas are dead at its end.
+  if (BB.getTerminator()->getNumSuccessors() == 0) {
+    BasicBlock *Entry = BB.getParent()->begin();
+    for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+        // Use the full object size when statically known; ~0U means unknown.
+        unsigned Size = ~0U;
+        if (!AI->isArrayAllocation() &&
+            AI->getType()->getElementType()->isSized())
+          Size = (unsigned)TD.getTypeSize(AI->getType()->getElementType());
+        KillLocs.add(AI, Size);
+      }
+  }
+
+  // PotentiallyDeadInsts - Deleting dead stores from the program can make other
+  // instructions die if they were only used as operands to stores.  Keep track
+  // of the operands to stores so that we can try deleting them at the end of
+  // the traversal.
+  SetVector<Instruction*> PotentiallyDeadInsts;
+
+  bool MadeChange = false;
+  // Walk backwards so KillLocs always describes what happens *after* the
+  // instruction currently being visited.
+  for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ) {
+    Instruction *I = --BBI;   // Keep moving iterator backwards
+
+    // If this is a free instruction, it makes the free'd location dead!
+    if (FreeInst *FI = dyn_cast<FreeInst>(I)) {
+      // Free instructions make any stores to the free'd location dead.
+      KillLocs.add(FI);
+      continue;
+    }
+
+    if (!isa<StoreInst>(I) || cast<StoreInst>(I)->isVolatile()) {
+      // If this is a vaarg instruction, it reads its operand.  We don't model
+      // it correctly, so just conservatively remove all entries.
+      if (isa<VAArgInst>(I)) {
+        KillLocs.clear();
+        continue;
+      }      
+      
+      // If this is a non-store instruction, it makes everything referenced no
+      // longer killed.  Remove anything aliased from the alias set tracker.
+      KillLocs.remove(I);
+      continue;
+    }
+
+    // If this is a non-volatile store instruction, and if it is already in
+    // the stored location is already in the tracker, then this is a dead
+    // store.  We can just delete it here, but while we're at it, we also
+    // delete any trivially dead expression chains.
+    unsigned ValSize = (unsigned)TD.getTypeSize(I->getOperand(0)->getType());
+    Value *Ptr = I->getOperand(1);
+
+    // Dead only if a tracked later access must-aliases the pointer and covers
+    // at least as many bytes as this store writes.
+    if (AliasSet *AS = KillLocs.getAliasSetForPointerIfExists(Ptr, ValSize))
+      for (AliasSet::iterator ASI = AS->begin(), E = AS->end(); ASI != E; ++ASI)
+        if (ASI.getSize() >= ValSize &&  // Overwriting all of this store.
+            AA.alias(ASI.getPointer(), ASI.getSize(), Ptr, ValSize)
+               == AliasAnalysis::MustAlias) {
+          // If we found a must alias in the killed set, then this store really
+          // is dead.  Remember that the various operands of the store now have
+          // fewer users.  At the end we will see if we can delete any values
+          // that are dead as part of the store becoming dead.
+          if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(0)))
+            PotentiallyDeadInsts.insert(Op);
+          if (Instruction *Op = dyn_cast<Instruction>(Ptr))
+            PotentiallyDeadInsts.insert(Op);
+
+          // Delete it now.
+          ++BBI;                        // Don't invalidate iterator.
+          BB.getInstList().erase(I);    // Nuke the store!
+          ++NumStores;
+          MadeChange = true;
+          goto BigContinue;
+        }
+
+    // Otherwise, this is a non-dead store just add it to the set of dead
+    // locations.
+    KillLocs.add(cast<StoreInst>(I));
+  BigContinue:;
+  }
+
+  // Finally, try to delete any instruction chains that died with the stores.
+  while (!PotentiallyDeadInsts.empty()) {
+    Instruction *I = PotentiallyDeadInsts.back();
+    PotentiallyDeadInsts.pop_back();
+    DeleteDeadInstructionChains(I, PotentiallyDeadInsts);
+  }
+  return MadeChange;
+}
+
+// DeleteDeadInstructionChains - If I is trivially dead, drop its operands
+// (queuing any instruction operands in DeadInsts for later consideration)
+// and erase it from its parent block.
+void DSE::DeleteDeadInstructionChains(Instruction *I,
+                                      SetVector<Instruction*> &DeadInsts) {
+  // Instruction must be dead.
+  if (!I->use_empty() || !isInstructionTriviallyDead(I)) return;
+
+  // Let the alias analysis know that we have nuked a value.
+  getAnalysis<AliasAnalysis>().deleteValue(I);
+
+  // See if this made any operands dead.  We do it this way in case the
+  // instruction uses the same operand twice.  We don't want to delete a
+  // value then reference it.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
+    if (Instruction *Op = dyn_cast<Instruction>(I->getOperand(i)))
+      DeadInsts.insert(Op);      // Attempt to nuke it later.
+    I->setOperand(i, 0);         // Drop from the operand list.
+  }
+
+  I->eraseFromParent();
+  ++NumOther;
+}
diff --git a/lib/Transforms/Scalar/FastDSE.cpp b/lib/Transforms/Scalar/FastDSE.cpp
new file mode 100644
index 0000000..72857b9
--- /dev/null
+++ b/lib/Transforms/Scalar/FastDSE.cpp
@@ -0,0 +1,387 @@
+//===- FastDSE.cpp - Fast Dead Store Elimination --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Owen Anderson and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a trivial dead store elimination that only considers
+// basic-block local redundant stores.
+//
+// FIXME: This should eventually be extended to be a post-dominator tree
+// traversal.  Doing so would be pretty trivial.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "fdse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/MemoryDependenceAnalysis.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumFastStores, "Number of stores deleted");
+STATISTIC(NumFastOther , "Number of other instrs removed");
+
+namespace {
+  /// FDSE - A fast, basic-block-local dead store elimination pass.  It
+  /// deletes stores that are overwritten before they can be read, and stores
+  /// to stack allocations that are provably dead at a function exit block.
+  struct VISIBILITY_HIDDEN FDSE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    FDSE() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F) {
+      // The pass is purely block-local, so each block is handled on its own.
+      bool Changed = false;
+      for (Function::iterator I = F.begin(), E = F.end(); I != E; ++I)
+        Changed |= runOnBasicBlock(*I);
+      return Changed;
+    }
+
+    bool runOnBasicBlock(BasicBlock &BB);
+    bool handleFreeWithNonTrivialDependency(FreeInst* F, Instruction* dependency,
+                                            SetVector<Instruction*>& possiblyDead);
+    bool handleEndBlock(BasicBlock& BB, SetVector<Instruction*>& possiblyDead);
+    bool RemoveUndeadPointers(Value* pointer, unsigned pointerSize,
+                              BasicBlock::iterator& BBI,
+                              SmallPtrSet<AllocaInst*, 4>& deadPointers, 
+                              SetVector<Instruction*>& possiblyDead);
+    void DeleteDeadInstructionChains(Instruction *I,
+                                     SetVector<Instruction*> &DeadInsts);
+    // TranslatePointerBitCasts - Strip bitcasts and GEPs off of v, in place,
+    // so that it refers to the underlying pointer (e.g. the original alloca).
+    void TranslatePointerBitCasts(Value*& v) {
+      assert(isa<PointerType>(v->getType()) && "Translating a non-pointer type?");
+      
+      // See through pointer-to-pointer bitcasts
+      while (isa<BitCastInst>(v) || isa<GetElementPtrInst>(v))
+        if (BitCastInst* C = dyn_cast<BitCastInst>(v))
+          v = C->getOperand(0);
+        else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(v))
+          v = G->getOperand(0);
+    }
+
+    // getAnalysisUsage - This pass needs target data (for type sizes), alias
+    // analysis, and memory dependence information.  It never modifies the
+    // CFG, and it keeps the preserved analyses up to date as it deletes
+    // instructions.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<TargetData>();
+      AU.addRequired<AliasAnalysis>();
+      AU.addRequired<MemoryDependenceAnalysis>();
+      AU.addPreserved<AliasAnalysis>();
+      AU.addPreserved<MemoryDependenceAnalysis>();
+    }
+  };
+  char FDSE::ID = 0;
+  RegisterPass<FDSE> X("fdse", "Fast Dead Store Elimination");
+}
+
+FunctionPass *llvm::createFastDeadStoreEliminationPass() { return new FDSE(); }
+
+/// runOnBasicBlock - Perform a top-down scan of BB, deleting stores that are
+/// provably overwritten by a later store to the same pointer before any
+/// intervening read, then run end-of-block and trivial-DCE cleanups.
+bool FDSE::runOnBasicBlock(BasicBlock &BB) {
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  
+  // Record the last-seen store to this pointer
+  DenseMap<Value*, StoreInst*> lastStore;
+  // Record instructions possibly made dead by deleting a store
+  SetVector<Instruction*> possiblyDead;
+  
+  bool MadeChange = false;
+  
+  // Do a top-down walk on the BB
+  for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ++BBI) {
+    // If we find a store or a free...
+    if (isa<StoreInst>(BBI) || isa<FreeInst>(BBI)) {
+      Value* pointer = 0;
+      if (StoreInst* S = dyn_cast<StoreInst>(BBI))
+        pointer = S->getPointerOperand();
+      else if (FreeInst* F = dyn_cast<FreeInst>(BBI))
+        pointer = F->getPointerOperand();
+      
+      assert(pointer && "Not a free or a store?");
+      
+      StoreInst*& last = lastStore[pointer];
+      bool deletedStore = false;
+      
+      // ... to a pointer that has been stored to before...
+      if (last) {
+        
+        Instruction* dep = MD.getDependency(BBI);
+        
+        // ... and no other memory dependencies are between them....
+        // Note: None and NonLocal are sentinel values used by
+        // MemoryDependenceAnalysis, not real instructions.
+        while (dep != MemoryDependenceAnalysis::None &&
+               dep != MemoryDependenceAnalysis::NonLocal &&
+               isa<StoreInst>(dep)) {
+          if (dep == last) {
+            
+            // Remove it!
+            MD.removeInstruction(last);
+          
+            // DCE instructions only used to calculate that store
+            if (Instruction* D = dyn_cast<Instruction>(last->getOperand(0)))
+              possiblyDead.insert(D);
+            if (Instruction* D = dyn_cast<Instruction>(last->getOperand(1)))
+              possiblyDead.insert(D);
+          
+            last->eraseFromParent();
+            NumFastStores++;
+            deletedStore = true;
+            MadeChange = true;
+            
+            break;
+          } else {
+            // An intervening store that isn't the remembered one; walk the
+            // dependency chain further back.
+            dep = MD.getDependency(BBI, dep);
+          }
+        }
+      }
+      
+      // Handle frees whose dependencies are non-trivial
+      if (FreeInst* F = dyn_cast<FreeInst>(BBI))
+        if (!deletedStore)
+          MadeChange |= handleFreeWithNonTrivialDependency(F, MD.getDependency(F),
+                                                           possiblyDead);
+      
+      // Update our most-recent-store map
+      if (StoreInst* S = dyn_cast<StoreInst>(BBI))
+        last = S;
+      else
+        last = 0;
+    }
+  }
+  
+  // If this block ends in a return, unwind, unreachable, and eventually
+  // tailcall, then all allocas are dead at its end.
+  if (BB.getTerminator()->getNumSuccessors() == 0)
+    MadeChange |= handleEndBlock(BB, possiblyDead);
+  
+  // Do a trivial DCE
+  while (!possiblyDead.empty()) {
+    Instruction *I = possiblyDead.back();
+    possiblyDead.pop_back();
+    DeleteDeadInstructionChains(I, possiblyDead);
+  }
+  
+  return MadeChange;
+}
+
+/// handleFreeWithNonTrivialDependency - Handle frees of entire structures whose
+/// dependency is a store to a field of that structure: a store whose address
+/// must-aliases the freed pointer can never be read again, so it is dead.
+bool FDSE::handleFreeWithNonTrivialDependency(FreeInst* F, Instruction* dep,
+                                              SetVector<Instruction*>& possiblyDead) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  
+  // The sentinel values mean there is no local store to examine.
+  if (dep == MemoryDependenceAnalysis::None ||
+      dep == MemoryDependenceAnalysis::NonLocal)
+    return false;
+  
+  StoreInst* dependency = dyn_cast<StoreInst>(dep);
+  if (!dependency)
+    return false;
+  
+  Value* depPointer = dependency->getPointerOperand();
+  // Size in bytes of the value being stored.
+  unsigned depPointerSize = TD.getTypeSize(dependency->getOperand(0)->getType());
+  
+  // Check for aliasing; ~0UL is used as an "unknown / entire object" size
+  // for the freed pointer.
+  AliasAnalysis::AliasResult A = AA.alias(F->getPointerOperand(), ~0UL,
+                                          depPointer, depPointerSize);
+    
+  if (A == AliasAnalysis::MustAlias) {
+    // Remove it!
+    MD.removeInstruction(dependency);
+
+    // DCE instructions only used to calculate that store
+    if (Instruction* D = dyn_cast<Instruction>(dependency->getOperand(0)))
+      possiblyDead.insert(D);
+    if (Instruction* D = dyn_cast<Instruction>(dependency->getOperand(1)))
+      possiblyDead.insert(D);
+
+    dependency->eraseFromParent();
+    NumFastStores++;
+    return true;
+  }
+  
+  return false;
+}
+
+/// handleEndBlock - Remove dead stores to stack-allocated locations in the function
+/// end block.  The block is scanned backwards from the terminator; an alloca is
+/// "dead" until some instruction that may read it is encountered.
+bool FDSE::handleEndBlock(BasicBlock& BB, SetVector<Instruction*>& possiblyDead) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+  
+  bool MadeChange = false;
+  
+  // Pointers alloca'd in this function are dead in the end block
+  SmallPtrSet<AllocaInst*, 4> deadPointers;
+  
+  // Find all of the alloca'd pointers in the entry block
+  BasicBlock *Entry = BB.getParent()->begin();
+  for (BasicBlock::iterator I = Entry->begin(), E = Entry->end(); I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
+      deadPointers.insert(AI);
+  
+  // Scan the basic block backwards
+  for (BasicBlock::iterator BBI = BB.end(); BBI != BB.begin(); ){
+    --BBI;
+    
+    // Once every alloca has been proven live, nothing more can be removed.
+    if (deadPointers.empty())
+      break;
+    
+    Value* killPointer = 0;
+    unsigned killPointerSize = 0;
+    
+    // If we find a store whose pointer is dead...
+    if (StoreInst* S = dyn_cast<StoreInst>(BBI)) {
+      Value* pointerOperand = S->getPointerOperand();
+      // See through pointer-to-pointer bitcasts
+      TranslatePointerBitCasts(pointerOperand);
+      
+      if (deadPointers.count(pointerOperand)){
+        // Remove it!
+        MD.removeInstruction(S);
+        
+        // DCE instructions only used to calculate that store
+        if (Instruction* D = dyn_cast<Instruction>(S->getOperand(0)))
+          possiblyDead.insert(D);
+        if (Instruction* D = dyn_cast<Instruction>(S->getOperand(1)))
+          possiblyDead.insert(D);
+        
+        // Step past S before erasing it so the --BBI at the loop top lands
+        // on the instruction that preceded it.
+        BBI++;
+        S->eraseFromParent();
+        NumFastStores++;
+        MadeChange = true;
+      }
+    
+    // If we encounter a use of the pointer, it is no longer considered dead
+    } else if (LoadInst* L = dyn_cast<LoadInst>(BBI)) {
+      killPointer = L->getPointerOperand();
+      killPointerSize = TD.getTypeSize(L->getType());
+    } else if (VAArgInst* V = dyn_cast<VAArgInst>(BBI)) {
+      killPointer = V->getOperand(0);
+      killPointerSize = TD.getTypeSize(V->getType());
+    } else if (FreeInst* F = dyn_cast<FreeInst>(BBI)) {
+      // ~0UL == unknown size: treat the free as touching the whole object.
+      killPointer = F->getPointerOperand();
+      killPointerSize = ~0UL;
+    } else if (AllocaInst* A = dyn_cast<AllocaInst>(BBI)) {
+      // Above its allocation point the pointer does not exist; stop
+      // tracking it.
+      deadPointers.erase(A);
+      continue;
+    } else if (CallSite::get(BBI).getInstruction() != 0) {
+      // Remove any pointers made undead by the call from the dead set
+      // NOTE(review): the 'dead' vector below actually collects allocas the
+      // call may read or modify, i.e. pointers that are no longer dead.
+      std::vector<Instruction*> dead;
+      for (SmallPtrSet<AllocaInst*, 4>::iterator I = deadPointers.begin(),
+           E = deadPointers.end(); I != E; ++I) {
+        // Get size information for the alloca
+        unsigned pointerSize = ~0UL;
+        if (ConstantInt* C = dyn_cast<ConstantInt>((*I)->getArraySize()))
+          pointerSize = C->getZExtValue() * TD.getTypeSize((*I)->getAllocatedType());     
+        
+        // See if the call site touches it
+        AliasAnalysis::ModRefResult A = AA.getModRefInfo(CallSite::get(BBI),
+                                                         *I, pointerSize);
+        if (A == AliasAnalysis::ModRef || A == AliasAnalysis::Ref)
+          dead.push_back(*I);
+      }
+
+      // Erase after the scan so the set isn't mutated while being iterated.
+      for (std::vector<Instruction*>::iterator I = dead.begin(), E = dead.end();
+           I != E; ++I)
+        deadPointers.erase(*I);
+      
+      continue;
+    }
+    
+    if (!killPointer)
+      continue;
+    
+    // Deal with undead pointers
+    MadeChange |= RemoveUndeadPointers(killPointer, killPointerSize, BBI,
+                                       deadPointers, possiblyDead);
+  }
+  
+  return MadeChange;
+}
+
+/// RemoveUndeadPointers - killPointer is read at BBI; remove from
+/// deadPointers every alloca it may alias.  As a special case, if BBI is a
+/// store that must-aliases a dead alloca, the store itself is deleted.
+bool FDSE::RemoveUndeadPointers(Value* killPointer, unsigned killPointerSize,
+                                BasicBlock::iterator& BBI,
+                                SmallPtrSet<AllocaInst*, 4>& deadPointers, 
+                                SetVector<Instruction*>& possiblyDead) {
+  TargetData &TD = getAnalysis<TargetData>();
+  AliasAnalysis &AA = getAnalysis<AliasAnalysis>();
+  MemoryDependenceAnalysis& MD = getAnalysis<MemoryDependenceAnalysis>();
+                                  
+  bool MadeChange = false;
+  
+  // Pointers to un-kill; collected here so deadPointers isn't mutated while
+  // it is being iterated.
+  std::vector<Instruction*> undead;
+    
+  for (SmallPtrSet<AllocaInst*, 4>::iterator I = deadPointers.begin(),
+      E = deadPointers.end(); I != E; ++I) {
+    // Get size information for the alloca
+    unsigned pointerSize = ~0UL;
+    if (ConstantInt* C = dyn_cast<ConstantInt>((*I)->getArraySize()))
+      pointerSize = C->getZExtValue() * TD.getTypeSize((*I)->getAllocatedType());     
+      
+    // See if this pointer could alias it
+    AliasAnalysis::AliasResult A = AA.alias(*I, pointerSize, killPointer, killPointerSize);
+
+    // If it must-alias and a store, we can delete it
+    if (isa<StoreInst>(BBI) && A == AliasAnalysis::MustAlias) {
+      StoreInst* S = cast<StoreInst>(BBI);
+
+      // Remove it!
+      MD.removeInstruction(S);
+
+      // DCE instructions only used to calculate that store
+      if (Instruction* D = dyn_cast<Instruction>(S->getOperand(0)))
+        possiblyDead.insert(D);
+      if (Instruction* D = dyn_cast<Instruction>(S->getOperand(1)))
+        possiblyDead.insert(D);
+
+      // NOTE(review): BBI advances past the erased store here, yet the loop
+      // keeps testing the remaining dead pointers against the new BBI —
+      // confirm that is intended.
+      BBI++;
+      S->eraseFromParent();
+      NumFastStores++;
+      MadeChange = true;
+
+      continue;
+
+      // Otherwise, it is undead
+      } else if (A != AliasAnalysis::NoAlias)
+        undead.push_back(*I);
+  }
+
+  for (std::vector<Instruction*>::iterator I = undead.begin(), E = undead.end();
+       I != E; ++I)
+    deadPointers.erase(*I);
+  
+  return MadeChange;
+}
+
+/// DeleteDeadInstructionChains - Erase I if it is trivially dead, keeping the
+/// memory dependence analysis up to date and queueing operands that may have
+/// become dead in turn.
+void FDSE::DeleteDeadInstructionChains(Instruction *I,
+                                      SetVector<Instruction*> &DeadInsts) {
+  // Only erase instructions that are trivially dead: no users and no side
+  // effects.
+  if (!I->use_empty() || !isInstructionTriviallyDead(I)) return;
+
+  // Keep the memory dependence information consistent with the deletion.
+  getAnalysis<MemoryDependenceAnalysis>().removeInstruction(I);
+
+  // Queue operands that may become dead once this instruction goes away.
+  // Operands are cleared before the erase so that an instruction used twice
+  // is never referenced after being considered for deletion.
+  for (unsigned Idx = 0, NumOps = I->getNumOperands(); Idx != NumOps; ++Idx) {
+    if (Instruction* Op = dyn_cast<Instruction>(I->getOperand(Idx)))
+      if (Op->hasOneUse())
+        DeadInsts.insert(Op);    // Revisit this operand later.
+
+    I->setOperand(Idx, 0);       // Detach the operand.
+  }
+
+  I->eraseFromParent();
+  ++NumFastOther;
+}
diff --git a/lib/Transforms/Scalar/GCSE.cpp b/lib/Transforms/Scalar/GCSE.cpp
new file mode 100644
index 0000000..93ed8c4
--- /dev/null
+++ b/lib/Transforms/Scalar/GCSE.cpp
@@ -0,0 +1,201 @@
+//===-- GCSE.cpp - SSA-based Global Common Subexpression Elimination ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass is designed to be a very quick global transformation that
+// eliminates global common subexpressions from a function.  It does this by
+// using an existing value numbering implementation to identify the common
+// subexpressions, eliminating them when possible.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gcse"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ValueNumbering.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumLoadRemoved, "Number of loads removed");
+STATISTIC(NumCallRemoved, "Number of calls removed");
+STATISTIC(NumNonInsts   , "Number of instructions removed due "
+                          "to non-instruction values");
+STATISTIC(NumArgsRepl   , "Number of function arguments replaced "
+                          "with constant values");
+namespace {
+  /// GCSE - Eliminate global common subexpressions by using an external
+  /// value numbering implementation to identify equivalent values.
+  struct VISIBILITY_HIDDEN GCSE : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    GCSE() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    // Replace all uses of I with V and erase I, updating statistics and the
+    // value numbering along the way.
+    void ReplaceInstructionWith(Instruction *I, Value *V);
+
+    // This transformation requires dominator information and a value
+    // numbering implementation; it never modifies the CFG.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<ValueNumbering>();
+    }
+  };
+
+  char GCSE::ID = 0;
+  RegisterPass<GCSE> X("gcse", "Global Common Subexpression Elimination");
+}
+
+// createGCSEPass - The public interface to this file: construct a fresh
+// GCSE pass instance for the pass manager.
+FunctionPass *llvm::createGCSEPass() {
+  return new GCSE();
+}
+
+// GCSE::runOnFunction - This is the main transformation entry point for a
+// function.  It value-numbers every instruction and replaces each one with
+// an equal constant or dominating instruction when one exists.
+//
+bool GCSE::runOnFunction(Function &F) {
+  bool Changed = false;
+
+  // Get pointers to the analysis results that we will be using...
+  DominatorTree &DT = getAnalysis<DominatorTree>();
+  ValueNumbering &VN = getAnalysis<ValueNumbering>();
+
+  std::vector<Value*> EqualValues;
+
+  // Check for value numbers of arguments.  If the value numbering
+  // implementation can prove that an incoming argument is a constant or global
+  // value address, substitute it, making the argument dead.
+  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
+    if (!AI->use_empty()) {
+      VN.getEqualNumberNodes(AI, EqualValues);
+      if (!EqualValues.empty()) {
+        for (unsigned i = 0, e = EqualValues.size(); i != e; ++i)
+          if (isa<Constant>(EqualValues[i])) {
+            AI->replaceAllUsesWith(EqualValues[i]);
+            ++NumArgsRepl;
+            Changed = true;
+            break;
+          }
+        EqualValues.clear();
+      }
+    }
+
+  // Traverse the CFG of the function in dominator order, so that we see each
+  // instruction after we see its operands.
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+    BasicBlock *BB = DI->getBlock();
+
+    // Remember which instructions we've seen in this basic block as we scan.
+    std::set<Instruction*> BlockInsts;
+
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ) {
+      Instruction *Inst = I++;
+
+      if (Constant *C = ConstantFoldInstruction(Inst)) {
+        ReplaceInstructionWith(Inst, C);
+      } else if (Inst->getType() != Type::VoidTy) {
+        // If this instruction computes a value, try to fold together common
+        // instructions that compute it.
+        //
+        VN.getEqualNumberNodes(Inst, EqualValues);
+
+        // If this instruction computes a value that is already computed
+        // elsewhere, try to recycle the old value.
+        if (!EqualValues.empty()) {
+          // Remember the scan position: the instruction before Inst, or
+          // end() as a sentinel when Inst is the first in the block, since
+          // Inst may be deleted below.
+          if (Inst == &*BB->begin())
+            I = BB->end();
+          else {
+            I = Inst; --I;
+          }
+
+          // First check to see if we were able to value number this instruction
+          // to a non-instruction value.  If so, prefer that value over other
+          // instructions which may compute the same thing.
+          for (unsigned i = 0, e = EqualValues.size(); i != e; ++i)
+            if (!isa<Instruction>(EqualValues[i])) {
+              ++NumNonInsts;      // Keep track of # of insts repl with values
+
+              // Change all users of Inst to use the replacement and remove it
+              // from the program.
+              ReplaceInstructionWith(Inst, EqualValues[i]);
+              Inst = 0;
+              EqualValues.clear();  // don't enter the next loop
+              break;
+            }
+
+          // If there were no non-instruction values that this instruction
+          // produces, find a dominating instruction that produces the same
+          // value.  If we find one, use its value instead of ours.
+          for (unsigned i = 0, e = EqualValues.size(); i != e; ++i) {
+            Instruction *OtherI = cast<Instruction>(EqualValues[i]);
+            bool Dominates = false;
+            if (OtherI->getParent() == BB)
+              Dominates = BlockInsts.count(OtherI);
+            else
+              Dominates = DT.dominates(OtherI->getParent(), BB);
+
+            if (Dominates) {
+              // Okay, we found an instruction with the same value as this one
+              // and that dominates this one.  Replace this instruction with the
+              // specified one.
+              ReplaceInstructionWith(Inst, OtherI);
+              Inst = 0;
+              break;
+            }
+          }
+
+          EqualValues.clear();
+
+          // Re-establish the iterator, accounting for a possible deletion.
+          if (Inst) {
+            I = Inst; ++I;             // Deleted no instructions
+          } else if (I == BB->end()) { // Deleted first instruction
+            I = BB->begin();
+          } else {                     // Deleted inst in middle of block.
+            ++I;
+          }
+        }
+
+        if (Inst)
+          BlockInsts.insert(Inst);
+      }
+    }
+  }
+
+  // When the worklist is empty, return whether or not we changed anything...
+  return Changed;
+}
+
+
+/// ReplaceInstructionWith - Replace all uses of I with V, update statistics
+/// and the value numbering, then erase I from its parent block.
+void GCSE::ReplaceInstructionWith(Instruction *I, Value *V) {
+  if (isa<LoadInst>(I))
+    ++NumLoadRemoved; // Keep track of loads eliminated
+  if (isa<CallInst>(I))
+    ++NumCallRemoved; // Keep track of calls eliminated
+  ++NumInstRemoved;   // Keep track of number of insts eliminated
+
+  // Update value numbering
+  getAnalysis<ValueNumbering>().deleteValue(I);
+
+  I->replaceAllUsesWith(V);
+
+  if (InvokeInst *II = dyn_cast<InvokeInst>(I)) {
+    // Removing an invoke instruction requires adding a branch to the normal
+    // destination and removing PHI node entries in the exception destination.
+    new BranchInst(II->getNormalDest(), II);
+    II->getUnwindDest()->removePredecessor(II->getParent());
+  }
+
+  // Erase the instruction from the program.
+  I->getParent()->getInstList().erase(I);
+}
diff --git a/lib/Transforms/Scalar/GVNPRE.cpp b/lib/Transforms/Scalar/GVNPRE.cpp
new file mode 100644
index 0000000..e625fc2
--- /dev/null
+++ b/lib/Transforms/Scalar/GVNPRE.cpp
@@ -0,0 +1,1819 @@
+//===- GVNPRE.cpp - Eliminate redundant values and expressions ------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Owen Anderson and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a hybrid of global value numbering and partial redundancy
+// elimination, known as GVN-PRE.  It performs partial redundancy elimination on
+// values, rather than lexical expressions, allowing a more comprehensive view
+// of the optimization.  It replaces redundant values with uses of earlier
+// occurrences of the same value.  While this is beneficial in that it eliminates
+// unneeded computation, it also increases register pressure by creating large
+// live ranges, and should be used with caution on platforms that are very 
+// sensitive to register pressure.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "gvnpre"
+#include "llvm/Value.h"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Instructions.h"
+#include "llvm/Function.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Transforms/Utils/UnifyFunctionExitNodes.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <deque>
+#include <map>
+#include <vector>
+#include <set>
+using namespace llvm;
+
+//===----------------------------------------------------------------------===//
+//                         ValueTable Class
+//===----------------------------------------------------------------------===//
+
+/// This class holds the mapping between values and value numbers.  It is used
+/// as an efficient mechanism to determine the expression-wise equivalence of
+/// two values.
+
+namespace {
+  /// ValueTable - Maps Values and Expressions to small integer value
+  /// numbers; two values receive the same number iff they are
+  /// expression-wise equivalent.
+  class VISIBILITY_HIDDEN ValueTable {
+    public:
+      // Expression - A flattened, operand-numbered description of an
+      // instruction, used as the key for expression numbering.
+      struct Expression {
+        enum ExpressionOpcode { ADD, SUB, MUL, UDIV, SDIV, FDIV, UREM, SREM, 
+                              FREM, SHL, LSHR, ASHR, AND, OR, XOR, ICMPEQ, 
+                              ICMPNE, ICMPUGT, ICMPUGE, ICMPULT, ICMPULE, 
+                              ICMPSGT, ICMPSGE, ICMPSLT, ICMPSLE, FCMPOEQ, 
+                              FCMPOGT, FCMPOGE, FCMPOLT, FCMPOLE, FCMPONE, 
+                              FCMPORD, FCMPUNO, FCMPUEQ, FCMPUGT, FCMPUGE, 
+                              FCMPULT, FCMPULE, FCMPUNE, EXTRACT, INSERT,
+                              SHUFFLE, SELECT, TRUNC, ZEXT, SEXT, FPTOUI,
+                              FPTOSI, UITOFP, SITOFP, FPTRUNC, FPEXT, 
+                              PTRTOINT, INTTOPTR, BITCAST, GEP};
+    
+        ExpressionOpcode opcode;
+        const Type* type;                // Result type of the expression.
+        uint32_t firstVN;                // Value numbers of up to three
+        uint32_t secondVN;               // operands; unused slots are 0.
+        uint32_t thirdVN;
+        std::vector<uint32_t> varargs;   // Any remaining operands.
+      
+        // Lexicographic strict weak ordering over all fields, so Expression
+        // can serve as a std::map key.
+        bool operator< (const Expression& other) const {
+          if (opcode < other.opcode)
+            return true;
+          else if (opcode > other.opcode)
+            return false;
+          else if (type < other.type)
+            return true;
+          else if (type > other.type)
+            return false;
+          else if (firstVN < other.firstVN)
+            return true;
+          else if (firstVN > other.firstVN)
+            return false;
+          else if (secondVN < other.secondVN)
+            return true;
+          else if (secondVN > other.secondVN)
+            return false;
+          else if (thirdVN < other.thirdVN)
+            return true;
+          else if (thirdVN > other.thirdVN)
+            return false;
+          else {
+            // Shorter vararg lists order before longer ones; equal lengths
+            // compare element-wise.
+            if (varargs.size() < other.varargs.size())
+              return true;
+            else if (varargs.size() > other.varargs.size())
+              return false;
+            
+            for (size_t i = 0; i < varargs.size(); ++i)
+              if (varargs[i] < other.varargs[i])
+                return true;
+              else if (varargs[i] > other.varargs[i])
+                return false;
+          
+            return false;
+          }
+        }
+      };
+    
+    private:
+      DenseMap<Value*, uint32_t> valueNumbering;          // Value -> number.
+      std::map<Expression, uint32_t> expressionNumbering; // Expr -> number.
+  
+      uint32_t nextValueNumber;   // Next unassigned number; 0 is reserved.
+    
+      Expression::ExpressionOpcode getOpcode(BinaryOperator* BO);
+      Expression::ExpressionOpcode getOpcode(CmpInst* C);
+      Expression::ExpressionOpcode getOpcode(CastInst* C);
+      Expression create_expression(BinaryOperator* BO);
+      Expression create_expression(CmpInst* C);
+      Expression create_expression(ShuffleVectorInst* V);
+      Expression create_expression(ExtractElementInst* C);
+      Expression create_expression(InsertElementInst* V);
+      Expression create_expression(SelectInst* V);
+      Expression create_expression(CastInst* C);
+      Expression create_expression(GetElementPtrInst* G);
+    public:
+      ValueTable() { nextValueNumber = 1; }
+      uint32_t lookup_or_add(Value* V);
+      uint32_t lookup(Value* V) const;
+      void add(Value* V, uint32_t num);
+      void clear();
+      void erase(Value* v);
+      unsigned size();
+  };
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable Internal Functions
+//===----------------------------------------------------------------------===//
+// Map an LLVM binary-operator opcode onto the corresponding expression
+// opcode used for value numbering.
+ValueTable::Expression::ExpressionOpcode
+                             ValueTable::getOpcode(BinaryOperator* BO) {
+  switch(BO->getOpcode()) {
+    case Instruction::Add:  return Expression::ADD;
+    case Instruction::Sub:  return Expression::SUB;
+    case Instruction::Mul:  return Expression::MUL;
+    case Instruction::UDiv: return Expression::UDIV;
+    case Instruction::SDiv: return Expression::SDIV;
+    case Instruction::FDiv: return Expression::FDIV;
+    case Instruction::URem: return Expression::UREM;
+    case Instruction::SRem: return Expression::SREM;
+    case Instruction::FRem: return Expression::FREM;
+    case Instruction::Shl:  return Expression::SHL;
+    case Instruction::LShr: return Expression::LSHR;
+    case Instruction::AShr: return Expression::ASHR;
+    case Instruction::And:  return Expression::AND;
+    case Instruction::Or:   return Expression::OR;
+    case Instruction::Xor:  return Expression::XOR;
+    default:
+      // Every binary operator should be covered above.
+      assert(0 && "Binary operator with unknown opcode?");
+      return Expression::ADD;
+  }
+}
+
+// Integer and floating-point comparisons map onto disjoint sets of
+// expression opcodes, keyed by the comparison predicate.
+ValueTable::Expression::ExpressionOpcode ValueTable::getOpcode(CmpInst* C) {
+  if (C->getOpcode() == Instruction::ICmp) {
+    switch (C->getPredicate()) {
+      case ICmpInst::ICMP_EQ:  return Expression::ICMPEQ;
+      case ICmpInst::ICMP_NE:  return Expression::ICMPNE;
+      case ICmpInst::ICMP_UGT: return Expression::ICMPUGT;
+      case ICmpInst::ICMP_UGE: return Expression::ICMPUGE;
+      case ICmpInst::ICMP_ULT: return Expression::ICMPULT;
+      case ICmpInst::ICMP_ULE: return Expression::ICMPULE;
+      case ICmpInst::ICMP_SGT: return Expression::ICMPSGT;
+      case ICmpInst::ICMP_SGE: return Expression::ICMPSGE;
+      case ICmpInst::ICMP_SLT: return Expression::ICMPSLT;
+      case ICmpInst::ICMP_SLE: return Expression::ICMPSLE;
+      default:
+        // Every integer predicate should be covered above.
+        assert(0 && "Comparison with unknown predicate?");
+        return Expression::ICMPEQ;
+    }
+  } else {
+    switch (C->getPredicate()) {
+      case FCmpInst::FCMP_OEQ: return Expression::FCMPOEQ;
+      case FCmpInst::FCMP_OGT: return Expression::FCMPOGT;
+      case FCmpInst::FCMP_OGE: return Expression::FCMPOGE;
+      case FCmpInst::FCMP_OLT: return Expression::FCMPOLT;
+      case FCmpInst::FCMP_OLE: return Expression::FCMPOLE;
+      case FCmpInst::FCMP_ONE: return Expression::FCMPONE;
+      case FCmpInst::FCMP_ORD: return Expression::FCMPORD;
+      case FCmpInst::FCMP_UNO: return Expression::FCMPUNO;
+      case FCmpInst::FCMP_UEQ: return Expression::FCMPUEQ;
+      case FCmpInst::FCMP_UGT: return Expression::FCMPUGT;
+      case FCmpInst::FCMP_UGE: return Expression::FCMPUGE;
+      case FCmpInst::FCMP_ULT: return Expression::FCMPULT;
+      case FCmpInst::FCMP_ULE: return Expression::FCMPULE;
+      case FCmpInst::FCMP_UNE: return Expression::FCMPUNE;
+      default:
+        // Every floating-point predicate should be covered above.
+        assert(0 && "Comparison with unknown predicate?");
+        return Expression::FCMPOEQ;
+    }
+  }
+}
+
+// Translate an LLVM cast opcode into the matching expression opcode.
+ValueTable::Expression::ExpressionOpcode
+                             ValueTable::getOpcode(CastInst* C) {
+  switch(C->getOpcode()) {
+    case Instruction::Trunc:    return Expression::TRUNC;
+    case Instruction::ZExt:     return Expression::ZEXT;
+    case Instruction::SExt:     return Expression::SEXT;
+    case Instruction::FPToUI:   return Expression::FPTOUI;
+    case Instruction::FPToSI:   return Expression::FPTOSI;
+    case Instruction::UIToFP:   return Expression::UITOFP;
+    case Instruction::SIToFP:   return Expression::SITOFP;
+    case Instruction::FPTrunc:  return Expression::FPTRUNC;
+    case Instruction::FPExt:    return Expression::FPEXT;
+    case Instruction::PtrToInt: return Expression::PTRTOINT;
+    case Instruction::IntToPtr: return Expression::INTTOPTR;
+    case Instruction::BitCast:  return Expression::BITCAST;
+    default:
+      // Every cast opcode should be covered above.
+      assert(0 && "Cast operator with unknown opcode?");
+      return Expression::BITCAST;
+  }
+}
+
+ValueTable::Expression ValueTable::create_expression(BinaryOperator* BO) {
+  // Describe a binary operation by its opcode, result type, and the value
+  // numbers of its two operands (third slot unused).
+  Expression exp;
+  exp.opcode = getOpcode(BO);
+  exp.type = BO->getType();
+  exp.firstVN = lookup_or_add(BO->getOperand(0));
+  exp.secondVN = lookup_or_add(BO->getOperand(1));
+  exp.thirdVN = 0;
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(CmpInst* C) {
+  // Describe a comparison by its predicate-specific opcode, result type,
+  // and the value numbers of both operands (third slot unused).
+  Expression exp;
+  exp.opcode = getOpcode(C);
+  exp.type = C->getType();
+  exp.firstVN = lookup_or_add(C->getOperand(0));
+  exp.secondVN = lookup_or_add(C->getOperand(1));
+  exp.thirdVN = 0;
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(CastInst* C) {
+  // A cast is a unary expression: one operand plus the destination type
+  // (the second and third slots are unused).
+  Expression exp;
+  exp.opcode = getOpcode(C);
+  exp.type = C->getType();
+  exp.firstVN = lookup_or_add(C->getOperand(0));
+  exp.secondVN = 0;
+  exp.thirdVN = 0;
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(ShuffleVectorInst* S) {
+  // A shufflevector is a ternary expression over its two input vectors
+  // and its mask operand.
+  Expression exp;
+  exp.opcode = Expression::SHUFFLE;
+  exp.type = S->getType();
+  exp.firstVN = lookup_or_add(S->getOperand(0));
+  exp.secondVN = lookup_or_add(S->getOperand(1));
+  exp.thirdVN = lookup_or_add(S->getOperand(2));
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(ExtractElementInst* E) {
+  // An extractelement is a binary expression: the vector and the index
+  // (third slot unused).
+  Expression exp;
+  exp.opcode = Expression::EXTRACT;
+  exp.type = E->getType();
+  exp.firstVN = lookup_or_add(E->getOperand(0));
+  exp.secondVN = lookup_or_add(E->getOperand(1));
+  exp.thirdVN = 0;
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(InsertElementInst* I) {
+  // An insertelement is a ternary expression: vector, element, and index.
+  Expression exp;
+  exp.opcode = Expression::INSERT;
+  exp.type = I->getType();
+  exp.firstVN = lookup_or_add(I->getOperand(0));
+  exp.secondVN = lookup_or_add(I->getOperand(1));
+  exp.thirdVN = lookup_or_add(I->getOperand(2));
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(SelectInst* I) {
+  // A select is a ternary expression over its condition and its two arms.
+  Expression exp;
+  exp.opcode = Expression::SELECT;
+  exp.type = I->getType();
+  exp.firstVN = lookup_or_add(I->getCondition());
+  exp.secondVN = lookup_or_add(I->getTrueValue());
+  exp.thirdVN = lookup_or_add(I->getFalseValue());
+  
+  return exp;
+}
+
+ValueTable::Expression ValueTable::create_expression(GetElementPtrInst* G) {
+  // Describe a GEP by the value number of its pointer operand plus the
+  // value numbers of all of its indices, carried in the varargs list.
+  Expression e;
+    
+  e.firstVN = lookup_or_add(G->getPointerOperand());
+  e.secondVN = 0;
+  e.thirdVN = 0;
+  e.type = G->getType();
+  // Bug fix: this was Expression::SELECT (copy-paste error), which made
+  // every GEP hash and compare as if it were a select, permitting spurious
+  // value-number collisions between unrelated GEPs and selects.
+  e.opcode = Expression::GEP;
+  
+  for (GetElementPtrInst::op_iterator I = G->idx_begin(), E = G->idx_end();
+       I != E; ++I)
+    e.varargs.push_back(lookup_or_add(*I));
+  
+  return e;
+}
+
+//===----------------------------------------------------------------------===//
+//                     ValueTable External Functions
+//===----------------------------------------------------------------------===//
+
+/// lookup_or_add - Returns the value number for the specified value, assigning
+/// it a new number if it did not have one before.
+uint32_t ValueTable::lookup_or_add(Value* V) {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI != valueNumbering.end())
+    return VI->second;
+  
+  // Build a canonical expression for each instruction kind we know how to
+  // value-number structurally.  Anything else (loads, calls, arguments,
+  // constants, ...) is treated as opaque and simply gets a fresh number.
+  // The dyn_cast order below matches the original dispatch order.
+  Expression e;
+  bool isExpression = true;
+  
+  if (BinaryOperator* BO = dyn_cast<BinaryOperator>(V))
+    e = create_expression(BO);
+  else if (CmpInst* C = dyn_cast<CmpInst>(V))
+    e = create_expression(C);
+  else if (ShuffleVectorInst* SV = dyn_cast<ShuffleVectorInst>(V))
+    e = create_expression(SV);
+  else if (ExtractElementInst* EE = dyn_cast<ExtractElementInst>(V))
+    e = create_expression(EE);
+  else if (InsertElementInst* IE = dyn_cast<InsertElementInst>(V))
+    e = create_expression(IE);
+  else if (SelectInst* SI = dyn_cast<SelectInst>(V))
+    e = create_expression(SI);
+  else if (CastInst* CI = dyn_cast<CastInst>(V))
+    e = create_expression(CI);
+  else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(V))
+    e = create_expression(G);
+  else
+    isExpression = false;
+  
+  if (isExpression) {
+    // If a structurally identical expression has already been numbered,
+    // reuse its number; otherwise this expression defines a new one.
+    std::map<Expression, uint32_t>::iterator EI = expressionNumbering.find(e);
+    if (EI != expressionNumbering.end()) {
+      valueNumbering.insert(std::make_pair(V, EI->second));
+      return EI->second;
+    }
+    expressionNumbering.insert(std::make_pair(e, nextValueNumber));
+  }
+  
+  // Opaque value, or first occurrence of this expression: hand out a
+  // fresh number.
+  valueNumbering.insert(std::make_pair(V, nextValueNumber));
+  return nextValueNumber++;
+}
+
+/// lookup - Returns the value number of the specified value. Fails if
+/// the value has not yet been numbered.
+uint32_t ValueTable::lookup(Value* V) const {
+  DenseMap<Value*, uint32_t>::iterator VI = valueNumbering.find(V);
+  if (VI == valueNumbering.end()) {
+    // Unlike lookup_or_add, asking for an unnumbered value is a bug.
+    assert(0 && "Value not numbered?");
+    return 0;
+  }
+  return VI->second;
+}
+
+/// add - Add the specified value with the given value number, removing
+/// its old number, if any
+void ValueTable::add(Value* V, uint32_t num) {
+  // DenseMap::erase(Key) is a no-op when the key is absent, so the
+  // original find-then-erase sequence was an unnecessary extra lookup.
+  valueNumbering.erase(V);
+  valueNumbering.insert(std::make_pair(V, num));
+}
+
+/// clear - Remove all entries from the ValueTable and restart numbering.
+void ValueTable::clear() {
+  // Value number 0 is reserved, so numbering restarts at 1.
+  nextValueNumber = 1;
+  expressionNumbering.clear();
+  valueNumbering.clear();
+}
+
+/// erase - Remove a value from the value numbering
+void ValueTable::erase(Value* V) {
+  // Only the direct value->number mapping is dropped; any Expression
+  // entries previously computed from V remain in expressionNumbering.
+  valueNumbering.erase(V);
+}
+
+/// size - Return the number of assigned value numbers
+unsigned ValueTable::size() {
+  // NOTE: zero is never assigned
+  // Numbering starts at 1, so this is one greater than the count of numbers
+  // actually handed out -- an exclusive upper bound, suitable for sizing
+  // bitvectors indexed by value number.
+  return nextValueNumber;
+}
+
+//===----------------------------------------------------------------------===//
+//                       ValueNumberedSet Class
+//===----------------------------------------------------------------------===//
+
+/// ValueNumberedSet - A set of Values paired with a bitvector indexed by
+/// value number, so that "is a value with number N present?" can be answered
+/// in constant time via test()/set()/reset().  Callers are responsible for
+/// keeping the bitvector in sync with the contents.
+class ValueNumberedSet {
+  private:
+    SmallPtrSet<Value*, 8> contents;  // the member values themselves
+    BitVector numbers;                // presence bits, indexed by value number
+  public:
+    ValueNumberedSet() { numbers.resize(1); }
+    // Copy via member-init list rather than default-construct-then-assign.
+    ValueNumberedSet(const ValueNumberedSet& other)
+      : contents(other.contents), numbers(other.numbers) { }
+    
+    typedef SmallPtrSet<Value*, 8>::iterator iterator;
+    
+    iterator begin() { return contents.begin(); }
+    iterator end() { return contents.end(); }
+    
+    bool insert(Value* v) { return contents.insert(v); }
+    void insert(iterator I, iterator E) { contents.insert(I, E); }
+    void erase(Value* v) { contents.erase(v); }
+    unsigned count(Value* v) { return contents.count(v); }
+    size_t size() { return contents.size(); }
+    
+    /// set - Mark value number i present, growing the bitvector if needed.
+    void set(unsigned i)  {
+      if (i >= numbers.size())
+        numbers.resize(i+1);
+      
+      numbers.set(i);
+    }
+    
+    // Conventional assignment signature: returning *this (instead of void)
+    // is backward-compatible and allows chaining.
+    ValueNumberedSet& operator=(const ValueNumberedSet& other) {
+      contents = other.contents;
+      numbers = other.numbers;
+      return *this;
+    }
+    
+    /// reset - Clear value number i's presence bit, if it is in range.
+    void reset(unsigned i)  {
+      if (i < numbers.size())
+        numbers.reset(i);
+    }
+    
+    /// test - Return true if value number i is marked present.
+    bool test(unsigned i)  {
+      if (i >= numbers.size())
+        return false;
+      
+      return numbers.test(i);
+    }
+    
+    void clear() {
+      contents.clear();
+      numbers.clear();
+    }
+};
+
+//===----------------------------------------------------------------------===//
+//                         GVNPRE Pass
+//===----------------------------------------------------------------------===//
+
+namespace {
+
+  /// GVNPRE - Perform global value numbering combined with partial
+  /// redundancy elimination: build AVAIL_OUT/ANTIC_IN sets per block,
+  /// insert computations at merge points, then eliminate full redundancies.
+  class VISIBILITY_HIDDEN GVNPRE : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    GVNPRE() : FunctionPass((intptr_t)&ID) { }
+
+  private:
+    // VN - The value-numbering table shared by all phases.
+    ValueTable VN;
+    // createdExpressions - Instructions fabricated during phi translation
+    // that were never inserted into a block; deleted by cleanup().
+    std::vector<Instruction*> createdExpressions;
+    
+    // Per-block dataflow sets computed by buildsets().
+    DenseMap<BasicBlock*, ValueNumberedSet> availableOut;
+    DenseMap<BasicBlock*, ValueNumberedSet> anticipatedIn;
+    DenseMap<BasicBlock*, ValueNumberedSet> generatedPhis;
+    
+    // This transformation requires dominator info, single-exit functions,
+    // and no critical edges.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addRequired<UnifyFunctionExitNodes>();
+      AU.addRequired<DominatorTree>();
+    }
+  
+    // Helper functions
+    // FIXME: eliminate or document these better
+    void dump(ValueNumberedSet& s) const ;
+    void clean(ValueNumberedSet& set) ;
+    Value* find_leader(ValueNumberedSet& vals, uint32_t v) ;
+    Value* phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ) ;
+    void phi_translate_set(ValueNumberedSet& anticIn, BasicBlock* pred,
+                           BasicBlock* succ, ValueNumberedSet& out) ;
+    
+    void topo_sort(ValueNumberedSet& set,
+                   std::vector<Value*>& vec) ;
+    
+    void cleanup() ;
+    bool elimination() ;
+    
+    void val_insert(ValueNumberedSet& s, Value* v) ;
+    void val_replace(ValueNumberedSet& s, Value* v) ;
+    bool dependsOnInvoke(Value* V) ;
+    void buildsets_availout(BasicBlock::iterator I,
+                            ValueNumberedSet& currAvail,
+                            ValueNumberedSet& currPhis,
+                            ValueNumberedSet& currExps,
+                            SmallPtrSet<Value*, 16>& currTemps) ;
+    bool buildsets_anticout(BasicBlock* BB,
+                            ValueNumberedSet& anticOut,
+                            std::set<BasicBlock*>& visited) ;
+    unsigned buildsets_anticin(BasicBlock* BB,
+                           ValueNumberedSet& anticOut,
+                           ValueNumberedSet& currExps,
+                           SmallPtrSet<Value*, 16>& currTemps,
+                           std::set<BasicBlock*>& visited) ;
+    void buildsets(Function& F) ;
+    
+    void insertion_pre(Value* e, BasicBlock* BB,
+                       std::map<BasicBlock*, Value*>& avail,
+                      std::map<BasicBlock*,ValueNumberedSet>& new_set) ;
+    unsigned insertion_mergepoint(std::vector<Value*>& workList,
+                                  df_iterator<DomTreeNode*>& D,
+                      std::map<BasicBlock*, ValueNumberedSet>& new_set) ;
+    bool insertion(Function& F) ;
+  
+  };
+  
+  char GVNPRE::ID = 0;
+  
+}
+
+// createGVNPREPass - The public interface to this file...
+FunctionPass *llvm::createGVNPREPass() { return new GVNPRE(); }
+
+// Register the pass under the "gvnpre" command-line name.
+static RegisterPass<GVNPRE> X("gvnpre",
+                              "Global Value Numbering/Partial Redundancy Elimination");
+
+
+// Counters reported with -stats.
+STATISTIC(NumInsertedVals, "Number of values inserted");
+STATISTIC(NumInsertedPhis, "Number of PHI nodes inserted");
+STATISTIC(NumEliminated, "Number of redundant instructions eliminated");
+
+/// find_leader - Given a set and a value number, return the first
+/// element of the set with that value number, or 0 if no such element
+/// is present
+Value* GVNPRE::find_leader(ValueNumberedSet& vals, uint32_t v) {
+  // Fast path: the presence bitvector tells us whether any member
+  // carries this number at all.
+  if (!vals.test(v))
+    return 0;
+  
+  // Slow path: linear scan for the member whose number matches.
+  ValueNumberedSet::iterator I = vals.begin();
+  ValueNumberedSet::iterator E = vals.end();
+  for (; I != E; ++I)
+    if (VN.lookup(*I) == v)
+      return *I;
+  
+  assert(0 && "No leader found, but present bit is set?");
+  return 0;
+}
+
+/// val_insert - Insert a value into a set only if there is not a value
+/// with the same value number already in the set
+void GVNPRE::val_insert(ValueNumberedSet& s, Value* v) {
+  uint32_t num = VN.lookup(v);
+  if (!s.test(num))
+    s.insert(v);
+  // NOTE(review): unlike val_replace(), this never calls s.set(num), so the
+  // presence bit for num stays clear even after v is inserted -- confirm
+  // whether that asymmetry is intentional.
+}
+
+/// val_replace - Insert a value into a set, replacing any values already in
+/// the set that have the same value number
+void GVNPRE::val_replace(ValueNumberedSet& s, Value* v) {
+  uint32_t num = VN.lookup(v);
+  
+  // Evict the current leader for this number, if one exists.
+  if (Value* leader = find_leader(s, num))
+    s.erase(leader);
+  
+  s.insert(v);
+  s.set(num);
+}
+
+/// phi_translate - Given a value, its parent block, and a predecessor of its
+/// parent, translate the value into legal for the predecessor block.  This 
+/// means translating its operands (and recursively, their operands) through
+/// any phi nodes in the parent into values available in the predecessor.
+/// Returns the translated value, V itself when no translation was needed,
+/// or null when translation is impossible.
+Value* GVNPRE::phi_translate(Value* V, BasicBlock* pred, BasicBlock* succ) {
+  if (V == 0)
+    return 0;
+  
+  // Unary Operations
+  if (CastInst* U = dyn_cast<CastInst>(V)) {
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0)) {
+      // The operand changed: build a translated copy of the cast, then
+      // either adopt it or discard it in favor of an existing leader.
+      Instruction* newVal = 0;
+      if (CastInst* C = dyn_cast<CastInst>(U))
+        newVal = CastInst::create(C->getOpcode(),
+                                  newOp1, C->getType(),
+                                  C->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Binary Operations
+  // Bug fix: this arm previously read "} if (...)", silently starting a new
+  // statement instead of continuing the else-if chain.  It happened to be
+  // behavior-neutral (a CastInst matches none of the tests below), but the
+  // dangling `if' defeated the chain's intent.
+  } else if (isa<BinaryOperator>(V) || isa<CmpInst>(V) || 
+      isa<ExtractElementInst>(V)) {
+    User* U = cast<User>(V);
+    
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    Value* newOp2 = 0;
+    if (isa<Instruction>(U->getOperand(1)))
+      newOp2 = phi_translate(U->getOperand(1), pred, succ);
+    else
+      newOp2 = U->getOperand(1);
+    
+    if (newOp2 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0) || newOp2 != U->getOperand(1)) {
+      // Exactly one of these dyn_casts succeeds, per the isa<> guard above.
+      Instruction* newVal = 0;
+      if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+        newVal = BinaryOperator::create(BO->getOpcode(),
+                                        newOp1, newOp2,
+                                        BO->getName()+".expr");
+      else if (CmpInst* C = dyn_cast<CmpInst>(U))
+        newVal = CmpInst::create(C->getOpcode(),
+                                 C->getPredicate(),
+                                 newOp1, newOp2,
+                                 C->getName()+".expr");
+      else if (ExtractElementInst* E = dyn_cast<ExtractElementInst>(U))
+        newVal = new ExtractElementInst(newOp1, newOp2, E->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Ternary Operations
+  } else if (isa<ShuffleVectorInst>(V) || isa<InsertElementInst>(V) ||
+             isa<SelectInst>(V)) {
+    User* U = cast<User>(V);
+    
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getOperand(0)))
+      newOp1 = phi_translate(U->getOperand(0), pred, succ);
+    else
+      newOp1 = U->getOperand(0);
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    Value* newOp2 = 0;
+    if (isa<Instruction>(U->getOperand(1)))
+      newOp2 = phi_translate(U->getOperand(1), pred, succ);
+    else
+      newOp2 = U->getOperand(1);
+    
+    if (newOp2 == 0)
+      return 0;
+    
+    Value* newOp3 = 0;
+    if (isa<Instruction>(U->getOperand(2)))
+      newOp3 = phi_translate(U->getOperand(2), pred, succ);
+    else
+      newOp3 = U->getOperand(2);
+    
+    if (newOp3 == 0)
+      return 0;
+    
+    if (newOp1 != U->getOperand(0) ||
+        newOp2 != U->getOperand(1) ||
+        newOp3 != U->getOperand(2)) {
+      Instruction* newVal = 0;
+      if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+        newVal = new ShuffleVectorInst(newOp1, newOp2, newOp3,
+                                       S->getName()+".expr");
+      else if (InsertElementInst* I = dyn_cast<InsertElementInst>(U))
+        newVal = new InsertElementInst(newOp1, newOp2, newOp3,
+                                       I->getName()+".expr");
+      else if (SelectInst* I = dyn_cast<SelectInst>(U))
+        newVal = new SelectInst(newOp1, newOp2, newOp3, I->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // Varargs operators
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(V)) {
+    Value* newOp1 = 0;
+    if (isa<Instruction>(U->getPointerOperand()))
+      newOp1 = phi_translate(U->getPointerOperand(), pred, succ);
+    else
+      newOp1 = U->getPointerOperand();
+    
+    if (newOp1 == 0)
+      return 0;
+    
+    bool changed_idx = false;
+    std::vector<Value*> newIdx;
+    for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+         I != E; ++I)
+      if (isa<Instruction>(*I)) {
+        Value* newVal = phi_translate(*I, pred, succ);
+        newIdx.push_back(newVal);
+        if (newVal != *I)
+          changed_idx = true;
+      } else {
+        newIdx.push_back(*I);
+      }
+    
+    if (newOp1 != U->getPointerOperand() || changed_idx) {
+      Instruction* newVal = new GetElementPtrInst(newOp1,
+                                       &newIdx[0], newIdx.size(),
+                                       U->getName()+".expr");
+      
+      uint32_t v = VN.lookup_or_add(newVal);
+      
+      Value* leader = find_leader(availableOut[pred], v);
+      if (leader == 0) {
+        createdExpressions.push_back(newVal);
+        return newVal;
+      } else {
+        VN.erase(newVal);
+        delete newVal;
+        return leader;
+      }
+    }
+  
+  // PHI Nodes
+  } else if (PHINode* P = dyn_cast<PHINode>(V)) {
+    // A phi in the successor translates to its incoming value along pred.
+    if (P->getParent() == succ)
+      return P->getIncomingValueForBlock(pred);
+  }
+  
+  // No translation was needed (or V is opaque): it is legal as-is.
+  return V;
+}
+
+/// phi_translate_set - Perform phi translation on every element of a set
+void GVNPRE::phi_translate_set(ValueNumberedSet& anticIn,
+                              BasicBlock* pred, BasicBlock* succ,
+                              ValueNumberedSet& out) {
+  for (ValueNumberedSet::iterator I = anticIn.begin(), E = anticIn.end();
+       I != E; ++I) {
+    Value* V = phi_translate(*I, pred, succ);
+    if (V == 0)
+      continue;
+    
+    // Only admit the translated value if no value with the same number is
+    // already present in the output set.
+    uint32_t num = VN.lookup_or_add(V);
+    if (!out.test(num)) {
+      out.insert(V);
+      out.set(num);
+    }
+  }
+}
+
+/// dependsOnInvoke - Test if a value has an phi node as an operand, any of 
+/// whose inputs is an invoke instruction.  If this is true, we cannot safely
+/// PRE the instruction or anything that depends on it.
+bool GVNPRE::dependsOnInvoke(Value* V) {
+  PHINode* p = dyn_cast<PHINode>(V);
+  if (!p)
+    return false;
+  
+  // Any invoke feeding the phi poisons it.
+  for (PHINode::op_iterator I = p->op_begin(), E = p->op_end(); I != E; ++I)
+    if (isa<InvokeInst>(*I))
+      return true;
+  
+  return false;
+}
+
+/// clean - Remove all non-opaque values from the set whose operands are not
+/// themselves in the set, as well as all values that depend on invokes (see 
+/// above)
+void GVNPRE::clean(ValueNumberedSet& set) {
+  std::vector<Value*> worklist;
+  worklist.reserve(set.size());
+  // Visit values in topological order so each value's operands have already
+  // been judged (and possibly removed) before the value itself.
+  topo_sort(set, worklist);
+  
+  for (unsigned i = 0; i < worklist.size(); ++i) {
+    Value* v = worklist[i];
+    
+    // An operand is valid when it is not an instruction (always available),
+    // or its value number is still present in the set -- and in either case
+    // it must not depend on an invoke-fed phi.
+    
+    // Handle unary ops
+    if (CastInst* U = dyn_cast<CastInst>(v)) {
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+      
+      if (!lhsValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle binary ops
+    } else if (isa<BinaryOperator>(v) || isa<CmpInst>(v) ||
+        isa<ExtractElementInst>(v)) {
+      User* U = cast<User>(v);
+      
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+    
+      bool rhsValid = !isa<Instruction>(U->getOperand(1));
+      rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+      if (rhsValid)
+        rhsValid = !dependsOnInvoke(U->getOperand(1));
+      
+      if (!lhsValid || !rhsValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle ternary ops
+    } else if (isa<ShuffleVectorInst>(v) || isa<InsertElementInst>(v) ||
+               isa<SelectInst>(v)) {
+      User* U = cast<User>(v);
+    
+      bool lhsValid = !isa<Instruction>(U->getOperand(0));
+      lhsValid |= set.test(VN.lookup(U->getOperand(0)));
+      if (lhsValid)
+        lhsValid = !dependsOnInvoke(U->getOperand(0));
+      
+      bool rhsValid = !isa<Instruction>(U->getOperand(1));
+      rhsValid |= set.test(VN.lookup(U->getOperand(1)));
+      if (rhsValid)
+        rhsValid = !dependsOnInvoke(U->getOperand(1));
+      
+      bool thirdValid = !isa<Instruction>(U->getOperand(2));
+      thirdValid |= set.test(VN.lookup(U->getOperand(2)));
+      if (thirdValid)
+        thirdValid = !dependsOnInvoke(U->getOperand(2));
+    
+      if (!lhsValid || !rhsValid || !thirdValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    
+    // Handle varargs ops
+    } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(v)) {
+      bool ptrValid = !isa<Instruction>(U->getPointerOperand());
+      ptrValid |= set.test(VN.lookup(U->getPointerOperand()));
+      if (ptrValid)
+        ptrValid = !dependsOnInvoke(U->getPointerOperand());
+      
+      // All indices must be valid; short-circuit once one fails.
+      bool varValid = true;
+      for (GetElementPtrInst::op_iterator I = U->idx_begin(), E = U->idx_end();
+           I != E; ++I)
+        if (varValid) {
+          varValid &= !isa<Instruction>(*I) || set.test(VN.lookup(*I));
+          varValid &= !dependsOnInvoke(*I);
+        }
+    
+      if (!ptrValid || !varValid) {
+        set.erase(U);
+        set.reset(VN.lookup(U));
+      }
+    }
+    // Opaque values (loads, calls, arguments, ...) are never removed.
+  }
+}
+
+/// topo_sort - Given a set of values, sort them by topological
+/// order into the provided vector.
+void GVNPRE::topo_sort(ValueNumberedSet& set, std::vector<Value*>& vec) {
+  // Iterative DFS with an explicit stack: a value is emitted to vec only
+  // after every in-set leader of its operands has been emitted, yielding
+  // operands-before-users order.
+  SmallPtrSet<Value*, 16> visited;
+  std::vector<Value*> stack;
+  for (ValueNumberedSet::iterator I = set.begin(), E = set.end();
+       I != E; ++I) {
+    if (visited.count(*I) == 0)
+      stack.push_back(*I);
+    
+    while (!stack.empty()) {
+      Value* e = stack.back();
+      
+      // For each kind, push any unvisited in-set operand leader; once none
+      // remain, emit the value itself and pop it.
+      
+      // Handle unary ops
+      if (CastInst* U = dyn_cast<CastInst>(e)) {
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+    
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle binary ops
+      } else if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+          isa<ExtractElementInst>(e)) {
+        User* U = cast<User>(e);
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+        Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+    
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else if (r != 0 && isa<Instruction>(r) &&
+                 visited.count(r) == 0)
+          stack.push_back(r);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle ternary ops
+      } else if (isa<InsertElementInst>(e) || isa<ShuffleVectorInst>(e) ||
+                 isa<SelectInst>(e)) {
+        User* U = cast<User>(e);
+        Value* l = find_leader(set, VN.lookup(U->getOperand(0)));
+        Value* r = find_leader(set, VN.lookup(U->getOperand(1)));
+        Value* m = find_leader(set, VN.lookup(U->getOperand(2)));
+      
+        if (l != 0 && isa<Instruction>(l) &&
+            visited.count(l) == 0)
+          stack.push_back(l);
+        else if (r != 0 && isa<Instruction>(r) &&
+                 visited.count(r) == 0)
+          stack.push_back(r);
+        else if (m != 0 && isa<Instruction>(m) &&
+                 visited.count(m) == 0)
+          stack.push_back(m);
+        else {
+          vec.push_back(e);
+          visited.insert(e);
+          stack.pop_back();
+        }
+      
+      // Handle vararg ops
+      } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(e)) {
+        Value* p = find_leader(set, VN.lookup(U->getPointerOperand()));
+        
+        if (p != 0 && isa<Instruction>(p) &&
+            visited.count(p) == 0)
+          stack.push_back(p);
+        else {
+          // Push every unvisited index leader before emitting the GEP.
+          bool push_va = false;
+          for (GetElementPtrInst::op_iterator I = U->idx_begin(),
+               E = U->idx_end(); I != E; ++I) {
+            Value * v = find_leader(set, VN.lookup(*I));
+            if (v != 0 && isa<Instruction>(v) && visited.count(v) == 0) {
+              stack.push_back(v);
+              push_va = true;
+            }
+          }
+          
+          if (!push_va) {
+            vec.push_back(e);
+            visited.insert(e);
+            stack.pop_back();
+          }
+        }
+      
+      // Handle opaque ops
+      } else {
+        // Opaque values have no in-set operands: emit immediately.
+        visited.insert(e);
+        vec.push_back(e);
+        stack.pop_back();
+      }
+    }
+    
+    stack.clear();
+  }
+}
+
+/// dump - Dump a set of values to standard error
+void GVNPRE::dump(ValueNumberedSet& s) const {
+  DOUT << "{ ";
+  for (ValueNumberedSet::iterator I = s.begin(), E = s.end();
+       I != E; ++I) {
+    // Print each member as "<value number>: <instruction>".
+    DOUT << "" << VN.lookup(*I) << ": ";
+    DEBUG((*I)->dump());
+  }
+  DOUT << "}\n\n";
+}
+
+/// elimination - Phase 3 of the main algorithm.  Perform full redundancy 
+/// elimination by walking the dominator tree and removing any instruction that 
+/// is dominated by another instruction with the same value number.
+bool GVNPRE::elimination() {
+  bool changed_function = false;
+  
+  // Replacements and deletions are recorded here and performed after the
+  // walk, so the basic-block iteration below is never invalidated.
+  std::vector<std::pair<Instruction*, Value*> > replace;
+  std::vector<Instruction*> erase;
+  
+  DominatorTree& DT = getAnalysis<DominatorTree>();
+  
+  // Walk blocks in depth-first dominator-tree order.
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+    BasicBlock* BB = DI->getBlock();
+    
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE; ++BI) {
+
+      // Only the value-numbered expression kinds are elimination candidates.
+      if (isa<BinaryOperator>(BI) || isa<CmpInst>(BI) ||
+          isa<ShuffleVectorInst>(BI) || isa<InsertElementInst>(BI) ||
+          isa<ExtractElementInst>(BI) || isa<SelectInst>(BI) ||
+          isa<CastInst>(BI) || isa<GetElementPtrInst>(BI)) {
+        
+        // A value with BI's number is available here but BI itself is not
+        // the set member: BI is fully redundant with the leader.
+        if (availableOut[BB].test(VN.lookup(BI)) && !availableOut[BB].count(BI)) {
+          Value *leader = find_leader(availableOut[BB], VN.lookup(BI));
+          if (Instruction* Instr = dyn_cast<Instruction>(leader))
+            if (Instr->getParent() != 0 && Instr != BI) {
+              replace.push_back(std::make_pair(BI, leader));
+              erase.push_back(BI);
+              ++NumEliminated;
+            }
+        }
+      }
+    }
+  }
+  
+  // Rewrite all uses first, then delete the dead instructions.
+  while (!replace.empty()) {
+    std::pair<Instruction*, Value*> rep = replace.back();
+    replace.pop_back();
+    rep.first->replaceAllUsesWith(rep.second);
+    changed_function = true;
+  }
+    
+  for (std::vector<Instruction*>::iterator I = erase.begin(), E = erase.end();
+       I != E; ++I)
+     (*I)->eraseFromParent();
+  
+  return changed_function;
+}
+
+/// cleanup - Delete any extraneous values that were created to represent
+/// expressions without leaders.
+void GVNPRE::cleanup() {
+  // These instructions were never inserted into a basic block, so plain
+  // delete (rather than eraseFromParent) is the correct disposal.
+  for (std::vector<Instruction*>::iterator I = createdExpressions.begin(),
+       E = createdExpressions.end(); I != E; ++I)
+    delete *I;
+  
+  createdExpressions.clear();
+}
+
+/// buildsets_availout - When calculating availability, handle an instruction
+/// by inserting it into the appropriate sets
+///
+/// Populates, for the current block:
+///   - currPhis:  PHI nodes defined here
+///   - currExps:  value-numbered expressions evaluated here, plus their
+///                instruction operands (operands are inserted first so that a
+///                topological sort of the set keeps operands before users)
+///   - currTemps: opaque (unhandled) non-terminator instructions
+///   - currAvail: every value this block makes available
+void GVNPRE::buildsets_availout(BasicBlock::iterator I,
+                                ValueNumberedSet& currAvail,
+                                ValueNumberedSet& currPhis,
+                                ValueNumberedSet& currExps,
+                                SmallPtrSet<Value*, 16>& currTemps) {
+  // Handle PHI nodes
+  if (PHINode* p = dyn_cast<PHINode>(I)) {
+    unsigned num = VN.lookup_or_add(p);
+    
+    currPhis.insert(p);
+    currPhis.set(num);
+  
+  // Handle unary ops
+  } else if (CastInst* U = dyn_cast<CastInst>(I)) {
+    Value* leftValue = U->getOperand(0);
+    
+    unsigned num = VN.lookup_or_add(U);
+      
+    // Only instruction operands are tracked as sub-expressions; constants and
+    // arguments are always available and need no leader.
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+  
+  // Handle binary ops
+  } else if (isa<BinaryOperator>(I) || isa<CmpInst>(I) ||
+             isa<ExtractElementInst>(I)) {
+    User* U = cast<User>(I);
+    Value* leftValue = U->getOperand(0);
+    Value* rightValue = U->getOperand(1);
+    
+    unsigned num = VN.lookup_or_add(U);
+      
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    
+    if (isa<Instruction>(rightValue))
+      if (!currExps.test(VN.lookup(rightValue))) {
+        currExps.insert(rightValue);
+        currExps.set(VN.lookup(rightValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle ternary ops
+  } else if (isa<InsertElementInst>(I) || isa<ShuffleVectorInst>(I) ||
+             isa<SelectInst>(I)) {
+    User* U = cast<User>(I);
+    Value* leftValue = U->getOperand(0);
+    Value* rightValue = U->getOperand(1);
+    Value* thirdValue = U->getOperand(2);
+      
+    // NOTE(review): this lookup_or_add is redundant — the identical call two
+    // lines below returns the same number (lookup_or_add is idempotent).
+    VN.lookup_or_add(U);
+    
+    unsigned num = VN.lookup_or_add(U);
+    
+    if (isa<Instruction>(leftValue))
+      if (!currExps.test(VN.lookup(leftValue))) {
+        currExps.insert(leftValue);
+        currExps.set(VN.lookup(leftValue));
+      }
+    if (isa<Instruction>(rightValue))
+      if (!currExps.test(VN.lookup(rightValue))) {
+        currExps.insert(rightValue);
+        currExps.set(VN.lookup(rightValue));
+      }
+    if (isa<Instruction>(thirdValue))
+      if (!currExps.test(VN.lookup(thirdValue))) {
+        currExps.insert(thirdValue);
+        currExps.set(VN.lookup(thirdValue));
+      }
+    
+    if (!currExps.test(num)) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle vararg ops
+  } else if (GetElementPtrInst* U = dyn_cast<GetElementPtrInst>(I)) {
+    Value* ptrValue = U->getPointerOperand();
+      
+    // NOTE(review): redundant duplicate lookup_or_add, same as the ternary
+    // case above — the second call supplies the number actually used.
+    VN.lookup_or_add(U);
+    
+    unsigned num = VN.lookup_or_add(U);
+    
+    if (isa<Instruction>(ptrValue))
+      if (!currExps.test(VN.lookup(ptrValue))) {
+        currExps.insert(ptrValue);
+        currExps.set(VN.lookup(ptrValue));
+      }
+    
+    // Every index operand is a sub-expression of the GEP.
+    for (GetElementPtrInst::op_iterator OI = U->idx_begin(), OE = U->idx_end();
+         OI != OE; ++OI)
+      if (isa<Instruction>(*OI) && !currExps.test(VN.lookup(*OI))) {
+        currExps.insert(*OI);
+        currExps.set(VN.lookup(*OI));
+      }
+    
+    // VN.lookup(U) == num here; the test and set refer to the same bit.
+    if (!currExps.test(VN.lookup(U))) {
+      currExps.insert(U);
+      currExps.set(num);
+    }
+    
+  // Handle opaque ops
+  } else if (!I->isTerminator()){
+    // Anything we can't value-number structurally (loads, calls, allocas...)
+    // gets a fresh number and is remembered as a temporary so it can be
+    // stripped from ANTIC_IN later.
+    VN.lookup_or_add(I);
+    
+    currTemps.insert(I);
+  }
+    
+  // Terminators produce no value and are never numbered, so they are
+  // excluded from AVAIL_OUT.
+  if (!I->isTerminator())
+    if (!currAvail.test(VN.lookup(I))) {
+      currAvail.insert(I);
+      currAvail.set(VN.lookup(I));
+    }
+}
+
+/// buildsets_anticout - When walking the postdom tree, calculate the ANTIC_OUT
+/// set as a function of the ANTIC_IN set of the block's predecessors
+///
+/// Returns true when the computation must be deferred because the single
+/// successor has not been visited yet (its ANTIC_IN is not ready); returns
+/// false once anticOut has been filled in.
+bool GVNPRE::buildsets_anticout(BasicBlock* BB,
+                                ValueNumberedSet& anticOut,
+                                std::set<BasicBlock*>& visited) {
+  if (BB->getTerminator()->getNumSuccessors() == 1) {
+    // Self-loops are exempt from the visited check: the block's own ANTIC_IN
+    // from the previous iteration is used.
+    if (BB->getTerminator()->getSuccessor(0) != BB &&
+        visited.count(BB->getTerminator()->getSuccessor(0)) == 0) {
+      return true;
+    }
+    else {
+      // One successor: ANTIC_OUT is the successor's ANTIC_IN translated
+      // through any PHI nodes on this edge.
+      phi_translate_set(anticipatedIn[BB->getTerminator()->getSuccessor(0)],
+                        BB,  BB->getTerminator()->getSuccessor(0), anticOut);
+    }
+  } else if (BB->getTerminator()->getNumSuccessors() > 1) {
+    // Multiple successors: ANTIC_OUT is the intersection of all successors'
+    // ANTIC_IN sets.  Start from the first successor's set...
+    BasicBlock* first = BB->getTerminator()->getSuccessor(0);
+    for (ValueNumberedSet::iterator I = anticipatedIn[first].begin(),
+         E = anticipatedIn[first].end(); I != E; ++I) {
+      anticOut.insert(*I);
+      anticOut.set(VN.lookup(*I));
+    }
+    
+    // ...then remove anything not anticipated by every other successor.
+    for (unsigned i = 1; i < BB->getTerminator()->getNumSuccessors(); ++i) {
+      BasicBlock* currSucc = BB->getTerminator()->getSuccessor(i);
+      ValueNumberedSet& succAnticIn = anticipatedIn[currSucc];
+      
+      // Collect removals first; erasing while iterating anticOut would
+      // invalidate the iterator.
+      std::vector<Value*> temp;
+      
+      for (ValueNumberedSet::iterator I = anticOut.begin(),
+           E = anticOut.end(); I != E; ++I)
+        if (!succAnticIn.test(VN.lookup(*I)))
+          temp.push_back(*I);
+
+      for (std::vector<Value*>::iterator I = temp.begin(), E = temp.end();
+           I != E; ++I) {
+        anticOut.erase(*I);
+        anticOut.reset(VN.lookup(*I));
+      }
+    }
+  }
+  
+  return false;
+}
+
+/// buildsets_anticin - Walk the postdom tree, calculating ANTIC_OUT for
+/// each block.  ANTIC_IN is then a function of ANTIC_OUT and the GEN
+/// sets populated in buildsets_availout
+///
+/// Return values: 0 = deferred (successor not yet visited, retry later),
+/// 1 = computed with no change to ANTIC_IN, 2 = computed and ANTIC_IN grew
+/// or shrank (the caller must re-queue predecessors).
+unsigned GVNPRE::buildsets_anticin(BasicBlock* BB,
+                               ValueNumberedSet& anticOut,
+                               ValueNumberedSet& currExps,
+                               SmallPtrSet<Value*, 16>& currTemps,
+                               std::set<BasicBlock*>& visited) {
+  ValueNumberedSet& anticIn = anticipatedIn[BB];
+  // Remember the old size so changes can be detected cheaply below.
+  unsigned old = anticIn.size();
+      
+  bool defer = buildsets_anticout(BB, anticOut, visited);
+  if (defer)
+    return 0;
+  
+  // ANTIC_IN = clean(ANTIC_OUT ∪ EXP_GEN − TMP_GEN)
+  anticIn.clear();
+  
+  for (ValueNumberedSet::iterator I = anticOut.begin(),
+       E = anticOut.end(); I != E; ++I) {
+    anticIn.insert(*I);
+    anticIn.set(VN.lookup(*I));
+  }
+  for (ValueNumberedSet::iterator I = currExps.begin(),
+       E = currExps.end(); I != E; ++I) {
+    if (!anticIn.test(VN.lookup(*I))) {
+      anticIn.insert(*I);
+      anticIn.set(VN.lookup(*I));
+    }
+  } 
+  
+  // Opaque instructions kill anticipation of their value number.
+  for (SmallPtrSet<Value*, 16>::iterator I = currTemps.begin(),
+       E = currTemps.end(); I != E; ++I) {
+    anticIn.erase(*I);
+    anticIn.reset(VN.lookup(*I));
+  }
+  
+  // Remove expressions whose operands are not themselves anticipated, then
+  // reset anticOut for reuse by the caller's next block.
+  clean(anticIn);
+  anticOut.clear();
+  
+  if (old != anticIn.size())
+    return 2;
+  else
+    return 1;
+}
+
+/// buildsets - Phase 1 of the main algorithm.  Construct the AVAIL_OUT
+/// and the ANTIC_IN sets.
+///
+/// AVAIL_OUT is computed in one top-down pass over the dominator tree;
+/// ANTIC_IN is computed by iterating postorder CFG walks to a fixed point,
+/// using a changed-block worklist to skip stable blocks.
+void GVNPRE::buildsets(Function& F) {
+  std::map<BasicBlock*, ValueNumberedSet> generatedExpressions;
+  std::map<BasicBlock*, SmallPtrSet<Value*, 16> > generatedTemporaries;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();   
+  
+  // Phase 1, Part 1: calculate AVAIL_OUT
+  
+  // Top-down walk of the dominator tree
+  for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+    
+    // Get the sets to update for this block
+    ValueNumberedSet& currExps = generatedExpressions[DI->getBlock()];
+    ValueNumberedSet& currPhis = generatedPhis[DI->getBlock()];
+    SmallPtrSet<Value*, 16>& currTemps = generatedTemporaries[DI->getBlock()];
+    ValueNumberedSet& currAvail = availableOut[DI->getBlock()];     
+    
+    BasicBlock* BB = DI->getBlock();
+  
+    // A block inherits AVAIL_OUT from its dominator
+    if (DI->getIDom() != 0)
+      currAvail = availableOut[DI->getIDom()->getBlock()];
+
+    for (BasicBlock::iterator BI = BB->begin(), BE = BB->end();
+         BI != BE; ++BI)
+      buildsets_availout(BI, currAvail, currPhis, currExps,
+                         currTemps);
+      
+  }
+
+  // Phase 1, Part 2: calculate ANTIC_IN
+  
+  std::set<BasicBlock*> visited;
+  // Seed the worklist with every block; blocks are removed once stable.
+  SmallPtrSet<BasicBlock*, 4> block_changed;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    block_changed.insert(FI);
+  
+  bool changed = true;
+  unsigned iterations = 0;
+  
+  while (changed) {
+    changed = false;
+    ValueNumberedSet anticOut;
+    
+    // Postorder walk of the CFG
+    for (po_iterator<BasicBlock*> BBI = po_begin(&F.getEntryBlock()),
+         BBE = po_end(&F.getEntryBlock()); BBI != BBE; ++BBI) {
+      BasicBlock* BB = *BBI;
+      
+      if (block_changed.count(BB) != 0) {
+        unsigned ret = buildsets_anticin(BB, anticOut,generatedExpressions[BB],
+                                         generatedTemporaries[BB], visited);
+      
+        if (ret == 0) {
+          // Deferred: a successor wasn't ready.  Force another outer
+          // iteration so this block gets retried.
+          changed = true;
+          continue;
+        } else {
+          visited.insert(BB);
+        
+          // ANTIC_IN changed (ret == 2): predecessors must be recomputed.
+          // Unchanged (ret == 1): this block is stable, drop it.
+          if (ret == 2)
+           for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB);
+                 PI != PE; ++PI) {
+              block_changed.insert(*PI);
+           }
+          else
+            block_changed.erase(BB);
+        
+          changed |= (ret == 2);
+        }
+      }
+    }
+    
+    iterations++;
+  }
+}
+
+/// insertion_pre - When a partial redundancy has been identified, eliminate it
+/// by inserting appropriate values into the predecessors and a phi node in
+/// the main block
+///
+/// For each predecessor where expression e is not yet available, a fresh
+/// instruction computing e (built from the leaders of its operands in that
+/// predecessor) is inserted before the predecessor's terminator.  Finally a
+/// PHI joining the per-predecessor values is inserted at the top of BB and
+/// recorded as the new leader for e's value number.
+void GVNPRE::insertion_pre(Value* e, BasicBlock* BB,
+                           std::map<BasicBlock*, Value*>& avail,
+                    std::map<BasicBlock*, ValueNumberedSet>& new_sets) {
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    Value* e2 = avail[*PI];
+    if (!availableOut[*PI].test(VN.lookup(e2))) {
+      User* U = cast<User>(e2);
+      
+      // s1: value to use for operand 0 in this predecessor — the leader of
+      // its value number if the operand is itself a handled expression,
+      // otherwise the operand verbatim (constant, argument, opaque value).
+      Value* s1 = 0;
+      if (isa<BinaryOperator>(U->getOperand(0)) || 
+          isa<CmpInst>(U->getOperand(0)) ||
+          isa<ShuffleVectorInst>(U->getOperand(0)) ||
+          isa<ExtractElementInst>(U->getOperand(0)) ||
+          isa<InsertElementInst>(U->getOperand(0)) ||
+          isa<SelectInst>(U->getOperand(0)) ||
+          isa<CastInst>(U->getOperand(0)) ||
+          isa<GetElementPtrInst>(U->getOperand(0)))
+        s1 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(0)));
+      else
+        s1 = U->getOperand(0);
+      
+      // s2/s3 stay null for ops of lower arity (e.g. casts have only s1);
+      // the construction switch below only reads the ones that apply.
+      Value* s2 = 0;
+      
+      if (isa<BinaryOperator>(U) || 
+          isa<CmpInst>(U) ||
+          isa<ShuffleVectorInst>(U) ||
+          isa<ExtractElementInst>(U) ||
+          isa<InsertElementInst>(U) ||
+          isa<SelectInst>(U))
+        if (isa<BinaryOperator>(U->getOperand(1)) || 
+            isa<CmpInst>(U->getOperand(1)) ||
+            isa<ShuffleVectorInst>(U->getOperand(1)) ||
+            isa<ExtractElementInst>(U->getOperand(1)) ||
+            isa<InsertElementInst>(U->getOperand(1)) ||
+            isa<SelectInst>(U->getOperand(1)) ||
+            isa<CastInst>(U->getOperand(1)) ||
+            isa<GetElementPtrInst>(U->getOperand(1))) {
+          s2 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(1)));
+        } else {
+          s2 = U->getOperand(1);
+        }
+      
+      // Ternary Operators
+      Value* s3 = 0;
+      if (isa<ShuffleVectorInst>(U) ||
+          isa<InsertElementInst>(U) ||
+          isa<SelectInst>(U))
+        if (isa<BinaryOperator>(U->getOperand(2)) || 
+            isa<CmpInst>(U->getOperand(2)) ||
+            isa<ShuffleVectorInst>(U->getOperand(2)) ||
+            isa<ExtractElementInst>(U->getOperand(2)) ||
+            isa<InsertElementInst>(U->getOperand(2)) ||
+            isa<SelectInst>(U->getOperand(2)) ||
+            isa<CastInst>(U->getOperand(2)) ||
+            isa<GetElementPtrInst>(U->getOperand(2))) {
+          s3 = find_leader(availableOut[*PI], VN.lookup(U->getOperand(2)));
+        } else {
+          s3 = U->getOperand(2);
+        }
+      
+      // Vararg operators
+      std::vector<Value*> sVarargs;
+      if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U)) {
+        for (GetElementPtrInst::op_iterator OI = G->idx_begin(),
+             OE = G->idx_end(); OI != OE; ++OI) {
+          if (isa<BinaryOperator>(*OI) || 
+              isa<CmpInst>(*OI) ||
+              isa<ShuffleVectorInst>(*OI) ||
+              isa<ExtractElementInst>(*OI) ||
+              isa<InsertElementInst>(*OI) ||
+              isa<SelectInst>(*OI) ||
+              isa<CastInst>(*OI) ||
+              isa<GetElementPtrInst>(*OI)) {
+            sVarargs.push_back(find_leader(availableOut[*PI], 
+                               VN.lookup(*OI)));
+          } else {
+            sVarargs.push_back(*OI);
+          }
+        }
+      }
+      
+      // Build the replacement instruction at the end of the predecessor,
+      // mirroring the opcode of the original expression.
+      Value* newVal = 0;
+      if (BinaryOperator* BO = dyn_cast<BinaryOperator>(U))
+        newVal = BinaryOperator::create(BO->getOpcode(), s1, s2,
+                                        BO->getName()+".gvnpre",
+                                        (*PI)->getTerminator());
+      else if (CmpInst* C = dyn_cast<CmpInst>(U))
+        newVal = CmpInst::create(C->getOpcode(), C->getPredicate(), s1, s2,
+                                 C->getName()+".gvnpre", 
+                                 (*PI)->getTerminator());
+      else if (ShuffleVectorInst* S = dyn_cast<ShuffleVectorInst>(U))
+        newVal = new ShuffleVectorInst(s1, s2, s3, S->getName()+".gvnpre",
+                                       (*PI)->getTerminator());
+      else if (InsertElementInst* S = dyn_cast<InsertElementInst>(U))
+        newVal = new InsertElementInst(s1, s2, s3, S->getName()+".gvnpre",
+                                       (*PI)->getTerminator());
+      else if (ExtractElementInst* S = dyn_cast<ExtractElementInst>(U))
+        newVal = new ExtractElementInst(s1, s2, S->getName()+".gvnpre",
+                                        (*PI)->getTerminator());
+      else if (SelectInst* S = dyn_cast<SelectInst>(U))
+        newVal = new SelectInst(s1, s2, s3, S->getName()+".gvnpre",
+                                (*PI)->getTerminator());
+      else if (CastInst* C = dyn_cast<CastInst>(U))
+        newVal = CastInst::create(C->getOpcode(), s1, C->getType(),
+                                  C->getName()+".gvnpre", 
+                                  (*PI)->getTerminator());
+      else if (GetElementPtrInst* G = dyn_cast<GetElementPtrInst>(U))
+        newVal = new GetElementPtrInst(s1, &sVarargs[0], sVarargs.size(), 
+                                       G->getName()+".gvnpre", 
+                                       (*PI)->getTerminator());
+                                
+                  
+      // The new instruction shares the original expression's value number.
+      VN.add(newVal, VN.lookup(U));
+                  
+      ValueNumberedSet& predAvail = availableOut[*PI];
+      val_replace(predAvail, newVal);
+      val_replace(new_sets[*PI], newVal);
+      predAvail.set(VN.lookup(newVal));
+            
+      std::map<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+      if (av != avail.end())
+        avail.erase(av);
+      avail.insert(std::make_pair(*PI, newVal));
+                  
+      ++NumInsertedVals;
+    }
+  }
+              
+  // Join the per-predecessor values with a PHI at the top of BB; it becomes
+  // the new leader for e's value number in this block.
+  PHINode* p = 0;
+              
+  for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE; ++PI) {
+    if (p == 0)
+      p = new PHINode(avail[*PI]->getType(), "gvnpre-join", BB->begin());
+    
+    p->addIncoming(avail[*PI], *PI);
+  }
+
+  VN.add(p, VN.lookup(e));
+  val_replace(availableOut[BB], p);
+  availableOut[BB].set(VN.lookup(e));
+  generatedPhis[BB].insert(p);
+  generatedPhis[BB].set(VN.lookup(e));
+  new_sets[BB].insert(p);
+  new_sets[BB].set(VN.lookup(e));
+              
+  ++NumInsertedPhis;
+}
+
+/// insertion_mergepoint - When walking the dom tree, check at each merge
+/// block for the possibility of a partial redundancy.  If present, eliminate it
+///
+/// The return value is a bitmask: bit 0 set if the function was changed,
+/// bit 1 set if new values were inserted (forcing another insertion pass).
+unsigned GVNPRE::insertion_mergepoint(std::vector<Value*>& workList,
+                                      df_iterator<DomTreeNode*>& D,
+                    std::map<BasicBlock*, ValueNumberedSet >& new_sets) {
+  bool changed_function = false;
+  bool new_stuff = false;
+  
+  BasicBlock* BB = D->getBlock();
+  // workList is ANTIC_IN in topological order, so operands are processed
+  // before expressions that use them.
+  for (unsigned i = 0; i < workList.size(); ++i) {
+    Value* e = workList[i];
+          
+    if (isa<BinaryOperator>(e) || isa<CmpInst>(e) ||
+        isa<ExtractElementInst>(e) || isa<InsertElementInst>(e) ||
+        isa<ShuffleVectorInst>(e) || isa<SelectInst>(e) || isa<CastInst>(e) ||
+        isa<GetElementPtrInst>(e) {
+      // Already fully available from the dominator: nothing partial here.
+      if (availableOut[D->getIDom()->getBlock()].test(VN.lookup(e)))
+        continue;
+            
+      std::map<BasicBlock*, Value*> avail;
+      bool by_some = false;
+      bool all_same = true;
+      Value * first_s = 0;
+            
+      // Determine, per predecessor, whether a leader for the phi-translated
+      // expression exists.  Partial redundancy = available along some but
+      // not all (or not the same) incoming paths.
+      for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;
+           ++PI) {
+        Value *e2 = phi_translate(e, *PI, BB);
+        Value *e3 = find_leader(availableOut[*PI], VN.lookup(e2));
+              
+        if (e3 == 0) {
+          std::map<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+          if (av != avail.end())
+            avail.erase(av);
+          avail.insert(std::make_pair(*PI, e2));
+          all_same = false;
+        } else {
+          std::map<BasicBlock*, Value*>::iterator av = avail.find(*PI);
+          if (av != avail.end())
+            avail.erase(av);
+          avail.insert(std::make_pair(*PI, e3));
+                
+          by_some = true;
+          if (first_s == 0)
+            first_s = e3;
+          else if (first_s != e3)
+            all_same = false;
+        }
+      }
+            
+      // Skip if a join PHI for this value number was already generated.
+      if (by_some && !all_same &&
+          !generatedPhis[BB].test(VN.lookup(e))) {
+        insertion_pre(e, BB, avail, new_sets);
+              
+        changed_function = true;
+        new_stuff = true;
+      }
+    }
+  }
+  
+  unsigned retval = 0;
+  if (changed_function)
+    retval += 1;
+  if (new_stuff)
+    retval += 2;
+  
+  return retval;
+}
+
+/// insert - Phase 2 of the main algorithm.  Walk the dominator tree looking for
+/// merge points.  When one is found, check for a partial redundancy.  If one is
+/// present, eliminate it.  Repeat this walk until no changes are made.
+bool GVNPRE::insertion(Function& F) {
+  bool changed_function = false;
+
+  DominatorTree &DT = getAnalysis<DominatorTree>();  
+  
+  // new_sets tracks values created during this phase, so they can be
+  // propagated down the dominator tree as inherited leaders.
+  std::map<BasicBlock*, ValueNumberedSet> new_sets;
+  bool new_stuff = true;
+  while (new_stuff) {
+    new_stuff = false;
+    for (df_iterator<DomTreeNode*> DI = df_begin(DT.getRootNode()),
+         E = df_end(DT.getRootNode()); DI != E; ++DI) {
+      BasicBlock* BB = DI->getBlock();
+      
+      if (BB == 0)
+        continue;
+      
+      ValueNumberedSet& availOut = availableOut[BB];
+      ValueNumberedSet& anticIn = anticipatedIn[BB];
+      
+      // Replace leaders with leaders inherited from dominator
+      if (DI->getIDom() != 0) {
+        ValueNumberedSet& dom_set = new_sets[DI->getIDom()->getBlock()];
+        for (ValueNumberedSet::iterator I = dom_set.begin(),
+             E = dom_set.end(); I != E; ++I) {
+          val_replace(new_sets[BB], *I);
+          val_replace(availOut, *I);
+        }
+      }
+      
+      // If there is more than one predecessor...
+      if (pred_begin(BB) != pred_end(BB) && ++pred_begin(BB) != pred_end(BB)) {
+        // Process ANTIC_IN in topological order so operand expressions are
+        // made available before the expressions that consume them.
+        std::vector<Value*> workList;
+        workList.reserve(anticIn.size());
+        topo_sort(anticIn, workList);
+        
+        unsigned result = insertion_mergepoint(workList, DI, new_sets);
+        if (result & 1)
+          changed_function = true;
+        if (result & 2)
+          new_stuff = true;
+      }
+    }
+  }
+  
+  return changed_function;
+}
+
+// GVNPRE::runOnFunction - This is the main transformation entry point for a
+// function.
+//
+// Runs the four GVN-PRE phases in order: build availability/anticipation
+// sets, insert values to convert partial redundancies into full ones,
+// eliminate full redundancies, and clean up placeholder expressions.
+bool GVNPRE::runOnFunction(Function &F) {
+  // Clean out global sets from any previous functions
+  VN.clear();
+  createdExpressions.clear();
+  availableOut.clear();
+  anticipatedIn.clear();
+  generatedPhis.clear();
+ 
+  bool changed_function = false;
+  
+  // Phase 1: BuildSets
+  // This phase calculates the AVAIL_OUT and ANTIC_IN sets
+  buildsets(F);
+  
+  // Phase 2: Insert
+  // This phase inserts values to make partially redundant values
+  // fully redundant
+  changed_function |= insertion(F);
+  
+  // Phase 3: Eliminate
+  // This phase performs trivial full redundancy elimination
+  changed_function |= elimination();
+  
+  // Phase 4: Cleanup
+  // This phase cleans up values that were created solely
+  // as leaders for expressions
+  cleanup();
+  
+  return changed_function;
+}
diff --git a/lib/Transforms/Scalar/IndVarSimplify.cpp b/lib/Transforms/Scalar/IndVarSimplify.cpp
new file mode 100644
index 0000000..01b7481
--- /dev/null
+++ b/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -0,0 +1,604 @@
+//===- IndVarSimplify.cpp - Induction Variable Elimination ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation analyzes and transforms the induction variables (and
+// computations derived from them) into simpler forms suitable for subsequent
+// analysis and transformation.
+//
+// This transformation makes the following changes to each loop with an
+// identifiable induction variable:
+//   1. All loops are transformed to have a SINGLE canonical induction variable
+//      which starts at zero and steps by one.
+//   2. The canonical induction variable is guaranteed to be the first PHI node
+//      in the loop header block.
+//   3. Any pointer arithmetic recurrences are raised to use array subscripts.
+//
+// If the trip count of a loop is computable, this pass also makes the following
+// changes:
+//   1. The exit condition for the loop is canonicalized to compare the
+//      induction value against the exit value.  This turns loops like:
+//        'for (i = 7; i*i < 1000; ++i)' into 'for (i = 0; i != 25; ++i)'
+//   2. Any use outside of the loop of an expression derived from the indvar
+//      is changed to compute the derived value outside of the loop, eliminating
+//      the dependence on the exit value of the induction variable.  If the only
+//      purpose of the loop is to compute the exit value of some derived
+//      expression, this transformation will make the loop dead.
+//
+// This transformation should be followed by strength reduction after all of the
+// desired loop transformations have been performed.  Additionally, on targets
+// where it is profitable, the loop could be transformed to count down to zero
+// (the "do loop" optimization).
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "indvars"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Type.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumRemoved , "Number of aux indvars removed");
+STATISTIC(NumPointer , "Number of pointer indvars promoted");
+STATISTIC(NumInserted, "Number of canonical indvars added");
+STATISTIC(NumReplaced, "Number of exit values replaced");
+STATISTIC(NumLFTR    , "Number of loop exit tests replaced");
+
+namespace {
+  /// IndVarSimplify - Canonicalize the induction variables of each loop:
+  /// a single canonical indvar starting at zero stepping by one, with exit
+  /// tests and exit values rewritten in terms of it where computable.
+  class VISIBILITY_HIDDEN IndVarSimplify : public LoopPass {
+    LoopInfo        *LI;   // Loop analysis for the current function.
+    ScalarEvolution *SE;   // SCEV analysis used to model recurrences.
+    bool Changed;          // Set when any transformation is performed.
+  public:
+
+   static char ID; // Pass identification, replacement for typeid
+   IndVarSimplify() : LoopPass((intptr_t)&ID) {}
+
+   bool runOnLoop(Loop *L, LPPassManager &LPM);
+   bool doInitialization(Loop *L, LPPassManager &LPM);
+   virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+     // Requires loops in LCSSA/simplified form and preserves both, along
+     // with the CFG (no blocks are added or removed).
+     AU.addRequiredID(LCSSAID);
+     AU.addRequiredID(LoopSimplifyID);
+     AU.addRequired<ScalarEvolution>();
+     AU.addRequired<LoopInfo>();
+     AU.addPreservedID(LoopSimplifyID);
+     AU.addPreservedID(LCSSAID);
+     AU.setPreservesCFG();
+   }
+
+  private:
+
+    void EliminatePointerRecurrence(PHINode *PN, BasicBlock *Preheader,
+                                    std::set<Instruction*> &DeadInsts);
+    Instruction *LinearFunctionTestReplace(Loop *L, SCEV *IterationCount,
+                                           SCEVExpander &RW);
+    void RewriteLoopExitValues(Loop *L);
+
+    void DeleteTriviallyDeadInstructions(std::set<Instruction*> &Insts);
+  };
+
+  char IndVarSimplify::ID = 0;
+  RegisterPass<IndVarSimplify> X("indvars", "Canonicalize Induction Variables");
+}
+
+// createIndVarSimplifyPass - Public factory for the indvars pass; ownership
+// of the returned pass transfers to the caller (the pass manager).
+LoopPass *llvm::createIndVarSimplifyPass() {
+  return new IndVarSimplify();
+}
+
+/// DeleteTriviallyDeadInstructions - If any of the instructions is the
+/// specified set are trivially dead, delete them and see if this makes any of
+/// their operands subsequently dead.
+///
+/// Worklist algorithm: deleting an instruction may render its operands dead,
+/// so those operands are re-queued for inspection.
+void IndVarSimplify::
+DeleteTriviallyDeadInstructions(std::set<Instruction*> &Insts) {
+  while (!Insts.empty()) {
+    Instruction *I = *Insts.begin();
+    Insts.erase(Insts.begin());
+    if (isInstructionTriviallyDead(I)) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *U = dyn_cast<Instruction>(I->getOperand(i)))
+          Insts.insert(U);
+      // Keep ScalarEvolution's internal maps consistent before erasing.
+      SE->deleteValueFromRecords(I);
+      DOUT << "INDVARS: Deleting: " << *I;
+      I->eraseFromParent();
+      Changed = true;
+    }
+  }
+}
+
+
+/// EliminatePointerRecurrence - Check to see if this is a trivial GEP pointer
+/// recurrence.  If so, change it into an integer recurrence, permitting
+/// analysis by the SCEV routines.
+void IndVarSimplify::EliminatePointerRecurrence(PHINode *PN,
+                                                BasicBlock *Preheader,
+                                            std::set<Instruction*> &DeadInsts) {
+  assert(PN->getNumIncomingValues() == 2 && "Noncanonicalized loop!");
+  unsigned PreheaderIdx = PN->getBasicBlockIndex(Preheader);
+  // With exactly two incoming values, XOR with 1 flips 0<->1 to pick the
+  // backedge index.
+  unsigned BackedgeIdx = PreheaderIdx^1;
+  if (GetElementPtrInst *GEPI =
+          dyn_cast<GetElementPtrInst>(PN->getIncomingValue(BackedgeIdx)))
+    if (GEPI->getOperand(0) == PN) {
+      assert(GEPI->getNumOperands() == 2 && "GEP types must match!");
+      DOUT << "INDVARS: Eliminating pointer recurrence: " << *GEPI;
+      
+      // Okay, we found a pointer recurrence.  Transform this pointer
+      // recurrence into an integer recurrence.  Compute the value that gets
+      // added to the pointer at every iteration.
+      Value *AddedVal = GEPI->getOperand(1);
+
+      // Insert a new integer PHI node into the top of the block.
+      PHINode *NewPhi = new PHINode(AddedVal->getType(),
+                                    PN->getName()+".rec", PN);
+      NewPhi->addIncoming(Constant::getNullValue(NewPhi->getType()), Preheader);
+
+      // Create the new add instruction.
+      Value *NewAdd = BinaryOperator::createAdd(NewPhi, AddedVal,
+                                                GEPI->getName()+".rec", GEPI);
+      NewPhi->addIncoming(NewAdd, PN->getIncomingBlock(BackedgeIdx));
+
+      // Update the existing GEP to use the recurrence.
+      GEPI->setOperand(0, PN->getIncomingValue(PreheaderIdx));
+
+      // Update the GEP to use the new recurrence we just inserted.
+      GEPI->setOperand(1, NewAdd);
+
+      // If the incoming value is a constant expr GEP, try peeling out the array
+      // 0 index if possible to make things simpler.
+      if (ConstantExpr *CE = dyn_cast<ConstantExpr>(GEPI->getOperand(0)))
+        if (CE->getOpcode() == Instruction::GetElementPtr) {
+          unsigned NumOps = CE->getNumOperands();
+          assert(NumOps > 1 && "CE folding didn't work!");
+          if (CE->getOperand(NumOps-1)->isNullValue()) {
+            // Check to make sure the last index really is an array index.
+            gep_type_iterator GTI = gep_type_begin(CE);
+            for (unsigned i = 1, e = CE->getNumOperands()-1;
+                 i != e; ++i, ++GTI)
+              /*empty*/;
+            if (isa<SequentialType>(*GTI)) {
+              // Pull the last index out of the constant expr GEP.
+              SmallVector<Value*, 8> CEIdxs(CE->op_begin()+1, CE->op_end()-1);
+              Constant *NCE = ConstantExpr::getGetElementPtr(CE->getOperand(0),
+                                                             &CEIdxs[0],
+                                                             CEIdxs.size());
+              GetElementPtrInst *NGEPI = new GetElementPtrInst(
+                  NCE, Constant::getNullValue(Type::Int32Ty), NewAdd, 
+                  GEPI->getName(), GEPI);
+              SE->deleteValueFromRecords(GEPI);
+              GEPI->replaceAllUsesWith(NGEPI);
+              GEPI->eraseFromParent();
+              GEPI = NGEPI;
+            }
+          }
+        }
+
+
+      // Finally, if there are any other users of the PHI node, we must
+      // insert a new GEP instruction that uses the pre-incremented version
+      // of the induction amount.
+      if (!PN->use_empty()) {
+        // Insert after the PHI group at the top of the block.
+        BasicBlock::iterator InsertPos = PN; ++InsertPos;
+        while (isa<PHINode>(InsertPos)) ++InsertPos;
+        Value *PreInc =
+          new GetElementPtrInst(PN->getIncomingValue(PreheaderIdx),
+                                NewPhi, "", InsertPos);
+        PreInc->takeName(PN);
+        PN->replaceAllUsesWith(PreInc);
+      }
+
+      // Delete the old PHI for sure, and the GEP if its otherwise unused.
+      DeadInsts.insert(PN);
+
+      ++NumPointer;
+      Changed = true;
+    }
+}
+
+/// LinearFunctionTestReplace - This method rewrites the exit condition of the
+/// loop to be a canonical != comparison against the incremented loop induction
+/// variable.  This pass is able to rewrite the exit tests of any loop where the
+/// SCEV analysis can determine a loop-invariant trip count of the loop, which
+/// is actually a much broader range than just linear tests.
+///
+/// This method returns a "potentially dead" instruction whose computation chain
+/// should be deleted when convenient.
+Instruction *IndVarSimplify::LinearFunctionTestReplace(Loop *L,
+                                                       SCEV *IterationCount,
+                                                       SCEVExpander &RW) {
+  // Find the exit block for the loop.  We can currently only handle loops with
+  // a single exit.
+  std::vector<BasicBlock*> ExitBlocks;
+  L->getExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() != 1) return 0;
+  BasicBlock *ExitBlock = ExitBlocks[0];
+
+  // Make sure there is only one predecessor block in the loop.
+  BasicBlock *ExitingBlock = 0;
+  for (pred_iterator PI = pred_begin(ExitBlock), PE = pred_end(ExitBlock);
+       PI != PE; ++PI)
+    if (L->contains(*PI)) {
+      if (ExitingBlock == 0)
+        ExitingBlock = *PI;
+      else
+        return 0;  // Multiple exits from loop to this block.
+    }
+  assert(ExitingBlock && "Loop info is broken");
+
+  if (!isa<BranchInst>(ExitingBlock->getTerminator()))
+    return 0;  // Can't rewrite non-branch yet
+  BranchInst *BI = cast<BranchInst>(ExitingBlock->getTerminator());
+  assert(BI->isConditional() && "Must be conditional to be part of loop!");
+
+  // The old condition may become dead once we replace it below; hand it back
+  // to the caller for deferred cleanup.
+  Instruction *PotentiallyDeadInst = dyn_cast<Instruction>(BI->getCondition());
+  
+  // If the exiting block is not the same as the backedge block, we must compare
+  // against the preincremented value, otherwise we prefer to compare against
+  // the post-incremented value.
+  BasicBlock *Header = L->getHeader();
+  // Find the backedge predecessor of the header (a LoopSimplify'd loop has
+  // exactly two header preds: the preheader and the latch).
+  pred_iterator HPI = pred_begin(Header);
+  assert(HPI != pred_end(Header) && "Loop with zero preds???");
+  if (!L->contains(*HPI)) ++HPI;
+  assert(HPI != pred_end(Header) && L->contains(*HPI) &&
+         "No backedge in loop?");
+
+  SCEVHandle TripCount = IterationCount;
+  Value *IndVar;
+  if (*HPI == ExitingBlock) {
+    // The IterationCount expression contains the number of times that the
+    // backedge actually branches to the loop header.  This is one less than the
+    // number of times the loop executes, so add one to it.
+    ConstantInt *OneC = ConstantInt::get(IterationCount->getType(), 1);
+    TripCount = SCEVAddExpr::get(IterationCount, SCEVConstant::get(OneC));
+    IndVar = L->getCanonicalInductionVariableIncrement();
+  } else {
+    // We have to use the preincremented value...
+    IndVar = L->getCanonicalInductionVariable();
+  }
+  
+  DOUT << "INDVARS: LFTR: TripCount = " << *TripCount
+       << "  IndVar = " << *IndVar << "\n";
+
+  // Expand the code for the iteration count into the preheader of the loop.
+  BasicBlock *Preheader = L->getLoopPreheader();
+  Value *ExitCnt = RW.expandCodeFor(TripCount, Preheader->getTerminator());
+
+  // Insert a new icmp_ne or icmp_eq instruction before the branch.
+  // NE when the true edge stays in the loop, EQ when it exits.
+  ICmpInst::Predicate Opcode;
+  if (L->contains(BI->getSuccessor(0)))
+    Opcode = ICmpInst::ICMP_NE;
+  else
+    Opcode = ICmpInst::ICMP_EQ;
+
+  Value *Cond = new ICmpInst(Opcode, IndVar, ExitCnt, "exitcond", BI);
+  BI->setCondition(Cond);
+  ++NumLFTR;
+  Changed = true;
+  return PotentiallyDeadInst;
+}
+
+
+/// RewriteLoopExitValues - Check to see if this loop has a computable
+/// loop-invariant execution count.  If so, this means that we can compute the
+/// final value of any expressions that are recurrent in the loop, and
+/// substitute the exit values from the loop into any instructions outside of
+/// the loop that use the final values of the current expressions.
+void IndVarSimplify::RewriteLoopExitValues(Loop *L) {
+  BasicBlock *Preheader = L->getLoopPreheader();
+
+  // Scan all of the instructions in the loop, looking at those that have
+  // extra-loop users and which are recurrences.
+  SCEVExpander Rewriter(*SE, *LI);
+
+  // We insert the code into the preheader of the loop if the loop contains
+  // multiple exit blocks, or in the exit block if there is exactly one.
+  BasicBlock *BlockToInsertInto;
+  std::vector<BasicBlock*> ExitBlocks;
+  L->getUniqueExitBlocks(ExitBlocks);
+  if (ExitBlocks.size() == 1)
+    BlockToInsertInto = ExitBlocks[0];
+  else
+    BlockToInsertInto = Preheader;
+  // Expanded code must be inserted after any PHI nodes in the block.
+  BasicBlock::iterator InsertPt = BlockToInsertInto->begin();
+  while (isa<PHINode>(InsertPt)) ++InsertPt;
+
+  bool HasConstantItCount = isa<SCEVConstant>(SE->getIterationCount(L));
+
+  std::set<Instruction*> InstructionsToDelete;
+  // Cache of already-expanded exit values, keyed by the in-loop instruction,
+  // so each exit value is only expanded once even if several PHIs use it.
+  std::map<Instruction*, Value*> ExitValues;
+
+  // Find all values that are computed inside the loop, but used outside of it.
+  // Because of LCSSA, these values will only occur in LCSSA PHI Nodes.  Scan
+  // the exit blocks of the loop to find them.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+    BasicBlock *ExitBB = ExitBlocks[i];
+    
+    // If there are no PHI nodes in this exit block, then no values defined
+    // inside the loop are used on this path, skip it.
+    PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+    if (!PN) continue;
+    
+    // Every PHI node in a block has the same number of incoming edges, so
+    // this count from the first PHI is valid for all PHIs visited below.
+    unsigned NumPreds = PN->getNumIncomingValues();
+    
+    // Iterate over all of the PHI nodes.
+    BasicBlock::iterator BBI = ExitBB->begin();
+    while ((PN = dyn_cast<PHINode>(BBI++))) {
+      
+      // Iterate over all of the values in all the PHI nodes.
+      for (unsigned i = 0; i != NumPreds; ++i) {
+        // If the value being merged in is not integer or is not defined
+        // in the loop, skip it.
+        Value *InVal = PN->getIncomingValue(i);
+        if (!isa<Instruction>(InVal) ||
+            // SCEV only supports integer expressions for now.
+            !isa<IntegerType>(InVal->getType()))
+          continue;
+
+        // If this pred is for a subloop, not L itself, skip it.
+        if (LI->getLoopFor(PN->getIncomingBlock(i)) != L) 
+          continue; // The Block is in a subloop, skip it.
+
+        // Check that InVal is defined in the loop.
+        Instruction *Inst = cast<Instruction>(InVal);
+        if (!L->contains(Inst->getParent()))
+          continue;
+        
+        // We require that this value either have a computable evolution or that
+        // the loop have a constant iteration count.  In the case where the loop
+        // has a constant iteration count, we can sometimes force evaluation of
+        // the exit value through brute force.
+        SCEVHandle SH = SE->getSCEV(Inst);
+        if (!SH->hasComputableLoopEvolution(L) && !HasConstantItCount)
+          continue;          // Cannot get exit evolution for the loop value.
+        
+        // Okay, this instruction has a user outside of the current loop
+        // and varies predictably *inside* the loop.  Evaluate the value it
+        // contains when the loop exits, if possible.
+        SCEVHandle ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+        if (isa<SCEVCouldNotCompute>(ExitValue) ||
+            !ExitValue->isLoopInvariant(L))
+          continue;
+
+        Changed = true;
+        ++NumReplaced;
+        
+        // See if we already computed the exit value for the instruction, if so,
+        // just reuse it.
+        Value *&ExitVal = ExitValues[Inst];
+        if (!ExitVal)
+          ExitVal = Rewriter.expandCodeFor(ExitValue, InsertPt);
+        
+        DOUT << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
+             << "  LoopVal = " << *Inst << "\n";
+
+        PN->setIncomingValue(i, ExitVal);
+        
+        // If this instruction is dead now, schedule it to be removed.
+        if (Inst->use_empty())
+          InstructionsToDelete.insert(Inst);
+        
+        // See if this is a single-entry LCSSA PHI node.  If so, we can (and
+        // have to) remove
+        // the PHI entirely.  This is safe, because the NewVal won't be variant
+        // in the loop, so we don't need an LCSSA phi node anymore.
+        if (NumPreds == 1) {
+          SE->deleteValueFromRecords(PN);
+          PN->replaceAllUsesWith(ExitVal);
+          PN->eraseFromParent();
+          break;
+        }
+      }
+    }
+  }
+  
+  DeleteTriviallyDeadInstructions(InstructionsToDelete);
+}
+
+/// doInitialization - Before the main pass runs, convert any trivial GEP
+/// pointer recurrences rooted in the loop header into integer recurrences,
+/// which the SCEV routines are able to analyze.
+bool IndVarSimplify::doInitialization(Loop *L, LPPassManager &LPM) {
+  Changed = false;
+
+  BasicBlock *Header    = L->getHeader();
+  BasicBlock *Preheader = L->getLoopPreheader();
+  SE = &LPM.getAnalysis<ScalarEvolution>();
+
+  // Walk the leading PHI nodes of the header; the pointer-typed ones are the
+  // candidate pointer recurrences.
+  std::set<Instruction*> PossiblyDead;
+  for (BasicBlock::iterator It = Header->begin(); isa<PHINode>(It); ++It)
+    if (isa<PointerType>(It->getType()))
+      EliminatePointerRecurrence(cast<PHINode>(It), Preheader, PossiblyDead);
+
+  // Clean up anything that became trivially dead during the conversion.
+  if (!PossiblyDead.empty())
+    DeleteTriviallyDeadInstructions(PossiblyDead);
+
+  return Changed;
+}
+
+/// runOnLoop - Simplify the induction variables of the given loop: rewrite
+/// computable loop exit values, insert a canonical induction variable of the
+/// largest required width, rewrite all affine induction variables in terms of
+/// it, and canonicalize the loop exit test where possible.  Returns true if
+/// the loop was changed.  The loop must be in LCSSA form on entry and is left
+/// in LCSSA form on exit.
+bool IndVarSimplify::runOnLoop(Loop *L, LPPassManager &LPM) {
+  LI = &getAnalysis<LoopInfo>();
+  SE = &getAnalysis<ScalarEvolution>();
+
+  Changed = false;
+  BasicBlock *Header    = L->getHeader();
+  std::set<Instruction*> DeadInsts;
+
+  // Verify the input to the pass in already in LCSSA form.
+  assert(L->isLCSSAForm());
+
+  // Check to see if this loop has a computable loop-invariant execution count.
+  // If so, this means that we can compute the final value of any expressions
+  // that are recurrent in the loop, and substitute the exit values from the
+  // loop into any instructions outside of the loop that use the final values of
+  // the current expressions.
+  //
+  SCEVHandle IterationCount = SE->getIterationCount(L);
+  if (!isa<SCEVCouldNotCompute>(IterationCount))
+    RewriteLoopExitValues(L);
+
+  // Next, analyze all of the induction variables in the loop, canonicalizing
+  // auxiliary induction variables.
+  std::vector<std::pair<PHINode*, SCEVHandle> > IndVars;
+
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    if (PN->getType()->isInteger()) { // FIXME: when we have fast-math, enable!
+      SCEVHandle SCEV = SE->getSCEV(PN);
+      if (SCEV->hasComputableLoopEvolution(L))
+        // FIXME: It is an extremely bad idea to indvar substitute anything more
+        // complex than affine induction variables.  Doing so will put expensive
+        // polynomial evaluations inside of the loop, and the str reduction pass
+        // currently can only reduce affine polynomials.  For now just disable
+        // indvar subst on anything more complex than an affine addrec.
+        if (SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(SCEV))
+          if (AR->isAffine())
+            IndVars.push_back(std::make_pair(PN, SCEV));
+    }
+  }
+
+  // If there are no induction variables in the loop, there is nothing more to
+  // do.
+  if (IndVars.empty()) {
+    // Actually, if we know how many times the loop iterates, let's insert a
+    // canonical induction variable to help subsequent passes.
+    if (!isa<SCEVCouldNotCompute>(IterationCount)) {
+      SCEVExpander Rewriter(*SE, *LI);
+      Rewriter.getOrInsertCanonicalInductionVariable(L,
+                                                     IterationCount->getType());
+      if (Instruction *I = LinearFunctionTestReplace(L, IterationCount,
+                                                     Rewriter)) {
+        std::set<Instruction*> InstructionsToDelete;
+        InstructionsToDelete.insert(I);
+        DeleteTriviallyDeadInstructions(InstructionsToDelete);
+      }
+    }
+    return Changed;
+  }
+
+  // Compute the type of the largest recurrence expression.
+  //
+  const Type *LargestType = IndVars[0].first->getType();
+  bool DifferingSizes = false;
+  for (unsigned i = 1, e = IndVars.size(); i != e; ++i) {
+    const Type *Ty = IndVars[i].first->getType();
+    DifferingSizes |= 
+      Ty->getPrimitiveSizeInBits() != LargestType->getPrimitiveSizeInBits();
+    if (Ty->getPrimitiveSizeInBits() > LargestType->getPrimitiveSizeInBits())
+      LargestType = Ty;
+  }
+
+  // Create a rewriter object which we'll use to transform the code with.
+  SCEVExpander Rewriter(*SE, *LI);
+
+  // Now that we know the largest of the induction variables in this loop,
+  // insert a canonical induction variable of the largest size.
+  Value *IndVar = Rewriter.getOrInsertCanonicalInductionVariable(L,LargestType);
+  ++NumInserted;
+  Changed = true;
+  DOUT << "INDVARS: New CanIV: " << *IndVar;
+
+  // If the trip count is computable, widen or narrow it to the width of the
+  // canonical induction variable and rewrite the loop exit test against it.
+  if (!isa<SCEVCouldNotCompute>(IterationCount)) {
+    if (IterationCount->getType()->getPrimitiveSizeInBits() <
+        LargestType->getPrimitiveSizeInBits())
+      IterationCount = SCEVZeroExtendExpr::get(IterationCount, LargestType);
+    else if (IterationCount->getType() != LargestType)
+      IterationCount = SCEVTruncateExpr::get(IterationCount, LargestType);
+    if (Instruction *DI = LinearFunctionTestReplace(L, IterationCount,Rewriter))
+      DeadInsts.insert(DI);
+  }
+
+  // Now that we have a canonical induction variable, we can rewrite any
+  // recurrences in terms of the induction variable.  Start with the auxiliary
+  // induction variables, and recursively rewrite any of their uses.
+  BasicBlock::iterator InsertPt = Header->begin();
+  while (isa<PHINode>(InsertPt)) ++InsertPt;
+
+  // If there were induction variables of other sizes, cast the primary
+  // induction variable to the right size for them, avoiding the need for the
+  // code evaluation methods to insert induction variables of different sizes.
+  if (DifferingSizes) {
+    SmallVector<unsigned,4> InsertedSizes;
+    InsertedSizes.push_back(LargestType->getPrimitiveSizeInBits());
+    for (unsigned i = 0, e = IndVars.size(); i != e; ++i) {
+      unsigned ithSize = IndVars[i].first->getType()->getPrimitiveSizeInBits();
+      if (std::find(InsertedSizes.begin(), InsertedSizes.end(), ithSize)
+          == InsertedSizes.end()) {
+        PHINode *PN = IndVars[i].first;
+        InsertedSizes.push_back(ithSize);
+        Instruction *New = new TruncInst(IndVar, PN->getType(), "indvar",
+                                         InsertPt);
+        Rewriter.addInsertedValue(New, SE->getSCEV(New));
+        DOUT << "INDVARS: Made trunc IV for " << *PN
+             << "   NewVal = " << *New << "\n";
+      }
+    }
+  }
+
+  // Rewrite all induction variables in terms of the canonical induction
+  // variable.
+  while (!IndVars.empty()) {
+    PHINode *PN = IndVars.back().first;
+    Value *NewVal = Rewriter.expandCodeFor(IndVars.back().second, InsertPt);
+    DOUT << "INDVARS: Rewrote IV '" << *IndVars.back().second << "' " << *PN
+         << "   into = " << *NewVal << "\n";
+    NewVal->takeName(PN);
+
+    // Replace the old PHI Node with the inserted computation.
+    PN->replaceAllUsesWith(NewVal);
+    DeadInsts.insert(PN);
+    IndVars.pop_back();
+    ++NumRemoved;
+    Changed = true;
+  }
+
+#if 0
+  // Now replace all derived expressions in the loop body with simpler
+  // expressions.
+  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i)
+    if (LI->getLoopFor(L->getBlocks()[i]) == L) {  // Not in a subloop...
+      BasicBlock *BB = L->getBlocks()[i];
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (I->getType()->isInteger() &&      // Is an integer instruction
+            !I->use_empty() &&
+            !Rewriter.isInsertedInstruction(I)) {
+          SCEVHandle SH = SE->getSCEV(I);
+          Value *V = Rewriter.expandCodeFor(SH, I, I->getType());
+          if (V != I) {
+            if (isa<Instruction>(V))
+              V->takeName(I);
+            I->replaceAllUsesWith(V);
+            DeadInsts.insert(I);
+            ++NumRemoved;
+            Changed = true;
+          }
+        }
+    }
+#endif
+
+  DeleteTriviallyDeadInstructions(DeadInsts);
+  
+  assert(L->isLCSSAForm());
+  return Changed;
+}
diff --git a/lib/Transforms/Scalar/InstructionCombining.cpp b/lib/Transforms/Scalar/InstructionCombining.cpp
new file mode 100644
index 0000000..816a1c6
--- /dev/null
+++ b/lib/Transforms/Scalar/InstructionCombining.cpp
@@ -0,0 +1,10090 @@
+//===- InstructionCombining.cpp - Combine multiple instructions -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// InstructionCombining - Combine instructions to form fewer, simple
+// instructions.  This pass does not modify the CFG.  This pass is where
+// simplification happens.
+//
+// This pass combines things like:
+//    %Y = add i32 %X, 1
+//    %Z = add i32 %Y, 1
+// into:
+//    %Z = add i32 %X, 2
+//
+// This is a simple worklist driven algorithm.
+//
+// This pass guarantees that the following canonicalizations are performed on
+// the program:
+//    1. If a binary operator has a constant operand, it is moved to the RHS
+//    2. Bitwise operators with constant operands are always grouped so that
+//       shifts are performed first, then or's, then and's, then xor's.
+//    3. Compare instructions are converted from <,>,<=,>= to ==,!= if possible
+//    4. All cmp instructions on boolean values are replaced with logical ops
+//    5. add X, X is represented as (X*2) => (X << 1)
+//    6. Multiplies with a power-of-two constant argument are transformed into
+//       shifts.
+//   ... etc.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "instcombine"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/PatternMatch.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <sstream>
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumCombined , "Number of insts combined");
+STATISTIC(NumConstProp, "Number of constant folds");
+STATISTIC(NumDeadInst , "Number of dead inst eliminated");
+STATISTIC(NumDeadStore, "Number of dead stores eliminated");
+STATISTIC(NumSunkInst , "Number of instructions sunk");
+
+namespace {
+  class VISIBILITY_HIDDEN InstCombiner
+    : public FunctionPass,
+      public InstVisitor<InstCombiner, Instruction*> {
+    // Worklist of all of the instructions that need to be simplified.
+    // Worklist may contain null slots (lazily cleared by RemoveFromWorkList);
+    // WorklistMap maps each live instruction to its index in Worklist.
+    std::vector<Instruction*> Worklist;
+    DenseMap<Instruction*, unsigned> WorklistMap;
+    TargetData *TD;
+    bool MustPreserveLCSSA;
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    InstCombiner() : FunctionPass((intptr_t)&ID) {}
+
+    /// AddToWorkList - Add the specified instruction to the worklist if it
+    /// isn't already in it.
+    void AddToWorkList(Instruction *I) {
+      if (WorklistMap.insert(std::make_pair(I, Worklist.size())))
+        Worklist.push_back(I);
+    }
+    
+    // RemoveFromWorkList - remove I from the worklist if it exists.
+    void RemoveFromWorkList(Instruction *I) {
+      DenseMap<Instruction*, unsigned>::iterator It = WorklistMap.find(I);
+      if (It == WorklistMap.end()) return; // Not in worklist.
+      
+      // Don't bother moving everything down, just null out the slot.
+      Worklist[It->second] = 0;
+      
+      WorklistMap.erase(It);
+    }
+    
+    // RemoveOneFromWorkList - Pop and return the most recently added
+    // instruction.  Callers must skip null entries (lazily removed slots).
+    Instruction *RemoveOneFromWorkList() {
+      Instruction *I = Worklist.back();
+      Worklist.pop_back();
+      WorklistMap.erase(I);
+      return I;
+    }
+
+    
+    /// AddUsersToWorkList - When an instruction is simplified, add all users of
+    /// the instruction to the work lists because they might get more simplified
+    /// now.
+    ///
+    void AddUsersToWorkList(Value &I) {
+      for (Value::use_iterator UI = I.use_begin(), UE = I.use_end();
+           UI != UE; ++UI)
+        AddToWorkList(cast<Instruction>(*UI));
+    }
+
+    /// AddUsesToWorkList - When an instruction is simplified, add operands to
+    /// the work lists because they might get more simplified now.
+    ///
+    void AddUsesToWorkList(Instruction &I) {
+      for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i)))
+          AddToWorkList(Op);
+    }
+    
+    /// AddSoonDeadInstToWorklist - The specified instruction is about to become
+    /// dead.  Add all of its operands to the worklist, turning them into
+    /// undef's to reduce the number of uses of those instructions.
+    ///
+    /// Return the specified operand before it is turned into an undef.
+    ///
+    Value *AddSoonDeadInstToWorklist(Instruction &I, unsigned op) {
+      Value *R = I.getOperand(op);
+      
+      for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i)
+        if (Instruction *Op = dyn_cast<Instruction>(I.getOperand(i))) {
+          AddToWorkList(Op);
+          // Set the operand to undef to drop the use.
+          I.setOperand(i, UndefValue::get(Op->getType()));
+        }
+      
+      return R;
+    }
+
+  public:
+    virtual bool runOnFunction(Function &F);
+    
+    bool DoOneIteration(Function &F, unsigned ItNum);
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<TargetData>();
+      AU.addPreservedID(LCSSAID);
+      AU.setPreservesCFG();
+    }
+
+    TargetData &getTargetData() const { return *TD; }
+
+    // Visitation implementation - Implement instruction combining for different
+    // instruction types.  The semantics are as follows:
+    // Return Value:
+    //    null        - No change was made
+    //     I          - Change was made, I is still valid, I may be dead though
+    //   otherwise    - Change was made, replace I with returned instruction
+    //
+    Instruction *visitAdd(BinaryOperator &I);
+    Instruction *visitSub(BinaryOperator &I);
+    Instruction *visitMul(BinaryOperator &I);
+    Instruction *visitURem(BinaryOperator &I);
+    Instruction *visitSRem(BinaryOperator &I);
+    Instruction *visitFRem(BinaryOperator &I);
+    Instruction *commonRemTransforms(BinaryOperator &I);
+    Instruction *commonIRemTransforms(BinaryOperator &I);
+    Instruction *commonDivTransforms(BinaryOperator &I);
+    Instruction *commonIDivTransforms(BinaryOperator &I);
+    Instruction *visitUDiv(BinaryOperator &I);
+    Instruction *visitSDiv(BinaryOperator &I);
+    Instruction *visitFDiv(BinaryOperator &I);
+    Instruction *visitAnd(BinaryOperator &I);
+    Instruction *visitOr (BinaryOperator &I);
+    Instruction *visitXor(BinaryOperator &I);
+    Instruction *visitShl(BinaryOperator &I);
+    Instruction *visitAShr(BinaryOperator &I);
+    Instruction *visitLShr(BinaryOperator &I);
+    Instruction *commonShiftTransforms(BinaryOperator &I);
+    Instruction *visitFCmpInst(FCmpInst &I);
+    Instruction *visitICmpInst(ICmpInst &I);
+    Instruction *visitICmpInstWithCastAndCast(ICmpInst &ICI);
+    Instruction *visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                                Instruction *LHS,
+                                                ConstantInt *RHS);
+    Instruction *FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
+                                ConstantInt *DivRHS);
+
+    Instruction *FoldGEPICmp(User *GEPLHS, Value *RHS,
+                             ICmpInst::Predicate Cond, Instruction &I);
+    Instruction *FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                     BinaryOperator &I);
+    Instruction *commonCastTransforms(CastInst &CI);
+    Instruction *commonIntCastTransforms(CastInst &CI);
+    Instruction *commonPointerCastTransforms(CastInst &CI);
+    Instruction *visitTrunc(TruncInst &CI);
+    Instruction *visitZExt(ZExtInst &CI);
+    Instruction *visitSExt(SExtInst &CI);
+    Instruction *visitFPTrunc(CastInst &CI);
+    Instruction *visitFPExt(CastInst &CI);
+    Instruction *visitFPToUI(CastInst &CI);
+    Instruction *visitFPToSI(CastInst &CI);
+    Instruction *visitUIToFP(CastInst &CI);
+    Instruction *visitSIToFP(CastInst &CI);
+    Instruction *visitPtrToInt(CastInst &CI);
+    Instruction *visitIntToPtr(CastInst &CI);
+    Instruction *visitBitCast(BitCastInst &CI);
+    Instruction *FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                                Instruction *FI);
+    Instruction *visitSelectInst(SelectInst &CI);
+    Instruction *visitCallInst(CallInst &CI);
+    Instruction *visitInvokeInst(InvokeInst &II);
+    Instruction *visitPHINode(PHINode &PN);
+    Instruction *visitGetElementPtrInst(GetElementPtrInst &GEP);
+    Instruction *visitAllocationInst(AllocationInst &AI);
+    Instruction *visitFreeInst(FreeInst &FI);
+    Instruction *visitLoadInst(LoadInst &LI);
+    Instruction *visitStoreInst(StoreInst &SI);
+    Instruction *visitBranchInst(BranchInst &BI);
+    Instruction *visitSwitchInst(SwitchInst &SI);
+    Instruction *visitInsertElementInst(InsertElementInst &IE);
+    Instruction *visitExtractElementInst(ExtractElementInst &EI);
+    Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
+
+    // visitInstruction - Specify what to return for unhandled instructions...
+    Instruction *visitInstruction(Instruction &I) { return 0; }
+
+  private:
+    Instruction *visitCallSite(CallSite CS);
+    bool transformConstExprCastCall(CallSite CS);
+
+  public:
+    // InsertNewInstBefore - insert an instruction New before instruction Old
+    // in the program.  Add the new instruction to the worklist.
+    //
+    Instruction *InsertNewInstBefore(Instruction *New, Instruction &Old) {
+      assert(New && New->getParent() == 0 &&
+             "New instruction already inserted into a basic block!");
+      BasicBlock *BB = Old.getParent();
+      BB->getInstList().insert(&Old, New);  // Insert inst
+      AddToWorkList(New);
+      return New;
+    }
+
+    /// InsertCastBefore - Insert a cast of V to TY before the instruction POS.
+    /// This also adds the cast to the worklist.  Finally, this returns the
+    /// cast.
+    Value *InsertCastBefore(Instruction::CastOps opc, Value *V, const Type *Ty,
+                            Instruction &Pos) {
+      if (V->getType() == Ty) return V;
+
+      // Constants are folded immediately rather than emitting a cast inst.
+      if (Constant *CV = dyn_cast<Constant>(V))
+        return ConstantExpr::getCast(opc, CV, Ty);
+      
+      Instruction *C = CastInst::create(opc, V, Ty, V->getName(), &Pos);
+      AddToWorkList(C);
+      return C;
+    }
+
+    // ReplaceInstUsesWith - This method is to be used when an instruction is
+    // found to be dead, replaceable with another preexisting expression.  Here
+    // we add all uses of I to the worklist, replace all uses of I with the new
+    // value, then return I, so that the inst combiner will know that I was
+    // modified.
+    //
+    Instruction *ReplaceInstUsesWith(Instruction &I, Value *V) {
+      AddUsersToWorkList(I);         // Add all modified instrs to worklist
+      if (&I != V) {
+        I.replaceAllUsesWith(V);
+        return &I;
+      } else {
+        // If we are replacing the instruction with itself, this must be in a
+        // segment of unreachable code, so just clobber the instruction.
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
+        return &I;
+      }
+    }
+
+    // UpdateValueUsesWith - This method is to be used when a value is
+    // found to be replaceable with another preexisting expression or was
+    // updated.  Here we add all uses of I to the worklist, replace all uses of
+    // I with the new value (unless the instruction was just updated), then
+    // return true, so that the inst combiner will know that I was modified.
+    //
+    bool UpdateValueUsesWith(Value *Old, Value *New) {
+      AddUsersToWorkList(*Old);         // Add all modified instrs to worklist
+      if (Old != New)
+        Old->replaceAllUsesWith(New);
+      if (Instruction *I = dyn_cast<Instruction>(Old))
+        AddToWorkList(I);
+      if (Instruction *I = dyn_cast<Instruction>(New))
+        AddToWorkList(I);
+      return true;
+    }
+    
+    // EraseInstFromFunction - When dealing with an instruction that has side
+    // effects or produces a void value, we can't rely on DCE to delete the
+    // instruction.  Instead, visit methods should return the value returned by
+    // this function.
+    Instruction *EraseInstFromFunction(Instruction &I) {
+      assert(I.use_empty() && "Cannot erase instruction that is used!");
+      AddUsesToWorkList(I);
+      RemoveFromWorkList(&I);
+      I.eraseFromParent();
+      return 0;  // Tell the caller no replacement instruction exists.
+    }
+
+  private:
+    /// InsertOperandCastBefore - This inserts a cast of V to DestTy before the
+    /// InsertBefore instruction.  This is specialized a bit to avoid inserting
+    /// casts that are known to not do anything...
+    ///
+    Value *InsertOperandCastBefore(Instruction::CastOps opcode,
+                                   Value *V, const Type *DestTy,
+                                   Instruction *InsertBefore);
+
+    /// SimplifyCommutative - This performs a few simplifications for 
+    /// commutative operators.
+    bool SimplifyCommutative(BinaryOperator &I);
+
+    /// SimplifyCompare - This reorders the operands of a CmpInst to get them in
+    /// most-complex to least-complex order.
+    bool SimplifyCompare(CmpInst &I);
+
+    /// SimplifyDemandedBits - Attempts to replace V with a simpler value based
+    /// on the demanded bits.
+    bool SimplifyDemandedBits(Value *V, APInt DemandedMask, 
+                              APInt& KnownZero, APInt& KnownOne,
+                              unsigned Depth = 0);
+
+    Value *SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
+                                      uint64_t &UndefElts, unsigned Depth = 0);
+      
+    // FoldOpIntoPhi - Given a binary operator or cast instruction which has a
+    // PHI node as operand #0, see if we can fold the instruction into the PHI
+    // (which is only possible if all operands to the PHI are constants).
+    Instruction *FoldOpIntoPhi(Instruction &I);
+
+    // FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
+    // operator and they all are only used by the PHI, PHI together their
+    // inputs, and do the operation once, to the result of the PHI.
+    Instruction *FoldPHIArgOpIntoPHI(PHINode &PN);
+    Instruction *FoldPHIArgBinOpIntoPHI(PHINode &PN);
+    
+    
+    Instruction *OptAndOp(Instruction *Op, ConstantInt *OpRHS,
+                          ConstantInt *AndRHS, BinaryOperator &TheAnd);
+    
+    Value *FoldLogicalPlusAnd(Value *LHS, Value *RHS, ConstantInt *Mask,
+                              bool isSub, Instruction &I);
+    Instruction *InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
+                                 bool isSigned, bool Inside, Instruction &IB);
+    Instruction *PromoteCastOfAllocation(BitCastInst &CI, AllocationInst &AI);
+    Instruction *MatchBSwap(BinaryOperator &I);
+    bool SimplifyStoreAtEndOfBlock(StoreInst &SI);
+
+    Value *EvaluateInDifferentType(Value *V, const Type *Ty, bool isSigned);
+  };
+
+  char InstCombiner::ID = 0;
+  RegisterPass<InstCombiner> X("instcombine", "Combine redundant instructions");
+}
+
+// getComplexity:  Assign a complexity or rank value to LLVM Values...
+//   0 -> undef, 1 -> Const, 2 -> Other, 3 -> Arg, 3 -> Unary, 4 -> OtherInst
+static unsigned getComplexity(Value *V) {
+  if (isa<Instruction>(V))
+    // Negation and bitwise-not rank just below other instructions.
+    return (BinaryOperator::isNeg(V) || BinaryOperator::isNot(V)) ? 3 : 4;
+  if (isa<Argument>(V))
+    return 3;
+  if (!isa<Constant>(V))
+    return 2;
+  return isa<UndefValue>(V) ? 0 : 1;
+}
+
+// isOnlyUse - Return true if this instruction will be deleted if we stop using
+// it.
+static bool isOnlyUse(Value *V) {
+  if (isa<Constant>(V))
+    return true;
+  return V->hasOneUse();
+}
+
+// getPromotedType - Return the specified type promoted as it would be to pass
+// though a va_arg area...
+static const Type *getPromotedType(const Type *Ty) {
+  // Integer types narrower than 32 bits promote to i32; everything else is
+  // passed through unchanged.
+  const IntegerType *ITy = dyn_cast<IntegerType>(Ty);
+  if (ITy && ITy->getBitWidth() < 32)
+    return Type::Int32Ty;
+  return Ty;
+}
+
+/// getBitCastOperand - If the specified operand is a CastInst or a constant
+/// expression bitcast, return the operand value, otherwise return null.
+static Value *getBitCastOperand(Value *V) {
+  if (BitCastInst *BCI = dyn_cast<BitCastInst>(V))
+    return BCI->getOperand(0);
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(V);
+  if (CE && CE->getOpcode() == Instruction::BitCast)
+    return CE->getOperand(0);
+  return 0;
+}
+
+/// isEliminableCastPair - This function is a wrapper around
+/// CastInst::isEliminableCastPair.  It simply extracts arguments and returns
+/// what that function returns.
+static Instruction::CastOps
+isEliminableCastPair(const CastInst *CI, ///< The first cast instruction
+                     unsigned opcode,    ///< The opcode of the second cast
+                     const Type *DstTy,  ///< Target type of the second cast
+                     TargetData *TD) {   ///< Target data for pointer size
+  const Type *SrcTy = CI->getOperand(0)->getType();   // A from above
+  const Type *MidTy = CI->getType();                  // B from above
+
+  // Recover the opcodes of the two cast instructions involved.
+  Instruction::CastOps firstOp = Instruction::CastOps(CI->getOpcode());
+  Instruction::CastOps secondOp = Instruction::CastOps(opcode);
+
+  // Zero means the pair is not eliminable; otherwise this is the opcode of
+  // the single cast that replaces the pair.
+  unsigned Res = CastInst::isEliminableCastPair(firstOp, secondOp, SrcTy,
+                                                MidTy, DstTy,
+                                                TD->getIntPtrType());
+  return Instruction::CastOps(Res);
+}
+
+/// ValueRequiresCast - Return true if the cast from "V to Ty" actually results
+/// in any code being generated.  It does not require codegen if V is simple
+/// enough or if the cast can be folded into other casts.
+static bool ValueRequiresCast(Instruction::CastOps opcode, const Value *V, 
+                              const Type *Ty, TargetData *TD) {
+  if (V->getType() == Ty || isa<Constant>(V)) return false;
+  
+  // If this is another cast that can be eliminated, it isn't codegen either.
+  if (const CastInst *CI = dyn_cast<CastInst>(V))
+    if (isEliminableCastPair(CI, opcode, Ty, TD)) 
+      return false;
+  return true;
+}
+
+/// InsertOperandCastBefore - This inserts a cast of V to DestTy before the
+/// InsertBefore instruction.  This is specialized a bit to avoid inserting
+/// casts that are known to not do anything...
+///
+Value *InstCombiner::InsertOperandCastBefore(Instruction::CastOps opcode,
+                                             Value *V, const Type *DestTy,
+                                             Instruction *InsertBefore) {
+  if (V->getType() == DestTy) return V;
+  if (Constant *C = dyn_cast<Constant>(V))
+    return ConstantExpr::getCast(opcode, C, DestTy);
+  
+  return InsertCastBefore(opcode, V, DestTy, *InsertBefore);
+}
+
// SimplifyCommutative - This performs a few simplifications for commutative
// operators:
//
//  1. Order operands such that they are listed from right (least complex) to
//     left (most complex).  This puts constants before unary operators before
//     binary operators.
//
//  2. Transform: (op (op V, C1), C2) ==> (op V, (op C1, C2))
//  3. Transform: (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
//
// Returns true if the instruction was modified in any way.
bool InstCombiner::SimplifyCommutative(BinaryOperator &I) {
  bool Changed = false;
  // Rule 1: canonicalize operand order by complexity.
  // (swapOperands is expected to return false on success — hence the
  //  negation; TODO(review): confirm against the Instruction API.)
  if (getComplexity(I.getOperand(0)) < getComplexity(I.getOperand(1)))
    Changed = !I.swapOperands();

  // Rules 2 and 3 only hold for associative operators.
  if (!I.isAssociative()) return Changed;
  Instruction::BinaryOps Opcode = I.getOpcode();
  if (BinaryOperator *Op = dyn_cast<BinaryOperator>(I.getOperand(0)))
    if (Op->getOpcode() == Opcode && isa<Constant>(Op->getOperand(1))) {
      if (isa<Constant>(I.getOperand(1))) {
        // Rule 2: (op (op V, C1), C2) ==> (op V, (op C1, C2)).  Both
        // constants fold, so no new instruction is needed.
        Constant *Folded = ConstantExpr::get(I.getOpcode(),
                                             cast<Constant>(I.getOperand(1)),
                                             cast<Constant>(Op->getOperand(1)));
        I.setOperand(0, Op->getOperand(0));
        I.setOperand(1, Folded);
        return true;
      } else if (BinaryOperator *Op1=dyn_cast<BinaryOperator>(I.getOperand(1)))
        if (Op1->getOpcode() == Opcode && isa<Constant>(Op1->getOperand(1)) &&
            isOnlyUse(Op) && isOnlyUse(Op1)) {
          // Rule 3 applies only when both inner ops die with this transform
          // (isOnlyUse), otherwise we would duplicate work.
          Constant *C1 = cast<Constant>(Op->getOperand(1));
          Constant *C2 = cast<Constant>(Op1->getOperand(1));

          // Fold (op (op V1, C1), (op V2, C2)) ==> (op (op V1, V2), (op C1,C2))
          Constant *Folded = ConstantExpr::get(I.getOpcode(), C1, C2);
          // The new (op V1, V2) instruction is inserted before I and queued
          // for further combining.
          Instruction *New = BinaryOperator::create(Opcode, Op->getOperand(0),
                                                    Op1->getOperand(0),
                                                    Op1->getName(), &I);
          AddToWorkList(New);
          I.setOperand(0, New);
          I.setOperand(1, Folded);
          return true;
        }
    }
  return Changed;
}
+
+/// SimplifyCompare - For a CmpInst this function just orders the operands
+/// so that theyare listed from right (least complex) to left (most complex).
+/// This puts constants before unary operators before binary operators.
+bool InstCombiner::SimplifyCompare(CmpInst &I) {
+  if (getComplexity(I.getOperand(0)) >= getComplexity(I.getOperand(1)))
+    return false;
+  I.swapOperands();
+  // Compare instructions are not associative so there's nothing else we can do.
+  return true;
+}
+
+// dyn_castNegVal - Given a 'sub' instruction, return the RHS of the instruction
+// if the LHS is a constant zero (which is the 'negate' form).
+//
+static inline Value *dyn_castNegVal(Value *V) {
+  if (BinaryOperator::isNeg(V))
+    return BinaryOperator::getNegArgument(V);
+
+  // Constants can be considered to be negated values if they can be folded.
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantExpr::getNeg(C);
+  return 0;
+}
+
+static inline Value *dyn_castNotVal(Value *V) {
+  if (BinaryOperator::isNot(V))
+    return BinaryOperator::getNotArgument(V);
+
+  // Constants can be considered to be not'ed values...
+  if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+    return ConstantInt::get(~C->getValue());
+  return 0;
+}
+
+// dyn_castFoldableMul - If this value is a multiply that can be folded into
+// other computations (because it has a constant operand), return the
+// non-constant operand of the multiply, and set CST to point to the multiplier.
+// Otherwise, return null.
+//
+static inline Value *dyn_castFoldableMul(Value *V, ConstantInt *&CST) {
+  if (V->hasOneUse() && V->getType()->isInteger())
+    if (Instruction *I = dyn_cast<Instruction>(V)) {
+      if (I->getOpcode() == Instruction::Mul)
+        if ((CST = dyn_cast<ConstantInt>(I->getOperand(1))))
+          return I->getOperand(0);
+      if (I->getOpcode() == Instruction::Shl)
+        if ((CST = dyn_cast<ConstantInt>(I->getOperand(1)))) {
+          // The multiplier is really 1 << CST.
+          uint32_t BitWidth = cast<IntegerType>(V->getType())->getBitWidth();
+          uint32_t CSTVal = CST->getLimitedValue(BitWidth);
+          CST = ConstantInt::get(APInt(BitWidth, 1).shl(CSTVal));
+          return I->getOperand(0);
+        }
+    }
+  return 0;
+}
+
+/// dyn_castGetElementPtr - If this is a getelementptr instruction or constant
+/// expression, return it.
+static User *dyn_castGetElementPtr(Value *V) {
+  if (isa<GetElementPtrInst>(V)) return cast<User>(V);
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::GetElementPtr)
+      return cast<User>(V);
+  return false;
+}
+
+/// AddOne - Add one to a ConstantInt
+static ConstantInt *AddOne(ConstantInt *C) {
+  APInt Val(C->getValue());
+  return ConstantInt::get(++Val);
+}
+/// SubOne - Subtract one from a ConstantInt
+static ConstantInt *SubOne(ConstantInt *C) {
+  APInt Val(C->getValue());
+  return ConstantInt::get(--Val);
+}
+/// Add - Add two ConstantInts together
+static ConstantInt *Add(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() + C2->getValue());
+}
+/// And - Bitwise AND two ConstantInts together
+static ConstantInt *And(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() & C2->getValue());
+}
+/// Subtract - Subtract one ConstantInt from another
+static ConstantInt *Subtract(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() - C2->getValue());
+}
+/// Multiply - Multiply two ConstantInts together
+static ConstantInt *Multiply(ConstantInt *C1, ConstantInt *C2) {
+  return ConstantInt::get(C1->getValue() * C2->getValue());
+}
+
/// ComputeMaskedBits - Determine which of the bits specified in Mask are
/// known to be either zero or one and return them in the KnownZero/KnownOne
/// bit sets.  This code only analyzes bits in Mask, in order to short-circuit
/// processing.
/// NOTE: we cannot consider 'undef' to be "IsZero" here.  The problem is that
/// we cannot optimize based on the assumption that it is zero without changing
/// it to be an explicit zero.  If we don't change it to zero, other code could
/// optimized based on the contradictory assumption that it is non-zero.
/// Because instcombine aggressively folds operations with undef args anyway,
/// this won't lose us code quality.
///
/// KnownZero, KnownOne and Mask must all have the same bit width as V's
/// integer type.  Depth bounds the recursion (cut off at 6).
static void ComputeMaskedBits(Value *V, const APInt &Mask, APInt& KnownZero, 
                              APInt& KnownOne, unsigned Depth = 0) {
  assert(V && "No Value?");
  assert(Depth <= 6 && "Limit Search Depth");
  uint32_t BitWidth = Mask.getBitWidth();
  assert(cast<IntegerType>(V->getType())->getBitWidth() == BitWidth &&
         KnownZero.getBitWidth() == BitWidth && 
         KnownOne.getBitWidth() == BitWidth &&
         "V, Mask, KnownOne and KnownZero should have same BitWidth");
  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
    // We know all of the bits for a constant!
    KnownOne = CI->getValue() & Mask;
    KnownZero = ~KnownOne & Mask;
    return;
  }

  if (Depth == 6 || Mask == 0)
    return;  // Limit search depth.

  Instruction *I = dyn_cast<Instruction>(V);
  // Non-instruction, non-constant values (e.g. arguments): nothing known.
  if (!I) return;

  KnownZero.clear(); KnownOne.clear();   // Don't know anything.
  // KnownZero2/KnownOne2 receive the second operand's bits in binary cases.
  APInt KnownZero2(KnownZero), KnownOne2(KnownOne);
  
  switch (I->getOpcode()) {
  case Instruction::And: {
    // If either the LHS or the RHS are Zero, the result is zero.
    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
    // Bits known zero on the RHS need not be computed for the LHS.
    APInt Mask2(Mask & ~KnownZero);
    ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
    
    // Output known-1 bits are only known if set in both the LHS & RHS.
    KnownOne &= KnownOne2;
    // Output known-0 are known to be clear if zero in either the LHS | RHS.
    KnownZero |= KnownZero2;
    return;
  }
  case Instruction::Or: {
    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
    // Bits known one on the RHS need not be computed for the LHS.
    APInt Mask2(Mask & ~KnownOne);
    ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero2, KnownOne2, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
    
    // Output known-0 bits are only known if clear in both the LHS & RHS.
    KnownZero &= KnownZero2;
    // Output known-1 are known to be set if set in either the LHS | RHS.
    KnownOne |= KnownOne2;
    return;
  }
  case Instruction::Xor: {
    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero, KnownOne, Depth+1);
    ComputeMaskedBits(I->getOperand(0), Mask, KnownZero2, KnownOne2, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 
    
    // Output known-0 bits are known if clear or set in both the LHS & RHS.
    APInt KnownZeroOut = (KnownZero & KnownZero2) | (KnownOne & KnownOne2);
    // Output known-1 are known to be set if set in only one of the LHS, RHS.
    KnownOne = (KnownZero & KnownOne2) | (KnownOne & KnownZero2);
    KnownZero = KnownZeroOut;
    return;
  }
  case Instruction::Select:
    // A select's known bits are the intersection of what is known about both
    // arms; the condition (operand 0) is not inspected.
    ComputeMaskedBits(I->getOperand(2), Mask, KnownZero, KnownOne, Depth+1);
    ComputeMaskedBits(I->getOperand(1), Mask, KnownZero2, KnownOne2, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    assert((KnownZero2 & KnownOne2) == 0 && "Bits known to be one AND zero?"); 

    // Only known if known in both the LHS and RHS.
    KnownOne &= KnownOne2;
    KnownZero &= KnownZero2;
    return;
  case Instruction::FPTrunc:
  case Instruction::FPExt:
  case Instruction::FPToUI:
  case Instruction::FPToSI:
  case Instruction::SIToFP:
  case Instruction::PtrToInt:
  case Instruction::UIToFP:
  case Instruction::IntToPtr:
    return; // Can't work with floating point or pointers
  case Instruction::Trunc: {
    // All these have integer operands
    // Widen the query to the (wider) source width, recurse, then truncate
    // the results back to the destination width.
    uint32_t SrcBitWidth = 
      cast<IntegerType>(I->getOperand(0)->getType())->getBitWidth();
    APInt MaskIn(Mask);
    MaskIn.zext(SrcBitWidth);
    KnownZero.zext(SrcBitWidth);
    KnownOne.zext(SrcBitWidth);
    ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, Depth+1);
    KnownZero.trunc(BitWidth);
    KnownOne.trunc(BitWidth);
    return;
  }
  case Instruction::BitCast: {
    // An int-to-int bitcast preserves bits exactly; other bitcasts fall
    // through to "unknown".
    const Type *SrcTy = I->getOperand(0)->getType();
    if (SrcTy->isInteger()) {
      ComputeMaskedBits(I->getOperand(0), Mask, KnownZero, KnownOne, Depth+1);
      return;
    }
    break;
  }
  case Instruction::ZExt:  {
    // Compute the bits in the result that are not present in the input.
    const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
    uint32_t SrcBitWidth = SrcTy->getBitWidth();
      
    // Narrow the query to the source width, recurse, then widen back.
    APInt MaskIn(Mask);
    MaskIn.trunc(SrcBitWidth);
    KnownZero.trunc(SrcBitWidth);
    KnownOne.trunc(SrcBitWidth);
    ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    // The top bits are known to be zero.
    KnownZero.zext(BitWidth);
    KnownOne.zext(BitWidth);
    KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
    return;
  }
  case Instruction::SExt: {
    // Compute the bits in the result that are not present in the input.
    const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
    uint32_t SrcBitWidth = SrcTy->getBitWidth();
      
    APInt MaskIn(Mask); 
    MaskIn.trunc(SrcBitWidth);
    KnownZero.trunc(SrcBitWidth);
    KnownOne.trunc(SrcBitWidth);
    ComputeMaskedBits(I->getOperand(0), MaskIn, KnownZero, KnownOne, Depth+1);
    assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
    KnownZero.zext(BitWidth);
    KnownOne.zext(BitWidth);

    // If the sign bit of the input is known set or clear, then we know the
    // top bits of the result.
    if (KnownZero[SrcBitWidth-1])             // Input sign bit known zero
      KnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
    else if (KnownOne[SrcBitWidth-1])           // Input sign bit known set
      KnownOne |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
    return;
  }
  case Instruction::Shl:
    // (shl X, C1) & C2 == 0   iff   (X & C2 >>u C1) == 0
    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
      // Query the operand bits shifted back down, then shift the answers up.
      APInt Mask2(Mask.lshr(ShiftAmt));
      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero, KnownOne, Depth+1);
      assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
      KnownZero <<= ShiftAmt;
      KnownOne  <<= ShiftAmt;
      KnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt); // low bits known 0
      return;
    }
    break;
  case Instruction::LShr:
    // (ushr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
      // Compute the new bits that are at the top now.
      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
      
      // Unsigned shift right.
      APInt Mask2(Mask.shl(ShiftAmt));
      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne,Depth+1);
      assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
      // high bits known zero.
      KnownZero |= APInt::getHighBitsSet(BitWidth, ShiftAmt);
      return;
    }
    break;
  case Instruction::AShr:
    // (ashr X, C1) & C2 == 0   iff  (-1 >> C1) & C2 == 0
    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
      // Compute the new bits that are at the top now.
      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
      
      // Signed shift right.
      APInt Mask2(Mask.shl(ShiftAmt));
      ComputeMaskedBits(I->getOperand(0), Mask2, KnownZero,KnownOne,Depth+1);
      assert((KnownZero & KnownOne) == 0&&"Bits known to be one AND zero?"); 
      KnownZero = APIntOps::lshr(KnownZero, ShiftAmt);
      KnownOne  = APIntOps::lshr(KnownOne, ShiftAmt);
        
      // The shifted-in high bits copy the original sign bit: if that bit is
      // known, the whole high region is known.
      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
      if (KnownZero[BitWidth-ShiftAmt-1])    // New bits are known zero.
        KnownZero |= HighBits;
      else if (KnownOne[BitWidth-ShiftAmt-1])  // New bits are known one.
        KnownOne |= HighBits;
      return;
    }
    break;
  }
}
+
+/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero.  We use
+/// this predicate to simplify operations downstream.  Mask is known to be zero
+/// for bits that V cannot have.
+static bool MaskedValueIsZero(Value *V, const APInt& Mask, unsigned Depth = 0) {
+  APInt KnownZero(Mask.getBitWidth(), 0), KnownOne(Mask.getBitWidth(), 0);
+  ComputeMaskedBits(V, Mask, KnownZero, KnownOne, Depth);
+  assert((KnownZero & KnownOne) == 0 && "Bits known to be one AND zero?"); 
+  return (KnownZero & Mask) == Mask;
+}
+
+/// ShrinkDemandedConstant - Check to see if the specified operand of the 
+/// specified instruction is a constant integer.  If so, check to see if there
+/// are any bits set in the constant that are not demanded.  If so, shrink the
+/// constant and return true.
+static bool ShrinkDemandedConstant(Instruction *I, unsigned OpNo, 
+                                   APInt Demanded) {
+  assert(I && "No instruction?");
+  assert(OpNo < I->getNumOperands() && "Operand index too large");
+
+  // If the operand is not a constant integer, nothing to do.
+  ConstantInt *OpC = dyn_cast<ConstantInt>(I->getOperand(OpNo));
+  if (!OpC) return false;
+
+  // If there are no bits set that aren't demanded, nothing to do.
+  Demanded.zextOrTrunc(OpC->getValue().getBitWidth());
+  if ((~Demanded & OpC->getValue()) == 0)
+    return false;
+
+  // This instruction is producing bits that are not demanded. Shrink the RHS.
+  Demanded &= OpC->getValue();
+  I->setOperand(OpNo, ConstantInt::get(Demanded));
+  return true;
+}
+
+// ComputeSignedMinMaxValuesFromKnownBits - Given a signed integer type and a 
+// set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeSignedMinMaxValuesFromKnownBits(const Type *Ty,
+                                                   const APInt& KnownZero,
+                                                   const APInt& KnownOne,
+                                                   APInt& Min, APInt& Max) {
+  uint32_t BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+  assert(KnownZero.getBitWidth() == BitWidth && 
+         KnownOne.getBitWidth() == BitWidth &&
+         Min.getBitWidth() == BitWidth && Max.getBitWidth() == BitWidth &&
+         "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+
+  // The minimum value is when all unknown bits are zeros, EXCEPT for the sign
+  // bit if it is unknown.
+  Min = KnownOne;
+  Max = KnownOne|UnknownBits;
+  
+  if (UnknownBits[BitWidth-1]) { // Sign bit is unknown
+    Min.set(BitWidth-1);
+    Max.clear(BitWidth-1);
+  }
+}
+
+// ComputeUnsignedMinMaxValuesFromKnownBits - Given an unsigned integer type and
+// a set of known zero and one bits, compute the maximum and minimum values that
+// could have the specified known zero and known one bits, returning them in
+// min/max.
+static void ComputeUnsignedMinMaxValuesFromKnownBits(const Type *Ty,
+                                                     const APInt& KnownZero,
+                                                     const APInt& KnownOne,
+                                                     APInt& Min,
+                                                     APInt& Max) {
+  uint32_t BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+  assert(KnownZero.getBitWidth() == BitWidth && 
+         KnownOne.getBitWidth() == BitWidth &&
+         Min.getBitWidth() == BitWidth && Max.getBitWidth() &&
+         "Ty, KnownZero, KnownOne and Min, Max must have equal bitwidth.");
+  APInt UnknownBits = ~(KnownZero|KnownOne);
+  
+  // The minimum value is when the unknown bits are all zeros.
+  Min = KnownOne;
+  // The maximum value is when the unknown bits are all ones.
+  Max = KnownOne|UnknownBits;
+}
+
+/// SimplifyDemandedBits - This function attempts to replace V with a simpler
+/// value based on the demanded bits. When this function is called, it is known
+/// that only the bits set in DemandedMask of the result of V are ever used
+/// downstream. Consequently, depending on the mask and V, it may be possible
+/// to replace V with a constant or one of its operands. In such cases, this
+/// function does the replacement and returns true. In all other cases, it
+/// returns false after analyzing the expression and setting KnownOne and known
+/// to be one in the expression. KnownZero contains all the bits that are known
+/// to be zero in the expression. These are provided to potentially allow the
+/// caller (which might recursively be SimplifyDemandedBits itself) to simplify
+/// the expression. KnownOne and KnownZero always follow the invariant that 
+/// KnownOne & KnownZero == 0. That is, a bit can't be both 1 and 0. Note that
+/// the bits in KnownOne and KnownZero may only be accurate for those bits set
+/// in DemandedMask. Note also that the bitwidth of V, DemandedMask, KnownZero
+/// and KnownOne must all be the same.
+bool InstCombiner::SimplifyDemandedBits(Value *V, APInt DemandedMask,
+                                        APInt& KnownZero, APInt& KnownOne,
+                                        unsigned Depth) {
+  assert(V != 0 && "Null pointer of Value???");
+  assert(Depth <= 6 && "Limit Search Depth");
+  uint32_t BitWidth = DemandedMask.getBitWidth();
+  const IntegerType *VTy = cast<IntegerType>(V->getType());
+  assert(VTy->getBitWidth() == BitWidth && 
+         KnownZero.getBitWidth() == BitWidth && 
+         KnownOne.getBitWidth() == BitWidth &&
+         "Value *V, DemandedMask, KnownZero and KnownOne \
+          must have same BitWidth");
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V)) {
+    // We know all of the bits for a constant!
+    KnownOne = CI->getValue() & DemandedMask;
+    KnownZero = ~KnownOne & DemandedMask;
+    return false;
+  }
+  
+  KnownZero.clear(); 
+  KnownOne.clear();
+  if (!V->hasOneUse()) {    // Other users may use these bits.
+    if (Depth != 0) {       // Not at the root.
+      // Just compute the KnownZero/KnownOne bits to simplify things downstream.
+      ComputeMaskedBits(V, DemandedMask, KnownZero, KnownOne, Depth);
+      return false;
+    }
+    // If this is the root being simplified, allow it to have multiple uses,
+    // just set the DemandedMask to all bits.
+    DemandedMask = APInt::getAllOnesValue(BitWidth);
+  } else if (DemandedMask == 0) {   // Not demanding any bits from V.
+    if (V != UndefValue::get(VTy))
+      return UpdateValueUsesWith(V, UndefValue::get(VTy));
+    return false;
+  } else if (Depth == 6) {        // Limit search depth.
+    return false;
+  }
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;        // Only analyze instructions.
+
+  APInt LHSKnownZero(BitWidth, 0), LHSKnownOne(BitWidth, 0);
+  APInt &RHSKnownZero = KnownZero, &RHSKnownOne = KnownOne;
+  switch (I->getOpcode()) {
+  default: break;
+  case Instruction::And:
+    // If either the LHS or the RHS are Zero, the result is zero.
+    if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+
+    // If something is known zero on the RHS, the bits aren't demanded on the
+    // LHS.
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~RHSKnownZero,
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return true;
+    assert((LHSKnownZero & LHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+
+    // If all of the demanded bits are known 1 on one side, return the other.
+    // These bits cannot contribute to the result of the 'and'.
+    if ((DemandedMask & ~LHSKnownZero & RHSKnownOne) == 
+        (DemandedMask & ~LHSKnownZero))
+      return UpdateValueUsesWith(I, I->getOperand(0));
+    if ((DemandedMask & ~RHSKnownZero & LHSKnownOne) == 
+        (DemandedMask & ~RHSKnownZero))
+      return UpdateValueUsesWith(I, I->getOperand(1));
+    
+    // If all of the demanded bits in the inputs are known zeros, return zero.
+    if ((DemandedMask & (RHSKnownZero|LHSKnownZero)) == DemandedMask)
+      return UpdateValueUsesWith(I, Constant::getNullValue(VTy));
+      
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnownZero))
+      return UpdateValueUsesWith(I, I);
+      
+    // Output known-1 bits are only known if set in both the LHS & RHS.
+    RHSKnownOne &= LHSKnownOne;
+    // Output known-0 are known to be clear if zero in either the LHS | RHS.
+    RHSKnownZero |= LHSKnownZero;
+    break;
+  case Instruction::Or:
+    // If either the LHS or the RHS are One, the result is One.
+    if (SimplifyDemandedBits(I->getOperand(1), DemandedMask, 
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    // If something is known one on the RHS, the bits aren't demanded on the
+    // LHS.
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask & ~RHSKnownOne, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return true;
+    assert((LHSKnownZero & LHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'or'.
+    if ((DemandedMask & ~LHSKnownOne & RHSKnownZero) == 
+        (DemandedMask & ~LHSKnownOne))
+      return UpdateValueUsesWith(I, I->getOperand(0));
+    if ((DemandedMask & ~RHSKnownOne & LHSKnownZero) == 
+        (DemandedMask & ~RHSKnownOne))
+      return UpdateValueUsesWith(I, I->getOperand(1));
+
+    // If all of the potentially set bits on one side are known to be set on
+    // the other side, just use the 'other' side.
+    if ((DemandedMask & (~RHSKnownZero) & LHSKnownOne) == 
+        (DemandedMask & (~RHSKnownZero)))
+      return UpdateValueUsesWith(I, I->getOperand(0));
+    if ((DemandedMask & (~LHSKnownZero) & RHSKnownOne) == 
+        (DemandedMask & (~LHSKnownZero)))
+      return UpdateValueUsesWith(I, I->getOperand(1));
+        
+    // If the RHS is a constant, see if we can simplify it.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return UpdateValueUsesWith(I, I);
+          
+    // Output known-0 bits are only known if clear in both the LHS & RHS.
+    RHSKnownZero &= LHSKnownZero;
+    // Output known-1 are known to be set if set in either the LHS | RHS.
+    RHSKnownOne |= LHSKnownOne;
+    break;
+  case Instruction::Xor: {
+    if (SimplifyDemandedBits(I->getOperand(1), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return true;
+    assert((LHSKnownZero & LHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    
+    // If all of the demanded bits are known zero on one side, return the other.
+    // These bits cannot contribute to the result of the 'xor'.
+    if ((DemandedMask & RHSKnownZero) == DemandedMask)
+      return UpdateValueUsesWith(I, I->getOperand(0));
+    if ((DemandedMask & LHSKnownZero) == DemandedMask)
+      return UpdateValueUsesWith(I, I->getOperand(1));
+    
+    // Output known-0 bits are known if clear or set in both the LHS & RHS.
+    APInt KnownZeroOut = (RHSKnownZero & LHSKnownZero) | 
+                         (RHSKnownOne & LHSKnownOne);
+    // Output known-1 are known to be set if set in only one of the LHS, RHS.
+    APInt KnownOneOut = (RHSKnownZero & LHSKnownOne) | 
+                        (RHSKnownOne & LHSKnownZero);
+    
+    // If all of the demanded bits are known to be zero on one side or the
+    // other, turn this into an *inclusive* or.
+    //    e.g. (A & C1)^(B & C2) -> (A & C1)|(B & C2) iff C1&C2 == 0
+    if ((DemandedMask & ~RHSKnownZero & ~LHSKnownZero) == 0) {
+      Instruction *Or =
+        BinaryOperator::createOr(I->getOperand(0), I->getOperand(1),
+                                 I->getName());
+      InsertNewInstBefore(Or, *I);
+      return UpdateValueUsesWith(I, Or);
+    }
+    
+    // If all of the demanded bits on one side are known, and all of the set
+    // bits on that side are also known to be set on the other side, turn this
+    // into an AND, as we know the bits will be cleared.
+    //    e.g. (X | C1) ^ C2 --> (X | C1) & ~C2 iff (C1&C2) == C2
+    if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask) { 
+      // all known
+      if ((RHSKnownOne & LHSKnownOne) == RHSKnownOne) {
+        Constant *AndC = ConstantInt::get(~RHSKnownOne & DemandedMask);
+        Instruction *And = 
+          BinaryOperator::createAnd(I->getOperand(0), AndC, "tmp");
+        InsertNewInstBefore(And, *I);
+        return UpdateValueUsesWith(I, And);
+      }
+    }
+    
+    // If the RHS is a constant, see if we can simplify it.
+    // FIXME: for XOR, we prefer to force bits to 1 if they will make a -1.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return UpdateValueUsesWith(I, I);
+    
+    RHSKnownZero = KnownZeroOut;
+    RHSKnownOne  = KnownOneOut;
+    break;
+  }
+  case Instruction::Select:
+    if (SimplifyDemandedBits(I->getOperand(2), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    if (SimplifyDemandedBits(I->getOperand(1), DemandedMask, 
+                             LHSKnownZero, LHSKnownOne, Depth+1))
+      return true;
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    assert((LHSKnownZero & LHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    
+    // If the operands are constants, see if we can simplify them.
+    if (ShrinkDemandedConstant(I, 1, DemandedMask))
+      return UpdateValueUsesWith(I, I);
+    if (ShrinkDemandedConstant(I, 2, DemandedMask))
+      return UpdateValueUsesWith(I, I);
+    
+    // Only known if known in both the LHS and RHS.
+    RHSKnownOne &= LHSKnownOne;
+    RHSKnownZero &= LHSKnownZero;
+    break;
+  case Instruction::Trunc: {
+    uint32_t truncBf = 
+      cast<IntegerType>(I->getOperand(0)->getType())->getBitWidth();
+    DemandedMask.zext(truncBf);
+    RHSKnownZero.zext(truncBf);
+    RHSKnownOne.zext(truncBf);
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask, 
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    DemandedMask.trunc(BitWidth);
+    RHSKnownZero.trunc(BitWidth);
+    RHSKnownOne.trunc(BitWidth);
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    break;
+  }
+  case Instruction::BitCast:
+    if (!I->getOperand(0)->getType()->isInteger())
+      return false;
+      
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    break;
+  case Instruction::ZExt: {
+    // Compute the bits in the result that are not present in the input.
+    const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
+    uint32_t SrcBitWidth = SrcTy->getBitWidth();
+    
+    DemandedMask.trunc(SrcBitWidth);
+    RHSKnownZero.trunc(SrcBitWidth);
+    RHSKnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperand(0), DemandedMask,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    DemandedMask.zext(BitWidth);
+    RHSKnownZero.zext(BitWidth);
+    RHSKnownOne.zext(BitWidth);
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+    // The top bits are known to be zero.
+    RHSKnownZero |= APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth);
+    break;
+  }
+  case Instruction::SExt: {
+    // Compute the bits in the result that are not present in the input.
+    const IntegerType *SrcTy = cast<IntegerType>(I->getOperand(0)->getType());
+    uint32_t SrcBitWidth = SrcTy->getBitWidth();
+    
+    APInt InputDemandedBits = DemandedMask & 
+                              APInt::getLowBitsSet(BitWidth, SrcBitWidth);
+
+    APInt NewBits(APInt::getHighBitsSet(BitWidth, BitWidth - SrcBitWidth));
+    // If any of the sign extended bits are demanded, we know that the sign
+    // bit is demanded.
+    if ((NewBits & DemandedMask) != 0)
+      InputDemandedBits.set(SrcBitWidth-1);
+      
+    InputDemandedBits.trunc(SrcBitWidth);
+    RHSKnownZero.trunc(SrcBitWidth);
+    RHSKnownOne.trunc(SrcBitWidth);
+    if (SimplifyDemandedBits(I->getOperand(0), InputDemandedBits,
+                             RHSKnownZero, RHSKnownOne, Depth+1))
+      return true;
+    InputDemandedBits.zext(BitWidth);
+    RHSKnownZero.zext(BitWidth);
+    RHSKnownOne.zext(BitWidth);
+    assert((RHSKnownZero & RHSKnownOne) == 0 && 
+           "Bits known to be one AND zero?"); 
+      
+    // If the sign bit of the input is known set or clear, then we know the
+    // top bits of the result.
+
+    // If the input sign bit is known zero, or if the NewBits are not demanded
+    // convert this into a zero extension.
+    if (RHSKnownZero[SrcBitWidth-1] || (NewBits & ~DemandedMask) == NewBits)
+    {
+      // Convert to ZExt cast
+      CastInst *NewCast = new ZExtInst(I->getOperand(0), VTy, I->getName(), I);
+      return UpdateValueUsesWith(I, NewCast);
+    } else if (RHSKnownOne[SrcBitWidth-1]) {    // Input sign bit known set
+      RHSKnownOne |= NewBits;
+    }
+    break;
+  }
+  case Instruction::Add: {
+    // Figure out what the input bits are.  If the top bits of the and result
+    // are not demanded, then the add doesn't demand them from its input
+    // either.
+    uint32_t NLZ = DemandedMask.countLeadingZeros();
+      
+    // If there is a constant on the RHS, there are a variety of xformations
+    // we can do.
+    if (ConstantInt *RHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      // If null, this should be simplified elsewhere.  Some of the xforms here
+      // won't work if the RHS is zero.
+      if (RHS->isZero())
+        break;
+      
+      // If the top bit of the output is demanded, demand everything from the
+      // input.  Otherwise, we demand all the input bits except NLZ top bits.
+      APInt InDemandedBits(APInt::getLowBitsSet(BitWidth, BitWidth - NLZ));
+
+      // Find information about known zero/one bits in the input.
+      if (SimplifyDemandedBits(I->getOperand(0), InDemandedBits, 
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return true;
+
+      // If the RHS of the add has bits set that can't affect the input, reduce
+      // the constant.
+      if (ShrinkDemandedConstant(I, 1, InDemandedBits))
+        return UpdateValueUsesWith(I, I);
+      
+      // Avoid excess work.
+      if (LHSKnownZero == 0 && LHSKnownOne == 0)
+        break;
+      
+      // Turn it into OR if input bits are zero.
+      if ((LHSKnownZero & RHS->getValue()) == RHS->getValue()) {
+        Instruction *Or =
+          BinaryOperator::createOr(I->getOperand(0), I->getOperand(1),
+                                   I->getName());
+        InsertNewInstBefore(Or, *I);
+        return UpdateValueUsesWith(I, Or);
+      }
+      
+      // We can say something about the output known-zero and known-one bits,
+      // depending on potential carries from the input constant and the
+      // unknowns.  For example if the LHS is known to have at most the 0x0F0F0
+      // bits set and the RHS constant is 0x01001, then we know we have a known
+      // one mask of 0x00001 and a known zero mask of 0xE0F0E.
+      
+      // To compute this, we first compute the potential carry bits.  These are
+      // the bits which may be modified.  I'm not aware of a better way to do
+      // this scan.
+      const APInt& RHSVal = RHS->getValue();
+      APInt CarryBits((~LHSKnownZero + RHSVal) ^ (~LHSKnownZero ^ RHSVal));
+      
+      // Now that we know which bits have carries, compute the known-1/0 sets.
+      
+      // Bits are known one if they are known zero in one operand and one in the
+      // other, and there is no input carry.
+      RHSKnownOne = ((LHSKnownZero & RHSVal) | 
+                     (LHSKnownOne & ~RHSVal)) & ~CarryBits;
+      
+      // Bits are known zero if they are known zero in both operands and there
+      // is no input carry.
+      RHSKnownZero = LHSKnownZero & ~RHSVal & ~CarryBits;
+    } else {
+      // If the high-bits of this ADD are not demanded, then it does not demand
+      // the high bits of its LHS or RHS.
+      if (DemandedMask[BitWidth-1] == 0) {
+        // Right fill the mask of bits for this ADD to demand the most
+        // significant bit and all those below it.
+        APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+        if (SimplifyDemandedBits(I->getOperand(0), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return true;
+        if (SimplifyDemandedBits(I->getOperand(1), DemandedFromOps,
+                                 LHSKnownZero, LHSKnownOne, Depth+1))
+          return true;
+      }
+    }
+    break;
+  }
+  case Instruction::Sub:
+    // If the high-bits of this SUB are not demanded, then it does not demand
+    // the high bits of its LHS or RHS.
+    if (DemandedMask[BitWidth-1] == 0) {
+      // Right fill the mask of bits for this SUB to demand the most
+      // significant bit and all those below it.
+      uint32_t NLZ = DemandedMask.countLeadingZeros();
+      APInt DemandedFromOps(APInt::getLowBitsSet(BitWidth, BitWidth-NLZ));
+      if (SimplifyDemandedBits(I->getOperand(0), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return true;
+      if (SimplifyDemandedBits(I->getOperand(1), DemandedFromOps,
+                               LHSKnownZero, LHSKnownOne, Depth+1))
+        return true;
+    }
+    break;
+  case Instruction::Shl:
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      APInt DemandedMaskIn(DemandedMask.lshr(ShiftAmt));
+      if (SimplifyDemandedBits(I->getOperand(0), DemandedMaskIn, 
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return true;
+      assert((RHSKnownZero & RHSKnownOne) == 0 && 
+             "Bits known to be one AND zero?"); 
+      RHSKnownZero <<= ShiftAmt;
+      RHSKnownOne  <<= ShiftAmt;
+      // low bits known zero.
+      if (ShiftAmt)
+        RHSKnownZero |= APInt::getLowBitsSet(BitWidth, ShiftAmt);
+    }
+    break;
+  case Instruction::LShr:
+    // For a logical shift right
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint64_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      
+      // Unsigned shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+      if (SimplifyDemandedBits(I->getOperand(0), DemandedMaskIn,
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return true;
+      assert((RHSKnownZero & RHSKnownOne) == 0 && 
+             "Bits known to be one AND zero?"); 
+      RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+      RHSKnownOne  = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+      if (ShiftAmt) {
+        // Compute the new bits that are at the top now.
+        APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+        RHSKnownZero |= HighBits;  // high bits known zero.
+      }
+    }
+    break;
+  case Instruction::AShr:
+    // If this is an arithmetic shift right and only the low-bit is set, we can
+    // always convert this into a logical shr, even if the shift amount is
+    // variable.  The low bit of the shift cannot be an input sign bit unless
+    // the shift amount is >= the size of the datatype, which is undefined.
+    if (DemandedMask == 1) {
+      // Perform the logical shift right.
+      Value *NewVal = BinaryOperator::createLShr(
+                        I->getOperand(0), I->getOperand(1), I->getName());
+      InsertNewInstBefore(cast<Instruction>(NewVal), *I);
+      return UpdateValueUsesWith(I, NewVal);
+    }    
+
+    // If the sign bit is the only bit demanded by this ashr, then there is no
+    // need to do it, the shift doesn't change the high bit.
+    if (DemandedMask.isSignBit())
+      return UpdateValueUsesWith(I, I->getOperand(0));
+    
+    if (ConstantInt *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      uint32_t ShiftAmt = SA->getLimitedValue(BitWidth);
+      
+      // Signed shift right.
+      APInt DemandedMaskIn(DemandedMask.shl(ShiftAmt));
+      // If any of the "high bits" are demanded, we should set the sign bit as
+      // demanded.
+      if (DemandedMask.countLeadingZeros() <= ShiftAmt)
+        DemandedMaskIn.set(BitWidth-1);
+      if (SimplifyDemandedBits(I->getOperand(0),
+                               DemandedMaskIn,
+                               RHSKnownZero, RHSKnownOne, Depth+1))
+        return true;
+      assert((RHSKnownZero & RHSKnownOne) == 0 && 
+             "Bits known to be one AND zero?"); 
+      // Compute the new bits that are at the top now.
+      APInt HighBits(APInt::getHighBitsSet(BitWidth, ShiftAmt));
+      RHSKnownZero = APIntOps::lshr(RHSKnownZero, ShiftAmt);
+      RHSKnownOne  = APIntOps::lshr(RHSKnownOne, ShiftAmt);
+        
+      // Handle the sign bits.
+      APInt SignBit(APInt::getSignBit(BitWidth));
+      // Adjust to where it is now in the mask.
+      SignBit = APIntOps::lshr(SignBit, ShiftAmt);  
+        
+      // If the input sign bit is known to be zero, or if none of the top bits
+      // are demanded, turn this into an unsigned shift right.
+      if (RHSKnownZero[BitWidth-ShiftAmt-1] || 
+          (HighBits & ~DemandedMask) == HighBits) {
+        // Perform the logical shift right.
+        Value *NewVal = BinaryOperator::createLShr(
+                          I->getOperand(0), SA, I->getName());
+        InsertNewInstBefore(cast<Instruction>(NewVal), *I);
+        return UpdateValueUsesWith(I, NewVal);
+      } else if ((RHSKnownOne & SignBit) != 0) { // New bits are known one.
+        RHSKnownOne |= HighBits;
+      }
+    }
+    break;
+  }
+  
+  // If the client is only demanding bits that we know, return the known
+  // constant.
+  if ((DemandedMask & (RHSKnownZero|RHSKnownOne)) == DemandedMask)
+    return UpdateValueUsesWith(I, ConstantInt::get(RHSKnownOne));
+  return false;
+}
+
+
+/// SimplifyDemandedVectorElts - The specified value produces a vector with
+/// 64 or fewer elements.  DemandedElts contains one bit per vector element;
+/// a set bit means that element is actually used by the caller.  This method
+/// analyzes which elements of the operand are undef and returns that
+/// information in UndefElts (bit set == element known undef).
+///
+/// If the information about demanded elements can be used to simplify the
+/// operation, the operation is simplified, then the resultant value is
+/// returned.  This returns null if no change was made.
+Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, uint64_t DemandedElts,
+                                                uint64_t &UndefElts,
+                                                unsigned Depth) {
+  unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
+  assert(VWidth <= 64 && "Vector too wide to analyze!");
+  uint64_t EltMask = ~0ULL >> (64-VWidth);
+  assert(DemandedElts != EltMask && (DemandedElts & ~EltMask) == 0 &&
+         "Invalid DemandedElts!");
+
+  if (isa<UndefValue>(V)) {
+    // If the entire vector is undefined, just return this info.
+    UndefElts = EltMask;
+    return 0;
+  } else if (DemandedElts == 0) { // If nothing is demanded, provide undef.
+    UndefElts = EltMask;
+    return UndefValue::get(V->getType());
+  }
+  
+  UndefElts = 0;
+  if (ConstantVector *CP = dyn_cast<ConstantVector>(V)) {
+    const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+    Constant *Undef = UndefValue::get(EltTy);
+
+    // Replace every non-demanded (or already-undef) element with undef and
+    // record it in UndefElts.
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i)
+      if (!(DemandedElts & (1ULL << i))) {   // If not demanded, set to undef.
+        Elts.push_back(Undef);
+        UndefElts |= (1ULL << i);
+      } else if (isa<UndefValue>(CP->getOperand(i))) {   // Already undef.
+        Elts.push_back(Undef);
+        UndefElts |= (1ULL << i);
+      } else {                               // Otherwise, defined.
+        Elts.push_back(CP->getOperand(i));
+      }
+        
+    // If we changed the constant, return it.
+    Constant *NewCP = ConstantVector::get(Elts);
+    return NewCP != CP ? NewCP : 0;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    // Simplify the CAZ to a ConstantVector where the non-demanded elements are
+    // set to undef.
+    const Type *EltTy = cast<VectorType>(V->getType())->getElementType();
+    Constant *Zero = Constant::getNullValue(EltTy);
+    Constant *Undef = UndefValue::get(EltTy);
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0; i != VWidth; ++i)
+      Elts.push_back((DemandedElts & (1ULL << i)) ? Zero : Undef);
+    UndefElts = DemandedElts ^ EltMask;
+    return ConstantVector::get(Elts);
+  }
+  
+  // This function returns Value*; use null (not 'false') for "no change".
+  if (!V->hasOneUse()) {    // Other users may use these bits.
+    if (Depth != 0) {       // Not at the root.
+      // TODO: Just compute the UndefElts information recursively.
+      return 0;
+    }
+    return 0;
+  } else if (Depth == 10) {        // Limit search depth.
+    return 0;
+  }
+  
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return 0;        // Only analyze instructions.
+  
+  bool MadeChange = false;
+  uint64_t UndefElts2;
+  Value *TmpV;
+  switch (I->getOpcode()) {
+  default: break;
+    
+  case Instruction::InsertElement: {
+    // If this is a variable index, we don't know which element it overwrites.
+    // demand exactly the same input as we produce.
+    ConstantInt *Idx = dyn_cast<ConstantInt>(I->getOperand(2));
+    if (Idx == 0) {
+      // Note that we can't propagate undef elt info, because we don't know
+      // which elt is getting updated.
+      TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+      break;
+    }
+    
+    // If this is inserting an element that isn't demanded, remove this
+    // insertelement.
+    unsigned IdxNo = Idx->getZExtValue();
+    if (IdxNo >= VWidth || (DemandedElts & (1ULL << IdxNo)) == 0)
+      return AddSoonDeadInstToWorklist(*I, 0);
+    
+    // Otherwise, the element inserted overwrites whatever was there, so the
+    // input demanded set is simpler than the output set.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0),
+                                      DemandedElts & ~(1ULL << IdxNo),
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+
+    // The inserted element is defined, so clear (not set) its undef bit: a
+    // set bit in UndefElts means "known undef" everywhere in this function.
+    UndefElts &= ~(1ULL << IdxNo);
+    break;
+  }
+  case Instruction::BitCast: {
+    // Vector->vector casts only.
+    const VectorType *VTy = dyn_cast<VectorType>(I->getOperand(0)->getType());
+    if (!VTy) break;
+    unsigned InVWidth = VTy->getNumElements();
+    uint64_t InputDemandedElts = 0;
+    unsigned Ratio;
+
+    if (VWidth == InVWidth) {
+      // If we are converting from <4 x i32> -> <4 x f32>, we demand the same
+      // elements as are demanded of us.
+      Ratio = 1;
+      InputDemandedElts = DemandedElts;
+    } else if (VWidth > InVWidth) {
+      // Untested so far.
+      break;
+      
+      // If there are more elements in the result than there are in the source,
+      // then an input element is live if any of the corresponding output
+      // elements are live.
+      Ratio = VWidth/InVWidth;
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx) {
+        if (DemandedElts & (1ULL << OutIdx))
+          InputDemandedElts |= 1ULL << (OutIdx/Ratio);
+      }
+    } else {
+      // Untested so far.
+      break;
+      
+      // If there are more elements in the source than there are in the result,
+      // then an input element is live if the corresponding output element is
+      // live.
+      Ratio = InVWidth/VWidth;
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if (DemandedElts & (1ULL << InIdx/Ratio))
+          InputDemandedElts |= 1ULL << InIdx;
+    }
+    
+    // Simplify the bitcast's input based on the elements we demand of it.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), InputDemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) {
+      I->setOperand(0, TmpV);
+      MadeChange = true;
+    }
+    
+    UndefElts = UndefElts2;
+    if (VWidth > InVWidth) {
+      assert(0 && "Unimp");
+      // If there are more elements in the result than there are in the source,
+      // then an output element is undef if the corresponding input element is
+      // undef.
+      for (unsigned OutIdx = 0; OutIdx != VWidth; ++OutIdx)
+        if (UndefElts2 & (1ULL << (OutIdx/Ratio)))
+          UndefElts |= 1ULL << OutIdx;
+    } else if (VWidth < InVWidth) {
+      assert(0 && "Unimp");
+      // If there are more elements in the source than there are in the result,
+      // then a result element is undef if all of the corresponding input
+      // elements are undef.
+      UndefElts = ~0ULL >> (64-VWidth);  // Start out all undef.
+      for (unsigned InIdx = 0; InIdx != InVWidth; ++InIdx)
+        if ((UndefElts2 & (1ULL << InIdx)) == 0)    // Not undef?
+          UndefElts &= ~(1ULL << (InIdx/Ratio));    // Clear undef bit.
+    }
+    break;
+  }
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Mul:
+    // Note div/rem are deliberately absent from this list: they demand all
+    // input elements, because they don't want to introduce divide by zero.
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(0), DemandedElts,
+                                      UndefElts, Depth+1);
+    if (TmpV) { I->setOperand(0, TmpV); MadeChange = true; }
+    TmpV = SimplifyDemandedVectorElts(I->getOperand(1), DemandedElts,
+                                      UndefElts2, Depth+1);
+    if (TmpV) { I->setOperand(1, TmpV); MadeChange = true; }
+      
+    // Output elements are undefined if both are undefined.  Consider things
+    // like undef&0.  The result is known zero, not undef.
+    UndefElts &= UndefElts2;
+    break;
+    
+  case Instruction::Call: {
+    IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+    if (!II) break;
+    switch (II->getIntrinsicID()) {
+    default: break;
+      
+    // Binary vector operations that work column-wise.  A dest element is a
+    // function of the corresponding input elements from the two inputs.
+    case Intrinsic::x86_sse_sub_ss:
+    case Intrinsic::x86_sse_mul_ss:
+    case Intrinsic::x86_sse_min_ss:
+    case Intrinsic::x86_sse_max_ss:
+    case Intrinsic::x86_sse2_sub_sd:
+    case Intrinsic::x86_sse2_mul_sd:
+    case Intrinsic::x86_sse2_min_sd:
+    case Intrinsic::x86_sse2_max_sd:
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(1), DemandedElts,
+                                        UndefElts, Depth+1);
+      if (TmpV) { II->setOperand(1, TmpV); MadeChange = true; }
+      TmpV = SimplifyDemandedVectorElts(II->getOperand(2), DemandedElts,
+                                        UndefElts2, Depth+1);
+      if (TmpV) { II->setOperand(2, TmpV); MadeChange = true; }
+
+      // If only the low elt is demanded and this is a scalarizable intrinsic,
+      // scalarize it now.
+      if (DemandedElts == 1) {
+        switch (II->getIntrinsicID()) {
+        default: break;
+        case Intrinsic::x86_sse_sub_ss:
+        case Intrinsic::x86_sse_mul_ss:
+        case Intrinsic::x86_sse2_sub_sd:
+        case Intrinsic::x86_sse2_mul_sd:
+          // TODO: Lower MIN/MAX/ABS/etc
+          Value *LHS = II->getOperand(1);
+          Value *RHS = II->getOperand(2);
+          // Extract the element as scalars.
+          LHS = InsertNewInstBefore(new ExtractElementInst(LHS, 0U,"tmp"), *II);
+          RHS = InsertNewInstBefore(new ExtractElementInst(RHS, 0U,"tmp"), *II);
+          
+          switch (II->getIntrinsicID()) {
+          default: assert(0 && "Case stmts out of sync!");
+          case Intrinsic::x86_sse_sub_ss:
+          case Intrinsic::x86_sse2_sub_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::createSub(LHS, RHS,
+                                                        II->getName()), *II);
+            break;
+          case Intrinsic::x86_sse_mul_ss:
+          case Intrinsic::x86_sse2_mul_sd:
+            TmpV = InsertNewInstBefore(BinaryOperator::createMul(LHS, RHS,
+                                                         II->getName()), *II);
+            break;
+          }
+          
+          // Reinsert the scalar result into element 0 of an undef vector.
+          Instruction *New =
+            new InsertElementInst(UndefValue::get(II->getType()), TmpV, 0U,
+                                  II->getName());
+          InsertNewInstBefore(New, *II);
+          AddSoonDeadInstToWorklist(*II, 0);
+          return New;
+        }            
+      }
+        
+      // Output elements are undefined if both are undefined.  Consider things
+      // like undef&0.  The result is known zero, not undef.
+      UndefElts &= UndefElts2;
+      break;
+    }
+    break;
+  }
+  }
+  return MadeChange ? I : 0;
+}
+
+/// @returns true if the specified compare instruction is
+/// true when both operands are equal...
+/// @brief Determine if the ICmpInst returns true if both operands are equal
+static bool isTrueWhenEqual(ICmpInst &ICI) {
+  // The non-strict predicates (and equality itself) all include the x == y
+  // case; the strict and not-equal predicates exclude it.
+  switch (ICI.getPredicate()) {
+  case ICmpInst::ICMP_EQ:
+  case ICmpInst::ICMP_UGE:
+  case ICmpInst::ICMP_SGE:
+  case ICmpInst::ICMP_ULE:
+  case ICmpInst::ICMP_SLE:
+    return true;
+  default:
+    return false;
+  }
+}
+
+/// AssociativeOpt - Perform an optimization on an associative operator.  This
+/// function is designed to check a chain of associative operators for a
+/// potential to apply a certain optimization.  Since the optimization may be
+/// applicable if the expression was reassociated, this checks the chain, then
+/// reassociates the expression as necessary to expose the optimization
+/// opportunity.  This makes use of a special Functor, which must define
+/// 'shouldApply' and 'apply' methods.
+///
+template<typename Functor>
+Instruction *AssociativeOpt(BinaryOperator &Root, const Functor &F) {
+  unsigned Opcode = Root.getOpcode();
+  Value *LHS = Root.getOperand(0);
+
+  // Quick check, see if the immediate LHS matches...
+  if (F.shouldApply(LHS))
+    return F.apply(Root);
+
+  // Otherwise walk up the left-hand chain of single-use operators with the
+  // same opcode as the root, looking for an operand the functor wants.
+  Instruction *LHSI = dyn_cast<Instruction>(LHS);
+  while (LHSI && LHSI->getOpcode() == Opcode && LHSI->hasOneUse()) {
+    // Should we apply this transform to the RHS?
+    bool ShouldApply = F.shouldApply(LHSI->getOperand(1));
+
+    // If not to the RHS, check to see if we should apply to the LHS...
+    if (!ShouldApply && F.shouldApply(LHSI->getOperand(0))) {
+      cast<BinaryOperator>(LHSI)->swapOperands();   // Make the LHS the RHS
+      ShouldApply = true;
+    }
+
+    // If the functor wants to apply the optimization to the RHS of LHSI,
+    // reassociate the expression from ((? op A) op B) to (? op (A op B))
+    if (ShouldApply) {
+      BasicBlock *BB = Root.getParent();
+
+      // Now all of the instructions are in the current basic block, go ahead
+      // and perform the reassociation.
+      Instruction *TmpLHSI = cast<Instruction>(Root.getOperand(0));
+
+      // First move the selected RHS to the LHS of the root...
+      Root.setOperand(0, LHSI->getOperand(1));
+
+      // Make what used to be the LHS of the root be the user of the root...
+      Value *ExtraOperand = TmpLHSI->getOperand(1);
+      // Degenerate case: the root was its own LHS (only possible in
+      // unreachable code).  Replace its uses with null and give up.
+      if (&Root == TmpLHSI) {
+        Root.replaceAllUsesWith(Constant::getNullValue(TmpLHSI->getType()));
+        return 0;
+      }
+      Root.replaceAllUsesWith(TmpLHSI);          // Users now use TmpLHSI
+      TmpLHSI->setOperand(1, &Root);             // TmpLHSI now uses the root
+      TmpLHSI->getParent()->getInstList().remove(TmpLHSI);
+      BasicBlock::iterator ARI = &Root; ++ARI;
+      BB->getInstList().insert(ARI, TmpLHSI);    // Move TmpLHSI to after Root
+      // Reposition the iterator at Root so the chain instructions moved
+      // below are inserted before it (insert() inserts before ARI).
+      ARI = Root;
+
+      // Now propagate the ExtraOperand down the chain of instructions until we
+      // get to LHSI.
+      while (TmpLHSI != LHSI) {
+        Instruction *NextLHSI = cast<Instruction>(TmpLHSI->getOperand(0));
+        // Move the instruction to immediately before the chain we are
+        // constructing to avoid breaking dominance properties.
+        NextLHSI->getParent()->getInstList().remove(NextLHSI);
+        BB->getInstList().insert(ARI, NextLHSI);
+        ARI = NextLHSI;
+
+        // Each step swaps the carried operand into the next link and carries
+        // that link's old RHS further down the chain.
+        Value *NextOp = NextLHSI->getOperand(1);
+        NextLHSI->setOperand(1, ExtraOperand);
+        TmpLHSI = NextLHSI;
+        ExtraOperand = NextOp;
+      }
+
+      // Now that the instructions are reassociated, have the functor perform
+      // the transformation...
+      return F.apply(Root);
+    }
+
+    LHSI = dyn_cast<Instruction>(LHSI->getOperand(0));
+  }
+  return 0;
+}
+
+
+// AddRHS - Implements: X + X --> X << 1
+struct AddRHS {
+  Value *RHS;
+  AddRHS(Value *rhs) : RHS(rhs) {}
+  // Fire only when the candidate operand is exactly the add's RHS.
+  bool shouldApply(Value *LHS) const { return LHS == RHS; }
+  // Rewrite the matched add as a shift left by one.
+  Instruction *apply(BinaryOperator &Add) const {
+    Constant *One = ConstantInt::get(Add.getType(), 1);
+    return BinaryOperator::createShl(Add.getOperand(0), One);
+  }
+};
+
+// AddMaskingAnd - Implements (A & C1)+(B & C2) --> (A & C1)|(B & C2)
+//                 iff C1&C2 == 0
+struct AddMaskingAnd {
+  Constant *C2;
+  AddMaskingAnd(Constant *c) : C2(c) {}
+  bool shouldApply(Value *LHS) const {
+    // The candidate must be an 'and' with a constant mask...
+    ConstantInt *C1;
+    if (!match(LHS, m_And(m_Value(), m_ConstantInt(C1))))
+      return false;
+    // ...and that mask must be disjoint from C2.
+    return ConstantExpr::getAnd(C1, C2)->isNullValue();
+  }
+  Instruction *apply(BinaryOperator &Add) const {
+    // Disjoint masks mean no carries, so the add behaves like an or.
+    return BinaryOperator::createOr(Add.getOperand(0), Add.getOperand(1));
+  }
+};
+
+static Value *FoldOperationIntoSelectOperand(Instruction &I, Value *SO,
+                                             InstCombiner *IC) {
+  // Casts have a single operand: either constant-fold the cast of SO or
+  // materialize a new cast instruction right before I.
+  if (CastInst *CI = dyn_cast<CastInst>(&I)) {
+    if (Constant *SOC = dyn_cast<Constant>(SO))
+      return ConstantExpr::getCast(CI->getOpcode(), SOC, I.getType());
+    CastInst *NewCast = CastInst::create(CI->getOpcode(), SO, I.getType(),
+                                         SO->getName() + ".cast");
+    return IC->InsertNewInstBefore(NewCast, I);
+  }
+
+  // Two-operand case: exactly one operand of I is a constant.  Figure out
+  // which side it is on.
+  bool ConstIsRHS = isa<Constant>(I.getOperand(1));
+  Constant *ConstOperand = cast<Constant>(I.getOperand(ConstIsRHS));
+
+  // If SO is constant too, fold the whole operation at compile time,
+  // preserving the original operand order.
+  if (Constant *SOC = dyn_cast<Constant>(SO))
+    return ConstIsRHS ? ConstantExpr::get(I.getOpcode(), SOC, ConstOperand)
+                      : ConstantExpr::get(I.getOpcode(), ConstOperand, SOC);
+
+  // Otherwise rebuild the operation with SO occupying the non-constant slot.
+  Value *Ops[2] = { ConstOperand, ConstOperand };
+  Ops[!ConstIsRHS] = SO;
+
+  Instruction *New;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I))
+    New = BinaryOperator::create(BO->getOpcode(), Ops[0], Ops[1],
+                                 SO->getName()+".op");
+  else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+    New = CmpInst::create(CI->getOpcode(), CI->getPredicate(), Ops[0], Ops[1],
+                          SO->getName()+".cmp");
+  else {
+    assert(0 && "Unknown binary instruction type!");
+    abort();
+  }
+  return IC->InsertNewInstBefore(New, I);
+}
+
+// FoldOpIntoSelect - Given an instruction with a select as one operand and a
+// constant as the other operand, try to fold the binary operator into the
+// select arguments.  This also works for Cast instructions, which obviously do
+// not have a second operand.
+static Instruction *FoldOpIntoSelect(Instruction &Op, SelectInst *SI,
+                                     InstCombiner *IC) {
+  // Don't modify shared select instructions.
+  if (!SI->hasOneUse())
+    return 0;
+
+  Value *TrueVal  = SI->getOperand(1);
+  Value *FalseVal = SI->getOperand(2);
+
+  // Only worthwhile when at least one select arm can constant-fold.
+  if (!isa<Constant>(TrueVal) && !isa<Constant>(FalseVal))
+    return 0;
+
+  // Bool selects with constant operands can be folded to logical ops.
+  if (SI->getType() == Type::Int1Ty)
+    return 0;
+
+  Value *NewTrue  = FoldOperationIntoSelectOperand(Op, TrueVal, IC);
+  Value *NewFalse = FoldOperationIntoSelectOperand(Op, FalseVal, IC);
+  return new SelectInst(SI->getCondition(), NewTrue, NewFalse);
+}
+
+
+/// FoldOpIntoPhi - Given a binary operator or cast instruction which has a PHI
+/// node as operand #0, see if we can fold the instruction into the PHI (which
+/// is only possible if all operands to the PHI are constants).
+/// One non-constant incoming value is also tolerated, provided its predecessor
+/// block unconditionally branches into the PHI block; in that case a copy of
+/// the operation is materialized in that predecessor.
+Instruction *InstCombiner::FoldOpIntoPhi(Instruction &I) {
+  PHINode *PN = cast<PHINode>(I.getOperand(0));
+  unsigned NumPHIValues = PN->getNumIncomingValues();
+  // Only worthwhile if the PHI dies with I; bail on degenerate empty PHIs.
+  if (!PN->hasOneUse() || NumPHIValues == 0) return 0;
+
+  // Check to see if all of the operands of the PHI are constants.  If there is
+  // one non-constant value, remember the BB it is.  If there is more than one
+  // or if *it* is a PHI, bail out.
+  BasicBlock *NonConstBB = 0;
+  for (unsigned i = 0; i != NumPHIValues; ++i)
+    if (!isa<Constant>(PN->getIncomingValue(i))) {
+      if (NonConstBB) return 0;  // More than one non-const value.
+      if (isa<PHINode>(PN->getIncomingValue(i))) return 0;  // Itself a phi.
+      NonConstBB = PN->getIncomingBlock(i);
+      
+      // If the incoming non-constant value is in I's block, we have an infinite
+      // loop.
+      if (NonConstBB == I.getParent())
+        return 0;
+    }
+  
+  // If there is exactly one non-constant value, we can insert a copy of the
+  // operation in that block.  However, if this is a critical edge, we would be
+  // inserting the computation one some other paths (e.g. inside a loop).  Only
+  // do this if the pred block is unconditionally branching into the phi block.
+  if (NonConstBB) {
+    BranchInst *BI = dyn_cast<BranchInst>(NonConstBB->getTerminator());
+    if (!BI || !BI->isUnconditional()) return 0;
+  }
+
+  // Okay, we can do the transformation: create the new PHI node.
+  PHINode *NewPN = new PHINode(I.getType(), "");
+  NewPN->reserveOperandSpace(PN->getNumOperands()/2);
+  InsertNewInstBefore(NewPN, *PN);
+  NewPN->takeName(PN);
+
+  // Next, add all of the operands to the PHI.
+  // Two operands means a binary operator or compare: operand #1 must be the
+  // constant RHS (operand #0 is the PHI itself).
+  if (I.getNumOperands() == 2) {
+    Constant *C = cast<Constant>(I.getOperand(1));
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+        // Constant incoming value: fold it at compile time.
+        if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+          InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
+        else
+          InV = ConstantExpr::get(I.getOpcode(), InC, C);
+      } else {
+        // The single non-constant value: emit the operation in its
+        // predecessor block, just before the unconditional branch.
+        assert(PN->getIncomingBlock(i) == NonConstBB);
+        if (BinaryOperator *BO = dyn_cast<BinaryOperator>(&I)) 
+          InV = BinaryOperator::create(BO->getOpcode(),
+                                       PN->getIncomingValue(i), C, "phitmp",
+                                       NonConstBB->getTerminator());
+        else if (CmpInst *CI = dyn_cast<CmpInst>(&I))
+          InV = CmpInst::create(CI->getOpcode(), 
+                                CI->getPredicate(),
+                                PN->getIncomingValue(i), C, "phitmp",
+                                NonConstBB->getTerminator());
+        else
+          assert(0 && "Unknown binop!");
+        
+        AddToWorkList(cast<Instruction>(InV));
+      }
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  } else { 
+    // Otherwise this is a cast of the PHI value.
+    CastInst *CI = cast<CastInst>(&I);
+    const Type *RetTy = CI->getType();
+    for (unsigned i = 0; i != NumPHIValues; ++i) {
+      Value *InV;
+      if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i))) {
+        InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
+      } else {
+        assert(PN->getIncomingBlock(i) == NonConstBB);
+        InV = CastInst::create(CI->getOpcode(), PN->getIncomingValue(i), 
+                               I.getType(), "phitmp", 
+                               NonConstBB->getTerminator());
+        AddToWorkList(cast<Instruction>(InV));
+      }
+      NewPN->addIncoming(InV, PN->getIncomingBlock(i));
+    }
+  }
+  return ReplaceInstUsesWith(I, NewPN);
+}
+
+/// visitAdd - Simplify integer and FP add instructions.  Returns a
+/// replacement instruction, &I if the operands were only canonicalized,
+/// or null if no change was made.  The transforms are tried in order, so
+/// earlier (cheaper) folds take precedence.
+Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *LHS = I.getOperand(0), *RHS = I.getOperand(1);
+
+  if (Constant *RHSC = dyn_cast<Constant>(RHS)) {
+    // X + undef -> undef
+    if (isa<UndefValue>(RHS))
+      return ReplaceInstUsesWith(I, RHS);
+
+    // X + 0 --> X
+    if (!I.getType()->isFPOrFPVector()) { // NOTE: -0 + +0 = +0.
+      if (RHSC->isNullValue())
+        return ReplaceInstUsesWith(I, LHS);
+    } else if (ConstantFP *CFP = dyn_cast<ConstantFP>(RHSC)) {
+      // For FP only X + -0.0 is an identity (X + +0.0 is not, for X == -0.0).
+      if (CFP->isExactlyValue(-0.0))
+        return ReplaceInstUsesWith(I, LHS);
+    }
+
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(RHSC)) {
+      // X + (signbit) --> X ^ signbit
+      const APInt& Val = CI->getValue();
+      uint32_t BitWidth = Val.getBitWidth();
+      if (Val == APInt::getSignBit(BitWidth))
+        return BinaryOperator::createXor(LHS, RHS);
+      
+      // See if SimplifyDemandedBits can simplify this.  This handles stuff like
+      // (X & 254)+1 -> (X&254)|1
+      if (!isa<VectorType>(I.getType())) {
+        APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+        if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
+                                 KnownZero, KnownOne))
+          return &I;
+      }
+    }
+
+    if (isa<PHINode>(LHS))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+    
+    ConstantInt *XorRHS = 0;
+    Value *XorLHS = 0;
+    if (isa<ConstantInt>(RHSC) &&
+        match(LHS, m_Xor(m_Value(XorLHS), m_ConstantInt(XorRHS)))) {
+      uint32_t TySizeBits = I.getType()->getPrimitiveSizeInBits();
+      const APInt& RHSVal = cast<ConstantInt>(RHSC)->getValue();
+      
+      uint32_t Size = TySizeBits / 2;
+      APInt C0080Val(APInt(TySizeBits, 1ULL).shl(Size - 1));
+      APInt CFF80Val(-C0080Val);
+      // Probe successively smaller half-widths for the xor-based sign-extend
+      // idiom; Size is left at the matched width, or 0 if no match is possible.
+      do {
+        if (TySizeBits > Size) {
+          // If we have ADD(XOR(AND(X, 0xFF), 0x80), 0xF..F80), it's a sext.
+          // If we have ADD(XOR(AND(X, 0xFF), 0xF..F80), 0x80), it's a sext.
+          if ((RHSVal == CFF80Val && XorRHS->getValue() == C0080Val) ||
+              (RHSVal == C0080Val && XorRHS->getValue() == CFF80Val)) {
+            // This is a sign extend if the top bits are known zero.
+            if (!MaskedValueIsZero(XorLHS, 
+                   APInt::getHighBitsSet(TySizeBits, TySizeBits - Size)))
+              Size = 0;  // Not a sign ext, but can't be any others either.
+            break;
+          }
+        }
+        Size >>= 1;
+        C0080Val = APIntOps::lshr(C0080Val, Size);
+        CFF80Val = APIntOps::ashr(CFF80Val, Size);
+      } while (Size >= 1);
+      
+      // FIXME: This shouldn't be necessary. When the backends can handle types
+      // with funny bit widths then this whole cascade of if statements should
+      // be removed. It is just here to get the size of the "middle" type back
+      // up to something that the back ends can handle.
+      const Type *MiddleType = 0;
+      switch (Size) {
+        default: break;
+        case 32: MiddleType = Type::Int32Ty; break;
+        case 16: MiddleType = Type::Int16Ty; break;
+        case  8: MiddleType = Type::Int8Ty; break;
+      }
+      if (MiddleType) {
+        // Rebuild the recognized pattern as an explicit trunc+sext pair.
+        Instruction *NewTrunc = new TruncInst(XorLHS, MiddleType, "sext");
+        InsertNewInstBefore(NewTrunc, I);
+        return new SExtInst(NewTrunc, I.getType(), I.getName());
+      }
+    }
+  }
+
+  // X + X --> X << 1
+  if (I.getType()->isInteger() && I.getType() != Type::Int1Ty) {
+    if (Instruction *Result = AssociativeOpt(I, AddRHS(RHS))) return Result;
+
+    if (Instruction *RHSI = dyn_cast<Instruction>(RHS)) {
+      if (RHSI->getOpcode() == Instruction::Sub)
+        if (LHS == RHSI->getOperand(1))                   // A + (B - A) --> B
+          return ReplaceInstUsesWith(I, RHSI->getOperand(0));
+    }
+    if (Instruction *LHSI = dyn_cast<Instruction>(LHS)) {
+      if (LHSI->getOpcode() == Instruction::Sub)
+        if (RHS == LHSI->getOperand(1))                   // (B - A) + A --> B
+          return ReplaceInstUsesWith(I, LHSI->getOperand(0));
+    }
+  }
+
+  // -A + B  -->  B - A
+  if (Value *V = dyn_castNegVal(LHS))
+    return BinaryOperator::createSub(RHS, V);
+
+  // A + -B  -->  A - B
+  if (!isa<Constant>(RHS))
+    if (Value *V = dyn_castNegVal(RHS))
+      return BinaryOperator::createSub(LHS, V);
+
+
+  ConstantInt *C2;
+  if (Value *X = dyn_castFoldableMul(LHS, C2)) {
+    if (X == RHS)   // X*C + X --> X * (C+1)
+      return BinaryOperator::createMul(RHS, AddOne(C2));
+
+    // X*C1 + X*C2 --> X * (C1+C2)
+    ConstantInt *C1;
+    if (X == dyn_castFoldableMul(RHS, C1))
+      return BinaryOperator::createMul(X, Add(C1, C2));
+  }
+
+  // X + X*C --> X * (C+1)
+  if (dyn_castFoldableMul(RHS, C2) == LHS)
+    return BinaryOperator::createMul(LHS, AddOne(C2));
+
+  // X + ~X --> -1   since   ~X = -X-1
+  if (dyn_castNotVal(LHS) == RHS || dyn_castNotVal(RHS) == LHS)
+    return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+  
+
+  // (A & C1)+(B & C2) --> (A & C1)|(B & C2) iff C1&C2 == 0
+  if (match(RHS, m_And(m_Value(), m_ConstantInt(C2))))
+    if (Instruction *R = AssociativeOpt(I, AddMaskingAnd(C2)))
+      return R;
+
+  if (ConstantInt *CRHS = dyn_cast<ConstantInt>(RHS)) {
+    Value *X = 0;
+    if (match(LHS, m_Not(m_Value(X))))    // ~X + C --> (C-1) - X
+      return BinaryOperator::createSub(SubOne(CRHS), X);
+
+    // (X & FF00) + xx00  -> (X+xx00) & FF00
+    if (LHS->hasOneUse() && match(LHS, m_And(m_Value(X), m_ConstantInt(C2)))) {
+      Constant *Anded = And(CRHS, C2);
+      if (Anded == CRHS) {
+        // See if all bits from the first bit set in the Add RHS up are included
+        // in the mask.  First, get the rightmost bit.
+        const APInt& AddRHSV = CRHS->getValue();
+
+        // Form a mask of all bits from the lowest bit added through the top.
+        APInt AddRHSHighBits(~((AddRHSV & -AddRHSV)-1));
+
+        // See if the and mask includes all of these bits.
+        APInt AddRHSHighBitsAnd(AddRHSHighBits & C2->getValue());
+
+        if (AddRHSHighBits == AddRHSHighBitsAnd) {
+          // Okay, the xform is safe.  Insert the new add pronto.
+          Value *NewAdd = InsertNewInstBefore(BinaryOperator::createAdd(X, CRHS,
+                                                            LHS->getName()), I);
+          return BinaryOperator::createAnd(NewAdd, C2);
+        }
+      }
+    }
+
+    // Try to fold constant add into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(LHS))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+  }
+
+  // add (cast *A to intptrtype) B -> 
+  //   cast (GEP (cast *A to sbyte*) B) -> 
+  //     intptrtype
+  {
+    // The cast may be on either side of the add; Other is the non-cast operand.
+    CastInst *CI = dyn_cast<CastInst>(LHS);
+    Value *Other = RHS;
+    if (!CI) {
+      CI = dyn_cast<CastInst>(RHS);
+      Other = LHS;
+    }
+    if (CI && CI->getType()->isSized() && 
+        (CI->getType()->getPrimitiveSizeInBits() == 
+         TD->getIntPtrType()->getPrimitiveSizeInBits()) 
+        && isa<PointerType>(CI->getOperand(0)->getType())) {
+      Value *I2 = InsertCastBefore(Instruction::BitCast, CI->getOperand(0),
+                                   PointerType::get(Type::Int8Ty), I);
+      I2 = InsertNewInstBefore(new GetElementPtrInst(I2, Other, "ctg2"), I);
+      return new PtrToIntInst(I2, CI->getType());
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+// isSignBit - Return true if the value represented by the constant only has the
+// highest order bit set.
+static bool isSignBit(ConstantInt *CI) {
+  // The sign bit of an N-bit integer is the lone bit 1 << (N-1).
+  const uint32_t BitWidth = CI->getType()->getPrimitiveSizeInBits();
+  const APInt &V = CI->getValue();
+  return V == APInt::getSignBit(BitWidth);
+}
+
+/// visitSub - Simplify sub instructions.  Returns a replacement instruction,
+/// &I if the instruction was updated in place, or null if nothing changed.
+Instruction *InstCombiner::visitSub(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Op0 == Op1)         // sub X, X  -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // If this is a 'B = x-(-A)', change to B = x+A...
+  if (Value *V = dyn_castNegVal(Op1))
+    return BinaryOperator::createAdd(Op0, V);
+
+  if (isa<UndefValue>(Op0))
+    return ReplaceInstUsesWith(I, Op0);    // undef - X -> undef
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);    // X - undef -> undef
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op0)) {
+    // Replace (-1 - A) with (~A)...
+    if (C->isAllOnesValue())
+      return BinaryOperator::createNot(Op1);
+
+    // C - ~X == X + (1+C)
+    Value *X = 0;
+    if (match(Op1, m_Not(m_Value(X))))
+      return BinaryOperator::createAdd(X, AddOne(C));
+
+    // -(X >>u 31) -> (X >>s 31)
+    // -(X >>s 31) -> (X >>u 31)
+    if (C->isZero()) {
+      if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op1))
+        if (SI->getOpcode() == Instruction::LShr) {
+          if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+            // Check to see if we are shifting out everything but the sign bit.
+            if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+                SI->getType()->getPrimitiveSizeInBits()-1) {
+              // Ok, the transformation is safe.  Insert AShr.
+              return BinaryOperator::create(Instruction::AShr, 
+                                          SI->getOperand(0), CU, SI->getName());
+            }
+          }
+        }
+        // Note: this 'else' pairs with the inner "is LShr" test above.
+        else if (SI->getOpcode() == Instruction::AShr) {
+          if (ConstantInt *CU = dyn_cast<ConstantInt>(SI->getOperand(1))) {
+            // Check to see if we are shifting out everything but the sign bit.
+            if (CU->getLimitedValue(SI->getType()->getPrimitiveSizeInBits()) ==
+                SI->getType()->getPrimitiveSizeInBits()-1) {
+              // Ok, the transformation is safe.  Insert LShr. 
+              return BinaryOperator::createLShr(
+                                          SI->getOperand(0), CU, SI->getName());
+            }
+          }
+        } 
+    }
+
+    // Try to fold constant sub into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1)) {
+    if (Op1I->getOpcode() == Instruction::Add &&
+        !Op0->getType()->isFPOrFPVector()) {
+      if (Op1I->getOperand(0) == Op0)              // X-(X+Y) == -Y
+        return BinaryOperator::createNeg(Op1I->getOperand(1), I.getName());
+      else if (Op1I->getOperand(1) == Op0)         // X-(Y+X) == -Y
+        return BinaryOperator::createNeg(Op1I->getOperand(0), I.getName());
+      else if (ConstantInt *CI1 = dyn_cast<ConstantInt>(I.getOperand(0))) {
+        if (ConstantInt *CI2 = dyn_cast<ConstantInt>(Op1I->getOperand(1)))
+          // C1-(X+C2) --> (C1-C2)-X
+          return BinaryOperator::createSub(Subtract(CI1, CI2), 
+                                           Op1I->getOperand(0));
+      }
+    }
+
+    if (Op1I->hasOneUse()) {
+      // Replace (x - (y - z)) with (x + (z - y)) if the (y - z) subexpression
+      // is not used by anyone else...
+      //
+      if (Op1I->getOpcode() == Instruction::Sub &&
+          !Op1I->getType()->isFPOrFPVector()) {
+        // Swap the two operands of the subexpr...
+        Value *IIOp0 = Op1I->getOperand(0), *IIOp1 = Op1I->getOperand(1);
+        Op1I->setOperand(0, IIOp1);
+        Op1I->setOperand(1, IIOp0);
+
+        // Create the new top level add instruction...
+        return BinaryOperator::createAdd(Op0, Op1);
+      }
+
+      // Replace (A - (A & B)) with (A & ~B) if this is the only use of (A&B)...
+      //
+      if (Op1I->getOpcode() == Instruction::And &&
+          (Op1I->getOperand(0) == Op0 || Op1I->getOperand(1) == Op0)) {
+        // Pick the 'and' operand that is NOT Op0: the bool comparison yields
+        // index 1 when operand 0 is Op0, and index 0 otherwise.
+        Value *OtherOp = Op1I->getOperand(Op1I->getOperand(0) == Op0);
+
+        Value *NewNot =
+          InsertNewInstBefore(BinaryOperator::createNot(OtherOp, "B.not"), I);
+        return BinaryOperator::createAnd(Op0, NewNot);
+      }
+
+      // 0 - (X sdiv C)  -> (X sdiv -C)
+      if (Op1I->getOpcode() == Instruction::SDiv)
+        if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+          if (CSI->isZero())
+            if (Constant *DivRHS = dyn_cast<Constant>(Op1I->getOperand(1)))
+              return BinaryOperator::createSDiv(Op1I->getOperand(0),
+                                               ConstantExpr::getNeg(DivRHS));
+
+      // X - X*C --> X * (1-C)
+      ConstantInt *C2 = 0;
+      if (dyn_castFoldableMul(Op1I, C2) == Op0) {
+        Constant *CP1 = Subtract(ConstantInt::get(I.getType(), 1), C2);
+        return BinaryOperator::createMul(Op0, CP1);
+      }
+    }
+  }
+
+  if (!Op0->getType()->isFPOrFPVector())
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+      if (Op0I->getOpcode() == Instruction::Add) {
+        if (Op0I->getOperand(0) == Op1)             // (Y+X)-Y == X
+          return ReplaceInstUsesWith(I, Op0I->getOperand(1));
+        else if (Op0I->getOperand(1) == Op1)        // (X+Y)-Y == X
+          return ReplaceInstUsesWith(I, Op0I->getOperand(0));
+      } else if (Op0I->getOpcode() == Instruction::Sub) {
+        if (Op0I->getOperand(0) == Op1)             // (X-Y)-X == -Y
+          return BinaryOperator::createNeg(Op0I->getOperand(1), I.getName());
+      }
+
+  ConstantInt *C1;
+  if (Value *X = dyn_castFoldableMul(Op0, C1)) {
+    if (X == Op1)  // X*C - X --> X * (C-1)
+      return BinaryOperator::createMul(Op1, SubOne(C1));
+
+    ConstantInt *C2;   // X*C1 - X*C2 -> X * (C1-C2)
+    if (X == dyn_castFoldableMul(Op1, C2))
+      return BinaryOperator::createMul(Op1, Subtract(C1, C2));
+  }
+  return 0;
+}
+
+/// isSignBitCheck - Given an exploded icmp instruction, return true if the
+/// comparison only checks the sign bit.  If it only checks the sign bit, set
+/// TrueIfSigned if the result of the comparison is true when the input value is
+/// signed.
+static bool isSignBitCheck(ICmpInst::Predicate pred, ConstantInt *RHS,
+                           bool &TrueIfSigned) {
+  const uint32_t W = RHS->getType()->getPrimitiveSizeInBits();
+  switch (pred) {
+  default:
+    // No other predicate can isolate just the sign bit.
+    return false;
+  case ICmpInst::ICMP_SLT:
+    // LHS s< 0 is true exactly when the sign bit of LHS is set.
+    TrueIfSigned = true;
+    return RHS->isZero();
+  case ICmpInst::ICMP_SLE:
+    // LHS s<= -1 is likewise true exactly for negative LHS.
+    TrueIfSigned = true;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_SGT:
+    // LHS s> -1 is true exactly for non-negative LHS.
+    TrueIfSigned = false;
+    return RHS->isAllOnesValue();
+  case ICmpInst::ICMP_UGT:
+    // LHS u> 0111...1 (signed max) is true exactly when the sign bit is set.
+    TrueIfSigned = true;
+    return RHS->getValue() == APInt::getSignedMaxValue(W);
+  case ICmpInst::ICMP_UGE:
+    // LHS u>= 1000...0 (the sign bit itself: 2^7, 2^15, 2^31, ...) likewise.
+    TrueIfSigned = true;
+    return RHS->getValue() == APInt::getSignBit(W);
+  }
+}
+
+/// visitMul - Simplify mul instructions.  Returns a replacement instruction,
+/// &I if the operands were only canonicalized, or null if nothing changed.
+Instruction *InstCombiner::visitMul(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0);
+
+  if (isa<UndefValue>(I.getOperand(1)))              // undef * X -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // Simplify mul instructions with a constant RHS...
+  if (Constant *Op1 = dyn_cast<Constant>(I.getOperand(1))) {
+    if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+
+      // ((X << C1)*C2) == (X * (C2 << C1))
+      if (BinaryOperator *SI = dyn_cast<BinaryOperator>(Op0))
+        if (SI->getOpcode() == Instruction::Shl)
+          if (Constant *ShOp = dyn_cast<Constant>(SI->getOperand(1)))
+            return BinaryOperator::createMul(SI->getOperand(0),
+                                             ConstantExpr::getShl(CI, ShOp));
+
+      if (CI->isZero())
+        return ReplaceInstUsesWith(I, Op1);  // X * 0  == 0
+      if (CI->equalsInt(1))                  // X * 1  == X
+        return ReplaceInstUsesWith(I, Op0);
+      if (CI->isAllOnesValue())              // X * -1 == 0 - X
+        return BinaryOperator::createNeg(Op0, I.getName());
+
+      // CI is already a ConstantInt, so no second cast is needed here.
+      const APInt& Val = CI->getValue();
+      if (Val.isPowerOf2()) {          // Replace X*(2^C) with X << C
+        return BinaryOperator::createShl(Op0,
+                 ConstantInt::get(Op0->getType(), Val.logBase2()));
+      }
+    } else if (ConstantFP *Op1F = dyn_cast<ConstantFP>(Op1)) {
+      if (Op1F->isNullValue())
+        return ReplaceInstUsesWith(I, Op1);
+
+      // "In IEEE floating point, x*1 is not equivalent to x for nans.  However,
+      // ANSI says we can drop signals, so we can do this anyway." (from GCC)
+      if (Op1F->getValue() == 1.0)
+        return ReplaceInstUsesWith(I, Op0);  // Eliminate 'mul double %X, 1.0'
+    }
+    
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0))
+      if (Op0I->getOpcode() == Instruction::Add && Op0I->hasOneUse() &&
+          isa<ConstantInt>(Op0I->getOperand(1))) {
+        // Canonicalize (X+C1)*C2 -> X*C2+C1*C2.
+        Instruction *Add = BinaryOperator::createMul(Op0I->getOperand(0),
+                                                     Op1, "tmp");
+        InsertNewInstBefore(Add, I);
+        Value *C1C2 = ConstantExpr::getMul(Op1, 
+                                           cast<Constant>(Op0I->getOperand(1)));
+        return BinaryOperator::createAdd(Add, C1C2);
+      }
+
+    // Try to fold constant mul into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *Op0v = dyn_castNegVal(Op0))     // -X * -Y = X*Y
+    if (Value *Op1v = dyn_castNegVal(I.getOperand(1)))
+      return BinaryOperator::createMul(Op0v, Op1v);
+
+  // If one of the operands of the multiply is a cast from a boolean value, then
+  // we know the bool is either zero or one, so this is a 'masking' multiply.
+  // See if we can simplify things based on how the boolean was originally
+  // formed.
+  CastInst *BoolCast = 0;
+  if (ZExtInst *CI = dyn_cast<ZExtInst>(I.getOperand(0)))
+    if (CI->getOperand(0)->getType() == Type::Int1Ty)
+      BoolCast = CI;
+  if (!BoolCast)
+    if (ZExtInst *CI = dyn_cast<ZExtInst>(I.getOperand(1)))
+      if (CI->getOperand(0)->getType() == Type::Int1Ty)
+        BoolCast = CI;
+  if (BoolCast) {
+    if (ICmpInst *SCI = dyn_cast<ICmpInst>(BoolCast->getOperand(0))) {
+      Value *SCIOp0 = SCI->getOperand(0), *SCIOp1 = SCI->getOperand(1);
+      const Type *SCOpTy = SCIOp0->getType();
+      bool TIS = false;
+      
+      // If the icmp is true iff the sign bit of X is set, then convert this
+      // multiply into a shift/and combination.
+      if (isa<ConstantInt>(SCIOp1) &&
+          isSignBitCheck(SCI->getPredicate(), cast<ConstantInt>(SCIOp1), TIS) &&
+          TIS) {
+        // Shift the X value right to turn it into "all signbits".
+        Constant *Amt = ConstantInt::get(SCIOp0->getType(),
+                                          SCOpTy->getPrimitiveSizeInBits()-1);
+        Value *V =
+          InsertNewInstBefore(
+            BinaryOperator::create(Instruction::AShr, SCIOp0, Amt,
+                                            BoolCast->getOperand(0)->getName()+
+                                            ".mask"), I);
+
+        // If the multiply type is not the same as the source type, sign extend
+        // or truncate to the multiply type.
+        if (I.getType() != V->getType()) {
+          uint32_t SrcBits = V->getType()->getPrimitiveSizeInBits();
+          uint32_t DstBits = I.getType()->getPrimitiveSizeInBits();
+          Instruction::CastOps opcode = 
+            (SrcBits == DstBits ? Instruction::BitCast : 
+             (SrcBits < DstBits ? Instruction::SExt : Instruction::Trunc));
+          V = InsertCastBefore(opcode, V, I.getType(), I);
+        }
+
+        // 'And' the sign-mask with the other multiply operand.
+        Value *OtherOp = Op0 == BoolCast ? I.getOperand(1) : Op0;
+        return BinaryOperator::createAnd(V, OtherOp);
+      }
+    }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// This function implements the transforms on div instructions that work
+/// regardless of the kind of div instruction it is (udiv, sdiv, or fdiv). It is
+/// used by the visitors to those instructions.
+/// Returns a replacement instruction, &I if an operand was updated in place,
+/// or null if nothing changed.
+/// @brief Transforms common to all three div instructions
+Instruction *InstCombiner::commonDivTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // undef / X -> 0
+  if (isa<UndefValue>(Op0))
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // X / undef -> undef
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);
+
+  // Handle cases involving: div X, (select Cond, Y, Z)
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) {
+    // div X, (Cond ? 0 : Y) -> div X, Y.  If the div and the select are in the
+    // same basic block, then we replace the select with Y, and the condition 
+    // of the select with false (if the cond value is in the same BB).  If the
+    // select has uses other than the div, this allows them to be simplified
+    // also. Note that div X, Y is just as good as div X, 0 (undef)
+    if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+      if (ST->isNullValue()) {
+        Instruction *CondI = dyn_cast<Instruction>(SI->getOperand(0));
+        if (CondI && CondI->getParent() == I.getParent())
+          UpdateValueUsesWith(CondI, ConstantInt::getFalse());
+        else if (I.getParent() != SI->getParent() || SI->hasOneUse())
+          I.setOperand(1, SI->getOperand(2));
+        else
+          UpdateValueUsesWith(SI, SI->getOperand(2));
+        return &I;
+      }
+
+    // Likewise for: div X, (Cond ? Y : 0) -> div X, Y
+    if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+      if (ST->isNullValue()) {
+        Instruction *CondI = dyn_cast<Instruction>(SI->getOperand(0));
+        if (CondI && CondI->getParent() == I.getParent())
+          UpdateValueUsesWith(CondI, ConstantInt::getTrue());
+        else if (I.getParent() != SI->getParent() || SI->hasOneUse())
+          I.setOperand(1, SI->getOperand(1));
+        else
+          UpdateValueUsesWith(SI, SI->getOperand(1));
+        return &I;
+      }
+  }
+
+  return 0;
+}
+
+/// This function implements the transforms common to both integer division
+/// instructions (udiv and sdiv). It is called by the visitors to those integer
+/// division instructions.
+/// @brief Common integer divide transforms
+Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // First apply the transforms shared with fdiv (undef and select handling).
+  if (Instruction *Common = commonDivTransforms(I))
+    return Common;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // div X, 1 == X
+    if (RHS->equalsInt(1))
+      return ReplaceInstUsesWith(I, Op0);
+
+    // (X / C1) / C2  -> X / (C1*C2)
+    // NOTE(review): relies on Multiply's behavior when C1*C2 overflows the
+    // type -- confirm against that helper's definition.
+    if (Instruction *LHS = dyn_cast<Instruction>(Op0))
+      if (Instruction::BinaryOps(LHS->getOpcode()) == I.getOpcode())
+        if (ConstantInt *LHSRHS = dyn_cast<ConstantInt>(LHS->getOperand(1))) {
+          return BinaryOperator::create(I.getOpcode(), LHS->getOperand(0),
+                                        Multiply(RHS, LHSRHS));
+        }
+
+    if (!RHS->isZero()) { // avoid X udiv 0
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+        if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+          return R;
+      if (isa<PHINode>(Op0))
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+    }
+  }
+
+  // 0 / X == 0, we don't need to preserve faults!
+  if (ConstantInt *LHS = dyn_cast<ConstantInt>(Op0))
+    if (LHS->equalsInt(0))
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  return 0;
+}
+
+/// visitUDiv - Simplify unsigned division.  Returns a replacement
+/// instruction or null if nothing changed.
+Instruction *InstCombiner::visitUDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Handle the integer div common cases
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
+  // X udiv 2^C -> X >> C
+  // Check to see if this is an unsigned division with an exact power of 2,
+  // if so, convert to a right shift.
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Op1)) {
+    if (C->getValue().isPowerOf2())  // 0 not included in isPowerOf2
+      return BinaryOperator::createLShr(Op0, 
+               ConstantInt::get(Op0->getType(), C->getValue().logBase2()));
+  }
+
+  // X udiv (C1 << N), where C1 is "1<<C2"  -->  X >> (N+C2)
+  if (BinaryOperator *RHSI = dyn_cast<BinaryOperator>(I.getOperand(1))) {
+    if (RHSI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(RHSI->getOperand(0))) {
+      const APInt& C1 = cast<ConstantInt>(RHSI->getOperand(0))->getValue();
+      if (C1.isPowerOf2()) {
+        Value *N = RHSI->getOperand(1);
+        const Type *NTy = N->getType();
+        // Only emit the add when C2 != 0 (i.e. C1 != 1).
+        if (uint32_t C2 = C1.logBase2()) {
+          Constant *C2V = ConstantInt::get(NTy, C2);
+          N = InsertNewInstBefore(BinaryOperator::createAdd(N, C2V, "tmp"), I);
+        }
+        return BinaryOperator::createLShr(Op0, N);
+      }
+    }
+  }
+  
+  // udiv X, (Select Cond, C1, C2) --> Select Cond, (shr X, C1), (shr X, C2)
+  // where C1&C2 are powers of two.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) 
+    if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+      if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2)))  {
+        const APInt &TVA = STO->getValue(), &FVA = SFO->getValue();
+        if (TVA.isPowerOf2() && FVA.isPowerOf2()) {
+          // Compute the shift amounts
+          uint32_t TSA = TVA.logBase2(), FSA = FVA.logBase2();
+          // Construct the "on true" case of the select
+          Constant *TC = ConstantInt::get(Op0->getType(), TSA);
+          Instruction *TSI = BinaryOperator::createLShr(
+                                                 Op0, TC, SI->getName()+".t");
+          TSI = InsertNewInstBefore(TSI, I);
+  
+          // Construct the "on false" case of the select
+          Constant *FC = ConstantInt::get(Op0->getType(), FSA); 
+          Instruction *FSI = BinaryOperator::createLShr(
+                                                 Op0, FC, SI->getName()+".f");
+          FSI = InsertNewInstBefore(FSI, I);
+
+          // construct the select instruction and return it.
+          return new SelectInst(SI->getOperand(0), TSI, FSI, SI->getName());
+        }
+      }
+  return 0;
+}
+
+Instruction *InstCombiner::visitSDiv(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // First apply the folds shared by udiv and sdiv.
+  if (Instruction *Common = commonIDivTransforms(I))
+    return Common;
+
+  if (ConstantInt *DivisorC = dyn_cast<ConstantInt>(Op1)) {
+    // sdiv X, -1 is just negation.
+    if (DivisorC->isAllOnesValue())
+      return BinaryOperator::createNeg(Op0);
+
+    // (-X) sdiv C --> X sdiv (-C): push the negation into the constant.
+    if (Value *NegOp = dyn_castNegVal(Op0))
+      return BinaryOperator::createSDiv(NegOp, ConstantExpr::getNeg(DivisorC));
+  }
+
+  // When neither operand can have its sign bit set, signed and unsigned
+  // division agree, so prefer the unsigned form.
+  if (I.getType()->isInteger()) {
+    APInt SignMask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+    if (MaskedValueIsZero(Op1, SignMask) && MaskedValueIsZero(Op0, SignMask))
+      return BinaryOperator::createUDiv(Op0, Op1, I.getName());
+  }
+
+  return 0;
+}
+
+/// visitFDiv - FP division has no integer-specific folds; only the generic
+/// div transforms (undef handling, select-of-zero folding) apply.
+Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
+  return commonDivTransforms(I);
+}
+
+/// GetFactor - If we can prove that the specified value is at least a multiple
+/// of some factor, return that factor.
+static Constant *GetFactor(Value *V) {
+  // A constant is trivially a multiple of itself.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(V))
+    return CI;
+
+  // Fallback: everything is a multiple of 1.
+  Constant *One = ConstantInt::get(V->getType(), 1);
+
+  Instruction *Inst = dyn_cast<Instruction>(V);
+  if (!Inst) return One;
+
+  switch (Inst->getOpcode()) {
+  case Instruction::Mul:
+    // The product is a multiple of the product of the operands' factors.
+    return ConstantExpr::getMul(GetFactor(Inst->getOperand(0)),
+                                GetFactor(Inst->getOperand(1)));
+  case Instruction::Shl:
+    // (X << C) is X * (1 << C).
+    if (Constant *ShAmt = dyn_cast<Constant>(Inst->getOperand(1))) {
+      Constant *PowC = ConstantExpr::getShl(One, ShAmt);
+      return ConstantExpr::getMul(GetFactor(Inst->getOperand(0)), PowC);
+    }
+    break;
+  case Instruction::And:
+    // X & 0xFFF0 is known to be a multiple of 16.
+    if (ConstantInt *Mask = dyn_cast<ConstantInt>(Inst->getOperand(1))) {
+      uint32_t Zeros = Mask->getValue().countTrailingZeros();
+      if (Zeros != V->getType()->getPrimitiveSizeInBits())
+        return ConstantExpr::getShl(One,
+                                    ConstantInt::get(One->getType(), Zeros));
+    }
+    break;
+  default:
+    // Factors propagate unchanged through int->int casts.
+    if (CastInst *CI = dyn_cast<CastInst>(Inst)) {
+      if (!CI->isIntegerCast())
+        return One;
+      return ConstantExpr::getCast(CI->getOpcode(),
+                                   GetFactor(CI->getOperand(0)), V->getType());
+    }
+    break;
+  }
+  return One;
+}
+
+/// This function implements the transforms on rem instructions that work
+/// regardless of the kind of rem instruction it is (urem, srem, or frem). It 
+/// is used by the visitors to those instructions.
+/// Returns a replacement instruction, &I if an operand was updated in place,
+/// or null if nothing changed.
+/// @brief Transforms common to all three rem instructions
+Instruction *InstCombiner::commonRemTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // 0 % X == 0, we don't need to preserve faults!
+  if (Constant *LHS = dyn_cast<Constant>(Op0))
+    if (LHS->isNullValue())
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  if (isa<UndefValue>(Op0))              // undef % X -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);  // X % undef -> undef
+
+  // Handle cases involving: rem X, (select Cond, Y, Z)
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) {
+    // rem X, (Cond ? 0 : Y) -> rem X, Y.  If the rem and the select are in
+    // the same basic block, then we replace the select with Y, and the
+    // condition of the select with false (if the cond value is in the same
+    // BB).  If the select has uses other than the div, this allows them to be
+    // simplified also.
+    if (Constant *ST = dyn_cast<Constant>(SI->getOperand(1)))
+      if (ST->isNullValue()) {
+        Instruction *CondI = dyn_cast<Instruction>(SI->getOperand(0));
+        if (CondI && CondI->getParent() == I.getParent())
+          UpdateValueUsesWith(CondI, ConstantInt::getFalse());
+        else if (I.getParent() != SI->getParent() || SI->hasOneUse())
+          I.setOperand(1, SI->getOperand(2));
+        else
+          UpdateValueUsesWith(SI, SI->getOperand(2));
+        return &I;
+      }
+    // Likewise for: rem X, (Cond ? Y : 0) -> rem X, Y
+    if (Constant *ST = dyn_cast<Constant>(SI->getOperand(2)))
+      if (ST->isNullValue()) {
+        Instruction *CondI = dyn_cast<Instruction>(SI->getOperand(0));
+        if (CondI && CondI->getParent() == I.getParent())
+          UpdateValueUsesWith(CondI, ConstantInt::getTrue());
+        else if (I.getParent() != SI->getParent() || SI->hasOneUse())
+          I.setOperand(1, SI->getOperand(1));
+        else
+          UpdateValueUsesWith(SI, SI->getOperand(1));
+        return &I;
+      }
+  }
+
+  return 0;
+}
+
+/// This function implements the transforms common to both integer remainder
+/// instructions (urem and srem). It is called by the visitors to those integer
+/// remainder instructions.
+/// @brief Common integer remainder transforms
+Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Instruction *common = commonRemTransforms(I))
+    return common;
+
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // X % 0 == undef, we don't need to preserve faults!
+    if (RHS->equalsInt(0))
+      return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+    
+    if (RHS->equalsInt(1))  // X % 1 == 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+    if (Instruction *Op0I = dyn_cast<Instruction>(Op0)) {
+      if (SelectInst *SI = dyn_cast<SelectInst>(Op0I)) {
+        if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+          return R;
+      } else if (isa<PHINode>(Op0I)) {
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+      }
+      // (X * C1) % C2 --> 0  iff  C1 % C2 == 0
+      if (ConstantExpr::getSRem(GetFactor(Op0I), RHS)->isNullValue())
+        return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+    }
+  }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitURem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Instruction *common = commonIRemTransforms(I))
+    return common;
+  
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // X urem C^2 -> X and C
+    // Check to see if this is an unsigned remainder with an exact power of 2,
+    // if so, convert to a bitwise and.
+    if (ConstantInt *C = dyn_cast<ConstantInt>(RHS))
+      if (C->getValue().isPowerOf2())
+        return BinaryOperator::createAnd(Op0, SubOne(C));
+  }
+
+  if (Instruction *RHSI = dyn_cast<Instruction>(I.getOperand(1))) {
+    // Turn A % (C << N), where C is 2^k, into A & ((C << N)-1)  
+    if (RHSI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(RHSI->getOperand(0))) {
+      if (cast<ConstantInt>(RHSI->getOperand(0))->getValue().isPowerOf2()) {
+        Constant *N1 = ConstantInt::getAllOnesValue(I.getType());
+        Value *Add = InsertNewInstBefore(BinaryOperator::createAdd(RHSI, N1,
+                                                                   "tmp"), I);
+        return BinaryOperator::createAnd(Op0, Add);
+      }
+    }
+  }
+
+  // urem X, (select Cond, 2^C1, 2^C2) --> select Cond, (and X, C1), (and X, C2)
+  // where C1&C2 are powers of two.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op1)) {
+    if (ConstantInt *STO = dyn_cast<ConstantInt>(SI->getOperand(1)))
+      if (ConstantInt *SFO = dyn_cast<ConstantInt>(SI->getOperand(2))) {
+        // STO == 0 and SFO == 0 handled above.
+        if ((STO->getValue().isPowerOf2()) && 
+            (SFO->getValue().isPowerOf2())) {
+          Value *TrueAnd = InsertNewInstBefore(
+            BinaryOperator::createAnd(Op0, SubOne(STO), SI->getName()+".t"), I);
+          Value *FalseAnd = InsertNewInstBefore(
+            BinaryOperator::createAnd(Op0, SubOne(SFO), SI->getName()+".f"), I);
+          return new SelectInst(SI->getOperand(0), TrueAnd, FalseAnd);
+        }
+      }
+  }
+  
+  return 0;
+}
+
+Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (Instruction *common = commonIRemTransforms(I))
+    return common;
+  
+  if (Value *RHSNeg = dyn_castNegVal(Op1))
+    if (!isa<ConstantInt>(RHSNeg) || 
+        cast<ConstantInt>(RHSNeg)->getValue().isStrictlyPositive()) {
+      // X % -Y -> X % Y
+      AddUsesToWorkList(I);
+      I.setOperand(1, RHSNeg);
+      return &I;
+    }
+ 
+  // If the top bits of both operands are zero (i.e. we can prove they are
+  // unsigned inputs), turn this into a urem.
+  APInt Mask(APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()));
+  if (MaskedValueIsZero(Op1, Mask) && MaskedValueIsZero(Op0, Mask)) {
+    // X srem Y -> X urem Y, iff X and Y don't have sign bit set
+    return BinaryOperator::createURem(Op0, Op1, I.getName());
+  }
+
+  return 0;
+}
+
// visitFRem - For floating-point remainder only the generic rem transforms
// apply; the integer-specific folds (power-of-two masks, etc.) do not.
Instruction *InstCombiner::visitFRem(BinaryOperator &I) {
  return commonRemTransforms(I);
}
+
+// isMaxValueMinusOne - return true if this is Max-1
+static bool isMaxValueMinusOne(const ConstantInt *C, bool isSigned) {
+  uint32_t TypeBits = C->getType()->getPrimitiveSizeInBits();
+  if (!isSigned)
+    return C->getValue() == APInt::getAllOnesValue(TypeBits) - 1;
+  return C->getValue() == APInt::getSignedMaxValue(TypeBits)-1;
+}
+
+// isMinValuePlusOne - return true if this is Min+1
+static bool isMinValuePlusOne(const ConstantInt *C, bool isSigned) {
+  if (!isSigned)
+    return C->getValue() == 1; // unsigned
+    
+  // Calculate 1111111111000000000000
+  uint32_t TypeBits = C->getType()->getPrimitiveSizeInBits();
+  return C->getValue() == APInt::getSignedMinValue(TypeBits)+1;
+}
+
// isOneBitSet - Return true if there is exactly one bit set in the specified
// constant, i.e. the value is a power of two.
static bool isOneBitSet(const ConstantInt *CI) {
  return CI->getValue().isPowerOf2();
}
+
// isHighOnes - Return true if the constant is of the form 1+0+.
// This is the same as lowones(~X).
// ~CI + 1 == -CI; that is a power of two exactly when CI is a run of high
// ones followed by (possibly zero) low zeros.
static bool isHighOnes(const ConstantInt *CI) {
  return (~CI->getValue() + 1).isPowerOf2();
}
+
/// getICmpCode - Encode a icmp predicate into a three bit mask.  These bits
/// are carefully arranged to allow folding of expressions such as:
///
///      (A < B) | (A > B) --> (A != B)
///
/// Note that this is only valid if the first and second predicates have the
/// same sign. Is illegal to do: (A u< B) | (A s> B) 
///
/// Three bits are used to represent the condition, as follows:
///   0  A > B
///   1  A == B
///   2  A < B
///
/// <=>  Value  Definition
/// 000     0   Always false
/// 001     1   A >  B
/// 010     2   A == B
/// 011     3   A >= B
/// 100     4   A <  B
/// 101     5   A != B
/// 110     6   A <= B
/// 111     7   Always true
///  
static unsigned getICmpCode(const ICmpInst *ICI) {
  switch (ICI->getPredicate()) {
    // False -> 0
  case ICmpInst::ICMP_UGT: return 1;  // 001
  case ICmpInst::ICMP_SGT: return 1;  // 001
  case ICmpInst::ICMP_EQ:  return 2;  // 010
  case ICmpInst::ICMP_UGE: return 3;  // 011
  case ICmpInst::ICMP_SGE: return 3;  // 011
  case ICmpInst::ICMP_ULT: return 4;  // 100
  case ICmpInst::ICMP_SLT: return 4;  // 100
  case ICmpInst::ICMP_NE:  return 5;  // 101
  case ICmpInst::ICMP_ULE: return 6;  // 110
  case ICmpInst::ICMP_SLE: return 6;  // 110
    // True -> 7
    // Codes 0 and 7 correspond to always-false/always-true comparisons,
    // which are presumably constant-folded before reaching here.
  default:
    assert(0 && "Invalid ICmp predicate!");
    return 0;
  }
}
+
/// getICmpValue - This is the complement of getICmpCode, which turns an
/// opcode and two operands into either a constant true or false, or a brand 
/// new ICmp instruction. The sign is passed in to determine which kind
/// of predicate to use in new icmp instructions.
/// Note: new instructions returned here are NOT inserted into a basic block;
/// the caller is responsible for that.
static Value *getICmpValue(bool sign, unsigned code, Value *LHS, Value *RHS) {
  switch (code) {
  default: assert(0 && "Illegal ICmp code!");
  case  0: return ConstantInt::getFalse();
  case  1: 
    if (sign)
      return new ICmpInst(ICmpInst::ICMP_SGT, LHS, RHS);
    else
      return new ICmpInst(ICmpInst::ICMP_UGT, LHS, RHS);
  case  2: return new ICmpInst(ICmpInst::ICMP_EQ,  LHS, RHS);
  case  3: 
    if (sign)
      return new ICmpInst(ICmpInst::ICMP_SGE, LHS, RHS);
    else
      return new ICmpInst(ICmpInst::ICMP_UGE, LHS, RHS);
  case  4: 
    if (sign)
      return new ICmpInst(ICmpInst::ICMP_SLT, LHS, RHS);
    else
      return new ICmpInst(ICmpInst::ICMP_ULT, LHS, RHS);
  case  5: return new ICmpInst(ICmpInst::ICMP_NE,  LHS, RHS);
  case  6: 
    if (sign)
      return new ICmpInst(ICmpInst::ICMP_SLE, LHS, RHS);
    else
      return new ICmpInst(ICmpInst::ICMP_ULE, LHS, RHS);
  case  7: return ConstantInt::getTrue();
  }
}
+
+static bool PredicatesFoldable(ICmpInst::Predicate p1, ICmpInst::Predicate p2) {
+  return (ICmpInst::isSignedPredicate(p1) == ICmpInst::isSignedPredicate(p2)) ||
+    (ICmpInst::isSignedPredicate(p1) && 
+     (p2 == ICmpInst::ICMP_EQ || p2 == ICmpInst::ICMP_NE)) ||
+    (ICmpInst::isSignedPredicate(p2) && 
+     (p1 == ICmpInst::ICMP_EQ || p1 == ICmpInst::ICMP_NE));
+}
+
namespace { 
// FoldICmpLogical - Implements (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
// Functor used with the combiner's associative-opt machinery: shouldApply
// recognizes a second icmp over the same operand pair with a compatible
// predicate, and apply merges the two compares via their three-bit codes.
struct FoldICmpLogical {
  InstCombiner &IC;
  Value *LHS, *RHS;             // Operands of the first icmp.
  ICmpInst::Predicate pred;     // Predicate of the first icmp.
  FoldICmpLogical(InstCombiner &ic, ICmpInst *ICI)
    : IC(ic), LHS(ICI->getOperand(0)), RHS(ICI->getOperand(1)),
      pred(ICI->getPredicate()) {}
  bool shouldApply(Value *V) const {
    // Match an icmp comparing the same two values (in either order) whose
    // predicate signedness is foldable with ours.
    if (ICmpInst *ICI = dyn_cast<ICmpInst>(V))
      if (PredicatesFoldable(pred, ICI->getPredicate()))
        return (ICI->getOperand(0) == LHS && ICI->getOperand(1) == RHS ||
                ICI->getOperand(0) == RHS && ICI->getOperand(1) == LHS);
    return false;
  }
  Instruction *apply(Instruction &Log) const {
    ICmpInst *ICI = cast<ICmpInst>(Log.getOperand(0));
    // Canonicalize so both compares have LHS/RHS in the same order before
    // encoding them; swapOperands also flips the predicate appropriately.
    if (ICI->getOperand(0) != LHS) {
      assert(ICI->getOperand(1) == LHS);
      ICI->swapOperands();  // Swap the LHS and RHS of the ICmp
    }

    ICmpInst *RHSICI = cast<ICmpInst>(Log.getOperand(1));
    unsigned LHSCode = getICmpCode(ICI);
    unsigned RHSCode = getICmpCode(RHSICI);
    unsigned Code;
    // Combine the three-bit condition codes with the logical operation.
    switch (Log.getOpcode()) {
    case Instruction::And: Code = LHSCode & RHSCode; break;
    case Instruction::Or:  Code = LHSCode | RHSCode; break;
    case Instruction::Xor: Code = LHSCode ^ RHSCode; break;
    default: assert(0 && "Illegal logical opcode!"); return 0;
    }

    // The merged compare is signed if either input compare was signed.
    bool isSigned = ICmpInst::isSignedPredicate(RHSICI->getPredicate()) || 
                    ICmpInst::isSignedPredicate(ICI->getPredicate());
      
    Value *RV = getICmpValue(isSigned, Code, LHS, RHS);
    if (Instruction *I = dyn_cast<Instruction>(RV))
      return I;
    // Otherwise, it's a constant boolean value...
    return IC.ReplaceInstUsesWith(Log, RV);
  }
};
} // end anonymous namespace
+
+// OptAndOp - This handles expressions of the form ((val OP C1) & C2).  Where
+// the Op parameter is 'OP', OpRHS is 'C1', and AndRHS is 'C2'.  Op is
+// guaranteed to be a binary operator.
+Instruction *InstCombiner::OptAndOp(Instruction *Op,
+                                    ConstantInt *OpRHS,
+                                    ConstantInt *AndRHS,
+                                    BinaryOperator &TheAnd) {
+  Value *X = Op->getOperand(0);
+  Constant *Together = 0;
+  if (!Op->isShift())
+    Together = And(AndRHS, OpRHS);
+
+  switch (Op->getOpcode()) {
+  case Instruction::Xor:
+    if (Op->hasOneUse()) {
+      // (X ^ C1) & C2 --> (X & C2) ^ (C1&C2)
+      Instruction *And = BinaryOperator::createAnd(X, AndRHS);
+      InsertNewInstBefore(And, TheAnd);
+      And->takeName(Op);
+      return BinaryOperator::createXor(And, Together);
+    }
+    break;
+  case Instruction::Or:
+    if (Together == AndRHS) // (X | C) & C --> C
+      return ReplaceInstUsesWith(TheAnd, AndRHS);
+
+    if (Op->hasOneUse() && Together != OpRHS) {
+      // (X | C1) & C2 --> (X | (C1&C2)) & C2
+      Instruction *Or = BinaryOperator::createOr(X, Together);
+      InsertNewInstBefore(Or, TheAnd);
+      Or->takeName(Op);
+      return BinaryOperator::createAnd(Or, AndRHS);
+    }
+    break;
+  case Instruction::Add:
+    if (Op->hasOneUse()) {
+      // Adding a one to a single bit bit-field should be turned into an XOR
+      // of the bit.  First thing to check is to see if this AND is with a
+      // single bit constant.
+      const APInt& AndRHSV = cast<ConstantInt>(AndRHS)->getValue();
+
+      // If there is only one bit set...
+      if (isOneBitSet(cast<ConstantInt>(AndRHS))) {
+        // Ok, at this point, we know that we are masking the result of the
+        // ADD down to exactly one bit.  If the constant we are adding has
+        // no bits set below this bit, then we can eliminate the ADD.
+        const APInt& AddRHS = cast<ConstantInt>(OpRHS)->getValue();
+
+        // Check to see if any bits below the one bit set in AndRHSV are set.
+        if ((AddRHS & (AndRHSV-1)) == 0) {
+          // If not, the only thing that can effect the output of the AND is
+          // the bit specified by AndRHSV.  If that bit is set, the effect of
+          // the XOR is to toggle the bit.  If it is clear, then the ADD has
+          // no effect.
+          if ((AddRHS & AndRHSV) == 0) { // Bit is not set, noop
+            TheAnd.setOperand(0, X);
+            return &TheAnd;
+          } else {
+            // Pull the XOR out of the AND.
+            Instruction *NewAnd = BinaryOperator::createAnd(X, AndRHS);
+            InsertNewInstBefore(NewAnd, TheAnd);
+            NewAnd->takeName(Op);
+            return BinaryOperator::createXor(NewAnd, AndRHS);
+          }
+        }
+      }
+    }
+    break;
+
+  case Instruction::Shl: {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShlMask(APInt::getHighBitsSet(BitWidth, BitWidth-OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShlMask);
+
+    if (CI->getValue() == ShlMask) { 
+    // Masking out bits that the shift already masks
+      return ReplaceInstUsesWith(TheAnd, Op);   // No need for the and.
+    } else if (CI != AndRHS) {                  // Reducing bits set in and.
+      TheAnd.setOperand(1, CI);
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::LShr:
+  {
+    // We know that the AND will not produce any of the bits shifted in, so if
+    // the anded constant includes them, clear them now!  This only applies to
+    // unsigned shifts, because a signed shr may bring in set bits!
+    //
+    uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+    uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+    APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+    ConstantInt *CI = ConstantInt::get(AndRHS->getValue() & ShrMask);
+
+    if (CI->getValue() == ShrMask) {   
+    // Masking out bits that the shift already masks.
+      return ReplaceInstUsesWith(TheAnd, Op);
+    } else if (CI != AndRHS) {
+      TheAnd.setOperand(1, CI);  // Reduce bits set in and cst.
+      return &TheAnd;
+    }
+    break;
+  }
+  case Instruction::AShr:
+    // Signed shr.
+    // See if this is shifting in some sign extension, then masking it out
+    // with an and.
+    if (Op->hasOneUse()) {
+      uint32_t BitWidth = AndRHS->getType()->getBitWidth();
+      uint32_t OpRHSVal = OpRHS->getLimitedValue(BitWidth);
+      APInt ShrMask(APInt::getLowBitsSet(BitWidth, BitWidth - OpRHSVal));
+      Constant *C = ConstantInt::get(AndRHS->getValue() & ShrMask);
+      if (C == AndRHS) {          // Masking out bits shifted in.
+        // (Val ashr C1) & C2 -> (Val lshr C1) & C2
+        // Make the argument unsigned.
+        Value *ShVal = Op->getOperand(0);
+        ShVal = InsertNewInstBefore(
+            BinaryOperator::createLShr(ShVal, OpRHS, 
+                                   Op->getName()), TheAnd);
+        return BinaryOperator::createAnd(ShVal, AndRHS, TheAnd.getName());
+      }
+    }
+    break;
+  }
+  return 0;
+}
+
+
/// InsertRangeTest - Emit a computation of: (V >= Lo && V < Hi) if Inside is
/// true, otherwise (V < Lo || V >= Hi).  In pratice, we emit the more efficient
/// (V-Lo) <u Hi-Lo.  This method expects that Lo <= Hi. isSigned indicates
/// whether to treat the V, Lo and HI as signed or not. IB is the location to
/// insert new instructions.
/// @return a new (uninserted) icmp computing the range test; any helper
///         arithmetic is inserted before IB.
Instruction *InstCombiner::InsertRangeTest(Value *V, Constant *Lo, Constant *Hi,
                                           bool isSigned, bool Inside, 
                                           Instruction &IB) {
  assert(cast<ConstantInt>(ConstantExpr::getICmp((isSigned ? 
            ICmpInst::ICMP_SLE:ICmpInst::ICMP_ULE), Lo, Hi))->getZExtValue() &&
         "Lo is not <= Hi in range emission code!");
    
  if (Inside) {
    if (Lo == Hi)  // Trivially false.
      return new ICmpInst(ICmpInst::ICMP_NE, V, V);

    // V >= Min && V < Hi --> V < Hi
    if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
      ICmpInst::Predicate pred = (isSigned ? 
        ICmpInst::ICMP_SLT : ICmpInst::ICMP_ULT);
      return new ICmpInst(pred, V, Hi);
    }

    // Emit V-Lo <u Hi-Lo
    // (Subtracting Lo shifts the range down to start at zero, where a single
    // unsigned compare tests membership.)
    Constant *NegLo = ConstantExpr::getNeg(Lo);
    Instruction *Add = BinaryOperator::createAdd(V, NegLo, V->getName()+".off");
    InsertNewInstBefore(Add, IB);
    Constant *UpperBound = ConstantExpr::getAdd(NegLo, Hi);
    return new ICmpInst(ICmpInst::ICMP_ULT, Add, UpperBound);
  }

  if (Lo == Hi)  // Trivially true.
    return new ICmpInst(ICmpInst::ICMP_EQ, V, V);

  // V < Min || V >= Hi -> V > Hi-1
  Hi = SubOne(cast<ConstantInt>(Hi));
  if (cast<ConstantInt>(Lo)->isMinValue(isSigned)) {
    ICmpInst::Predicate pred = (isSigned ? 
        ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT);
    return new ICmpInst(pred, V, Hi);
  }

  // Emit V-Lo >u Hi-1-Lo
  // Note that Hi has already had one subtracted from it, above.
  ConstantInt *NegLo = cast<ConstantInt>(ConstantExpr::getNeg(Lo));
  Instruction *Add = BinaryOperator::createAdd(V, NegLo, V->getName()+".off");
  InsertNewInstBefore(Add, IB);
  Constant *LowerBound = ConstantExpr::getAdd(NegLo, Hi);
  return new ICmpInst(ICmpInst::ICMP_UGT, Add, LowerBound);
}
+
// isRunOfOnes - Returns true iff Val consists of one contiguous run of 1s with
// any number of 0s on either side.  The 1s are allowed to wrap from LSB to
// MSB, so 0x000FFF0, 0x0000FFFF, and 0xFF0000FF are all runs.  0x0F0F0000 is
// not, since all 1s are not contiguous.
// On success MB and ME receive the bit positions bounding the run (computed
// as below; presumably 1-based positions of the run's low and high ends —
// confirm against callers before relying on exact semantics).
static bool isRunOfOnes(ConstantInt *Val, uint32_t &MB, uint32_t &ME) {
  const APInt& V = Val->getValue();
  uint32_t BitWidth = Val->getType()->getBitWidth();
  if (!APIntOps::isShiftedMask(BitWidth, V)) return false;

  // look for the first zero bit after the run of ones
  // ((V-1)^V) sets all bits from the LSB up through V's lowest set bit.
  MB = BitWidth - ((V - 1) ^ V).countLeadingZeros();
  // look for the first non-zero bit
  ME = V.getActiveBits(); 
  return true;
}
+
/// FoldLogicalPlusAnd - This is part of an expression (LHS +/- RHS) & Mask,
/// where isSub determines whether the operator is a sub.  If we can fold one of
/// the following xforms:
/// 
/// ((A & N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == Mask
/// ((A | N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
/// ((A ^ N) +/- B) & Mask -> (A +/- B) & Mask iff N&Mask == 0
///
/// return (A +/- B).
///
/// Returns null when no transform applies; otherwise the new add/sub is
/// inserted before I and returned.
Value *InstCombiner::FoldLogicalPlusAnd(Value *LHS, Value *RHS,
                                        ConstantInt *Mask, bool isSub,
                                        Instruction &I) {
  Instruction *LHSI = dyn_cast<Instruction>(LHS);
  if (!LHSI || LHSI->getNumOperands() != 2 ||
      !isa<ConstantInt>(LHSI->getOperand(1))) return 0;

  ConstantInt *N = cast<ConstantInt>(LHSI->getOperand(1));

  switch (LHSI->getOpcode()) {
  default: return 0;
  case Instruction::And:
    if (And(N, Mask) == Mask) {
      // If the AndRHS is a power of two minus one (0+1+), this is simple.
      // (leading zeros + popcount == bitwidth means the ones are a single
      // low-bit run, i.e. Mask == 2^k - 1.)
      if ((Mask->getValue().countLeadingZeros() + 
           Mask->getValue().countPopulation()) == 
          Mask->getValue().getBitWidth())
        break;

      // Otherwise, if Mask is 0+1+0+, and if B is known to have the low 0+
      // part, we don't need any explicit masks to take them out of A.  If that
      // is all N is, ignore it.
      uint32_t MB = 0, ME = 0;
      if (isRunOfOnes(Mask, MB, ME)) {  // begin/end bit of run, inclusive
        uint32_t BitWidth = cast<IntegerType>(RHS->getType())->getBitWidth();
        // NOTE(review): this local APInt shadows the ConstantInt *Mask
        // parameter for the rest of the scope — intentional but fragile.
        APInt Mask(APInt::getLowBitsSet(BitWidth, MB-1));
        if (MaskedValueIsZero(RHS, Mask))
          break;
      }
    }
    return 0;
  case Instruction::Or:
  case Instruction::Xor:
    // If the AndRHS is a power of two minus one (0+1+), and N&Mask == 0
    if ((Mask->getValue().countLeadingZeros() + 
         Mask->getValue().countPopulation()) == Mask->getValue().getBitWidth()
        && And(N, Mask)->isZero())
      break;
    return 0;
  }
  
  // The logical op contributes nothing under this mask: emit the bare
  // add/sub of its first operand with RHS.
  Instruction *New;
  if (isSub)
    New = BinaryOperator::createSub(LHSI->getOperand(0), RHS, "fold");
  else
    New = BinaryOperator::createAdd(LHSI->getOperand(0), RHS, "fold");
  return InsertNewInstBefore(New, I);
}
+
+Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1))                         // X & undef -> 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // and X, X = X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, Op1);
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
+                             KnownZero, KnownOne))
+      return &I;
+  } else {
+    if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+      if (CP->isAllOnesValue())            // X & <-1,-1> -> X
+        return ReplaceInstUsesWith(I, I.getOperand(0));
+    } else if (isa<ConstantAggregateZero>(Op1)) {
+      return ReplaceInstUsesWith(I, Op1);  // X & <0,0> -> <0,0>
+    }
+  }
+  
+  if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
+    const APInt& AndRHSMask = AndRHS->getValue();
+    APInt NotAndRHS(~AndRHSMask);
+
+    // Optimize a variety of ((val OP C1) & C2) combinations...
+    if (isa<BinaryOperator>(Op0)) {
+      Instruction *Op0I = cast<Instruction>(Op0);
+      Value *Op0LHS = Op0I->getOperand(0);
+      Value *Op0RHS = Op0I->getOperand(1);
+      switch (Op0I->getOpcode()) {
+      case Instruction::Xor:
+      case Instruction::Or:
+        // If the mask is only needed on one incoming arm, push it up.
+        if (Op0I->hasOneUse()) {
+          if (MaskedValueIsZero(Op0LHS, NotAndRHS)) {
+            // Not masking anything out for the LHS, move to RHS.
+            Instruction *NewRHS = BinaryOperator::createAnd(Op0RHS, AndRHS,
+                                                   Op0RHS->getName()+".masked");
+            InsertNewInstBefore(NewRHS, I);
+            return BinaryOperator::create(
+                       cast<BinaryOperator>(Op0I)->getOpcode(), Op0LHS, NewRHS);
+          }
+          if (!isa<Constant>(Op0RHS) &&
+              MaskedValueIsZero(Op0RHS, NotAndRHS)) {
+            // Not masking anything out for the RHS, move to LHS.
+            Instruction *NewLHS = BinaryOperator::createAnd(Op0LHS, AndRHS,
+                                                   Op0LHS->getName()+".masked");
+            InsertNewInstBefore(NewLHS, I);
+            return BinaryOperator::create(
+                       cast<BinaryOperator>(Op0I)->getOpcode(), NewLHS, Op0RHS);
+          }
+        }
+
+        break;
+      case Instruction::Add:
+        // ((A & N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) + B) & AndRHS -> (A + B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, false, I))
+          return BinaryOperator::createAnd(V, AndRHS);
+        if (Value *V = FoldLogicalPlusAnd(Op0RHS, Op0LHS, AndRHS, false, I))
+          return BinaryOperator::createAnd(V, AndRHS);  // Add commutes
+        break;
+
+      case Instruction::Sub:
+        // ((A & N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == AndRHS.
+        // ((A | N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        // ((A ^ N) - B) & AndRHS -> (A - B) & AndRHS iff N&AndRHS == 0
+        if (Value *V = FoldLogicalPlusAnd(Op0LHS, Op0RHS, AndRHS, true, I))
+          return BinaryOperator::createAnd(V, AndRHS);
+        break;
+      }
+
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+        if (Instruction *Res = OptAndOp(Op0I, Op0CI, AndRHS, I))
+          return Res;
+    } else if (CastInst *CI = dyn_cast<CastInst>(Op0)) {
+      // If this is an integer truncation or change from signed-to-unsigned, and
+      // if the source is an and/or with immediate, transform it.  This
+      // frequently occurs for bitfield accesses.
+      if (Instruction *CastOp = dyn_cast<Instruction>(CI->getOperand(0))) {
+        if ((isa<TruncInst>(CI) || isa<BitCastInst>(CI)) &&
+            CastOp->getNumOperands() == 2)
+          if (ConstantInt *AndCI = dyn_cast<ConstantInt>(CastOp->getOperand(1)))
+            if (CastOp->getOpcode() == Instruction::And) {
+              // Change: and (cast (and X, C1) to T), C2
+              // into  : and (cast X to T), trunc_or_bitcast(C1)&C2
+              // This will fold the two constants together, which may allow 
+              // other simplifications.
+              Instruction *NewCast = CastInst::createTruncOrBitCast(
+                CastOp->getOperand(0), I.getType(), 
+                CastOp->getName()+".shrunk");
+              NewCast = InsertNewInstBefore(NewCast, I);
+              // trunc_or_bitcast(C1)&C2
+              Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+              C3 = ConstantExpr::getAnd(C3, AndRHS);
+              return BinaryOperator::createAnd(NewCast, C3);
+            } else if (CastOp->getOpcode() == Instruction::Or) {
+              // Change: and (cast (or X, C1) to T), C2
+              // into  : trunc(C1)&C2 iff trunc(C1)&C2 == C2
+              Constant *C3 = ConstantExpr::getTruncOrBitCast(AndCI,I.getType());
+              if (ConstantExpr::getAnd(C3, AndRHS) == AndRHS)   // trunc(C1)&C2
+                return ReplaceInstUsesWith(I, AndRHS);
+            }
+      }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  Value *Op0NotVal = dyn_castNotVal(Op0);
+  Value *Op1NotVal = dyn_castNotVal(Op1);
+
+  if (Op0NotVal == Op1 || Op1NotVal == Op0)  // A & ~A  == ~A & A == 0
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+
+  // (~A & ~B) == (~(A | B)) - De Morgan's Law
+  if (Op0NotVal && Op1NotVal && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+    Instruction *Or = BinaryOperator::createOr(Op0NotVal, Op1NotVal,
+                                               I.getName()+".demorgan");
+    InsertNewInstBefore(Or, I);
+    return BinaryOperator::createNot(Or);
+  }
+  
+  {
+    Value *A = 0, *B = 0, *C = 0, *D = 0;
+    if (match(Op0, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op1 || B == Op1)    // (A | ?) & A  --> A
+        return ReplaceInstUsesWith(I, Op1);
+    
+      // (A|B) & ~(A&B) -> A^B
+      if (match(Op1, m_Not(m_And(m_Value(C), m_Value(D))))) {
+        if ((A == C && B == D) || (A == D && B == C))
+          return BinaryOperator::createXor(A, B);
+      }
+    }
+    
+    if (match(Op1, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op0 || B == Op0)    // A & (A | ?)  --> A
+        return ReplaceInstUsesWith(I, Op0);
+
+      // ~(A&B) & (A|B) -> A^B
+      if (match(Op0, m_Not(m_And(m_Value(C), m_Value(D))))) {
+        if ((A == C && B == D) || (A == D && B == C))
+          return BinaryOperator::createXor(A, B);
+      }
+    }
+    
+    if (Op0->hasOneUse() &&
+        match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1) {                                // (A^B)&A -> A&(A^B)
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      } else if (B == Op1) {                         // (A^B)&B -> B&(B^A)
+        cast<BinaryOperator>(Op0)->swapOperands();
+        I.swapOperands();     // Simplify below
+        std::swap(Op0, Op1);
+      }
+    }
+    if (Op1->hasOneUse() &&
+        match(Op1, m_Xor(m_Value(A), m_Value(B)))) {
+      if (B == Op0) {                                // B&(A^B) -> B&(B^A)
+        cast<BinaryOperator>(Op1)->swapOperands();
+        std::swap(A, B);
+      }
+      if (A == Op0) {                                // A&(A^B) -> A & ~B
+        Instruction *NotB = BinaryOperator::createNot(B, "tmp");
+        InsertNewInstBefore(NotB, I);
+        return BinaryOperator::createAnd(A, NotB);
+      }
+    }
+  }
+  
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(Op1)) {
+    // (icmp1 A, B) & (icmp2 A, B) --> (icmp3 A, B)
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+    Value *LHSVal, *RHSVal;
+    ConstantInt *LHSCst, *RHSCst;
+    ICmpInst::Predicate LHSCC, RHSCC;
+    if (match(Op0, m_ICmp(LHSCC, m_Value(LHSVal), m_ConstantInt(LHSCst))))
+      if (match(RHS, m_ICmp(RHSCC, m_Value(RHSVal), m_ConstantInt(RHSCst))))
+        if (LHSVal == RHSVal &&    // Found (X icmp C1) & (X icmp C2)
+            // ICMP_[GL]E X, CST is folded to ICMP_[GL]T elsewhere.
+            LHSCC != ICmpInst::ICMP_UGE && LHSCC != ICmpInst::ICMP_ULE &&
+            RHSCC != ICmpInst::ICMP_UGE && RHSCC != ICmpInst::ICMP_ULE &&
+            LHSCC != ICmpInst::ICMP_SGE && LHSCC != ICmpInst::ICMP_SLE &&
+            RHSCC != ICmpInst::ICMP_SGE && RHSCC != ICmpInst::ICMP_SLE) {
+          // Ensure that the larger constant is on the RHS.
+          ICmpInst::Predicate GT = ICmpInst::isSignedPredicate(LHSCC) ? 
+            ICmpInst::ICMP_SGT : ICmpInst::ICMP_UGT;
+          Constant *Cmp = ConstantExpr::getICmp(GT, LHSCst, RHSCst);
+          ICmpInst *LHS = cast<ICmpInst>(Op0);
+          if (cast<ConstantInt>(Cmp)->getZExtValue()) {
+            std::swap(LHS, RHS);
+            std::swap(LHSCst, RHSCst);
+            std::swap(LHSCC, RHSCC);
+          }
+
+          // At this point, we know we have have two icmp instructions
+          // comparing a value against two constants and and'ing the result
+          // together.  Because of the above check, we know that we only have
+          // icmp eq, icmp ne, icmp [su]lt, and icmp [SU]gt here. We also know 
+          // (from the FoldICmpLogical check above), that the two constants 
+          // are not equal and that the larger constant is on the RHS
+          assert(LHSCst != RHSCst && "Compares not folded above?");
+
+          switch (LHSCC) {
+          default: assert(0 && "Unknown integer condition code!");
+          case ICmpInst::ICMP_EQ:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X == 13 & X == 15) -> false
+            case ICmpInst::ICMP_UGT:        // (X == 13 & X >  15) -> false
+            case ICmpInst::ICMP_SGT:        // (X == 13 & X >  15) -> false
+              return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+            case ICmpInst::ICMP_NE:         // (X == 13 & X != 15) -> X == 13
+            case ICmpInst::ICMP_ULT:        // (X == 13 & X <  15) -> X == 13
+            case ICmpInst::ICMP_SLT:        // (X == 13 & X <  15) -> X == 13
+              return ReplaceInstUsesWith(I, LHS);
+            }
+          case ICmpInst::ICMP_NE:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_ULT:
+              if (LHSCst == SubOne(RHSCst)) // (X != 13 & X u< 14) -> X < 13
+                return new ICmpInst(ICmpInst::ICMP_ULT, LHSVal, LHSCst);
+              break;                        // (X != 13 & X u< 15) -> no change
+            case ICmpInst::ICMP_SLT:
+              if (LHSCst == SubOne(RHSCst)) // (X != 13 & X s< 14) -> X < 13
+                return new ICmpInst(ICmpInst::ICMP_SLT, LHSVal, LHSCst);
+              break;                        // (X != 13 & X s< 15) -> no change
+            case ICmpInst::ICMP_EQ:         // (X != 13 & X == 15) -> X == 15
+            case ICmpInst::ICMP_UGT:        // (X != 13 & X u> 15) -> X u> 15
+            case ICmpInst::ICMP_SGT:        // (X != 13 & X s> 15) -> X s> 15
+              return ReplaceInstUsesWith(I, RHS);
+            case ICmpInst::ICMP_NE:
+              if (LHSCst == SubOne(RHSCst)){// (X != 13 & X != 14) -> X-13 >u 1
+                Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+                Instruction *Add = BinaryOperator::createAdd(LHSVal, AddCST,
+                                                      LHSVal->getName()+".off");
+                InsertNewInstBefore(Add, I);
+                return new ICmpInst(ICmpInst::ICMP_UGT, Add,
+                                    ConstantInt::get(Add->getType(), 1));
+              }
+              break;                        // (X != 13 & X != 15) -> no change
+            }
+            break;
+          case ICmpInst::ICMP_ULT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X u< 13 & X == 15) -> false
+            case ICmpInst::ICMP_UGT:        // (X u< 13 & X u> 15) -> false
+              return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+            case ICmpInst::ICMP_SGT:        // (X u< 13 & X s> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X u< 13 & X != 15) -> X u< 13
+            case ICmpInst::ICMP_ULT:        // (X u< 13 & X u< 15) -> X u< 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_SLT:        // (X u< 13 & X s< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_SLT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X s< 13 & X == 15) -> false
+            case ICmpInst::ICMP_SGT:        // (X s< 13 & X s> 15) -> false
+              return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+            case ICmpInst::ICMP_UGT:        // (X s< 13 & X u> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X s< 13 & X != 15) -> X < 13
+            case ICmpInst::ICMP_SLT:        // (X s< 13 & X s< 15) -> X < 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_ULT:        // (X s< 13 & X u< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_UGT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X u> 13 & X == 15) -> X > 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_UGT:        // (X u> 13 & X u> 15) -> X u> 15
+              return ReplaceInstUsesWith(I, RHS);
+            case ICmpInst::ICMP_SGT:        // (X u> 13 & X s> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:
+              if (RHSCst == AddOne(LHSCst)) // (X u> 13 & X != 14) -> X u> 14
+                return new ICmpInst(LHSCC, LHSVal, RHSCst);
+              break;                        // (X u> 13 & X != 15) -> no change
+            case ICmpInst::ICMP_ULT:        // (X u> 13 & X u< 15) ->(X-14) <u 1
+              return InsertRangeTest(LHSVal, AddOne(LHSCst), RHSCst, false, 
+                                     true, I);
+            case ICmpInst::ICMP_SLT:        // (X u> 13 & X s< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_SGT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X s> 13 & X == 15) -> X s> 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_SGT:        // (X s> 13 & X s> 15) -> X s> 15
+              return ReplaceInstUsesWith(I, RHS);
+            case ICmpInst::ICMP_UGT:        // (X s> 13 & X u> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:
+              if (RHSCst == AddOne(LHSCst)) // (X s> 13 & X != 14) -> X s> 14
+                return new ICmpInst(LHSCC, LHSVal, RHSCst);
+              break;                        // (X s> 13 & X != 15) -> no change
+            case ICmpInst::ICMP_SLT:        // (X s> 13 & X s< 15) ->(X-14) s< 1
+              return InsertRangeTest(LHSVal, AddOne(LHSCst), RHSCst, true, 
+                                     true, I);
+            case ICmpInst::ICMP_ULT:        // (X s> 13 & X u< 15) -> no change
+              break;
+            }
+            break;
+          }
+        }
+  }
+
+  // fold (and (cast A), (cast B)) -> (cast (and A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind ?
+        const Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+            // Only do this if the casts both really cause code to be generated.
+            ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                              I.getType(), TD) &&
+            ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                              I.getType(), TD)) {
+          Instruction *NewOp = BinaryOperator::createAnd(Op0C->getOperand(0),
+                                                         Op1C->getOperand(0),
+                                                         I.getName());
+          InsertNewInstBefore(NewOp, I);
+          return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+    
+  // (X >> Z) & (Y >> Z)  -> (X&Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && 
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Instruction *NewOp =
+          InsertNewInstBefore(BinaryOperator::createAnd(SI0->getOperand(0),
+                                                        SI1->getOperand(0),
+                                                        SI0->getName()), I);
+        return BinaryOperator::create(SI1->getOpcode(), NewOp, 
+                                      SI1->getOperand(1));
+      }
+  }
+
+  return Changed ? &I : 0;
+}
+
+/// CollectBSwapParts - Look to see if the specified value defines a single byte
+/// in the result.  If it does, and if the specified byte hasn't been filled in
+/// yet, fill it in and return false.
+///
+/// ByteValues has one slot per byte of the final result, indexed from the
+/// low byte; slot i holds the value whose low bits supply byte i.  Returns
+/// true on failure (V doesn't fit the bswap idiom, or a byte slot would be
+/// claimed by two different values), false on success.
+static bool CollectBSwapParts(Value *V, SmallVector<Value*, 8> &ByteValues) {
+  // Only instructions can form the interior of a bswap pattern.
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (I == 0) return true;
+
+  // If this is an or instruction, it is an inner node of the bswap: recurse
+  // into both operands, failing if either side fails.
+  if (I->getOpcode() == Instruction::Or)
+    return CollectBSwapParts(I->getOperand(0), ByteValues) ||
+           CollectBSwapParts(I->getOperand(1), ByteValues);
+  
+  uint32_t BitWidth = I->getType()->getPrimitiveSizeInBits();
+  // If this is a shift by a constant int, and it is "24" (i.e. the full
+  // width minus one byte), then its operand defines a byte.  We only handle
+  // unsigned types here.
+  if (I->isShift() && isa<ConstantInt>(I->getOperand(1))) {
+    // Not shifting the entire input by N-1 bytes?
+    if (cast<ConstantInt>(I->getOperand(1))->getLimitedValue(BitWidth) !=
+        8*(ByteValues.size()-1))
+      return true;
+    
+    unsigned DestNo;
+    if (I->getOpcode() == Instruction::Shl) {
+      // X << 24 defines the top byte with the lowest of the input bytes.
+      DestNo = ByteValues.size()-1;
+    } else {
+      // X >>u 24 defines the low byte with the highest of the input bytes.
+      DestNo = 0;
+    }
+    
+    // If the destination byte value is already defined, the values are or'd
+    // together, which isn't a bswap (unless it's an or of the same bits).
+    if (ByteValues[DestNo] && ByteValues[DestNo] != I->getOperand(0))
+      return true;
+    ByteValues[DestNo] = I->getOperand(0);
+    return false;
+  }
+  
+  // Otherwise, we can only handle and(shift(X, imm), imm).  Bail out if we
+  // don't have this.
+  Value *Shift = 0, *ShiftLHS = 0;
+  ConstantInt *AndAmt = 0, *ShiftAmt = 0;
+  if (!match(I, m_And(m_Value(Shift), m_ConstantInt(AndAmt))) ||
+      !match(Shift, m_Shift(m_Value(ShiftLHS), m_ConstantInt(ShiftAmt))))
+    return true;
+  Instruction *SI = cast<Instruction>(Shift);
+
+  // Make sure that the shift amount is by a multiple of 8 and isn't too big.
+  if (ShiftAmt->getLimitedValue(BitWidth) & 7 ||
+      ShiftAmt->getLimitedValue(BitWidth) > 8*ByteValues.size())
+    return true;
+  
+  // Turn 0xFF -> 0, 0xFF00 -> 1, 0xFF0000 -> 2, etc., by scanning for the
+  // single-byte mask that AndAmt represents.
+  unsigned DestByte;
+  if (AndAmt->getValue().getActiveBits() > 64)
+    return true;  // Mask wider than 64 bits can't be a single-byte mask.
+  uint64_t AndAmtVal = AndAmt->getZExtValue();
+  for (DestByte = 0; DestByte != ByteValues.size(); ++DestByte)
+    if (AndAmtVal == uint64_t(0xFF) << 8*DestByte)
+      break;
+  // Unknown mask for bswap.
+  if (DestByte == ByteValues.size()) return true;
+  
+  // Compute which source byte the shift moved into DestByte.  Note this is
+  // unsigned arithmetic: an out-of-range shl wraps SrcByte, which is then
+  // rejected by the mirror-position check below.
+  unsigned ShiftBytes = ShiftAmt->getZExtValue()/8;
+  unsigned SrcByte;
+  if (SI->getOpcode() == Instruction::Shl)
+    SrcByte = DestByte - ShiftBytes;
+  else
+    SrcByte = DestByte + ShiftBytes;
+  
+  // If the SrcByte isn't a bswapped value from the DestByte, reject it.
+  if (SrcByte != ByteValues.size()-DestByte-1)
+    return true;
+  
+  // If the destination byte value is already defined, the values are or'd
+  // together, which isn't a bswap (unless it's an or of the same bits).
+  if (ByteValues[DestByte] && ByteValues[DestByte] != SI->getOperand(0))
+    return true;
+  ByteValues[DestByte] = SI->getOperand(0);
+  return false;
+}
+
+/// MatchBSwap - Given an OR instruction, check to see if this is a bswap idiom.
+/// If so, insert the new bswap intrinsic and return it.
+Instruction *InstCombiner::MatchBSwap(BinaryOperator &I) {
+  // bswap swaps whole bytes in pairs, so the type must be an integer whose
+  // width is a multiple of 16 bits.  Vectors are not handled.
+  const IntegerType *DestTy = dyn_cast<IntegerType>(I.getType());
+  if (DestTy == 0 || DestTy->getBitWidth() % 16 != 0)
+    return 0;
+  
+  /// ByteValues - For each byte of the result, we keep track of which value
+  /// defines each byte.
+  unsigned NumBytes = DestTy->getBitWidth()/8;
+  SmallVector<Value*, 8> ByteValues;
+  ByteValues.resize(NumBytes);
+    
+  // Walk both sides of the OR, recording which value feeds each result byte.
+  // A failure on either side means this is not a bswap.
+  if (CollectBSwapParts(I.getOperand(0), ByteValues) ||
+      CollectBSwapParts(I.getOperand(1), ByteValues))
+    return 0;
+  
+  // All bytes must be defined, and all must come from one common value.
+  Value *CommonVal = ByteValues[0];
+  if (CommonVal == 0)
+    return 0;  // Byte 0 was never filled in; must be zero, so not a bswap.
+  for (unsigned Byte = 1; Byte != NumBytes; ++Byte)
+    if (ByteValues[Byte] != CommonVal)
+      return 0;
+  
+  // Emit a call to the llvm.bswap intrinsic on the common source value.
+  const Type *Tys[] = { DestTy, DestTy };
+  Module *M = I.getParent()->getParent()->getParent();
+  Function *F = Intrinsic::getDeclaration(M, Intrinsic::bswap, Tys, 2);
+  return new CallInst(F, CommonVal);
+}
+
+
+/// visitOr - Fold and canonicalize 'or' instructions.  Returns a new
+/// instruction to replace I, I itself if it was modified in place, or null
+/// if no transformation applied.
+Instruction *InstCombiner::visitOr(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1))                       // X | undef -> -1
+    return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  // or X, X = X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, Op0);
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
+                             KnownZero, KnownOne))
+      return &I;
+  } else if (isa<ConstantAggregateZero>(Op1)) {
+    return ReplaceInstUsesWith(I, Op0);  // X | <0,0> -> X
+  } else if (ConstantVector *CP = dyn_cast<ConstantVector>(Op1)) {
+    if (CP->isAllOnesValue())            // X | <-1,-1> -> <-1,-1>
+      return ReplaceInstUsesWith(I, I.getOperand(1));
+  }
+    
+
+  
+  // Fold 'or' with a constant integer RHS.
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    ConstantInt *C1 = 0; Value *X = 0;
+    // (X & C1) | C2 --> (X | C2) & (C1|C2)
+    if (match(Op0, m_And(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+      Instruction *Or = BinaryOperator::createOr(X, RHS);
+      InsertNewInstBefore(Or, I);
+      Or->takeName(Op0);
+      return BinaryOperator::createAnd(Or, 
+               ConstantInt::get(RHS->getValue() | C1->getValue()));
+    }
+
+    // (X ^ C1) | C2 --> (X | C2) ^ (C1&~C2)
+    if (match(Op0, m_Xor(m_Value(X), m_ConstantInt(C1))) && isOnlyUse(Op0)) {
+      Instruction *Or = BinaryOperator::createOr(X, RHS);
+      InsertNewInstBefore(Or, I);
+      Or->takeName(Op0);
+      return BinaryOperator::createXor(Or,
+                 ConstantInt::get(C1->getValue() & ~RHS->getValue()));
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  Value *A = 0, *B = 0;
+  ConstantInt *C1 = 0, *C2 = 0;
+
+  if (match(Op0, m_And(m_Value(A), m_Value(B))))
+    if (A == Op1 || B == Op1)    // (A & ?) | A  --> A
+      return ReplaceInstUsesWith(I, Op1);
+  if (match(Op1, m_And(m_Value(A), m_Value(B))))
+    if (A == Op0 || B == Op0)    // A | (A & ?)  --> A
+      return ReplaceInstUsesWith(I, Op0);
+
+  // (A | B) | C  and  A | (B | C)                  -> bswap if possible.
+  // (A >> B) | (C << D)  and  (A << B) | (C >> D)  -> bswap if possible.
+  if (match(Op0, m_Or(m_Value(), m_Value())) ||
+      match(Op1, m_Or(m_Value(), m_Value())) ||
+      (match(Op0, m_Shift(m_Value(), m_Value())) &&
+       match(Op1, m_Shift(m_Value(), m_Value())))) {
+    if (Instruction *BSwap = MatchBSwap(I))
+      return BSwap;
+  }
+  
+  // (X^C)|Y -> (X|Y)^C iff Y&C == 0
+  if (Op0->hasOneUse() && match(Op0, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op1, C1->getValue())) {
+    Instruction *NOr = BinaryOperator::createOr(A, Op1);
+    InsertNewInstBefore(NOr, I);
+    NOr->takeName(Op0);
+    return BinaryOperator::createXor(NOr, C1);
+  }
+
+  // Y|(X^C) -> (X|Y)^C iff Y&C == 0
+  if (Op1->hasOneUse() && match(Op1, m_Xor(m_Value(A), m_ConstantInt(C1))) &&
+      MaskedValueIsZero(Op0, C1->getValue())) {
+    Instruction *NOr = BinaryOperator::createOr(A, Op0);
+    InsertNewInstBefore(NOr, I);
+    NOr->takeName(Op0);
+    return BinaryOperator::createXor(NOr, C1);
+  }
+
+  // (A & C)|(B & D)
+  Value *C = 0, *D = 0;
+  if (match(Op0, m_And(m_Value(A), m_Value(C))) &&
+      match(Op1, m_And(m_Value(B), m_Value(D)))) {
+    Value *V1 = 0, *V2 = 0, *V3 = 0;
+    C1 = dyn_cast<ConstantInt>(C);
+    C2 = dyn_cast<ConstantInt>(D);
+    if (C1 && C2) {  // (A & C1)|(B & C2)
+      // If we have: ((V + N) & C1) | (V & C2)
+      // .. and C2 = ~C1 and C2 is 0+1+ and (N & C2) == 0
+      // replace with V+N.
+      if (C1->getValue() == ~C2->getValue()) {
+        if ((C2->getValue() & (C2->getValue()+1)) == 0 && // C2 == 0+1+
+            match(A, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == B && MaskedValueIsZero(V2, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+          if (V2 == B && MaskedValueIsZero(V1, C2->getValue()))
+            return ReplaceInstUsesWith(I, A);
+        }
+        // Or commutes, try both ways.
+        if ((C1->getValue() & (C1->getValue()+1)) == 0 &&
+            match(B, m_Add(m_Value(V1), m_Value(V2)))) {
+          // Add commutes, try both ways.
+          if (V1 == A && MaskedValueIsZero(V2, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+          if (V2 == A && MaskedValueIsZero(V1, C1->getValue()))
+            return ReplaceInstUsesWith(I, B);
+        }
+      }
+      V1 = 0; V2 = 0; V3 = 0;
+    }
+    
+    // Check to see if we have any common things being and'ed.  If so, find the
+    // terms for V1 & (V2|V3).
+    if (isOnlyUse(Op0) || isOnlyUse(Op1)) {
+      if (A == B)      // (A & C)|(A & D) == A & (C|D)
+        V1 = A, V2 = C, V3 = D;
+      else if (A == D) // (A & C)|(B & A) == A & (B|C)
+        V1 = A, V2 = B, V3 = C;
+      else if (C == B) // (A & C)|(C & D) == C & (A|D)
+        V1 = C, V2 = A, V3 = D;
+      else if (C == D) // (A & C)|(B & C) == C & (A|B)
+        V1 = C, V2 = A, V3 = B;
+      
+      if (V1) {
+        Value *Or =
+          InsertNewInstBefore(BinaryOperator::createOr(V2, V3, "tmp"), I);
+        return BinaryOperator::createAnd(V1, Or);
+      }
+      
+      // (V1 & V3)|(V2 & ~V3) -> ((V1 ^ V2) & V3) ^ V2
+      if (isOnlyUse(Op0) && isOnlyUse(Op1)) {
+        // Try all combination of terms to find V3 and ~V3.
+        if (A->hasOneUse() && match(A, m_Not(m_Value(V3)))) {
+          if (V3 == B)
+            V1 = D, V2 = C;
+          else if (V3 == D)
+            V1 = B, V2 = C;
+        }
+        if (B->hasOneUse() && match(B, m_Not(m_Value(V3)))) {
+          if (V3 == A)
+            V1 = C, V2 = D;
+          else if (V3 == C)
+            V1 = A, V2 = D;
+        }
+        if (C->hasOneUse() && match(C, m_Not(m_Value(V3)))) {
+          if (V3 == B)
+            V1 = D, V2 = A;
+          else if (V3 == D)
+            V1 = B, V2 = A;
+        }
+        if (D->hasOneUse() && match(D, m_Not(m_Value(V3)))) {
+          if (V3 == A)
+            V1 = C, V2 = B;
+          else if (V3 == C)
+            V1 = A, V2 = B;
+        }
+        if (V1) {
+          A = InsertNewInstBefore(BinaryOperator::createXor(V1, V2, "tmp"), I);
+          A = InsertNewInstBefore(BinaryOperator::createAnd(A, V3, "tmp"), I);
+          return BinaryOperator::createXor(A, V2);
+        }
+      }
+    }
+  }
+  
+  // (X >> Z) | (Y >> Z)  -> (X|Y) >> Z  for all shifts.
+  if (BinaryOperator *SI1 = dyn_cast<BinaryOperator>(Op1)) {
+    if (BinaryOperator *SI0 = dyn_cast<BinaryOperator>(Op0))
+      if (SI0->isShift() && SI0->getOpcode() == SI1->getOpcode() && 
+          SI0->getOperand(1) == SI1->getOperand(1) &&
+          (SI0->hasOneUse() || SI1->hasOneUse())) {
+        Instruction *NewOp =
+        InsertNewInstBefore(BinaryOperator::createOr(SI0->getOperand(0),
+                                                     SI1->getOperand(0),
+                                                     SI0->getName()), I);
+        return BinaryOperator::create(SI1->getOpcode(), NewOp, 
+                                      SI1->getOperand(1));
+      }
+  }
+
+  if (match(Op0, m_Not(m_Value(A)))) {   // ~A | Op1
+    if (A == Op1)   // ~A | A == -1
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+  } else {
+    A = 0;
+  }
+  // Note, A is still live here!
+  if (match(Op1, m_Not(m_Value(B)))) {   // Op0 | ~B
+    if (Op0 == B)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+    // (~A | ~B) == (~(A & B)) - De Morgan's Law
+    if (A && isOnlyUse(Op0) && isOnlyUse(Op1)) {
+      Value *And = InsertNewInstBefore(BinaryOperator::createAnd(A, B,
+                                              I.getName()+".demorgan"), I);
+      return BinaryOperator::createNot(And);
+    }
+  }
+
+  // (icmp1 A, B) | (icmp2 A, B) --> (icmp3 A, B)
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1))) {
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+    // Handle (X icmp C1) | (X icmp C2): merge the two range checks against
+    // the same value into a single comparison where possible.
+    Value *LHSVal, *RHSVal;
+    ConstantInt *LHSCst, *RHSCst;
+    ICmpInst::Predicate LHSCC, RHSCC;
+    if (match(Op0, m_ICmp(LHSCC, m_Value(LHSVal), m_ConstantInt(LHSCst))))
+      if (match(RHS, m_ICmp(RHSCC, m_Value(RHSVal), m_ConstantInt(RHSCst))))
+        if (LHSVal == RHSVal &&    // Found (X icmp C1) | (X icmp C2)
+            // icmp [us][gl]e x, cst is folded to icmp [us][gl]t elsewhere.
+            LHSCC != ICmpInst::ICMP_UGE && LHSCC != ICmpInst::ICMP_ULE &&
+            RHSCC != ICmpInst::ICMP_UGE && RHSCC != ICmpInst::ICMP_ULE &&
+            LHSCC != ICmpInst::ICMP_SGE && LHSCC != ICmpInst::ICMP_SLE &&
+            RHSCC != ICmpInst::ICMP_SGE && RHSCC != ICmpInst::ICMP_SLE &&
+            // We can't fold (ugt x, C) | (sgt x, C2).
+            PredicatesFoldable(LHSCC, RHSCC)) {
+          // Ensure that the larger constant is on the RHS.
+          ICmpInst *LHS = cast<ICmpInst>(Op0);
+          bool NeedsSwap;
+          if (ICmpInst::isSignedPredicate(LHSCC))
+            NeedsSwap = LHSCst->getValue().sgt(RHSCst->getValue());
+          else
+            NeedsSwap = LHSCst->getValue().ugt(RHSCst->getValue());
+            
+          if (NeedsSwap) {
+            std::swap(LHS, RHS);
+            std::swap(LHSCst, RHSCst);
+            std::swap(LHSCC, RHSCC);
+          }
+
+          // At this point, we know we have two icmp instructions
+          // comparing a value against two constants and or'ing the result
+          // together.  Because of the above check, we know that we only have
+          // ICMP_EQ, ICMP_NE, ICMP_LT, and ICMP_GT here. We also know (from the
+          // FoldICmpLogical check above), that the two constants are not
+          // equal.
+          assert(LHSCst != RHSCst && "Compares not folded above?");
+
+          switch (LHSCC) {
+          default: assert(0 && "Unknown integer condition code!");
+          case ICmpInst::ICMP_EQ:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:
+              if (LHSCst == SubOne(RHSCst)) {// (X == 13 | X == 14) -> X-13 <u 2
+                Constant *AddCST = ConstantExpr::getNeg(LHSCst);
+                Instruction *Add = BinaryOperator::createAdd(LHSVal, AddCST,
+                                                      LHSVal->getName()+".off");
+                InsertNewInstBefore(Add, I);
+                AddCST = Subtract(AddOne(RHSCst), LHSCst);
+                return new ICmpInst(ICmpInst::ICMP_ULT, Add, AddCST);
+              }
+              break;                         // (X == 13 | X == 15) -> no change
+            case ICmpInst::ICMP_UGT:         // (X == 13 | X u> 14) -> no change
+            case ICmpInst::ICMP_SGT:         // (X == 13 | X s> 14) -> no change
+              break;
+            case ICmpInst::ICMP_NE:          // (X == 13 | X != 15) -> X != 15
+            case ICmpInst::ICMP_ULT:         // (X == 13 | X u< 15) -> X u< 15
+            case ICmpInst::ICMP_SLT:         // (X == 13 | X s< 15) -> X s< 15
+              return ReplaceInstUsesWith(I, RHS);
+            }
+            break;
+          case ICmpInst::ICMP_NE:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:          // (X != 13 | X == 15) -> X != 13
+            case ICmpInst::ICMP_UGT:         // (X != 13 | X u> 15) -> X != 13
+            case ICmpInst::ICMP_SGT:         // (X != 13 | X s> 15) -> X != 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_NE:          // (X != 13 | X != 15) -> true
+            case ICmpInst::ICMP_ULT:         // (X != 13 | X u< 15) -> true
+            case ICmpInst::ICMP_SLT:         // (X != 13 | X s< 15) -> true
+              return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+            }
+            break;
+          case ICmpInst::ICMP_ULT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X u< 13 | X == 14) -> no change
+              break;
+            case ICmpInst::ICMP_UGT:        // (X u< 13 | X u> 15) ->(X-13) u> 2
+              return InsertRangeTest(LHSVal, LHSCst, AddOne(RHSCst), false, 
+                                     false, I);
+            case ICmpInst::ICMP_SGT:        // (X u< 13 | X s> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X u< 13 | X != 15) -> X != 15
+            case ICmpInst::ICMP_ULT:        // (X u< 13 | X u< 15) -> X u< 15
+              return ReplaceInstUsesWith(I, RHS);
+            case ICmpInst::ICMP_SLT:        // (X u< 13 | X s< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_SLT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X s< 13 | X == 14) -> no change
+              break;
+            case ICmpInst::ICMP_SGT:        // (X s< 13 | X s> 15) ->(X-13) s> 2
+              return InsertRangeTest(LHSVal, LHSCst, AddOne(RHSCst), true, 
+                                     false, I);
+            case ICmpInst::ICMP_UGT:        // (X s< 13 | X u> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X s< 13 | X != 15) -> X != 15
+            case ICmpInst::ICMP_SLT:        // (X s< 13 | X s< 15) -> X s< 15
+              return ReplaceInstUsesWith(I, RHS);
+            case ICmpInst::ICMP_ULT:        // (X s< 13 | X u< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_UGT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X u> 13 | X == 15) -> X u> 13
+            case ICmpInst::ICMP_UGT:        // (X u> 13 | X u> 15) -> X u> 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_SGT:        // (X u> 13 | X s> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X u> 13 | X != 15) -> true
+            case ICmpInst::ICMP_ULT:        // (X u> 13 | X u< 15) -> true
+              return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+            case ICmpInst::ICMP_SLT:        // (X u> 13 | X s< 15) -> no change
+              break;
+            }
+            break;
+          case ICmpInst::ICMP_SGT:
+            switch (RHSCC) {
+            default: assert(0 && "Unknown integer condition code!");
+            case ICmpInst::ICMP_EQ:         // (X s> 13 | X == 15) -> X > 13
+            case ICmpInst::ICMP_SGT:        // (X s> 13 | X s> 15) -> X > 13
+              return ReplaceInstUsesWith(I, LHS);
+            case ICmpInst::ICMP_UGT:        // (X s> 13 | X u> 15) -> no change
+              break;
+            case ICmpInst::ICMP_NE:         // (X s> 13 | X != 15) -> true
+            case ICmpInst::ICMP_SLT:        // (X s> 13 | X s< 15) -> true
+              return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+            case ICmpInst::ICMP_ULT:        // (X s> 13 | X u< 15) -> no change
+              break;
+            }
+            break;
+          }
+        }
+  }
+    
+  // fold (or (cast A), (cast B)) -> (cast (or A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0))
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) {// same cast kind ?
+        const Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+            // Only do this if the casts both really cause code to be generated.
+            ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                              I.getType(), TD) &&
+            ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                              I.getType(), TD)) {
+          Instruction *NewOp = BinaryOperator::createOr(Op0C->getOperand(0),
+                                                        Op1C->getOperand(0),
+                                                        I.getName());
+          InsertNewInstBefore(NewOp, I);
+          return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+      
+
+  return Changed ? &I : 0;
+}
+
+// XorSelf - Functor for AssociativeOpt implementing: X ^ X --> 0.  When
+// shouldApply matches, the caller replaces the whole xor with zero; apply
+// just hands the instruction back unchanged so the caller can do so.
+struct XorSelf {
+  Value *RHS;  // The value an operand must equal for the fold to fire.
+  XorSelf(Value *rhs) : RHS(rhs) {}
+  // Match only operands identical to the tracked value.
+  bool shouldApply(Value *LHS) const { return RHS == LHS; }
+  // No rewrite is needed here; return the xor itself as the "result".
+  Instruction *apply(BinaryOperator &Xor) const { return &Xor; }
+};
+
+
+Instruction *InstCombiner::visitXor(BinaryOperator &I) {
+  bool Changed = SimplifyCommutative(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  if (isa<UndefValue>(Op1))
+    return ReplaceInstUsesWith(I, Op1);  // X ^ undef -> undef
+
+  // xor X, X = 0, even if X is nested in a sequence of Xor's.
+  if (Instruction *Result = AssociativeOpt(I, XorSelf(Op1))) {
+    assert(Result == &I && "AssociativeOpt didn't work?");
+    return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+  
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  if (!isa<VectorType>(I.getType())) {
+    uint32_t BitWidth = cast<IntegerType>(I.getType())->getBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(BitWidth),
+                             KnownZero, KnownOne))
+      return &I;
+  } else if (isa<ConstantAggregateZero>(Op1)) {
+    return ReplaceInstUsesWith(I, Op0);  // X ^ <0,0> -> X
+  }
+
+  // Is this a ~ operation?
+  if (Value *NotOp = dyn_castNotVal(&I)) {
+    // ~(~X & Y) --> (X | ~Y) - De Morgan's Law
+    // ~(~X | Y) === (X & ~Y) - De Morgan's Law
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(NotOp)) {
+      if (Op0I->getOpcode() == Instruction::And || 
+          Op0I->getOpcode() == Instruction::Or) {
+        if (dyn_castNotVal(Op0I->getOperand(1))) Op0I->swapOperands();
+        if (Value *Op0NotVal = dyn_castNotVal(Op0I->getOperand(0))) {
+          Instruction *NotY =
+            BinaryOperator::createNot(Op0I->getOperand(1),
+                                      Op0I->getOperand(1)->getName()+".not");
+          InsertNewInstBefore(NotY, I);
+          if (Op0I->getOpcode() == Instruction::And)
+            return BinaryOperator::createOr(Op0NotVal, NotY);
+          else
+            return BinaryOperator::createAnd(Op0NotVal, NotY);
+        }
+      }
+    }
+  }
+  
+  
+  if (ConstantInt *RHS = dyn_cast<ConstantInt>(Op1)) {
+    // xor (icmp A, B), true = not (icmp A, B) = !icmp A, B
+    if (ICmpInst *ICI = dyn_cast<ICmpInst>(Op0))
+      if (RHS == ConstantInt::getTrue() && ICI->hasOneUse())
+        return new ICmpInst(ICI->getInversePredicate(),
+                            ICI->getOperand(0), ICI->getOperand(1));
+
+    if (BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0)) {
+      // ~(c-X) == X-c-1 == X+(-c-1)
+      if (Op0I->getOpcode() == Instruction::Sub && RHS->isAllOnesValue())
+        if (Constant *Op0I0C = dyn_cast<Constant>(Op0I->getOperand(0))) {
+          Constant *NegOp0I0C = ConstantExpr::getNeg(Op0I0C);
+          Constant *ConstantRHS = ConstantExpr::getSub(NegOp0I0C,
+                                              ConstantInt::get(I.getType(), 1));
+          return BinaryOperator::createAdd(Op0I->getOperand(1), ConstantRHS);
+        }
+          
+      if (ConstantInt *Op0CI = dyn_cast<ConstantInt>(Op0I->getOperand(1)))
+        if (Op0I->getOpcode() == Instruction::Add) {
+          // ~(X-c) --> (-c-1)-X
+          if (RHS->isAllOnesValue()) {
+            Constant *NegOp0CI = ConstantExpr::getNeg(Op0CI);
+            return BinaryOperator::createSub(
+                           ConstantExpr::getSub(NegOp0CI,
+                                             ConstantInt::get(I.getType(), 1)),
+                                          Op0I->getOperand(0));
+          } else if (RHS->getValue().isSignBit()) {
+            // (X + C) ^ signbit -> (X + C + signbit)
+            Constant *C = ConstantInt::get(RHS->getValue() + Op0CI->getValue());
+            return BinaryOperator::createAdd(Op0I->getOperand(0), C);
+
+          }
+        } else if (Op0I->getOpcode() == Instruction::Or) {
+          // (X|C1)^C2 -> X^(C1|C2) iff X&~C1 == 0
+          if (MaskedValueIsZero(Op0I->getOperand(0), Op0CI->getValue())) {
+            Constant *NewRHS = ConstantExpr::getOr(Op0CI, RHS);
+            // Anything in both C1 and C2 is known to be zero, remove it from
+            // NewRHS.
+            Constant *CommonBits = And(Op0CI, RHS);
+            NewRHS = ConstantExpr::getAnd(NewRHS, 
+                                          ConstantExpr::getNot(CommonBits));
+            AddToWorkList(Op0I);
+            I.setOperand(0, Op0I->getOperand(0));
+            I.setOperand(1, NewRHS);
+            return &I;
+          }
+        }
+    }
+
+    // Try to fold constant and into select arguments.
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+    if (isa<PHINode>(Op0))
+      if (Instruction *NV = FoldOpIntoPhi(I))
+        return NV;
+  }
+
+  if (Value *X = dyn_castNotVal(Op0))   // ~A ^ A == -1
+    if (X == Op1)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  if (Value *X = dyn_castNotVal(Op1))   // A ^ ~A == -1
+    if (X == Op0)
+      return ReplaceInstUsesWith(I, Constant::getAllOnesValue(I.getType()));
+
+  
+  BinaryOperator *Op1I = dyn_cast<BinaryOperator>(Op1);
+  if (Op1I) {
+    Value *A, *B;
+    if (match(Op1I, m_Or(m_Value(A), m_Value(B)))) {
+      if (A == Op0) {              // B^(B|A) == (A|B)^B
+        Op1I->swapOperands();
+        I.swapOperands();
+        std::swap(Op0, Op1);
+      } else if (B == Op0) {       // B^(A|B) == (A|B)^B
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    } else if (match(Op1I, m_Xor(m_Value(A), m_Value(B)))) {
+      if (Op0 == A)                                          // A^(A^B) == B
+        return ReplaceInstUsesWith(I, B);
+      else if (Op0 == B)                                     // A^(B^A) == B
+        return ReplaceInstUsesWith(I, A);
+    } else if (match(Op1I, m_And(m_Value(A), m_Value(B))) && Op1I->hasOneUse()){
+      if (A == Op0) {                                      // A^(A&B) -> A^(B&A)
+        Op1I->swapOperands();
+        std::swap(A, B);
+      }
+      if (B == Op0) {                                      // A^(B&A) -> (B&A)^A
+        I.swapOperands();     // Simplified below.
+        std::swap(Op0, Op1);
+      }
+    }
+  }
+  
+  BinaryOperator *Op0I = dyn_cast<BinaryOperator>(Op0);
+  if (Op0I) {
+    Value *A, *B;
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) && Op0I->hasOneUse()) {
+      if (A == Op1)                                  // (B|A)^B == (A|B)^B
+        std::swap(A, B);
+      if (B == Op1) {                                // (A|B)^B == A & ~B
+        Instruction *NotB =
+          InsertNewInstBefore(BinaryOperator::createNot(Op1, "tmp"), I);
+        return BinaryOperator::createAnd(A, NotB);
+      }
+    } else if (match(Op0I, m_Xor(m_Value(A), m_Value(B)))) {
+      if (Op1 == A)                                          // (A^B)^A == B
+        return ReplaceInstUsesWith(I, B);
+      else if (Op1 == B)                                     // (B^A)^A == B
+        return ReplaceInstUsesWith(I, A);
+    } else if (match(Op0I, m_And(m_Value(A), m_Value(B))) && Op0I->hasOneUse()){
+      if (A == Op1)                                        // (A&B)^A -> (B&A)^A
+        std::swap(A, B);
+      if (B == Op1 &&                                      // (B&A)^A == ~B & A
+          !isa<ConstantInt>(Op1)) {  // Canonical form is (B&C)^C
+        Instruction *N =
+          InsertNewInstBefore(BinaryOperator::createNot(A, "tmp"), I);
+        return BinaryOperator::createAnd(N, Op1);
+      }
+    }
+  }
+  
+  // (X >> Z) ^ (Y >> Z)  -> (X^Y) >> Z  for all shifts.
+  if (Op0I && Op1I && Op0I->isShift() && 
+      Op0I->getOpcode() == Op1I->getOpcode() && 
+      Op0I->getOperand(1) == Op1I->getOperand(1) &&
+      (Op1I->hasOneUse() || Op1I->hasOneUse())) {
+    Instruction *NewOp =
+      InsertNewInstBefore(BinaryOperator::createXor(Op0I->getOperand(0),
+                                                    Op1I->getOperand(0),
+                                                    Op0I->getName()), I);
+    return BinaryOperator::create(Op1I->getOpcode(), NewOp, 
+                                  Op1I->getOperand(1));
+  }
+    
+  if (Op0I && Op1I) {
+    Value *A, *B, *C, *D;
+    // (A & B)^(A | B) -> A ^ B
+    if (match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_Or(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C)) 
+        return BinaryOperator::createXor(A, B);
+    }
+    // (A | B)^(A & B) -> A ^ B
+    if (match(Op0I, m_Or(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+      if ((A == C && B == D) || (A == D && B == C)) 
+        return BinaryOperator::createXor(A, B);
+    }
+    
+    // (A & B)^(C & D)
+    if ((Op0I->hasOneUse() || Op1I->hasOneUse()) &&
+        match(Op0I, m_And(m_Value(A), m_Value(B))) &&
+        match(Op1I, m_And(m_Value(C), m_Value(D)))) {
+      // (X & Y)^(X & Y) -> (Y^Z) & X
+      Value *X = 0, *Y = 0, *Z = 0;
+      if (A == C)
+        X = A, Y = B, Z = D;
+      else if (A == D)
+        X = A, Y = B, Z = C;
+      else if (B == C)
+        X = B, Y = A, Z = D;
+      else if (B == D)
+        X = B, Y = A, Z = C;
+      
+      if (X) {
+        Instruction *NewOp =
+        InsertNewInstBefore(BinaryOperator::createXor(Y, Z, Op0->getName()), I);
+        return BinaryOperator::createAnd(NewOp, X);
+      }
+    }
+  }
+    
+  // (icmp1 A, B) ^ (icmp2 A, B) --> (icmp3 A, B)
+  if (ICmpInst *RHS = dyn_cast<ICmpInst>(I.getOperand(1)))
+    if (Instruction *R = AssociativeOpt(I, FoldICmpLogical(*this, RHS)))
+      return R;
+
+  // fold (xor (cast A), (cast B)) -> (cast (xor A, B))
+  if (CastInst *Op0C = dyn_cast<CastInst>(Op0)) 
+    if (CastInst *Op1C = dyn_cast<CastInst>(Op1))
+      if (Op0C->getOpcode() == Op1C->getOpcode()) { // same cast kind?
+        const Type *SrcTy = Op0C->getOperand(0)->getType();
+        if (SrcTy == Op1C->getOperand(0)->getType() && SrcTy->isInteger() &&
+            // Only do this if the casts both really cause code to be generated.
+            ValueRequiresCast(Op0C->getOpcode(), Op0C->getOperand(0), 
+                              I.getType(), TD) &&
+            ValueRequiresCast(Op1C->getOpcode(), Op1C->getOperand(0), 
+                              I.getType(), TD)) {
+          Instruction *NewOp = BinaryOperator::createXor(Op0C->getOperand(0),
+                                                         Op1C->getOperand(0),
+                                                         I.getName());
+          InsertNewInstBefore(NewOp, I);
+          return CastInst::create(Op0C->getOpcode(), NewOp, I.getType());
+        }
+      }
+
+  return Changed ? &I : 0;
+}
+
+/// AddWithOverflow - Compute Result = In1+In2, returning true if the result
+/// overflowed for this type.
+static bool AddWithOverflow(ConstantInt *&Result, ConstantInt *In1,
+                            ConstantInt *In2, bool IsSigned = false) {
+  Result = cast<ConstantInt>(Add(In1, In2));
+
+  if (IsSigned)
+    if (In2->getValue().isNegative())
+      return Result->getValue().sgt(In1->getValue());
+    else
+      return Result->getValue().slt(In1->getValue());
+  else
+    return Result->getValue().ult(In1->getValue());
+}
+
/// EmitGEPOffset - Given a getelementptr instruction/constantexpr, emit the
/// code necessary to compute the offset from the base pointer (without adding
/// in the base pointer).  Return the result as a signed integer of intptr size.
///
/// Constant parts of the offset are folded eagerly; an add/mul instruction is
/// only inserted (before I) once a non-constant index or running total forces
/// it.
static Value *EmitGEPOffset(User *GEP, Instruction &I, InstCombiner &IC) {
  TargetData &TD = IC.getTargetData();
  gep_type_iterator GTI = gep_type_begin(GEP);
  const Type *IntPtrTy = TD.getIntPtrType();
  // Running byte offset, starting at zero.
  Value *Result = Constant::getNullValue(IntPtrTy);

  // Build a mask for high order bits.
  unsigned IntPtrWidth = TD.getPointerSize()*8;
  uint64_t PtrSizeMask = ~0ULL >> (64-IntPtrWidth);

  // Walk the GEP indices (operand 0 is the base pointer); GTI tracks the type
  // each index steps through.
  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
    Value *Op = GEP->getOperand(i);
    // Byte size of the type this index scales by, masked to pointer width.
    uint64_t Size = TD.getTypeSize(GTI.getIndexedType()) & PtrSizeMask;
    if (ConstantInt *OpC = dyn_cast<ConstantInt>(Op)) {
      if (OpC->isZero()) continue;  // A zero index contributes nothing.
      
      // Handle a struct index, which adds its field offset to the pointer.
      if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
        Size = TD.getStructLayout(STy)->getElementOffset(OpC->getZExtValue());
        
        // Fold into the running total if it is still a constant; otherwise
        // emit an add of the field offset.
        if (ConstantInt *RC = dyn_cast<ConstantInt>(Result))
          Result = ConstantInt::get(RC->getValue() + APInt(IntPtrWidth, Size));
        else
          Result = IC.InsertNewInstBefore(
                   BinaryOperator::createAdd(Result,
                                             ConstantInt::get(IntPtrTy, Size),
                                             GEP->getName()+".offs"), I);
        continue;
      }
      
      // Constant sequential index: fold index*elementsize at compile time
      // (sign-extending the index to pointer width first).
      Constant *Scale = ConstantInt::get(IntPtrTy, Size);
      Constant *OC = ConstantExpr::getIntegerCast(OpC, IntPtrTy, true /*SExt*/);
      Scale = ConstantExpr::getMul(OC, Scale);
      if (Constant *RC = dyn_cast<Constant>(Result))
        Result = ConstantExpr::getAdd(RC, Scale);
      else {
        // Emit an add instruction.
        Result = IC.InsertNewInstBefore(
           BinaryOperator::createAdd(Result, Scale,
                                     GEP->getName()+".offs"), I);
      }
      continue;
    }
    // Non-constant index: convert to correct (pointer-sized) type.
    if (Op->getType() != IntPtrTy) {
      if (Constant *OpC = dyn_cast<Constant>(Op))
        Op = ConstantExpr::getSExt(OpC, IntPtrTy);
      else
        Op = IC.InsertNewInstBefore(new SExtInst(Op, IntPtrTy,
                                                 Op->getName()+".c"), I);
    }
    // Scale by the element size unless it is the trivial factor of one.
    if (Size != 1) {
      Constant *Scale = ConstantInt::get(IntPtrTy, Size);
      if (Constant *OpC = dyn_cast<Constant>(Op))
        Op = ConstantExpr::getMul(OpC, Scale);
      else    // We'll let instcombine(mul) convert this to a shl if possible.
        Op = IC.InsertNewInstBefore(BinaryOperator::createMul(Op, Scale,
                                                  GEP->getName()+".idx"), I);
    }

    // Emit an add instruction (or fold if both sides are constant).
    if (isa<Constant>(Op) && isa<Constant>(Result))
      Result = ConstantExpr::getAdd(cast<Constant>(Op),
                                    cast<Constant>(Result));
    else
      Result = IC.InsertNewInstBefore(BinaryOperator::createAdd(Op, Result,
                                                  GEP->getName()+".offs"), I);
  }
  return Result;
}
+
+/// FoldGEPICmp - Fold comparisons between a GEP instruction and something
+/// else.  At this point we know that the GEP is on the LHS of the comparison.
+Instruction *InstCombiner::FoldGEPICmp(User *GEPLHS, Value *RHS,
+                                       ICmpInst::Predicate Cond,
+                                       Instruction &I) {
+  assert(dyn_castGetElementPtr(GEPLHS) && "LHS is not a getelementptr!");
+
+  if (CastInst *CI = dyn_cast<CastInst>(RHS))
+    if (isa<PointerType>(CI->getOperand(0)->getType()))
+      RHS = CI->getOperand(0);
+
+  Value *PtrBase = GEPLHS->getOperand(0);
+  if (PtrBase == RHS) {
+    // As an optimization, we don't actually have to compute the actual value of
+    // OFFSET if this is a icmp_eq or icmp_ne comparison, just return whether 
+    // each index is zero or not.
+    if (Cond == ICmpInst::ICMP_EQ || Cond == ICmpInst::ICMP_NE) {
+      Instruction *InVal = 0;
+      gep_type_iterator GTI = gep_type_begin(GEPLHS);
+      for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i, ++GTI) {
+        bool EmitIt = true;
+        if (Constant *C = dyn_cast<Constant>(GEPLHS->getOperand(i))) {
+          if (isa<UndefValue>(C))  // undef index -> undef.
+            return ReplaceInstUsesWith(I, UndefValue::get(I.getType()));
+          if (C->isNullValue())
+            EmitIt = false;
+          else if (TD->getTypeSize(GTI.getIndexedType()) == 0) {
+            EmitIt = false;  // This is indexing into a zero sized array?
+          } else if (isa<ConstantInt>(C))
+            return ReplaceInstUsesWith(I, // No comparison is needed here.
+                                 ConstantInt::get(Type::Int1Ty, 
+                                                  Cond == ICmpInst::ICMP_NE));
+        }
+
+        if (EmitIt) {
+          Instruction *Comp =
+            new ICmpInst(Cond, GEPLHS->getOperand(i),
+                    Constant::getNullValue(GEPLHS->getOperand(i)->getType()));
+          if (InVal == 0)
+            InVal = Comp;
+          else {
+            InVal = InsertNewInstBefore(InVal, I);
+            InsertNewInstBefore(Comp, I);
+            if (Cond == ICmpInst::ICMP_NE)   // True if any are unequal
+              InVal = BinaryOperator::createOr(InVal, Comp);
+            else                              // True if all are equal
+              InVal = BinaryOperator::createAnd(InVal, Comp);
+          }
+        }
+      }
+
+      if (InVal)
+        return InVal;
+      else
+        // No comparison is needed here, all indexes = 0
+        ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
+                                                Cond == ICmpInst::ICMP_EQ));
+    }
+
+    // Only lower this if the icmp is the only user of the GEP or if we expect
+    // the result to fold to a constant!
+    if (isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) {
+      // ((gep Ptr, OFFSET) cmp Ptr)   ---> (OFFSET cmp 0).
+      Value *Offset = EmitGEPOffset(GEPLHS, I, *this);
+      return new ICmpInst(ICmpInst::getSignedPredicate(Cond), Offset,
+                          Constant::getNullValue(Offset->getType()));
+    }
+  } else if (User *GEPRHS = dyn_castGetElementPtr(RHS)) {
+    // If the base pointers are different, but the indices are the same, just
+    // compare the base pointer.
+    if (PtrBase != GEPRHS->getOperand(0)) {
+      bool IndicesTheSame = GEPLHS->getNumOperands()==GEPRHS->getNumOperands();
+      IndicesTheSame &= GEPLHS->getOperand(0)->getType() ==
+                        GEPRHS->getOperand(0)->getType();
+      if (IndicesTheSame)
+        for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+          if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+            IndicesTheSame = false;
+            break;
+          }
+
+      // If all indices are the same, just compare the base pointers.
+      if (IndicesTheSame)
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), 
+                            GEPLHS->getOperand(0), GEPRHS->getOperand(0));
+
+      // Otherwise, the base pointers are different and the indices are
+      // different, bail out.
+      return 0;
+    }
+
+    // If one of the GEPs has all zero indices, recurse.
+    bool AllZeros = true;
+    for (unsigned i = 1, e = GEPLHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPLHS->getOperand(i)) ||
+          !cast<Constant>(GEPLHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPRHS, GEPLHS->getOperand(0),
+                          ICmpInst::getSwappedPredicate(Cond), I);
+
+    // If the other GEP has all zero indices, recurse.
+    AllZeros = true;
+    for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPRHS->getOperand(i)) ||
+          !cast<Constant>(GEPRHS->getOperand(i))->isNullValue()) {
+        AllZeros = false;
+        break;
+      }
+    if (AllZeros)
+      return FoldGEPICmp(GEPLHS, GEPRHS->getOperand(0), Cond, I);
+
+    if (GEPLHS->getNumOperands() == GEPRHS->getNumOperands()) {
+      // If the GEPs only differ by one index, compare it.
+      unsigned NumDifferences = 0;  // Keep track of # differences.
+      unsigned DiffOperand = 0;     // The operand that differs.
+      for (unsigned i = 1, e = GEPRHS->getNumOperands(); i != e; ++i)
+        if (GEPLHS->getOperand(i) != GEPRHS->getOperand(i)) {
+          if (GEPLHS->getOperand(i)->getType()->getPrimitiveSizeInBits() !=
+                   GEPRHS->getOperand(i)->getType()->getPrimitiveSizeInBits()) {
+            // Irreconcilable differences.
+            NumDifferences = 2;
+            break;
+          } else {
+            if (NumDifferences++) break;
+            DiffOperand = i;
+          }
+        }
+
+      if (NumDifferences == 0)   // SAME GEP?
+        return ReplaceInstUsesWith(I, // No comparison is needed here.
+                                   ConstantInt::get(Type::Int1Ty, 
+                                                    Cond == ICmpInst::ICMP_EQ));
+      else if (NumDifferences == 1) {
+        Value *LHSV = GEPLHS->getOperand(DiffOperand);
+        Value *RHSV = GEPRHS->getOperand(DiffOperand);
+        // Make sure we do a signed comparison here.
+        return new ICmpInst(ICmpInst::getSignedPredicate(Cond), LHSV, RHSV);
+      }
+    }
+
+    // Only lower this if the icmp is the only user of the GEP or if we expect
+    // the result to fold to a constant!
+    if ((isa<ConstantExpr>(GEPLHS) || GEPLHS->hasOneUse()) &&
+        (isa<ConstantExpr>(GEPRHS) || GEPRHS->hasOneUse())) {
+      // ((gep Ptr, OFFSET1) cmp (gep Ptr, OFFSET2)  --->  (OFFSET1 cmp OFFSET2)
+      Value *L = EmitGEPOffset(GEPLHS, I, *this);
+      Value *R = EmitGEPOffset(GEPRHS, I, *this);
+      return new ICmpInst(ICmpInst::getSignedPredicate(Cond), L, R);
+    }
+  }
+  return 0;
+}
+
+Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
+  bool Changed = SimplifyCompare(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // Fold trivial predicates.
+  if (I.getPredicate() == FCmpInst::FCMP_FALSE)
+    return ReplaceInstUsesWith(I, Constant::getNullValue(Type::Int1Ty));
+  if (I.getPredicate() == FCmpInst::FCMP_TRUE)
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+  
+  // Simplify 'fcmp pred X, X'
+  if (Op0 == Op1) {
+    switch (I.getPredicate()) {
+    default: assert(0 && "Unknown predicate!");
+    case FCmpInst::FCMP_UEQ:    // True if unordered or equal
+    case FCmpInst::FCMP_UGE:    // True if unordered, greater than, or equal
+    case FCmpInst::FCMP_ULE:    // True if unordered, less than, or equal
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 1));
+    case FCmpInst::FCMP_OGT:    // True if ordered and greater than
+    case FCmpInst::FCMP_OLT:    // True if ordered and less than
+    case FCmpInst::FCMP_ONE:    // True if ordered and operands are unequal
+      return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 0));
+      
+    case FCmpInst::FCMP_UNO:    // True if unordered: isnan(X) | isnan(Y)
+    case FCmpInst::FCMP_ULT:    // True if unordered or less than
+    case FCmpInst::FCMP_UGT:    // True if unordered or greater than
+    case FCmpInst::FCMP_UNE:    // True if unordered or not equal
+      // Canonicalize these to be 'fcmp uno %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_UNO);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+      
+    case FCmpInst::FCMP_ORD:    // True if ordered (no nans)
+    case FCmpInst::FCMP_OEQ:    // True if ordered and equal
+    case FCmpInst::FCMP_OGE:    // True if ordered and greater than or equal
+    case FCmpInst::FCMP_OLE:    // True if ordered and less than or equal
+      // Canonicalize these to be 'fcmp ord %X, 0.0'.
+      I.setPredicate(FCmpInst::FCMP_ORD);
+      I.setOperand(1, Constant::getNullValue(Op0->getType()));
+      return &I;
+    }
+  }
+    
+  if (isa<UndefValue>(Op1))                  // fcmp pred X, undef -> undef
+    return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+
+  // Handle fcmp with constant RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::PHI:
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+        break;
+      case Instruction::Select:
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op2 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+                                                      LHSI->getOperand(2), RHSC,
+                                                      I.getName()), I);
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getCompare(I.getPredicate(), C, RHSC);
+            // Insert a new FCmp of the other select operand.
+            Op1 = InsertNewInstBefore(new FCmpInst(I.getPredicate(),
+                                                      LHSI->getOperand(1), RHSC,
+                                                      I.getName()), I);
+          }
+        }
+
+        if (Op1)
+          return new SelectInst(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+  }
+
+  return Changed ? &I : 0;
+}
+
+Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
+  bool Changed = SimplifyCompare(I);
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+  const Type *Ty = Op0->getType();
+
+  // icmp X, X
+  if (Op0 == Op1)
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
+                                                   isTrueWhenEqual(I)));
+
+  if (isa<UndefValue>(Op1))                  // X icmp undef -> undef
+    return ReplaceInstUsesWith(I, UndefValue::get(Type::Int1Ty));
+
+  // icmp of GlobalValues can never equal each other as long as they aren't
+  // external weak linkage type.
+  if (GlobalValue *GV0 = dyn_cast<GlobalValue>(Op0))
+    if (GlobalValue *GV1 = dyn_cast<GlobalValue>(Op1))
+      if (!GV0->hasExternalWeakLinkage() || !GV1->hasExternalWeakLinkage())
+        return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+                                                       !isTrueWhenEqual(I)));
+
+  // icmp <global/alloca*/null>, <global/alloca*/null> - Global/Stack value
+  // addresses never equal each other!  We already know that Op0 != Op1.
+  if ((isa<GlobalValue>(Op0) || isa<AllocaInst>(Op0) ||
+       isa<ConstantPointerNull>(Op0)) &&
+      (isa<GlobalValue>(Op1) || isa<AllocaInst>(Op1) ||
+       isa<ConstantPointerNull>(Op1)))
+    return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty, 
+                                                   !isTrueWhenEqual(I)));
+
+  // icmp's with boolean values can always be turned into bitwise operations
+  if (Ty == Type::Int1Ty) {
+    switch (I.getPredicate()) {
+    default: assert(0 && "Invalid icmp instruction!");
+    case ICmpInst::ICMP_EQ: {               // icmp eq bool %A, %B -> ~(A^B)
+      Instruction *Xor = BinaryOperator::createXor(Op0, Op1, I.getName()+"tmp");
+      InsertNewInstBefore(Xor, I);
+      return BinaryOperator::createNot(Xor);
+    }
+    case ICmpInst::ICMP_NE:                  // icmp eq bool %A, %B -> A^B
+      return BinaryOperator::createXor(Op0, Op1);
+
+    case ICmpInst::ICMP_UGT:
+    case ICmpInst::ICMP_SGT:
+      std::swap(Op0, Op1);                   // Change icmp gt -> icmp lt
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULT:
+    case ICmpInst::ICMP_SLT: {               // icmp lt bool A, B -> ~X & Y
+      Instruction *Not = BinaryOperator::createNot(Op0, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::createAnd(Not, Op1);
+    }
+    case ICmpInst::ICMP_UGE:
+    case ICmpInst::ICMP_SGE:
+      std::swap(Op0, Op1);                   // Change icmp ge -> icmp le
+      // FALL THROUGH
+    case ICmpInst::ICMP_ULE:
+    case ICmpInst::ICMP_SLE: {               //  icmp le bool %A, %B -> ~A | B
+      Instruction *Not = BinaryOperator::createNot(Op0, I.getName()+"tmp");
+      InsertNewInstBefore(Not, I);
+      return BinaryOperator::createOr(Not, Op1);
+    }
+    }
+  }
+
+  // See if we are doing a comparison between a constant and an instruction that
+  // can be folded into the comparison.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Op1)) {
+    switch (I.getPredicate()) {
+    default: break;
+    case ICmpInst::ICMP_ULT:                        // A <u MIN -> FALSE
+      if (CI->isMinValue(false))
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (CI->isMaxValue(false))                    // A <u MAX -> A != MAX
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0,Op1);
+      if (isMinValuePlusOne(CI,false))              // A <u MIN+1 -> A == MIN
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+      // (x <u 2147483648) -> (x >s -1)  -> true if sign bit clear
+      if (CI->isMinValue(true))
+        return new ICmpInst(ICmpInst::ICMP_SGT, Op0,
+                            ConstantInt::getAllOnesValue(Op0->getType()));
+          
+      break;
+
+    case ICmpInst::ICMP_SLT:
+      if (CI->isMinValue(true))                    // A <s MIN -> FALSE
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (CI->isMaxValue(true))                    // A <s MAX -> A != MAX
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (isMinValuePlusOne(CI,true))              // A <s MIN+1 -> A == MIN
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, SubOne(CI));
+      break;
+
+    case ICmpInst::ICMP_UGT:
+      if (CI->isMaxValue(false))                  // A >u MAX -> FALSE
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (CI->isMinValue(false))                  // A >u MIN -> A != MIN
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (isMaxValueMinusOne(CI, false))          // A >u MAX-1 -> A == MAX
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+        
+      // (x >u 2147483647) -> (x <s 0)  -> true if sign bit set
+      if (CI->isMaxValue(true))
+        return new ICmpInst(ICmpInst::ICMP_SLT, Op0,
+                            ConstantInt::getNullValue(Op0->getType()));
+      break;
+
+    case ICmpInst::ICMP_SGT:
+      if (CI->isMaxValue(true))                   // A >s MAX -> FALSE
+        return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+      if (CI->isMinValue(true))                   // A >s MIN -> A != MIN
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, Op1);
+      if (isMaxValueMinusOne(CI, true))           // A >s MAX-1 -> A == MAX
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, AddOne(CI));
+      break;
+
+    case ICmpInst::ICMP_ULE:
+      if (CI->isMaxValue(false))                 // A <=u MAX -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (CI->isMinValue(false))                 // A <=u MIN -> A == MIN
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+      if (isMaxValueMinusOne(CI,false))          // A <=u MAX-1 -> A != MAX
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, AddOne(CI));
+      break;
+
+    case ICmpInst::ICMP_SLE:
+      if (CI->isMaxValue(true))                  // A <=s MAX -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (CI->isMinValue(true))                  // A <=s MIN -> A == MIN
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+      if (isMaxValueMinusOne(CI,true))           // A <=s MAX-1 -> A != MAX
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, AddOne(CI));
+      break;
+
+    case ICmpInst::ICMP_UGE:
+      if (CI->isMinValue(false))                 // A >=u MIN -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (CI->isMaxValue(false))                 // A >=u MAX -> A == MAX
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+      if (isMinValuePlusOne(CI,false))           // A >=u MIN-1 -> A != MIN
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, SubOne(CI));
+      break;
+
+    case ICmpInst::ICMP_SGE:
+      if (CI->isMinValue(true))                  // A >=s MIN -> TRUE
+        return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+      if (CI->isMaxValue(true))                  // A >=s MAX -> A == MAX
+        return new ICmpInst(ICmpInst::ICMP_EQ, Op0, Op1);
+      if (isMinValuePlusOne(CI,true))            // A >=s MIN-1 -> A != MIN
+        return new ICmpInst(ICmpInst::ICMP_NE, Op0, SubOne(CI));
+      break;
+    }
+
+    // If we still have a icmp le or icmp ge instruction, turn it into the
+    // appropriate icmp lt or icmp gt instruction.  Since the border cases have
+    // already been handled above, this requires little checking.
+    //
+    switch (I.getPredicate()) {
+    default: break;
+    case ICmpInst::ICMP_ULE: 
+      return new ICmpInst(ICmpInst::ICMP_ULT, Op0, AddOne(CI));
+    case ICmpInst::ICMP_SLE:
+      return new ICmpInst(ICmpInst::ICMP_SLT, Op0, AddOne(CI));
+    case ICmpInst::ICMP_UGE:
+      return new ICmpInst( ICmpInst::ICMP_UGT, Op0, SubOne(CI));
+    case ICmpInst::ICMP_SGE:
+      return new ICmpInst(ICmpInst::ICMP_SGT, Op0, SubOne(CI));
+    }
+    
+    // See if we can fold the comparison based on bits known to be zero or one
+    // in the input.  If this comparison is a normal comparison, it demands all
+    // bits, if it is a sign bit comparison, it only demands the sign bit.
+    
+    bool UnusedBit;
+    bool isSignBit = isSignBitCheck(I.getPredicate(), CI, UnusedBit);
+    
+    uint32_t BitWidth = cast<IntegerType>(Ty)->getBitWidth();
+    APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
+    if (SimplifyDemandedBits(Op0, 
+                             isSignBit ? APInt::getSignBit(BitWidth)
+                                       : APInt::getAllOnesValue(BitWidth),
+                             KnownZero, KnownOne, 0))
+      return &I;
+        
+    // Given the known and unknown bits, compute a range that the LHS could be
+    // in.
+    if ((KnownOne | KnownZero) != 0) {
+      // Compute the Min, Max and RHS values based on the known bits. For the
+      // EQ and NE we use unsigned values.
+      APInt Min(BitWidth, 0), Max(BitWidth, 0);
+      const APInt& RHSVal = CI->getValue();
+      if (ICmpInst::isSignedPredicate(I.getPredicate())) {
+        ComputeSignedMinMaxValuesFromKnownBits(Ty, KnownZero, KnownOne, Min, 
+                                               Max);
+      } else {
+        ComputeUnsignedMinMaxValuesFromKnownBits(Ty, KnownZero, KnownOne, Min, 
+                                                 Max);
+      }
+      switch (I.getPredicate()) {  // LE/GE have been folded already.
+      default: assert(0 && "Unknown icmp opcode!");
+      case ICmpInst::ICMP_EQ:
+        if (Max.ult(RHSVal) || Min.ugt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      case ICmpInst::ICMP_NE:
+        if (Max.ult(RHSVal) || Min.ugt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        break;
+      case ICmpInst::ICMP_ULT:
+        if (Max.ult(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        if (Min.uge(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      case ICmpInst::ICMP_UGT:
+        if (Min.ugt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        if (Max.ule(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      case ICmpInst::ICMP_SLT:
+        if (Max.slt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        if (Min.sgt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      case ICmpInst::ICMP_SGT: 
+        if (Min.sgt(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getTrue());
+        if (Max.sle(RHSVal))
+          return ReplaceInstUsesWith(I, ConstantInt::getFalse());
+        break;
+      }
+    }
+          
+    // Since the RHS is a ConstantInt (CI), if the left hand side is an 
+    // instruction, see if that instruction also has constants so that the 
+    // instruction can be folded into the icmp 
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      if (Instruction *Res = visitICmpInstWithInstAndIntCst(I, LHSI, CI))
+        return Res;
+  }
+
+  // Handle icmp with constant (but not simple integer constant) RHS
+  if (Constant *RHSC = dyn_cast<Constant>(Op1)) {
+    if (Instruction *LHSI = dyn_cast<Instruction>(Op0))
+      switch (LHSI->getOpcode()) {
+      case Instruction::GetElementPtr:
+        if (RHSC->isNullValue()) {
+          // icmp pred GEP (P, int 0, int 0, int 0), null -> icmp pred P, null
+          bool isAllZeros = true;
+          for (unsigned i = 1, e = LHSI->getNumOperands(); i != e; ++i)
+            if (!isa<Constant>(LHSI->getOperand(i)) ||
+                !cast<Constant>(LHSI->getOperand(i))->isNullValue()) {
+              isAllZeros = false;
+              break;
+            }
+          if (isAllZeros)
+            return new ICmpInst(I.getPredicate(), LHSI->getOperand(0),
+                    Constant::getNullValue(LHSI->getOperand(0)->getType()));
+        }
+        break;
+
+      case Instruction::PHI:
+        if (Instruction *NV = FoldOpIntoPhi(I))
+          return NV;
+        break;
+      case Instruction::Select: {
+        // If either operand of the select is a constant, we can fold the
+        // comparison into the select arms, which will cause one to be
+        // constant folded and the select turned into a bitwise or.
+        Value *Op1 = 0, *Op2 = 0;
+        if (LHSI->hasOneUse()) {
+          if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(1))) {
+            // Fold the known value into the constant operand.
+            Op1 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+            // Insert a new ICmp of the other select operand.
+            Op2 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+                                                   LHSI->getOperand(2), RHSC,
+                                                   I.getName()), I);
+          } else if (Constant *C = dyn_cast<Constant>(LHSI->getOperand(2))) {
+            // Fold the known value into the constant operand.
+            Op2 = ConstantExpr::getICmp(I.getPredicate(), C, RHSC);
+            // Insert a new ICmp of the other select operand.
+            Op1 = InsertNewInstBefore(new ICmpInst(I.getPredicate(),
+                                                   LHSI->getOperand(1), RHSC,
+                                                   I.getName()), I);
+          }
+        }
+
+        if (Op1)
+          return new SelectInst(LHSI->getOperand(0), Op1, Op2);
+        break;
+      }
+      case Instruction::Malloc:
+        // If we have (malloc != null), and if the malloc has a single use, we
+        // can assume it is successful and remove the malloc.
+        if (LHSI->hasOneUse() && isa<ConstantPointerNull>(RHSC)) {
+          AddToWorkList(LHSI);
+          return ReplaceInstUsesWith(I, ConstantInt::get(Type::Int1Ty,
+                                                         !isTrueWhenEqual(I)));
+        }
+        break;
+      }
+  }
+
+  // If we can optimize a 'icmp GEP, P' or 'icmp P, GEP', do so now.
+  if (User *GEP = dyn_castGetElementPtr(Op0))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op1, I.getPredicate(), I))
+      return NI;
+  if (User *GEP = dyn_castGetElementPtr(Op1))
+    if (Instruction *NI = FoldGEPICmp(GEP, Op0,
+                           ICmpInst::getSwappedPredicate(I.getPredicate()), I))
+      return NI;
+
+  // Test to see if the operands of the icmp are casted versions of other
+  // values.  If the ptr->ptr cast can be stripped off both arguments, we do so
+  // now.
+  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op0)) {
+    if (isa<PointerType>(Op0->getType()) && 
+        (isa<Constant>(Op1) || isa<BitCastInst>(Op1))) { 
+      // We keep moving the cast from the left operand over to the right
+      // operand, where it can often be eliminated completely.
+      Op0 = CI->getOperand(0);
+
+      // If operand #1 is a bitcast instruction, it must also be a ptr->ptr cast
+      // so eliminate it as well.
+      if (BitCastInst *CI2 = dyn_cast<BitCastInst>(Op1))
+        Op1 = CI2->getOperand(0);
+
+      // If Op1 is a constant, we can fold the cast into the constant.
+      if (Op0->getType() != Op1->getType())
+        if (Constant *Op1C = dyn_cast<Constant>(Op1)) {
+          Op1 = ConstantExpr::getBitCast(Op1C, Op0->getType());
+        } else {
+          // Otherwise, cast the RHS right before the icmp
+          Op1 = InsertCastBefore(Instruction::BitCast, Op1, Op0->getType(), I);
+        }
+      return new ICmpInst(I.getPredicate(), Op0, Op1);
+    }
+  }
+  
+  if (isa<CastInst>(Op0)) {
+    // Handle the special case of: icmp (cast bool to X), <cst>
+    // This comes up when you have code like
+    //   int X = A < B;
+    //   if (X) ...
+    // For generality, we handle any zero-extension of any operand comparison
+    // with a constant or another cast from the same type.
+    if (isa<ConstantInt>(Op1) || isa<CastInst>(Op1))
+      if (Instruction *R = visitICmpInstWithCastAndCast(I))
+        return R;
+  }
+  
+  if (I.isEquality()) {
+    Value *A, *B, *C, *D;
+    if (match(Op0, m_Xor(m_Value(A), m_Value(B)))) {
+      if (A == Op1 || B == Op1) {    // (A^B) == A  ->  B == 0
+        Value *OtherVal = A == Op1 ? B : A;
+        return new ICmpInst(I.getPredicate(), OtherVal,
+                            Constant::getNullValue(A->getType()));
+      }
+
+      if (match(Op1, m_Xor(m_Value(C), m_Value(D)))) {
+        // A^c1 == C^c2 --> A == C^(c1^c2)
+        if (ConstantInt *C1 = dyn_cast<ConstantInt>(B))
+          if (ConstantInt *C2 = dyn_cast<ConstantInt>(D))
+            if (Op1->hasOneUse()) {
+              Constant *NC = ConstantInt::get(C1->getValue() ^ C2->getValue());
+              Instruction *Xor = BinaryOperator::createXor(C, NC, "tmp");
+              return new ICmpInst(I.getPredicate(), A,
+                                  InsertNewInstBefore(Xor, I));
+            }
+        
+        // A^B == A^D -> B == D
+        if (A == C) return new ICmpInst(I.getPredicate(), B, D);
+        if (A == D) return new ICmpInst(I.getPredicate(), B, C);
+        if (B == C) return new ICmpInst(I.getPredicate(), A, D);
+        if (B == D) return new ICmpInst(I.getPredicate(), A, C);
+      }
+    }
+    
+    if (match(Op1, m_Xor(m_Value(A), m_Value(B))) &&
+        (A == Op0 || B == Op0)) {
+      // A == (A^B)  ->  B == 0
+      Value *OtherVal = A == Op0 ? B : A;
+      return new ICmpInst(I.getPredicate(), OtherVal,
+                          Constant::getNullValue(A->getType()));
+    }
+    if (match(Op0, m_Sub(m_Value(A), m_Value(B))) && A == Op1) {
+      // (A-B) == A  ->  B == 0
+      return new ICmpInst(I.getPredicate(), B,
+                          Constant::getNullValue(B->getType()));
+    }
+    if (match(Op1, m_Sub(m_Value(A), m_Value(B))) && A == Op0) {
+      // A == (A-B)  ->  B == 0
+      return new ICmpInst(I.getPredicate(), B,
+                          Constant::getNullValue(B->getType()));
+    }
+    
+    // (X&Z) == (Y&Z) -> (X^Y) & Z == 0
+    if (Op0->hasOneUse() && Op1->hasOneUse() &&
+        match(Op0, m_And(m_Value(A), m_Value(B))) && 
+        match(Op1, m_And(m_Value(C), m_Value(D)))) {
+      Value *X = 0, *Y = 0, *Z = 0;
+      
+      if (A == C) {
+        X = B; Y = D; Z = A;
+      } else if (A == D) {
+        X = B; Y = C; Z = A;
+      } else if (B == C) {
+        X = A; Y = D; Z = B;
+      } else if (B == D) {
+        X = A; Y = C; Z = B;
+      }
+      
+      if (X) {   // Build (X^Y) & Z
+        Op1 = InsertNewInstBefore(BinaryOperator::createXor(X, Y, "tmp"), I);
+        Op1 = InsertNewInstBefore(BinaryOperator::createAnd(Op1, Z, "tmp"), I);
+        I.setOperand(0, Op1);
+        I.setOperand(1, Constant::getNullValue(Op1->getType()));
+        return &I;
+      }
+    }
+  }
+  return Changed ? &I : 0;
+}
+
+
/// FoldICmpDivCst - Fold "icmp pred, ([su]div X, DivRHS), CmpRHS" where DivRHS
/// and CmpRHS are both known to be integer constants.
///
/// The transform solves "X / C1 pred C2" for X, turning the divide into a
/// (usually cheaper) range check on X.  Conceptually the quotient equals C2
/// exactly when X lies in a half-open interval [LoBound, HiBound); each
/// predicate is then rewritten in terms of that interval.  Returns the
/// replacement instruction, or null if the fold does not apply.
Instruction *InstCombiner::FoldICmpDivCst(ICmpInst &ICI, BinaryOperator *DivI,
                                          ConstantInt *DivRHS) {
  // CmpRHS is the constant the divide result is compared against (C2 above).
  ConstantInt *CmpRHS = cast<ConstantInt>(ICI.getOperand(1));
  const APInt &CmpRHSV = CmpRHS->getValue();
  
  // FIXME: If the operand types don't match the type of the divide 
  // then don't attempt this transform. The code below doesn't have the
  // logic to deal with a signed divide and an unsigned compare (and
  // vice versa). This is because (x /s C1) <s C2  produces different 
  // results than (x /s C1) <u C2 or (x /u C1) <s C2 or even
  // (x /u C1) <u C2.  Simply casting the operands and result won't 
  // work. :(  The if statement below tests that condition and bails 
  // if it finds it. 
  bool DivIsSigned = DivI->getOpcode() == Instruction::SDiv;
  if (!ICI.isEquality() && DivIsSigned != ICI.isSignedPredicate())
    return 0;
  if (DivRHS->isZero())
    return 0; // The ProdOV computation fails on divide by zero.

  // Compute Prod = CI * DivRHS. We are essentially solving an equation
  // of form X/C1=C2. We solve for X by multiplying C1 (DivRHS) and 
  // C2 (CI). By solving for X we can turn this into a range check 
  // instead of computing a divide. 
  ConstantInt *Prod = Multiply(CmpRHS, DivRHS);

  // Determine if the product overflows by seeing if the product is
  // not equal to the divide. Make sure we do the same kind of divide
  // as in the LHS instruction that we're folding. 
  bool ProdOV = (DivIsSigned ? ConstantExpr::getSDiv(Prod, DivRHS) :
                 ConstantExpr::getUDiv(Prod, DivRHS)) != CmpRHS;

  // Get the ICmp opcode
  ICmpInst::Predicate Pred = ICI.getPredicate();

  // Figure out the interval that is being checked.  For example, a comparison
  // like "X /u 5 == 0" is really checking that X is in the interval [0, 5). 
  // Compute this interval based on the constants involved and the signedness of
  // the compare/divide.  This computes a half-open interval, keeping track of
  // whether either value in the interval overflows.  After analysis each
  // overflow variable is set to 0 if it's corresponding bound variable is valid
  // -1 if overflowed off the bottom end, or +1 if overflowed off the top end.
  int LoOverflow = 0, HiOverflow = 0;
  ConstantInt *LoBound = 0, *HiBound = 0;
  
  
  if (!DivIsSigned) {  // udiv
    // e.g. X/5 op 3  --> [15, 20)
    LoBound = Prod;
    // If the product already overflowed, both bounds are off the top end;
    // otherwise only the upper bound (Prod + DivRHS) may overflow.
    HiOverflow = LoOverflow = ProdOV;
    if (!HiOverflow)
      HiOverflow = AddWithOverflow(HiBound, LoBound, DivRHS, false);
  } else if (DivRHS->getValue().isPositive()) { // Divisor is > 0.
    if (CmpRHSV == 0) {       // (X / pos) op 0
      // Can't overflow.  e.g.  X/2 op 0 --> [-1, 2)
      // Signed division truncates toward zero, so the quotient is 0 for
      // X in (-DivRHS, DivRHS); the inclusive low end is -(DivRHS-1).
      LoBound = cast<ConstantInt>(ConstantExpr::getNeg(SubOne(DivRHS)));
      HiBound = DivRHS;
    } else if (CmpRHSV.isPositive()) {   // (X / pos) op pos
      LoBound = Prod;     // e.g.   X/5 op 3 --> [15, 20)
      HiOverflow = LoOverflow = ProdOV;
      if (!HiOverflow)
        HiOverflow = AddWithOverflow(HiBound, Prod, DivRHS, true);
    } else {                       // (X / pos) op neg
      // e.g. X/5 op -3  --> [-15-4, -15+1) --> [-19, -14)
      // Truncation toward zero widens the interval on the negative side:
      // low end is Prod - (DivRHS-1), high (exclusive) end is Prod + 1.
      Constant *DivRHSH = ConstantExpr::getNeg(SubOne(DivRHS));
      LoOverflow = AddWithOverflow(LoBound, Prod,
                                   cast<ConstantInt>(DivRHSH), true) ? -1 : 0;
      HiBound = AddOne(Prod);
      HiOverflow = ProdOV ? -1 : 0;
    }
  } else {                         // Divisor is < 0.
    if (CmpRHSV == 0) {       // (X / neg) op 0
      // e.g. X/-5 op 0  --> [-4, 5)
      LoBound = AddOne(DivRHS);
      HiBound = cast<ConstantInt>(ConstantExpr::getNeg(DivRHS));
      if (HiBound == DivRHS) {     // -INTMIN = INTMIN
        HiOverflow = 1;            // [INTMIN+1, overflow)
        HiBound = 0;               // e.g. X/INTMIN = 0 --> X > INTMIN
      }
    } else if (CmpRHSV.isPositive()) {   // (X / neg) op pos
      // e.g. X/-5 op 3  --> [-19, -14)
      HiOverflow = LoOverflow = ProdOV ? -1 : 0;
      if (!LoOverflow)
        LoOverflow = AddWithOverflow(LoBound, Prod, AddOne(DivRHS), true) ?-1:0;
      HiBound = AddOne(Prod);
    } else {                       // (X / neg) op neg
      // e.g. X/-5 op -3  --> [15, 20)
      LoBound = Prod;
      LoOverflow = HiOverflow = ProdOV ? 1 : 0;
      HiBound = Subtract(Prod, DivRHS);
    }
    
    // Dividing by a negative swaps the condition.  LT <-> GT
    Pred = ICmpInst::getSwappedPredicate(Pred);
  }

  // X is the dividend; all replacements below compare X directly against one
  // (or both, via InsertRangeTest) of the computed interval bounds.
  Value *X = DivI->getOperand(0);
  switch (Pred) {
  default: assert(0 && "Unhandled icmp opcode!");
  case ICmpInst::ICMP_EQ:
    // Quotient equals CmpRHS iff X is inside [LoBound, HiBound).  A bound
    // that overflowed means that side of the range is unbounded.
    if (LoOverflow && HiOverflow)
      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
    else if (HiOverflow)
      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : 
                          ICmpInst::ICMP_UGE, X, LoBound);
    else if (LoOverflow)
      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : 
                          ICmpInst::ICMP_ULT, X, HiBound);
    else
      return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, true, ICI);
  case ICmpInst::ICMP_NE:
    // Inverse of the EQ case: true iff X is outside [LoBound, HiBound).
    if (LoOverflow && HiOverflow)
      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
    else if (HiOverflow)
      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SLT : 
                          ICmpInst::ICMP_ULT, X, LoBound);
    else if (LoOverflow)
      return new ICmpInst(DivIsSigned ? ICmpInst::ICMP_SGE : 
                          ICmpInst::ICMP_UGE, X, HiBound);
    else
      return InsertRangeTest(X, LoBound, HiBound, DivIsSigned, false, ICI);
  case ICmpInst::ICMP_ULT:
  case ICmpInst::ICMP_SLT:
    // (X/C1) < C2  -->  X < LoBound, unless LoBound fell outside the type's
    // range, in which case the result is a constant.
    if (LoOverflow == +1)   // Low bound is greater than input range.
      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
    if (LoOverflow == -1)   // Low bound is less than input range.
      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
    return new ICmpInst(Pred, X, LoBound);
  case ICmpInst::ICMP_UGT:
  case ICmpInst::ICMP_SGT:
    // (X/C1) > C2  -->  X >= HiBound (HiBound is exclusive, so GT on the
    // quotient becomes GE on the dividend), with constant folds on overflow.
    if (HiOverflow == +1)       // High bound greater than input range.
      return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
    else if (HiOverflow == -1)  // High bound less than input range.
      return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
    if (Pred == ICmpInst::ICMP_UGT)
      return new ICmpInst(ICmpInst::ICMP_UGE, X, HiBound);
    else
      return new ICmpInst(ICmpInst::ICMP_SGE, X, HiBound);
  }
}
+
+
+/// visitICmpInstWithInstAndIntCst - Handle "icmp (instr, intcst)".
+///
+Instruction *InstCombiner::visitICmpInstWithInstAndIntCst(ICmpInst &ICI,
+                                                          Instruction *LHSI,
+                                                          ConstantInt *RHS) {
+  const APInt &RHSV = RHS->getValue();
+  
+  switch (LHSI->getOpcode()) {
+  case Instruction::Xor:         // (icmp pred (xor X, XorCST), CI)
+    if (ConstantInt *XorCST = dyn_cast<ConstantInt>(LHSI->getOperand(1))) {
+      // If this is a comparison that tests the signbit (X < 0) or (x > -1),
+      // fold the xor.
+      if (ICI.getPredicate() == ICmpInst::ICMP_SLT && RHSV == 0 ||
+          ICI.getPredicate() == ICmpInst::ICMP_SGT && RHSV.isAllOnesValue()) {
+        Value *CompareVal = LHSI->getOperand(0);
+        
+        // If the sign bit of the XorCST is not set, there is no change to
+        // the operation, just stop using the Xor.
+        if (!XorCST->getValue().isNegative()) {
+          ICI.setOperand(0, CompareVal);
+          AddToWorkList(LHSI);
+          return &ICI;
+        }
+        
+        // Was the old condition true if the operand is positive?
+        bool isTrueIfPositive = ICI.getPredicate() == ICmpInst::ICMP_SGT;
+        
+        // If so, the new one isn't.
+        isTrueIfPositive ^= true;
+        
+        if (isTrueIfPositive)
+          return new ICmpInst(ICmpInst::ICMP_SGT, CompareVal, SubOne(RHS));
+        else
+          return new ICmpInst(ICmpInst::ICMP_SLT, CompareVal, AddOne(RHS));
+      }
+    }
+    break;
+  case Instruction::And:         // (icmp pred (and X, AndCST), RHS)
+    if (LHSI->hasOneUse() && isa<ConstantInt>(LHSI->getOperand(1)) &&
+        LHSI->getOperand(0)->hasOneUse()) {
+      ConstantInt *AndCST = cast<ConstantInt>(LHSI->getOperand(1));
+      
+      // If the LHS is an AND of a truncating cast, we can widen the
+      // and/compare to be the input width without changing the value
+      // produced, eliminating a cast.
+      if (TruncInst *Cast = dyn_cast<TruncInst>(LHSI->getOperand(0))) {
+        // We can do this transformation if either the AND constant does not
+        // have its sign bit set or if it is an equality comparison. 
+        // Extending a relational comparison when we're checking the sign
+        // bit would not work.
+        if (Cast->hasOneUse() &&
+            (ICI.isEquality() || AndCST->getValue().isPositive() && 
+             RHSV.isPositive())) {
+          uint32_t BitWidth = 
+            cast<IntegerType>(Cast->getOperand(0)->getType())->getBitWidth();
+          APInt NewCST = AndCST->getValue();
+          NewCST.zext(BitWidth);
+          APInt NewCI = RHSV;
+          NewCI.zext(BitWidth);
+          Instruction *NewAnd = 
+            BinaryOperator::createAnd(Cast->getOperand(0),
+                                      ConstantInt::get(NewCST),LHSI->getName());
+          InsertNewInstBefore(NewAnd, ICI);
+          return new ICmpInst(ICI.getPredicate(), NewAnd,
+                              ConstantInt::get(NewCI));
+        }
+      }
+      
+      // If this is: (X >> C1) & C2 != C3 (where any shift and any compare
+      // could exist), turn it into (X & (C2 << C1)) != (C3 << C1).  This
+      // happens a LOT in code produced by the C front-end, for bitfield
+      // access.
+      BinaryOperator *Shift = dyn_cast<BinaryOperator>(LHSI->getOperand(0));
+      if (Shift && !Shift->isShift())
+        Shift = 0;
+      
+      ConstantInt *ShAmt;
+      ShAmt = Shift ? dyn_cast<ConstantInt>(Shift->getOperand(1)) : 0;
+      const Type *Ty = Shift ? Shift->getType() : 0;  // Type of the shift.
+      const Type *AndTy = AndCST->getType();          // Type of the and.
+      
+      // We can fold this as long as we can't shift unknown bits
+      // into the mask.  This can only happen with signed shift
+      // rights, as they sign-extend.
+      if (ShAmt) {
+        bool CanFold = Shift->isLogicalShift();
+        if (!CanFold) {
+          // To test for the bad case of the signed shr, see if any
+          // of the bits shifted in could be tested after the mask.
+          uint32_t TyBits = Ty->getPrimitiveSizeInBits();
+          int ShAmtVal = TyBits - ShAmt->getLimitedValue(TyBits);
+          
+          uint32_t BitWidth = AndTy->getPrimitiveSizeInBits();
+          if ((APInt::getHighBitsSet(BitWidth, BitWidth-ShAmtVal) & 
+               AndCST->getValue()) == 0)
+            CanFold = true;
+        }
+        
+        if (CanFold) {
+          Constant *NewCst;
+          if (Shift->getOpcode() == Instruction::Shl)
+            NewCst = ConstantExpr::getLShr(RHS, ShAmt);
+          else
+            NewCst = ConstantExpr::getShl(RHS, ShAmt);
+          
+          // Check to see if we are shifting out any of the bits being
+          // compared.
+          if (ConstantExpr::get(Shift->getOpcode(), NewCst, ShAmt) != RHS) {
+            // If we shifted bits out, the fold is not going to work out.
+            // As a special case, check to see if this means that the
+            // result is always true or false now.
+            if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+              return ReplaceInstUsesWith(ICI, ConstantInt::getFalse());
+            if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+              return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+          } else {
+            ICI.setOperand(1, NewCst);
+            Constant *NewAndCST;
+            if (Shift->getOpcode() == Instruction::Shl)
+              NewAndCST = ConstantExpr::getLShr(AndCST, ShAmt);
+            else
+              NewAndCST = ConstantExpr::getShl(AndCST, ShAmt);
+            LHSI->setOperand(1, NewAndCST);
+            LHSI->setOperand(0, Shift->getOperand(0));
+            AddToWorkList(Shift); // Shift is dead.
+            AddUsesToWorkList(ICI);
+            return &ICI;
+          }
+        }
+      }
+      
+      // Turn ((X >> Y) & C) == 0  into  (X & (C << Y)) == 0.  The later is
+      // preferable because it allows the C<<Y expression to be hoisted out
+      // of a loop if Y is invariant and X is not.
+      if (Shift && Shift->hasOneUse() && RHSV == 0 &&
+          ICI.isEquality() && !Shift->isArithmeticShift() &&
+          isa<Instruction>(Shift->getOperand(0))) {
+        // Compute C << Y.
+        Value *NS;
+        if (Shift->getOpcode() == Instruction::LShr) {
+          NS = BinaryOperator::createShl(AndCST, 
+                                         Shift->getOperand(1), "tmp");
+        } else {
+          // Insert a logical shift.
+          NS = BinaryOperator::createLShr(AndCST,
+                                          Shift->getOperand(1), "tmp");
+        }
+        InsertNewInstBefore(cast<Instruction>(NS), ICI);
+        
+        // Compute X & (C << Y).
+        Instruction *NewAnd = 
+          BinaryOperator::createAnd(Shift->getOperand(0), NS, LHSI->getName());
+        InsertNewInstBefore(NewAnd, ICI);
+        
+        ICI.setOperand(0, NewAnd);
+        return &ICI;
+      }
+    }
+    break;
+    
+  case Instruction::Shl: {       // (icmp pred (shl X, ShAmt), CI)
+    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!ShAmt) break;
+    
+    uint32_t TypeBits = RHSV.getBitWidth();
+    
+    // Check that the shift amount is in range.  If not, don't perform
+    // undefined shifts.  When the shift is visited it will be
+    // simplified.
+    if (ShAmt->uge(TypeBits))
+      break;
+    
+    if (ICI.isEquality()) {
+      // If we are comparing against bits always shifted out, the
+      // comparison cannot succeed.
+      Constant *Comp =
+        ConstantExpr::getShl(ConstantExpr::getLShr(RHS, ShAmt), ShAmt);
+      if (Comp != RHS) {// Comparing against a bit that we know is zero.
+        bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+        Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+        return ReplaceInstUsesWith(ICI, Cst);
+      }
+      
+      if (LHSI->hasOneUse()) {
+        // Otherwise strength reduce the shift into an and.
+        uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+        Constant *Mask =
+          ConstantInt::get(APInt::getLowBitsSet(TypeBits, TypeBits-ShAmtVal));
+        
+        Instruction *AndI =
+          BinaryOperator::createAnd(LHSI->getOperand(0),
+                                    Mask, LHSI->getName()+".mask");
+        Value *And = InsertNewInstBefore(AndI, ICI);
+        return new ICmpInst(ICI.getPredicate(), And,
+                            ConstantInt::get(RHSV.lshr(ShAmtVal)));
+      }
+    }
+    
+    // Otherwise, if this is a comparison of the sign bit, simplify to and/test.
+    bool TrueIfSigned = false;
+    if (LHSI->hasOneUse() &&
+        isSignBitCheck(ICI.getPredicate(), RHS, TrueIfSigned)) {
+      // (X << 31) <s 0  --> (X&1) != 0
+      Constant *Mask = ConstantInt::get(APInt(TypeBits, 1) <<
+                                           (TypeBits-ShAmt->getZExtValue()-1));
+      Instruction *AndI =
+        BinaryOperator::createAnd(LHSI->getOperand(0),
+                                  Mask, LHSI->getName()+".mask");
+      Value *And = InsertNewInstBefore(AndI, ICI);
+      
+      return new ICmpInst(TrueIfSigned ? ICmpInst::ICMP_NE : ICmpInst::ICMP_EQ,
+                          And, Constant::getNullValue(And->getType()));
+    }
+    break;
+  }
+    
+  case Instruction::LShr:         // (icmp pred (shr X, ShAmt), CI)
+  case Instruction::AShr: {
+    ConstantInt *ShAmt = dyn_cast<ConstantInt>(LHSI->getOperand(1));
+    if (!ShAmt) break;
+
+    if (ICI.isEquality()) {
+      // Check that the shift amount is in range.  If not, don't perform
+      // undefined shifts.  When the shift is visited it will be
+      // simplified.
+      uint32_t TypeBits = RHSV.getBitWidth();
+      if (ShAmt->uge(TypeBits))
+        break;
+      uint32_t ShAmtVal = (uint32_t)ShAmt->getLimitedValue(TypeBits);
+      
+      // If we are comparing against bits always shifted out, the
+      // comparison cannot succeed.
+      APInt Comp = RHSV << ShAmtVal;
+      if (LHSI->getOpcode() == Instruction::LShr)
+        Comp = Comp.lshr(ShAmtVal);
+      else
+        Comp = Comp.ashr(ShAmtVal);
+      
+      if (Comp != RHSV) { // Comparing against a bit that we know is zero.
+        bool IsICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+        Constant *Cst = ConstantInt::get(Type::Int1Ty, IsICMP_NE);
+        return ReplaceInstUsesWith(ICI, Cst);
+      }
+      
+      if (LHSI->hasOneUse() || RHSV == 0) {
+        // Otherwise strength reduce the shift into an and.
+        APInt Val(APInt::getHighBitsSet(TypeBits, TypeBits - ShAmtVal));
+        Constant *Mask = ConstantInt::get(Val);
+        
+        Instruction *AndI =
+          BinaryOperator::createAnd(LHSI->getOperand(0),
+                                    Mask, LHSI->getName()+".mask");
+        Value *And = InsertNewInstBefore(AndI, ICI);
+        return new ICmpInst(ICI.getPredicate(), And,
+                            ConstantExpr::getShl(RHS, ShAmt));
+      }
+    }
+    break;
+  }
+    
+  case Instruction::SDiv:
+  case Instruction::UDiv:
+    // Fold: icmp pred ([us]div X, C1), C2 -> range test
+    // Fold this div into the comparison, producing a range check. 
+    // Determine, based on the divide type, what the range is being 
+    // checked.  If there is an overflow on the low or high side, remember 
+    // it, otherwise compute the range [low, hi) bounding the new value.
+    // See: InsertRangeTest above for the kinds of replacements possible.
+    if (ConstantInt *DivRHS = dyn_cast<ConstantInt>(LHSI->getOperand(1)))
+      if (Instruction *R = FoldICmpDivCst(ICI, cast<BinaryOperator>(LHSI),
+                                          DivRHS))
+        return R;
+    break;
+  }
+  
+  // Simplify icmp_eq and icmp_ne instructions with integer constant RHS.
+  if (ICI.isEquality()) {
+    bool isICMP_NE = ICI.getPredicate() == ICmpInst::ICMP_NE;
+    
+    // If the first operand is (add|sub|and|or|xor|rem) with a constant, and 
+    // the second operand is a constant, simplify a bit.
+    if (BinaryOperator *BO = dyn_cast<BinaryOperator>(LHSI)) {
+      switch (BO->getOpcode()) {
+      case Instruction::SRem:
+        // If we have a signed (X % (2^c)) == 0, turn it into an unsigned one.
+        if (RHSV == 0 && isa<ConstantInt>(BO->getOperand(1)) &&BO->hasOneUse()){
+          const APInt &V = cast<ConstantInt>(BO->getOperand(1))->getValue();
+          if (V.sgt(APInt(V.getBitWidth(), 1)) && V.isPowerOf2()) {
+            Instruction *NewRem =
+              BinaryOperator::createURem(BO->getOperand(0), BO->getOperand(1),
+                                         BO->getName());
+            InsertNewInstBefore(NewRem, ICI);
+            return new ICmpInst(ICI.getPredicate(), NewRem, 
+                                Constant::getNullValue(BO->getType()));
+          }
+        }
+        break;
+      case Instruction::Add:
+        // Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
+        if (ConstantInt *BOp1C = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          if (BO->hasOneUse())
+            return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                                Subtract(RHS, BOp1C));
+        } else if (RHSV == 0) {
+          // Replace ((add A, B) != 0) with (A != -B) if A or B is
+          // efficiently invertible, or if the add has just this one use.
+          Value *BOp0 = BO->getOperand(0), *BOp1 = BO->getOperand(1);
+          
+          if (Value *NegVal = dyn_castNegVal(BOp1))
+            return new ICmpInst(ICI.getPredicate(), BOp0, NegVal);
+          else if (Value *NegVal = dyn_castNegVal(BOp0))
+            return new ICmpInst(ICI.getPredicate(), NegVal, BOp1);
+          else if (BO->hasOneUse()) {
+            Instruction *Neg = BinaryOperator::createNeg(BOp1);
+            InsertNewInstBefore(Neg, ICI);
+            Neg->takeName(BO);
+            return new ICmpInst(ICI.getPredicate(), BOp0, Neg);
+          }
+        }
+        break;
+      case Instruction::Xor:
+        // For the xor case, we can xor two constants together, eliminating
+        // the explicit xor.
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1)))
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0), 
+                              ConstantExpr::getXor(RHS, BOC));
+        
+        // FALLTHROUGH
+      case Instruction::Sub:
+        // Replace (([sub|xor] A, B) != 0) with (A != B)
+        if (RHSV == 0)
+          return new ICmpInst(ICI.getPredicate(), BO->getOperand(0),
+                              BO->getOperand(1));
+        break;
+        
+      case Instruction::Or:
+        // If bits are being or'd in that are not present in the constant we
+        // are comparing against, then the comparison could never succeed!
+        if (Constant *BOC = dyn_cast<Constant>(BO->getOperand(1))) {
+          Constant *NotCI = ConstantExpr::getNot(RHS);
+          if (!ConstantExpr::getAnd(BOC, NotCI)->isNullValue())
+            return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty, 
+                                                             isICMP_NE));
+        }
+        break;
+        
+      case Instruction::And:
+        if (ConstantInt *BOC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          // If bits are being compared against that are and'd out, then the
+          // comparison can never succeed!
+          if ((RHSV & ~BOC->getValue()) != 0)
+            return ReplaceInstUsesWith(ICI, ConstantInt::get(Type::Int1Ty,
+                                                             isICMP_NE));
+          
+          // If we have ((X & C) == C), turn it into ((X & C) != 0).
+          if (RHS == BOC && RHSV.isPowerOf2())
+            return new ICmpInst(isICMP_NE ? ICmpInst::ICMP_EQ :
+                                ICmpInst::ICMP_NE, LHSI,
+                                Constant::getNullValue(RHS->getType()));
+          
+          // Replace (and X, (1 << size(X)-1) != 0) with x s< 0
+          if (isSignBit(BOC)) {
+            Value *X = BO->getOperand(0);
+            Constant *Zero = Constant::getNullValue(X->getType());
+            ICmpInst::Predicate pred = isICMP_NE ? 
+              ICmpInst::ICMP_SLT : ICmpInst::ICMP_SGE;
+            return new ICmpInst(pred, X, Zero);
+          }
+          
+          // ((X & ~7) == 0) --> X < 8
+          if (RHSV == 0 && isHighOnes(BOC)) {
+            Value *X = BO->getOperand(0);
+            Constant *NegX = ConstantExpr::getNeg(BOC);
+            ICmpInst::Predicate pred = isICMP_NE ? 
+              ICmpInst::ICMP_UGE : ICmpInst::ICMP_ULT;
+            return new ICmpInst(pred, X, NegX);
+          }
+        }
+      default: break;
+      }
+    } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(LHSI)) {
+      // Handle icmp {eq|ne} <intrinsic>, intcst.
+      if (II->getIntrinsicID() == Intrinsic::bswap) {
+        AddToWorkList(II);
+        ICI.setOperand(0, II->getOperand(1));
+        ICI.setOperand(1, ConstantInt::get(RHSV.byteSwap()));
+        return &ICI;
+      }
+    }
+  } else {  // Not a ICMP_EQ/ICMP_NE
+            // If the LHS is a cast from an integral value of the same size, 
+            // then since we know the RHS is a constant, try to simlify.
+    if (CastInst *Cast = dyn_cast<CastInst>(LHSI)) {
+      Value *CastOp = Cast->getOperand(0);
+      const Type *SrcTy = CastOp->getType();
+      uint32_t SrcTySize = SrcTy->getPrimitiveSizeInBits();
+      if (SrcTy->isInteger() && 
+          SrcTySize == Cast->getType()->getPrimitiveSizeInBits()) {
+        // If this is an unsigned comparison, try to make the comparison use
+        // smaller constant values.
+        if (ICI.getPredicate() == ICmpInst::ICMP_ULT && RHSV.isSignBit()) {
+          // X u< 128 => X s> -1
+          return new ICmpInst(ICmpInst::ICMP_SGT, CastOp, 
+                           ConstantInt::get(APInt::getAllOnesValue(SrcTySize)));
+        } else if (ICI.getPredicate() == ICmpInst::ICMP_UGT &&
+                   RHSV == APInt::getSignedMaxValue(SrcTySize)) {
+          // X u> 127 => X s< 0
+          return new ICmpInst(ICmpInst::ICMP_SLT, CastOp, 
+                              Constant::getNullValue(SrcTy));
+        }
+      }
+    }
+  }
+  return 0;
+}
+
+/// visitICmpInstWithCastAndCast - Handle icmp (cast x to y), (cast/cst).
+/// We only handle extending casts so far.  Returns a replacement instruction
+/// (not yet inserted into the program) or null if no simplification applies.
+///
+Instruction *InstCombiner::visitICmpInstWithCastAndCast(ICmpInst &ICI) {
+  const CastInst *LHSCI = cast<CastInst>(ICI.getOperand(0));
+  Value *LHSCIOp        = LHSCI->getOperand(0);
+  const Type *SrcTy     = LHSCIOp->getType();
+  const Type *DestTy    = LHSCI->getType();
+  Value *RHSCIOp;
+
+  // Turn icmp (ptrtoint x), (ptrtoint/c) into a compare of the input if the 
+  // integer type is the same size as the pointer type.
+  if (LHSCI->getOpcode() == Instruction::PtrToInt &&
+      getTargetData().getPointerSizeInBits() == 
+         cast<IntegerType>(DestTy)->getBitWidth()) {
+    Value *RHSOp = 0;
+    if (Constant *RHSC = dyn_cast<Constant>(ICI.getOperand(1))) {
+      // Constant RHS: fold the inttoptr at compile time.
+      RHSOp = ConstantExpr::getIntToPtr(RHSC, SrcTy);
+    } else if (PtrToIntInst *RHSC = dyn_cast<PtrToIntInst>(ICI.getOperand(1))) {
+      RHSOp = RHSC->getOperand(0);
+      // If the pointer types don't match, insert a bitcast.
+      if (LHSCIOp->getType() != RHSOp->getType())
+        RHSOp = InsertCastBefore(Instruction::BitCast, RHSOp,
+                                 LHSCIOp->getType(), ICI);
+    }
+
+    if (RHSOp)
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSOp);
+  }
+  
+  // The code below only handles extension cast instructions, so far.
+  // Enforce this.
+  if (LHSCI->getOpcode() != Instruction::ZExt &&
+      LHSCI->getOpcode() != Instruction::SExt)
+    return 0;
+
+  bool isSignedExt = LHSCI->getOpcode() == Instruction::SExt;
+  bool isSignedCmp = ICI.isSignedPredicate();
+
+  if (CastInst *CI = dyn_cast<CastInst>(ICI.getOperand(1))) {
+    // Not an extension from the same type?
+    RHSCIOp = CI->getOperand(0);
+    if (RHSCIOp->getType() != LHSCIOp->getType()) 
+      return 0;
+    
+    // If the signedness of the two compares doesn't agree (i.e. one is a sext
+    // and the other is a zext), then we can't handle this.
+    if (CI->getOpcode() != LHSCI->getOpcode())
+      return 0;
+
+    // Likewise, if the signedness of the [sz]exts and the compare don't match, 
+    // then we can't handle this.
+    if (isSignedExt != isSignedCmp && !ICI.isEquality())
+      return 0;
+    
+    // Okay, just insert a compare of the reduced operands now!
+    return new ICmpInst(ICI.getPredicate(), LHSCIOp, RHSCIOp);
+  }
+
+  // If we aren't dealing with a constant on the RHS, exit early
+  ConstantInt *CI = dyn_cast<ConstantInt>(ICI.getOperand(1));
+  if (!CI)
+    return 0;
+
+  // Compute the constant that would happen if we truncated to SrcTy then
+  // reextended to DestTy.
+  Constant *Res1 = ConstantExpr::getTrunc(CI, SrcTy);
+  Constant *Res2 = ConstantExpr::getCast(LHSCI->getOpcode(), Res1, DestTy);
+
+  // If the re-extended constant didn't change...
+  if (Res2 == CI) {
+    // Make sure that sign of the Cmp and the sign of the Cast are the same.
+    // For example, we might have:
+    //    %A = sext short %X to uint
+    //    %B = icmp ugt uint %A, 1330
+    // It is incorrect to transform this into 
+    //    %B = icmp ugt short %X, 1330 
+    // because %A may have negative value. 
+    //
+    // However, it is OK if SrcTy is bool (See cast-set.ll testcase)
+    // OR operation is EQ/NE.
+    if (isSignedExt == isSignedCmp || SrcTy == Type::Int1Ty || ICI.isEquality())
+      return new ICmpInst(ICI.getPredicate(), LHSCIOp, Res1);
+    else
+      return 0;
+  }
+
+  // The re-extended constant changed so the constant cannot be represented 
+  // in the shorter type. Consequently, we cannot emit a simple comparison.
+
+  // First, handle some easy cases. We know the result cannot be equal at this
+  // point so handle the ICI.isEquality() cases
+  if (ICI.getPredicate() == ICmpInst::ICMP_EQ)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getFalse())\u003b
+  if (ICI.getPredicate() == ICmpInst::ICMP_NE)
+    return ReplaceInstUsesWith(ICI, ConstantInt::getTrue());
+
+  // Evaluate the comparison for LT (we invert for GT below). LE and GE cases
+  // should have been folded away previously and not enter in here.
+  Value *Result;
+  if (isSignedCmp) {
+    // We're performing a signed comparison.
+    if (cast<ConstantInt>(CI)->getValue().isNegative())
+      Result = ConstantInt::getFalse();          // X < (small) --> false
+    else
+      Result = ConstantInt::getTrue();           // X < (large) --> true
+  } else {
+    // We're performing an unsigned comparison.
+    if (isSignedExt) {
+      // We're performing an unsigned comp with a sign extended value.
+      // This is true if the input is >= 0. [aka >s -1]
+      Constant *NegOne = ConstantInt::getAllOnesValue(SrcTy);
+      Result = InsertNewInstBefore(new ICmpInst(ICmpInst::ICMP_SGT, LHSCIOp,
+                                   NegOne, ICI.getName()), ICI);
+    } else {
+      // Unsigned extend & unsigned compare -> always true.
+      Result = ConstantInt::getTrue();
+    }
+  }
+
+  // Finally, return the value computed.  Result holds the LT answer; for GT
+  // predicates we return its logical negation.
+  if (ICI.getPredicate() == ICmpInst::ICMP_ULT ||
+      ICI.getPredicate() == ICmpInst::ICMP_SLT) {
+    return ReplaceInstUsesWith(ICI, Result);
+  } else {
+    assert((ICI.getPredicate()==ICmpInst::ICMP_UGT || 
+            ICI.getPredicate()==ICmpInst::ICMP_SGT) &&
+           "ICmp should be folded!");
+    if (Constant *CI = dyn_cast<Constant>(Result))
+      return ReplaceInstUsesWith(ICI, ConstantExpr::getNot(CI));
+    else
+      return BinaryOperator::createNot(Result);
+  }
+}
+
+/// visitShl - Visitor for shl; all shift simplification logic is shared in
+/// commonShiftTransforms.
+Instruction *InstCombiner::visitShl(BinaryOperator &I) {
+  return commonShiftTransforms(I);
+}
+
+/// visitLShr - Visitor for lshr; all shift simplification logic is shared in
+/// commonShiftTransforms.
+Instruction *InstCombiner::visitLShr(BinaryOperator &I) {
+  return commonShiftTransforms(I);
+}
+
+/// visitAShr - Visitor for ashr; all shift simplification logic is shared in
+/// commonShiftTransforms.
+Instruction *InstCombiner::visitAShr(BinaryOperator &I) {
+  return commonShiftTransforms(I);
+}
+
+/// commonShiftTransforms - Simplifications shared by shl, lshr and ashr.
+/// Returns a replacement instruction (for the caller to insert), or null when
+/// no transform applies.  The checks below are ordered; earlier, simpler
+/// folds take priority over the constant-shift folding at the end.
+Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
+  assert(I.getOperand(1)->getType() == I.getOperand(0)->getType());
+  Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+
+  // shl X, 0 == X and shr X, 0 == X
+  // shl 0, X == 0 and shr 0, X == 0
+  // (In the second case Op0 is itself the zero value, so returning Op0 is
+  // correct for both.)
+  if (Op1 == Constant::getNullValue(Op1->getType()) ||
+      Op0 == Constant::getNullValue(Op0->getType()))
+    return ReplaceInstUsesWith(I, Op0);
+  
+  if (isa<UndefValue>(Op0)) {            
+    if (I.getOpcode() == Instruction::AShr) // undef >>s X -> undef
+      return ReplaceInstUsesWith(I, Op0);
+    else                                    // undef << X -> 0, undef >>u X -> 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+  if (isa<UndefValue>(Op1)) {
+    if (I.getOpcode() == Instruction::AShr)  // X >>s undef -> X
+      return ReplaceInstUsesWith(I, Op0);          
+    else                                     // X << undef, X >>u undef -> 0
+      return ReplaceInstUsesWith(I, Constant::getNullValue(I.getType()));
+  }
+
+  // ashr int -1, X = -1   (for any arithmetic shift rights of ~0)
+  if (I.getOpcode() == Instruction::AShr)
+    if (ConstantInt *CSI = dyn_cast<ConstantInt>(Op0))
+      if (CSI->isAllOnesValue())
+        return ReplaceInstUsesWith(I, CSI);
+
+  // Try to fold constant and into select arguments.
+  if (isa<Constant>(Op0))
+    if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
+      if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+        return R;
+
+  // See if we can turn a signed shr into an unsigned shr: if the sign bit of
+  // Op0 is known to be zero, ashr and lshr produce identical results.
+  if (I.isArithmeticShift()) {
+    if (MaskedValueIsZero(Op0, 
+          APInt::getSignBit(I.getType()->getPrimitiveSizeInBits()))) {
+      return BinaryOperator::createLShr(Op0, Op1, I.getName());
+    }
+  }
+
+  // Finally, hand constant shift amounts to the heavyweight folding logic.
+  if (ConstantInt *CUI = dyn_cast<ConstantInt>(Op1))
+    if (Instruction *Res = FoldShiftByConstant(Op0, CUI, I))
+      return Res;
+  return 0;
+}
+
+/// FoldShiftByConstant - Fold 'I', a shift of Op0 by the constant amount Op1.
+/// Handles demanded-bit simplification, over-large shift amounts, pulling
+/// shifts through binary operators, and combining shift-of-shift pairs.
+/// Returns a replacement instruction, &I when I was modified in place, or
+/// null when nothing applied.
+Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, ConstantInt *Op1,
+                                               BinaryOperator &I) {
+  bool isLeftShift    = I.getOpcode() == Instruction::Shl;
+
+  // See if we can simplify any instructions used by the instruction whose sole 
+  // purpose is to compute bits we don't care about.
+  uint32_t TypeBits = Op0->getType()->getPrimitiveSizeInBits();
+  APInt KnownZero(TypeBits, 0), KnownOne(TypeBits, 0);
+  if (SimplifyDemandedBits(&I, APInt::getAllOnesValue(TypeBits),
+                           KnownZero, KnownOne))
+    return &I;
+  
+  // shl uint X, 32 = 0 and shr ubyte Y, 9 = 0, ... just don't eliminate shr
+  // of a signed value.
+  //
+  if (Op1->uge(TypeBits)) {
+    if (I.getOpcode() != Instruction::AShr)
+      return ReplaceInstUsesWith(I, Constant::getNullValue(Op0->getType()));
+    else {
+      // Clamp an over-wide ashr to TypeBits-1, which has the same effect.
+      I.setOperand(1, ConstantInt::get(I.getType(), TypeBits-1));
+      return &I;
+    }
+  }
+  
+  // From here on, Op1 is known to be a valid in-range shift amount.
+  // ((X*C1) << C2) == (X * (C1 << C2))
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Op0))
+    if (BO->getOpcode() == Instruction::Mul && isLeftShift)
+      if (Constant *BOOp = dyn_cast<Constant>(BO->getOperand(1)))
+        return BinaryOperator::createMul(BO->getOperand(0),
+                                         ConstantExpr::getShl(BOOp, Op1));
+  
+  // Try to fold constant and into select arguments.
+  if (SelectInst *SI = dyn_cast<SelectInst>(Op0))
+    if (Instruction *R = FoldOpIntoSelect(I, SI, this))
+      return R;
+  if (isa<PHINode>(Op0))
+    if (Instruction *NV = FoldOpIntoPhi(I))
+      return NV;
+  
+  if (Op0->hasOneUse()) {
+    if (BinaryOperator *Op0BO = dyn_cast<BinaryOperator>(Op0)) {
+      // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+      Value *V1, *V2;
+      ConstantInt *CC;
+      switch (Op0BO->getOpcode()) {
+        default: break;
+        case Instruction::Add:
+        case Instruction::And:
+        case Instruction::Or:
+        case Instruction::Xor: {
+          // These operators commute.
+          // Turn (Y + (X >> C)) << C  ->  (X + (Y << C)) & (~0 << C)
+          if (isLeftShift && Op0BO->getOperand(1)->hasOneUse() &&
+              match(Op0BO->getOperand(1),
+                    m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
+            Instruction *YS = BinaryOperator::createShl(
+                                            Op0BO->getOperand(0), Op1,
+                                            Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *X = 
+              BinaryOperator::create(Op0BO->getOpcode(), YS, V1,
+                                     Op0BO->getOperand(1)->getName());
+            InsertNewInstBefore(X, I);  // (X + (Y << C))
+            uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+            return BinaryOperator::createAnd(X, ConstantInt::get(
+                       APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+          }
+          
+          // Turn (Y + ((X >> C) & CC)) << C  ->  ((X & (CC << C)) + (Y << C))
+          Value *Op0BOOp1 = Op0BO->getOperand(1);
+          if (isLeftShift && Op0BOOp1->hasOneUse() &&
+              match(Op0BOOp1, 
+                    m_And(m_Shr(m_Value(V1), m_Value(V2)),m_ConstantInt(CC))) &&
+              cast<BinaryOperator>(Op0BOOp1)->getOperand(0)->hasOneUse() &&
+              V2 == Op1) {
+            Instruction *YS = BinaryOperator::createShl(
+                                                     Op0BO->getOperand(0), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *XM =
+              BinaryOperator::createAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                        V1->getName()+".mask");
+            InsertNewInstBefore(XM, I); // X & (CC << C)
+            
+            return BinaryOperator::create(Op0BO->getOpcode(), YS, XM);
+          }
+        }
+          
+        // FALL THROUGH.
+        case Instruction::Sub: {
+          // Same patterns as above but with the shr on operand 0, since Sub
+          // does not commute.
+          // Turn ((X >> C) + Y) << C  ->  (X + (Y << C)) & (~0 << C)
+          if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+              match(Op0BO->getOperand(0),
+                    m_Shr(m_Value(V1), m_ConstantInt(CC))) && CC == Op1) {
+            Instruction *YS = BinaryOperator::createShl(
+                                                     Op0BO->getOperand(1), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *X =
+              BinaryOperator::create(Op0BO->getOpcode(), V1, YS,
+                                     Op0BO->getOperand(0)->getName());
+            InsertNewInstBefore(X, I);  // (X + (Y << C))
+            uint32_t Op1Val = Op1->getLimitedValue(TypeBits);
+            return BinaryOperator::createAnd(X, ConstantInt::get(
+                       APInt::getHighBitsSet(TypeBits, TypeBits-Op1Val)));
+          }
+          
+          // Turn (((X >> C)&CC) + Y) << C  ->  (X + (Y << C)) & (CC << C)
+          if (isLeftShift && Op0BO->getOperand(0)->hasOneUse() &&
+              match(Op0BO->getOperand(0),
+                    m_And(m_Shr(m_Value(V1), m_Value(V2)),
+                          m_ConstantInt(CC))) && V2 == Op1 &&
+              cast<BinaryOperator>(Op0BO->getOperand(0))
+                  ->getOperand(0)->hasOneUse()) {
+            Instruction *YS = BinaryOperator::createShl(
+                                                     Op0BO->getOperand(1), Op1,
+                                                     Op0BO->getName());
+            InsertNewInstBefore(YS, I); // (Y << C)
+            Instruction *XM =
+              BinaryOperator::createAnd(V1, ConstantExpr::getShl(CC, Op1),
+                                        V1->getName()+".mask");
+            InsertNewInstBefore(XM, I); // X & (CC << C)
+            
+            return BinaryOperator::create(Op0BO->getOpcode(), XM, YS);
+          }
+          
+          break;
+        }
+      }
+      
+      
+      // If the operand is an bitwise operator with a constant RHS, and the
+      // shift is the only use, we can pull it out of the shift.
+      if (ConstantInt *Op0C = dyn_cast<ConstantInt>(Op0BO->getOperand(1))) {
+        bool isValid = true;     // Valid only for And, Or, Xor
+        bool highBitSet = false; // Transform if high bit of constant set?
+        
+        switch (Op0BO->getOpcode()) {
+          default: isValid = false; break;   // Do not perform transform!
+          case Instruction::Add:
+            isValid = isLeftShift;
+            break;
+          case Instruction::Or:
+          case Instruction::Xor:
+            highBitSet = false;
+            break;
+          case Instruction::And:
+            highBitSet = true;
+            break;
+        }
+        
+        // If this is a signed shift right, and the high bit is modified
+        // by the logical operation, do not perform the transformation.
+        // The highBitSet boolean indicates the value of the high bit of
+        // the constant which would cause it to be modified for this
+        // operation.
+        //
+        if (isValid && !isLeftShift && I.getOpcode() == Instruction::AShr) {
+          isValid = Op0C->getValue()[TypeBits-1] == highBitSet;
+        }
+        
+        if (isValid) {
+          // Shift the constant through: (X op C) shift Op1
+          //   -> (X shift Op1) op (C shift Op1).
+          Constant *NewRHS = ConstantExpr::get(I.getOpcode(), Op0C, Op1);
+          
+          Instruction *NewShift =
+            BinaryOperator::create(I.getOpcode(), Op0BO->getOperand(0), Op1);
+          InsertNewInstBefore(NewShift, I);
+          NewShift->takeName(Op0BO);
+          
+          return BinaryOperator::create(Op0BO->getOpcode(), NewShift,
+                                        NewRHS);
+        }
+      }
+    }
+  }
+  
+  // Find out if this is a shift of a shift by a constant.
+  BinaryOperator *ShiftOp = dyn_cast<BinaryOperator>(Op0);
+  if (ShiftOp && !ShiftOp->isShift())
+    ShiftOp = 0;
+  
+  if (ShiftOp && isa<ConstantInt>(ShiftOp->getOperand(1))) {
+    ConstantInt *ShiftAmt1C = cast<ConstantInt>(ShiftOp->getOperand(1));
+    uint32_t ShiftAmt1 = ShiftAmt1C->getLimitedValue(TypeBits);
+    uint32_t ShiftAmt2 = Op1->getLimitedValue(TypeBits);
+    assert(ShiftAmt2 != 0 && "Should have been simplified earlier");
+    if (ShiftAmt1 == 0) return 0;  // Will be simplified in the future.
+    Value *X = ShiftOp->getOperand(0);
+    
+    uint32_t AmtSum = ShiftAmt1+ShiftAmt2;   // Fold into one big shift.
+    if (AmtSum > TypeBits)
+      AmtSum = TypeBits;
+    
+    const IntegerType *Ty = cast<IntegerType>(I.getType());
+    
+    // Check for (X << c1) << c2  and  (X >> c1) >> c2
+    if (I.getOpcode() == ShiftOp->getOpcode()) {
+      return BinaryOperator::create(I.getOpcode(), X,
+                                    ConstantInt::get(Ty, AmtSum));
+    } else if (ShiftOp->getOpcode() == Instruction::LShr &&
+               I.getOpcode() == Instruction::AShr) {
+      // ((X >>u C1) >>s C2) -> (X >>u (C1+C2))  since C1 != 0.
+      return BinaryOperator::createLShr(X, ConstantInt::get(Ty, AmtSum));
+    } else if (ShiftOp->getOpcode() == Instruction::AShr &&
+               I.getOpcode() == Instruction::LShr) {
+      // ((X >>s C1) >>u C2) -> ((X >>s (C1+C2)) & mask) since C1 != 0.
+      Instruction *Shift =
+        BinaryOperator::createAShr(X, ConstantInt::get(Ty, AmtSum));
+      InsertNewInstBefore(Shift, I);
+
+      APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+      return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+    }
+    
+    // Okay, if we get here, one shift must be left, and the other shift must be
+    // right.  See if the amounts are equal.
+    if (ShiftAmt1 == ShiftAmt2) {
+      // If we have ((X >>? C) << C), turn this into X & (-1 << C).
+      if (I.getOpcode() == Instruction::Shl) {
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt1));
+        return BinaryOperator::createAnd(X, ConstantInt::get(Mask));
+      }
+      // If we have ((X << C) >>u C), turn this into X & (-1 >>u C).
+      if (I.getOpcode() == Instruction::LShr) {
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt1));
+        return BinaryOperator::createAnd(X, ConstantInt::get(Mask));
+      }
+      // We can simplify ((X << C) >>s C) into a trunc + sext.
+      // NOTE: we could do this for any C, but that would make 'unusual' integer
+      // types.  For now, just stick to ones well-supported by the code
+      // generators.
+      const Type *SExtType = 0;
+      switch (Ty->getBitWidth() - ShiftAmt1) {
+      case 1  :
+      case 8  :
+      case 16 :
+      case 32 :
+      case 64 :
+      case 128:
+        SExtType = IntegerType::get(Ty->getBitWidth() - ShiftAmt1);
+        break;
+      default: break;
+      }
+      if (SExtType) {
+        Instruction *NewTrunc = new TruncInst(X, SExtType, "sext");
+        InsertNewInstBefore(NewTrunc, I);
+        return new SExtInst(NewTrunc, Ty);
+      }
+      // Otherwise, we can't handle it yet.
+    } else if (ShiftAmt1 < ShiftAmt2) {
+      uint32_t ShiftDiff = ShiftAmt2-ShiftAmt1;
+      
+      // (X >>? C1) << C2 --> X << (C2-C1) & (-1 << C2)
+      if (I.getOpcode() == Instruction::Shl) {
+        assert(ShiftOp->getOpcode() == Instruction::LShr ||
+               ShiftOp->getOpcode() == Instruction::AShr);
+        Instruction *Shift =
+          BinaryOperator::createShl(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // (X << C1) >>u C2  --> X >>u (C2-C1) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr) {
+        assert(ShiftOp->getOpcode() == Instruction::Shl);
+        Instruction *Shift =
+          BinaryOperator::createLShr(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // We can't handle (X << C1) >>s C2, it shifts arbitrary bits in.
+    } else {
+      assert(ShiftAmt2 < ShiftAmt1);
+      uint32_t ShiftDiff = ShiftAmt1-ShiftAmt2;
+
+      // (X >>? C1) << C2 --> X >>? (C1-C2) & (-1 << C2)
+      if (I.getOpcode() == Instruction::Shl) {
+        assert(ShiftOp->getOpcode() == Instruction::LShr ||
+               ShiftOp->getOpcode() == Instruction::AShr);
+        Instruction *Shift =
+          BinaryOperator::create(ShiftOp->getOpcode(), X,
+                                 ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getHighBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // (X << C1) >>u C2  --> X << (C1-C2) & (-1 >> C2)
+      if (I.getOpcode() == Instruction::LShr) {
+        assert(ShiftOp->getOpcode() == Instruction::Shl);
+        Instruction *Shift =
+          BinaryOperator::createShl(X, ConstantInt::get(Ty, ShiftDiff));
+        InsertNewInstBefore(Shift, I);
+        
+        APInt Mask(APInt::getLowBitsSet(TypeBits, TypeBits - ShiftAmt2));
+        return BinaryOperator::createAnd(Shift, ConstantInt::get(Mask));
+      }
+      
+      // We can't handle (X << C1) >>a C2, it shifts arbitrary bits in.
+    }
+  }
+  return 0;
+}
+
+
+/// DecomposeSimpleLinearExpr - Try to view 'Val' as X*Scale+Offset for some
+/// value X.  On success the Scale and Offset out-parameters are filled in and
+/// X is returned; when no decomposition is found, Val itself is returned with
+/// Scale = 1 and Offset = 0.
+///
+static Value *DecomposeSimpleLinearExpr(Value *Val, unsigned &Scale,
+                                        int &Offset) {
+  assert(Val->getType() == Type::Int32Ty && "Unexpected allocation size type!");
+
+  // A bare constant C is pure offset: 0*1 + C.
+  if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+    Scale  = 1;
+    Offset = CI->getZExtValue();
+    return ConstantInt::get(Type::Int32Ty, 0);
+  }
+
+  Instruction *I = dyn_cast<Instruction>(Val);
+  if (I && I->getNumOperands() == 2) {
+    if (ConstantInt *RHSC = dyn_cast<ConstantInt>(I->getOperand(1))) {
+      switch (I->getOpcode()) {
+      case Instruction::Shl:
+        // X << C is X scaled by 2^C with no offset.
+        Scale  = 1U << RHSC->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      case Instruction::Mul:
+        // X * C is X scaled by C with no offset.
+        Scale  = RHSC->getZExtValue();
+        Offset = 0;
+        return I->getOperand(0);
+      case Instruction::Add: {
+        // For X+C, see whether X is itself (Y*C2)+C1, making the whole
+        // expression Y*C2 + (C1+C); only useful if the combined offset is
+        // divisible by the scale C2.
+        unsigned SubScale;
+        Value *SubVal =
+          DecomposeSimpleLinearExpr(I->getOperand(0), SubScale, Offset);
+        Offset += RHSC->getZExtValue();
+        if (SubScale > 1 && (Offset % SubScale == 0)) {
+          Scale = SubScale;
+          return SubVal;
+        }
+        break;
+      }
+      default:
+        break;
+      }
+    }
+  }
+
+  // Otherwise, we can't look past this: treat it as Val*1 + 0.
+  Scale  = 1;
+  Offset = 0;
+  return Val;
+}
+
+
+/// PromoteCastOfAllocation - If we find a cast of an allocation instruction,
+/// try to eliminate the cast by moving the type information into the alloc.
+/// CI is a bitcast of the alloca/malloc AI; on success the allocation is
+/// rebuilt with the cast-to element type and CI's uses are redirected to it.
+Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
+                                                   AllocationInst &AI) {
+  const PointerType *PTy = cast<PointerType>(CI.getType());
+  
+  // Remove any uses of AI that are dead.
+  assert(!CI.use_empty() && "Dead instructions should be removed earlier!");
+  
+  for (Value::use_iterator UI = AI.use_begin(), E = AI.use_end(); UI != E; ) {
+    Instruction *User = cast<Instruction>(*UI++);
+    if (isInstructionTriviallyDead(User)) {
+      while (UI != E && *UI == User)
+        ++UI; // If this instruction uses AI more than once, don't break UI.
+      
+      ++NumDeadInst;
+      DOUT << "IC: DCE: " << *User;
+      EraseInstFromFunction(*User);
+    }
+  }
+  
+  // Get the type really allocated and the type casted to.
+  const Type *AllocElTy = AI.getAllocatedType();
+  const Type *CastElTy = PTy->getElementType();
+  if (!AllocElTy->isSized() || !CastElTy->isSized()) return 0;
+
+  // Refuse to lower the alignment of the resulting allocation.
+  unsigned AllocElTyAlign = TD->getABITypeAlignment(AllocElTy);
+  unsigned CastElTyAlign = TD->getABITypeAlignment(CastElTy);
+  if (CastElTyAlign < AllocElTyAlign) return 0;
+
+  // If the allocation has multiple uses, only promote it if we are strictly
+  // increasing the alignment of the resultant allocation.  If we keep it the
+  // same, we open the door to infinite loops of various kinds.
+  if (!AI.hasOneUse() && CastElTyAlign == AllocElTyAlign) return 0;
+
+  uint64_t AllocElTySize = TD->getTypeSize(AllocElTy);
+  uint64_t CastElTySize = TD->getTypeSize(CastElTy);
+  if (CastElTySize == 0 || AllocElTySize == 0) return 0;
+
+  // See if we can satisfy the modulus by pulling a scale out of the array
+  // size argument.
+  unsigned ArraySizeScale;
+  int ArrayOffset;
+  Value *NumElements = // See if the array size is a decomposable linear expr.
+    DecomposeSimpleLinearExpr(AI.getOperand(0), ArraySizeScale, ArrayOffset);
+ 
+  // If we can now satisfy the modulus, by using a non-1 scale, we really can
+  // do the xform.
+  if ((AllocElTySize*ArraySizeScale) % CastElTySize != 0 ||
+      (AllocElTySize*ArrayOffset   ) % CastElTySize != 0) return 0;
+
+  unsigned Scale = (AllocElTySize*ArraySizeScale)/CastElTySize;
+  Value *Amt = 0;
+  if (Scale == 1) {
+    Amt = NumElements;
+  } else {
+    // If the allocation size is constant, form a constant mul expression
+    Amt = ConstantInt::get(Type::Int32Ty, Scale);
+    if (isa<ConstantInt>(NumElements))
+      Amt = Multiply(cast<ConstantInt>(NumElements), cast<ConstantInt>(Amt));
+    // otherwise multiply the amount and the number of elements
+    else if (Scale != 1) {
+      Instruction *Tmp = BinaryOperator::createMul(Amt, NumElements, "tmp");
+      Amt = InsertNewInstBefore(Tmp, AI);
+    }
+  }
+  
+  // Fold any constant byte offset (rescaled to CastElTy units) into the count.
+  if (int Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
+    Value *Off = ConstantInt::get(Type::Int32Ty, Offset, true);
+    Instruction *Tmp = BinaryOperator::createAdd(Amt, Off, "tmp");
+    Amt = InsertNewInstBefore(Tmp, AI);
+  }
+  
+  // Build the replacement allocation of the same kind (malloc vs alloca),
+  // preserving the requested alignment.
+  AllocationInst *New;
+  if (isa<MallocInst>(AI))
+    New = new MallocInst(CastElTy, Amt, AI.getAlignment());
+  else
+    New = new AllocaInst(CastElTy, Amt, AI.getAlignment());
+  InsertNewInstBefore(New, AI);
+  New->takeName(&AI);
+  
+  // If the allocation has multiple uses, insert a cast and change all things
+  // that used it to use the new cast.  This will also hack on CI, but it will
+  // die soon.
+  if (!AI.hasOneUse()) {
+    AddUsesToWorkList(AI);
+    // New is the allocation instruction, pointer typed. AI is the original
+    // allocation instruction, also pointer typed. Thus, cast to use is BitCast.
+    CastInst *NewCast = new BitCastInst(New, AI.getType(), "tmpcast");
+    InsertNewInstBefore(NewCast, AI);
+    AI.replaceAllUsesWith(NewCast);
+  }
+  return ReplaceInstUsesWith(CI, New);
+}
+
/// CanEvaluateInDifferentType - Return true if we can take the specified value
/// and return it as type Ty without inserting any new casts and without
/// changing the computed value.  This is used by code that tries to decide
/// whether promoting or shrinking integer operations to wider or smaller types
/// will allow us to eliminate a truncate or extend.
///
/// This is a truncation operation if Ty is smaller than V->getType(), or an
/// extension operation if Ty is larger.
///
/// NumCastsRemoved is incremented once for each cast that evaluating in Ty
/// would eliminate; callers use the count to judge profitability.
static bool CanEvaluateInDifferentType(Value *V, const IntegerType *Ty,
                                       int &NumCastsRemoved) {
  // We can always evaluate constants in another type.
  if (isa<ConstantInt>(V))
    return true;
  
  // Anything else must be an instruction we can look through.
  Instruction *I = dyn_cast<Instruction>(V);
  if (!I) return false;
  
  const IntegerType *OrigTy = cast<IntegerType>(V->getType());
  
  switch (I->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // Only hack on single-use instructions; otherwise we would duplicate
    // the computation in the new type.
    if (!I->hasOneUse()) return false;
    // These operators can all arbitrarily be extended or truncated.
    return CanEvaluateInDifferentType(I->getOperand(0), Ty, NumCastsRemoved) &&
           CanEvaluateInDifferentType(I->getOperand(1), Ty, NumCastsRemoved);

  case Instruction::Shl:
    if (!I->hasOneUse()) return false;
    // If we are truncating the result of this SHL, and if it's a shift of a
    // constant amount, we can always perform a SHL in a smaller type.  The
    // shift amount must still be in range for the narrower type.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
      uint32_t BitWidth = Ty->getBitWidth();
      if (BitWidth < OrigTy->getBitWidth() && 
          CI->getLimitedValue(BitWidth) < BitWidth)
        return CanEvaluateInDifferentType(I->getOperand(0), Ty,NumCastsRemoved);
    }
    break;
  case Instruction::LShr:
    if (!I->hasOneUse()) return false;
    // If this is a truncate of a logical shr, we can truncate it to a smaller
    // lshr iff we know that the bits we would otherwise be shifting in are
    // already zeros.
    if (ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1))) {
      uint32_t OrigBitWidth = OrigTy->getBitWidth();
      uint32_t BitWidth = Ty->getBitWidth();
      if (BitWidth < OrigBitWidth &&
          MaskedValueIsZero(I->getOperand(0),
            APInt::getHighBitsSet(OrigBitWidth, OrigBitWidth-BitWidth)) &&
          CI->getLimitedValue(BitWidth) < BitWidth) {
        return CanEvaluateInDifferentType(I->getOperand(0), Ty,NumCastsRemoved);
      }
    }
    break;
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
    // If this is a cast from the destination type, we can trivially eliminate
    // it, and this will remove a cast overall.
    if (I->getOperand(0)->getType() == Ty) {
      // If the first operand is itself a cast, and is eliminable, do not count
      // this as an eliminable cast.  We would prefer to eliminate those two
      // casts first.
      if (isa<CastInst>(I->getOperand(0)))
        return true;
      
      ++NumCastsRemoved;
      return true;
    }
    break;
  default:
    // TODO: Can handle more cases here.
    break;
  }
  
  return false;
}
+
/// EvaluateInDifferentType - Given an expression that 
/// CanEvaluateInDifferentType returns true for, actually insert the code to
/// evaluate the expression.
///
/// isSigned selects sext vs. zext when integer-casting constants; it should
/// be true when the cast being eliminated is a sign extension.  New
/// instructions are inserted immediately before the instruction being
/// rewritten.
Value *InstCombiner::EvaluateInDifferentType(Value *V, const Type *Ty, 
                                             bool isSigned) {
  // Constants fold directly to the new type; no instruction needed.
  if (Constant *C = dyn_cast<Constant>(V))
    return ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);

  // Otherwise, it must be an instruction.
  Instruction *I = cast<Instruction>(V);
  Instruction *Res = 0;
  switch (I->getOpcode()) {
  case Instruction::Add:
  case Instruction::Sub:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
  case Instruction::AShr:
  case Instruction::LShr:
  case Instruction::Shl: {
    // Recursively convert both operands, then recreate the binary operator
    // in the new type.
    Value *LHS = EvaluateInDifferentType(I->getOperand(0), Ty, isSigned);
    Value *RHS = EvaluateInDifferentType(I->getOperand(1), Ty, isSigned);
    Res = BinaryOperator::create((Instruction::BinaryOps)I->getOpcode(),
                                 LHS, RHS, I->getName());
    break;
  }    
  case Instruction::Trunc:
  case Instruction::ZExt:
  case Instruction::SExt:
  case Instruction::BitCast:
    // If the source type of the cast is the type we're trying for then we can
    // just return the source. There's no need to insert it because its not new.
    if (I->getOperand(0)->getType() == Ty)
      return I->getOperand(0);
    
    // Some other kind of cast, which shouldn't happen, so just ..
    // FALL THROUGH
  default: 
    // TODO: Can handle more cases here.
    assert(0 && "Unreachable!");
    break;
  }
  
  return InsertNewInstBefore(Res, *I);
}
+
+/// @brief Implement the transforms common to all CastInst visitors.
+Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
+  Value *Src = CI.getOperand(0);
+
+  // Casting undef to anything results in undef so might as just replace it and
+  // get rid of the cast.
+  if (isa<UndefValue>(Src))   // cast undef -> undef
+    return ReplaceInstUsesWith(CI, UndefValue::get(CI.getType()));
+
+  // Many cases of "cast of a cast" are eliminable. If it's eliminable we just
+  // eliminate it now.
+  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
+    if (Instruction::CastOps opc = 
+        isEliminableCastPair(CSrc, CI.getOpcode(), CI.getType(), TD)) {
+      // The first cast (CSrc) is eliminable so we need to fix up or replace
+      // the second cast (CI). CSrc will then have a good chance of being dead.
+      return CastInst::create(opc, CSrc->getOperand(0), CI.getType());
+    }
+  }
+
+  // If we are casting a select then fold the cast into the select
+  if (SelectInst *SI = dyn_cast<SelectInst>(Src))
+    if (Instruction *NV = FoldOpIntoSelect(CI, SI, this))
+      return NV;
+
+  // If we are casting a PHI then fold the cast into the PHI
+  if (isa<PHINode>(Src))
+    if (Instruction *NV = FoldOpIntoPhi(CI))
+      return NV;
+  
+  return 0;
+}
+
/// @brief Implement the transforms for cast of pointer (bitcast/ptrtoint)
///
/// Beyond the generic cast transforms, this handles two pointer-source
/// patterns: stripping an all-zero-index GEP under the cast, and folding a
/// bitcast + constant-offset GEP chain into a single type-correct GEP.
Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
  Value *Src = CI.getOperand(0);
  
  if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Src)) {
    // If casting the result of a getelementptr instruction with no offset, turn
    // this into a cast of the original pointer!
    if (GEP->hasAllZeroIndices()) {
      // Changing the cast operand is usually not a good idea but it is safe
      // here because the pointer operand is being replaced with another 
      // pointer operand so the opcode doesn't need to change.
      AddToWorkList(GEP);
      CI.setOperand(0, GEP->getOperand(0));
      return &CI;
    }
    
    // If the GEP has a single use, and the base pointer is a bitcast, and the
    // GEP computes a constant offset, see if we can convert these three
    // instructions into fewer.  This typically happens with unions and other
    // non-type-safe code.
    if (GEP->hasOneUse() && isa<BitCastInst>(GEP->getOperand(0))) {
      if (GEP->hasAllConstantIndices()) {
        // We are guaranteed to get a constant from EmitGEPOffset.
        ConstantInt *OffsetV = cast<ConstantInt>(EmitGEPOffset(GEP, CI, *this));
        int64_t Offset = OffsetV->getSExtValue();
        
        // Get the base pointer input of the bitcast, and the type it points to.
        Value *OrigBase = cast<BitCastInst>(GEP->getOperand(0))->getOperand(0);
        const Type *GEPIdxTy =
          cast<PointerType>(OrigBase->getType())->getElementType();
        if (GEPIdxTy->isSized()) {
          SmallVector<Value*, 8> NewIndices;
          
          // Start with the index over the outer type.  Note that the type size
          // might be zero (even if the offset isn't zero) if the indexed type
          // is something like [0 x {int, int}]
          const Type *IntPtrTy = TD->getIntPtrType();
          int64_t FirstIdx = 0;
          if (int64_t TySize = TD->getTypeSize(GEPIdxTy)) {
            FirstIdx = Offset/TySize;
            Offset %= TySize;
          
            // Handle silly modulus not returning values in the range
            // [0..TySize) for negative offsets.
            if (Offset < 0) {
              --FirstIdx;
              Offset += TySize;
              assert(Offset >= 0);
            }
            assert((uint64_t)Offset < (uint64_t)TySize &&"Out of range offset");
          }
          
          NewIndices.push_back(ConstantInt::get(IntPtrTy, FirstIdx));

          // Index into the types.  If we fail, set OrigBase to null.
          while (Offset) {
            if (const StructType *STy = dyn_cast<StructType>(GEPIdxTy)) {
              const StructLayout *SL = TD->getStructLayout(STy);
              if (Offset < (int64_t)SL->getSizeInBytes()) {
                unsigned Elt = SL->getElementContainingOffset(Offset);
                NewIndices.push_back(ConstantInt::get(Type::Int32Ty, Elt));
              
                Offset -= SL->getElementOffset(Elt);
                GEPIdxTy = STy->getElementType(Elt);
              } else {
                // Otherwise, we can't index into this, bail out.
                Offset = 0;
                OrigBase = 0;
              }
            } else if (isa<ArrayType>(GEPIdxTy) || isa<VectorType>(GEPIdxTy)) {
              const SequentialType *STy = cast<SequentialType>(GEPIdxTy);
              // A zero-sized element absorbs any offset with an index of 0.
              if (uint64_t EltSize = TD->getTypeSize(STy->getElementType())) {
                NewIndices.push_back(ConstantInt::get(IntPtrTy,Offset/EltSize));
                Offset %= EltSize;
              } else {
                NewIndices.push_back(ConstantInt::get(IntPtrTy, 0));
              }
              GEPIdxTy = STy->getElementType();
            } else {
              // Otherwise, we can't index into this, bail out.
              Offset = 0;
              OrigBase = 0;
            }
          }
          if (OrigBase) {
            // If we were able to index down into an element, create the GEP
            // and bitcast the result.  This eliminates one bitcast, potentially
            // two.
            Instruction *NGEP = new GetElementPtrInst(OrigBase, &NewIndices[0],
                                                      NewIndices.size(), "");
            InsertNewInstBefore(NGEP, CI);
            NGEP->takeName(GEP);
            
            if (isa<BitCastInst>(CI))
              return new BitCastInst(NGEP, CI.getType());
            assert(isa<PtrToIntInst>(CI));
            return new PtrToIntInst(NGEP, CI.getType());
          }
        }
      }      
    }
  }
    
  return commonCastTransforms(CI);
}
+
+
+
/// Only the TRUNC, ZEXT, SEXT, and BITCAST casts can have both their operand
/// and result as integer types. This function implements the common transforms
/// for all those cases.
/// @brief Implement the transforms common to CastInst with integer operands
Instruction *InstCombiner::commonIntCastTransforms(CastInst &CI) {
  // First try the transforms shared by every cast kind.
  if (Instruction *Result = commonCastTransforms(CI))
    return Result;

  Value *Src = CI.getOperand(0);
  const Type *SrcTy = Src->getType();
  const Type *DestTy = CI.getType();
  uint32_t SrcBitSize = SrcTy->getPrimitiveSizeInBits();
  uint32_t DestBitSize = DestTy->getPrimitiveSizeInBits();

  // See if we can simplify any instructions used by the LHS whose sole 
  // purpose is to compute bits we don't care about.
  APInt KnownZero(DestBitSize, 0), KnownOne(DestBitSize, 0);
  if (SimplifyDemandedBits(&CI, APInt::getAllOnesValue(DestBitSize),
                           KnownZero, KnownOne))
    return &CI;

  // If the source isn't an instruction or has more than one use then we
  // can't do anything more. 
  Instruction *SrcI = dyn_cast<Instruction>(Src);
  if (!SrcI || !Src->hasOneUse())
    return 0;

  // Attempt to propagate the cast into the instruction for int->int casts.
  int NumCastsRemoved = 0;
  if (!isa<BitCastInst>(CI) &&
      CanEvaluateInDifferentType(SrcI, cast<IntegerType>(DestTy),
                                 NumCastsRemoved)) {
    // If this cast is a truncate, evaluating in a different type always
    // eliminates the cast, so it is always a win.  If this is a noop-cast
    // this just removes a noop cast which isn't useful, but simplifies
    // the code.  If this is a zero-extension, we need to do an AND to
    // maintain the clear top-part of the computation, so we require that
    // the input have eliminated at least one cast.  If this is a sign
    // extension, we insert two new casts (to do the extension) so we
    // require that two casts have been eliminated.
    bool DoXForm;
    switch (CI.getOpcode()) {
    default:
      // All the others use floating point so we shouldn't actually 
      // get here because of the check above.
      assert(0 && "Unknown cast type");
    case Instruction::Trunc:
      DoXForm = true;
      break;
    case Instruction::ZExt:
      DoXForm = NumCastsRemoved >= 1;
      break;
    case Instruction::SExt:
      DoXForm = NumCastsRemoved >= 2;
      break;
    case Instruction::BitCast:
      DoXForm = false;
      break;
    }
    
    if (DoXForm) {
      // Re-evaluate the source expression directly in the destination type.
      Value *Res = EvaluateInDifferentType(SrcI, DestTy, 
                                           CI.getOpcode() == Instruction::SExt);
      assert(Res->getType() == DestTy);
      switch (CI.getOpcode()) {
      default: assert(0 && "Unknown cast type!");
      case Instruction::Trunc:
      case Instruction::BitCast:
        // Just replace this cast with the result.
        return ReplaceInstUsesWith(CI, Res);
      case Instruction::ZExt: {
        // We need to emit an AND to clear the high bits.
        assert(SrcBitSize < DestBitSize && "Not a zext?");
        Constant *C = ConstantInt::get(APInt::getLowBitsSet(DestBitSize,
                                                            SrcBitSize));
        return BinaryOperator::createAnd(Res, C);
      }
      case Instruction::SExt:
        // We need to emit a cast to truncate, then a cast to sext.
        return CastInst::create(Instruction::SExt,
            InsertCastBefore(Instruction::Trunc, Res, Src->getType(), 
                             CI), DestTy);
      }
    }
  }
  
  // Operands of the source instruction, when present.
  Value *Op0 = SrcI->getNumOperands() > 0 ? SrcI->getOperand(0) : 0;
  Value *Op1 = SrcI->getNumOperands() > 1 ? SrcI->getOperand(1) : 0;

  switch (SrcI->getOpcode()) {
  case Instruction::Add:
  case Instruction::Mul:
  case Instruction::And:
  case Instruction::Or:
  case Instruction::Xor:
    // If we are discarding information, rewrite.
    if (DestBitSize <= SrcBitSize && DestBitSize != 1) {
      // Don't insert two casts if they cannot be eliminated.  We allow 
      // two casts to be inserted if the sizes are the same.  This could 
      // only be converting signedness, which is a noop.
      if (DestBitSize == SrcBitSize || 
          !ValueRequiresCast(CI.getOpcode(), Op1, DestTy,TD) ||
          !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
        Instruction::CastOps opcode = CI.getOpcode();
        Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
        Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
        return BinaryOperator::create(
            cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
      }
    }

    // cast (xor bool X, true) to int  --> xor (cast bool X to int), 1
    if (isa<ZExtInst>(CI) && SrcBitSize == 1 && 
        SrcI->getOpcode() == Instruction::Xor &&
        Op1 == ConstantInt::getTrue() &&
        (!Op0->hasOneUse() || !isa<CmpInst>(Op0))) {
      Value *New = InsertOperandCastBefore(Instruction::ZExt, Op0, DestTy, &CI);
      return BinaryOperator::createXor(New, ConstantInt::get(CI.getType(), 1));
    }
    break;
  case Instruction::SDiv:
  case Instruction::UDiv:
  case Instruction::SRem:
  case Instruction::URem:
    // If we are just changing the sign, rewrite.
    if (DestBitSize == SrcBitSize) {
      // Don't insert two casts if they cannot be eliminated.  We allow 
      // two casts to be inserted if the sizes are the same.  This could 
      // only be converting signedness, which is a noop.
      if (!ValueRequiresCast(CI.getOpcode(), Op1, DestTy, TD) || 
          !ValueRequiresCast(CI.getOpcode(), Op0, DestTy, TD)) {
        Value *Op0c = InsertOperandCastBefore(Instruction::BitCast, 
                                              Op0, DestTy, SrcI);
        Value *Op1c = InsertOperandCastBefore(Instruction::BitCast, 
                                              Op1, DestTy, SrcI);
        return BinaryOperator::create(
          cast<BinaryOperator>(SrcI)->getOpcode(), Op0c, Op1c);
      }
    }
    break;

  case Instruction::Shl:
    // Allow changing the sign of the source operand.  Do not allow 
    // changing the size of the shift, UNLESS the shift amount is a 
    // constant.  We must not change variable sized shifts to a smaller 
    // size, because it is undefined to shift more bits out than exist 
    // in the value.
    if (DestBitSize == SrcBitSize ||
        (DestBitSize < SrcBitSize && isa<Constant>(Op1))) {
      Instruction::CastOps opcode = (DestBitSize == SrcBitSize ?
          Instruction::BitCast : Instruction::Trunc);
      Value *Op0c = InsertOperandCastBefore(opcode, Op0, DestTy, SrcI);
      Value *Op1c = InsertOperandCastBefore(opcode, Op1, DestTy, SrcI);
      return BinaryOperator::createShl(Op0c, Op1c);
    }
    break;
  case Instruction::AShr:
    // If this is a signed shr, and if all bits shifted in are about to be
    // truncated off, turn it into an unsigned shr to allow greater
    // simplifications.
    if (DestBitSize < SrcBitSize &&
        isa<ConstantInt>(Op1)) {
      uint32_t ShiftAmt = cast<ConstantInt>(Op1)->getLimitedValue(SrcBitSize);
      if (SrcBitSize > ShiftAmt && SrcBitSize-ShiftAmt >= DestBitSize) {
        // Insert the new logical shift right.
        return BinaryOperator::createLShr(Op0, Op1);
      }
    }
    break;
  }
  return 0;
}
+
/// visitTrunc - Handle trunc instructions.  Beyond the common integer-cast
/// transforms, this shrinks an lshr whose shifted-in bits are known zero,
/// and turns trunc-to-bool of a variable lshr into a masked compare.
Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
  if (Instruction *Result = commonIntCastTransforms(CI))
    return Result;
  
  Value *Src = CI.getOperand(0);
  const Type *Ty = CI.getType();
  uint32_t DestBitWidth = Ty->getPrimitiveSizeInBits();
  uint32_t SrcBitWidth = cast<IntegerType>(Src->getType())->getBitWidth();
  
  if (Instruction *SrcI = dyn_cast<Instruction>(Src)) {
    switch (SrcI->getOpcode()) {
    default: break;
    case Instruction::LShr:
      // We can shrink lshr to something smaller if we know the bits shifted in
      // are already zeros.
      if (ConstantInt *ShAmtV = dyn_cast<ConstantInt>(SrcI->getOperand(1))) {
        uint32_t ShAmt = ShAmtV->getLimitedValue(SrcBitWidth);
        
        // Get a mask for the bits shifting in.
        APInt Mask(APInt::getLowBitsSet(SrcBitWidth, ShAmt).shl(DestBitWidth));
        Value* SrcIOp0 = SrcI->getOperand(0);
        if (SrcI->hasOneUse() && MaskedValueIsZero(SrcIOp0, Mask)) {
          if (ShAmt >= DestBitWidth)        // All zeros.
            return ReplaceInstUsesWith(CI, Constant::getNullValue(Ty));

          // Okay, we can shrink this.  Truncate the input, then return a new
          // shift.
          Value *V1 = InsertCastBefore(Instruction::Trunc, SrcIOp0, Ty, CI);
          Value *V2 = InsertCastBefore(Instruction::Trunc, SrcI->getOperand(1),
                                       Ty, CI);
          return BinaryOperator::createLShr(V1, V2);
        }
      } else {     // This is a variable shr.
        
        // Turn 'trunc (lshr X, Y) to bool' into '(X & (1 << Y)) != 0'.  This is
        // more LLVM instructions, but allows '1 << Y' to be hoisted if
        // loop-invariant and CSE'd.
        if (CI.getType() == Type::Int1Ty && SrcI->hasOneUse()) {
          Value *One = ConstantInt::get(SrcI->getType(), 1);

          Value *V = InsertNewInstBefore(
              BinaryOperator::createShl(One, SrcI->getOperand(1),
                                     "tmp"), CI);
          V = InsertNewInstBefore(BinaryOperator::createAnd(V,
                                                            SrcI->getOperand(0),
                                                            "tmp"), CI);
          Value *Zero = Constant::getNullValue(V->getType());
          return new ICmpInst(ICmpInst::ICMP_NE, V, Zero);
        }
      }
      break;
    }
  }
  
  return 0;
}
+
/// visitZExt - Handle zext instructions.  Beyond the common integer-cast
/// transforms, this folds trunc+zext pairs into an AND mask and converts
/// zext of certain icmps into shift/xor sequences that avoid the compare.
Instruction *InstCombiner::visitZExt(ZExtInst &CI) {
  // If one of the common conversion will work ..
  if (Instruction *Result = commonIntCastTransforms(CI))
    return Result;

  Value *Src = CI.getOperand(0);

  // If this is a cast of a cast
  if (CastInst *CSrc = dyn_cast<CastInst>(Src)) {   // A->B->C cast
    // If this is a TRUNC followed by a ZEXT then we are dealing with integral
    // types and if the sizes are just right we can convert this into a logical
    // 'and' which will be much cheaper than the pair of casts.
    if (isa<TruncInst>(CSrc)) {
      // Get the sizes of the types involved
      Value *A = CSrc->getOperand(0);
      uint32_t SrcSize = A->getType()->getPrimitiveSizeInBits();
      uint32_t MidSize = CSrc->getType()->getPrimitiveSizeInBits();
      uint32_t DstSize = CI.getType()->getPrimitiveSizeInBits();
      // If we're actually extending zero bits and the trunc is a no-op
      if (MidSize < DstSize && SrcSize == DstSize) {
        // Replace both of the casts with an And of the type mask.
        APInt AndValue(APInt::getLowBitsSet(SrcSize, MidSize));
        Constant *AndConst = ConstantInt::get(AndValue);
        Instruction *And = 
          BinaryOperator::createAnd(CSrc->getOperand(0), AndConst);
        // Unfortunately, if the type changed, we need to cast it back.
        if (And->getType() != CI.getType()) {
          And->setName(CSrc->getName()+".mask");
          InsertNewInstBefore(And, CI);
          And = CastInst::createIntegerCast(And, CI.getType(), false/*ZExt*/);
        }
        return And;
      }
    }
  }

  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) {
    // If we are just checking for a icmp eq of a single bit and zext'ing it
    // to an integer, then shift the bit to the appropriate place and then
    // cast to integer to avoid the comparison.
    if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
      const APInt &Op1CV = Op1C->getValue();
      
      // zext (x <s  0) to i32 --> x>>u31      true if signbit set.
      // zext (x >s -1) to i32 --> (x>>u31)^1  true if signbit clear.
      if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
          (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())){
        Value *In = ICI->getOperand(0);
        Value *Sh = ConstantInt::get(In->getType(),
                                    In->getType()->getPrimitiveSizeInBits()-1);
        In = InsertNewInstBefore(BinaryOperator::createLShr(In, Sh,
                                                        In->getName()+".lobit"),
                                 CI);
        if (In->getType() != CI.getType())
          In = CastInst::createIntegerCast(In, CI.getType(),
                                           false/*ZExt*/, "tmp", &CI);

        if (ICI->getPredicate() == ICmpInst::ICMP_SGT) {
          // SGT tests the inverse condition, so flip the low bit.
          Constant *One = ConstantInt::get(In->getType(), 1);
          In = InsertNewInstBefore(BinaryOperator::createXor(In, One,
                                                          In->getName()+".not"),
                                   CI);
        }

        return ReplaceInstUsesWith(CI, In);
      }
      
      
      
      // zext (X == 0) to i32 --> X^1      iff X has only the low bit set.
      // zext (X == 0) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
      // zext (X == 1) to i32 --> X        iff X has only the low bit set.
      // zext (X == 2) to i32 --> X>>1     iff X has only the 2nd bit set.
      // zext (X != 0) to i32 --> X        iff X has only the low bit set.
      // zext (X != 0) to i32 --> X>>1     iff X has only the 2nd bit set.
      // zext (X != 1) to i32 --> X^1      iff X has only the low bit set.
      // zext (X != 2) to i32 --> (X>>1)^1 iff X has only the 2nd bit set.
      if ((Op1CV == 0 || Op1CV.isPowerOf2()) && 
          // This only works for EQ and NE
          ICI->isEquality()) {
        // If Op1C some other power of two, convert:
        uint32_t BitWidth = Op1C->getType()->getBitWidth();
        APInt KnownZero(BitWidth, 0), KnownOne(BitWidth, 0);
        APInt TypeMask(APInt::getAllOnesValue(BitWidth));
        ComputeMaskedBits(ICI->getOperand(0), TypeMask, KnownZero, KnownOne);
        
        APInt KnownZeroMask(~KnownZero);
        if (KnownZeroMask.isPowerOf2()) { // Exactly 1 possible 1?
          bool isNE = ICI->getPredicate() == ICmpInst::ICMP_NE;
          if (Op1CV != 0 && (Op1CV != KnownZeroMask)) {
            // (X&4) == 2 --> false
            // (X&4) != 2 --> true
            Constant *Res = ConstantInt::get(Type::Int1Ty, isNE);
            Res = ConstantExpr::getZExt(Res, CI.getType());
            return ReplaceInstUsesWith(CI, Res);
          }
          
          uint32_t ShiftAmt = KnownZeroMask.logBase2();
          Value *In = ICI->getOperand(0);
          if (ShiftAmt) {
            // Perform a logical shr by shiftamt.
            // Insert the shift to put the result in the low bit.
            In = InsertNewInstBefore(
                   BinaryOperator::createLShr(In,
                                     ConstantInt::get(In->getType(), ShiftAmt),
                                              In->getName()+".lobit"), CI);
          }
          
          if ((Op1CV != 0) == isNE) { // Toggle the low bit.
            Constant *One = ConstantInt::get(In->getType(), 1);
            In = BinaryOperator::createXor(In, One, "tmp");
            InsertNewInstBefore(cast<Instruction>(In), CI);
          }
          
          if (CI.getType() == In->getType())
            return ReplaceInstUsesWith(CI, In);
          else
            return CastInst::createIntegerCast(In, CI.getType(), false/*ZExt*/);
        }
      }
    }
  }    
  return 0;
}
+
/// visitSExt - Handle sext instructions.  Beyond the common integer-cast
/// transforms, this converts sext of a sign-bit test into an arithmetic
/// shift right, avoiding the compare.
Instruction *InstCombiner::visitSExt(SExtInst &CI) {
  if (Instruction *I = commonIntCastTransforms(CI))
    return I;
  
  Value *Src = CI.getOperand(0);
  
  // sext (x <s 0)  -> ashr x, 31   (all ones if the sign bit is set)
  // sext (x >s -1) -> not (ashr x, 31)   (all ones if the sign bit is clear)
  if (ICmpInst *ICI = dyn_cast<ICmpInst>(Src)) {
    // If we are just checking for a icmp eq of a single bit and sext'ing it
    // to an integer, then shift the bit to the appropriate place and then
    // cast to integer to avoid the comparison.
    if (ConstantInt *Op1C = dyn_cast<ConstantInt>(ICI->getOperand(1))) {
      const APInt &Op1CV = Op1C->getValue();
      
      // sext (x <s  0) to i32 --> x>>s31      true if signbit set.
      // sext (x >s -1) to i32 --> (x>>s31)^-1  true if signbit clear.
      if ((ICI->getPredicate() == ICmpInst::ICMP_SLT && Op1CV == 0) ||
          (ICI->getPredicate() == ICmpInst::ICMP_SGT &&Op1CV.isAllOnesValue())){
        Value *In = ICI->getOperand(0);
        Value *Sh = ConstantInt::get(In->getType(),
                                     In->getType()->getPrimitiveSizeInBits()-1);
        In = InsertNewInstBefore(BinaryOperator::createAShr(In, Sh,
                                                        In->getName()+".lobit"),
                                 CI);
        if (In->getType() != CI.getType())
          In = CastInst::createIntegerCast(In, CI.getType(),
                                           true/*SExt*/, "tmp", &CI);
        
        // SGT tests the inverse condition, so invert the result.
        if (ICI->getPredicate() == ICmpInst::ICMP_SGT)
          In = InsertNewInstBefore(BinaryOperator::createNot(In,
                                     In->getName()+".not"), CI);
        
        return ReplaceInstUsesWith(CI, In);
      }
    }
  }
      
  return 0;
}
+
+Instruction *InstCombiner::visitFPTrunc(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPExt(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPToUI(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitFPToSI(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitUIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitSIToFP(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitPtrToInt(CastInst &CI) {
+  return commonPointerCastTransforms(CI);
+}
+
+Instruction *InstCombiner::visitIntToPtr(CastInst &CI) {
+  return commonCastTransforms(CI);
+}
+
/// visitBitCast - Handle bitcast instructions.  Dispatches to the integer or
/// pointer common transforms based on the operand types, then handles
/// no-op casts, cast-of-allocation, pointer-cast-to-GEP, and
/// bitcast-of-shuffle patterns.
Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
  // If the operands are integer typed then apply the integer transforms,
  // otherwise just apply the common ones.
  Value *Src = CI.getOperand(0);
  const Type *SrcTy = Src->getType();
  const Type *DestTy = CI.getType();

  if (SrcTy->isInteger() && DestTy->isInteger()) {
    if (Instruction *Result = commonIntCastTransforms(CI))
      return Result;
  } else if (isa<PointerType>(SrcTy)) {
    if (Instruction *I = commonPointerCastTransforms(CI))
      return I;
  } else {
    if (Instruction *Result = commonCastTransforms(CI))
      return Result;
  }


  // Get rid of casts from one type to the same type. These are useless and can
  // be replaced by the operand.
  if (DestTy == Src->getType())
    return ReplaceInstUsesWith(CI, Src);

  if (const PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
    const PointerType *SrcPTy = cast<PointerType>(SrcTy);
    const Type *DstElTy = DstPTy->getElementType();
    const Type *SrcElTy = SrcPTy->getElementType();
    
    // If we are casting a malloc or alloca to a pointer to a type of the same
    // size, rewrite the allocation instruction to allocate the "right" type.
    if (AllocationInst *AI = dyn_cast<AllocationInst>(Src))
      if (Instruction *V = PromoteCastOfAllocation(CI, *AI))
        return V;
    
    // If the source and destination are pointers, and this cast is equivalent
    // to a getelementptr X, 0, 0, 0...  turn it into the appropriate gep.
    // This can enhance SROA and other transforms that want type-safe pointers.
    Constant *ZeroUInt = Constant::getNullValue(Type::Int32Ty);
    unsigned NumZeros = 0;
    // Walk down through the first element of each composite level looking
    // for the destination element type.
    while (SrcElTy != DstElTy && 
           isa<CompositeType>(SrcElTy) && !isa<PointerType>(SrcElTy) &&
           SrcElTy->getNumContainedTypes() /* not "{}" */) {
      SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(ZeroUInt);
      ++NumZeros;
    }

    // If we found a path from the src to dest, create the getelementptr now.
    if (SrcElTy == DstElTy) {
      SmallVector<Value*, 8> Idxs(NumZeros+1, ZeroUInt);
      return new GetElementPtrInst(Src, &Idxs[0], Idxs.size());
    }
  }

  if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(Src)) {
    if (SVI->hasOneUse()) {
      // Okay, we have (bitconvert (shuffle ..)).  Check to see if this is
      // a bitconvert to a vector with the same # elts.
      if (isa<VectorType>(DestTy) && 
          cast<VectorType>(DestTy)->getNumElements() == 
                SVI->getType()->getNumElements()) {
        CastInst *Tmp;
        // If either of the operands is a cast from CI.getType(), then
        // evaluating the shuffle in the casted destination's type will allow
        // us to eliminate at least one cast.
        if (((Tmp = dyn_cast<CastInst>(SVI->getOperand(0))) && 
             Tmp->getOperand(0)->getType() == DestTy) ||
            ((Tmp = dyn_cast<CastInst>(SVI->getOperand(1))) && 
             Tmp->getOperand(0)->getType() == DestTy)) {
          Value *LHS = InsertOperandCastBefore(Instruction::BitCast,
                                               SVI->getOperand(0), DestTy, &CI);
          Value *RHS = InsertOperandCastBefore(Instruction::BitCast,
                                               SVI->getOperand(1), DestTy, &CI);
          // Return a new shuffle vector.  Use the same element ID's, as we
          // know the vector types match #elts.
          return new ShuffleVectorInst(LHS, RHS, SVI->getOperand(2));
        }
      }
    }
  }
  return 0;
}
+
+/// GetSelectFoldableOperands - We want to turn code that looks like this:
+///   %C = or %A, %B
+///   %D = select %cond, %C, %A
+/// into:
+///   %C = select %cond, %B, 0
+///   %D = or %A, %C
+///
+/// Assuming that the specified instruction is an operand to the select, return
+/// a bitmask indicating which operands of this instruction are foldable if they
+/// equal the other incoming value of the select.  Bit 0 covers operand 0,
+/// bit 1 covers operand 1; 0 means the instruction cannot be folded.
+///
+static unsigned GetSelectFoldableOperands(Instruction *I) {
+  unsigned Opcode = I->getOpcode();
+
+  // Commutative bitwise/arithmetic ops have an identity for either operand,
+  // so the select can be folded through both sides.
+  if (Opcode == Instruction::Add || Opcode == Instruction::Mul ||
+      Opcode == Instruction::And || Opcode == Instruction::Or ||
+      Opcode == Instruction::Xor)
+    return 3;
+
+  // Subtraction only folds on the amount subtracted, and shifts only fold on
+  // the shift amount (operand 1 in both cases is NOT foldable via operand 0).
+  if (Opcode == Instruction::Sub || Opcode == Instruction::Shl ||
+      Opcode == Instruction::LShr || Opcode == Instruction::AShr)
+    return 1;
+
+  return 0;              // Cannot fold
+}
+
+/// GetSelectFoldableConstant - For the same transformation as the previous
+/// function, return the identity constant that goes into the select.  The
+/// returned value C satisfies (X op C) == X for the opcode of I.
+static Constant *GetSelectFoldableConstant(Instruction *I) {
+  switch (I->getOpcode()) {
+  case Instruction::And:
+    // X & -1 == X
+    return Constant::getAllOnesValue(I->getType());
+  case Instruction::Mul:
+    // X * 1 == X
+    return ConstantInt::get(I->getType(), 1);
+  case Instruction::Add:
+  case Instruction::Sub:
+  case Instruction::Or:
+  case Instruction::Xor:
+  case Instruction::Shl:
+  case Instruction::LShr:
+  case Instruction::AShr:
+    // Zero is the identity for add/sub/or/xor and all shift amounts.
+    return Constant::getNullValue(I->getType());
+  default:
+    assert(0 && "This cannot happen!"); abort();
+  }
+}
+
+/// FoldSelectOpOp - Here we have (select c, TI, FI), and we know that TI and FI
+/// have the same opcode and only one use each.  Try to simplify this.
+/// Returns the replacement instruction, or null if no simplification applies.
+Instruction *InstCombiner::FoldSelectOpOp(SelectInst &SI, Instruction *TI,
+                                          Instruction *FI) {
+  if (TI->getNumOperands() == 1) {
+    // If this is a non-volatile load or a cast from the same type,
+    // merge.
+    if (TI->isCast()) {
+      // Both casts must consume the same source type, or the new select
+      // below would have mismatched operand types.
+      if (TI->getOperand(0)->getType() != FI->getOperand(0)->getType())
+        return 0;
+    } else {
+      return 0;  // unknown unary op.
+    }
+
+    // Fold this by inserting a select from the input values.
+    // (select c, (cast X), (cast Y)) -> (cast (select c, X, Y))
+    SelectInst *NewSI = new SelectInst(SI.getCondition(), TI->getOperand(0),
+                                       FI->getOperand(0), SI.getName()+".v");
+    InsertNewInstBefore(NewSI, SI);
+    return CastInst::create(Instruction::CastOps(TI->getOpcode()), NewSI, 
+                            TI->getType());
+  }
+
+  // Only handle binary operators here.
+  if (!isa<BinaryOperator>(TI))
+    return 0;
+
+  // Figure out if the operations have any operands in common.
+  // MatchOp is the operand shared by both arms; OtherOpT/OtherOpF are the
+  // remaining (differing) operands of the true/false arms respectively.
+  Value *MatchOp, *OtherOpT, *OtherOpF;
+  bool MatchIsOpZero;  // true if MatchOp sits in operand slot 0 of the result.
+  if (TI->getOperand(0) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(1);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = false;
+  } else if (!TI->isCommutative()) {
+    // Cross-slot matches below are only valid when operand order is
+    // irrelevant.
+    return 0;
+  } else if (TI->getOperand(0) == FI->getOperand(1)) {
+    MatchOp  = TI->getOperand(0);
+    OtherOpT = TI->getOperand(1);
+    OtherOpF = FI->getOperand(0);
+    MatchIsOpZero = true;
+  } else if (TI->getOperand(1) == FI->getOperand(0)) {
+    MatchOp  = TI->getOperand(1);
+    OtherOpT = TI->getOperand(0);
+    OtherOpF = FI->getOperand(1);
+    // Slot choice is arbitrary here since the op is known commutative.
+    MatchIsOpZero = true;
+  } else {
+    return 0;
+  }
+
+  // If we reach here, they do have operations in common.
+  // (select c, (op X, A), (op X, B)) -> (op X, (select c, A, B))
+  SelectInst *NewSI = new SelectInst(SI.getCondition(), OtherOpT,
+                                     OtherOpF, SI.getName()+".v");
+  InsertNewInstBefore(NewSI, SI);
+
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TI)) {
+    if (MatchIsOpZero)
+      return BinaryOperator::create(BO->getOpcode(), MatchOp, NewSI);
+    else
+      return BinaryOperator::create(BO->getOpcode(), NewSI, MatchOp);
+  }
+  assert(0 && "Shouldn't get here");
+  return 0;
+}
+
+// visitSelectInst - Simplify and canonicalize select instructions: constant
+// conditions, undef operands, boolean selects, integer-constant arms, selects
+// whose condition compares their own arms, and folding a select into one of
+// its single-use operands.  Returns the replacement instruction or null.
+Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
+  Value *CondVal = SI.getCondition();
+  Value *TrueVal = SI.getTrueValue();
+  Value *FalseVal = SI.getFalseValue();
+
+  // select true, X, Y  -> X
+  // select false, X, Y -> Y
+  if (ConstantInt *C = dyn_cast<ConstantInt>(CondVal))
+    return ReplaceInstUsesWith(SI, C->getZExtValue() ? TrueVal : FalseVal);
+
+  // select C, X, X -> X
+  if (TrueVal == FalseVal)
+    return ReplaceInstUsesWith(SI, TrueVal);
+
+  if (isa<UndefValue>(TrueVal))   // select C, undef, X -> X
+    return ReplaceInstUsesWith(SI, FalseVal);
+  if (isa<UndefValue>(FalseVal))   // select C, X, undef -> X
+    return ReplaceInstUsesWith(SI, TrueVal);
+  if (isa<UndefValue>(CondVal)) {  // select undef, X, Y -> X or Y
+    // Prefer the constant arm so later folds can exploit it.
+    if (isa<Constant>(TrueVal))
+      return ReplaceInstUsesWith(SI, TrueVal);
+    else
+      return ReplaceInstUsesWith(SI, FalseVal);
+  }
+
+  // Selects producing an i1 can be rewritten as pure boolean logic.
+  if (SI.getType() == Type::Int1Ty) {
+    if (ConstantInt *C = dyn_cast<ConstantInt>(TrueVal)) {
+      if (C->getZExtValue()) {
+        // Change: A = select B, true, C --> A = or B, C
+        return BinaryOperator::createOr(CondVal, FalseVal);
+      } else {
+        // Change: A = select B, false, C --> A = and !B, C
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+                                             "not."+CondVal->getName()), SI);
+        return BinaryOperator::createAnd(NotCond, FalseVal);
+      }
+    } else if (ConstantInt *C = dyn_cast<ConstantInt>(FalseVal)) {
+      if (C->getZExtValue() == false) {
+        // Change: A = select B, C, false --> A = and B, C
+        return BinaryOperator::createAnd(CondVal, TrueVal);
+      } else {
+        // Change: A = select B, C, true --> A = or !B, C
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+                                             "not."+CondVal->getName()), SI);
+        return BinaryOperator::createOr(NotCond, TrueVal);
+      }
+    }
+  }
+
+  // Selecting between two integer constants?
+  if (ConstantInt *TrueValC = dyn_cast<ConstantInt>(TrueVal))
+    if (ConstantInt *FalseValC = dyn_cast<ConstantInt>(FalseVal)) {
+      // select C, 1, 0 -> zext C to int
+      if (FalseValC->isZero() && TrueValC->getValue() == 1) {
+        return CastInst::create(Instruction::ZExt, CondVal, SI.getType());
+      } else if (TrueValC->isZero() && FalseValC->getValue() == 1) {
+        // select C, 0, 1 -> zext !C to int
+        Value *NotCond =
+          InsertNewInstBefore(BinaryOperator::createNot(CondVal,
+                                               "not."+CondVal->getName()), SI);
+        return CastInst::create(Instruction::ZExt, NotCond, SI.getType());
+      }
+      
+      // FIXME: Turn select 0/-1 and -1/0 into sext from condition!
+
+      if (ICmpInst *IC = dyn_cast<ICmpInst>(SI.getCondition())) {
+
+        // (x <s 0) ? -1 : 0 -> ashr x, 31
+        if (TrueValC->isAllOnesValue() && FalseValC->isZero())
+          if (ConstantInt *CmpCst = dyn_cast<ConstantInt>(IC->getOperand(1))) {
+            if (IC->getPredicate() == ICmpInst::ICMP_SLT && CmpCst->isZero()) {
+              // The comparison constant and the result are not neccessarily the
+              // same width. Make an all-ones value by inserting a AShr.
+              Value *X = IC->getOperand(0);
+              uint32_t Bits = X->getType()->getPrimitiveSizeInBits();
+              Constant *ShAmt = ConstantInt::get(X->getType(), Bits-1);
+              Instruction *SRA = BinaryOperator::create(Instruction::AShr, X,
+                                                        ShAmt, "ones");
+              InsertNewInstBefore(SRA, SI);
+              
+              // Finally, convert to the type of the select RHS.  We figure out
+              // if this requires a SExt, Trunc or BitCast based on the sizes.
+              Instruction::CastOps opc = Instruction::BitCast;
+              uint32_t SRASize = SRA->getType()->getPrimitiveSizeInBits();
+              uint32_t SISize  = SI.getType()->getPrimitiveSizeInBits();
+              if (SRASize < SISize)
+                opc = Instruction::SExt;
+              else if (SRASize > SISize)
+                opc = Instruction::Trunc;
+              return CastInst::create(opc, SRA, SI.getType());
+            }
+          }
+
+
+        // If one of the constants is zero (we know they can't both be) and we
+        // have an icmp instruction with zero, and we have an 'and' with the
+        // non-constant value, eliminate this whole mess.  This corresponds to
+        // cases like this: ((X & 27) ? 27 : 0)
+        if (TrueValC->isZero() || FalseValC->isZero())
+          if (IC->isEquality() && isa<ConstantInt>(IC->getOperand(1)) &&
+              cast<Constant>(IC->getOperand(1))->isNullValue())
+            if (Instruction *ICA = dyn_cast<Instruction>(IC->getOperand(0)))
+              if (ICA->getOpcode() == Instruction::And &&
+                  isa<ConstantInt>(ICA->getOperand(1)) &&
+                  (ICA->getOperand(1) == TrueValC ||
+                   ICA->getOperand(1) == FalseValC) &&
+                  isOneBitSet(cast<ConstantInt>(ICA->getOperand(1)))) {
+                // Okay, now we know that everything is set up, we just don't
+                // know whether we have a icmp_ne or icmp_eq and whether the 
+                // true or false val is the zero.
+                bool ShouldNotVal = !TrueValC->isZero();
+                ShouldNotVal ^= IC->getPredicate() == ICmpInst::ICMP_NE;
+                Value *V = ICA;
+                // The single-bit 'and' already produces the constant or zero;
+                // xor with the mask flips which case yields the constant.
+                if (ShouldNotVal)
+                  V = InsertNewInstBefore(BinaryOperator::create(
+                                  Instruction::Xor, V, ICA->getOperand(1)), SI);
+                return ReplaceInstUsesWith(SI, V);
+              }
+      }
+    }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (FCmpInst *FCI = dyn_cast<FCmpInst>(CondVal)) {
+    if (FCI->getOperand(0) == TrueVal && FCI->getOperand(1) == FalseVal) {
+      // Transform (X == Y) ? X : Y  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ)
+        return ReplaceInstUsesWith(SI, FalseVal);
+      // Transform (X != Y) ? X : Y  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX/ABS/etc.
+
+    } else if (FCI->getOperand(0) == FalseVal && FCI->getOperand(1) == TrueVal){
+      // Transform (X == Y) ? Y : X  -> X
+      if (FCI->getPredicate() == FCmpInst::FCMP_OEQ)
+        return ReplaceInstUsesWith(SI, FalseVal);
+      // Transform (X != Y) ? Y : X  -> Y
+      if (FCI->getPredicate() == FCmpInst::FCMP_ONE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX/ABS/etc.
+    }
+  }
+
+  // See if we are selecting two values based on a comparison of the two values.
+  if (ICmpInst *ICI = dyn_cast<ICmpInst>(CondVal)) {
+    if (ICI->getOperand(0) == TrueVal && ICI->getOperand(1) == FalseVal) {
+      // Transform (X == Y) ? X : Y  -> Y
+      if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+        return ReplaceInstUsesWith(SI, FalseVal);
+      // Transform (X != Y) ? X : Y  -> X
+      if (ICI->getPredicate() == ICmpInst::ICMP_NE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX/ABS/etc.
+
+    } else if (ICI->getOperand(0) == FalseVal && ICI->getOperand(1) == TrueVal){
+      // Transform (X == Y) ? Y : X  -> X
+      if (ICI->getPredicate() == ICmpInst::ICMP_EQ)
+        return ReplaceInstUsesWith(SI, FalseVal);
+      // Transform (X != Y) ? Y : X  -> Y
+      if (ICI->getPredicate() == ICmpInst::ICMP_NE)
+        return ReplaceInstUsesWith(SI, TrueVal);
+      // NOTE: if we wanted to, this is where to detect MIN/MAX/ABS/etc.
+    }
+  }
+
+  // If both arms are single-use instructions, try folding them together.
+  if (Instruction *TI = dyn_cast<Instruction>(TrueVal))
+    if (Instruction *FI = dyn_cast<Instruction>(FalseVal))
+      if (TI->hasOneUse() && FI->hasOneUse()) {
+        Instruction *AddOp = 0, *SubOp = 0;
+
+        // Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
+        if (TI->getOpcode() == FI->getOpcode())
+          if (Instruction *IV = FoldSelectOpOp(SI, TI, FI))
+            return IV;
+
+        // Turn select C, (X+Y), (X-Y) --> (X+(select C, Y, (-Y))).  This is
+        // even legal for FP.
+        if (TI->getOpcode() == Instruction::Sub &&
+            FI->getOpcode() == Instruction::Add) {
+          AddOp = FI; SubOp = TI;
+        } else if (FI->getOpcode() == Instruction::Sub &&
+                   TI->getOpcode() == Instruction::Add) {
+          AddOp = TI; SubOp = FI;
+        }
+
+        if (AddOp) {
+          Value *OtherAddOp = 0;
+          // Require the sub's LHS to appear in the add (X above); the add's
+          // remaining operand is Y.
+          if (SubOp->getOperand(0) == AddOp->getOperand(0)) {
+            OtherAddOp = AddOp->getOperand(1);
+          } else if (SubOp->getOperand(0) == AddOp->getOperand(1)) {
+            OtherAddOp = AddOp->getOperand(0);
+          }
+
+          if (OtherAddOp) {
+            // So at this point we know we have (Y -> OtherAddOp):
+            //        select C, (add X, Y), (sub X, Z)
+            Value *NegVal;  // Compute -Z
+            if (Constant *C = dyn_cast<Constant>(SubOp->getOperand(1))) {
+              NegVal = ConstantExpr::getNeg(C);
+            } else {
+              NegVal = InsertNewInstBefore(
+                    BinaryOperator::createNeg(SubOp->getOperand(1), "tmp"), SI);
+            }
+
+            Value *NewTrueOp = OtherAddOp;
+            Value *NewFalseOp = NegVal;
+            // If the add was the false arm, swap so the select arms line up
+            // with the original condition.
+            if (AddOp != TI)
+              std::swap(NewTrueOp, NewFalseOp);
+            Instruction *NewSel =
+              new SelectInst(CondVal, NewTrueOp,NewFalseOp,SI.getName()+".p");
+
+            NewSel = InsertNewInstBefore(NewSel, SI);
+            return BinaryOperator::createAdd(SubOp->getOperand(0), NewSel);
+          }
+        }
+      }
+
+  // See if we can fold the select into one of our operands.
+  if (SI.getType()->isInteger()) {
+    // See the comment above GetSelectFoldableOperands for a description of the
+    // transformation we are doing here.
+    if (Instruction *TVI = dyn_cast<Instruction>(TrueVal))
+      if (TVI->hasOneUse() && TVI->getNumOperands() == 2 &&
+          !isa<Constant>(FalseVal))
+        if (unsigned SFO = GetSelectFoldableOperands(TVI)) {
+          unsigned OpToFold = 0;
+          if ((SFO & 1) && FalseVal == TVI->getOperand(0)) {
+            OpToFold = 1;
+          } else  if ((SFO & 2) && FalseVal == TVI->getOperand(1)) {
+            OpToFold = 2;
+          }
+
+          if (OpToFold) {
+            // Replace the folded operand with the op's identity constant in
+            // the false case, so both select arms compute the same value.
+            Constant *C = GetSelectFoldableConstant(TVI);
+            Instruction *NewSel =
+              new SelectInst(SI.getCondition(), TVI->getOperand(2-OpToFold), C);
+            InsertNewInstBefore(NewSel, SI);
+            NewSel->takeName(TVI);
+            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TVI))
+              return BinaryOperator::create(BO->getOpcode(), FalseVal, NewSel);
+            else {
+              assert(0 && "Unknown instruction!!");
+            }
+          }
+        }
+
+    // Same transformation, mirrored for the false arm.
+    if (Instruction *FVI = dyn_cast<Instruction>(FalseVal))
+      if (FVI->hasOneUse() && FVI->getNumOperands() == 2 &&
+          !isa<Constant>(TrueVal))
+        if (unsigned SFO = GetSelectFoldableOperands(FVI)) {
+          unsigned OpToFold = 0;
+          if ((SFO & 1) && TrueVal == FVI->getOperand(0)) {
+            OpToFold = 1;
+          } else  if ((SFO & 2) && TrueVal == FVI->getOperand(1)) {
+            OpToFold = 2;
+          }
+
+          if (OpToFold) {
+            Constant *C = GetSelectFoldableConstant(FVI);
+            Instruction *NewSel =
+              new SelectInst(SI.getCondition(), C, FVI->getOperand(2-OpToFold));
+            InsertNewInstBefore(NewSel, SI);
+            NewSel->takeName(FVI);
+            if (BinaryOperator *BO = dyn_cast<BinaryOperator>(FVI))
+              return BinaryOperator::create(BO->getOpcode(), TrueVal, NewSel);
+            else
+              assert(0 && "Unknown instruction!!");
+          }
+        }
+  }
+
+  // select !C, X, Y -> select C, Y, X
+  if (BinaryOperator::isNot(CondVal)) {
+    SI.setOperand(0, BinaryOperator::getNotArgument(CondVal));
+    SI.setOperand(1, FalseVal);
+    SI.setOperand(2, TrueVal);
+    return &SI;
+  }
+
+  return 0;
+}
+
+/// GetKnownAlignment - If the specified pointer has an alignment that we can
+/// determine, return it, otherwise return 0.  Walks through bitcasts and
+/// all-zero GEPs; TD may be null, in which case only explicitly recorded
+/// alignments are used.
+static unsigned GetKnownAlignment(Value *V, TargetData *TD) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) {
+    unsigned Align = GV->getAlignment();
+    // No explicit alignment: fall back to the target's preferred alignment
+    // for the global's value type.
+    if (Align == 0 && TD) 
+      Align = TD->getPrefTypeAlignment(GV->getType()->getElementType());
+    return Align;
+  } else if (AllocationInst *AI = dyn_cast<AllocationInst>(V)) {
+    unsigned Align = AI->getAlignment();
+    if (Align == 0 && TD) {
+      if (isa<AllocaInst>(AI))
+        Align = TD->getPrefTypeAlignment(AI->getType()->getElementType());
+      else if (isa<MallocInst>(AI)) {
+        // Malloc returns maximally aligned memory.
+        // Model that as at least the ABI alignment of the allocated type,
+        // double, and i64.
+        Align = TD->getABITypeAlignment(AI->getType()->getElementType());
+        Align =
+          std::max(Align,
+                   (unsigned)TD->getABITypeAlignment(Type::DoubleTy));
+        Align =
+          std::max(Align,
+                   (unsigned)TD->getABITypeAlignment(Type::Int64Ty));
+      }
+    }
+    return Align;
+  } else if (isa<BitCastInst>(V) ||
+             (isa<ConstantExpr>(V) && 
+              cast<ConstantExpr>(V)->getOpcode() == Instruction::BitCast)) {
+    // Bitcasts don't change the address, so look through pointer-to-pointer
+    // casts (both the instruction and constant-expression forms).
+    User *CI = cast<User>(V);
+    if (isa<PointerType>(CI->getOperand(0)->getType()))
+      return GetKnownAlignment(CI->getOperand(0), TD);
+    return 0;
+  } else if (User *GEPI = dyn_castGetElementPtr(V)) {
+    // NOTE(review): dyn_castGetElementPtr is a project helper matching both
+    // GEP instructions and GEP constant expressions — verify elsewhere.
+    unsigned BaseAlignment = GetKnownAlignment(GEPI->getOperand(0), TD);
+    if (BaseAlignment == 0) return 0;
+    
+    // If all indexes are zero, it is just the alignment of the base pointer.
+    bool AllZeroOperands = true;
+    for (unsigned i = 1, e = GEPI->getNumOperands(); i != e; ++i)
+      if (!isa<Constant>(GEPI->getOperand(i)) ||
+          !cast<Constant>(GEPI->getOperand(i))->isNullValue()) {
+        AllZeroOperands = false;
+        break;
+      }
+    if (AllZeroOperands)
+      return BaseAlignment;
+    
+    // Otherwise, if the base alignment is >= the alignment we expect for the
+    // base pointer type, then we know that the resultant pointer is aligned at
+    // least as much as its type requires.
+    if (!TD) return 0;
+
+    const Type *BasePtrTy = GEPI->getOperand(0)->getType();
+    const PointerType *PtrTy = cast<PointerType>(BasePtrTy);
+    if (TD->getABITypeAlignment(PtrTy->getElementType())
+        <= BaseAlignment) {
+      const Type *GEPTy = GEPI->getType();
+      const PointerType *GEPPtrTy = cast<PointerType>(GEPTy);
+      return TD->getABITypeAlignment(GEPPtrTy->getElementType());
+    }
+    return 0;
+  }
+  // Unknown pointer provenance: no alignment information.
+  return 0;
+}
+
+
+/// visitCallInst - CallInst simplification.  This mostly only handles folding 
+/// of intrinsic instructions.  For normal calls, it allows visitCallSite to do
+/// the heavy lifting.
+///
+Instruction *InstCombiner::visitCallInst(CallInst &CI) {
+  IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
+  if (!II) return visitCallSite(&CI);
+  
+  // Intrinsics cannot occur in an invoke, so handle them here instead of in
+  // visitCallSite.
+  if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(II)) {
+    bool Changed = false;
+
+    // memmove/cpy/set of zero bytes is a noop.
+    if (Constant *NumBytes = dyn_cast<Constant>(MI->getLength())) {
+      if (NumBytes->isNullValue()) return EraseInstFromFunction(CI);
+
+      if (ConstantInt *CI = dyn_cast<ConstantInt>(NumBytes))
+        if (CI->getZExtValue() == 1) {
+          // Replace the instruction with just byte operations.  We would
+          // transform other cases to loads/stores, but we don't know if
+          // alignment is sufficient.
+          // NOTE(review): this branch is currently a placeholder; no
+          // transform is performed for 1-byte mem intrinsics here.
+        }
+    }
+
+    // If we have a memmove and the source operation is a constant global,
+    // then the source and dest pointers can't alias, so we can change this
+    // into a call to memcpy.
+    if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(II)) {
+      if (GlobalVariable *GVSrc = dyn_cast<GlobalVariable>(MMI->getSource()))
+        if (GVSrc->isConstant()) {
+          Module *M = CI.getParent()->getParent()->getParent();
+          // Pick the memcpy variant whose length parameter width matches
+          // this memmove's (operand 2 is the length).
+          const char *Name;
+          if (CI.getCalledFunction()->getFunctionType()->getParamType(2) == 
+              Type::Int32Ty)
+            Name = "llvm.memcpy.i32";
+          else
+            Name = "llvm.memcpy.i64";
+          Constant *MemCpy = M->getOrInsertFunction(Name,
+                                     CI.getCalledFunction()->getFunctionType());
+          CI.setOperand(0, MemCpy);
+          Changed = true;
+        }
+    }
+
+    // If we can determine a pointer alignment that is bigger than currently
+    // set, update the alignment.
+    if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
+      unsigned Alignment1 = GetKnownAlignment(MI->getOperand(1), TD);
+      unsigned Alignment2 = GetKnownAlignment(MI->getOperand(2), TD);
+      // Both dest and source must support the alignment, so take the min.
+      unsigned Align = std::min(Alignment1, Alignment2);
+      if (MI->getAlignment()->getZExtValue() < Align) {
+        MI->setAlignment(ConstantInt::get(Type::Int32Ty, Align));
+        Changed = true;
+      }
+    } else if (isa<MemSetInst>(MI)) {
+      unsigned Alignment = GetKnownAlignment(MI->getDest(), TD);
+      if (MI->getAlignment()->getZExtValue() < Alignment) {
+        MI->setAlignment(ConstantInt::get(Type::Int32Ty, Alignment));
+        Changed = true;
+      }
+    }
+          
+    if (Changed) return II;
+  } else {
+    switch (II->getIntrinsicID()) {
+    default: break;
+    case Intrinsic::ppc_altivec_lvx:
+    case Intrinsic::ppc_altivec_lvxl:
+    case Intrinsic::x86_sse_loadu_ps:
+    case Intrinsic::x86_sse2_loadu_pd:
+    case Intrinsic::x86_sse2_loadu_dq:
+      // Turn PPC lvx     -> load if the pointer is known aligned.
+      // Turn X86 loadups -> load if the pointer is known aligned.
+      if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+        Value *Ptr = InsertCastBefore(Instruction::BitCast, II->getOperand(1),
+                                      PointerType::get(II->getType()), CI);
+        return new LoadInst(Ptr);
+      }
+      break;
+    case Intrinsic::ppc_altivec_stvx:
+    case Intrinsic::ppc_altivec_stvxl:
+      // Turn stvx -> store if the pointer is known aligned.
+      // (Operand 1 is the value, operand 2 is the pointer.)
+      if (GetKnownAlignment(II->getOperand(2), TD) >= 16) {
+        const Type *OpPtrTy = PointerType::get(II->getOperand(1)->getType());
+        Value *Ptr = InsertCastBefore(Instruction::BitCast, II->getOperand(2),
+                                      OpPtrTy, CI);
+        return new StoreInst(II->getOperand(1), Ptr);
+      }
+      break;
+    case Intrinsic::x86_sse_storeu_ps:
+    case Intrinsic::x86_sse2_storeu_pd:
+    case Intrinsic::x86_sse2_storeu_dq:
+    case Intrinsic::x86_sse2_storel_dq:
+      // Turn X86 storeu -> store if the pointer is known aligned.
+      // (Operand 1 is the pointer, operand 2 is the value.)
+      if (GetKnownAlignment(II->getOperand(1), TD) >= 16) {
+        const Type *OpPtrTy = PointerType::get(II->getOperand(2)->getType());
+        Value *Ptr = InsertCastBefore(Instruction::BitCast, II->getOperand(1),
+                                      OpPtrTy, CI);
+        return new StoreInst(II->getOperand(2), Ptr);
+      }
+      break;
+      
+    case Intrinsic::x86_sse_cvttss2si: {
+      // These intrinsics only demands the 0th element of its input vector.  If
+      // we can simplify the input based on that, do so now.
+      uint64_t UndefElts;
+      if (Value *V = SimplifyDemandedVectorElts(II->getOperand(1), 1, 
+                                                UndefElts)) {
+        II->setOperand(1, V);
+        return II;
+      }
+      break;
+    }
+      
+    case Intrinsic::ppc_altivec_vperm:
+      // Turn vperm(V1,V2,mask) -> shuffle(V1,V2,mask) if mask is a constant.
+      if (ConstantVector *Mask = dyn_cast<ConstantVector>(II->getOperand(3))) {
+        assert(Mask->getNumOperands() == 16 && "Bad type for intrinsic!");
+        
+        // Check that all of the elements are integer constants or undefs.
+        bool AllEltsOk = true;
+        for (unsigned i = 0; i != 16; ++i) {
+          if (!isa<ConstantInt>(Mask->getOperand(i)) && 
+              !isa<UndefValue>(Mask->getOperand(i))) {
+            AllEltsOk = false;
+            break;
+          }
+        }
+        
+        if (AllEltsOk) {
+          // Cast the input vectors to byte vectors.
+          Value *Op0 = InsertCastBefore(Instruction::BitCast, 
+                                        II->getOperand(1), Mask->getType(), CI);
+          Value *Op1 = InsertCastBefore(Instruction::BitCast,
+                                        II->getOperand(2), Mask->getType(), CI);
+          Value *Result = UndefValue::get(Op0->getType());
+          
+          // Only extract each element once.
+          Value *ExtractedElts[32];
+          memset(ExtractedElts, 0, sizeof(ExtractedElts));
+          
+          for (unsigned i = 0; i != 16; ++i) {
+            if (isa<UndefValue>(Mask->getOperand(i)))
+              continue;
+            unsigned Idx=cast<ConstantInt>(Mask->getOperand(i))->getZExtValue();
+            Idx &= 31;  // Match the hardware behavior.
+            
+            if (ExtractedElts[Idx] == 0) {
+              // Indices 0-15 select from V1, 16-31 from V2.
+              Instruction *Elt = 
+                new ExtractElementInst(Idx < 16 ? Op0 : Op1, Idx&15, "tmp");
+              InsertNewInstBefore(Elt, CI);
+              ExtractedElts[Idx] = Elt;
+            }
+          
+            // Insert this value into the result vector.
+            Result = new InsertElementInst(Result, ExtractedElts[Idx], i,"tmp");
+            InsertNewInstBefore(cast<Instruction>(Result), CI);
+          }
+          return CastInst::create(Instruction::BitCast, Result, CI.getType());
+        }
+      }
+      break;
+
+    case Intrinsic::stackrestore: {
+      // If the save is right next to the restore, remove the restore.  This can
+      // happen when variable allocas are DCE'd.
+      if (IntrinsicInst *SS = dyn_cast<IntrinsicInst>(II->getOperand(1))) {
+        if (SS->getIntrinsicID() == Intrinsic::stacksave) {
+          BasicBlock::iterator BI = SS;
+          if (&*++BI == II)
+            return EraseInstFromFunction(CI);
+        }
+      }
+      
+      // If the stack restore is in a return/unwind block and if there are no
+      // allocas or calls between the restore and the return, nuke the restore.
+      TerminatorInst *TI = II->getParent()->getTerminator();
+      if (isa<ReturnInst>(TI) || isa<UnwindInst>(TI)) {
+        BasicBlock::iterator BI = II;
+        bool CannotRemove = false;
+        for (++BI; &*BI != TI; ++BI) {
+          // Allocas and (non-intrinsic) calls may depend on the current
+          // stack pointer, so the restore must be kept.
+          if (isa<AllocaInst>(BI) ||
+              (isa<CallInst>(BI) && !isa<IntrinsicInst>(BI))) {
+            CannotRemove = true;
+            break;
+          }
+        }
+        if (!CannotRemove)
+          return EraseInstFromFunction(CI);
+      }
+      break;
+    }
+    }
+  }
+
+  // Fall through to the generic call-site transforms.
+  return visitCallSite(II);
+}
+
+// InvokeInst simplification - invokes get the same generic call-site
+// transforms as calls; there is no invoke-specific logic here.
+//
+Instruction *InstCombiner::visitInvokeInst(InvokeInst &II) {
+  // Wrap the invoke in a CallSite and defer to the shared handler.
+  return visitCallSite(CallSite(&II));
+}
+
+// visitCallSite - Improvements for call and invoke instructions.  Handles
+// casted callees, unreachable calls (mismatched calling convention, null or
+// undef callee), and lossless cast elimination on varargs arguments.
+//
+Instruction *InstCombiner::visitCallSite(CallSite CS) {
+  bool Changed = false;
+
+  // If the callee is a constexpr cast of a function, attempt to move the cast
+  // to the arguments of the call/invoke.
+  if (transformConstExprCastCall(CS)) return 0;
+
+  Value *Callee = CS.getCalledValue();
+
+  if (Function *CalleeF = dyn_cast<Function>(Callee))
+    if (CalleeF->getCallingConv() != CS.getCallingConv()) {
+      Instruction *OldCall = CS.getInstruction();
+      // If the call and callee calling conventions don't match, this call must
+      // be unreachable, as the call is undefined.
+      // Mark unreachability with a store to undef (we can't edit the CFG
+      // from here).
+      new StoreInst(ConstantInt::getTrue(),
+                    UndefValue::get(PointerType::get(Type::Int1Ty)), OldCall);
+      if (!OldCall->use_empty())
+        OldCall->replaceAllUsesWith(UndefValue::get(OldCall->getType()));
+      if (isa<CallInst>(OldCall))   // Not worth removing an invoke here.
+        return EraseInstFromFunction(*OldCall);
+      return 0;
+    }
+
+  if (isa<ConstantPointerNull>(Callee) || isa<UndefValue>(Callee)) {
+    // This instruction is not reachable, just remove it.  We insert a store to
+    // undef so that we know that this code is not reachable, despite the fact
+    // that we can't modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(),
+                  UndefValue::get(PointerType::get(Type::Int1Ty)),
+                  CS.getInstruction());
+
+    if (!CS.getInstruction()->use_empty())
+      CS.getInstruction()->
+        replaceAllUsesWith(UndefValue::get(CS.getInstruction()->getType()));
+
+    if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+      // Don't break the CFG, insert a dummy cond branch.
+      new BranchInst(II->getNormalDest(), II->getUnwindDest(),
+                     ConstantInt::getTrue(), II);
+    }
+    return EraseInstFromFunction(*CS.getInstruction());
+  }
+
+  const PointerType *PTy = cast<PointerType>(Callee->getType());
+  const FunctionType *FTy = cast<FunctionType>(PTy->getElementType());
+  if (FTy->isVarArg()) {
+    // See if we can optimize any arguments passed through the varargs area of
+    // the call.  Only the arguments beyond the fixed parameters are eligible.
+    for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(),
+           E = CS.arg_end(); I != E; ++I)
+      if (CastInst *CI = dyn_cast<CastInst>(*I)) {
+        // If this cast does not effect the value passed through the varargs
+        // area, we can eliminate the use of the cast.
+        Value *Op = CI->getOperand(0);
+        if (CI->isLosslessCast()) {
+          *I = Op;
+          Changed = true;
+        }
+      }
+  }
+
+  return Changed ? CS.getInstruction() : 0;
+}
+
+// transformConstExprCastCall - If the callee is a constexpr cast of a function,
+// attempt to move the cast to the arguments of the call/invoke.
+//
+bool InstCombiner::transformConstExprCastCall(CallSite CS) {
+  if (!isa<ConstantExpr>(CS.getCalledValue())) return false;
+  ConstantExpr *CE = cast<ConstantExpr>(CS.getCalledValue());
+  if (CE->getOpcode() != Instruction::BitCast || 
+      !isa<Function>(CE->getOperand(0)))
+    return false;
+  Function *Callee = cast<Function>(CE->getOperand(0));
+  Instruction *Caller = CS.getInstruction();
+
+  // Okay, this is a cast from a function to a different type.  Unless doing so
+  // would cause a type conversion of one of our arguments, change this call to
+  // be a direct call with arguments casted to the appropriate types.
+  //
+  const FunctionType *FT = Callee->getFunctionType();
+  const Type *OldRetTy = Caller->getType();
+
+  const FunctionType *ActualFT =
+    cast<FunctionType>(cast<PointerType>(CE->getType())->getElementType());
+  
+  // If the parameter attributes don't match up, don't do the xform.  We don't
+  // want to lose an sret attribute or something.
+  if (FT->getParamAttrs() != ActualFT->getParamAttrs())
+    return false;
+  
+  // Check to see if we are changing the return type...
+  if (OldRetTy != FT->getReturnType()) {
+    if (Callee->isDeclaration() && !Caller->use_empty() && 
+        // Conversion is ok if changing from pointer to int of same size.
+        !(isa<PointerType>(FT->getReturnType()) &&
+          TD->getIntPtrType() == OldRetTy))
+      return false;   // Cannot transform this return value.
+
+    // If the callsite is an invoke instruction, and the return value is used by
+    // a PHI node in a successor, we cannot change the return type of the call
+    // because there is no place to put the cast instruction (without breaking
+    // the critical edge).  Bail out in this case.
+    if (!Caller->use_empty())
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller))
+        for (Value::use_iterator UI = II->use_begin(), E = II->use_end();
+             UI != E; ++UI)
+          if (PHINode *PN = dyn_cast<PHINode>(*UI))
+            if (PN->getParent() == II->getNormalDest() ||
+                PN->getParent() == II->getUnwindDest())
+              return false;
+  }
+
+  unsigned NumActualArgs = unsigned(CS.arg_end()-CS.arg_begin());
+  unsigned NumCommonArgs = std::min(FT->getNumParams(), NumActualArgs);
+
+  CallSite::arg_iterator AI = CS.arg_begin();
+  for (unsigned i = 0, e = NumCommonArgs; i != e; ++i, ++AI) {
+    const Type *ParamTy = FT->getParamType(i);
+    const Type *ActTy = (*AI)->getType();
+    ConstantInt *c = dyn_cast<ConstantInt>(*AI);
+    //Some conversions are safe even if we do not have a body.
+    //Either we can cast directly, or we can upconvert the argument
+    bool isConvertible = ActTy == ParamTy ||
+      (isa<PointerType>(ParamTy) && isa<PointerType>(ActTy)) ||
+      (ParamTy->isInteger() && ActTy->isInteger() &&
+       ParamTy->getPrimitiveSizeInBits() >= ActTy->getPrimitiveSizeInBits()) ||
+      (c && ParamTy->getPrimitiveSizeInBits() >= ActTy->getPrimitiveSizeInBits()
+       && c->getValue().isStrictlyPositive());
+    if (Callee->isDeclaration() && !isConvertible) return false;
+
+    // Most other conversions can be done if we have a body, even if these
+    // lose information, e.g. int->short.
+    // Some conversions cannot be done at all, e.g. float to pointer.
+    // Logic here parallels CastInst::getCastOpcode (the design there
+    // requires legality checks like this be done before calling it).
+    if (ParamTy->isInteger()) {
+      if (const VectorType *VActTy = dyn_cast<VectorType>(ActTy)) {
+        if (VActTy->getBitWidth() != ParamTy->getPrimitiveSizeInBits())
+          return false;
+      }
+      if (!ActTy->isInteger() && !ActTy->isFloatingPoint() &&
+          !isa<PointerType>(ActTy))
+        return false;
+    } else if (ParamTy->isFloatingPoint()) {
+      if (const VectorType *VActTy = dyn_cast<VectorType>(ActTy)) {
+        if (VActTy->getBitWidth() != ParamTy->getPrimitiveSizeInBits())
+          return false;
+      }
+      if (!ActTy->isInteger() && !ActTy->isFloatingPoint())
+        return false;
+    } else if (const VectorType *VParamTy = dyn_cast<VectorType>(ParamTy)) {
+      if (const VectorType *VActTy = dyn_cast<VectorType>(ActTy)) {
+        if (VActTy->getBitWidth() != VParamTy->getBitWidth())
+          return false;
+      }
+      if (VParamTy->getBitWidth() != ActTy->getPrimitiveSizeInBits())      
+        return false;
+    } else if (isa<PointerType>(ParamTy)) {
+      if (!ActTy->isInteger() && !isa<PointerType>(ActTy))
+        return false;
+    } else {
+      return false;
+    }
+  }
+
+  if (FT->getNumParams() < NumActualArgs && !FT->isVarArg() &&
+      Callee->isDeclaration())
+    return false;   // Do not delete arguments unless we have a function body...
+
+  // Okay, we decided that this is a safe thing to do: go ahead and start
+  // inserting cast instructions as necessary...
+  std::vector<Value*> Args;
+  Args.reserve(NumActualArgs);
+
+  AI = CS.arg_begin();
+  for (unsigned i = 0; i != NumCommonArgs; ++i, ++AI) {
+    const Type *ParamTy = FT->getParamType(i);
+    if ((*AI)->getType() == ParamTy) {
+      Args.push_back(*AI);
+    } else {
+      Instruction::CastOps opcode = CastInst::getCastOpcode(*AI,
+          false, ParamTy, false);
+      CastInst *NewCast = CastInst::create(opcode, *AI, ParamTy, "tmp");
+      Args.push_back(InsertNewInstBefore(NewCast, *Caller));
+    }
+  }
+
+  // If the function takes more arguments than the call was taking, add them
+  // now...
+  for (unsigned i = NumCommonArgs; i != FT->getNumParams(); ++i)
+    Args.push_back(Constant::getNullValue(FT->getParamType(i)));
+
+  // If we are removing arguments to the function, emit an obnoxious warning...
+  if (FT->getNumParams() < NumActualArgs)
+    if (!FT->isVarArg()) {
+      cerr << "WARNING: While resolving call to function '"
+           << Callee->getName() << "' arguments were dropped!\n";
+    } else {
+      // Add all of the arguments in their promoted form to the arg list...
+      for (unsigned i = FT->getNumParams(); i != NumActualArgs; ++i, ++AI) {
+        const Type *PTy = getPromotedType((*AI)->getType());
+        if (PTy != (*AI)->getType()) {
+          // Must promote to pass through va_arg area!
+          Instruction::CastOps opcode = CastInst::getCastOpcode(*AI, false, 
+                                                                PTy, false);
+          Instruction *Cast = CastInst::create(opcode, *AI, PTy, "tmp");
+          InsertNewInstBefore(Cast, *Caller);
+          Args.push_back(Cast);
+        } else {
+          Args.push_back(*AI);
+        }
+      }
+    }
+
+  if (FT->getReturnType() == Type::VoidTy)
+    Caller->setName("");   // Void type should not have a name.
+
+  Instruction *NC;
+  if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+    NC = new InvokeInst(Callee, II->getNormalDest(), II->getUnwindDest(),
+                        &Args[0], Args.size(), Caller->getName(), Caller);
+    cast<InvokeInst>(II)->setCallingConv(II->getCallingConv());
+  } else {
+    NC = new CallInst(Callee, &Args[0], Args.size(), Caller->getName(), Caller);
+    if (cast<CallInst>(Caller)->isTailCall())
+      cast<CallInst>(NC)->setTailCall();
+   cast<CallInst>(NC)->setCallingConv(cast<CallInst>(Caller)->getCallingConv());
+  }
+
+  // Insert a cast of the return type as necessary.
+  Value *NV = NC;
+  if (Caller->getType() != NV->getType() && !Caller->use_empty()) {
+    if (NV->getType() != Type::VoidTy) {
+      const Type *CallerTy = Caller->getType();
+      Instruction::CastOps opcode = CastInst::getCastOpcode(NC, false, 
+                                                            CallerTy, false);
+      NV = NC = CastInst::create(opcode, NC, CallerTy, "tmp");
+
+      // If this is an invoke instruction, we should insert it after the first
+      // non-phi, instruction in the normal successor block.
+      if (InvokeInst *II = dyn_cast<InvokeInst>(Caller)) {
+        BasicBlock::iterator I = II->getNormalDest()->begin();
+        while (isa<PHINode>(I)) ++I;
+        InsertNewInstBefore(NC, *I);
+      } else {
+        // Otherwise, it's a call, just insert cast right after the call instr
+        InsertNewInstBefore(NC, *Caller);
+      }
+      AddUsersToWorkList(*Caller);
+    } else {
+      NV = UndefValue::get(Caller->getType());
+    }
+  }
+
+  if (Caller->getType() != Type::VoidTy && !Caller->use_empty())
+    Caller->replaceAllUsesWith(NV);
+  Caller->eraseFromParent();
+  RemoveFromWorkList(Caller);
+  return true;
+}
+
/// FoldPHIArgBinOpIntoPHI - If we have something like phi [add (a,b), add(c,d)]
/// and if a/b/c/d and the add's all have a single use, turn this into two phi's
/// and a single binop.
Instruction *InstCombiner::FoldPHIArgBinOpIntoPHI(PHINode &PN) {
  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));
  assert(isa<BinaryOperator>(FirstInst) || isa<GetElementPtrInst>(FirstInst) ||
         isa<CmpInst>(FirstInst));
  unsigned Opc = FirstInst->getOpcode();
  // LHSVal/RHSVal hold the operand shared by every incoming instruction; they
  // are nulled out below if that operand differs between predecessors, which
  // later triggers creation of a dedicated PHI for it.
  Value *LHSVal = FirstInst->getOperand(0);
  Value *RHSVal = FirstInst->getOperand(1);
    
  const Type *LHSType = LHSVal->getType();
  const Type *RHSType = RHSVal->getType();
  
  // Scan to see if all operands are the same opcode, all have one use, and all
  // kill their operands (i.e. the operands have one use).
  for (unsigned i = 0; i != PN.getNumIncomingValues(); ++i) {
    Instruction *I = dyn_cast<Instruction>(PN.getIncomingValue(i));
    if (!I || I->getOpcode() != Opc || !I->hasOneUse() ||
        // Verify type of the LHS matches so we don't fold cmp's of different
        // types or GEP's with different index types.
        I->getOperand(0)->getType() != LHSType ||
        I->getOperand(1)->getType() != RHSType)
      return 0;

    // If they are CmpInst instructions, check their predicates
    if (Opc == Instruction::ICmp || Opc == Instruction::FCmp)
      if (cast<CmpInst>(I)->getPredicate() !=
          cast<CmpInst>(FirstInst)->getPredicate())
        return 0;
    
    // Keep track of which operand needs a phi node.
    if (I->getOperand(0) != LHSVal) LHSVal = 0;
    if (I->getOperand(1) != RHSVal) RHSVal = 0;
  }
  
  // Otherwise, this is safe to transform, determine if it is profitable.

  // If this is a GEP, and if the index (not the pointer) needs a PHI, bail out.
  // Indexes are often folded into load/store instructions, so we don't want to
  // hide them behind a phi.
  if (isa<GetElementPtrInst>(FirstInst) && RHSVal == 0)
    return 0;
  
  // For each operand that varies across predecessors, build a PHI seeded with
  // the first predecessor's value; the remaining entries are added below.
  Value *InLHS = FirstInst->getOperand(0);
  Value *InRHS = FirstInst->getOperand(1);
  PHINode *NewLHS = 0, *NewRHS = 0;
  if (LHSVal == 0) {
    NewLHS = new PHINode(LHSType, FirstInst->getOperand(0)->getName()+".pn");
    NewLHS->reserveOperandSpace(PN.getNumOperands()/2);
    NewLHS->addIncoming(InLHS, PN.getIncomingBlock(0));
    InsertNewInstBefore(NewLHS, PN);
    LHSVal = NewLHS;
  }
  
  if (RHSVal == 0) {
    NewRHS = new PHINode(RHSType, FirstInst->getOperand(1)->getName()+".pn");
    NewRHS->reserveOperandSpace(PN.getNumOperands()/2);
    NewRHS->addIncoming(InRHS, PN.getIncomingBlock(0));
    InsertNewInstBefore(NewRHS, PN);
    RHSVal = NewRHS;
  }
  
  // Add all operands to the new PHIs.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    if (NewLHS) {
      Value *NewInLHS =cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
      NewLHS->addIncoming(NewInLHS, PN.getIncomingBlock(i));
    }
    if (NewRHS) {
      Value *NewInRHS =cast<Instruction>(PN.getIncomingValue(i))->getOperand(1);
      NewRHS->addIncoming(NewInRHS, PN.getIncomingBlock(i));
    }
  }
    
  // Create the single replacement operation over the (possibly PHI'd)
  // operands.  Note it is returned uninserted.
  if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
    return BinaryOperator::create(BinOp->getOpcode(), LHSVal, RHSVal);
  else if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
    return CmpInst::create(CIOp->getOpcode(), CIOp->getPredicate(), LHSVal, 
                           RHSVal);
  else {
    assert(isa<GetElementPtrInst>(FirstInst));
    return new GetElementPtrInst(LHSVal, RHSVal);
  }
}
+
+/// isSafeToSinkLoad - Return true if we know that it is safe sink the load out
+/// of the block that defines it.  This means that it must be obvious the value
+/// of the load is not changed from the point of the load to the end of the
+/// block it is in.
+///
+/// Finally, it is safe, but not profitable, to sink a load targetting a
+/// non-address-taken alloca.  Doing so will cause us to not promote the alloca
+/// to a register.
+static bool isSafeToSinkLoad(LoadInst *L) {
+  BasicBlock::iterator BBI = L, E = L->getParent()->end();
+  
+  for (++BBI; BBI != E; ++BBI)
+    if (BBI->mayWriteToMemory())
+      return false;
+  
+  // Check for non-address taken alloca.  If not address-taken already, it isn't
+  // profitable to do this xform.
+  if (AllocaInst *AI = dyn_cast<AllocaInst>(L->getOperand(0))) {
+    bool isAddressTaken = false;
+    for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+         UI != E; ++UI) {
+      if (isa<LoadInst>(UI)) continue;
+      if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+        // If storing TO the alloca, then the address isn't taken.
+        if (SI->getOperand(1) == AI) continue;
+      }
+      isAddressTaken = true;
+      break;
+    }
+    
+    if (!isAddressTaken)
+      return false;
+  }
+  
+  return true;
+}
+
+
// FoldPHIArgOpIntoPHI - If all operands to a PHI node are the same "unary"
// operator and they all are only used by the PHI, PHI together their
// inputs, and do the operation once, to the result of the PHI.
Instruction *InstCombiner::FoldPHIArgOpIntoPHI(PHINode &PN) {
  Instruction *FirstInst = cast<Instruction>(PN.getIncomingValue(0));

  // Scan the instruction, looking for input operations that can be folded away.
  // If all input operands to the phi are the same instruction (e.g. a cast from
  // the same type or "+42") we can pull the operation through the PHI, reducing
  // code size and simplifying code.
  Constant *ConstantOp = 0;
  const Type *CastSrcTy = 0;
  bool isVolatile = false;
  if (isa<CastInst>(FirstInst)) {
    CastSrcTy = FirstInst->getOperand(0)->getType();
  } else if (isa<BinaryOperator>(FirstInst) || isa<CmpInst>(FirstInst)) {
    // Can fold binop, compare or shift here if the RHS is a constant, 
    // otherwise call FoldPHIArgBinOpIntoPHI.
    ConstantOp = dyn_cast<Constant>(FirstInst->getOperand(1));
    if (ConstantOp == 0)
      return FoldPHIArgBinOpIntoPHI(PN);
  } else if (LoadInst *LI = dyn_cast<LoadInst>(FirstInst)) {
    isVolatile = LI->isVolatile();
    // We can't sink the load if the loaded value could be modified between the
    // load and the PHI.
    if (LI->getParent() != PN.getIncomingBlock(0) ||
        !isSafeToSinkLoad(LI))
      return 0;
  } else if (isa<GetElementPtrInst>(FirstInst)) {
    // A two-operand GEP is handled like a binary operator.
    if (FirstInst->getNumOperands() == 2)
      return FoldPHIArgBinOpIntoPHI(PN);
    // Can't handle general GEPs yet.
    return 0;
  } else {
    return 0;  // Cannot fold this operation.
  }

  // Check to see if all arguments are the same operation.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    if (!isa<Instruction>(PN.getIncomingValue(i))) return 0;
    Instruction *I = cast<Instruction>(PN.getIncomingValue(i));
    if (!I->hasOneUse() || !I->isSameOperationAs(FirstInst))
      return 0;
    if (CastSrcTy) {
      if (I->getOperand(0)->getType() != CastSrcTy)
        return 0;  // Cast operation must match.
    } else if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
      // We can't sink the load if the loaded value could be modified between 
      // the load and the PHI.
      if (LI->isVolatile() != isVolatile ||
          LI->getParent() != PN.getIncomingBlock(i) ||
          !isSafeToSinkLoad(LI))
        return 0;
    } else if (I->getOperand(1) != ConstantOp) {
      return 0;
    }
  }

  // Okay, they are all the same operation.  Create a new PHI node of the
  // correct type, and PHI together all of the LHS's of the instructions.
  PHINode *NewPN = new PHINode(FirstInst->getOperand(0)->getType(),
                               PN.getName()+".in");
  NewPN->reserveOperandSpace(PN.getNumOperands()/2);

  Value *InVal = FirstInst->getOperand(0);
  NewPN->addIncoming(InVal, PN.getIncomingBlock(0));

  // Add all operands to the new PHI.  InVal is nulled out as soon as two
  // predecessors disagree on the input operand.
  for (unsigned i = 1, e = PN.getNumIncomingValues(); i != e; ++i) {
    Value *NewInVal = cast<Instruction>(PN.getIncomingValue(i))->getOperand(0);
    if (NewInVal != InVal)
      InVal = 0;
    NewPN->addIncoming(NewInVal, PN.getIncomingBlock(i));
  }

  Value *PhiVal;
  if (InVal) {
    // The new PHI unions all of the same values together.  This is really
    // common, so we handle it intelligently here for compile-time speed.
    PhiVal = InVal;
    // NewPN was never inserted into a function, so plain delete (not
    // eraseFromParent) is required here.
    delete NewPN;
  } else {
    InsertNewInstBefore(NewPN, PN);
    PhiVal = NewPN;
  }

  // Insert and return the new operation.
  if (CastInst* FirstCI = dyn_cast<CastInst>(FirstInst))
    return CastInst::create(FirstCI->getOpcode(), PhiVal, PN.getType());
  else if (isa<LoadInst>(FirstInst))
    return new LoadInst(PhiVal, "", isVolatile);
  else if (BinaryOperator *BinOp = dyn_cast<BinaryOperator>(FirstInst))
    return BinaryOperator::create(BinOp->getOpcode(), PhiVal, ConstantOp);
  else if (CmpInst *CIOp = dyn_cast<CmpInst>(FirstInst))
    return CmpInst::create(CIOp->getOpcode(), CIOp->getPredicate(), 
                           PhiVal, ConstantOp);
  else
    assert(0 && "Unknown operation");
  return 0;
}
+
+/// DeadPHICycle - Return true if this PHI node is only used by a PHI node cycle
+/// that is dead.
+static bool DeadPHICycle(PHINode *PN,
+                         SmallPtrSet<PHINode*, 16> &PotentiallyDeadPHIs) {
+  if (PN->use_empty()) return true;
+  if (!PN->hasOneUse()) return false;
+
+  // Remember this node, and if we find the cycle, return.
+  if (!PotentiallyDeadPHIs.insert(PN))
+    return true;
+
+  if (PHINode *PU = dyn_cast<PHINode>(PN->use_back()))
+    return DeadPHICycle(PU, PotentiallyDeadPHIs);
+
+  return false;
+}
+
// PHINode simplification
//
// Returns a non-null replacement/updated instruction if a transform was
// performed, or null if PN was left alone.
//
Instruction *InstCombiner::visitPHINode(PHINode &PN) {
  // If LCSSA is around, don't mess with Phi nodes
  if (MustPreserveLCSSA) return 0;
  
  // If the PHI computes a single value, just use that value directly.
  if (Value *V = PN.hasConstantValue())
    return ReplaceInstUsesWith(PN, V);

  // If all PHI operands are the same operation, pull them through the PHI,
  // reducing code size.
  if (isa<Instruction>(PN.getIncomingValue(0)) &&
      PN.getIncomingValue(0)->hasOneUse())
    if (Instruction *Result = FoldPHIArgOpIntoPHI(PN))
      return Result;

  // If this is a trivial cycle in the PHI node graph, remove it.  Basically, if
  // this PHI only has a single use (a PHI), and if that PHI only has one use (a
  // PHI)... break the cycle.
  if (PN.hasOneUse()) {
    Instruction *PHIUser = cast<Instruction>(PN.use_back());
    if (PHINode *PU = dyn_cast<PHINode>(PHIUser)) {
      SmallPtrSet<PHINode*, 16> PotentiallyDeadPHIs;
      PotentiallyDeadPHIs.insert(&PN);
      // The whole cycle is dead: replace PN with undef.
      if (DeadPHICycle(PU, PotentiallyDeadPHIs))
        return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
    }
   
    // If this phi has a single use, and if that use just computes a value for
    // the next iteration of a loop, delete the phi.  This occurs with unused
    // induction variables, e.g. "for (int j = 0; ; ++j);".  Detecting this
    // common case here is good because the only other things that catch this
    // are induction variable analysis (sometimes) and ADCE, which is only run
    // late.
    if (PHIUser->hasOneUse() &&
        (isa<BinaryOperator>(PHIUser) || isa<GetElementPtrInst>(PHIUser)) &&
        PHIUser->use_back() == &PN) {
      return ReplaceInstUsesWith(PN, UndefValue::get(PN.getType()));
    }
  }

  return 0;
}
+
+static Value *InsertCastToIntPtrTy(Value *V, const Type *DTy,
+                                   Instruction *InsertPoint,
+                                   InstCombiner *IC) {
+  unsigned PtrSize = DTy->getPrimitiveSizeInBits();
+  unsigned VTySize = V->getType()->getPrimitiveSizeInBits();
+  // We must cast correctly to the pointer type. Ensure that we
+  // sign extend the integer value if it is smaller as this is
+  // used for address computation.
+  Instruction::CastOps opcode = 
+     (VTySize < PtrSize ? Instruction::SExt :
+      (VTySize == PtrSize ? Instruction::BitCast : Instruction::Trunc));
+  return IC->InsertCastBefore(opcode, V, DTy, *InsertPoint);
+}
+
+
+Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
+  Value *PtrOp = GEP.getOperand(0);
+  // Is it 'getelementptr %P, i32 0'  or 'getelementptr %P'
+  // If so, eliminate the noop.
+  if (GEP.getNumOperands() == 1)
+    return ReplaceInstUsesWith(GEP, PtrOp);
+
+  if (isa<UndefValue>(GEP.getOperand(0)))
+    return ReplaceInstUsesWith(GEP, UndefValue::get(GEP.getType()));
+
+  bool HasZeroPointerIndex = false;
+  if (Constant *C = dyn_cast<Constant>(GEP.getOperand(1)))
+    HasZeroPointerIndex = C->isNullValue();
+
+  if (GEP.getNumOperands() == 2 && HasZeroPointerIndex)
+    return ReplaceInstUsesWith(GEP, PtrOp);
+
+  // Eliminate unneeded casts for indices.
+  bool MadeChange = false;
+  
+  gep_type_iterator GTI = gep_type_begin(GEP);
+  for (unsigned i = 1, e = GEP.getNumOperands(); i != e; ++i, ++GTI) {
+    if (isa<SequentialType>(*GTI)) {
+      if (CastInst *CI = dyn_cast<CastInst>(GEP.getOperand(i))) {
+        if (CI->getOpcode() == Instruction::ZExt ||
+            CI->getOpcode() == Instruction::SExt) {
+          const Type *SrcTy = CI->getOperand(0)->getType();
+          // We can eliminate a cast from i32 to i64 iff the target 
+          // is a 32-bit pointer target.
+          if (SrcTy->getPrimitiveSizeInBits() >= TD->getPointerSizeInBits()) {
+            MadeChange = true;
+            GEP.setOperand(i, CI->getOperand(0));
+          }
+        }
+      }
+      // If we are using a wider index than needed for this platform, shrink it
+      // to what we need.  If the incoming value needs a cast instruction,
+      // insert it.  This explicit cast can make subsequent optimizations more
+      // obvious.
+      Value *Op = GEP.getOperand(i);
+      if (TD->getTypeSize(Op->getType()) > TD->getPointerSize())
+        if (Constant *C = dyn_cast<Constant>(Op)) {
+          GEP.setOperand(i, ConstantExpr::getTrunc(C, TD->getIntPtrType()));
+          MadeChange = true;
+        } else {
+          Op = InsertCastBefore(Instruction::Trunc, Op, TD->getIntPtrType(),
+                                GEP);
+          GEP.setOperand(i, Op);
+          MadeChange = true;
+        }
+    }
+  }
+  if (MadeChange) return &GEP;
+
+  // If this GEP instruction doesn't move the pointer, and if the input operand
+  // is a bitcast of another pointer, just replace the GEP with a bitcast of the
+  // real input to the dest type.
+  if (GEP.hasAllZeroIndices() && isa<BitCastInst>(GEP.getOperand(0)))
+    return new BitCastInst(cast<BitCastInst>(GEP.getOperand(0))->getOperand(0),
+                           GEP.getType());
+    
+  // Combine Indices - If the source pointer to this getelementptr instruction
+  // is a getelementptr instruction, combine the indices of the two
+  // getelementptr instructions into a single instruction.
+  //
+  SmallVector<Value*, 8> SrcGEPOperands;
+  if (User *Src = dyn_castGetElementPtr(PtrOp))
+    SrcGEPOperands.append(Src->op_begin(), Src->op_end());
+
+  if (!SrcGEPOperands.empty()) {
+    // Note that if our source is a gep chain itself that we wait for that
+    // chain to be resolved before we perform this transformation.  This
+    // avoids us creating a TON of code in some cases.
+    //
+    if (isa<GetElementPtrInst>(SrcGEPOperands[0]) &&
+        cast<Instruction>(SrcGEPOperands[0])->getNumOperands() == 2)
+      return 0;   // Wait until our source is folded to completion.
+
+    SmallVector<Value*, 8> Indices;
+
+    // Find out whether the last index in the source GEP is a sequential idx.
+    bool EndsWithSequential = false;
+    for (gep_type_iterator I = gep_type_begin(*cast<User>(PtrOp)),
+           E = gep_type_end(*cast<User>(PtrOp)); I != E; ++I)
+      EndsWithSequential = !isa<StructType>(*I);
+
+    // Can we combine the two pointer arithmetics offsets?
+    if (EndsWithSequential) {
+      // Replace: gep (gep %P, long B), long A, ...
+      // With:    T = long A+B; gep %P, T, ...
+      //
+      Value *Sum, *SO1 = SrcGEPOperands.back(), *GO1 = GEP.getOperand(1);
+      if (SO1 == Constant::getNullValue(SO1->getType())) {
+        Sum = GO1;
+      } else if (GO1 == Constant::getNullValue(GO1->getType())) {
+        Sum = SO1;
+      } else {
+        // If they aren't the same type, convert both to an integer of the
+        // target's pointer size.
+        if (SO1->getType() != GO1->getType()) {
+          if (Constant *SO1C = dyn_cast<Constant>(SO1)) {
+            SO1 = ConstantExpr::getIntegerCast(SO1C, GO1->getType(), true);
+          } else if (Constant *GO1C = dyn_cast<Constant>(GO1)) {
+            GO1 = ConstantExpr::getIntegerCast(GO1C, SO1->getType(), true);
+          } else {
+            unsigned PS = TD->getPointerSize();
+            if (TD->getTypeSize(SO1->getType()) == PS) {
+              // Convert GO1 to SO1's type.
+              GO1 = InsertCastToIntPtrTy(GO1, SO1->getType(), &GEP, this);
+
+            } else if (TD->getTypeSize(GO1->getType()) == PS) {
+              // Convert SO1 to GO1's type.
+              SO1 = InsertCastToIntPtrTy(SO1, GO1->getType(), &GEP, this);
+            } else {
+              const Type *PT = TD->getIntPtrType();
+              SO1 = InsertCastToIntPtrTy(SO1, PT, &GEP, this);
+              GO1 = InsertCastToIntPtrTy(GO1, PT, &GEP, this);
+            }
+          }
+        }
+        if (isa<Constant>(SO1) && isa<Constant>(GO1))
+          Sum = ConstantExpr::getAdd(cast<Constant>(SO1), cast<Constant>(GO1));
+        else {
+          Sum = BinaryOperator::createAdd(SO1, GO1, PtrOp->getName()+".sum");
+          InsertNewInstBefore(cast<Instruction>(Sum), GEP);
+        }
+      }
+
+      // Recycle the GEP we already have if possible.
+      if (SrcGEPOperands.size() == 2) {
+        GEP.setOperand(0, SrcGEPOperands[0]);
+        GEP.setOperand(1, Sum);
+        return &GEP;
+      } else {
+        Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+                       SrcGEPOperands.end()-1);
+        Indices.push_back(Sum);
+        Indices.insert(Indices.end(), GEP.op_begin()+2, GEP.op_end());
+      }
+    } else if (isa<Constant>(*GEP.idx_begin()) &&
+               cast<Constant>(*GEP.idx_begin())->isNullValue() &&
+               SrcGEPOperands.size() != 1) {
+      // Otherwise we can do the fold if the first index of the GEP is a zero
+      Indices.insert(Indices.end(), SrcGEPOperands.begin()+1,
+                     SrcGEPOperands.end());
+      Indices.insert(Indices.end(), GEP.idx_begin()+1, GEP.idx_end());
+    }
+
+    if (!Indices.empty())
+      return new GetElementPtrInst(SrcGEPOperands[0], &Indices[0],
+                                   Indices.size(), GEP.getName());
+
+  } else if (GlobalValue *GV = dyn_cast<GlobalValue>(PtrOp)) {
+    // GEP of global variable.  If all of the indices for this GEP are
+    // constants, we can promote this to a constexpr instead of an instruction.
+
+    // Scan for nonconstants...
+    SmallVector<Constant*, 8> Indices;
+    User::op_iterator I = GEP.idx_begin(), E = GEP.idx_end();
+    for (; I != E && isa<Constant>(*I); ++I)
+      Indices.push_back(cast<Constant>(*I));
+
+    if (I == E) {  // If they are all constants...
+      Constant *CE = ConstantExpr::getGetElementPtr(GV,
+                                                    &Indices[0],Indices.size());
+
+      // Replace all uses of the GEP with the new constexpr...
+      return ReplaceInstUsesWith(GEP, CE);
+    }
+  } else if (Value *X = getBitCastOperand(PtrOp)) {  // Is the operand a cast?
+    if (!isa<PointerType>(X->getType())) {
+      // Not interesting.  Source pointer must be a cast from pointer.
+    } else if (HasZeroPointerIndex) {
+      // transform: GEP (cast [10 x ubyte]* X to [0 x ubyte]*), long 0, ...
+      // into     : GEP [10 x ubyte]* X, long 0, ...
+      //
+      // This occurs when the program declares an array extern like "int X[];"
+      //
+      const PointerType *CPTy = cast<PointerType>(PtrOp->getType());
+      const PointerType *XTy = cast<PointerType>(X->getType());
+      if (const ArrayType *XATy =
+          dyn_cast<ArrayType>(XTy->getElementType()))
+        if (const ArrayType *CATy =
+            dyn_cast<ArrayType>(CPTy->getElementType()))
+          if (CATy->getElementType() == XATy->getElementType()) {
+            // At this point, we know that the cast source type is a pointer
+            // to an array of the same type as the destination pointer
+            // array.  Because the array type is never stepped over (there
+            // is a leading zero) we can fold the cast into this GEP.
+            GEP.setOperand(0, X);
+            return &GEP;
+          }
+    } else if (GEP.getNumOperands() == 2) {
+      // Transform things like:
+      // %t = getelementptr ubyte* cast ([2 x int]* %str to uint*), uint %V
+      // into:  %t1 = getelementptr [2 x int*]* %str, int 0, uint %V; cast
+      const Type *SrcElTy = cast<PointerType>(X->getType())->getElementType();
+      const Type *ResElTy=cast<PointerType>(PtrOp->getType())->getElementType();
+      if (isa<ArrayType>(SrcElTy) &&
+          TD->getTypeSize(cast<ArrayType>(SrcElTy)->getElementType()) ==
+          TD->getTypeSize(ResElTy)) {
+        Value *V = InsertNewInstBefore(
+               new GetElementPtrInst(X, Constant::getNullValue(Type::Int32Ty),
+                                     GEP.getOperand(1), GEP.getName()), GEP);
+        // V and GEP are both pointer types --> BitCast
+        return new BitCastInst(V, GEP.getType());
+      }
+      
+      // Transform things like:
+      // getelementptr sbyte* cast ([100 x double]* X to sbyte*), int %tmp
+      //   (where tmp = 8*tmp2) into:
+      // getelementptr [100 x double]* %arr, int 0, int %tmp.2
+      
+      if (isa<ArrayType>(SrcElTy) &&
+          (ResElTy == Type::Int8Ty || ResElTy == Type::Int8Ty)) {
+        uint64_t ArrayEltSize =
+            TD->getTypeSize(cast<ArrayType>(SrcElTy)->getElementType());
+        
+        // Check to see if "tmp" is a scale by a multiple of ArrayEltSize.  We
+        // allow either a mul, shift, or constant here.
+        Value *NewIdx = 0;
+        ConstantInt *Scale = 0;
+        if (ArrayEltSize == 1) {
+          NewIdx = GEP.getOperand(1);
+          Scale = ConstantInt::get(NewIdx->getType(), 1);
+        } else if (ConstantInt *CI = dyn_cast<ConstantInt>(GEP.getOperand(1))) {
+          NewIdx = ConstantInt::get(CI->getType(), 1);
+          Scale = CI;
+        } else if (Instruction *Inst =dyn_cast<Instruction>(GEP.getOperand(1))){
+          if (Inst->getOpcode() == Instruction::Shl &&
+              isa<ConstantInt>(Inst->getOperand(1))) {
+            ConstantInt *ShAmt = cast<ConstantInt>(Inst->getOperand(1));
+            uint32_t ShAmtVal = ShAmt->getLimitedValue(64);
+            Scale = ConstantInt::get(Inst->getType(), 1ULL << ShAmtVal);
+            NewIdx = Inst->getOperand(0);
+          } else if (Inst->getOpcode() == Instruction::Mul &&
+                     isa<ConstantInt>(Inst->getOperand(1))) {
+            Scale = cast<ConstantInt>(Inst->getOperand(1));
+            NewIdx = Inst->getOperand(0);
+          }
+        }
+
+        // If the index will be to exactly the right offset with the scale taken
+        // out, perform the transformation.
+        if (Scale && Scale->getZExtValue() % ArrayEltSize == 0) {
+          if (isa<ConstantInt>(Scale))
+            Scale = ConstantInt::get(Scale->getType(),
+                                      Scale->getZExtValue() / ArrayEltSize);
+          if (Scale->getZExtValue() != 1) {
+            Constant *C = ConstantExpr::getIntegerCast(Scale, NewIdx->getType(),
+                                                       true /*SExt*/);
+            Instruction *Sc = BinaryOperator::createMul(NewIdx, C, "idxscale");
+            NewIdx = InsertNewInstBefore(Sc, GEP);
+          }
+
+          // Insert the new GEP instruction.
+          Instruction *NewGEP =
+            new GetElementPtrInst(X, Constant::getNullValue(Type::Int32Ty),
+                                  NewIdx, GEP.getName());
+          NewGEP = InsertNewInstBefore(NewGEP, GEP);
+          // The NewGEP must be pointer typed, so must the old one -> BitCast
+          return new BitCastInst(NewGEP, GEP.getType());
+        }
+      }
+    }
+  }
+
+  return 0;
+}
+
/// visitAllocationInst - Simplify malloc/alloca instructions: fold a constant
/// array count into the allocated type, and fold away degenerate allocations
/// (undef count, zero-byte allocas).
Instruction *InstCombiner::visitAllocationInst(AllocationInst &AI) {
  // Convert: malloc Ty, C - where C is a constant != 1 into: malloc [C x Ty], 1
  if (AI.isArrayAllocation())    // Check C != 1
    if (const ConstantInt *C = dyn_cast<ConstantInt>(AI.getArraySize())) {
      // Wrap the element type in an array sized by the constant count.
      const Type *NewTy = 
        ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
      AllocationInst *New = 0;

      // Create and insert the replacement instruction, preserving the
      // original alignment and name.
      if (isa<MallocInst>(AI))
        New = new MallocInst(NewTy, 0, AI.getAlignment(), AI.getName());
      else {
        assert(isa<AllocaInst>(AI) && "Unknown type of allocation inst!");
        New = new AllocaInst(NewTy, 0, AI.getAlignment(), AI.getName());
      }

      InsertNewInstBefore(New, AI);

      // Scan to the end of the allocation instructions, to skip over a block of
      // allocas if possible...
      //
      BasicBlock::iterator It = New;
      while (isa<AllocationInst>(*It)) ++It;

      // Now that It is pointing to the first non-allocation-inst in the block,
      // insert our getelementptr instruction, which decays the [C x Ty]*
      // back to a Ty* pointing at element zero.
      //
      Value *NullIdx = Constant::getNullValue(Type::Int32Ty);
      Value *V = new GetElementPtrInst(New, NullIdx, NullIdx,
                                       New->getName()+".sub", It);

      // Now make everything use the getelementptr instead of the original
      // allocation.
      return ReplaceInstUsesWith(AI, V);
    } else if (isa<UndefValue>(AI.getArraySize())) {
      // Allocating an undef number of elements: the result cannot be validly
      // used, so replace it with a null pointer.
      return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));
    }

  // If alloca'ing a zero byte object, replace the alloca with a null pointer.
  // Note that we only do this for alloca's, because malloc should allocate and
  // return a unique pointer, even for a zero byte allocation.
  if (isa<AllocaInst>(AI) && AI.getAllocatedType()->isSized() &&
      TD->getTypeSize(AI.getAllocatedType()) == 0)
    return ReplaceInstUsesWith(AI, Constant::getNullValue(AI.getType()));

  return 0;
}
+
+Instruction *InstCombiner::visitFreeInst(FreeInst &FI) {
+  Value *Op = FI.getOperand(0);
+
+  // free undef -> unreachable.
+  if (isa<UndefValue>(Op)) {
+    // Insert a new store to null because we cannot modify the CFG here.
+    new StoreInst(ConstantInt::getTrue(),
+                  UndefValue::get(PointerType::get(Type::Int1Ty)), &FI);
+    return EraseInstFromFunction(FI);
+  }
+  
+  // If we have 'free null' delete the instruction.  This can happen in stl code
+  // when lots of inlining happens.
+  if (isa<ConstantPointerNull>(Op))
+    return EraseInstFromFunction(FI);
+  
+  // Change free <ty>* (cast <ty2>* X to <ty>*) into free <ty2>* X
+  if (BitCastInst *CI = dyn_cast<BitCastInst>(Op)) {
+    FI.setOperand(0, CI->getOperand(0));
+    return &FI;
+  }
+  
+  // Change free (gep X, 0,0,0,0) into free(X)
+  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op)) {
+    if (GEPI->hasAllZeroIndices()) {
+      AddToWorkList(GEPI);
+      FI.setOperand(0, GEPI->getOperand(0));
+      return &FI;
+    }
+  }
+  
+  // Change free(malloc) into nothing, if the malloc has a single use.
+  if (MallocInst *MI = dyn_cast<MallocInst>(Op))
+    if (MI->hasOneUse()) {
+      EraseInstFromFunction(FI);
+      return EraseInstFromFunction(*MI);
+    }
+
+  return 0;
+}
+
+
/// InstCombineLoadCast - Fold 'load (cast P)' -> cast (load P)' when possible.
/// Only fires when the source and destination pointee types are same-width
/// integers, pointers, or vectors, so the memory access itself is unchanged.
static Instruction *InstCombineLoadCast(InstCombiner &IC, LoadInst &LI) {
  // Caller guarantees the operand is a cast (CastInst or cast constantexpr).
  User *CI = cast<User>(LI.getOperand(0));
  Value *CastOp = CI->getOperand(0);

  const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
  if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
    const Type *SrcPTy = SrcTy->getElementType();

    if (DestPTy->isInteger() || isa<PointerType>(DestPTy) || 
         isa<VectorType>(DestPTy)) {
      // If the source is an array, the code below will not succeed.  Check to
      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
      // constants.
      if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
          if (ASrcTy->getNumElements() != 0) {
            Value *Idxs[2];
            Idxs[0] = Idxs[1] = Constant::getNullValue(Type::Int32Ty);
            // Re-point CastOp at element zero and retry with the element type.
            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
            SrcTy = cast<PointerType>(CastOp->getType());
            SrcPTy = SrcTy->getElementType();
          }

      if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy) || 
            isa<VectorType>(SrcPTy)) &&
          // Do not allow turning this into a load of an integer, which is then
          // casted to a pointer, this pessimizes pointer analysis a lot.
          (isa<PointerType>(SrcPTy) == isa<PointerType>(LI.getType())) &&
          IC.getTargetData().getTypeSizeInBits(SrcPTy) ==
               IC.getTargetData().getTypeSizeInBits(DestPTy)) {

        // Okay, we are casting from one integer or pointer type to another of
        // the same size.  Instead of casting the pointer before the load, cast
        // the result of the loaded value.
        Value *NewLoad = IC.InsertNewInstBefore(new LoadInst(CastOp,
                                                             CI->getName(),
                                                         LI.isVolatile()),LI);
        // Now cast the result of the load.
        return new BitCastInst(NewLoad, LI.getType());
      }
    }
  }
  return 0;
}
+
+/// isSafeToLoadUnconditionally - Return true if we know that executing a load
+/// from this value cannot trap.  If it is not obviously safe to load from the
+/// specified pointer, we do a quick local scan of the basic block containing
+/// ScanFrom, to determine if the address is already accessed.
+static bool isSafeToLoadUnconditionally(Value *V, Instruction *ScanFrom) {
+  // If it is an alloca or global variable, it is always safe to load from.
+  if (isa<AllocaInst>(V) || isa<GlobalVariable>(V)) return true;
+
+  // Otherwise, be a little bit agressive by scanning the local block where we
+  // want to check to see if the pointer is already being loaded or stored
+  // from/to.  If so, the previous load or store would have already trapped,
+  // so there is no harm doing an extra load (also, CSE will later eliminate
+  // the load entirely).
+  BasicBlock::iterator BBI = ScanFrom, E = ScanFrom->getParent()->begin();
+
+  while (BBI != E) {
+    --BBI;
+
+    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
+      if (LI->getOperand(0) == V) return true;
+    } else if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
+      if (SI->getOperand(1) == V) return true;
+
+  }
+  return false;
+}
+
/// visitLoadInst - Simplify loads: fold loads of casts, do trivial
/// store->load and load->load forwarding, fold loads of constant globals,
/// flag loads from null/undef addresses as unreachable, and sink loads
/// through select instructions.
Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
  Value *Op = LI.getOperand(0);

  // load (cast X) --> cast (load X) iff safe
  if (isa<CastInst>(Op))
    if (Instruction *Res = InstCombineLoadCast(*this, LI))
      return Res;

  // None of the following transforms are legal for volatile loads.
  if (LI.isVolatile()) return 0;
  
  if (&LI.getParent()->front() != &LI) {
    BasicBlock::iterator BBI = &LI; --BBI;
    // If the instruction immediately before this is a store to the same
    // address, do a simple form of store->load forwarding.
    if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
      if (SI->getOperand(1) == LI.getOperand(0))
        return ReplaceInstUsesWith(LI, SI->getOperand(0));
    // Likewise, an immediately preceding load of the same address makes this
    // load redundant.
    if (LoadInst *LIB = dyn_cast<LoadInst>(BBI))
      if (LIB->getOperand(0) == LI.getOperand(0))
        return ReplaceInstUsesWith(LI, LIB);
  }

  // Loading through a GEP whose base is null: the address is in (or near)
  // the null page, so this code must be unreachable.
  if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(Op))
    if (isa<ConstantPointerNull>(GEPI->getOperand(0))) {
      // Insert a new store to null instruction before the load to indicate
      // that this code is not reachable.  We do this instead of inserting
      // an unreachable instruction directly because we cannot modify the
      // CFG.
      new StoreInst(UndefValue::get(LI.getType()),
                    Constant::getNullValue(Op->getType()), &LI);
      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
    }

  if (Constant *C = dyn_cast<Constant>(Op)) {
    // load null/undef -> undef
    if ((C->isNullValue() || isa<UndefValue>(C))) {
      // Insert a new store to null instruction before the load to indicate that
      // this code is not reachable.  We do this instead of inserting an
      // unreachable instruction directly because we cannot modify the CFG.
      new StoreInst(UndefValue::get(LI.getType()),
                    Constant::getNullValue(Op->getType()), &LI);
      return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
    }

    // Instcombine load (constant global) into the value loaded.
    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Op))
      if (GV->isConstant() && !GV->isDeclaration())
        return ReplaceInstUsesWith(LI, GV->getInitializer());

    // Instcombine load (constantexpr_GEP global, 0, ...) into the value loaded.
    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op))
      if (CE->getOpcode() == Instruction::GetElementPtr) {
        if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
          if (GV->isConstant() && !GV->isDeclaration())
            if (Constant *V = 
               ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE))
              return ReplaceInstUsesWith(LI, V);
        if (CE->getOperand(0)->isNullValue()) {
          // Insert a new store to null instruction before the load to indicate
          // that this code is not reachable.  We do this instead of inserting
          // an unreachable instruction directly because we cannot modify the
          // CFG.
          new StoreInst(UndefValue::get(LI.getType()),
                        Constant::getNullValue(Op->getType()), &LI);
          return ReplaceInstUsesWith(LI, UndefValue::get(LI.getType()));
        }

      } else if (CE->isCast()) {
        // Constant-expression cast: same fold as the instruction-cast case.
        if (Instruction *Res = InstCombineLoadCast(*this, LI))
          return Res;
      }
  }

  if (Op->hasOneUse()) {
    // Change select and PHI nodes to select values instead of addresses: this
    // helps alias analysis out a lot, allows many others simplifications, and
    // exposes redundancy in the code.
    //
    // Note that we cannot do the transformation unless we know that the
    // introduced loads cannot trap!  Something like this is valid as long as
    // the condition is always false: load (select bool %C, int* null, int* %G),
    // but it would not be valid if we transformed it to load from null
    // unconditionally.
    //
    if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
      // load (select (Cond, &V1, &V2))  --> select(Cond, load &V1, load &V2).
      if (isSafeToLoadUnconditionally(SI->getOperand(1), SI) &&
          isSafeToLoadUnconditionally(SI->getOperand(2), SI)) {
        Value *V1 = InsertNewInstBefore(new LoadInst(SI->getOperand(1),
                                     SI->getOperand(1)->getName()+".val"), LI);
        Value *V2 = InsertNewInstBefore(new LoadInst(SI->getOperand(2),
                                     SI->getOperand(2)->getName()+".val"), LI);
        return new SelectInst(SI->getCondition(), V1, V2);
      }

      // load (select (cond, null, P)) -> load P
      if (Constant *C = dyn_cast<Constant>(SI->getOperand(1)))
        if (C->isNullValue()) {
          LI.setOperand(0, SI->getOperand(2));
          return &LI;
        }

      // load (select (cond, P, null)) -> load P
      if (Constant *C = dyn_cast<Constant>(SI->getOperand(2)))
        if (C->isNullValue()) {
          LI.setOperand(0, SI->getOperand(1));
          return &LI;
        }
    }
  }
  return 0;
}
+
/// InstCombineStoreToCast - Fold store V, (cast P) -> store (cast V), P
/// when possible.  Mirrors InstCombineLoadCast: only same-width integer or
/// pointer pointee types qualify, so the store width is unchanged.
static Instruction *InstCombineStoreToCast(InstCombiner &IC, StoreInst &SI) {
  // Caller guarantees the pointer operand is a cast (inst or constantexpr).
  User *CI = cast<User>(SI.getOperand(1));
  Value *CastOp = CI->getOperand(0);

  const Type *DestPTy = cast<PointerType>(CI->getType())->getElementType();
  if (const PointerType *SrcTy = dyn_cast<PointerType>(CastOp->getType())) {
    const Type *SrcPTy = SrcTy->getElementType();

    if (DestPTy->isInteger() || isa<PointerType>(DestPTy)) {
      // If the source is an array, the code below will not succeed.  Check to
      // see if a trivial 'gep P, 0, 0' will help matters.  Only do this for
      // constants.
      if (const ArrayType *ASrcTy = dyn_cast<ArrayType>(SrcPTy))
        if (Constant *CSrc = dyn_cast<Constant>(CastOp))
          if (ASrcTy->getNumElements() != 0) {
            Value* Idxs[2];
            Idxs[0] = Idxs[1] = Constant::getNullValue(Type::Int32Ty);
            // Re-point CastOp at element zero and retry with the element type.
            CastOp = ConstantExpr::getGetElementPtr(CSrc, Idxs, 2);
            SrcTy = cast<PointerType>(CastOp->getType());
            SrcPTy = SrcTy->getElementType();
          }

      if ((SrcPTy->isInteger() || isa<PointerType>(SrcPTy)) &&
          IC.getTargetData().getTypeSizeInBits(SrcPTy) ==
               IC.getTargetData().getTypeSizeInBits(DestPTy)) {

        // Okay, we are casting from one integer or pointer type to another of
        // the same size.  Instead of casting the pointer before 
        // the store, cast the value to be stored.
        Value *NewCast;
        Value *SIOp0 = SI.getOperand(0);
        // Pick the cast opcode that matches the value/memory type pair;
        // a plain bitcast handles the int->int and ptr->ptr cases.
        Instruction::CastOps opcode = Instruction::BitCast;
        const Type* CastSrcTy = SIOp0->getType();
        const Type* CastDstTy = SrcPTy;
        if (isa<PointerType>(CastDstTy)) {
          if (CastSrcTy->isInteger())
            opcode = Instruction::IntToPtr;
        } else if (isa<IntegerType>(CastDstTy)) {
          if (isa<PointerType>(SIOp0->getType()))
            opcode = Instruction::PtrToInt;
        }
        // Fold the cast at compile time if the stored value is a constant.
        if (Constant *C = dyn_cast<Constant>(SIOp0))
          NewCast = ConstantExpr::getCast(opcode, C, CastDstTy);
        else
          NewCast = IC.InsertNewInstBefore(
            CastInst::create(opcode, SIOp0, CastDstTy, SIOp0->getName()+".c"), 
            SI);
        return new StoreInst(NewCast, CastOp);
      }
    }
  }
  return 0;
}
+
/// visitStoreInst - Simplify stores: delete stores to dead allocas and
/// undef/null destinations, perform a small local dead-store-elimination
/// scan, fold casted destinations, and try to sink the store into the
/// successor block.
Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
  Value *Val = SI.getOperand(0);
  Value *Ptr = SI.getOperand(1);

  if (isa<UndefValue>(Ptr)) {     // store X, undef -> noop (even if volatile)
    EraseInstFromFunction(SI);
    ++NumCombined;
    return 0;
  }
  
  // If the RHS is an alloca with a single use, zapify the store, making the
  // alloca dead.
  if (Ptr->hasOneUse()) {
    if (isa<AllocaInst>(Ptr)) {
      EraseInstFromFunction(SI);
      ++NumCombined;
      return 0;
    }
    
    // Likewise for a store through a single-use GEP of a single-use alloca.
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr))
      if (isa<AllocaInst>(GEP->getOperand(0)) &&
          GEP->getOperand(0)->hasOneUse()) {
        EraseInstFromFunction(SI);
        ++NumCombined;
        return 0;
      }
  }

  // Do really simple DSE, to catch cases where there are several consecutive
  // stores to the same location, separated by a few arithmetic operations. This
  // situation often occurs with bitfield accesses.  Scan back at most 6
  // instructions to bound the cost.
  BasicBlock::iterator BBI = &SI;
  for (unsigned ScanInsts = 6; BBI != SI.getParent()->begin() && ScanInsts;
       --ScanInsts) {
    --BBI;
    
    if (StoreInst *PrevSI = dyn_cast<StoreInst>(BBI)) {
      // Prev store isn't volatile, and stores to the same location?
      if (!PrevSI->isVolatile() && PrevSI->getOperand(1) == SI.getOperand(1)) {
        ++NumDeadStore;
        ++BBI;  // Step forward so the erase doesn't invalidate our iterator.
        EraseInstFromFunction(*PrevSI);
        continue;
      }
      break;
    }
    
    // If this is a load, we have to stop.  However, if the loaded value is from
    // the pointer we're loading and is producing the pointer we're storing,
    // then *this* store is dead (X = load P; store X -> P).
    if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
      if (LI == Val && LI->getOperand(0) == Ptr) {
        EraseInstFromFunction(SI);
        ++NumCombined;
        return 0;
      }
      // Otherwise, this is a load from some other location.  Stores before it
      // may not be dead.
      break;
    }
    
    // Don't skip over loads or things that can modify memory.
    if (BBI->mayWriteToMemory())
      break;
  }
  
  
  if (SI.isVolatile()) return 0;  // Don't hack volatile stores.

  // store X, null    -> turns into 'unreachable' in SimplifyCFG
  if (isa<ConstantPointerNull>(Ptr)) {
    if (!isa<UndefValue>(Val)) {
      SI.setOperand(0, UndefValue::get(Val->getType()));
      if (Instruction *U = dyn_cast<Instruction>(Val))
        AddToWorkList(U);  // Dropped a use.
      ++NumCombined;
    }
    return 0;  // Do not modify these!
  }

  // store undef, Ptr -> noop
  if (isa<UndefValue>(Val)) {
    EraseInstFromFunction(SI);
    ++NumCombined;
    return 0;
  }

  // If the pointer destination is a cast, see if we can fold the cast into the
  // source instead.
  if (isa<CastInst>(Ptr))
    if (Instruction *Res = InstCombineStoreToCast(*this, SI))
      return Res;
  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
    if (CE->isCast())
      if (Instruction *Res = InstCombineStoreToCast(*this, SI))
        return Res;

  
  // If this store is the last instruction in the basic block, and if the block
  // ends with an unconditional branch, try to move it to the successor block.
  BBI = &SI; ++BBI;
  if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
    if (BI->isUnconditional())
      if (SimplifyStoreAtEndOfBlock(SI))
        return 0;  // xform done!
  
  return 0;
}
+
/// SimplifyStoreAtEndOfBlock - Turn things like:
///   if () { *P = v1; } else { *P = v2 }
/// into a phi node with a store in the successor.
///
/// Simplify things like:
///   *P = v1; if () { *P = v2; }
/// into a phi node with a store in the successor.
///
/// Returns true if both stores were merged into one in the successor block.
bool InstCombiner::SimplifyStoreAtEndOfBlock(StoreInst &SI) {
  BasicBlock *StoreBB = SI.getParent();
  
  // Check to see if the successor block has exactly two incoming edges.  If
  // so, see if the other predecessor contains a store to the same location.
  // if so, insert a PHI node (if needed) and move the stores down.
  BasicBlock *DestBB = StoreBB->getTerminator()->getSuccessor(0);
  
  // Determine whether Dest has exactly two predecessors and, if so, compute
  // the other predecessor.  Note that StoreBB may appear once or twice in
  // the predecessor list (e.g. via a conditional branch with both arms
  // targeting DestBB), so each slot is checked explicitly.
  pred_iterator PI = pred_begin(DestBB);
  BasicBlock *OtherBB = 0;
  if (*PI != StoreBB)
    OtherBB = *PI;
  ++PI;
  if (PI == pred_end(DestBB))
    return false;
  
  if (*PI != StoreBB) {
    if (OtherBB)
      return false;
    OtherBB = *PI;
  }
  if (++PI != pred_end(DestBB))
    return false;
  
  
  // Verify that the other block ends in a branch and is not otherwise empty.
  BasicBlock::iterator BBI = OtherBB->getTerminator();
  BranchInst *OtherBr = dyn_cast<BranchInst>(BBI);
  if (!OtherBr || BBI == OtherBB->begin())
    return false;
  
  // If the other block ends in an unconditional branch, check for the 'if then
  // else' case.  there is an instruction before the branch.
  StoreInst *OtherStore = 0;
  if (OtherBr->isUnconditional()) {
    // If this isn't a store, or isn't a store to the same location, bail out.
    --BBI;
    OtherStore = dyn_cast<StoreInst>(BBI);
    if (!OtherStore || OtherStore->getOperand(1) != SI.getOperand(1))
      return false;
  } else {
    // Otherwise, the other block ended with a conditional branch. If one of the
    // destinations is StoreBB, then we have the if/then case.
    if (OtherBr->getSuccessor(0) != StoreBB && 
        OtherBr->getSuccessor(1) != StoreBB)
      return false;
    
    // Okay, we know that OtherBr now goes to Dest and StoreBB, so this is an
    // if/then triangle.  See if there is a store to the same ptr as SI that
    // lives in OtherBB.
    for (;; --BBI) {
      // Check to see if we find the matching store.
      if ((OtherStore = dyn_cast<StoreInst>(BBI))) {
        if (OtherStore->getOperand(1) != SI.getOperand(1))
          return false;
        break;
      }
      // If we find something that may be using the stored value, or if we run
      // out of instructions, we can't do the xform.
      if (isa<LoadInst>(BBI) || BBI->mayWriteToMemory() ||
          BBI == OtherBB->begin())
        return false;
    }
    
    // In order to eliminate the store in OtherBr, we have to
    // make sure nothing reads the stored value in StoreBB.
    for (BasicBlock::iterator I = StoreBB->begin(); &*I != &SI; ++I) {
      // FIXME: This should really be AA driven.
      if (isa<LoadInst>(I) || I->mayWriteToMemory())
        return false;
    }
  }
  
  // Insert a PHI node now if we need it.  If both stores write the same
  // value we can reuse it directly.
  Value *MergedVal = OtherStore->getOperand(0);
  if (MergedVal != SI.getOperand(0)) {
    PHINode *PN = new PHINode(MergedVal->getType(), "storemerge");
    PN->reserveOperandSpace(2);
    PN->addIncoming(SI.getOperand(0), SI.getParent());
    PN->addIncoming(OtherStore->getOperand(0), OtherBB);
    MergedVal = InsertNewInstBefore(PN, DestBB->front());
  }
  
  // Advance to a place where it is safe to insert the new store and
  // insert it.  PHI nodes must stay at the top of the block.
  BBI = DestBB->begin();
  while (isa<PHINode>(BBI)) ++BBI;
  InsertNewInstBefore(new StoreInst(MergedVal, SI.getOperand(1),
                                    OtherStore->isVolatile()), *BBI);
  
  // Nuke the old stores.
  EraseInstFromFunction(SI);
  EraseInstFromFunction(*OtherStore);
  ++NumCombined;
  return true;
}
+
+
+Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+  // Change br (not X), label True, label False to: br X, label False, True
+  Value *X = 0;
+  BasicBlock *TrueDest;
+  BasicBlock *FalseDest;
+  if (match(&BI, m_Br(m_Not(m_Value(X)), TrueDest, FalseDest)) &&
+      !isa<Constant>(X)) {
+    // Swap Destinations and condition...
+    BI.setCondition(X);
+    BI.setSuccessor(0, FalseDest);
+    BI.setSuccessor(1, TrueDest);
+    return &BI;
+  }
+
+  // Cannonicalize fcmp_one -> fcmp_oeq
+  FCmpInst::Predicate FPred; Value *Y;
+  if (match(&BI, m_Br(m_FCmp(FPred, m_Value(X), m_Value(Y)), 
+                             TrueDest, FalseDest)))
+    if ((FPred == FCmpInst::FCMP_ONE || FPred == FCmpInst::FCMP_OLE ||
+         FPred == FCmpInst::FCMP_OGE) && BI.getCondition()->hasOneUse()) {
+      FCmpInst *I = cast<FCmpInst>(BI.getCondition());
+      FCmpInst::Predicate NewPred = FCmpInst::getInversePredicate(FPred);
+      Instruction *NewSCC = new FCmpInst(NewPred, X, Y, "", I);
+      NewSCC->takeName(I);
+      // Swap Destinations and condition...
+      BI.setCondition(NewSCC);
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      RemoveFromWorkList(I);
+      I->eraseFromParent();
+      AddToWorkList(NewSCC);
+      return &BI;
+    }
+
+  // Cannonicalize icmp_ne -> icmp_eq
+  ICmpInst::Predicate IPred;
+  if (match(&BI, m_Br(m_ICmp(IPred, m_Value(X), m_Value(Y)),
+                      TrueDest, FalseDest)))
+    if ((IPred == ICmpInst::ICMP_NE  || IPred == ICmpInst::ICMP_ULE ||
+         IPred == ICmpInst::ICMP_SLE || IPred == ICmpInst::ICMP_UGE ||
+         IPred == ICmpInst::ICMP_SGE) && BI.getCondition()->hasOneUse()) {
+      ICmpInst *I = cast<ICmpInst>(BI.getCondition());
+      ICmpInst::Predicate NewPred = ICmpInst::getInversePredicate(IPred);
+      Instruction *NewSCC = new ICmpInst(NewPred, X, Y, "", I);
+      NewSCC->takeName(I);
+      // Swap Destinations and condition...
+      BI.setCondition(NewSCC);
+      BI.setSuccessor(0, FalseDest);
+      BI.setSuccessor(1, TrueDest);
+      RemoveFromWorkList(I);
+      I->eraseFromParent();;
+      AddToWorkList(NewSCC);
+      return &BI;
+    }
+
+  return 0;
+}
+
+Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
+  Value *Cond = SI.getCondition();
+  if (Instruction *I = dyn_cast<Instruction>(Cond)) {
+    if (I->getOpcode() == Instruction::Add)
+      if (ConstantInt *AddRHS = dyn_cast<ConstantInt>(I->getOperand(1))) {
+        // change 'switch (X+4) case 1:' into 'switch (X) case -3'
+        for (unsigned i = 2, e = SI.getNumOperands(); i != e; i += 2)
+          SI.setOperand(i,ConstantExpr::getSub(cast<Constant>(SI.getOperand(i)),
+                                                AddRHS));
+        SI.setOperand(0, I->getOperand(0));
+        AddToWorkList(I);
+        return &SI;
+      }
+  }
+  return 0;
+}
+
+/// CheapToScalarize - Return true if the value is cheaper to scalarize than it
+/// is to leave as a vector operation.
+static bool CheapToScalarize(Value *V, bool isConstant) {
+  if (isa<ConstantAggregateZero>(V)) 
+    return true;
+  if (ConstantVector *C = dyn_cast<ConstantVector>(V)) {
+    if (isConstant) return true;
+    // If all elts are the same, we can extract.
+    Constant *Op0 = C->getOperand(0);
+    for (unsigned i = 1; i < C->getNumOperands(); ++i)
+      if (C->getOperand(i) != Op0)
+        return false;
+    return true;
+  }
+  Instruction *I = dyn_cast<Instruction>(V);
+  if (!I) return false;
+  
+  // Insert element gets simplified to the inserted element or is deleted if
+  // this is constant idx extract element and its a constant idx insertelt.
+  if (I->getOpcode() == Instruction::InsertElement && isConstant &&
+      isa<ConstantInt>(I->getOperand(2)))
+    return true;
+  if (I->getOpcode() == Instruction::Load && I->hasOneUse())
+    return true;
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I))
+    if (BO->hasOneUse() &&
+        (CheapToScalarize(BO->getOperand(0), isConstant) ||
+         CheapToScalarize(BO->getOperand(1), isConstant)))
+      return true;
+  if (CmpInst *CI = dyn_cast<CmpInst>(I))
+    if (CI->hasOneUse() &&
+        (CheapToScalarize(CI->getOperand(0), isConstant) ||
+         CheapToScalarize(CI->getOperand(1), isConstant)))
+      return true;
+  
+  return false;
+}
+
/// Read and decode a shufflevector mask.
///
/// It turns undef elements into values that are larger than the number of
/// elements in the input (specifically 2*NElts, which indexes past both
/// input vectors).
static std::vector<unsigned> getShuffleMask(const ShuffleVectorInst *SVI) {
  unsigned NElts = SVI->getType()->getNumElements();
  // An all-zero mask replicates element 0 of the first operand.
  if (isa<ConstantAggregateZero>(SVI->getOperand(2)))
    return std::vector<unsigned>(NElts, 0);
  // A fully-undef mask: every lane gets the out-of-range marker.
  if (isa<UndefValue>(SVI->getOperand(2)))
    return std::vector<unsigned>(NElts, 2*NElts);

  std::vector<unsigned> Result;
  const ConstantVector *CP = cast<ConstantVector>(SVI->getOperand(2));
  for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i)
    if (isa<UndefValue>(CP->getOperand(i)))
      Result.push_back(NElts*2);  // undef -> out-of-range marker (2*NElts)
    else
      Result.push_back(cast<ConstantInt>(CP->getOperand(i))->getZExtValue());
  return Result;
}
+
/// FindScalarElement - Given a vector and an element number, see if the scalar
/// value is already around as a register, for example if it were inserted then
/// extracted from the vector.  Returns the scalar value, or null if it cannot
/// be determined.
static Value *FindScalarElement(Value *V, unsigned EltNo) {
  assert(isa<VectorType>(V->getType()) && "Not looking at a vector?");
  const VectorType *PTy = cast<VectorType>(V->getType());
  unsigned Width = PTy->getNumElements();
  if (EltNo >= Width)  // Out of range access.
    return UndefValue::get(PTy->getElementType());
  
  if (isa<UndefValue>(V))
    return UndefValue::get(PTy->getElementType());
  else if (isa<ConstantAggregateZero>(V))
    return Constant::getNullValue(PTy->getElementType());
  else if (ConstantVector *CP = dyn_cast<ConstantVector>(V))
    return CP->getOperand(EltNo);
  else if (InsertElementInst *III = dyn_cast<InsertElementInst>(V)) {
    // If this is an insert to a variable element, we don't know what it is.
    if (!isa<ConstantInt>(III->getOperand(2))) 
      return 0;
    unsigned IIElt = cast<ConstantInt>(III->getOperand(2))->getZExtValue();
    
    // If this is an insert to the element we are looking for, return the
    // inserted value.
    if (EltNo == IIElt) 
      return III->getOperand(1);
    
    // Otherwise, the insertelement doesn't modify the value, recurse on its
    // vector input.
    return FindScalarElement(III->getOperand(0), EltNo);
  } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
    // A shuffle selects from one of its two inputs; follow the mask entry
    // for this lane into the appropriate input (>= 2*Width means undef).
    unsigned InEl = getShuffleMask(SVI)[EltNo];
    if (InEl < Width)
      return FindScalarElement(SVI->getOperand(0), InEl);
    else if (InEl < Width*2)
      return FindScalarElement(SVI->getOperand(1), InEl - Width);
    else
      return UndefValue::get(PTy->getElementType());
  }
  
  // Otherwise, we don't know.
  return 0;
}
+
+Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
+
+  // If vector val is undef, replace extract with scalar undef.
+  if (isa<UndefValue>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+
+  // If vector val is constant 0, replace extract with scalar 0.
+  if (isa<ConstantAggregateZero>(EI.getOperand(0)))
+    return ReplaceInstUsesWith(EI, Constant::getNullValue(EI.getType()));
+  
+  if (ConstantVector *C = dyn_cast<ConstantVector>(EI.getOperand(0))) {
+    // If vector val is constant with uniform operands, replace EI
+    // with that operand
+    Constant *op0 = C->getOperand(0);
+    for (unsigned i = 1; i < C->getNumOperands(); ++i)
+      if (C->getOperand(i) != op0) {
+        op0 = 0; 
+        break;
+      }
+    if (op0)
+      return ReplaceInstUsesWith(EI, op0);
+  }
+  
+  // If extracting a specified index from the vector, see if we can recursively
+  // find a previously computed scalar that was inserted into the vector.
+  if (ConstantInt *IdxC = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+    unsigned IndexVal = IdxC->getZExtValue();
+    unsigned VectorWidth = 
+      cast<VectorType>(EI.getOperand(0)->getType())->getNumElements();
+      
+    // If this is extracting an invalid index, turn this into undef, to avoid
+    // crashing the code below.
+    if (IndexVal >= VectorWidth)
+      return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+    
+    // This instruction only demands the single element from the input vector.
+    // If the input vector has a single use, simplify it based on this use
+    // property.
+    if (EI.getOperand(0)->hasOneUse() && VectorWidth != 1) {
+      uint64_t UndefElts;
+      if (Value *V = SimplifyDemandedVectorElts(EI.getOperand(0),
+                                                1 << IndexVal,
+                                                UndefElts)) {
+        EI.setOperand(0, V);
+        return &EI;
+      }
+    }
+    
+    if (Value *Elt = FindScalarElement(EI.getOperand(0), IndexVal))
+      return ReplaceInstUsesWith(EI, Elt);
+    
+    // If the this extractelement is directly using a bitcast from a vector of
+    // the same number of elements, see if we can find the source element from
+    // it.  In this case, we will end up needing to bitcast the scalars.
+    if (BitCastInst *BCI = dyn_cast<BitCastInst>(EI.getOperand(0))) {
+      if (const VectorType *VT = 
+              dyn_cast<VectorType>(BCI->getOperand(0)->getType()))
+        if (VT->getNumElements() == VectorWidth)
+          if (Value *Elt = FindScalarElement(BCI->getOperand(0), IndexVal))
+            return new BitCastInst(Elt, EI.getType());
+    }
+  }
+  
+  if (Instruction *I = dyn_cast<Instruction>(EI.getOperand(0))) {
+    if (I->hasOneUse()) {
+      // Push extractelement into predecessor operation if legal and
+      // profitable to do so
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+        bool isConstantElt = isa<ConstantInt>(EI.getOperand(1));
+        if (CheapToScalarize(BO, isConstantElt)) {
+          ExtractElementInst *newEI0 = 
+            new ExtractElementInst(BO->getOperand(0), EI.getOperand(1),
+                                   EI.getName()+".lhs");
+          ExtractElementInst *newEI1 =
+            new ExtractElementInst(BO->getOperand(1), EI.getOperand(1),
+                                   EI.getName()+".rhs");
+          InsertNewInstBefore(newEI0, EI);
+          InsertNewInstBefore(newEI1, EI);
+          return BinaryOperator::create(BO->getOpcode(), newEI0, newEI1);
+        }
+      } else if (isa<LoadInst>(I)) {
+        Value *Ptr = InsertCastBefore(Instruction::BitCast, I->getOperand(0),
+                                      PointerType::get(EI.getType()), EI);
+        GetElementPtrInst *GEP = 
+          new GetElementPtrInst(Ptr, EI.getOperand(1), I->getName() + ".gep");
+        InsertNewInstBefore(GEP, EI);
+        return new LoadInst(GEP);
+      }
+    }
+    if (InsertElementInst *IE = dyn_cast<InsertElementInst>(I)) {
+      // Extracting the inserted element?
+      if (IE->getOperand(2) == EI.getOperand(1))
+        return ReplaceInstUsesWith(EI, IE->getOperand(1));
+      // If the inserted and extracted elements are constants, they must not
+      // be the same value, extract from the pre-inserted value instead.
+      if (isa<Constant>(IE->getOperand(2)) &&
+          isa<Constant>(EI.getOperand(1))) {
+        AddUsesToWorkList(EI);
+        EI.setOperand(0, IE->getOperand(0));
+        return &EI;
+      }
+    } else if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+      // If this is extracting an element from a shufflevector, figure out where
+      // it came from and extract from the appropriate input element instead.
+      if (ConstantInt *Elt = dyn_cast<ConstantInt>(EI.getOperand(1))) {
+        unsigned SrcIdx = getShuffleMask(SVI)[Elt->getZExtValue()];
+        Value *Src;
+        if (SrcIdx < SVI->getType()->getNumElements())
+          Src = SVI->getOperand(0);
+        else if (SrcIdx < SVI->getType()->getNumElements()*2) {
+          SrcIdx -= SVI->getType()->getNumElements();
+          Src = SVI->getOperand(1);
+        } else {
+          return ReplaceInstUsesWith(EI, UndefValue::get(EI.getType()));
+        }
+        return new ExtractElementInst(Src, SrcIdx);
+      }
+    }
+  }
+  return 0;
+}
+
+/// CollectSingleShuffleElements - If V is a shuffle of values that ONLY returns
+/// elements from either LHS or RHS, return the shuffle mask and true.
+/// Otherwise, return false.  On success, Mask has one i32 entry (or undef)
+/// per result element, indexing into the concatenation <LHS, RHS>.
+static bool CollectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
+                                         std::vector<Constant*> &Mask) {
+  assert(V->getType() == LHS->getType() && V->getType() == RHS->getType() &&
+         "Invalid CollectSingleShuffleElements");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    // An undef vector is representable by an all-undef mask.
+    Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+    return true;
+  } else if (V == LHS) {
+    // V is exactly the LHS: identity mask <0, 1, ..., NumElts-1>.
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+    return true;
+  } else if (V == RHS) {
+    // V is exactly the RHS: indices are offset past the LHS elements.
+    for (unsigned i = 0; i != NumElts; ++i)
+      Mask.push_back(ConstantInt::get(Type::Int32Ty, i+NumElts));
+    return true;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (!isa<ConstantInt>(IdxOp))
+      return false;
+    unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+    
+    if (isa<UndefValue>(ScalarOp)) {  // inserting undef into vector.
+      // Okay, we can handle this if the vector we are inserting into is
+      // transitively ok.
+      if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+        // If so, update the mask to reflect the inserted undef.  Mask the
+        // index the same way the branches below do: an out-of-range constant
+        // insert index is valid IR, and indexing Mask with it unmasked would
+        // write past the end of the vector.
+        Mask[InsertedIdx & (NumElts-1)] = UndefValue::get(Type::Int32Ty);
+        return true;
+      }      
+    } else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
+      if (isa<ConstantInt>(EI->getOperand(1)) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        
+        // This must be extracting from either LHS or RHS.
+        if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
+          // Okay, we can handle this if the vector we are inserting into is
+          // transitively ok.
+          if (CollectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
+            // If so, update the mask to reflect the inserted value.
+            if (EI->getOperand(0) == LHS) {
+              Mask[InsertedIdx & (NumElts-1)] = 
+                 ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+            } else {
+              assert(EI->getOperand(0) == RHS);
+              Mask[InsertedIdx & (NumElts-1)] = 
+                ConstantInt::get(Type::Int32Ty, ExtractedIdx+NumElts);
+              
+            }
+            return true;
+          }
+        }
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+  
+  return false;
+}
+
+/// CollectShuffleElements - We are building a shuffle of V, using RHS as the
+/// RHS of the shuffle instruction, if it is not null.  Return a shuffle mask
+/// that computes V and the LHS value of the shuffle.
+///
+/// On return, Mask holds one i32 constant (or undef) per result element,
+/// indexing into the concatenation of the returned LHS value and RHS.  RHS
+/// is updated in place when a second source vector is discovered while
+/// recursing through a chain of insertelement instructions.
+static Value *CollectShuffleElements(Value *V, std::vector<Constant*> &Mask,
+                                     Value *&RHS) {
+  assert(isa<VectorType>(V->getType()) && 
+         (RHS == 0 || V->getType() == RHS->getType()) &&
+         "Invalid shuffle!");
+  unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+
+  if (isa<UndefValue>(V)) {
+    // Undef vector: all-undef mask, shuffle of V itself.
+    Mask.assign(NumElts, UndefValue::get(Type::Int32Ty));
+    return V;
+  } else if (isa<ConstantAggregateZero>(V)) {
+    // Zero vector: every result element reads element 0 of V (also zero).
+    Mask.assign(NumElts, ConstantInt::get(Type::Int32Ty, 0));
+    return V;
+  } else if (InsertElementInst *IEI = dyn_cast<InsertElementInst>(V)) {
+    // If this is an insert of an extract from some other vector, include it.
+    Value *VecOp    = IEI->getOperand(0);
+    Value *ScalarOp = IEI->getOperand(1);
+    Value *IdxOp    = IEI->getOperand(2);
+    
+    if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+      if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+          EI->getOperand(0)->getType() == V->getType()) {
+        unsigned ExtractedIdx =
+          cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+        unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+        
+        // Either the extracted from or inserted into vector must be RHSVec,
+        // otherwise we'd end up with a shuffle of three inputs.
+        if (EI->getOperand(0) == RHS || RHS == 0) {
+          // Adopt the extract's source as the shuffle RHS and recurse on the
+          // vector being inserted into; this element reads from the RHS.
+          // NOTE(review): masking with NumElts-1 appears to assume a
+          // power-of-two element count for out-of-range insert indices --
+          // confirm non-power-of-two widths cannot reach here.
+          RHS = EI->getOperand(0);
+          Value *V = CollectShuffleElements(VecOp, Mask, RHS);
+          Mask[InsertedIdx & (NumElts-1)] = 
+            ConstantInt::get(Type::Int32Ty, NumElts+ExtractedIdx);
+          return V;
+        }
+        
+        if (VecOp == RHS) {
+          // Inserting into the RHS: recurse on the extract's source (it
+          // becomes the LHS), then make every other element read the RHS.
+          Value *V = CollectShuffleElements(EI->getOperand(0), Mask, RHS);
+          // Everything but the extracted element is replaced with the RHS.
+          for (unsigned i = 0; i != NumElts; ++i) {
+            if (i != InsertedIdx)
+              Mask[i] = ConstantInt::get(Type::Int32Ty, NumElts+i);
+          }
+          return V;
+        }
+        
+        // If this insertelement is a chain that comes from exactly these two
+        // vectors, return the vector and the effective shuffle.
+        if (CollectSingleShuffleElements(IEI, EI->getOperand(0), RHS, Mask))
+          return EI->getOperand(0);
+        
+      }
+    }
+  }
+  // TODO: Handle shufflevector here!
+  
+  // Otherwise, can't do anything fancy.  Return an identity vector.
+  for (unsigned i = 0; i != NumElts; ++i)
+    Mask.push_back(ConstantInt::get(Type::Int32Ty, i));
+  return V;
+}
+
+/// visitInsertElementInst - Simplify an insertelement: drop no-op inserts
+/// and turn insert-of-extract chains with constant indices into a single
+/// shufflevector.  Returns the replacement instruction, the instruction
+/// itself if modified in place, or 0 if nothing changed.
+Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
+  Value *VecOp    = IE.getOperand(0);
+  Value *ScalarOp = IE.getOperand(1);
+  Value *IdxOp    = IE.getOperand(2);
+  
+  // Inserting an undef or into an undefined place, remove this.  Return the
+  // result of ReplaceInstUsesWith so the caller actually replaces and erases
+  // the instruction; previously the return value was dropped, leaving the
+  // dead insert in place and the change unreported.
+  if (isa<UndefValue>(ScalarOp) || isa<UndefValue>(IdxOp))
+    return ReplaceInstUsesWith(IE, VecOp);
+  
+  // If the inserted element was extracted from some other vector, and if the 
+  // indexes are constant, try to turn this into a shufflevector operation.
+  if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)) {
+    if (isa<ConstantInt>(EI->getOperand(1)) && isa<ConstantInt>(IdxOp) &&
+        EI->getOperand(0)->getType() == IE.getType()) {
+      unsigned NumVectorElts = IE.getType()->getNumElements();
+      unsigned ExtractedIdx =
+        cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
+      unsigned InsertedIdx = cast<ConstantInt>(IdxOp)->getZExtValue();
+      
+      if (ExtractedIdx >= NumVectorElts) // Out of range extract.
+        return ReplaceInstUsesWith(IE, VecOp);
+      
+      if (InsertedIdx >= NumVectorElts)  // Out of range insert.
+        return ReplaceInstUsesWith(IE, UndefValue::get(IE.getType()));
+      
+      // If we are extracting a value from a vector, then inserting it right
+      // back into the same place, just use the input vector.
+      if (EI->getOperand(0) == VecOp && ExtractedIdx == InsertedIdx)
+        return ReplaceInstUsesWith(IE, VecOp);      
+      
+      // We could theoretically do this for ANY input.  However, doing so could
+      // turn chains of insertelement instructions into a chain of shufflevector
+      // instructions, and right now we do not merge shufflevectors.  As such,
+      // only do this in a situation where it is clear that there is benefit.
+      if (isa<UndefValue>(VecOp) || isa<ConstantAggregateZero>(VecOp)) {
+        // Turn this into shuffle(EIOp0, VecOp, Mask).  The result has all of
+        // the values of VecOp, except the one read from EIOp0.
+        // Build a new shuffle mask.
+        std::vector<Constant*> Mask;
+        if (isa<UndefValue>(VecOp))
+          Mask.assign(NumVectorElts, UndefValue::get(Type::Int32Ty));
+        else {
+          assert(isa<ConstantAggregateZero>(VecOp) && "Unknown thing");
+          Mask.assign(NumVectorElts, ConstantInt::get(Type::Int32Ty,
+                                                       NumVectorElts));
+        } 
+        Mask[InsertedIdx] = ConstantInt::get(Type::Int32Ty, ExtractedIdx);
+        return new ShuffleVectorInst(EI->getOperand(0), VecOp,
+                                     ConstantVector::get(Mask));
+      }
+      
+      // If this insertelement isn't used by some other insertelement, turn it
+      // (and any insertelements it points to), into one big shuffle.
+      if (!IE.hasOneUse() || !isa<InsertElementInst>(IE.use_back())) {
+        std::vector<Constant*> Mask;
+        Value *RHS = 0;
+        Value *LHS = CollectShuffleElements(&IE, Mask, RHS);
+        if (RHS == 0) RHS = UndefValue::get(LHS->getType());
+        // We now have a shuffle of LHS, RHS, Mask.
+        return new ShuffleVectorInst(LHS, RHS, ConstantVector::get(Mask));
+      }
+    }
+  }
+
+  return 0;
+}
+
+
+/// visitShuffleVectorInst - Simplify a shufflevector: fold undef masks and
+/// undef operands, canonicalize so references go through the LHS where
+/// possible, eliminate identity shuffles, and conservatively merge a shuffle
+/// of a shuffle when the merged mask already appears in the input program.
+/// Mask values >= 2*NumElts are used as a sentinel meaning "undef element".
+Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
+  Value *LHS = SVI.getOperand(0);
+  Value *RHS = SVI.getOperand(1);
+  std::vector<unsigned> Mask = getShuffleMask(&SVI);
+
+  bool MadeChange = false;
+  
+  // Undefined shuffle mask -> undefined value.
+  if (isa<UndefValue>(SVI.getOperand(2)))
+    return ReplaceInstUsesWith(SVI, UndefValue::get(SVI.getType()));
+  
+  // If we have shuffle(x, undef, mask) and any elements of mask refer to
+  // the undef, change them to undefs.
+  if (isa<UndefValue>(SVI.getOperand(1))) {
+    // Scan to see if there are any references to the RHS.  If so, replace them
+    // with undef element refs and set MadeChange to true.
+    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+      if (Mask[i] >= e && Mask[i] != 2*e) {
+        Mask[i] = 2*e;
+        MadeChange = true;
+      }
+    }
+    
+    if (MadeChange) {
+      // Remap any references to RHS to use LHS.
+      std::vector<Constant*> Elts;
+      for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+        if (Mask[i] == 2*e)
+          Elts.push_back(UndefValue::get(Type::Int32Ty));
+        else
+          Elts.push_back(ConstantInt::get(Type::Int32Ty, Mask[i]));
+      }
+      SVI.setOperand(2, ConstantVector::get(Elts));
+    }
+  }
+  
+  // Canonicalize shuffle(x    ,x,mask) -> shuffle(x, undef,mask')
+  // Canonicalize shuffle(undef,x,mask) -> shuffle(x, undef,mask').
+  if (LHS == RHS || isa<UndefValue>(LHS)) {
+    if (isa<UndefValue>(LHS) && LHS == RHS) {
+      // shuffle(undef,undef,mask) -> undef.
+      return ReplaceInstUsesWith(SVI, LHS);
+    }
+    
+    // Remap any references to RHS to use LHS.
+    // NOTE(review): the &(e-1) below assumes e is a power of two -- confirm
+    // that non-power-of-two vector widths cannot reach this path.
+    std::vector<Constant*> Elts;
+    for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+      if (Mask[i] >= 2*e)
+        Elts.push_back(UndefValue::get(Type::Int32Ty));
+      else {
+        if ((Mask[i] >= e && isa<UndefValue>(RHS)) ||
+            (Mask[i] <  e && isa<UndefValue>(LHS)))
+          Mask[i] = 2*e;     // Turn into undef.
+        else
+          Mask[i] &= (e-1);  // Force to LHS.
+        Elts.push_back(ConstantInt::get(Type::Int32Ty, Mask[i]));
+      }
+    }
+    // Move the single live input to operand 0 and drop operand 1 to undef.
+    SVI.setOperand(0, SVI.getOperand(1));
+    SVI.setOperand(1, UndefValue::get(RHS->getType()));
+    SVI.setOperand(2, ConstantVector::get(Elts));
+    LHS = SVI.getOperand(0);
+    RHS = SVI.getOperand(1);
+    MadeChange = true;
+  }
+  
+  // Analyze the shuffle, are the LHS or RHS and identity shuffles?
+  bool isLHSID = true, isRHSID = true;
+    
+  for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+    if (Mask[i] >= e*2) continue;  // Ignore undef values.
+    // Is this an identity shuffle of the LHS value?
+    isLHSID &= (Mask[i] == i);
+      
+    // Is this an identity shuffle of the RHS value?
+    isRHSID &= (Mask[i]-e == i);
+  }
+
+  // Eliminate identity shuffles.
+  if (isLHSID) return ReplaceInstUsesWith(SVI, LHS);
+  if (isRHSID) return ReplaceInstUsesWith(SVI, RHS);
+  
+  // If the LHS is a shufflevector itself, see if we can combine it with this
+  // one without producing an unusual shuffle.  Here we are really conservative:
+  // we are absolutely afraid of producing a shuffle mask not in the input
+  // program, because the code gen may not be smart enough to turn a merged
+  // shuffle into two specific shuffles: it may produce worse code.  As such,
+  // we only merge two shuffles if the result is one of the two input shuffle
+  // masks.  In this case, merging the shuffles just removes one instruction,
+  // which we know is safe.  This is good for things like turning:
+  // (splat(splat)) -> splat.
+  if (ShuffleVectorInst *LHSSVI = dyn_cast<ShuffleVectorInst>(LHS)) {
+    if (isa<UndefValue>(RHS)) {
+      std::vector<unsigned> LHSMask = getShuffleMask(LHSSVI);
+
+      // Compose the two masks: each element of the outer mask selects an
+      // element of the inner shuffle's mask.
+      std::vector<unsigned> NewMask;
+      for (unsigned i = 0, e = Mask.size(); i != e; ++i)
+        if (Mask[i] >= 2*e)
+          NewMask.push_back(2*e);
+        else
+          NewMask.push_back(LHSMask[Mask[i]]);
+      
+      // If the result mask is equal to the src shuffle or this shuffle mask, do
+      // the replacement.
+      if (NewMask == LHSMask || NewMask == Mask) {
+        std::vector<Constant*> Elts;
+        for (unsigned i = 0, e = NewMask.size(); i != e; ++i) {
+          if (NewMask[i] >= e*2) {
+            Elts.push_back(UndefValue::get(Type::Int32Ty));
+          } else {
+            Elts.push_back(ConstantInt::get(Type::Int32Ty, NewMask[i]));
+          }
+        }
+        return new ShuffleVectorInst(LHSSVI->getOperand(0),
+                                     LHSSVI->getOperand(1),
+                                     ConstantVector::get(Elts));
+      }
+    }
+  }
+
+  // Return the instruction itself if operands were modified in place.
+  return MadeChange ? &SVI : 0;
+}
+
+
+
+
+/// TryToSinkInstruction - Attempt to relocate instruction I from its current
+/// basic block to the top of DestBlock.  This is only done when it is
+/// provably safe to move I past every instruction between it and the end of
+/// its own block.  Returns true if the instruction was moved.
+static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
+  assert(I->hasOneUse() && "Invariants didn't hold!");
+
+  // PHI nodes and anything that may write memory (stores, calls, volatile
+  // loads, vaarg, ...) are never safe to move.
+  if (isa<PHINode>(I) || I->mayWriteToMemory())
+    return false;
+
+  // Never sink an alloca out of the function's entry block.
+  if (isa<AllocaInst>(I) &&
+      I->getParent() == &DestBlock->getParent()->getEntryBlock())
+    return false;
+
+  // A load is only sinkable if no instruction between it and the end of its
+  // block could modify the loaded memory.
+  if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
+    BasicBlock::iterator It = LI, End = LI->getParent()->end();
+    while (It != End) {
+      if (It->mayWriteToMemory())
+        return false;
+      ++It;
+    }
+  }
+
+  // Land just past any PHI nodes at the start of the destination block.
+  BasicBlock::iterator Pos = DestBlock->begin();
+  while (isa<PHINode>(Pos))
+    ++Pos;
+
+  I->moveBefore(Pos);
+  ++NumSunkInst;
+  return true;
+}
+
+
+/// AddReachableCodeToWorklist - Walk the function in depth-first order, adding
+/// all reachable code to the worklist.
+///
+/// This has a couple of tricks to make the code faster and more powerful.  In
+/// particular, we constant fold and DCE instructions as we go, to avoid adding
+/// them to the worklist (this significantly speeds up instcombine on code where
+/// many instructions are dead or constant).  Additionally, if we find a branch
+/// whose condition is a known constant, we only visit the reachable successors.
+///
+static void AddReachableCodeToWorklist(BasicBlock *BB, 
+                                       SmallPtrSet<BasicBlock*, 64> &Visited,
+                                       InstCombiner &IC,
+                                       const TargetData *TD) {
+  std::vector<BasicBlock*> Worklist;
+  Worklist.push_back(BB);
+
+  while (!Worklist.empty()) {
+    BB = Worklist.back();
+    Worklist.pop_back();
+    
+    // We have now visited this block!  If we've already been here, ignore it.
+    if (!Visited.insert(BB)) continue;
+    
+    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
+      Instruction *Inst = BBI++;
+      
+      // DCE instruction if trivially dead.
+      if (isInstructionTriviallyDead(Inst)) {
+        ++NumDeadInst;
+        DOUT << "IC: DCE: " << *Inst;
+        Inst->eraseFromParent();
+        continue;
+      }
+      
+      // ConstantProp instruction if trivially constant.
+      if (Constant *C = ConstantFoldInstruction(Inst, TD)) {
+        DOUT << "IC: ConstFold to: " << *C << " from: " << *Inst;
+        Inst->replaceAllUsesWith(C);
+        ++NumConstProp;
+        Inst->eraseFromParent();
+        continue;
+      }
+      
+      IC.AddToWorkList(Inst);
+    }
+
+    // Recursively visit successors.  If this is a branch or switch on a
+    // constant, only visit the reachable successor.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (BI->isConditional() && isa<ConstantInt>(BI->getCondition())) {
+        bool CondVal = cast<ConstantInt>(BI->getCondition())->getZExtValue();
+        Worklist.push_back(BI->getSuccessor(!CondVal));
+        continue;
+      }
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (ConstantInt *Cond = dyn_cast<ConstantInt>(SI->getCondition())) {
+        // Only one destination is reachable: start with the default
+        // destination and let a matching explicit case override it.  (The
+        // previous code's 'continue' inside the case scan only advanced the
+        // inner loop, so the default destination was pushed even when an
+        // explicit case matched, marking an unreachable block reachable.)
+        BasicBlock *ReachableBB = SI->getSuccessor(0);
+        for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+          if (SI->getCaseValue(i) == Cond) {
+            ReachableBB = SI->getSuccessor(i);
+            break;
+          }
+        Worklist.push_back(ReachableBB);
+        continue;
+      }
+    }
+    
+    for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
+      Worklist.push_back(TI->getSuccessor(i));
+  }
+}
+
+/// DoOneIteration - Run one pass of the instcombine worklist algorithm over
+/// F: seed the worklist with all reachable instructions (DCE'ing trivially
+/// dead/constant ones and emptying unreachable blocks along the way), then
+/// repeatedly pop an instruction and try to DCE, constant-fold, sink, or
+/// combine it, requeueing anything affected.  Returns true if F was changed.
+bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) {
+  bool Changed = false;
+  TD = &getAnalysis<TargetData>();
+  
+  DEBUG(DOUT << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on "
+             << F.getNameStr() << "\n");
+
+  {
+    // Do a depth-first traversal of the function, populate the worklist with
+    // the reachable instructions.  Ignore blocks that are not reachable.  Keep
+    // track of which blocks we visit.
+    SmallPtrSet<BasicBlock*, 64> Visited;
+    AddReachableCodeToWorklist(F.begin(), Visited, *this, TD);
+
+    // Do a quick scan over the function.  If we find any blocks that are
+    // unreachable, remove any instructions inside of them.  This prevents
+    // the instcombine code from having to deal with some bad special cases.
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      if (!Visited.count(BB)) {
+        Instruction *Term = BB->getTerminator();
+        while (Term != BB->begin()) {   // Remove instrs bottom-up
+          BasicBlock::iterator I = Term; --I;
+
+          DOUT << "IC: DCE: " << *I;
+          ++NumDeadInst;
+
+          // Instructions in unreachable blocks may still have uses; point
+          // those at undef before erasing so the IR stays well formed.
+          if (!I->use_empty())
+            I->replaceAllUsesWith(UndefValue::get(I->getType()));
+          I->eraseFromParent();
+        }
+      }
+  }
+
+  while (!Worklist.empty()) {
+    Instruction *I = RemoveOneFromWorkList();
+    if (I == 0) continue;  // skip null values.
+
+    // Check to see if we can DCE the instruction.
+    if (isInstructionTriviallyDead(I)) {
+      // Add operands to the worklist.
+      // NOTE(review): the operand-count guard looks like a compile-time
+      // heuristic to avoid requeueing large operand lists -- confirm intent.
+      if (I->getNumOperands() < 4)
+        AddUsesToWorkList(*I);
+      ++NumDeadInst;
+
+      DOUT << "IC: DCE: " << *I;
+
+      I->eraseFromParent();
+      RemoveFromWorkList(I);
+      continue;
+    }
+
+    // Instruction isn't dead, see if we can constant propagate it.
+    if (Constant *C = ConstantFoldInstruction(I, TD)) {
+      DOUT << "IC: ConstFold to: " << *C << " from: " << *I;
+
+      // Add operands to the worklist.
+      AddUsesToWorkList(*I);
+      ReplaceInstUsesWith(*I, C);
+
+      ++NumConstProp;
+      I->eraseFromParent();
+      RemoveFromWorkList(I);
+      continue;
+    }
+
+    // See if we can trivially sink this instruction to a successor basic block.
+    if (I->hasOneUse()) {
+      BasicBlock *BB = I->getParent();
+      BasicBlock *UserParent = cast<Instruction>(I->use_back())->getParent();
+      if (UserParent != BB) {
+        bool UserIsSuccessor = false;
+        // See if the user is one of our successors.
+        for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+          if (*SI == UserParent) {
+            UserIsSuccessor = true;
+            break;
+          }
+
+        // If the user is one of our immediate successors, and if that successor
+        // only has us as a predecessors (we'd have to split the critical edge
+        // otherwise), we can keep going.
+        if (UserIsSuccessor && !isa<PHINode>(I->use_back()) &&
+            next(pred_begin(UserParent)) == pred_end(UserParent))
+          // Okay, the CFG is simple enough, try to sink this instruction.
+          Changed |= TryToSinkInstruction(I, UserParent);
+      }
+    }
+
+    // Now that we have an instruction, try combining it to simplify it...
+    // Capture a textual snapshot first so debug output can show the "before"
+    // form if the visitor modifies I in place.
+#ifndef NDEBUG
+    std::string OrigI;
+#endif
+    DEBUG(std::ostringstream SS; I->print(SS); OrigI = SS.str(););
+    if (Instruction *Result = visit(*I)) {
+      ++NumCombined;
+      // Should we replace the old instruction with a new one?
+      if (Result != I) {
+        DOUT << "IC: Old = " << *I
+             << "    New = " << *Result;
+
+        // Everything uses the new instruction now.
+        I->replaceAllUsesWith(Result);
+
+        // Push the new instruction and any users onto the worklist.
+        AddToWorkList(Result);
+        AddUsersToWorkList(*Result);
+
+        // Move the name to the new instruction first.
+        Result->takeName(I);
+
+        // Insert the new instruction into the basic block...
+        BasicBlock *InstParent = I->getParent();
+        BasicBlock::iterator InsertPos = I;
+
+        if (!isa<PHINode>(Result))        // If combining a PHI, don't insert
+          while (isa<PHINode>(InsertPos)) // middle of a block of PHIs.
+            ++InsertPos;
+
+        InstParent->getInstList().insert(InsertPos, Result);
+
+        // Make sure that we reprocess all operands now that we reduced their
+        // use counts.
+        AddUsesToWorkList(*I);
+
+        // Instructions can end up on the worklist more than once.  Make sure
+        // we do not process an instruction that has been deleted.
+        RemoveFromWorkList(I);
+
+        // Erase the old instruction.
+        InstParent->getInstList().erase(I);
+      } else {
+#ifndef NDEBUG
+        DOUT << "IC: Mod = " << OrigI
+             << "    New = " << *I;
+#endif
+
+        // If the instruction was modified, it's possible that it is now dead.
+        // if so, remove it.
+        if (isInstructionTriviallyDead(I)) {
+          // Make sure we process all operands now that we are reducing their
+          // use counts.
+          AddUsesToWorkList(*I);
+
+          // Instructions may end up in the worklist more than once.  Erase all
+          // occurrences of this instruction.
+          RemoveFromWorkList(I);
+          I->eraseFromParent();
+        } else {
+          AddToWorkList(I);
+          AddUsersToWorkList(*I);
+        }
+      }
+      Changed = true;
+    }
+  }
+
+  assert(WorklistMap.empty() && "Worklist empty, but map not?");
+  return Changed;
+}
+
+
+/// runOnFunction - Top-level pass driver: repeat combining iterations until
+/// a full iteration makes no changes; report whether anything changed.
+bool InstCombiner::runOnFunction(Function &F) {
+  // Record whether our transformations must maintain LCSSA form.
+  MustPreserveLCSSA = mustPreserveAnalysisID(LCSSAID);
+
+  bool EverMadeChange = false;
+  for (unsigned Iteration = 0; DoOneIteration(F, Iteration); ++Iteration)
+    EverMadeChange = true;
+  return EverMadeChange;
+}
+
+/// createInstructionCombiningPass - Public factory entry point: hands the
+/// pass manager a freshly allocated InstCombiner pass object.
+FunctionPass *llvm::createInstructionCombiningPass() {
+  return new InstCombiner();
+}
+
diff --git a/lib/Transforms/Scalar/LICM.cpp b/lib/Transforms/Scalar/LICM.cpp
new file mode 100644
index 0000000..77ac563
--- /dev/null
+++ b/lib/Transforms/Scalar/LICM.cpp
@@ -0,0 +1,797 @@
+//===-- LICM.cpp - Loop Invariant Code Motion Pass ------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs loop invariant code motion, attempting to remove as much
+// code from the body of a loop as possible.  It does this by either hoisting
+// code into the preheader block, or by sinking code to the exit blocks if it is
+// safe.  This pass also promotes must-aliased memory locations in the loop to
+// live in registers, thus hoisting and sinking "invariant" loads and stores.
+//
+// This pass uses alias analysis for two purposes:
+//
+//  1. Moving loop invariant loads and calls out of loops.  If we can determine
+//     that a load or call inside of a loop never aliases anything stored to,
+//     we can hoist it or sink it like any other instruction.
+//  2. Scalar Promotion of Memory - If there is a store instruction inside of
+//     the loop, we try to move the store to happen AFTER the loop instead of
+//     inside of the loop.  This can only happen if a few conditions are true:
+//       A. The pointer stored through is loop invariant
+//       B. There are no stores or loads in the loop which _may_ alias the
+//          pointer.  There are no calls in the loop which mod/ref the pointer.
+//     If these conditions are true, we can promote the loads and stores in the
+//     loop of the pointer to use a temporary alloca'd variable.  We then use
+//     the mem2reg functionality to construct the appropriate SSA form for the
+//     variable.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "licm"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AliasSetTracker.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumSunk      , "Number of instructions sunk out of loop");
+STATISTIC(NumHoisted   , "Number of instructions hoisted out of loop");
+STATISTIC(NumMovedLoads, "Number of load insts hoisted or sunk");
+STATISTIC(NumMovedCalls, "Number of call insts hoisted or sunk");
+STATISTIC(NumPromoted  , "Number of memory locations promoted to registers");
+
+namespace {
+  // Command-line escape hatch: disables the scalar-promotion half of LICM
+  // (instruction hoisting/sinking is unaffected).  Useful for debugging.
+  cl::opt<bool>
+  DisablePromotion("disable-licm-promotion", cl::Hidden,
+                   cl::desc("Disable memory promotion in LICM pass"));
+
+  struct VISIBILITY_HIDDEN LICM : public LoopPass {
+    static char ID; // Pass identification, replacement for typeid
+    LICM() : LoopPass((intptr_t)&ID) {}
+
+    virtual bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();  // For scalar promotion (mem2reg)
+      AU.addRequired<AliasAnalysis>();
+    }
+
+    // Drop the cached per-loop alias set trackers once the pass manager is
+    // entirely finished with this pass.
+    bool doFinalization() {
+      LoopToAliasMap.clear();
+      return false;
+    }
+
+  private:
+    // Various analyses that we use...
+    AliasAnalysis *AA;       // Current AliasAnalysis information
+    LoopInfo      *LI;       // Current LoopInfo
+    DominatorTree *DT;       // Dominator Tree for the current Loop...
+    DominanceFrontier *DF;   // Current Dominance Frontier
+
+    // State that is updated as we process loops
+    bool Changed;            // Set to true when we change anything.
+    BasicBlock *Preheader;   // The preheader block of the current loop...
+    Loop *CurLoop;           // The current loop we are working on...
+    AliasSetTracker *CurAST; // AliasSet information for the current loop...
+    // Maps each already-processed loop to its AliasSetTracker so that an
+    // enclosing loop can seed its own tracker from the subloops' results
+    // instead of rescanning their blocks (see runOnLoop).
+    std::map<Loop *, AliasSetTracker *> LoopToAliasMap;
+
+    /// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+    /// dominated by the specified block, and that are in the current loop) in
+    /// reverse depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit uses before definitions, allowing us to sink a loop body in one
+    /// pass without iteration.
+    ///
+    void SinkRegion(DomTreeNode *N);
+
+    /// HoistRegion - Walk the specified region of the CFG (defined by all
+    /// blocks dominated by the specified block, and that are in the current
+    /// loop) in depth first order w.r.t the DominatorTree.  This allows us to
+    /// visit definitions before uses, allowing us to hoist a loop body in one
+    /// pass without iteration.
+    ///
+    void HoistRegion(DomTreeNode *N);
+
+    /// inSubLoop - Little predicate that returns true if the specified basic
+    /// block is in a subloop of the current one, not the current one itself.
+    ///
+    bool inSubLoop(BasicBlock *BB) {
+      assert(CurLoop->contains(BB) && "Only valid if BB is IN the loop");
+      for (Loop::iterator I = CurLoop->begin(), E = CurLoop->end(); I != E; ++I)
+        if ((*I)->contains(BB))
+          return true;  // A subloop actually contains this block!
+      return false;
+    }
+
+    /// isExitBlockDominatedByBlockInLoop - This method checks to see if the
+    /// specified exit block of the loop is dominated by the specified block
+    /// that is in the body of the loop.  We use these constraints to
+    /// dramatically limit the amount of the dominator tree that needs to be
+    /// searched.
+    bool isExitBlockDominatedByBlockInLoop(BasicBlock *ExitBlock,
+                                           BasicBlock *BlockInLoop) const {
+      // If the block in the loop is the loop header, it must be dominated!
+      BasicBlock *LoopHeader = CurLoop->getHeader();
+      if (BlockInLoop == LoopHeader)
+        return true;
+
+      DomTreeNode *BlockInLoopNode = DT->getNode(BlockInLoop);
+      DomTreeNode *IDom            = DT->getNode(ExitBlock);
+
+      // Because the exit block is not in the loop, we know we have to get _at
+      // least_ its immediate dominator.
+      // Walk up the dominator tree from the exit block; stop successfully if
+      // we reach BlockInLoop, and fail if we reach the loop header first.
+      do {
+        // Get next Immediate Dominator.
+        IDom = IDom->getIDom();
+
+        // If we have got to the header of the loop, then the instructions block
+        // did not dominate the exit node, so we can't hoist it.
+        if (IDom->getBlock() == LoopHeader)
+          return false;
+
+      } while (IDom != BlockInLoopNode);
+
+      return true;
+    }
+
+    /// sink - When an instruction is found to only be used outside of the loop,
+    /// this function moves it to the exit blocks and patches up SSA form as
+    /// needed.
+    ///
+    void sink(Instruction &I);
+
+    /// hoist - When an instruction is found to only use loop invariant operands
+    /// that is safe to hoist, this instruction is called to do the dirty work.
+    ///
+    void hoist(Instruction &I);
+
+    /// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it
+    /// is not a trapping instruction or if it is a trapping instruction and is
+    /// guaranteed to execute.
+    ///
+    bool isSafeToExecuteUnconditionally(Instruction &I);
+
+    /// pointerInvalidatedByLoop - Return true if the body of this loop may
+    /// store into the memory location pointed to by V.
+    ///
+    bool pointerInvalidatedByLoop(Value *V, unsigned Size) {
+      // Check to see if any of the basic blocks in CurLoop invalidate *V.
+      return CurAST->getAliasSetForPointer(V, Size).isMod();
+    }
+
+    bool canSinkOrHoistInst(Instruction &I);
+    bool isLoopInvariantInst(Instruction &I);
+    bool isNotUsedInLoop(Instruction &I);
+
+    /// PromoteValuesInLoop - Look at the stores in the loop and promote as many
+    /// to scalars as we can.
+    ///
+    void PromoteValuesInLoop();
+
+    /// FindPromotableValuesInLoop - Check the current loop for stores to
+    /// definite pointers, which are not loaded and stored through may aliases.
+    /// If these are found, create an alloca for the value, add it to the
+    /// PromotedValues list, and keep track of the mapping from value to
+    /// alloca...
+    ///
+    void FindPromotableValuesInLoop(
+                   std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+                                    std::map<Value*, AllocaInst*> &Val2AlMap);
+  };
+
+  char LICM::ID = 0;
+  // Register the pass so it is available as -licm on the opt command line.
+  RegisterPass<LICM> X("licm", "Loop Invariant Code Motion");
+}
+
+// Factory hook used by the pass manager to instantiate LICM.
+LoopPass *llvm::createLICMPass() {
+  LICM *P = new LICM();
+  return P;
+}
+
+/// Hoist expressions out of the specified loop...
+///
+bool LICM::runOnLoop(Loop *L, LPPassManager &LPM) {
+  Changed = false;
+
+  // Get our Loop and Alias Analysis information...
+  LI = &getAnalysis<LoopInfo>();
+  AA = &getAnalysis<AliasAnalysis>();
+  DF = &getAnalysis<DominanceFrontier>();
+  DT = &getAnalysis<DominatorTree>();
+
+  // Build a fresh tracker for this loop; it is handed off to LoopToAliasMap
+  // at the bottom of this function so enclosing loops can reuse it.
+  CurAST = new AliasSetTracker(*AA);
+  // Collect Alias info from subloops
+  for (Loop::iterator LoopItr = L->begin(), LoopItrE = L->end();
+       LoopItr != LoopItrE; ++LoopItr) {
+    Loop *InnerL = *LoopItr;
+    AliasSetTracker *InnerAST = LoopToAliasMap[InnerL];
+    assert (InnerAST && "Where is my AST?");
+
+    // What if InnerLoop was modified by other passes ?
+    CurAST->add(*InnerAST);
+  }
+  
+  CurLoop = L;
+
+  // Get the preheader block to move instructions into...
+  Preheader = L->getLoopPreheader();
+  assert(Preheader&&"Preheader insertion pass guarantees we have a preheader!");
+
+  // Loop over the body of this loop, looking for calls, invokes, and stores.
+  // Because subloops have already been incorporated into AST, we skip blocks in
+  // subloops.
+  //
+  for (std::vector<BasicBlock*>::const_iterator I = L->getBlocks().begin(),
+         E = L->getBlocks().end(); I != E; ++I)
+    if (LI->getLoopFor(*I) == L)        // Ignore blocks in subloops...
+      CurAST->add(**I);                 // Incorporate the specified basic block
+
+  // We want to visit all of the instructions in this loop... that are not parts
+  // of our subloops (they have already had their invariants hoisted out of
+  // their loop, into this loop, so there is no need to process the BODIES of
+  // the subloops).
+  //
+  // Traverse the body of the loop in depth first order on the dominator tree so
+  // that we are guaranteed to see definitions before we see uses.  This allows
+  // us to sink instructions in one pass, without iteration.  After sinking
+  // instructions, we perform another pass to hoist them out of the loop.
+  //
+  SinkRegion(DT->getNode(L->getHeader()));
+  HoistRegion(DT->getNode(L->getHeader()));
+
+  // Now that all loop invariants have been removed from the loop, promote any
+  // memory references to scalars that we can...
+  if (!DisablePromotion)
+    PromoteValuesInLoop();
+
+  // Clear out loops state information for the next iteration
+  CurLoop = 0;
+  Preheader = 0;
+
+  // Stash the tracker for any enclosing loop.  NOTE(review): the trackers are
+  // never deleted -- doFinalization only clears the map -- so they live (leak)
+  // for the lifetime of the pass object; confirm this is intentional.
+  LoopToAliasMap[L] = CurAST;
+  return Changed;
+}
+
+/// SinkRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in
+/// reverse depth first order w.r.t the DominatorTree.  This allows us to visit
+/// uses before definitions, allowing us to sink a loop body in one pass without
+/// iteration.
+///
+void LICM::SinkRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // We are processing blocks in reverse dfo, so process children first...
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    SinkRegion(Children[i]);
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (inSubLoop(BB)) return;
+
+  // Walk the block bottom-up so that an instruction's in-block users (which
+  // appear later in the block) are visited before the instruction itself.
+  for (BasicBlock::iterator II = BB->end(); II != BB->begin(); ) {
+    Instruction &I = *--II;
+
+    // Check to see if we can sink this instruction to the exit blocks
+    // of the loop.  We can do this if the all users of the instruction are
+    // outside of the loop.  In this case, it doesn't even matter if the
+    // operands of the instruction are loop invariant.
+    //
+    if (isNotUsedInLoop(I) && canSinkOrHoistInst(I)) {
+      // Step II off of I *before* sink() removes I from this block, so the
+      // iterator remains valid for the next step of the bottom-up walk.
+      ++II;
+      sink(I);
+    }
+  }
+}
+
+
+/// HoistRegion - Walk the specified region of the CFG (defined by all blocks
+/// dominated by the specified block, and that are in the current loop) in depth
+/// first order w.r.t the DominatorTree.  This allows us to visit definitions
+/// before uses, allowing us to hoist a loop body in one pass without iteration.
+///
+void LICM::HoistRegion(DomTreeNode *N) {
+  assert(N != 0 && "Null dominator tree node?");
+  BasicBlock *BB = N->getBlock();
+
+  // If this subregion is not in the top level loop at all, exit.
+  if (!CurLoop->contains(BB)) return;
+
+  // Only need to process the contents of this block if it is not part of a
+  // subloop (which would already have been processed).
+  if (!inSubLoop(BB))
+    // II is advanced past I before hoist() moves I out of this block, so the
+    // iterator stays valid even though the list is mutated under it.
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E; ) {
+      Instruction &I = *II++;
+
+      // Try hoisting the instruction out to the preheader.  We can only do this
+      // if all of the operands of the instruction are loop invariant and if it
+      // is safe to hoist the instruction.
+      //
+      if (isLoopInvariantInst(I) && canSinkOrHoistInst(I) &&
+          isSafeToExecuteUnconditionally(I))
+        hoist(I);
+      }
+
+  // Children are visited after this block: depth-first, definitions first.
+  const std::vector<DomTreeNode*> &Children = N->getChildren();
+  for (unsigned i = 0, e = Children.size(); i != e; ++i)
+    HoistRegion(Children[i]);
+}
+
+/// canSinkOrHoistInst - Return true if the hoister and sinker can handle this
+/// instruction.
+///
+bool LICM::canSinkOrHoistInst(Instruction &I) {
+  // Loads have extra constraints we have to verify before we can hoist them.
+  if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+    if (LI->isVolatile())
+      return false;        // Don't hoist volatile loads!
+
+    // Don't hoist loads which have may-aliased stores in loop.
+    unsigned Size = 0;
+    if (LI->getType()->isSized())
+      Size = AA->getTargetData().getTypeSize(LI->getType());
+    return !pointerInvalidatedByLoop(LI->getOperand(0), Size);
+  } else if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+    // Handle obvious cases efficiently.
+    if (Function *Callee = CI->getCalledFunction()) {
+      AliasAnalysis::ModRefBehavior Behavior =AA->getModRefBehavior(Callee, CI);
+      if (Behavior == AliasAnalysis::DoesNotAccessMemory)
+        return true;
+      else if (Behavior == AliasAnalysis::OnlyReadsMemory) {
+        // If this call only reads from memory and there are no writes to memory
+        // in the loop, we can hoist or sink the call as appropriate.
+        // Scan every (non-forwarding) alias set; any modified set means some
+        // memory is written somewhere in the loop, which blocks the motion.
+        bool FoundMod = false;
+        for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+             I != E; ++I) {
+          AliasSet &AS = *I;
+          if (!AS.isForwardingAliasSet() && AS.isMod()) {
+            FoundMod = true;
+            break;
+          }
+        }
+        if (!FoundMod) return true;
+      }
+    }
+
+    // FIXME: This should use mod/ref information to see if we can hoist or sink
+    // the call.
+
+    return false;
+  }
+
+  // Otherwise these instructions are hoistable/sinkable
+  return isa<BinaryOperator>(I) || isa<CastInst>(I) ||
+         isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+         isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
+         isa<ShuffleVectorInst>(I);
+}
+
+/// isNotUsedInLoop - Return true if the only users of this instruction are
+/// outside of the loop.  If this is true, we can sink the instruction to the
+/// exit blocks of the loop.
+///
+bool LICM::isNotUsedInLoop(Instruction &I) {
+  // Scan every user of I; a single use inside the loop disqualifies sinking.
+  for (Value::use_iterator UI = I.use_begin(), UE = I.use_end(); UI != UE;
+       ++UI) {
+    Instruction *UseInst = cast<Instruction>(*UI);
+    PHINode *PN = dyn_cast<PHINode>(UseInst);
+    if (!PN) {
+      if (CurLoop->contains(UseInst->getParent()))
+        return false;
+      continue;
+    }
+    // A PHI "uses" the value in the predecessor block the value flows in
+    // from, not in the block that contains the PHI node itself.
+    for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+      if (PN->getIncomingValue(i) == &I &&
+          CurLoop->contains(PN->getIncomingBlock(i)))
+        return false;
+  }
+  return true;
+}
+
+
+/// isLoopInvariantInst - Return true if all operands of this instruction are
+/// loop invariant.  We also filter out non-hoistable instructions here just
+/// for efficiency.
+///
+bool LICM::isLoopInvariantInst(Instruction &I) {
+  // An instruction is invariant w.r.t. CurLoop exactly when every one of its
+  // operands is loop invariant.
+  for (unsigned Op = 0, NumOps = I.getNumOperands(); Op != NumOps; ++Op)
+    if (!CurLoop->isLoopInvariant(I.getOperand(Op)))
+      return false;
+  return true;
+}
+
+/// sink - When an instruction is found to only be used outside of the loop,
+/// this function moves it to the exit blocks and patches up SSA form as needed.
+/// This method is guaranteed to remove the original instruction from its
+/// position, and may either delete it or move it to outside of the loop.
+///
+void LICM::sink(Instruction &I) {
+  DOUT << "LICM sinking instruction: " << I;
+
+  std::vector<BasicBlock*> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  if (isa<LoadInst>(I)) ++NumMovedLoads;
+  else if (isa<CallInst>(I)) ++NumMovedCalls;
+  ++NumSunk;
+  Changed = true;
+
+  // The case where there is only a single exit node of this loop is common
+  // enough that we handle it as a special (more efficient) case.  It is more
+  // efficient to handle because there are no PHI nodes that need to be placed.
+  if (ExitBlocks.size() == 1) {
+    if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[0], I.getParent())) {
+      // Instruction is not used, just delete it.
+      CurAST->deleteValue(&I);
+      if (!I.use_empty())  // If I has users in unreachable blocks, eliminate.
+        I.replaceAllUsesWith(UndefValue::get(I.getType()));
+      I.eraseFromParent();
+    } else {
+      // Move the instruction to the start of the exit block, after any PHI
+      // nodes in it.
+      I.removeFromParent();
+
+      BasicBlock::iterator InsertPt = ExitBlocks[0]->begin();
+      while (isa<PHINode>(InsertPt)) ++InsertPt;
+      ExitBlocks[0]->getInstList().insert(InsertPt, &I);
+    }
+  } else if (ExitBlocks.size() == 0) {
+    // The instruction is actually dead if there ARE NO exit blocks.
+    CurAST->deleteValue(&I);
+    if (!I.use_empty())  // If I has users in unreachable blocks, eliminate.
+      I.replaceAllUsesWith(UndefValue::get(I.getType()));
+    I.eraseFromParent();
+  } else {
+    // Otherwise, if we have multiple exits, use the PromoteMem2Reg function to
+    // do all of the hard work of inserting PHI nodes as necessary.  We convert
+    // the value into a stack object to get it to do this.
+
+    // Firstly, we create a stack object to hold the value...
+    // AI stays null when I produces no value (void type); in that case I has
+    // no uses, so the use-rewriting loop below never executes.
+    AllocaInst *AI = 0;
+
+    if (I.getType() != Type::VoidTy) {
+      AI = new AllocaInst(I.getType(), 0, I.getName(),
+                          I.getParent()->getParent()->getEntryBlock().begin());
+      CurAST->add(AI);
+    }
+
+    // Secondly, insert load instructions for each use of the instruction
+    // outside of the loop.
+    while (!I.use_empty()) {
+      Instruction *U = cast<Instruction>(I.use_back());
+
+      // If the user is a PHI Node, we actually have to insert load instructions
+      // in all predecessor blocks, not in the PHI block itself!
+      if (PHINode *UPN = dyn_cast<PHINode>(U)) {
+        // Only insert into each predecessor once, so that we don't have
+        // different incoming values from the same block!
+        std::map<BasicBlock*, Value*> InsertedBlocks;
+        for (unsigned i = 0, e = UPN->getNumIncomingValues(); i != e; ++i)
+          if (UPN->getIncomingValue(i) == &I) {
+            BasicBlock *Pred = UPN->getIncomingBlock(i);
+            Value *&PredVal = InsertedBlocks[Pred];
+            if (!PredVal) {
+              // Insert a new load instruction right before the terminator in
+              // the predecessor block.
+              PredVal = new LoadInst(AI, "", Pred->getTerminator());
+              CurAST->add(cast<LoadInst>(PredVal));
+            }
+
+            UPN->setIncomingValue(i, PredVal);
+          }
+
+      } else {
+        LoadInst *L = new LoadInst(AI, "", U);
+        U->replaceUsesOfWith(&I, L);
+        CurAST->add(L);
+      }
+    }
+
+    // Thirdly, insert a copy of the instruction in each exit block of the loop
+    // that is dominated by the instruction, storing the result into the memory
+    // location.  Be careful not to insert the instruction into any particular
+    // basic block more than once.
+    std::set<BasicBlock*> InsertedBlocks;
+    BasicBlock *InstOrigBB = I.getParent();
+
+    for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
+      BasicBlock *ExitBlock = ExitBlocks[i];
+
+      if (isExitBlockDominatedByBlockInLoop(ExitBlock, InstOrigBB)) {
+        // If we haven't already processed this exit block, do so now.
+        if (InsertedBlocks.insert(ExitBlock).second) {
+          // Insert the code after the last PHI node...
+          BasicBlock::iterator InsertPt = ExitBlock->begin();
+          while (isa<PHINode>(InsertPt)) ++InsertPt;
+
+          // If this is the first exit block processed, just move the original
+          // instruction, otherwise clone the original instruction and insert
+          // the copy.
+          Instruction *New;
+          if (InsertedBlocks.size() == 1) {
+            I.removeFromParent();
+            ExitBlock->getInstList().insert(InsertPt, &I);
+            New = &I;
+          } else {
+            New = I.clone();
+            CurAST->copyValue(&I, New);
+            if (!I.getName().empty())
+              New->setName(I.getName()+".le");
+            ExitBlock->getInstList().insert(InsertPt, New);
+          }
+
+          // Now that we have inserted the instruction, store it into the alloca
+          if (AI) new StoreInst(New, AI, InsertPt);
+        }
+      }
+    }
+
+    // If the instruction doesn't dominate any exit blocks, it must be dead.
+    if (InsertedBlocks.empty()) {
+      CurAST->deleteValue(&I);
+      I.eraseFromParent();
+    }
+
+    // Finally, promote the fine value to SSA form.
+    // PromoteMemToReg rewrites the loads inserted above into PHI nodes and
+    // removes the temporary alloca entirely.
+    if (AI) {
+      std::vector<AllocaInst*> Allocas;
+      Allocas.push_back(AI);
+      PromoteMemToReg(Allocas, *DT, *DF, CurAST);
+    }
+  }
+}
+
+/// hoist - When an instruction is found to only use loop invariant operands
+/// that is safe to hoist, this instruction is called to do the dirty work.
+///
+void LICM::hoist(Instruction &I) {
+  DOUT << "LICM hoisting to " << Preheader->getName() << ": " << I;
+
+  // Detach I from its current block and splice it in immediately before the
+  // preheader's terminator; the instruction object is reused, not cloned.
+  I.removeFromParent();
+  Preheader->getInstList().insert(Preheader->getTerminator(), &I);
+
+  // Statistics bookkeeping.
+  if (isa<LoadInst>(I))
+    ++NumMovedLoads;
+  else if (isa<CallInst>(I))
+    ++NumMovedCalls;
+  ++NumHoisted;
+  Changed = true;
+}
+
+/// isSafeToExecuteUnconditionally - Only sink or hoist an instruction if it is
+/// not a trapping instruction or if it is a trapping instruction and is
+/// guaranteed to execute.
+///
+bool LICM::isSafeToExecuteUnconditionally(Instruction &Inst) {
+  // If it is not a trapping instruction, it is always safe to hoist.
+  if (!Inst.isTrapping()) return true;
+
+  // Otherwise we have to check to make sure that the instruction dominates all
+  // of the exit blocks.  If it doesn't, then there is a path out of the loop
+  // which does not execute this instruction, so we can't hoist it.
+
+  // If the instruction is in the header block for the loop (which is very
+  // common), it is always guaranteed to dominate the exit blocks.  Since this
+  // is a common case, and can save some work, check it now.
+  if (Inst.getParent() == CurLoop->getHeader())
+    return true;
+
+  // It's always safe to load from a global or alloca.
+  // (Operand 0 of a load is its pointer operand.)
+  if (isa<LoadInst>(Inst))
+    if (isa<AllocationInst>(Inst.getOperand(0)) ||
+        isa<GlobalVariable>(Inst.getOperand(0)))
+      return true;
+
+  // Get the exit blocks for the current loop.
+  std::vector<BasicBlock*> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+
+  // For each exit block, get the DT node and walk up the DT until the
+  // instruction's basic block is found or we exit the loop.
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (!isExitBlockDominatedByBlockInLoop(ExitBlocks[i], Inst.getParent()))
+      return false;
+
+  return true;
+}
+
+
+/// PromoteValuesInLoop - Try to promote memory values to scalars by sinking
+/// stores out of the loop and moving loads to before the loop.  We do this by
+/// looping over the stores in the loop, looking for stores to Must pointers
+/// which are loop invariant.  We promote these memory locations to use allocas
+/// instead.  These allocas can easily be raised to register values by the
+/// PromoteMem2Reg functionality.
+///
+void LICM::PromoteValuesInLoop() {
+  // PromotedValues - List of values that are promoted out of the loop.  Each
+  // value has an alloca instruction for it, and a canonical version of the
+  // pointer.
+  std::vector<std::pair<AllocaInst*, Value*> > PromotedValues;
+  std::map<Value*, AllocaInst*> ValueToAllocaMap; // Map of ptr to alloca
+
+  FindPromotableValuesInLoop(PromotedValues, ValueToAllocaMap);
+  if (ValueToAllocaMap.empty()) return;   // Nothing promotable; bail out.
+
+  Changed = true;
+  NumPromoted += PromotedValues.size();
+
+  std::vector<Value*> PointerValueNumbers;
+
+  // Emit a copy from the value into the alloca'd value in the loop preheader
+  TerminatorInst *LoopPredInst = Preheader->getTerminator();
+  for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+    Value *Ptr = PromotedValues[i].second;
+
+    // If we are promoting a pointer value, update alias information for the
+    // inserted load.
+    Value *LoadValue = 0;
+    if (isa<PointerType>(cast<PointerType>(Ptr->getType())->getElementType())) {
+      // Locate a load or store through the pointer, and assign the same value
+      // to LI as we are loading or storing.  Since we know that the value is
+      // stored in this loop, this will always succeed.
+      for (Value::use_iterator UI = Ptr->use_begin(), E = Ptr->use_end();
+           UI != E; ++UI)
+        if (LoadInst *LI = dyn_cast<LoadInst>(*UI)) {
+          LoadValue = LI;
+          break;
+        } else if (StoreInst *SI = dyn_cast<StoreInst>(*UI)) {
+          if (SI->getOperand(1) == Ptr) {
+            LoadValue = SI->getOperand(0);
+            break;
+          }
+        }
+      assert(LoadValue && "No store through the pointer found!");
+      PointerValueNumbers.push_back(LoadValue);  // Remember this for later.
+    }
+
+    // Load from the memory we are promoting.
+    LoadInst *LI = new LoadInst(Ptr, Ptr->getName()+".promoted", LoopPredInst);
+
+    if (LoadValue) CurAST->copyValue(LoadValue, LI);
+
+    // Store into the temporary alloca.
+    new StoreInst(LI, PromotedValues[i].first, LoopPredInst);
+  }
+
+  // Scan the basic blocks in the loop, replacing uses of our pointers with
+  // uses of the allocas in question.
+  //
+  const std::vector<BasicBlock*> &LoopBBs = CurLoop->getBlocks();
+  for (std::vector<BasicBlock*>::const_iterator I = LoopBBs.begin(),
+         E = LoopBBs.end(); I != E; ++I) {
+    // Rewrite all loads and stores in the block of the pointer...
+    // NOTE(review): the inner 'E' and map-iterator 'I' below shadow the outer
+    // iterators of the same names; harmless but easy to misread.
+    for (BasicBlock::iterator II = (*I)->begin(), E = (*I)->end();
+         II != E; ++II) {
+      if (LoadInst *L = dyn_cast<LoadInst>(II)) {
+        std::map<Value*, AllocaInst*>::iterator
+          I = ValueToAllocaMap.find(L->getOperand(0));
+        if (I != ValueToAllocaMap.end())
+          L->setOperand(0, I->second);    // Rewrite load instruction...
+      } else if (StoreInst *S = dyn_cast<StoreInst>(II)) {
+        std::map<Value*, AllocaInst*>::iterator
+          I = ValueToAllocaMap.find(S->getOperand(1));
+        if (I != ValueToAllocaMap.end())
+          S->setOperand(1, I->second);    // Rewrite store instruction...
+      }
+    }
+  }
+
+  // Now that the body of the loop uses the allocas instead of the original
+  // memory locations, insert code to copy the alloca value back into the
+  // original memory location on all exits from the loop.  Note that we only
+  // want to insert one copy of the code in each exit block, though the loop may
+  // exit to the same block more than once.
+  //
+  std::set<BasicBlock*> ProcessedBlocks;
+
+  std::vector<BasicBlock*> ExitBlocks;
+  CurLoop->getExitBlocks(ExitBlocks);
+  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
+    if (ProcessedBlocks.insert(ExitBlocks[i]).second) {
+      // Copy all of the allocas into their memory locations.
+      BasicBlock::iterator BI = ExitBlocks[i]->begin();
+      while (isa<PHINode>(*BI))
+        ++BI;             // Skip over all of the phi nodes in the block.
+      Instruction *InsertPos = BI;
+      unsigned PVN = 0;
+      // NOTE(review): this inner 'i' shadows the exit-block index above; it is
+      // harmless because the outer 'i' is not referenced inside this loop.
+      for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i) {
+        // Load from the alloca.
+        LoadInst *LI = new LoadInst(PromotedValues[i].first, "", InsertPos);
+
+        // If this is a pointer type, update alias info appropriately.
+        if (isa<PointerType>(LI->getType()))
+          CurAST->copyValue(PointerValueNumbers[PVN++], LI);
+
+        // Store into the memory we promoted.
+        new StoreInst(LI, PromotedValues[i].second, InsertPos);
+      }
+    }
+
+  // Now that we have done the deed, use the mem2reg functionality to promote
+  // all of the new allocas we just created into real SSA registers.
+  //
+  std::vector<AllocaInst*> PromotedAllocas;
+  PromotedAllocas.reserve(PromotedValues.size());
+  for (unsigned i = 0, e = PromotedValues.size(); i != e; ++i)
+    PromotedAllocas.push_back(PromotedValues[i].first);
+  PromoteMemToReg(PromotedAllocas, *DT, *DF, CurAST);
+}
+
+/// FindPromotableValuesInLoop - Check the current loop for stores to definite
+/// pointers, which are not loaded and stored through may aliases.  If these are
+/// found, create an alloca for the value, add it to the PromotedValues list,
+/// and keep track of the mapping from value to alloca.
+///
+void LICM::FindPromotableValuesInLoop(
+                   std::vector<std::pair<AllocaInst*, Value*> > &PromotedValues,
+                             std::map<Value*, AllocaInst*> &ValueToAllocaMap) {
+  Instruction *FnStart = CurLoop->getHeader()->getParent()->begin()->begin();
+
+  // Loop over all of the alias sets in the tracker object.
+  for (AliasSetTracker::iterator I = CurAST->begin(), E = CurAST->end();
+       I != E; ++I) {
+    AliasSet &AS = *I;
+    // We can promote this alias set if it has a store, if it is a "Must" alias
+    // set, if the pointer is loop invariant, and if we are not eliminating any
+    // volatile loads or stores.
+    if (AS.isForwardingAliasSet() || !AS.isMod() || !AS.isMustAlias() ||
+        AS.isVolatile())
+      continue;
+
+    // Check emptiness BEFORE dereferencing the first element.  (Previously the
+    // 'if' condition read AS.begin()->first before this assert ran.)
+    assert(AS.begin() != AS.end() &&
+           "Must alias set should have at least one pointer element in it!");
+    Value *V = AS.begin()->first;
+
+    // Only promote locations whose pointer is loop invariant.
+    if (!CurLoop->isLoopInvariant(V))
+      continue;
+
+    // Check that all of the pointers in the alias set have the same type.  We
+    // cannot (yet) promote a memory location that is loaded and stored in
+    // different sizes.
+    bool PointerOk = true;
+    for (AliasSet::iterator ASI = AS.begin(), ASE = AS.end(); ASI != ASE; ++ASI)
+      if (V->getType() != ASI->first->getType()) {
+        PointerOk = false;
+        break;
+      }
+    if (!PointerOk)
+      continue;
+
+    // Create the stack slot that will hold the promoted value; it lives at the
+    // top of the function's entry block.
+    const Type *Ty = cast<PointerType>(V->getType())->getElementType();
+    AllocaInst *AI = new AllocaInst(Ty, 0, V->getName()+".tmp", FnStart);
+    PromotedValues.push_back(std::make_pair(AI, V));
+
+    // Update the AST and alias analysis.
+    CurAST->copyValue(V, AI);
+
+    // Every pointer in the set is rewritten to use the shared alloca.
+    for (AliasSet::iterator ASI = AS.begin(), ASE = AS.end(); ASI != ASE; ++ASI)
+      ValueToAllocaMap.insert(std::make_pair(ASI->first, AI));
+
+    DOUT << "LICM: Promoting value: " << *V << "\n";
+  }
+}
diff --git a/lib/Transforms/Scalar/LoopRotation.cpp b/lib/Transforms/Scalar/LoopRotation.cpp
new file mode 100644
index 0000000..d35a8ed
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopRotation.cpp
@@ -0,0 +1,579 @@
+//===- LoopRotation.cpp - Loop Rotation Pass ------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Devang Patel and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements Loop Rotation Pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-rotate"
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+#define MAX_HEADER_SIZE 16
+
+STATISTIC(NumRotated, "Number of loops rotated");
+namespace {
+
+  class VISIBILITY_HIDDEN RenameData {
+  public:
+    RenameData(Instruction *O, Value *P, Instruction *H) 
+      : Original(O), PreHeader(P), Header(H) { }
+  public:
+    Instruction *Original; // Original instruction
+    Value *PreHeader; // Original pre-header replacement
+    Instruction *Header; // New header replacement
+  };
+  
  /// LoopRotate - Loop pass that rotates a top-tested loop into a
  /// bottom-tested loop by copying the header's work into the pre-header and
  /// electing the header's in-loop successor as the new header.
  class VISIBILITY_HIDDEN LoopRotate : public LoopPass {

  public:
    static char ID; // Pass ID, replacement for typeid
    LoopRotate() : LoopPass((intptr_t)&ID) {}

    // Rotate Loop L as many times as possible. Return true if
    // loop is rotated at least once.
    bool runOnLoop(Loop *L, LPPassManager &LPM);

    // LCSSA form makes instruction renaming easier.
    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.addRequiredID(LCSSAID);
      AU.addPreservedID(LCSSAID);
      AU.addPreserved<ScalarEvolution>();
      AU.addPreserved<LoopInfo>();
      AU.addRequiredID(LoopSimplifyID);
      AU.addPreservedID(LoopSimplifyID);
    }

    // Helper functions

    /// Do actual work
    bool rotateLoop(Loop *L, LPPassManager &LPM);
    
    /// Initialize local data
    void initialize();

    /// Make sure all Exit block PHINodes have required incoming values.
    /// If incoming value is constant or defined outside the loop then
    /// PHINode may not have an entry for original pre-header. 
    void  updateExitBlock();

    /// Return true if this instruction is used outside original header.
    bool usedOutsideOriginalHeader(Instruction *In);

    /// Find Replacement information for instruction. Return NULL if it is
    /// not available.
    const RenameData *findReplacementData(Instruction *I);

    /// After loop rotation, loop pre-header has multiple successors.
    /// Insert one forwarding basic block to ensure that loop pre-header
    /// has only one successor.
    void preserveCanonicalLoopForm(LPPassManager &LPM);

  private:

    Loop *L;                    // Loop currently being rotated.
    BasicBlock *OrigHeader;     // Header before rotation (becomes new latch).
    BasicBlock *OrigPreHeader;  // Pre-header before rotation.
    BasicBlock *OrigLatch;      // Latch before rotation.
    BasicBlock *NewHeader;      // Header's in-loop successor; header after rotation.
    BasicBlock *Exit;           // The loop's single exit block.
    LPPassManager *LPM_Ptr;     // Pass manager, cached for helpers.
    // Renaming records for instructions copied out of the original header.
    SmallVector<RenameData, MAX_HEADER_SIZE> LoopHeaderInfo;
  };
+  
  char LoopRotate::ID = 0;
  // Register the pass so it is available as -loop-rotate.
  RegisterPass<LoopRotate> X ("loop-rotate", "Rotate Loops");
+}
+
+LoopPass *llvm::createLoopRotatePass() { return new LoopRotate(); }
+
+/// Rotate Loop L as many times as possible. Return true if
+/// loop is rotated at least once.
+bool LoopRotate::runOnLoop(Loop *Lp, LPPassManager &LPM) {
+
+  bool RotatedOneLoop = false;
+  initialize();
+  LPM_Ptr = &LPM;
+
+  // One loop can be rotated multiple times.
+  while (rotateLoop(Lp,LPM)) {
+    RotatedOneLoop = true;
+    initialize();
+  }
+
+  return RotatedOneLoop;
+}
+
/// rotateLoop - Rotate loop LP. Return true if the loop is rotated.
///
/// Rotation copies the header's instructions into the pre-header, makes the
/// header's in-loop successor the new header, and the old header becomes the
/// new loop latch.  LCSSA form is relied on to keep renaming local.
bool LoopRotate::rotateLoop(Loop *Lp, LPPassManager &LPM) {

  L = Lp;

  OrigHeader =  L->getHeader();
  OrigPreHeader = L->getLoopPreheader();
  OrigLatch = L->getLoopLatch();

  // If loop has only one block then there is not much to rotate.
  if (L->getBlocks().size() == 1)
    return false;

  assert (OrigHeader && OrigLatch && OrigPreHeader &&
          "Loop is not in canonical form");

  // If loop header is not one of the loop exit block then
  // either this loop is already rotated or it is not 
  // suitable for loop rotation transformations.
  if (!L->isLoopExit(OrigHeader))
    return false;

  // Rotation requires the header to end in a conditional branch: one arm
  // stays in the loop, the other exits.
  BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
  if (!BI)
    return false;
  assert (BI->isConditional() && "Branch Instruction is not condiitional");

  // Updating PHInodes in loops with multiple exits adds complexity. 
  // Keep it simple, and restrict loop rotation to loops with one exit only.
  // In future, lift this restriction and support for multiple exits if
  // required.
  std::vector<BasicBlock *> ExitBlocks;
  L->getExitBlocks(ExitBlocks);
  if (ExitBlocks.size() > 1)
    return false;

  // Check size of original header and reject
  // loop if it is very big.
  if (OrigHeader->getInstList().size() > MAX_HEADER_SIZE)
    return false;

  // Now, this loop is suitable for rotation.

  // Find new Loop header. NewHeader is a Header's one and only successor
  // that is inside loop.  Header's other successor is out side the
  // loop. Otherwise loop is not suitable for rotation.
  Exit = BI->getSuccessor(0);
  NewHeader = BI->getSuccessor(1);
  if (L->contains(Exit))
    std::swap(Exit, NewHeader);
  assert (NewHeader && "Unable to determine new loop header");
  assert(L->contains(NewHeader) && !L->contains(Exit) && 
         "Unable to determine loop header and exit blocks");

  // Copy PHI nodes and other instructions from original header
  // into original pre-header. Unlike original header, original pre-header is
  // not a member of loop. 
  //
  // New loop header is one and only successor of original header that 
  // is inside the loop. All other original header successors are outside 
  // the loop. Copy PHI Nodes from original header into new loop header. 
  // Add second incoming value, from original loop pre-header into these phi 
  // nodes. If a value defined in original header is used outside original 
  // header then new loop header will need new phi nodes with two incoming 
  // values, one definition from original header and second definition is 
  // from original loop pre-header.

  // Remove terminator from Original pre-header. Original pre-header will
  // receive a clone of original header terminator as a new terminator.
  OrigPreHeader->getInstList().pop_back();
  BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
  PHINode *PN = NULL;
  for (; (PN = dyn_cast<PHINode>(I)); ++I) {
    Instruction *In = I;

    // PHI nodes are not copied into original pre-header. Instead their values
    // are directly propagated.
    Value * NPV = PN->getIncomingValueForBlock(OrigPreHeader);

    // Create new PHI node with two incoming values for NewHeader.
    // One incoming value is from OrigLatch (through OrigHeader) and 
    // second incoming value is from original pre-header.
    PHINode *NH = new PHINode(In->getType(), In->getName());
    NH->addIncoming(PN->getIncomingValueForBlock(OrigLatch), OrigHeader);
    NH->addIncoming(NPV, OrigPreHeader);
    NewHeader->getInstList().push_front(NH);
    
    // "In" can be replaced by NH at various places.
    LoopHeaderInfo.push_back(RenameData(In, NPV, NH));
  }

  // Now, handle non-phi instructions.
  for (; I != E; ++I) {
    Instruction *In = I;

    assert (!isa<PHINode>(In) && "PHINode is not expected here");
    // This is not a PHI instruction. Insert its clone into original pre-header.
    // If this instruction is using a value from same basic block then
    // update it to use value from cloned instruction.
    Instruction *C = In->clone();
    C->setName(In->getName());
    OrigPreHeader->getInstList().push_back(C);

    // Rewrite the clone's operands that referred to earlier header
    // instructions so they use the pre-header replacements instead.
    for (unsigned opi = 0, e = In->getNumOperands(); opi != e; ++opi) {
      if (Instruction *OpPhi = dyn_cast<PHINode>(In->getOperand(opi))) {
        if (const RenameData *D = findReplacementData(OpPhi)) {
          // This is using values from original header PHI node.
          // Here, directly used incoming value from original pre-header.
          C->setOperand(opi, D->PreHeader);
        }
      }
      else if (Instruction *OpInsn = 
               dyn_cast<Instruction>(In->getOperand(opi))) {
        if (const RenameData *D = findReplacementData(OpInsn))
          C->setOperand(opi, D->PreHeader);
      }
    }


    // If this instruction is used outside this basic block then
    // create new PHINode for this instruction.
    Instruction *NewHeaderReplacement = NULL;
    if (usedOutsideOriginalHeader(In)) {
      PHINode *PN = new PHINode(In->getType(), In->getName());
      PN->addIncoming(In, OrigHeader);
      PN->addIncoming(C, OrigPreHeader);
      NewHeader->getInstList().push_front(PN);
      NewHeaderReplacement = PN;
    } 
    
    // "In" can be replaced by NPH or NH at various places.
    LoopHeaderInfo.push_back(RenameData(In, C, NewHeaderReplacement));
  }

  // Rename uses of original header instructions to reflect their new
  // definitions (either from original pre-header node or from newly created
  // new header PHINodes.
  //
  // Original header instructions are used in
  // 1) Original header:
  //
  //    If instruction is used in non-phi instructions then it is using
  //    definition from original header itself. Do not replace this use
  //    with definition from new header or original pre-header.
  //
  //    If instruction is used in phi node then it is an incoming 
  //    value. Rename its use to reflect new definition from new-preheader
  //    or new header.
  //
  // 2) Inside loop but not in original header
  //
  //    Replace this use to reflect definition from new header.
  for(unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
    const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];

    // Only instructions that got a new-header PHI need renaming.
    if (!ILoopHeaderInfo.Header)
      continue;

    Instruction *OldPhi = ILoopHeaderInfo.Original;
    Instruction *NewPhi = ILoopHeaderInfo.Header;

    // Before replacing uses, collect them first, so that iterator is
    // not invalidated.
    SmallVector<Instruction *, 16> AllUses;
    for (Value::use_iterator UI = OldPhi->use_begin(), UE = OldPhi->use_end();
         UI != UE; ++UI) {
      Instruction *U = cast<Instruction>(UI);
      AllUses.push_back(U);
    }

    for (SmallVector<Instruction *, 16>::iterator UI = AllUses.begin(), 
           UE = AllUses.end(); UI != UE; ++UI) {
      Instruction *U = *UI;
      BasicBlock *Parent = U->getParent();

      // Used inside original header
      if (Parent == OrigHeader) {
        // Do not rename uses inside original header non-phi instructions.
        PHINode *PU = dyn_cast<PHINode>(U);
        if (!PU)
          continue;

        // Do not rename uses inside original header phi nodes, if the
        // incoming value is for new header.
        // NOTE(review): this compares the incoming value against the using
        // PHI itself (U), not against OldPhi -- looks suspicious; confirm
        // the intended operand.
        if (PU->getBasicBlockIndex(NewHeader) != -1
            && PU->getIncomingValueForBlock(NewHeader) == U)
          continue;
        
       U->replaceUsesOfWith(OldPhi, NewPhi);
       continue;
      }

      // Used inside loop, but not in original header.
      if (L->contains(U->getParent())) {
        if (U != NewPhi)
          U->replaceUsesOfWith(OldPhi, NewPhi);
        continue;
      }
      
      // Used inside Exit Block. Since we are in LCSSA form, U must be PHINode.
      if (U->getParent() == Exit) {
        assert (isa<PHINode>(U) && "Use in Exit Block that is not PHINode");
        
        PHINode *UPhi = cast<PHINode>(U);
        // UPhi already has one incoming argument from original header. 
        // Add second incoming argument from new Pre header.
        UPhi->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
      } else {
        // Used outside Exit block. Create a new PHI node in the exit block
        // to receive the value from the new header and pre-header.
        PHINode *PN = new PHINode(U->getType(), U->getName());
        PN->addIncoming(ILoopHeaderInfo.PreHeader, OrigPreHeader);
        PN->addIncoming(OldPhi, OrigHeader);
        Exit->getInstList().push_front(PN);
        U->replaceUsesOfWith(OldPhi, PN);
      }
    }
  }
  
  /// Make sure all Exit block PHINodes have required incoming values.
  updateExitBlock();

  // Update CFG

  // Removing incoming branch from loop preheader to original header.
  // Now original header is inside the loop.
  for (BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
       I != E; ++I) {
    Instruction *In = I;
    PHINode *PN = dyn_cast<PHINode>(In);
    if (!PN)
      break;

    PN->removeIncomingValue(OrigPreHeader);
  }

  // Make NewHeader as the new header for the loop.
  L->moveToHeader(NewHeader);

  preserveCanonicalLoopForm(LPM);

  NumRotated++;
  return true;
}
+
+/// Make sure all Exit block PHINodes have required incoming values.
+/// If incoming value is constant or defined outside the loop then
+/// PHINode may not have an entry for original pre-header. 
+void LoopRotate::updateExitBlock() {
+
+  for (BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+       I != E; ++I) {
+
+    PHINode *PN = dyn_cast<PHINode>(I);
+    if (!PN)
+      break;
+
+    // There is already one incoming value from original pre-header block.
+    if (PN->getBasicBlockIndex(OrigPreHeader) != -1)
+      continue;
+
+    const RenameData *ILoopHeaderInfo;
+    Value *V = PN->getIncomingValueForBlock(OrigHeader);
+    if (isa<Instruction>(V) && 
+        (ILoopHeaderInfo = findReplacementData(cast<Instruction>(V)))) {
+      assert(ILoopHeaderInfo->PreHeader && "Missing New Preheader Instruction");
+      PN->addIncoming(ILoopHeaderInfo->PreHeader, OrigPreHeader);
+    } else {
+      PN->addIncoming(V, OrigPreHeader);
+    }
+  }
+}
+
+/// Initialize local data
+void LoopRotate::initialize() {
+  L = NULL;
+  OrigHeader = NULL;
+  OrigPreHeader = NULL;
+  NewHeader = NULL;
+  Exit = NULL;
+
+  LoopHeaderInfo.clear();
+}
+
+/// Return true if this instruction is used by any instructions in the loop that
+/// aren't in original header.
+bool LoopRotate::usedOutsideOriginalHeader(Instruction *In) {
+
+  for (Value::use_iterator UI = In->use_begin(), UE = In->use_end();
+       UI != UE; ++UI) {
+    Instruction *U = cast<Instruction>(UI);
+    if (U->getParent() != OrigHeader) {
+      if (L->contains(U->getParent()))
+        return true;
+    }
+  }
+
+  return false;
+}
+
+/// Find Replacement information for instruction. Return NULL if it is
+/// not available.
+const RenameData *LoopRotate::findReplacementData(Instruction *In) {
+
+  // Since LoopHeaderInfo is small, linear walk is OK.
+  for(unsigned LHI = 0, LHI_E = LoopHeaderInfo.size(); LHI != LHI_E; ++LHI) {
+    const RenameData &ILoopHeaderInfo = LoopHeaderInfo[LHI];
+    if (ILoopHeaderInfo.Original == In)
+      return &ILoopHeaderInfo;
+  }
+  return NULL;
+}
+
+/// After loop rotation, loop pre-header has multiple sucessors.
+/// Insert one forwarding basic block to ensure that loop pre-header
+/// has only one successor.
+void LoopRotate::preserveCanonicalLoopForm(LPPassManager &LPM) {
+
+  // Right now original pre-header has two successors, new header and
+  // exit block. Insert new block between original pre-header and
+  // new header such that loop's new pre-header has only one successor.
+  BasicBlock *NewPreHeader = new BasicBlock("bb.nph", OrigHeader->getParent(), 
+                                NewHeader);
+  LoopInfo &LI = LPM.getAnalysis<LoopInfo>();
+  if (Loop *PL = LI.getLoopFor(OrigPreHeader))
+    PL->addBasicBlockToLoop(NewPreHeader, LI);
+  new BranchInst(NewHeader, NewPreHeader);
+  
+  BranchInst *OrigPH_BI = cast<BranchInst>(OrigPreHeader->getTerminator());
+  if (OrigPH_BI->getSuccessor(0) == NewHeader)
+    OrigPH_BI->setSuccessor(0, NewPreHeader);
+  else {
+    assert (OrigPH_BI->getSuccessor(1) == NewHeader &&
+            "Unexpected original pre-header terminator");
+    OrigPH_BI->setSuccessor(1, NewPreHeader);
+  }
+  
+  for (BasicBlock::iterator I = NewHeader->begin(), E = NewHeader->end();
+       I != E; ++I) {
+    Instruction *In = I;
+    PHINode *PN = dyn_cast<PHINode>(In);
+    if (!PN)
+      break;
+
+    int index = PN->getBasicBlockIndex(OrigPreHeader);
+    assert (index != -1 && "Expected incoming value from Original PreHeader");
+    PN->setIncomingBlock(index, NewPreHeader);
+    assert (PN->getBasicBlockIndex(OrigPreHeader) == -1 && 
+            "Expected only one incoming value from Original PreHeader");
+  }
+
+  if (DominatorTree *DT = getAnalysisToUpdate<DominatorTree>()) {
+    DT->addNewBlock(NewPreHeader, OrigPreHeader);
+    DT->changeImmediateDominator(L->getHeader(), NewPreHeader);
+    DT->changeImmediateDominator(Exit, OrigPreHeader);
+    for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      BasicBlock *B = *BI;
+      if (L->getHeader() != B) {
+        DomTreeNode *Node = DT->getNode(B);
+        if (Node && Node->getBlock() == OrigHeader)
+          DT->changeImmediateDominator(*BI, L->getHeader());
+      }
+    }
+    DT->changeImmediateDominator(OrigHeader, OrigLatch);
+  }
+
+  if(DominanceFrontier *DF = getAnalysisToUpdate<DominanceFrontier>()) {
+
+    // New Preheader's dominance frontier is Exit block.
+    DominanceFrontier::DomSetType NewPHSet;
+    NewPHSet.insert(Exit);
+    DF->addBasicBlock(NewPreHeader, NewPHSet);
+
+    // New Header's dominance frontier now includes itself and Exit block
+    DominanceFrontier::iterator HeadI = DF->find(L->getHeader());
+    if (HeadI != DF->end()) {
+      DominanceFrontier::DomSetType & HeaderSet = HeadI->second;
+      HeaderSet.clear();
+      HeaderSet.insert(L->getHeader());
+      HeaderSet.insert(Exit);
+    } else {
+      DominanceFrontier::DomSetType HeaderSet;
+      HeaderSet.insert(L->getHeader());
+      HeaderSet.insert(Exit);
+      DF->addBasicBlock(L->getHeader(), HeaderSet);
+    }
+
+    // Original header (new Loop Latch)'s dominance frontier is Exit.
+    DominanceFrontier::iterator LatchI = DF->find(L->getLoopLatch());
+    if (LatchI != DF->end()) {
+      DominanceFrontier::DomSetType &LatchSet = LatchI->second;
+      LatchSet = LatchI->second;
+      LatchSet.clear();
+      LatchSet.insert(Exit);
+    } else {
+      DominanceFrontier::DomSetType LatchSet;
+      LatchSet.insert(Exit);
+      DF->addBasicBlock(L->getHeader(), LatchSet);
+    }
+
+    // If a loop block dominates new loop latch then its frontier is
+    // new header and Exit.
+    BasicBlock *NewLatch = L->getLoopLatch();
+    DominatorTree *DT = getAnalysisToUpdate<DominatorTree>();
+    for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+         BI != BE; ++BI) {
+      BasicBlock *B = *BI;
+      if (DT->dominates(B, NewLatch)) {
+        DominanceFrontier::iterator BDFI = DF->find(B);
+        if (BDFI != DF->end()) {
+          DominanceFrontier::DomSetType &BSet = BDFI->second;
+          BSet = BDFI->second;
+          BSet.clear();
+          BSet.insert(L->getHeader());
+          BSet.insert(Exit);
+        } else {
+          DominanceFrontier::DomSetType BSet;
+          BSet.insert(L->getHeader());
+          BSet.insert(Exit);
+          DF->addBasicBlock(B, BSet);
+        }
+      }
+    }
+  }
+
+  // Preserve canonical loop form, which means Exit block should
+  // have only one predecessor.
+  BasicBlock *NExit = SplitEdge(L->getLoopLatch(), Exit, this);
+
+  // Preserve LCSSA.
+  BasicBlock::iterator I = Exit->begin(), E = Exit->end();
+  PHINode *PN = NULL;
+  for (; (PN = dyn_cast<PHINode>(I)); ++I) {
+    PHINode *NewPN = new PHINode(PN->getType(), PN->getName());
+    unsigned N = PN->getNumIncomingValues();
+    for (unsigned index = 0; index < N; ++index)
+      if (PN->getIncomingBlock(index) == NExit) {
+        NewPN->addIncoming(PN->getIncomingValue(index), L->getLoopLatch());
+        PN->setIncomingValue(index, NewPN);
+        PN->setIncomingBlock(index, NExit);
+        NExit->getInstList().push_front(NewPN);
+      }
+  }
+
+  assert (NewHeader && L->getHeader() == NewHeader 
+          && "Invalid loop header after loop rotation");
+  assert (NewPreHeader && L->getLoopPreheader() == NewPreHeader
+          && "Invalid loop preheader after loop rotation");
+  assert (L->getLoopLatch() 
+          && "Invalid loop latch after loop rotation");
+
+}
diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
new file mode 100644
index 0000000..9689c12
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -0,0 +1,1504 @@
+//===- LoopStrengthReduce.cpp - Strength Reduce GEPs in Loops -------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nate Begeman and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a strength reduction on array references inside loops that
+// have as one or more of their components the loop induction variable.  This is
+// accomplished by creating a new Value to hold the initial value of the array
+// access for the first iteration, and then creating a new GEP instruction in
+// the loop to increment the value by the appropriate amount.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-reduce"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Type.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Target/TargetLowering.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumReduced , "Number of GEPs strength reduced");
+STATISTIC(NumInserted, "Number of PHIs inserted");
+STATISTIC(NumVariable, "Number of PHIs with variable strides");
+
+namespace {
+
+  struct BasedUser;
+
  /// IVStrideUse - Keep track of one use of a strided induction variable, where
  /// the stride is stored externally.  The Offset member keeps track of the 
  /// offset from the IV, User is the actual user of the operand, and 'Operand'
  /// is the operand # of the User that is the use.
  struct VISIBILITY_HIDDEN IVStrideUse {
    SCEVHandle Offset;          // Offset of this use from the IV.
    Instruction *User;          // The instruction using the IV.
    Value *OperandValToReplace; // The operand value to rewrite.

    // isUseOfPostIncrementedValue - True if this should use the
    // post-incremented version of this IV, not the preincremented version.
    // This can only be set in special cases, such as the terminating setcc
    // instruction for a loop or uses dominated by the loop.
    bool isUseOfPostIncrementedValue;
    
    IVStrideUse(const SCEVHandle &Offs, Instruction *U, Value *O)
      : Offset(Offs), User(U), OperandValToReplace(O),
        isUseOfPostIncrementedValue(false) {}
  };
+  
  /// IVUsersOfOneStride - This structure keeps track of all instructions that
  /// have an operand that is based on the trip count multiplied by some stride.
  /// The stride for all of these users is common and kept external to this
  /// structure.
  struct VISIBILITY_HIDDEN IVUsersOfOneStride {
    /// Users - Keep track of all of the users of this stride as well as the
    /// initial value and the operand that uses the IV.
    std::vector<IVStrideUse> Users;
    
    /// addUser - Record one more use of this stride.
    void addUser(const SCEVHandle &Offset,Instruction *User, Value *Operand) {
      Users.push_back(IVStrideUse(Offset, User, Operand));
    }
  };
+
+  /// IVInfo - This structure keeps track of one IV expression inserted during
+  /// StrengthReduceStridedIVUsers. It contains the stride, the common base, as
+  /// well as the PHI node and increment value created for rewrite.
+  struct VISIBILITY_HIDDEN IVExpr {
+    SCEVHandle  Stride;
+    SCEVHandle  Base;
+    PHINode    *PHI;
+    Value      *IncV;
+
+    IVExpr()
+      : Stride(SCEVUnknown::getIntegerSCEV(0, Type::Int32Ty)),
+        Base  (SCEVUnknown::getIntegerSCEV(0, Type::Int32Ty)) {}
+    IVExpr(const SCEVHandle &stride, const SCEVHandle &base, PHINode *phi,
+           Value *incv)
+      : Stride(stride), Base(base), PHI(phi), IncV(incv) {}
+  };
+
  /// IVsOfOneStride - This structure keeps track of all IV expression inserted
  /// during StrengthReduceStridedIVUsers for a particular stride of the IV.
  struct VISIBILITY_HIDDEN IVsOfOneStride {
    std::vector<IVExpr> IVs;

    /// addIV - Record one more IV expression created for this stride.
    void addIV(const SCEVHandle &Stride, const SCEVHandle &Base, PHINode *PHI,
               Value *IncV) {
      IVs.push_back(IVExpr(Stride, Base, PHI, IncV));
    }
  };
+
  /// LoopStrengthReduce - Loop pass that strength-reduces IV-based address
  /// computations inside loops by introducing new induction variables.
  class VISIBILITY_HIDDEN LoopStrengthReduce : public LoopPass {
    LoopInfo *LI;             // Current LoopInfo analysis.
    DominatorTree *DT;        // Current dominator tree.
    ScalarEvolution *SE;      // Current scalar-evolution analysis.
    const TargetData *TD;     // Target data layout (sizes, offsets).
    const Type *UIntPtrTy;    // Integer type with pointer width (uintptr_t).
    bool Changed;             // Whether this pass modified the function.

    /// IVUsesByStride - Keep track of all uses of induction variables that we
    /// are interested in.  The key of the map is the stride of the access.
    std::map<SCEVHandle, IVUsersOfOneStride> IVUsesByStride;

    /// IVsByStride - Keep track of all IVs that have been inserted for a
    /// particular stride.
    std::map<SCEVHandle, IVsOfOneStride> IVsByStride;

    /// StrideOrder - An ordering of the keys in IVUsesByStride that is stable:
    /// We use this to iterate over the IVUsesByStride collection without being
    /// dependent on random ordering of pointers in the process.
    std::vector<SCEVHandle> StrideOrder;

    /// CastedValues - As we need to cast values to uintptr_t, this keeps track
    /// of the casted version of each value.  This is accessed by
    /// getCastedVersionOf.
    std::map<Value*, Value*> CastedPointers;

    /// DeadInsts - Keep track of instructions we may have made dead, so that
    /// we can remove them after we are done working.
    std::set<Instruction*> DeadInsts;

    /// TLI - Keep a pointer of a TargetLowering to consult for determining
    /// transformation profitability.
    const TargetLowering *TLI;

  public:
    static char ID; // Pass ID, replacement for typeid
    LoopStrengthReduce(const TargetLowering *tli = NULL) : 
      LoopPass((intptr_t)&ID), TLI(tli) {
    }

    bool runOnLoop(Loop *L, LPPassManager &LPM);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      // We split critical edges, so we change the CFG.  However, we do update
      // many analyses if they are around.
      AU.addPreservedID(LoopSimplifyID);
      AU.addPreserved<LoopInfo>();
      AU.addPreserved<DominanceFrontier>();
      AU.addPreserved<DominatorTree>();

      AU.addRequiredID(LoopSimplifyID);
      AU.addRequired<LoopInfo>();
      AU.addRequired<DominatorTree>();
      AU.addRequired<TargetData>();
      AU.addRequired<ScalarEvolution>();
    }
    
    /// getCastedVersionOf - Return the specified value casted to uintptr_t.
    ///
    Value *getCastedVersionOf(Instruction::CastOps opcode, Value *V);
private:
    /// AddUsersIfInteresting - Inspect I's users; record strided IV uses.
    bool AddUsersIfInteresting(Instruction *I, Loop *L,
                               std::set<Instruction*> &Processed);
    /// GetExpressionSCEV - Compute the SCEV for an instruction (handles GEPs).
    SCEVHandle GetExpressionSCEV(Instruction *E, Loop *L);

    /// OptimizeIndvars - Clean up/improve induction variables in loop L.
    void OptimizeIndvars(Loop *L);
    /// FindIVForUser - Locate the IV and stride used by a loop-exit compare.
    bool FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse,
                       const SCEVHandle *&CondStride);

    /// CheckForIVReuse - See if an existing IV of another stride can be reused.
    unsigned CheckForIVReuse(const SCEVHandle&, IVExpr&, const Type*,
                             const std::vector<BasedUser>& UsersToProcess);

    /// ValidStride - Whether the target can fold this stride into addressing.
    bool ValidStride(int64_t, const std::vector<BasedUser>& UsersToProcess);

    /// StrengthReduceStridedIVUsers - Rewrite all users of one stride.
    void StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
                                      IVUsersOfOneStride &Uses,
                                      Loop *L, bool isOnlyStride);
    /// DeleteTriviallyDeadInstructions - Worklist-driven dead-code cleanup.
    void DeleteTriviallyDeadInstructions(std::set<Instruction*> &Insts);
  };
  char LoopStrengthReduce::ID = 0;
  // Register the pass so it is available as -loop-reduce.
  RegisterPass<LoopStrengthReduce> X("loop-reduce", "Loop Strength Reduction");
+}
+
+LoopPass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) {
+  return new LoopStrengthReduce(TLI);
+}
+
+/// getCastedVersionOf - Return the specified value casted to uintptr_t. This
+/// assumes that the Value* V is of integer or pointer type only.
+///
+Value *LoopStrengthReduce::getCastedVersionOf(Instruction::CastOps opcode, 
+                                              Value *V) {
+  if (V->getType() == UIntPtrTy) return V;
+  if (Constant *CB = dyn_cast<Constant>(V))
+    return ConstantExpr::getCast(opcode, CB, UIntPtrTy);
+
+  Value *&New = CastedPointers[V];
+  if (New) return New;
+  
+  New = SCEVExpander::InsertCastOfTo(opcode, V, UIntPtrTy);
+  DeadInsts.insert(cast<Instruction>(New));
+  return New;
+}
+
+
+/// DeleteTriviallyDeadInstructions - If any of the instructions is the
+/// specified set are trivially dead, delete them and see if this makes any of
+/// their operands subsequently dead.
+void LoopStrengthReduce::
+DeleteTriviallyDeadInstructions(std::set<Instruction*> &Insts) {
+  while (!Insts.empty()) {
+    Instruction *I = *Insts.begin();
+    Insts.erase(Insts.begin());
+    if (isInstructionTriviallyDead(I)) {
+      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+        if (Instruction *U = dyn_cast<Instruction>(I->getOperand(i)))
+          Insts.insert(U);
+      SE->deleteValueFromRecords(I);
+      I->eraseFromParent();
+      Changed = true;
+    }
+  }
+}
+
+
/// GetExpressionSCEV - Compute and return the SCEV for the specified
/// instruction.  Pointer-to-pointer bitcasts and GEPs (which ScalarEvolution
/// cannot analyze directly) are expanded here and cached in SE.
SCEVHandle LoopStrengthReduce::GetExpressionSCEV(Instruction *Exp, Loop *L) {
  // Pointer to pointer bitcast instructions return the same value as their
  // operand.
  if (BitCastInst *BCI = dyn_cast<BitCastInst>(Exp)) {
    if (SE->hasSCEV(BCI) || !isa<Instruction>(BCI->getOperand(0)))
      return SE->getSCEV(BCI);
    // Recurse on the bitcast's source and cache the result for the bitcast.
    SCEVHandle R = GetExpressionSCEV(cast<Instruction>(BCI->getOperand(0)), L);
    SE->setSCEV(BCI, R);
    return R;
  }

  // Scalar Evolutions doesn't know how to compute SCEV's for GEP instructions.
  // If this is a GEP that SE doesn't know about, compute it now and insert it.
  // If this is not a GEP, or if we have already done this computation, just let
  // SE figure it out.
  GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Exp);
  if (!GEP || SE->hasSCEV(GEP))
    return SE->getSCEV(Exp);
    
  // Analyze all of the subscripts of this getelementptr instruction, looking
  // for uses that are determined by the trip count of L.  First, skip all
  // operands the are not dependent on the IV.

  // Build up the base expression.  Insert an LLVM cast of the pointer to
  // uintptr_t first.
  SCEVHandle GEPVal = SCEVUnknown::get(
      getCastedVersionOf(Instruction::PtrToInt, GEP->getOperand(0)));

  gep_type_iterator GTI = gep_type_begin(GEP);
  
  for (unsigned i = 1, e = GEP->getNumOperands(); i != e; ++i, ++GTI) {
    // If this is a use of a recurrence that we can analyze, and it comes before
    // Op does in the GEP operand list, we will handle this when we process this
    // operand.
    if (const StructType *STy = dyn_cast<StructType>(*GTI)) {
      // Struct field: add the constant byte offset of the indexed field.
      const StructLayout *SL = TD->getStructLayout(STy);
      unsigned Idx = cast<ConstantInt>(GEP->getOperand(i))->getZExtValue();
      uint64_t Offset = SL->getElementOffset(Idx);
      GEPVal = SCEVAddExpr::get(GEPVal,
                                SCEVUnknown::getIntegerSCEV(Offset, UIntPtrTy));
    } else {
      // Array/sequential index: widen or narrow the index to pointer width,
      // scale it by the element size, and add it in.
      unsigned GEPOpiBits = 
        GEP->getOperand(i)->getType()->getPrimitiveSizeInBits();
      unsigned IntPtrBits = UIntPtrTy->getPrimitiveSizeInBits();
      Instruction::CastOps opcode = (GEPOpiBits < IntPtrBits ? 
          Instruction::SExt : (GEPOpiBits > IntPtrBits ? Instruction::Trunc :
            Instruction::BitCast));
      Value *OpVal = getCastedVersionOf(opcode, GEP->getOperand(i));
      SCEVHandle Idx = SE->getSCEV(OpVal);

      uint64_t TypeSize = TD->getTypeSize(GTI.getIndexedType());
      if (TypeSize != 1)
        Idx = SCEVMulExpr::get(Idx,
                               SCEVConstant::get(ConstantInt::get(UIntPtrTy,
                                                                   TypeSize)));
      GEPVal = SCEVAddExpr::get(GEPVal, Idx);
    }
  }

  // Cache the computed expression so SE (and re-entry here) can reuse it.
  SE->setSCEV(GEP, GEPVal);
  return GEPVal;
}
+
+/// getSCEVStartAndStride - Compute the start and stride of this expression,
+/// returning false if the expression is not a start/stride pair, or true if it
+/// is.  The stride must be a loop invariant expression, but the start may be
+/// a mix of loop invariant and loop variant expressions.
+///
+/// The caller initializes Start (and Stride) to the zero SCEV; Start is
+/// accumulated into.  Both outputs are only meaningful when true is returned.
+static bool getSCEVStartAndStride(const SCEVHandle &SH, Loop *L,
+                                  SCEVHandle &Start, SCEVHandle &Stride) {
+  SCEVHandle TheAddRec = Start;   // Initialize to zero.
+
+  // If the outer level is an AddExpr, the operands are all start values except
+  // for a nested AddRecExpr.
+  if (SCEVAddExpr *AE = dyn_cast<SCEVAddExpr>(SH)) {
+    for (unsigned i = 0, e = AE->getNumOperands(); i != e; ++i)
+      if (SCEVAddRecExpr *AddRec =
+             dyn_cast<SCEVAddRecExpr>(AE->getOperand(i))) {
+        if (AddRec->getLoop() == L)
+          TheAddRec = SCEVAddExpr::get(AddRec, TheAddRec);
+        else
+          return false;  // Nested IV of some sort?
+      } else {
+        // Non-recurrence operand: fold it into the start value.
+        Start = SCEVAddExpr::get(Start, AE->getOperand(i));
+      }
+        
+  } else if (isa<SCEVAddRecExpr>(SH)) {
+    TheAddRec = SH;
+  } else {
+    return false;  // not analyzable.
+  }
+  
+  SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(TheAddRec);
+  if (!AddRec || AddRec->getLoop() != L) return false;
+  
+  // FIXME: Generalize to non-affine IV's.
+  if (!AddRec->isAffine()) return false;
+
+  // The recurrence's own start folds into the accumulated start value.
+  Start = SCEVAddExpr::get(Start, AddRec->getOperand(0));
+  
+  if (!isa<SCEVConstant>(AddRec->getOperand(1)))
+    DOUT << "[" << L->getHeader()->getName()
+         << "] Variable stride: " << *AddRec << "\n";
+
+  // For an affine recurrence {start,+,step}, operand 1 is the stride.
+  Stride = AddRec->getOperand(1);
+  return true;
+}
+
+/// IVUseShouldUsePostIncValue - We have discovered a "User" of an IV expression
+/// and now we need to decide whether the user should use the preinc or post-inc
+/// value.  If this user should use the post-inc version of the IV, return true.
+///
+/// Choosing wrong here can break dominance properties (if we choose to use the
+/// post-inc value when we cannot) or it can end up adding extra live-ranges to
+/// the loop, resulting in reg-reg copies (if we use the pre-inc value when we
+/// should use the post-inc value).
+///
+/// May split critical edges (a CFG mutation) in the PHI case below.
+static bool IVUseShouldUsePostIncValue(Instruction *User, Instruction *IV,
+                                       Loop *L, DominatorTree *DT, Pass *P) {
+  // If the user is in the loop, use the preinc value.
+  if (L->contains(User->getParent())) return false;
+  
+  // NOTE(review): LatchBlock is not checked for null before the dominance
+  // query below — presumably callers only run on loops in canonical form
+  // with a single latch; confirm.
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  
+  // Ok, the user is outside of the loop.  If it is dominated by the latch
+  // block, use the post-inc value.
+  if (DT->dominates(LatchBlock, User->getParent()))
+    return true;
+
+  // There is one case we have to be careful of: PHI nodes.  These little guys
+  // can live in blocks that do not dominate the latch block, but (since their
+  // uses occur in the predecessor block, not the block the PHI lives in) should
+  // still use the post-inc value.  Check for this case now.
+  PHINode *PN = dyn_cast<PHINode>(User);
+  if (!PN) return false;  // not a phi, not dominated by latch block.
+  
+  // Look at all of the uses of IV by the PHI node.  If any use corresponds to
+  // a block that is not dominated by the latch block, give up and use the
+  // preincremented value.
+  unsigned NumUses = 0;
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == IV) {
+      ++NumUses;
+      if (!DT->dominates(LatchBlock, PN->getIncomingBlock(i)))
+        return false;
+    }
+
+  // Okay, all uses of IV by PN are in predecessor blocks that really are
+  // dominated by the latch block.  Split the critical edges and use the
+  // post-incremented value.
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+    if (PN->getIncomingValue(i) == IV) {
+      SplitCriticalEdge(PN->getIncomingBlock(i), PN->getParent(), P,
+                        true);
+      // Splitting the critical edge can reduce the number of entries in this
+      // PHI.
+      e = PN->getNumIncomingValues();
+      // Stop once every use of IV found above has been processed.
+      if (--NumUses == 0) break;
+    }
+  
+  return true;
+}
+
+  
+
+/// AddUsersIfInteresting - Inspect the specified instruction.  If it is a
+/// reducible SCEV, recursively add its users to the IVUsesByStride set and
+/// return true.  Otherwise, return false.
+///
+/// Processed prevents re-analysis (and infinite recursion through PHIs).
+bool LoopStrengthReduce::AddUsersIfInteresting(Instruction *I, Loop *L,
+                                            std::set<Instruction*> &Processed) {
+  if (!I->getType()->isInteger() && !isa<PointerType>(I->getType()))
+      return false;   // Void and FP expressions cannot be reduced.
+  if (!Processed.insert(I).second)
+    return true;    // Instruction already handled.
+  
+  // Get the symbolic expression for this instruction.
+  SCEVHandle ISE = GetExpressionSCEV(I, L);
+  if (isa<SCEVCouldNotCompute>(ISE)) return false;
+  
+  // Get the start and stride for this expression.  Both are seeded with the
+  // zero SCEV; getSCEVStartAndStride accumulates into them.
+  SCEVHandle Start = SCEVUnknown::getIntegerSCEV(0, ISE->getType());
+  SCEVHandle Stride = Start;
+  if (!getSCEVStartAndStride(ISE, L, Start, Stride))
+    return false;  // Non-reducible symbolic expression, bail out.
+
+  std::vector<Instruction *> IUsers;
+  // Collect all I uses now because IVUseShouldUsePostIncValue may 
+  // invalidate use_iterator.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E; ++UI)
+    IUsers.push_back(cast<Instruction>(*UI));
+
+  for (unsigned iused_index = 0, iused_size = IUsers.size(); 
+       iused_index != iused_size; ++iused_index) {
+
+    Instruction *User = IUsers[iused_index];
+
+    // Do not infinitely recurse on PHI nodes.
+    if (isa<PHINode>(User) && Processed.count(User))
+      continue;
+
+    // If this is an instruction defined in a nested loop, or outside this loop,
+    // don't recurse into it.
+    bool AddUserToIVUsers = false;
+    if (LI->getLoopFor(User->getParent()) != L) {
+      DOUT << "FOUND USER in other loop: " << *User
+           << "   OF SCEV: " << *ISE << "\n";
+      AddUserToIVUsers = true;
+    } else if (!AddUsersIfInteresting(User, L, Processed)) {
+      // The user itself is not reducible, so it consumes this IV value
+      // directly and must be recorded.
+      DOUT << "FOUND USER: " << *User
+           << "   OF SCEV: " << *ISE << "\n";
+      AddUserToIVUsers = true;
+    }
+
+    if (AddUserToIVUsers) {
+      IVUsersOfOneStride &StrideUses = IVUsesByStride[Stride];
+      if (StrideUses.Users.empty())     // First occurance of this stride?
+        StrideOrder.push_back(Stride);
+      
+      // Okay, we found a user that we cannot reduce.  Analyze the instruction
+      // and decide what to do with it.  If we are a use inside of the loop, use
+      // the value before incrementation, otherwise use it after incrementation.
+      if (IVUseShouldUsePostIncValue(User, I, L, DT, this)) {
+        // The value used will be incremented by the stride more than we are
+        // expecting, so subtract this off.
+        SCEVHandle NewStart = SCEV::getMinusSCEV(Start, Stride);
+        StrideUses.addUser(NewStart, User, I);
+        StrideUses.Users.back().isUseOfPostIncrementedValue = true;
+        DOUT << "   USING POSTINC SCEV, START=" << *NewStart<< "\n";
+      } else {        
+        StrideUses.addUser(Start, User, I);
+      }
+    }
+  }
+  return true;
+}
+
+namespace {
+  /// BasedUser - For a particular base value, keep information about how we've
+  /// partitioned the expression so far.
+  struct BasedUser {
+    /// Base - The Base value for the PHI node that needs to be inserted for
+    /// this use.  As the use is processed, information gets moved from this
+    /// field to the Imm field (below).  BasedUser values are sorted by this
+    /// field.
+    SCEVHandle Base;
+    
+    /// Inst - The instruction using the induction variable.
+    Instruction *Inst;
+
+    /// OperandValToReplace - The operand value of Inst to replace with the
+    /// EmittedBase.
+    Value *OperandValToReplace;
+
+    /// Imm - The immediate value that should be added to the base immediately
+    /// before Inst, because it will be folded into the imm field of the
+    /// instruction.
+    SCEVHandle Imm;
+
+    /// EmittedBase - The actual value* to use for the base value of this
+    /// operation.  This is null if we should just use zero so far.
+    Value *EmittedBase;
+
+    // isUseOfPostIncrementedValue - True if this should use the
+    // post-incremented version of this IV, not the preincremented version.
+    // This can only be set in special cases, such as the terminating setcc
+    // instruction for a loop and uses outside the loop that are dominated by
+    // the loop.
+    bool isUseOfPostIncrementedValue;
+    
+    // Construct from an IVStrideUse; the immediate starts out as zero of the
+    // Base's type.
+    BasedUser(IVStrideUse &IVSU)
+      : Base(IVSU.Offset), Inst(IVSU.User), 
+        OperandValToReplace(IVSU.OperandValToReplace), 
+        Imm(SCEVUnknown::getIntegerSCEV(0, Base->getType())), EmittedBase(0),
+        isUseOfPostIncrementedValue(IVSU.isUseOfPostIncrementedValue) {}
+
+    // Once we rewrite the code to insert the new IVs we want, update the
+    // operands of Inst to use the new expression 'NewBase', with 'Imm' added
+    // to it.
+    void RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                        SCEVExpander &Rewriter, Loop *L,
+                                        Pass *P);
+    
+    // Emit code computing NewBase (+ Imm, if non-zero) before IP, hoisting
+    // the base computation out of nested loops where possible.
+    Value *InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, 
+                                       SCEVExpander &Rewriter,
+                                       Instruction *IP, Loop *L);
+    void dump() const;
+  };
+}
+
+// Print this BasedUser's fields to stderr for debugging; EmittedBase is
+// only shown once it has been materialized.
+void BasedUser::dump() const {
+  cerr << " Base=" << *Base << " Imm=" << *Imm;
+  if (EmittedBase)
+    cerr << "  EB=" << *EmittedBase;
+  cerr << "   Inst: " << *Inst;
+}
+
+// Emit code for NewBase (plus the Imm field, if non-zero) and return the
+// resulting Value.  The base computation is hoisted out of as many nested
+// loops as it is invariant in; the immediate is always emitted at IP.
+Value *BasedUser::InsertCodeForBaseAtPosition(const SCEVHandle &NewBase, 
+                                              SCEVExpander &Rewriter,
+                                              Instruction *IP, Loop *L) {
+  // Figure out where we *really* want to insert this code.  In particular, if
+  // the user is inside of a loop that is nested inside of L, we really don't
+  // want to insert this expression before the user, we'd rather pull it out as
+  // many loops as possible.
+  LoopInfo &LI = Rewriter.getLoopInfo();
+  Instruction *BaseInsertPt = IP;
+  
+  // Figure out the most-nested loop that IP is in.
+  Loop *InsertLoop = LI.getLoopFor(IP->getParent());
+  
+  // If InsertLoop is not L, and InsertLoop is nested inside of L, figure out
+  // the preheader of the outer-most loop where NewBase is not loop invariant.
+  while (InsertLoop && NewBase->isLoopInvariant(InsertLoop)) {
+    BaseInsertPt = InsertLoop->getLoopPreheader()->getTerminator();
+    InsertLoop = InsertLoop->getParentLoop();
+  }
+  
+  // If there is no immediate value, skip the next part.
+  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Imm))
+    if (SC->getValue()->isZero())
+      return Rewriter.expandCodeFor(NewBase, BaseInsertPt);
+
+  Value *Base = Rewriter.expandCodeFor(NewBase, BaseInsertPt);
+
+  // If we are inserting the base and imm values in the same block, make sure to
+  // adjust the IP position if insertion reused a result.
+  if (IP == BaseInsertPt)
+    IP = Rewriter.getInsertionPoint();
+  
+  // Always emit the immediate (if non-zero) into the same block as the user.
+  SCEVHandle NewValSCEV = SCEVAddExpr::get(SCEVUnknown::get(Base), Imm);
+  return Rewriter.expandCodeFor(NewValSCEV, IP);
+  
+}
+
+
+// Once we rewrite the code to insert the new IVs we want, update the
+// operands of Inst to use the new expression 'NewBase', with 'Imm' added
+// to it.
+void BasedUser::RewriteInstructionToUseNewBase(const SCEVHandle &NewBase,
+                                               SCEVExpander &Rewriter,
+                                               Loop *L, Pass *P) {
+  if (!isa<PHINode>(Inst)) {
+    // By default, insert code at the user instruction.
+    BasicBlock::iterator InsertPt = Inst;
+    
+    // However, if the Operand is itself an instruction, the (potentially
+    // complex) inserted code may be shared by many users.  Because of this, we
+    // want to emit code for the computation of the operand right before its old
+    // computation.  This is usually safe, because we obviously used to use the
+    // computation when it was computed in its current block.  However, in some
+    // cases (e.g. use of a post-incremented induction variable) the NewBase
+    // value will be pinned to live somewhere after the original computation.
+    // In this case, we have to back off.
+    if (!isUseOfPostIncrementedValue) {
+      if (Instruction *OpInst = dyn_cast<Instruction>(OperandValToReplace)) { 
+        InsertPt = OpInst;
+        // Instructions cannot be inserted among a block's PHI nodes; step
+        // past them to the first non-PHI position.
+        while (isa<PHINode>(InsertPt)) ++InsertPt;
+      }
+    }
+    Value *NewVal = InsertCodeForBaseAtPosition(NewBase, Rewriter, InsertPt, L);
+    // Adjust the type back to match the Inst.  The IV expression is computed
+    // in integer form, so pointer-typed operands need an inttoptr cast.
+    if (isa<PointerType>(OperandValToReplace->getType())) {
+      NewVal = new IntToPtrInst(NewVal, OperandValToReplace->getType(), "cast",
+                                InsertPt);
+    }
+    // Replace the use of the operand Value with the new Phi we just created.
+    Inst->replaceUsesOfWith(OperandValToReplace, NewVal);
+    DOUT << "    CHANGED: IMM =" << *Imm;
+    DOUT << "  \tNEWBASE =" << *NewBase;
+    DOUT << "  \tInst = " << *Inst;
+    return;
+  }
+  
+  // PHI nodes are more complex.  We have to insert one copy of the NewBase+Imm
+  // expression into each operand block that uses it.  Note that PHI nodes can
+  // have multiple entries for the same predecessor.  We use a map to make sure
+  // that a PHI node only has a single Value* for each predecessor (which also
+  // prevents us from inserting duplicate code in some blocks).
+  std::map<BasicBlock*, Value*> InsertedCode;
+  PHINode *PN = cast<PHINode>(Inst);
+  for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
+    if (PN->getIncomingValue(i) == OperandValToReplace) {
+      // If this is a critical edge, split the edge so that we do not insert the
+      // code on all predecessor/successor paths.  We do this unless this is the
+      // canonical backedge for this loop, as this can make some inserted code
+      // be in an illegal position.
+      BasicBlock *PHIPred = PN->getIncomingBlock(i);
+      if (e != 1 && PHIPred->getTerminator()->getNumSuccessors() > 1 &&
+          (PN->getParent() != L->getHeader() || !L->contains(PHIPred))) {
+        
+        // First step, split the critical edge.
+        SplitCriticalEdge(PHIPred, PN->getParent(), P, true);
+            
+        // Next step: move the basic block.  In particular, if the PHI node
+        // is outside of the loop, and PredTI is in the loop, we want to
+        // move the block to be immediately before the PHI block, not
+        // immediately after PredTI.
+        if (L->contains(PHIPred) && !L->contains(PN->getParent())) {
+          BasicBlock *NewBB = PN->getIncomingBlock(i);
+          NewBB->moveBefore(PN->getParent());
+        }
+        
+        // Splitting the edge can reduce the number of PHI entries we have.
+        e = PN->getNumIncomingValues();
+      }
+
+      // One inserted value per predecessor block; reuse it for duplicate
+      // entries from the same predecessor.
+      Value *&Code = InsertedCode[PN->getIncomingBlock(i)];
+      if (!Code) {
+        // Insert the code into the end of the predecessor block.
+        Instruction *InsertPt = PN->getIncomingBlock(i)->getTerminator();
+        Code = InsertCodeForBaseAtPosition(NewBase, Rewriter, InsertPt, L);
+
+        // Adjust the type back to match the PHI.
+        if (isa<PointerType>(PN->getType())) {
+          Code = new IntToPtrInst(Code, PN->getType(), "cast", InsertPt);
+        }
+      }
+      
+      // Replace the use of the operand Value with the new Phi we just created.
+      PN->setIncomingValue(i, Code);
+      // NOTE(review): clearing the expander's cache here presumably prevents
+      // reusing expanded values in blocks where they would not dominate the
+      // use — confirm against SCEVExpander's caching semantics.
+      Rewriter.clear();
+    }
+  }
+  DOUT << "    CHANGED: IMM =" << *Imm << "  Inst = " << *Inst;
+}
+
+
+/// isTargetConstant - Return true if the following can be referenced by the
+/// immediate field of a target instruction.
+static bool isTargetConstant(const SCEVHandle &V, const Type *UseTy,
+                             const TargetLowering *TLI) {
+  // Plain integer constant: ask the target whether it fits the immediate
+  // field, or fall back to a PPC-style signed 16-bit range when no target
+  // lowering info is available.
+  if (SCEVConstant *Con = dyn_cast<SCEVConstant>(V)) {
+    int64_t Imm = Con->getValue()->getSExtValue();
+    if (!TLI)
+      // Defaults to PPC. PPC allows a sign-extended 16-bit immediate field.
+      return Imm > -(1 << 16) && Imm < (1 << 16)-1;
+    TargetLowering::AddrMode AM;
+    AM.BaseOffs = Imm;
+    return TLI->isLegalAddressingMode(AM, UseTy);
+  }
+
+  // A global's address, expressed as ptrtoint of a GlobalValue, may also be
+  // representable if the target supports global-based addressing.
+  SCEVUnknown *SU = dyn_cast<SCEVUnknown>(V);
+  if (!SU || !TLI)
+    return false;
+  ConstantExpr *CE = dyn_cast<ConstantExpr>(SU->getValue());
+  if (!CE || CE->getOpcode() != Instruction::PtrToInt)
+    return false;
+  if (GlobalValue *GV = dyn_cast<GlobalValue>(CE->getOperand(0))) {
+    TargetLowering::AddrMode AM;
+    AM.BaseGV = GV;
+    return TLI->isLegalAddressingMode(AM, UseTy);
+  }
+  return false;
+}
+
+/// MoveLoopVariantsToImediateField - Move any subexpressions from Val that are
+/// loop varying to the Imm operand.  Val and Imm are updated in place; their
+/// sum is preserved.
+static void MoveLoopVariantsToImediateField(SCEVHandle &Val, SCEVHandle &Imm,
+                                            Loop *L) {
+  if (Val->isLoopInvariant(L)) return;  // Nothing to do.
+  
+  if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+    // Partition the add's operands: variant ones go to Imm, invariant ones
+    // are kept in Val.
+    std::vector<SCEVHandle> NewOps;
+    NewOps.reserve(SAE->getNumOperands());
+    
+    for (unsigned i = 0; i != SAE->getNumOperands(); ++i)
+      if (!SAE->getOperand(i)->isLoopInvariant(L)) {
+        // If this is a loop-variant expression, it must stay in the immediate
+        // field of the expression.
+        Imm = SCEVAddExpr::get(Imm, SAE->getOperand(i));
+      } else {
+        NewOps.push_back(SAE->getOperand(i));
+      }
+
+    if (NewOps.empty())
+      Val = SCEVUnknown::getIntegerSCEV(0, Val->getType());
+    else
+      Val = SCEVAddExpr::get(NewOps);
+  } else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+    // Try to pull immediates out of the start value of nested addrec's.
+    SCEVHandle Start = SARE->getStart();
+    MoveLoopVariantsToImediateField(Start, Imm, L);
+    
+    // Rebuild the recurrence with the (possibly reduced) start value.
+    std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+    Ops[0] = Start;
+    Val = SCEVAddRecExpr::get(Ops, SARE->getLoop());
+  } else {
+    // Otherwise, all of Val is variant, move the whole thing over.
+    Imm = SCEVAddExpr::get(Imm, Val);
+    Val = SCEVUnknown::getIntegerSCEV(0, Val->getType());
+  }
+}
+
+
+/// MoveImmediateValues - Look at Val, and pull out any additions of constants
+/// that can fit into the immediate field of instructions in the target.
+/// Accumulate these immediate values into the Imm value.  Val and Imm are
+/// updated in place; their sum is preserved.
+static void MoveImmediateValues(const TargetLowering *TLI,
+                                Instruction *User,
+                                SCEVHandle &Val, SCEVHandle &Imm,
+                                bool isAddress, Loop *L) {
+  // The type the immediate must be legal for: a store's immediate is judged
+  // against the type of the stored value, other users against their own type.
+  const Type *UseTy = User->getType();
+  if (StoreInst *SI = dyn_cast<StoreInst>(User))
+    UseTy = SI->getOperand(0)->getType();
+
+  if (SCEVAddExpr *SAE = dyn_cast<SCEVAddExpr>(Val)) {
+    // Recurse into each add operand; loop-variant residue also goes to Imm.
+    std::vector<SCEVHandle> NewOps;
+    NewOps.reserve(SAE->getNumOperands());
+    
+    for (unsigned i = 0; i != SAE->getNumOperands(); ++i) {
+      SCEVHandle NewOp = SAE->getOperand(i);
+      MoveImmediateValues(TLI, User, NewOp, Imm, isAddress, L);
+      
+      if (!NewOp->isLoopInvariant(L)) {
+        // If this is a loop-variant expression, it must stay in the immediate
+        // field of the expression.
+        Imm = SCEVAddExpr::get(Imm, NewOp);
+      } else {
+        NewOps.push_back(NewOp);
+      }
+    }
+
+    if (NewOps.empty())
+      Val = SCEVUnknown::getIntegerSCEV(0, Val->getType());
+    else
+      Val = SCEVAddExpr::get(NewOps);
+    return;
+  } else if (SCEVAddRecExpr *SARE = dyn_cast<SCEVAddRecExpr>(Val)) {
+    // Try to pull immediates out of the start value of nested addrec's.
+    SCEVHandle Start = SARE->getStart();
+    MoveImmediateValues(TLI, User, Start, Imm, isAddress, L);
+    
+    // Only rebuild the recurrence if the start value actually changed.
+    if (Start != SARE->getStart()) {
+      std::vector<SCEVHandle> Ops(SARE->op_begin(), SARE->op_end());
+      Ops[0] = Start;
+      Val = SCEVAddRecExpr::get(Ops, SARE->getLoop());
+    }
+    return;
+  } else if (SCEVMulExpr *SME = dyn_cast<SCEVMulExpr>(Val)) {
+    // Transform "8 * (4 + v)" -> "32 + 8*V" if "32" fits in the immed field.
+    if (isAddress && isTargetConstant(SME->getOperand(0), UseTy, TLI) &&
+        SME->getNumOperands() == 2 && SME->isLoopInvariant(L)) {
+
+      SCEVHandle SubImm = SCEVUnknown::getIntegerSCEV(0, Val->getType());
+      SCEVHandle NewOp = SME->getOperand(1);
+      MoveImmediateValues(TLI, User, NewOp, SubImm, isAddress, L);
+      
+      // If we extracted something out of the subexpressions, see if we can 
+      // simplify this!
+      if (NewOp != SME->getOperand(1)) {
+        // Scale SubImm up by "8".  If the result is a target constant, we are
+        // good.
+        SubImm = SCEVMulExpr::get(SubImm, SME->getOperand(0));
+        if (isTargetConstant(SubImm, UseTy, TLI)) {
+          // Accumulate the immediate.
+          Imm = SCEVAddExpr::get(Imm, SubImm);
+          
+          // Update what is left of 'Val'.
+          Val = SCEVMulExpr::get(SME->getOperand(0), NewOp);
+          return;
+        }
+      }
+    }
+  }
+
+  // Loop-variant expressions must stay in the immediate field of the
+  // expression.
+  if ((isAddress && isTargetConstant(Val, UseTy, TLI)) ||
+      !Val->isLoopInvariant(L)) {
+    Imm = SCEVAddExpr::get(Imm, Val);
+    Val = SCEVUnknown::getIntegerSCEV(0, Val->getType());
+    return;
+  }
+
+  // Otherwise, no immediates to move.
+}
+
+
+/// SeparateSubExprs - Decompose Expr into all of the subexpressions that are
+/// added together.  This is used to reassociate common addition subexprs
+/// together for maximal sharing when rewriting bases.
+static void SeparateSubExprs(std::vector<SCEVHandle> &SubExprs,
+                             SCEVHandle Expr) {
+  if (SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(Expr)) {
+    // Flatten an add by decomposing each of its operands in order.
+    for (unsigned i = 0, e = Add->getNumOperands(); i != e; ++i)
+      SeparateSubExprs(SubExprs, Add->getOperand(i));
+    return;
+  }
+
+  if (SCEVAddRecExpr *AddRec = dyn_cast<SCEVAddRecExpr>(Expr)) {
+    SCEVHandle Zero = SCEVUnknown::getIntegerSCEV(0, Expr->getType());
+    if (AddRec->getOperand(0) == Zero) {
+      // Already zero-based; the recurrence is a subexpression of its own.
+      SubExprs.push_back(Expr);
+    } else {
+      // Rebase the recurrence at zero, then decompose its old start value.
+      std::vector<SCEVHandle> Ops(AddRec->op_begin(), AddRec->op_end());
+      Ops[0] = Zero;
+      SubExprs.push_back(SCEVAddRecExpr::get(Ops, AddRec->getLoop()));
+      SeparateSubExprs(SubExprs, AddRec->getOperand(0));
+    }
+    return;
+  }
+
+  // Leaf expression: record it, except for the constant zero which adds
+  // nothing.
+  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Expr))
+    if (SC->getValue()->isZero())
+      return;
+  SubExprs.push_back(Expr);
+}
+
+
+/// RemoveCommonExpressionsFromUseBases - Look through all of the uses in Bases,
+/// removing any common subexpressions from it.  Anything truly common is
+/// removed, accumulated, and returned.  This looks for things like (a+b+c) and
+/// (a+c+d) -> (a+c).  The common expression is *removed* from the Bases.
+static SCEVHandle 
+RemoveCommonExpressionsFromUseBases(std::vector<BasedUser> &Uses) {
+  unsigned NumUses = Uses.size();
+
+  // Only one use?  Use its base, regardless of what it is!
+  SCEVHandle Zero = SCEVUnknown::getIntegerSCEV(0, Uses[0].Base->getType());
+  SCEVHandle Result = Zero;
+  if (NumUses == 1) {
+    // Take the whole base as the common expression, leaving the use with
+    // a zero base.
+    std::swap(Result, Uses[0].Base);
+    return Result;
+  }
+
+  // To find common subexpressions, count how many of Uses use each expression.
+  // If any subexpressions are used Uses.size() times, they are common.
+  std::map<SCEVHandle, unsigned> SubExpressionUseCounts;
+  
+  // UniqueSubExprs - Keep track of all of the subexpressions we see in the
+  // order we see them.
+  std::vector<SCEVHandle> UniqueSubExprs;
+
+  std::vector<SCEVHandle> SubExprs;
+  for (unsigned i = 0; i != NumUses; ++i) {
+    // If the base is zero (which is common), return zero now, there are no
+    // CSEs we can find.
+    if (Uses[i].Base == Zero) return Zero;
+
+    // Split the expression into subexprs.
+    SeparateSubExprs(SubExprs, Uses[i].Base);
+    // Add one to SubExpressionUseCounts for each subexpr present.
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
+      if (++SubExpressionUseCounts[SubExprs[j]] == 1)
+        UniqueSubExprs.push_back(SubExprs[j]);
+    SubExprs.clear();   // Reuse the scratch vector for the next use.
+  }
+
+  // Now that we know how many times each is used, build Result.  Iterate over
+  // UniqueSubexprs so that we have a stable ordering.
+  for (unsigned i = 0, e = UniqueSubExprs.size(); i != e; ++i) {
+    std::map<SCEVHandle, unsigned>::iterator I = 
+       SubExpressionUseCounts.find(UniqueSubExprs[i]);
+    assert(I != SubExpressionUseCounts.end() && "Entry not found?");
+    if (I->second == NumUses) {  // Found CSE!
+      Result = SCEVAddExpr::get(Result, I->first);
+    } else {
+      // Remove non-cse's from SubExpressionUseCounts.  Afterwards the map
+      // contains exactly the truly-common subexpressions.
+      SubExpressionUseCounts.erase(I);
+    }
+  }
+  
+  // If we found no CSE's, return now.
+  if (Result == Zero) return Result;
+  
+  // Otherwise, remove all of the CSE's we found from each of the base values.
+  for (unsigned i = 0; i != NumUses; ++i) {
+    // Split the expression into subexprs.
+    SeparateSubExprs(SubExprs, Uses[i].Base);
+
+    // Remove any common subexpressions.  j/e are adjusted in place because
+    // erase shifts later elements down.
+    for (unsigned j = 0, e = SubExprs.size(); j != e; ++j)
+      if (SubExpressionUseCounts.count(SubExprs[j])) {
+        SubExprs.erase(SubExprs.begin()+j);
+        --j; --e;
+      }
+    
+    // Finally, add the non-shared expressions back together.
+    if (SubExprs.empty())
+      Uses[i].Base = Zero;
+    else
+      Uses[i].Base = SCEVAddExpr::get(SubExprs);
+    SubExprs.clear();
+  }
+ 
+  return Result;
+}
+
+/// isZero - Return true iff the scalar evolution expression is the constant
+/// zero.
+///
+static bool isZero(SCEVHandle &V) {
+  SCEVConstant *C = dyn_cast<SCEVConstant>(V);
+  return C && C->getValue()->isZero();
+}
+
+/// ValidStride - Check whether the given Scale is valid for all loads and 
+/// stores in UsersToProcess.
+///
+bool LoopStrengthReduce::ValidStride(int64_t Scale, 
+                               const std::vector<BasedUser>& UsersToProcess) {
+  for (unsigned Idx = 0, NumUsers = UsersToProcess.size();
+       Idx != NumUsers; ++Idx) {
+    const BasedUser &BU = UsersToProcess[Idx];
+
+    // Determine the type of the memory access, if this user is one; other
+    // users are queried with VoidTy.
+    const Type *AccessTy = Type::VoidTy;
+    if (StoreInst *SI = dyn_cast<StoreInst>(BU.Inst))
+      AccessTy = SI->getOperand(0)->getType();
+    else if (LoadInst *LI = dyn_cast<LoadInst>(BU.Inst))
+      AccessTy = LI->getType();
+
+    // Build an imm+r*scale addressing-mode query for the target.
+    TargetLowering::AddrMode AM;
+    if (SCEVConstant *SC = dyn_cast<SCEVConstant>(BU.Imm))
+      AM.BaseOffs = SC->getValue()->getSExtValue();
+    AM.Scale = Scale;
+
+    // If load[imm+r*scale] is illegal, bail out.
+    if (!TLI->isLegalAddressingMode(AM, AccessTy))
+      return false;
+  }
+  return true;
+}
+
+/// CheckForIVReuse - Returns the multiple if the stride is the multiple
+/// of a previous stride and it is a legal value for the target addressing
+/// mode scale component. This allows the users of this stride to be rewritten
+/// as prev iv * factor. It returns 0 if no reuse is possible.
+unsigned LoopStrengthReduce::CheckForIVReuse(const SCEVHandle &Stride, 
+                                IVExpr &IV, const Type *Ty,
+                                const std::vector<BasedUser>& UsersToProcess) {
+  if (!TLI) return 0;   // Reuse requires querying the target's addr modes.
+
+  if (SCEVConstant *SC = dyn_cast<SCEVConstant>(Stride)) {
+    int64_t SInt = SC->getValue()->getSExtValue();
+    if (SInt == 1) return 0;   // A stride-1 IV gains nothing from scaling.
+
+    for (std::map<SCEVHandle, IVsOfOneStride>::iterator SI= IVsByStride.begin(),
+           SE = IVsByStride.end(); SI != SE; ++SI) {
+      int64_t SSInt = cast<SCEVConstant>(SI->first)->getValue()->getSExtValue();
+      // Compute |SInt| in 64 bits.  Do NOT use abs(): it takes an int, so a
+      // stride wider than 32 bits would be silently truncated before the
+      // comparison below.
+      int64_t AbsSInt = SInt < 0 ? -SInt : SInt;
+      // This stride is reusable only if it is the exact negation of the
+      // previous stride, or a same-or-larger exact multiple of it.
+      if (SInt != -SSInt &&
+          (AbsSInt < SSInt || (SInt % SSInt) != 0))
+        continue;
+      int64_t Scale = SInt / SSInt;
+      // Check that this stride is valid for all the types used for loads and
+      // stores; if it can be used for some and not others, we might as well use
+      // the original stride everywhere, since we have to create the IV for it
+      // anyway.
+      if (ValidStride(Scale, UsersToProcess))
+        for (std::vector<IVExpr>::iterator II = SI->second.IVs.begin(),
+               IE = SI->second.IVs.end(); II != IE; ++II)
+          // FIXME: Only handle base == 0 for now.
+          // Only reuse previous IV if it would not require a type conversion.
+          if (isZero(II->Base) && II->Base->getType() == Ty) {
+            IV = *II;
+            return Scale;
+          }
+    }
+  }
+  return 0;
+}
+
+/// PartitionByIsUseOfPostIncrementedValue - Simple boolean predicate that
+/// returns true if Val's isUseOfPostIncrementedValue is true.  Used as the
+/// predicate for std::partition over BasedUser vectors.
+static bool PartitionByIsUseOfPostIncrementedValue(const BasedUser &Val) {
+  return Val.isUseOfPostIncrementedValue;
+}
+
+/// isNonConstantNegative - Return true if the specified scev is negated, but
+/// not a constant.
+static bool isNonConstantNegative(const SCEVHandle &Expr) {
+  // Only a multiply expression can carry a negative constant factor.
+  SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(Expr);
+  if (!Mul)
+    return false;
+
+  // A constant factor, if any, is canonicalized to be the first operand;
+  // the value is negated iff that factor is negative (e.g. -42 * V).
+  if (SCEVConstant *Factor = dyn_cast<SCEVConstant>(Mul->getOperand(0)))
+    return Factor->getValue()->getValue().isNegative();
+  return false;
+}
+
+/// StrengthReduceStridedIVUsers - Strength reduce all of the users of a single
+/// stride of IV.  All of the users may have different starting values, and this
+/// may not be the only stride (we know it is if isOnlyStride is true).
+void LoopStrengthReduce::StrengthReduceStridedIVUsers(const SCEVHandle &Stride,
+                                                      IVUsersOfOneStride &Uses,
+                                                      Loop *L,
+                                                      bool isOnlyStride) {
+  // Transform our list of users and offsets to a bit more complex table.  In
+  // this new vector, each 'BasedUser' contains 'Base' the base of the
+  // strided access as well as the old information from Uses.  We progressively
+  // move information from the Base field to the Imm field, until we eventually
+  // have the full access expression to rewrite the use.
+  std::vector<BasedUser> UsersToProcess;
+  UsersToProcess.reserve(Uses.Users.size());
+  for (unsigned i = 0, e = Uses.Users.size(); i != e; ++i) {
+    UsersToProcess.push_back(Uses.Users[i]);
+    
+    // Move any loop invariant operands from the offset field to the immediate
+    // field of the use, so that we don't try to use something before it is
+    // computed.
+    MoveLoopVariantsToImediateField(UsersToProcess.back().Base,
+                                    UsersToProcess.back().Imm, L);
+    assert(UsersToProcess.back().Base->isLoopInvariant(L) &&
+           "Base value is not loop invariant!");
+  }
+
+  // We now have a whole bunch of uses of like-strided induction variables, but
+  // they might all have different bases.  We want to emit one PHI node for this
+  // stride which we fold as many common expressions (between the IVs) into as
+  // possible.  Start by identifying the common expressions in the base values 
+  // for the strides (e.g. if we have "A+C+B" and "A+B+D" as our bases, find
+  // "A+B"), emit it to the preheader, then remove the expression from the
+  // UsersToProcess base values.
+  SCEVHandle CommonExprs =
+    RemoveCommonExpressionsFromUseBases(UsersToProcess);
+  
+  // Next, figure out what we can represent in the immediate fields of
+  // instructions.  If we can represent anything there, move it to the imm
+  // fields of the BasedUsers.  We do this so that it increases the commonality
+  // of the remaining uses.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    // If the user is not in the current loop, this means it is using the exit
+    // value of the IV.  Do not put anything in the base, make sure it's all in
+    // the immediate field to allow as much factoring as possible.
+    if (!L->contains(UsersToProcess[i].Inst->getParent())) {
+      UsersToProcess[i].Imm = SCEVAddExpr::get(UsersToProcess[i].Imm,
+                                               UsersToProcess[i].Base);
+      UsersToProcess[i].Base = 
+        SCEVUnknown::getIntegerSCEV(0, UsersToProcess[i].Base->getType());
+    } else {
+      
+      // Addressing modes can be folded into loads and stores.  Be careful that
+      // the store is through the expression, not of the expression though.
+      bool isAddress = isa<LoadInst>(UsersToProcess[i].Inst);
+      if (StoreInst *SI = dyn_cast<StoreInst>(UsersToProcess[i].Inst)) {
+        if (SI->getOperand(1) == UsersToProcess[i].OperandValToReplace)
+          isAddress = true;
+      } else if (IntrinsicInst *II =
+                   dyn_cast<IntrinsicInst>(UsersToProcess[i].Inst)) {
+        // Addressing modes can also be folded into prefetches.
+        if (II->getIntrinsicID() == Intrinsic::prefetch &&
+            II->getOperand(1) == UsersToProcess[i].OperandValToReplace)
+          isAddress = true;
+      }
+      
+      MoveImmediateValues(TLI, UsersToProcess[i].Inst, UsersToProcess[i].Base,
+                          UsersToProcess[i].Imm, isAddress, L);
+    }
+  }
+
+  // Check if it is possible to reuse a IV with stride that is factor of this
+  // stride. And the multiple is a number that can be encoded in the scale
+  // field of the target addressing mode.  And we will have a valid
+  // instruction after this substitution, including the immediate field, if any.
+  PHINode *NewPHI = NULL;
+  Value   *IncV   = NULL;
+  IVExpr   ReuseIV;
+  unsigned RewriteFactor = CheckForIVReuse(Stride, ReuseIV,
+                                           CommonExprs->getType(),
+                                           UsersToProcess);
+  if (RewriteFactor != 0) {
+    DOUT << "BASED ON IV of STRIDE " << *ReuseIV.Stride
+         << " and BASE " << *ReuseIV.Base << " :\n";
+    NewPHI = ReuseIV.PHI;
+    IncV   = ReuseIV.IncV;
+  }
+
+  const Type *ReplacedTy = CommonExprs->getType();
+  
+  // Now that we know what we need to do, insert the PHI node itself.
+  //
+  DOUT << "INSERTING IV of TYPE " << *ReplacedTy << " of STRIDE "
+       << *Stride << " and BASE " << *CommonExprs << ": ";
+
+  SCEVExpander Rewriter(*SE, *LI);
+  SCEVExpander PreheaderRewriter(*SE, *LI);
+  
+  BasicBlock  *Preheader = L->getLoopPreheader();
+  Instruction *PreInsertPt = Preheader->getTerminator();
+  Instruction *PhiInsertBefore = L->getHeader()->begin();
+  
+  BasicBlock *LatchBlock = L->getLoopLatch();
+
+
+  // Emit the initial base value into the loop preheader.
+  Value *CommonBaseV
+    = PreheaderRewriter.expandCodeFor(CommonExprs, PreInsertPt);
+
+  if (RewriteFactor == 0) {
+    // Create a new Phi for this base, and stick it in the loop header.
+    NewPHI = new PHINode(ReplacedTy, "iv.", PhiInsertBefore);
+    ++NumInserted;
+  
+    // Add common base to the new Phi node.
+    NewPHI->addIncoming(CommonBaseV, Preheader);
+
+    // If the stride is negative, insert a sub instead of an add for the
+    // increment.
+    bool isNegative = isNonConstantNegative(Stride);
+    SCEVHandle IncAmount = Stride;
+    if (isNegative)
+      IncAmount = SCEV::getNegativeSCEV(Stride);
+    
+    // Insert the stride into the preheader.
+    Value *StrideV = PreheaderRewriter.expandCodeFor(IncAmount, PreInsertPt);
+    if (!isa<ConstantInt>(StrideV)) ++NumVariable;
+
+    // Emit the increment of the base value before the terminator of the loop
+    // latch block, and add it to the Phi node.
+    SCEVHandle IncExp = SCEVUnknown::get(StrideV);
+    if (isNegative)
+      IncExp = SCEV::getNegativeSCEV(IncExp);
+    IncExp = SCEVAddExpr::get(SCEVUnknown::get(NewPHI), IncExp);
+  
+    IncV = Rewriter.expandCodeFor(IncExp, LatchBlock->getTerminator());
+    IncV->setName(NewPHI->getName()+".inc");
+    NewPHI->addIncoming(IncV, LatchBlock);
+
+    // Remember this in case a later stride is multiple of this.
+    IVsByStride[Stride].addIV(Stride, CommonExprs, NewPHI, IncV);
+    
+    DOUT << " IV=%" << NewPHI->getNameStr() << " INC=%" << IncV->getNameStr();
+  } else {
+    Constant *C = dyn_cast<Constant>(CommonBaseV);
+    if (!C ||
+        (!C->isNullValue() &&
+         !isTargetConstant(SCEVUnknown::get(CommonBaseV), ReplacedTy, TLI)))
+      // We want the common base emitted into the preheader! This is just
+      // using cast as a copy so BitCast (no-op cast) is appropriate
+      CommonBaseV = new BitCastInst(CommonBaseV, CommonBaseV->getType(), 
+                                    "commonbase", PreInsertPt);
+  }
+  DOUT << "\n";
+
+  // We want to emit code for users inside the loop first.  To do this, we
+  // rearrange BasedUser so that the entries at the end have
+  // isUseOfPostIncrementedValue = false, because we pop off the end of the
+  // vector (so we handle them first).
+  std::partition(UsersToProcess.begin(), UsersToProcess.end(),
+                 PartitionByIsUseOfPostIncrementedValue);
+  
+  // Sort this by base, so that things with the same base are handled
+  // together.  By partitioning first and stable-sorting later, we are
+  // guaranteed that within each base we will pop off users from within the
+  // loop before users outside of the loop with a particular base.
+  //
+  // We would like to use stable_sort here, but we can't.  The problem is that
+  // SCEVHandle's don't have a deterministic ordering w.r.t to each other, so
+  // we don't have anything to do a '<' comparison on.  Because we think the
+  // number of uses is small, do a horrible bubble sort which just relies on
+  // ==.
+  for (unsigned i = 0, e = UsersToProcess.size(); i != e; ++i) {
+    // Get a base value.
+    SCEVHandle Base = UsersToProcess[i].Base;
+    
+    // Compact everything with this base to be consecutive with this one.
+    for (unsigned j = i+1; j != e; ++j) {
+      if (UsersToProcess[j].Base == Base) {
+        std::swap(UsersToProcess[i+1], UsersToProcess[j]);
+        ++i;
+      }
+    }
+  }
+
+  // Process all the users now.  This outer loop handles all bases, the inner
+  // loop handles all users of a particular base.
+  while (!UsersToProcess.empty()) {
+    SCEVHandle Base = UsersToProcess.back().Base;
+
+    // Emit the code for Base into the preheader.
+    Value *BaseV = PreheaderRewriter.expandCodeFor(Base, PreInsertPt);
+
+    DOUT << "  INSERTING code for BASE = " << *Base << ":";
+    if (BaseV->hasName())
+      DOUT << " Result value name = %" << BaseV->getNameStr();
+    DOUT << "\n";
+
+    // If BaseV is a constant other than 0, make sure that it gets inserted into
+    // the preheader, instead of being forward substituted into the uses.  We do
+    // this by forcing a BitCast (noop cast) to be inserted into the preheader 
+    // in this case.
+    if (Constant *C = dyn_cast<Constant>(BaseV)) {
+      if (!C->isNullValue() && !isTargetConstant(Base, ReplacedTy, TLI)) {
+        // We want this constant emitted into the preheader! This is just
+        // using cast as a copy so BitCast (no-op cast) is appropriate
+        BaseV = new BitCastInst(BaseV, BaseV->getType(), "preheaderinsert",
+                             PreInsertPt);       
+      }
+    }
+
+    // Emit the code to add the immediate offset to the Phi value, just before
+    // the instructions that we identified as using this stride and base.
+    do {
+      // FIXME: Use emitted users to emit other users.
+      BasedUser &User = UsersToProcess.back();
+
+      // If this instruction wants to use the post-incremented value, move it
+      // after the post-inc and use its value instead of the PHI.
+      Value *RewriteOp = NewPHI;
+      if (User.isUseOfPostIncrementedValue) {
+        RewriteOp = IncV;
+
+        // If this user is in the loop, make sure it is the last thing in the
+        // loop to ensure it is dominated by the increment.
+        if (L->contains(User.Inst->getParent()))
+          User.Inst->moveBefore(LatchBlock->getTerminator());
+      }
+      if (RewriteOp->getType() != ReplacedTy) {
+        Instruction::CastOps opcode = Instruction::Trunc;
+        if (ReplacedTy->getPrimitiveSizeInBits() ==
+            RewriteOp->getType()->getPrimitiveSizeInBits())
+          opcode = Instruction::BitCast;
+        RewriteOp = SCEVExpander::InsertCastOfTo(opcode, RewriteOp, ReplacedTy);
+      }
+
+      SCEVHandle RewriteExpr = SCEVUnknown::get(RewriteOp);
+
+      // Clear the SCEVExpander's expression map so that we are guaranteed
+      // to have the code emitted where we expect it.
+      Rewriter.clear();
+
+      // If we are reusing the iv, then it must be multiplied by a constant
+      // factor to take advantage of the addressing mode scale component.
+      if (RewriteFactor != 0) {
+        RewriteExpr =
+          SCEVMulExpr::get(SCEVUnknown::getIntegerSCEV(RewriteFactor,
+                                                       RewriteExpr->getType()),
+                           RewriteExpr);
+
+        // The common base is emitted in the loop preheader. But since we
+        // are reusing an IV, it has not been used to initialize the PHI node.
+        // Add it to the expression used to rewrite the uses.
+        if (!isa<ConstantInt>(CommonBaseV) ||
+            !cast<ConstantInt>(CommonBaseV)->isZero())
+          RewriteExpr = SCEVAddExpr::get(RewriteExpr,
+                                         SCEVUnknown::get(CommonBaseV));
+      }
+
+      // Now that we know what we need to do, insert code before User for the
+      // immediate and any loop-variant expressions.
+      if (!isa<ConstantInt>(BaseV) || !cast<ConstantInt>(BaseV)->isZero())
+        // Add BaseV to the PHI value if needed.
+        RewriteExpr = SCEVAddExpr::get(RewriteExpr, SCEVUnknown::get(BaseV));
+
+      User.RewriteInstructionToUseNewBase(RewriteExpr, Rewriter, L, this);
+
+      // Mark old value we replaced as possibly dead, so that it is eliminated
+      // if we just replaced the last use of that value.
+      DeadInsts.insert(cast<Instruction>(User.OperandValToReplace));
+
+      UsersToProcess.pop_back();
+      ++NumReduced;
+
+      // If there are any more users to process with the same base, process them
+      // now.  We sorted by base above, so we just have to check the last elt.
+    } while (!UsersToProcess.empty() && UsersToProcess.back().Base == Base);
+    // TODO: Next, find out which base index is the most common, pull it out.
+  }
+
+  // IMPORTANT TODO: Figure out how to partition the IV's with this stride, but
+  // different starting values, into different PHIs.
+}
+
+/// FindIVForUser - If Cond has an operand that is an expression of an IV,
+/// set the IV user and stride information and return true, otherwise return
+/// false.
+bool LoopStrengthReduce::FindIVForUser(ICmpInst *Cond, IVStrideUse *&CondUse,
+                                       const SCEVHandle *&CondStride) {
+  // Iterate via StrideOrder (not the map directly) so the search order is
+  // deterministic; stop as soon as CondUse has been set.
+  for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e && !CondUse;
+       ++Stride) {
+    std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI = 
+    IVUsesByStride.find(StrideOrder[Stride]);
+    assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
+    
+    for (std::vector<IVStrideUse>::iterator UI = SI->second.Users.begin(),
+         E = SI->second.Users.end(); UI != E; ++UI)
+      if (UI->User == Cond) {
+        // NOTE: we could handle setcc instructions with multiple uses here, but
+        // InstCombine does it as well for simple uses, it's not clear that it
+        // occurs enough in real life to handle.
+        CondUse = &*UI;
+        CondStride = &SI->first;
+        return true;
+      }
+  }
+  return false;
+}    
+
+// OptimizeIndvars - Now that IVUsesByStride is set up with all of the indvar
+// uses in the loop, look to see if we can eliminate some, in favor of using
+// common indvars for the different uses.
+void LoopStrengthReduce::OptimizeIndvars(Loop *L) {
+  // TODO: implement optzns here.
+
+  // Finally, get the terminating condition for the loop if possible.  If we
+  // can, we want to change it to use a post-incremented version of its
+  // induction variable, to allow coalescing the live ranges for the IV into
+  // one register value.
+  PHINode *SomePHI = cast<PHINode>(L->getHeader()->begin());
+  BasicBlock  *Preheader = L->getLoopPreheader();
+  // The PHI has exactly two incoming blocks (preheader and latch); index with
+  // the boolean so we pick whichever incoming block is NOT the preheader.
+  BasicBlock *LatchBlock =
+   SomePHI->getIncomingBlock(SomePHI->getIncomingBlock(0) == Preheader);
+  BranchInst *TermBr = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+  if (!TermBr || TermBr->isUnconditional() || 
+      !isa<ICmpInst>(TermBr->getCondition()))
+    return;
+  ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition());
+
+  // Search IVUsesByStride to find Cond's IVUse if there is one.
+  IVStrideUse *CondUse = 0;
+  const SCEVHandle *CondStride = 0;
+
+  if (!FindIVForUser(Cond, CondUse, CondStride))
+    return; // setcc doesn't use the IV.
+  
+
+  // It's possible for the setcc instruction to be anywhere in the loop, and
+  // possible for it to have multiple users.  If it is not immediately before
+  // the latch block branch, move it.
+  if (&*++BasicBlock::iterator(Cond) != (Instruction*)TermBr) {
+    if (Cond->hasOneUse()) {   // Condition has a single use, just move it.
+      Cond->moveBefore(TermBr);
+    } else {
+      // Otherwise, clone the terminating condition and insert into the loopend.
+      Cond = cast<ICmpInst>(Cond->clone());
+      Cond->setName(L->getHeader()->getName() + ".termcond");
+      LatchBlock->getInstList().insert(TermBr, Cond);
+      
+      // Clone the IVUse, as the old use still exists!
+      IVUsesByStride[*CondStride].addUser(CondUse->Offset, Cond,
+                                         CondUse->OperandValToReplace);
+      CondUse = &IVUsesByStride[*CondStride].Users.back();
+    }
+  }
+
+  // If we get to here, we know that we can transform the setcc instruction to
+  // use the post-incremented version of the IV, allowing us to coalesce the
+  // live ranges for the IV correctly.
+  CondUse->Offset = SCEV::getMinusSCEV(CondUse->Offset, *CondStride);
+  CondUse->isUseOfPostIncrementedValue = true;
+}
+
+namespace {
+  // Constant strides come first which in turns are sorted by their absolute
+  // values. If absolute values are the same, then positive strides comes first.
+  // e.g.
+  // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X
+  struct StrideCompare {
+    bool operator()(const SCEVHandle &LHS, const SCEVHandle &RHS) {
+      SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS);
+      SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS);
+      if (LHSC && RHSC) {
+        int64_t  LV = LHSC->getValue()->getSExtValue();
+        int64_t  RV = RHSC->getValue()->getSExtValue();
+        uint64_t ALV = (LV < 0) ? -LV : LV;
+        uint64_t ARV = (RV < 0) ? -RV : RV;
+        if (ALV == ARV)
+          return LV > RV;
+        else
+          return ALV < ARV;
+      }
+      // At most one side is constant: a constant LHS sorts before a
+      // non-constant RHS; all other mixed/non-constant pairs are equivalent.
+      return (LHSC && !RHSC);
+    }
+  };
+}
+
+bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager &LPM) {
+
+  LI = &getAnalysis<LoopInfo>();
+  DT = &getAnalysis<DominatorTree>();
+  SE = &getAnalysis<ScalarEvolution>();
+  TD = &getAnalysis<TargetData>();
+  UIntPtrTy = TD->getIntPtrType();
+
+  // Find all uses of induction variables in this loop, and categorize
+  // them by stride.  Start by finding all of the PHI nodes in the header for
+  // this loop.  If they are induction variables, inspect their uses.
+  std::set<Instruction*> Processed;   // Don't reprocess instructions.
+  for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I)
+    AddUsersIfInteresting(I, L, Processed);
+
+  // If we have nothing to do, return.
+  if (IVUsesByStride.empty()) return false;
+
+  // Optimize induction variables.  Some indvar uses can be transformed to use
+  // strides that will be needed for other purposes.  A common example of this
+  // is the exit test for the loop, which can often be rewritten to use the
+  // computation of some other indvar to decide when to terminate the loop.
+  OptimizeIndvars(L);
+
+
+  // FIXME: We can widen subreg IV's here for RISC targets.  e.g. instead of
+  // doing computation in byte values, promote to 32-bit values if safe.
+
+  // FIXME: Attempt to reuse values across multiple IV's.  In particular, we
+  // could have something like "for(i) { foo(i*8); bar(i*16) }", which should be
+  // codegened as "for (j = 0;; j+=8) { foo(j); bar(j+j); }" on X86/PPC.  Need
+  // to be careful that IV's are all the same type.  Only works for intptr_t
+  // indvars.
+
+  // If we only have one stride, we can more aggressively eliminate some things.
+  bool HasOneStride = IVUsesByStride.size() == 1;
+
+#ifndef NDEBUG
+  DOUT << "\nLSR on ";
+  DEBUG(L->dump());
+#endif
+
+  // IVsByStride keeps IVs for one particular loop.
+  IVsByStride.clear();
+
+  // Sort the StrideOrder so we process larger strides first.
+  std::stable_sort(StrideOrder.begin(), StrideOrder.end(), StrideCompare());
+
+  // Note: this processes each stride/type pair individually.  All users passed
+  // into StrengthReduceStridedIVUsers have the same type AND stride.  Also,
+  // note that we iterate over IVUsesByStride indirectly by using StrideOrder.
+  // This extra layer of indirection makes the ordering of strides deterministic
+  // - not dependent on map order.
+  for (unsigned Stride = 0, e = StrideOrder.size(); Stride != e; ++Stride) {
+    std::map<SCEVHandle, IVUsersOfOneStride>::iterator SI = 
+      IVUsesByStride.find(StrideOrder[Stride]);
+    assert(SI != IVUsesByStride.end() && "Stride doesn't exist!");
+    StrengthReduceStridedIVUsers(SI->first, SI->second, L, HasOneStride);
+  }
+
+  // Clean up after ourselves
+  if (!DeadInsts.empty()) {
+    DeleteTriviallyDeadInstructions(DeadInsts);
+
+    BasicBlock::iterator I = L->getHeader()->begin();
+    PHINode *PN;
+    while ((PN = dyn_cast<PHINode>(I))) {
+      ++I;  // Preincrement iterator to avoid invalidating it when deleting PN.
+      
+      // At this point, we know that we have killed one or more GEP
+      // instructions.  It is worth checking to see if the canonical indvar is
+      // also dead, so that we can remove it as well.  The requirements for the
+      // canonical indvar to be considered dead are:
+      // 1. the canonical indvar has one use
+      // 2. the use is an add instruction
+      // 3. the add has one use
+      // 4. the add is used by the canonical indvar
+      // If all four cases above are true, then we can remove both the add and
+      // the canonical indvar.
+      // FIXME: this needs to eliminate an induction variable even if it's being
+      // compared against some value to decide loop termination.
+      if (PN->hasOneUse()) {
+        Instruction *BO = dyn_cast<Instruction>(*PN->use_begin());
+        if (BO && (isa<BinaryOperator>(BO) || isa<CmpInst>(BO))) {
+          if (BO->hasOneUse() && PN == *(BO->use_begin())) {
+            DeadInsts.insert(BO);
+            // Break the cycle, then delete the PHI.
+            PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
+            SE->deleteValueFromRecords(PN);
+            PN->eraseFromParent();
+          }
+        }
+      }
+    }
+    DeleteTriviallyDeadInstructions(DeadInsts);
+  }
+
+  CastedPointers.clear();
+  IVUsesByStride.clear();
+  StrideOrder.clear();
+  return false;
+}
diff --git a/lib/Transforms/Scalar/LoopUnroll.cpp b/lib/Transforms/Scalar/LoopUnroll.cpp
new file mode 100644
index 0000000..babfc24
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnroll.cpp
@@ -0,0 +1,500 @@
+//===-- LoopUnroll.cpp - Loop unroller pass -------------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass implements a simple loop unroller.  It works best when loops have
+// been canonicalized by the -indvars pass, allowing it to determine the trip
+// counts of loops easily.
+//
+// This pass will multi-block loops only if they contain no non-unrolled 
+// subloops.  The process of unrolling can produce extraneous basic blocks 
+// linked with unconditional branches.  This will be corrected in the future.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unroll"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/IntrinsicInst.h"
+#include <cstdio>
+#include <algorithm>
+using namespace llvm;
+
+STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
+STATISTIC(NumUnrolled,           "Number of loops unrolled (completely or otherwise)");
+
+namespace {
+  // Command-line knobs controlling the unroller's heuristics.
+  cl::opt<unsigned>
+  UnrollThreshold
+    ("unroll-threshold", cl::init(100), cl::Hidden,
+     cl::desc("The cut-off point for automatic loop unrolling"));
+
+  cl::opt<unsigned>
+  UnrollCount
+    ("unroll-count", cl::init(0), cl::Hidden,
+     cl::desc("Use this unroll count for all loops, for testing purposes"));
+
+  /// LoopUnroll - LoopPass that unrolls loops by a count (forced via
+  /// -unroll-count, or heuristically chosen) subject to a size threshold.
+  class VISIBILITY_HIDDEN LoopUnroll : public LoopPass {
+    LoopInfo *LI;  // The current loop information
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopUnroll() : LoopPass((intptr_t)&ID) {}
+
+    /// A magic value for use with the Threshold parameter to indicate
+    /// that the loop unroll should be performed regardless of how much
+    /// code expansion would result.
+    static const unsigned NoThreshold = UINT_MAX;
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+    bool unrollLoop(Loop *L, unsigned Count, unsigned Threshold);
+    BasicBlock *FoldBlockIntoPredecessor(BasicBlock *BB);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addRequiredID(LCSSAID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreservedID(LCSSAID);
+      AU.addPreserved<LoopInfo>();
+    }
+  };
+  char LoopUnroll::ID = 0;
+  RegisterPass<LoopUnroll> X("loop-unroll", "Unroll loops");
+}
+
+/// createLoopUnrollPass - Public interface to create the loop unroller pass.
+LoopPass *llvm::createLoopUnrollPass() { return new LoopUnroll(); }
+
+/// ApproximateLoopSize - Approximate the size of the loop.  Counts the
+/// instructions in all blocks of L, skipping header PHIs, instructions whose
+/// only use is the block terminator, and debug intrinsics, since none of
+/// those are expected to survive unrolling.
+static unsigned ApproximateLoopSize(const Loop *L) {
+  unsigned Size = 0;
+  for (unsigned i = 0, e = L->getBlocks().size(); i != e; ++i) {
+    BasicBlock *BB = L->getBlocks()[i];
+    Instruction *Term = BB->getTerminator();
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      if (isa<PHINode>(I) && BB == L->getHeader()) {
+        // Ignore PHI nodes in the header.
+      } else if (I->hasOneUse() && I->use_back() == Term) {
+        // Ignore instructions only used by the loop terminator.
+      } else if (isa<DbgInfoIntrinsic>(I)) {
+        // Ignore debug instructions
+      } else {
+        ++Size;
+      }
+
+      // TODO: Ignore expressions derived from PHI and constants if inval of phi
+      // is a constant, or if operation is associative.  This will get induction
+      // variables.
+    }
+  }
+
+  return Size;
+}
+
+// RemapInstruction - Convert the instruction operands from referencing the
+// current values into those specified by ValueMap.  Operands not present in
+// ValueMap are left unchanged.
+//
+static inline void RemapInstruction(Instruction *I,
+                                    DenseMap<const Value *, Value*> &ValueMap) {
+  for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
+    Value *Op = I->getOperand(op);
+    DenseMap<const Value *, Value*>::iterator It = ValueMap.find(Op);
+    if (It != ValueMap.end()) Op = It->second;
+    I->setOperand(op, Op);
+  }
+}
+
+// FoldBlockIntoPredecessor - Folds a basic block into its predecessor if it
+// only has one predecessor, and that predecessor only has one successor.
+// Returns the new combined block, or null if no merge was possible.
+BasicBlock *LoopUnroll::FoldBlockIntoPredecessor(BasicBlock *BB) {
+  // Merge basic blocks into their predecessor if there is only one distinct
+  // pred, and if there is only one distinct successor of the predecessor, and
+  // if there are no PHI nodes.
+  //
+  BasicBlock *OnlyPred = BB->getSinglePredecessor();
+  if (!OnlyPred) return 0;
+
+  if (OnlyPred->getTerminator()->getNumSuccessors() != 1)
+    return 0;
+
+  DOUT << "Merging: " << *BB << "into: " << *OnlyPred;
+
+  // Resolve any PHI nodes at the start of the block.  They are all
+  // guaranteed to have exactly one entry if they exist, unless there are
+  // multiple duplicate (but guaranteed to be equal) entries for the
+  // incoming edges.  This occurs when there are multiple edges from
+  // OnlyPred to BB.
+  //
+  while (PHINode *PN = dyn_cast<PHINode>(&BB->front())) {
+    PN->replaceAllUsesWith(PN->getIncomingValue(0));
+    BB->getInstList().pop_front();  // Delete the phi node...
+  }
+
+  // Delete the unconditional branch from the predecessor...
+  OnlyPred->getInstList().pop_back();
+
+  // Move all definitions in the successor to the predecessor...
+  OnlyPred->getInstList().splice(OnlyPred->end(), BB->getInstList());
+
+  // Make all PHI nodes that referred to BB now refer to Pred as their
+  // source...
+  BB->replaceAllUsesWith(OnlyPred);
+
+  std::string OldName = BB->getName();
+
+  // Erase basic block from the function...
+  LI->removeBlock(BB);
+  BB->eraseFromParent();
+
+  // Inherit predecessor's name if it exists...
+  if (!OldName.empty() && !OnlyPred->hasName())
+    OnlyPred->setName(OldName);
+
+  return OnlyPred;
+}
+
+bool LoopUnroll::runOnLoop(Loop *L, LPPassManager &LPM) {
+  LI = &getAnalysis<LoopInfo>();
+
+  // Unroll the loop.  Count/threshold come from the command-line options;
+  // unrollLoop returns false if it made no change.
+  if (!unrollLoop(L, UnrollCount, UnrollThreshold))
+    return false;
+
+  // Update the loop information for this loop.
+  // If we completely unrolled the loop, remove it from the parent.
+  if (L->getNumBackEdges() == 0)
+    LPM.deleteLoopFromQueue(L);
+
+  return true;
+}
+
+/// Unroll the given loop by UnrollCount, or by a heuristically-determined
+/// value if Count is zero. If Threshold is not NoThreshold, it is a value
+/// to limit code size expansion. If the loop size would expand beyond the
+/// threshold value, unrolling is suppressed. The return value is true if
+/// any transformations are performed.
+///
+bool LoopUnroll::unrollLoop(Loop *L, unsigned Count, unsigned Threshold) {
+  assert(L->isLCSSAForm());
+
+  BasicBlock *Header = L->getHeader();
+  BasicBlock *LatchBlock = L->getLoopLatch();
+  BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  DOUT << "Loop Unroll: F[" << Header->getParent()->getName()
+       << "] Loop %" << Header->getName() << "\n";
+
+  if (!BI || BI->isUnconditional()) {
+    // The loop-rorate pass can be helpful to avoid this in many cases.
+    DOUT << "  Can't unroll; loop not terminated by a conditional branch.\n";
+    return false;
+  }
+
+  // Determine the trip count and/or trip multiple. A TripCount value of zero
+  // is used to mean an unknown trip count. The TripMultiple value is the
+  // greatest known integer multiple of the trip count.
+  unsigned TripCount = 0;
+  unsigned TripMultiple = 1;
+  if (Value *TripCountValue = L->getTripCount()) {
+    if (ConstantInt *TripCountC = dyn_cast<ConstantInt>(TripCountValue)) {
+      // Guard against huge trip counts. This also guards against assertions in
+      // APInt from the use of getZExtValue, below.
+      if (TripCountC->getValue().getActiveBits() <= 32) {
+        TripCount = (unsigned)TripCountC->getZExtValue();
+      }
+    } else if (BinaryOperator *BO = dyn_cast<BinaryOperator>(TripCountValue)) {
+      switch (BO->getOpcode()) {
+      case BinaryOperator::Mul:
+        if (ConstantInt *MultipleC = dyn_cast<ConstantInt>(BO->getOperand(1))) {
+          if (MultipleC->getValue().getActiveBits() <= 32) {
+            TripMultiple = (unsigned)MultipleC->getZExtValue();
+          }
+        }
+        break;
+      default: break;
+      }
+    }
+  }
+  if (TripCount != 0)
+    DOUT << "  Trip Count = " << TripCount << "\n";
+  if (TripMultiple != 1)
+    DOUT << "  Trip Multiple = " << TripMultiple << "\n";
+
+  // Automatically select an unroll count.
+  if (Count == 0) {
+    // Conservative heuristic: if we know the trip count, see if we can
+    // completely unroll (subject to the threshold, checked below); otherwise
+    // don't unroll.
+    if (TripCount != 0) {
+      Count = TripCount;
+    } else {
+      return false;
+    }
+  }
+
+  // Effectively "DCE" unrolled iterations that are beyond the tripcount
+  // and will never be executed.
+  if (TripCount != 0 && Count > TripCount)
+    Count = TripCount;
+
+  assert(Count > 0);
+  assert(TripMultiple > 0);
+  assert(TripCount == 0 || TripCount % TripMultiple == 0);
+
+  // Enforce the threshold.
+  if (Threshold != NoThreshold) {
+    unsigned LoopSize = ApproximateLoopSize(L);
+    DOUT << "  Loop Size = " << LoopSize << "\n";
+    uint64_t Size = (uint64_t)LoopSize*Count;
+    if (TripCount != 1 && Size > Threshold) {
+      DOUT << "  TOO LARGE TO UNROLL: "
+           << Size << ">" << Threshold << "\n";
+      return false;
+    }
+  }
+
+  // Are we eliminating the loop control altogether?
+  bool CompletelyUnroll = Count == TripCount;
+
+  // If we know the trip count, we know the multiple...
+  unsigned BreakoutTrip = 0;
+  if (TripCount != 0) {
+    BreakoutTrip = TripCount % Count;
+    TripMultiple = 0;
+  } else {
+    // Figure out what multiple to use.
+    BreakoutTrip = TripMultiple =
+      (unsigned)GreatestCommonDivisor64(Count, TripMultiple);
+  }
+
+  if (CompletelyUnroll) {
+    DOUT << "COMPLETELY UNROLLING loop %" << Header->getName()
+         << " with trip count " << TripCount << "!\n";
+  } else {
+    DOUT << "UNROLLING loop %" << Header->getName()
+         << " by " << Count;
+    if (TripMultiple == 0 || BreakoutTrip != TripMultiple) {
+      DOUT << " with a breakout at trip " << BreakoutTrip;
+    } else if (TripMultiple != 1) {
+      DOUT << " with " << TripMultiple << " trips per branch";
+    }
+    DOUT << "!\n";
+  }
+
+  std::vector<BasicBlock*> LoopBlocks = L->getBlocks();
+
+  bool ContinueOnTrue = L->contains(BI->getSuccessor(0));
+  BasicBlock *LoopExit = BI->getSuccessor(ContinueOnTrue);
+
+  // For the first iteration of the loop, we should use the precloned values for
+  // PHI nodes.  Insert associations now.
+  typedef DenseMap<const Value*, Value*> ValueMapTy;
+  ValueMapTy LastValueMap;
+  std::vector<PHINode*> OrigPHINode;
+  for (BasicBlock::iterator I = Header->begin(); isa<PHINode>(I); ++I) {
+    PHINode *PN = cast<PHINode>(I);
+    OrigPHINode.push_back(PN);
+    if (Instruction *I = 
+                dyn_cast<Instruction>(PN->getIncomingValueForBlock(LatchBlock)))
+      if (L->contains(I->getParent()))
+        LastValueMap[I] = I;
+  }
+
+  std::vector<BasicBlock*> Headers;
+  std::vector<BasicBlock*> Latches;
+  Headers.push_back(Header);
+  Latches.push_back(LatchBlock);
+
+  for (unsigned It = 1; It != Count; ++It) {
+    char SuffixBuffer[100];
+    sprintf(SuffixBuffer, ".%d", It);
+    
+    std::vector<BasicBlock*> NewBlocks;
+    
+    for (std::vector<BasicBlock*>::iterator BB = LoopBlocks.begin(),
+         E = LoopBlocks.end(); BB != E; ++BB) {
+      ValueMapTy ValueMap;
+      BasicBlock *New = CloneBasicBlock(*BB, ValueMap, SuffixBuffer);
+      Header->getParent()->getBasicBlockList().push_back(New);
+
+      // Loop over all of the PHI nodes in the block, changing them to use the
+      // incoming values from the previous block.
+      if (*BB == Header)
+        for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+          PHINode *NewPHI = cast<PHINode>(ValueMap[OrigPHINode[i]]);
+          Value *InVal = NewPHI->getIncomingValueForBlock(LatchBlock);
+          if (Instruction *InValI = dyn_cast<Instruction>(InVal))
+            if (It > 1 && L->contains(InValI->getParent()))
+              InVal = LastValueMap[InValI];
+          ValueMap[OrigPHINode[i]] = InVal;
+          New->getInstList().erase(NewPHI);
+        }
+
+      // Update our running map of newest clones
+      LastValueMap[*BB] = New;
+      for (ValueMapTy::iterator VI = ValueMap.begin(), VE = ValueMap.end();
+           VI != VE; ++VI)
+        LastValueMap[VI->first] = VI->second;
+
+      L->addBasicBlockToLoop(New, *LI);
+
+      // Add phi entries for newly created values to all exit blocks except
+      // the successor of the latch block.  The successor of the exit block will
+      // be updated specially after unrolling all the way.
+      if (*BB != LatchBlock)
+        for (Value::use_iterator UI = (*BB)->use_begin(), UE = (*BB)->use_end();
+             UI != UE; ++UI) {
+          Instruction *UseInst = cast<Instruction>(*UI);
+          if (isa<PHINode>(UseInst) && !L->contains(UseInst->getParent())) {
+            PHINode *phi = cast<PHINode>(UseInst);
+            Value *Incoming = phi->getIncomingValueForBlock(*BB);
+            if (isa<Instruction>(Incoming))
+              Incoming = LastValueMap[Incoming];
+          
+            phi->addIncoming(Incoming, New);
+          }
+        }
+
+      // Keep track of new headers and latches as we create them, so that
+      // we can insert the proper branches later.
+      if (*BB == Header)
+        Headers.push_back(New);
+      if (*BB == LatchBlock) {
+        Latches.push_back(New);
+
+        // Also, clear out the new latch's back edge so that it doesn't look
+        // like a new loop, so that it's amenable to being merged with adjacent
+        // blocks later on.
+        TerminatorInst *Term = New->getTerminator();
+        assert(L->contains(Term->getSuccessor(!ContinueOnTrue)));
+        assert(Term->getSuccessor(ContinueOnTrue) == LoopExit);
+        Term->setSuccessor(!ContinueOnTrue, NULL);
+      }
+
+      NewBlocks.push_back(New);
+    }
+    
+    // Remap all instructions in the most recent iteration
+    for (unsigned i = 0; i < NewBlocks.size(); ++i)
+      for (BasicBlock::iterator I = NewBlocks[i]->begin(),
+           E = NewBlocks[i]->end(); I != E; ++I)
+        RemapInstruction(I, LastValueMap);
+  }
+  
+  // The latch block exits the loop.  If there are any PHI nodes in the
+  // successor blocks, update them to use the appropriate values computed as the
+  // last iteration of the loop.
+  if (Count != 1) {
+    SmallPtrSet<PHINode*, 8> Users;
+    for (Value::use_iterator UI = LatchBlock->use_begin(),
+         UE = LatchBlock->use_end(); UI != UE; ++UI)
+      if (PHINode *phi = dyn_cast<PHINode>(*UI))
+        Users.insert(phi);
+    
+    BasicBlock *LastIterationBB = cast<BasicBlock>(LastValueMap[LatchBlock]);
+    for (SmallPtrSet<PHINode*,8>::iterator SI = Users.begin(), SE = Users.end();
+         SI != SE; ++SI) {
+      PHINode *PN = *SI;
+      Value *InVal = PN->removeIncomingValue(LatchBlock, false);
+      // If this value was defined in the loop, take the value defined by the
+      // last iteration of the loop.
+      if (Instruction *InValI = dyn_cast<Instruction>(InVal)) {
+        if (L->contains(InValI->getParent()))
+          InVal = LastValueMap[InVal];
+      }
+      PN->addIncoming(InVal, LastIterationBB);
+    }
+  }
+
+  // Now, if we're doing complete unrolling, loop over the PHI nodes in the
+  // original block, setting them to their incoming values.
+  if (CompletelyUnroll) {
+    BasicBlock *Preheader = L->getLoopPreheader();
+    for (unsigned i = 0, e = OrigPHINode.size(); i != e; ++i) {
+      PHINode *PN = OrigPHINode[i];
+      PN->replaceAllUsesWith(PN->getIncomingValueForBlock(Preheader));
+      Header->getInstList().erase(PN);
+    }
+  }
+
+  // Now that all the basic blocks for the unrolled iterations are in place,
+  // set up the branches to connect them.
+  for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+    // The original branch was replicated in each unrolled iteration.
+    BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
+
+    // The branch destination.
+    unsigned j = (i + 1) % e;
+    BasicBlock *Dest = Headers[j];
+    bool NeedConditional = true;
+
+    // For a complete unroll, make the last iteration end with a branch
+    // to the exit block.
+    if (CompletelyUnroll && j == 0) {
+      Dest = LoopExit;
+      NeedConditional = false;
+    }
+
+    // If we know the trip count or a multiple of it, we can safely use an
+    // unconditional branch for some iterations.
+    if (j != BreakoutTrip && (TripMultiple == 0 || j % TripMultiple != 0)) {
+      NeedConditional = false;
+    }
+
+    if (NeedConditional) {
+      // Update the conditional branch's successor for the following
+      // iteration.
+      Term->setSuccessor(!ContinueOnTrue, Dest);
+    } else {
+      Term->setUnconditionalDest(Dest);
+      // Merge adjacent basic blocks, if possible.
+      if (BasicBlock *Fold = FoldBlockIntoPredecessor(Dest)) {
+        std::replace(Latches.begin(), Latches.end(), Dest, Fold);
+        std::replace(Headers.begin(), Headers.end(), Dest, Fold);
+      }
+    }
+  }
+  
+  // At this point, the code is well formed.  We now do a quick sweep over the
+  // inserted code, doing constant propagation and dead code elimination as we
+  // go.
+  const std::vector<BasicBlock*> &NewLoopBlocks = L->getBlocks();
+  for (std::vector<BasicBlock*>::const_iterator BB = NewLoopBlocks.begin(),
+       BBE = NewLoopBlocks.end(); BB != BBE; ++BB)
+    for (BasicBlock::iterator I = (*BB)->begin(), E = (*BB)->end(); I != E; ) {
+      Instruction *Inst = I++;
+
+      if (isInstructionTriviallyDead(Inst))
+        (*BB)->getInstList().erase(Inst);
+      else if (Constant *C = ConstantFoldInstruction(Inst)) {
+        Inst->replaceAllUsesWith(C);
+        (*BB)->getInstList().erase(Inst);
+      }
+    }
+
+  NumCompletelyUnrolled += CompletelyUnroll;
+  ++NumUnrolled;
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LoopUnswitch.cpp b/lib/Transforms/Scalar/LoopUnswitch.cpp
new file mode 100644
index 0000000..c433e63
--- /dev/null
+++ b/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -0,0 +1,1074 @@
+//===-- LoopUnswitch.cpp - Hoist loop-invariant conditionals in loop ------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass transforms loops that contain branches on loop-invariant conditions
+// to have multiple loops.  For example, it turns the left into the right code:
+//
+//  for (...)                  if (lic)
+//    A                          for (...)
+//    if (lic)                     A; B; C
+//      B                      else
+//    C                          for (...)
+//                                 A; C
+//
+// This can increase the size of the code exponentially (doubling it every time
+// a loop is unswitched) so we only unswitch if the resultant code will be
+// smaller than a threshold.
+//
+// This pass expects LICM to be run before it to hoist invariant conditions out
+// of the loop, to make the unswitching opportunity obvious.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "loop-unswitch"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include <algorithm>
+#include <set>
+using namespace llvm;
+
+STATISTIC(NumBranches, "Number of branches unswitched");
+STATISTIC(NumSwitches, "Number of switches unswitched");
+STATISTIC(NumSelects , "Number of selects unswitched");
+STATISTIC(NumTrivial , "Number of unswitches that are trivial");
+STATISTIC(NumSimplify, "Number of simplifications of unswitched code");
+
+namespace {
+  // Command-line knob: maximum loop size (counted in basic blocks by
+  // getLoopUnswitchCost) that we are willing to unswitch non-trivially.
+  cl::opt<unsigned>
+  Threshold("loop-unswitch-threshold", cl::desc("Max loop size to unswitch"),
+            cl::init(10), cl::Hidden);
+  
+  /// LoopUnswitch - Loop pass that hoists branches/switches/selects on
+  /// loop-invariant conditions out of the loop by creating specialized
+  /// loop versions (see the file header for the overall transformation).
+  class VISIBILITY_HIDDEN LoopUnswitch : public LoopPass {
+    LoopInfo *LI;  // Loop information
+    LPPassManager *LPM;
+
+    // LoopProcessWorklist - Used to check if second loop needs processing
+    // after RewriteLoopBodyWithConditionConstant rewrites first loop.
+    std::vector<Loop*> LoopProcessWorklist;
+    // UnswitchedVals - Switch case values we have already unswitched on, so
+    // the same value is not processed again and again.
+    SmallPtrSet<Value *,8> UnswitchedVals;
+    
+    // OptimizeForSize - When set, non-trivial (code-duplicating) unswitching
+    // is suppressed.
+    bool OptimizeForSize;
+  public:
+    static char ID; // Pass ID, replacement for typeid
+    LoopUnswitch(bool Os = false) : 
+      LoopPass((intptr_t)&ID), OptimizeForSize(Os) {}
+
+    bool runOnLoop(Loop *L, LPPassManager &LPM);
+
+    /// This transformation requires natural loop information & requires that
+    /// loop preheaders be inserted into the CFG...
+    ///
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(LoopSimplifyID);
+      AU.addPreservedID(LoopSimplifyID);
+      AU.addRequired<LoopInfo>();
+      AU.addPreserved<LoopInfo>();
+      AU.addRequiredID(LCSSAID);
+    }
+
+  private:
+    /// RemoveLoopFromWorklist - If the specified loop is on the loop worklist,
+    /// remove it.
+    void RemoveLoopFromWorklist(Loop *L) {
+      std::vector<Loop*>::iterator I = std::find(LoopProcessWorklist.begin(),
+                                                 LoopProcessWorklist.end(), L);
+      if (I != LoopProcessWorklist.end())
+        LoopProcessWorklist.erase(I);
+    }
+      
+    bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,Loop *L);
+    unsigned getLoopUnswitchCost(Loop *L, Value *LIC);
+    void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+                                  BasicBlock *ExitBlock);
+    void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L);
+
+    void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+                                              Constant *Val, bool isEqual);
+
+    void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                        BasicBlock *TrueDest, 
+                                        BasicBlock *FalseDest,
+                                        Instruction *InsertPt);
+
+    void SimplifyCode(std::vector<Instruction*> &Worklist);
+    void RemoveBlockIfDead(BasicBlock *BB,
+                           std::vector<Instruction*> &Worklist);
+    void RemoveLoopFromHierarchy(Loop *L);
+  };
+  char LoopUnswitch::ID = 0;
+  RegisterPass<LoopUnswitch> X("loop-unswitch", "Unswitch loops");
+}
+
+// createLoopUnswitchPass - Public factory for the loop-unswitch pass.  Os
+// requests size-conscious behavior (no code-duplicating unswitches).
+LoopPass *llvm::createLoopUnswitchPass(bool Os) {
+  LoopUnswitch *LU = new LoopUnswitch(Os);
+  return LU;
+}
+
+/// FindLIVLoopCondition - Cond is a condition that occurs in L.  If it is
+/// invariant in the loop, or has an invariant piece, return the invariant.
+/// Otherwise, return null.  Changed is currently unmodified here; it is
+/// threaded through so recursive callers share one flag.
+static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed) {
+  // Constants should be folded, not unswitched on!
+  // (Was "return false" -- a bool is not a valid null Value* in C++11.)
+  if (isa<Constant>(Cond)) return 0;
+
+  // TODO: Handle: br (VARIANT|INVARIANT).
+  // TODO: Hoist simple expressions out of loops.
+  if (L->isLoopInvariant(Cond)) return Cond;
+  
+  if (BinaryOperator *BO = dyn_cast<BinaryOperator>(Cond))
+    if (BO->getOpcode() == Instruction::And ||
+        BO->getOpcode() == Instruction::Or) {
+      // If either the left or right side is invariant, we can unswitch on this,
+      // which will cause the branch to go away in one loop and the condition to
+      // simplify in the other one.
+      if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed))
+        return LHS;
+      if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed))
+        return RHS;
+    }
+
+  // No invariant piece found.
+  return 0;
+}
+
+/// runOnLoop - Scan the loop for an unswitchable value: a conditional branch,
+/// a multi-way switch, or a select whose condition is (partially) loop
+/// invariant.  At most one unswitch is performed per invocation; on success we
+/// return true immediately and rely on the pass manager to revisit the
+/// resulting loops.
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+  assert(L->isLCSSAForm());
+  LI = &getAnalysis<LoopInfo>();
+  LPM = &LPM_Ref;
+  bool Changed = false;
+  
+  // Loop over all of the basic blocks in the loop.  If we find an interior
+  // block that is branching on a loop-invariant condition, we can unswitch this
+  // loop.
+  for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+       I != E; ++I) {
+    TerminatorInst *TI = (*I)->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      // If this isn't branching on an invariant condition, we can't unswitch
+      // it.
+      if (BI->isConditional()) {
+        // See if this, or some part of it, is loop invariant.  If so, we can
+        // unswitch on it if we desire.
+        Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), L, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(),
+                                             L)) {
+          ++NumBranches;
+          return true;
+        }
+      }      
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), L, Changed);
+      if (LoopCond && SI->getNumCases() > 1) {
+        // Find a value to unswitch on:
+        // FIXME: this should choose the most expensive case!
+        Constant *UnswitchVal = SI->getCaseValue(1);
+        // Do not process same value again and again.
+        if (!UnswitchedVals.insert(UnswitchVal))
+          continue;
+
+        if (UnswitchIfProfitable(LoopCond, UnswitchVal, L)) {
+          ++NumSwitches;
+          return true;
+        }
+      }
+    }
+    
+    // Scan the instructions to check for unswitchable values.
+    for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end(); 
+         BBI != E; ++BBI)
+      if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
+        Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), L, Changed);
+        if (LoopCond && UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(),
+                                             L)) {
+          ++NumSelects;
+          return true;
+        }
+      }
+  }
+  
+  // LCSSA form must be preserved by anything we did above.
+  assert(L->isLCSSAForm());
+  
+  return Changed;
+}
+
+/// isTrivialLoopExitBlockHelper - Check to see if all paths from BB either:
+///   1. Exit the loop with no side effects.
+///   2. Branch to the latch block with no side-effects.
+///
+/// If these conditions are true, we return true and set ExitBB to the block we
+/// exit through.
+///
+static bool isTrivialLoopExitBlockHelper(Loop *L, BasicBlock *BB,
+                                         BasicBlock *&ExitBB,
+                                         std::set<BasicBlock*> &Visited) {
+  // A block we have already approved terminates the recursion.
+  if (!Visited.insert(BB).second)
+    return true;
+
+  if (!L->contains(BB)) {
+    // This is a loop exit; it is only acceptable if it is the first (and
+    // therefore unique) exit we have seen.
+    if (ExitBB) return false;
+    ExitBB = BB;
+    return true;
+  }
+
+  // An unvisited block inside the loop: every successor must itself lead
+  // trivially out of the loop.
+  for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
+    if (!isTrivialLoopExitBlockHelper(L, *SI, ExitBB, Visited))
+      return false;
+
+  // Finally, the block itself may not perform any side effects.
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (I->mayWriteToMemory())
+      return false;
+
+  return true;
+}
+
+/// isTrivialLoopExitBlock - Return true if the specified block unconditionally
+/// leads to an exit from the specified loop, and has no side-effects in the 
+/// process.  If so, return the block that is exited to, otherwise return null.
+static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
+  BasicBlock *ExitBB = 0;
+  std::set<BasicBlock*> Visited;
+  // Pre-seed the header so back-edges to it are considered ok.
+  Visited.insert(L->getHeader());
+  if (!isTrivialLoopExitBlockHelper(L, BB, ExitBB, Visited))
+    return 0;
+  return ExitBB;
+}
+
+/// IsTrivialUnswitchCondition - Check to see if this unswitch condition is
+/// trivial: that is, that the condition controls whether or not the loop does
+/// anything at all.  If this is a trivial condition, unswitching produces no
+/// code duplications (equivalently, it produces a simpler loop and a new empty
+/// loop, which gets deleted).
+///
+/// If this is a trivial condition, return true, otherwise return false.  When
+/// returning true, this sets Cond and Val to the condition that controls the
+/// trivial condition: when Cond dynamically equals Val, the loop is known to
+/// exit.  Finally, this sets LoopExit to the BB that the loop exits to when
+/// Cond == Val.  Both out-parameters are optional (may be null).
+///
+static bool IsTrivialUnswitchCondition(Loop *L, Value *Cond, Constant **Val = 0,
+                                       BasicBlock **LoopExit = 0) {
+  BasicBlock *Header = L->getHeader();
+  TerminatorInst *HeaderTerm = Header->getTerminator();
+  
+  // Set to the unique trivially-reached exit block, if one is found below.
+  BasicBlock *LoopExitBB = 0;
+  if (BranchInst *BI = dyn_cast<BranchInst>(HeaderTerm)) {
+    // If the header block doesn't end with a conditional branch on Cond, we
+    // can't handle it.
+    if (!BI->isConditional() || BI->getCondition() != Cond)
+      return false;
+  
+    // Check to see if a successor of the branch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any 
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.
+    if ((LoopExitBB = isTrivialLoopExitBlock(L, BI->getSuccessor(0)))) {
+      if (Val) *Val = ConstantInt::getTrue();
+    } else if ((LoopExitBB = isTrivialLoopExitBlock(L, BI->getSuccessor(1)))) {
+      if (Val) *Val = ConstantInt::getFalse();
+    }
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(HeaderTerm)) {
+    // If this isn't a switch on Cond, we can't handle it.
+    if (SI->getCondition() != Cond) return false;
+    
+    // Check to see if a successor of the switch is guaranteed to go to the
+    // latch block or exit through a one exit block without having any 
+    // side-effects.  If so, determine the value of Cond that causes it to do
+    // this.  Note that we can't trivially unswitch on the default case.
+    for (unsigned i = 1, e = SI->getNumSuccessors(); i != e; ++i)
+      if ((LoopExitBB = isTrivialLoopExitBlock(L, SI->getSuccessor(i)))) {
+        // Okay, we found a trivial case, remember the value that is trivial.
+        if (Val) *Val = SI->getCaseValue(i);
+        break;
+      }
+  }
+
+  // If we didn't find a single unique LoopExit block, or if the loop exit block
+  // contains phi nodes, this isn't trivial.
+  if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
+    return false;   // Can't handle this.
+  
+  if (LoopExit) *LoopExit = LoopExitBB;
+  
+  // We already know that nothing uses any scalar values defined inside of this
+  // loop.  As such, we just have to check to see if this loop will execute any
+  // side-effecting instructions (e.g. stores, calls, volatile loads) in the
+  // part of the loop that the code *would* execute.  We already checked the
+  // tail, check the header now.
+  for (BasicBlock::iterator I = Header->begin(), E = Header->end(); I != E; ++I)
+    if (I->mayWriteToMemory())
+      return false;
+  return true;
+}
+
+/// getLoopUnswitchCost - Return the cost (code size growth) that will happen if
+/// we choose to unswitch the specified loop on the specified value.  A cost of
+/// zero means the unswitch is trivial (no duplication at all).
+///
+unsigned LoopUnswitch::getLoopUnswitchCost(Loop *L, Value *LIC) {
+  // If the condition is trivial, always unswitch.  There is no code growth for
+  // this case.
+  if (IsTrivialUnswitchCondition(L, LIC))
+    return 0;
+  
+  // FIXME: This is really overly conservative.  However, more liberal 
+  // estimations have thus far resulted in excessive unswitching, which is bad
+  // both in compile time and in code size.  This should be replaced once
+  // someone figures out a good estimation.  A smarter metric would account for
+  // code shrinkage and would skip the empty blocks introduced by loop
+  // canonicalization rather than counting every block equally.
+  return L->getBlocks().size();
+}
+
+/// UnswitchIfProfitable - We have found that we can unswitch L when
+/// LoopCond == Val to simplify the loop.  If we decide that this is profitable,
+/// unswitch the loop, reprocess the pieces, then return true.
+bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,Loop *L){
+  // Check to see if it would be profitable to unswitch this loop.
+  unsigned Cost = getLoopUnswitchCost(L, LoopCond);
+
+  // Do not do non-trivial unswitch while optimizing for size.
+  // (A trivial unswitch has Cost == 0 and duplicates no code.)
+  if (Cost && OptimizeForSize)
+    return false;
+
+  if (Cost > Threshold) {
+    // FIXME: this should estimate growth by the amount of code shared by the
+    // resultant unswitched loops.
+    //
+    DOUT << "NOT unswitching loop %"
+         << L->getHeader()->getName() << ", cost too high: "
+         << L->getBlocks().size() << "\n";
+    return false;
+  }
+  
+  // If this is a trivial condition to unswitch (which results in no code
+  // duplication), do it now.
+  Constant *CondVal;
+  BasicBlock *ExitBlock;
+  if (IsTrivialUnswitchCondition(L, LoopCond, &CondVal, &ExitBlock)) {
+    UnswitchTrivialCondition(L, LoopCond, CondVal, ExitBlock);
+  } else {
+    // Non-trivial: duplicate the loop body and test the condition outside.
+    UnswitchNontrivialCondition(LoopCond, Val, L);
+  }
+ 
+  return true;
+}
+
+// RemapInstruction - Convert the instruction operands from referencing the
+// current values into those specified by ValueMap.  Operands without an entry
+// in the map are left untouched.
+//
+static inline void RemapInstruction(Instruction *I,
+                                    DenseMap<const Value *, Value*> &ValueMap) {
+  unsigned NumOps = I->getNumOperands();
+  for (unsigned i = 0; i != NumOps; ++i) {
+    Value *Orig = I->getOperand(i);
+    DenseMap<const Value *, Value*>::iterator Entry = ValueMap.find(Orig);
+    if (Entry == ValueMap.end())
+      continue;   // Not remapped; keep the existing operand.
+    I->setOperand(i, Entry->second);
+  }
+}
+
+// CloneDomInfo - NewBB is cloned from Orig basic block. Now clone Dominator Info.
+// If Orig is in Loop then find and use Orig dominator's cloned block as NewBB 
+// dominator.
+void CloneDomInfo(BasicBlock *NewBB, BasicBlock *Orig, Loop *L, 
+                  DominatorTree *DT, DominanceFrontier *DF,
+                  DenseMap<const Value*, Value*> &VM) {
+
+  // If Orig has no dominator-tree node there is nothing to clone.
+  DomTreeNode *OrigNode = DT->getNode(Orig);
+  if (!OrigNode)
+    return;
+  // NOTE(review): OrigNode->getBlock() yields Orig itself, not its immediate
+  // dominator, despite the variable name -- confirm whether getIDom() was
+  // intended here.
+  BasicBlock *OrigIDom = OrigNode->getBlock();
+  BasicBlock *NewIDom = OrigIDom;
+  if (L->contains(OrigIDom)) {
+    // Make sure the dominator's clone has tree info before we reference it.
+    if (!DT->getNode(OrigIDom))
+      CloneDomInfo(NewIDom, OrigIDom, L, DT, DF, VM);
+    NewIDom = cast<BasicBlock>(VM[OrigIDom]);
+  }
+  if (NewBB == NewIDom) {
+    // Avoid making a block its own immediate dominator on insertion.
+    DT->addNewBlock(NewBB, OrigIDom);
+    DT->changeImmediateDominator(NewBB, NewIDom);
+  } else
+    DT->addNewBlock(NewBB, NewIDom);
+
+  // Clone Orig's dominance-frontier entry for NewBB, mapping in-loop members
+  // through VM.
+  DominanceFrontier::DomSetType NewDFSet;
+  if (DF) {
+    DominanceFrontier::iterator DFI = DF->find(Orig);
+    if ( DFI != DF->end()) {
+      DominanceFrontier::DomSetType S = DFI->second;
+      for (DominanceFrontier::DomSetType::iterator I = S.begin(), E = S.end();
+           I != E; ++I) {
+        BasicBlock *BB = *I;
+        // NOTE(review): this inserts VM[Orig] rather than VM[BB] for every
+        // in-loop frontier member -- looks like a typo; verify before relying
+        // on the cloned frontier.
+        if (L->contains(BB)) 
+          NewDFSet.insert(cast<BasicBlock>(VM[Orig]));
+        else
+          NewDFSet.insert(BB);
+      }
+    }
+    DF->addBasicBlock(NewBB, NewDFSet);
+  }
+}
+
+/// CloneLoop - Recursively clone the specified loop and all of its children,
+/// mapping the blocks with the specified map.
+static Loop *CloneLoop(Loop *L, Loop *PL, DenseMap<const Value*, Value*> &VM,
+                       LoopInfo *LI, LPPassManager *LPM) {
+  Loop *NewLoop = new Loop();
+
+  // Register the clone with the pass manager under its parent loop.
+  LPM->insertLoop(NewLoop, PL);
+
+  // Transfer every block that belongs directly to L (not to a subloop),
+  // translated through the value map.
+  for (Loop::block_iterator BI = L->block_begin(), BE = L->block_end();
+       BI != BE; ++BI)
+    if (LI->getLoopFor(*BI) == L)
+      NewLoop->addBasicBlockToLoop(cast<BasicBlock>(VM[*BI]), *LI);
+
+  // Recursively clone each subloop into the new loop.
+  for (Loop::iterator SLI = L->begin(), SLE = L->end(); SLI != SLE; ++SLI)
+    CloneLoop(*SLI, NewLoop, VM, LI, LPM);
+
+  return NewLoop;
+}
+
+/// EmitPreheaderBranchOnCondition - Emit a conditional branch on two values
+/// if LIC == Val, branch to TrueDest, otherwise branch to FalseDest.  Insert the
+/// code immediately before InsertPt.
+void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+                                                  BasicBlock *TrueDest,
+                                                  BasicBlock *FalseDest,
+                                                  Instruction *InsertPt) {
+  // Insert a conditional branch on LIC to the two preheaders.  The original
+  // code is the true version and the new code is the false version.
+  Value *BranchVal = LIC;
+  // For a non-boolean condition (e.g. a switch value) materialize the
+  // comparison LIC == Val explicitly.
+  if (!isa<ConstantInt>(Val) || Val->getType() != Type::Int1Ty)
+    BranchVal = new ICmpInst(ICmpInst::ICMP_EQ, LIC, Val, "tmp", InsertPt);
+  else if (Val != ConstantInt::getTrue())
+    // We want to enter the new loop when the condition is true.
+    std::swap(TrueDest, FalseDest);
+
+  // Insert the new branch.
+  BranchInst *BRI = new BranchInst(TrueDest, FalseDest, BranchVal, InsertPt);
+
+  // Update dominator info.
+  // BranchVal is a new preheader so it dominates true and false destination
+  // loop headers.
+  if (DominatorTree *DT = getAnalysisToUpdate<DominatorTree>()) {
+    DT->changeImmediateDominator(TrueDest, BRI->getParent());
+    DT->changeImmediateDominator(FalseDest, BRI->getParent());
+  }
+  // No need to update DominanceFrontier. BRI->getParent() dominated TrueDest
+  // and FalseDest anyway. Now it immediately dominates them.
+}
+
+
+/// UnswitchTrivialCondition - Given a loop that has a trivial unswitchable
+/// condition in it (a cond branch from its header block to its latch block,
+/// where the path through the loop that doesn't execute its body has no 
+/// side-effects), unswitch it.  This doesn't involve any code duplication, just
+/// moving the conditional branch outside of the loop and updating loop info.
+void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, 
+                                            Constant *Val, 
+                                            BasicBlock *ExitBlock) {
+  DOUT << "loop-unswitch: Trivial-Unswitch loop %"
+       << L->getHeader()->getName() << " [" << L->getBlocks().size()
+       << " blocks] in Function " << L->getHeader()->getParent()->getName()
+       << " on cond: " << *Val << " == " << *Cond << "\n";
+  
+  // First step, split the preheader, so that we know that there is a safe place
+  // to insert the conditional branch.  We will change 'OrigPH' to have a
+  // conditional branch on Cond.
+  BasicBlock *OrigPH = L->getLoopPreheader();
+  BasicBlock *NewPH = SplitEdge(OrigPH, L->getHeader(), this);
+
+  // Now that we have a place to insert the conditional branch, create a place
+  // to branch to: this is the exit block out of the loop that we should
+  // short-circuit to.
+  
+  // Split this block now, so that the loop maintains its exit block, and so
+  // that the jump from the preheader can execute the contents of the exit block
+  // without actually branching to it (the exit block should be dominated by the
+  // loop header, not the preheader).
+  assert(!L->contains(ExitBlock) && "Exit block is in the loop?");
+  BasicBlock *NewExit = SplitBlock(ExitBlock, ExitBlock->begin(), this);
+    
+  // Okay, now we have a position to branch from and a position to branch to, 
+  // insert the new conditional branch.
+  EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, 
+                                 OrigPH->getTerminator());
+  OrigPH->getTerminator()->eraseFromParent();
+
+  // We need to reprocess this loop, it could be unswitched again.
+  LPM->redoLoop(L);
+  
+  // Now that we know that the loop is never entered when this condition is a
+  // particular value, rewrite the loop with this info.  We know that this will
+  // at least eliminate the old branch.
+  RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+  ++NumTrivial;
+}
+
/// UnswitchNontrivialCondition - We determined that the loop is profitable to
/// unswitch when LIC equals Val.  Clone the entire loop (preheader, body, and
/// exit blocks), emit a branch in the original preheader that selects between
/// the two copies based on the condition, and then rewrite each copy with the
/// condition folded to a constant.
void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val, 
                                               Loop *L) {
  Function *F = L->getHeader()->getParent();
  DOUT << "loop-unswitch: Unswitching loop %"
       << L->getHeader()->getName() << " [" << L->getBlocks().size()
       << " blocks] in Function " << F->getName()
       << " when '" << *Val << "' == " << *LIC << "\n";

  // LoopBlocks contains all of the basic blocks of the loop, including the
  // preheader of the loop, the body of the loop, and the exit blocks of the 
  // loop, in that order.
  std::vector<BasicBlock*> LoopBlocks;

  // First step, split the preheader and exit blocks, and add these blocks to
  // the LoopBlocks list.
  BasicBlock *OrigPreheader = L->getLoopPreheader();
  LoopBlocks.push_back(SplitEdge(OrigPreheader, L->getHeader(), this));

  // We want the loop to come after the preheader, but before the exit blocks.
  LoopBlocks.insert(LoopBlocks.end(), L->block_begin(), L->block_end());

  std::vector<BasicBlock*> ExitBlocks;
  L->getUniqueExitBlocks(ExitBlocks);

  // Split all of the edges from inside the loop to their exit blocks.  Update
  // the appropriate Phi nodes as we do so.
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
    BasicBlock *ExitBlock = ExitBlocks[i];
    std::vector<BasicBlock*> Preds(pred_begin(ExitBlock), pred_end(ExitBlock));

    for (unsigned j = 0, e = Preds.size(); j != e; ++j) {
      BasicBlock* MiddleBlock = SplitEdge(Preds[j], ExitBlock, this);
      BasicBlock* StartBlock = Preds[j];
      BasicBlock* EndBlock;
      // SplitEdge may have split ExitBlock itself instead of inserting a new
      // middle block; normalize so that the split edge is always
      // StartBlock -> MiddleBlock -> EndBlock in either case.
      if (MiddleBlock->getSinglePredecessor() == ExitBlock) {
        EndBlock = MiddleBlock;
        MiddleBlock = EndBlock->getSinglePredecessor();;
      } else {
        EndBlock = ExitBlock;
      }
      
      // Re-establish LCSSA form across the split edge: for each phi in
      // EndBlock, insert a fresh ".us-lcssa" phi in MiddleBlock that carries
      // the value flowing in from StartBlock, and reroute the old phi's
      // incoming value through it.
      std::set<PHINode*> InsertedPHIs;
      PHINode* OldLCSSA = 0;
      for (BasicBlock::iterator I = EndBlock->begin();
           (OldLCSSA = dyn_cast<PHINode>(I)); ++I) {
        Value* OldValue = OldLCSSA->getIncomingValueForBlock(MiddleBlock);
        PHINode* NewLCSSA = new PHINode(OldLCSSA->getType(),
                                        OldLCSSA->getName() + ".us-lcssa",
                                        MiddleBlock->getTerminator());
        NewLCSSA->addIncoming(OldValue, StartBlock);
        OldLCSSA->setIncomingValue(OldLCSSA->getBasicBlockIndex(MiddleBlock),
                                   NewLCSSA);
        InsertedPHIs.insert(NewLCSSA);
      }

      // Phis that were already present in MiddleBlock (i.e. not the ones we
      // just inserted) are mirrored into EndBlock, after its existing phis,
      // so external uses still go through a phi in the exit block.
      BasicBlock::iterator InsertPt = EndBlock->begin();
      while (dyn_cast<PHINode>(InsertPt)) ++InsertPt;
      for (BasicBlock::iterator I = MiddleBlock->begin();
         (OldLCSSA = dyn_cast<PHINode>(I)) && InsertedPHIs.count(OldLCSSA) == 0;
         ++I) {
        PHINode *NewLCSSA = new PHINode(OldLCSSA->getType(),
                                        OldLCSSA->getName() + ".us-lcssa",
                                        InsertPt);
        OldLCSSA->replaceAllUsesWith(NewLCSSA);
        NewLCSSA->addIncoming(OldLCSSA, MiddleBlock);
      }
    }    
  }
  
  // The exit blocks may have been changed due to edge splitting, recompute.
  ExitBlocks.clear();
  L->getUniqueExitBlocks(ExitBlocks);

  // Add exit blocks to the loop blocks.
  LoopBlocks.insert(LoopBlocks.end(), ExitBlocks.begin(), ExitBlocks.end());

  // Next step, clone all of the basic blocks that make up the loop (including
  // the loop preheader and exit blocks), keeping track of the mapping between
  // the instructions and blocks.
  std::vector<BasicBlock*> NewBlocks;
  NewBlocks.reserve(LoopBlocks.size());
  DenseMap<const Value*, Value*> ValueMap;
  for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
    BasicBlock *New = CloneBasicBlock(LoopBlocks[i], ValueMap, ".us", F);
    NewBlocks.push_back(New);
    ValueMap[LoopBlocks[i]] = New;  // Keep the BB mapping.
  }

  // Update dominator info for the cloned blocks, if the analyses are live.
  DominanceFrontier *DF = getAnalysisToUpdate<DominanceFrontier>();
  if (DominatorTree *DT = getAnalysisToUpdate<DominatorTree>())
    for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
      BasicBlock *LBB = LoopBlocks[i];
      BasicBlock *NBB = NewBlocks[i];
      CloneDomInfo(NBB, LBB, L, DT, DF, ValueMap);
    }
  
  // Splice the newly inserted blocks into the function right before the
  // original preheader.
  F->getBasicBlockList().splice(LoopBlocks[0], F->getBasicBlockList(),
                                NewBlocks[0], F->end());

  // Now we create the new Loop object for the versioned loop.
  Loop *NewLoop = CloneLoop(L, L->getParentLoop(), ValueMap, LI, LPM);
  Loop *ParentLoop = L->getParentLoop();
  if (ParentLoop) {
    // Make sure to add the cloned preheader and exit blocks to the parent loop
    // as well.
    ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
  }
  
  for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
    BasicBlock *NewExit = cast<BasicBlock>(ValueMap[ExitBlocks[i]]);
    // The new exit block should be in the same loop as the old one.
    if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
      ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
    
    assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
           "Exit block should have been split to have one successor!");
    BasicBlock *ExitSucc = NewExit->getTerminator()->getSuccessor(0);
    
    // If the successor of the exit block had PHI nodes, add an entry for
    // NewExit, mapping the incoming value through ValueMap when it was cloned.
    PHINode *PN;
    for (BasicBlock::iterator I = ExitSucc->begin();
         (PN = dyn_cast<PHINode>(I)); ++I) {
      Value *V = PN->getIncomingValueForBlock(ExitBlocks[i]);
      DenseMap<const Value *, Value*>::iterator It = ValueMap.find(V);
      if (It != ValueMap.end()) V = It->second;
      PN->addIncoming(V, NewExit);
    }
  }

  // Rewrite the cloned code to refer to itself: remap every operand of every
  // cloned instruction through ValueMap.
  for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i)
    for (BasicBlock::iterator I = NewBlocks[i]->begin(),
           E = NewBlocks[i]->end(); I != E; ++I)
      RemapInstruction(I, ValueMap);
  
  // Rewrite the original preheader to select between versions of the loop.
  BranchInst *OldBR = cast<BranchInst>(OrigPreheader->getTerminator());
  assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
         "Preheader splitting did not work correctly!");

  // Emit the new branch that selects between the two versions of this loop.
  EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR);
  OldBR->eraseFromParent();
  
  // Both loops may be unswitchable again; queue them for reprocessing.
  LoopProcessWorklist.push_back(NewLoop);
  LPM->redoLoop(L);

  // Now we rewrite the original code to know that the condition is true and the
  // new code to know that the condition is false.
  RewriteLoopBodyWithConditionConstant(L      , LIC, Val, false);
  
  // It's possible that simplifying one loop could cause the other to be
  // deleted.  If so, don't simplify it.
  if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop)
    RewriteLoopBodyWithConditionConstant(NewLoop, LIC, Val, true);
}
+
+/// RemoveFromWorklist - Remove all instances of I from the worklist vector
+/// specified.
+static void RemoveFromWorklist(Instruction *I, 
+                               std::vector<Instruction*> &Worklist) {
+  std::vector<Instruction*>::iterator WI = std::find(Worklist.begin(),
+                                                     Worklist.end(), I);
+  while (WI != Worklist.end()) {
+    unsigned Offset = WI-Worklist.begin();
+    Worklist.erase(WI);
+    WI = std::find(Worklist.begin()+Offset, Worklist.end(), I);
+  }
+}
+
+/// ReplaceUsesOfWith - When we find that I really equals V, remove I from the
+/// program, replacing all uses with V and update the worklist.
+static void ReplaceUsesOfWith(Instruction *I, Value *V, 
+                              std::vector<Instruction*> &Worklist) {
+  DOUT << "Replace with '" << *V << "': " << *I;
+
+  // Add uses to the worklist, which may be dead now.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
+      Worklist.push_back(Use);
+
+  // Add users to the worklist which may be simplified now.
+  for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+       UI != E; ++UI)
+    Worklist.push_back(cast<Instruction>(*UI));
+  I->replaceAllUsesWith(V);
+  I->eraseFromParent();
+  RemoveFromWorklist(I, Worklist);
+  ++NumSimplify;
+}
+
/// RemoveBlockIfDead - If the specified block is dead, remove it, update loop
/// information, and remove any dead successors it has.  A block is "dead" here
/// when it has no predecessors; if BB is still reachable, only local
/// simplifications (phi folding, queueing a mergeable branch) are performed.
/// The recursion terminates because each recursive step erases a block first.
///
void LoopUnswitch::RemoveBlockIfDead(BasicBlock *BB,
                                     std::vector<Instruction*> &Worklist) {
  if (pred_begin(BB) != pred_end(BB)) {
    // This block isn't dead, since an edge to BB was just removed, see if there
    // are any easy simplifications we can do now.
    if (BasicBlock *Pred = BB->getSinglePredecessor()) {
      // If it has one pred, fold phi nodes in BB.
      while (isa<PHINode>(BB->begin()))
        ReplaceUsesOfWith(BB->begin(), 
                          cast<PHINode>(BB->begin())->getIncomingValue(0), 
                          Worklist);
      
      // If this is the header of a loop and the only pred is the latch, we now
      // have an unreachable loop.
      if (Loop *L = LI->getLoopFor(BB))
        if (L->getHeader() == BB && L->contains(Pred)) {
          // Remove the branch from the latch to the header block, this makes
          // the header dead, which will make the latch dead (because the header
          // dominates the latch).
          Pred->getTerminator()->eraseFromParent();
          new UnreachableInst(Pred);
          
          // The loop is now broken, remove it from LI.
          RemoveLoopFromHierarchy(L);
          
          // Reprocess the header, which now IS dead.
          RemoveBlockIfDead(BB, Worklist);
          return;
        }
      
      // If pred ends in a uncond branch, add uncond branch to worklist so that
      // the two blocks will get merged.
      if (BranchInst *BI = dyn_cast<BranchInst>(Pred->getTerminator()))
        if (BI->isUnconditional())
          Worklist.push_back(BI);
    }
    return;
  }

  DOUT << "Nuking dead block: " << *BB;
  
  // Remove the instructions in the basic block from the worklist.
  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
    RemoveFromWorklist(I, Worklist);
    
    // Anything that uses the instructions in this basic block should have their
    // uses replaced with undefs.  (The block is unreachable, so any value it
    // defined can legitimately become undef.)
    if (!I->use_empty())
      I->replaceAllUsesWith(UndefValue::get(I->getType()));
  }
  
  // If this is the edge to the header block for a loop, remove the loop and
  // promote all subloops.
  if (Loop *BBLoop = LI->getLoopFor(BB)) {
    if (BBLoop->getLoopLatch() == BB)
      RemoveLoopFromHierarchy(BBLoop);
  }

  // Remove the block from the loop info, which removes it from any loops it
  // was in.
  LI->removeBlock(BB);
  
  
  // Remove phi node entries in successors for this block.
  TerminatorInst *TI = BB->getTerminator();
  std::vector<BasicBlock*> Succs;
  for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
    Succs.push_back(TI->getSuccessor(i));
    TI->getSuccessor(i)->removePredecessor(BB);
  }
  
  // Unique the successors, remove anything with multiple uses.
  std::sort(Succs.begin(), Succs.end());
  Succs.erase(std::unique(Succs.begin(), Succs.end()), Succs.end());
  
  // Remove the basic block, including all of the instructions contained in it.
  BB->eraseFromParent();
  
  // Remove successor blocks here that are not dead, so that we know we only
  // have dead blocks in this list.  Nondead blocks have a way of becoming dead,
  // then getting removed before we revisit them, which is badness.
  //
  for (unsigned i = 0; i != Succs.size(); ++i)
    if (pred_begin(Succs[i]) != pred_end(Succs[i])) {
      // One exception is loop headers.  If this block was the preheader for a
      // loop, then we DO want to visit the loop so the loop gets deleted.
      // We know that if the successor is a loop header, that this loop had to
      // be the preheader: the case where this was the latch block was handled
      // above and headers can only have two predecessors.
      if (!LI->isLoopHeader(Succs[i])) {
        Succs.erase(Succs.begin()+i);
        --i;
      }
    }
  
  // Everything left in Succs is now known-dead (or a doomed loop header);
  // recursively nuke it.
  for (unsigned i = 0, e = Succs.size(); i != e; ++i)
    RemoveBlockIfDead(Succs[i], Worklist);
}
+
/// RemoveLoopFromHierarchy - We have discovered that the specified loop has
/// become unwrapped, either because the backedge was deleted, or because the
/// edge into the header was removed.  If the edge into the header from the
/// latch block was removed, the loop is unwrapped but subloops are still alive,
/// so they just reparent loops.  If the loops are actually dead, they will be
/// removed later.
void LoopUnswitch::RemoveLoopFromHierarchy(Loop *L) {
  // Drop the loop from the loop pass manager's queue so it is never visited
  // again, then purge any references to it from our own pending worklist.
  LPM->deleteLoopFromQueue(L);
  RemoveLoopFromWorklist(L);
}
+
+
+
// RewriteLoopBodyWithConditionConstant - We know either that the value LIC has
// the value specified by Val in the specified loop, or we know it does NOT have
// that value.  Rewrite any uses of LIC or of properties correlated to it.
//
// If IsEqual is true, every use of LIC inside L is replaced by Val directly.
// If IsEqual is false and Val is a boolean constant, uses are replaced by the
// opposite boolean.  Otherwise (the switch-unswitching case) we only know
// LIC != Val, and we use that to deaden the matching switch case.
void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
                                                        Constant *Val,
                                                        bool IsEqual) {
  assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
  
  // FIXME: Support correlated properties, like:
  //  for (...)
  //    if (li1 < li2)
  //      ...
  //    if (li1 > li2)
  //      ...
  
  // FOLD boolean conditions (X|LIC), (X&LIC).  Fold conditional branches,
  // selects, switches.
  // Snapshot the use list first: the loop below mutates LIC's uses.
  std::vector<User*> Users(LIC->use_begin(), LIC->use_end());
  std::vector<Instruction*> Worklist;

  // If we know that LIC == Val, or that LIC == NotVal, just replace uses of LIC
  // in the loop with the appropriate one directly.
  if (IsEqual || (isa<ConstantInt>(Val) && Val->getType() == Type::Int1Ty)) {
    Value *Replacement;
    if (IsEqual)
      Replacement = Val;
    else
      Replacement = ConstantInt::get(Type::Int1Ty, 
                                     !cast<ConstantInt>(Val)->getZExtValue());
    
    // Only rewrite uses inside the loop; uses outside L keep seeing LIC.
    for (unsigned i = 0, e = Users.size(); i != e; ++i)
      if (Instruction *U = cast<Instruction>(Users[i])) {
        if (!L->contains(U->getParent()))
          continue;
        U->replaceUsesOfWith(LIC, Replacement);
        Worklist.push_back(U);
      }
  } else {
    // Otherwise, we don't know the precise value of LIC, but we do know that it
    // is certainly NOT "Val".  As such, simplify any uses in the loop that we
    // can.  This case occurs when we unswitch switch statements.
    for (unsigned i = 0, e = Users.size(); i != e; ++i)
      if (Instruction *U = cast<Instruction>(Users[i])) {
        if (!L->contains(U->getParent()))
          continue;

        Worklist.push_back(U);

        // If we know that LIC is not Val, use this info to simplify code.
        if (SwitchInst *SI = dyn_cast<SwitchInst>(U)) {
          // (Note: this 'i' intentionally shadows the outer users index;
          // case 0 is the default destination, so start at 1.)
          for (unsigned i = 1, e = SI->getNumCases(); i != e; ++i) {
            if (SI->getCaseValue(i) == Val) {
              // Found a dead case value.  Don't remove PHI nodes in the 
              // successor if they become single-entry, those PHI nodes may
              // be in the Users list.
              
              // FIXME: This is a hack.  We need to keep the successor around
              // and hooked up so as to preserve the loop structure, because
              // trying to update it is complicated.  So instead we preserve the
              // loop structure and put the block on an dead code path.
              
              BasicBlock* Old = SI->getParent();
              BasicBlock* Split = SplitBlock(Old, SI, this);
              
              // Insert "br true, Split, deadcase" before Old's terminator,
              // then delete the old terminator: the dead case's destination
              // stays wired into the CFG but is only reachable via the
              // never-taken false edge.
              Instruction* OldTerm = Old->getTerminator();
              new BranchInst(Split, SI->getSuccessor(i),
                             ConstantInt::getTrue(), OldTerm);
              
              Old->getTerminator()->eraseFromParent();
              
              // The dead successor's phis now receive their values from Old
              // (the false edge) instead of Split.
              PHINode *PN;
              for (BasicBlock::iterator II = SI->getSuccessor(i)->begin();
                   (PN = dyn_cast<PHINode>(II)); ++II) {
                Value *InVal = PN->removeIncomingValue(Split, false);
                PN->addIncoming(InVal, Old);
              }

              SI->removeCase(i);
              break;
            }
          }
        }
        
        // TODO: We could do other simplifications, for example, turning 
        // LIC == Val -> false.
      }
  }
  
  SimplifyCode(Worklist);
}
+
/// SimplifyCode - Okay, now that we have simplified some instructions in the 
/// loop, walk over it and constant prop, dce, and fold control flow where
/// possible.  Note that this is effectively a very simple loop-structure-aware
/// optimizer.  During processing of this loop, L could very well be deleted, so
/// it must not be used.
///
/// The worklist is processed LIFO; helpers like ReplaceUsesOfWith push newly
/// interesting instructions back onto it, so this runs to a fixed point.
///
/// FIXME: When the loop optimizer is more mature, separate this out to a new
/// pass.
///
void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist) {
  while (!Worklist.empty()) {
    Instruction *I = Worklist.back();
    Worklist.pop_back();
    
    // Simple constant folding.
    if (Constant *C = ConstantFoldInstruction(I)) {
      ReplaceUsesOfWith(I, C, Worklist);
      continue;
    }
    
    // Simple DCE.
    if (isInstructionTriviallyDead(I)) {
      DOUT << "Remove dead instruction '" << *I;
      
      // Add uses to the worklist, which may be dead now.
      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
        if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
          Worklist.push_back(Use);
      I->eraseFromParent();
      RemoveFromWorklist(I, Worklist);
      ++NumSimplify;
      continue;
    }
    
    // Special case hacks that appear commonly in unswitched code.
    switch (I->getOpcode()) {
    case Instruction::Select:
      // select true/false, A, B -> A or B.
      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(0))) {
        ReplaceUsesOfWith(I, I->getOperand(!CB->getZExtValue()+1), Worklist);
        continue;
      }
      break;
    case Instruction::And:
      if (isa<ConstantInt>(I->getOperand(0)) && 
          I->getOperand(0)->getType() == Type::Int1Ty)   // constant -> RHS
        cast<BinaryOperator>(I)->swapOperands();
      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1))) 
        if (CB->getType() == Type::Int1Ty) {
          if (CB->isOne())      // X & 1 -> X
            ReplaceUsesOfWith(I, I->getOperand(0), Worklist);
          else                  // X & 0 -> 0
            ReplaceUsesOfWith(I, I->getOperand(1), Worklist);
          continue;
        }
      break;
    case Instruction::Or:
      if (isa<ConstantInt>(I->getOperand(0)) &&
          I->getOperand(0)->getType() == Type::Int1Ty)   // constant -> RHS
        cast<BinaryOperator>(I)->swapOperands();
      if (ConstantInt *CB = dyn_cast<ConstantInt>(I->getOperand(1)))
        if (CB->getType() == Type::Int1Ty) {
          if (CB->isOne())   // X | 1 -> 1
            ReplaceUsesOfWith(I, I->getOperand(1), Worklist);
          else                  // X | 0 -> X
            ReplaceUsesOfWith(I, I->getOperand(0), Worklist);
          continue;
        }
      break;
    case Instruction::Br: {
      BranchInst *BI = cast<BranchInst>(I);
      if (BI->isUnconditional()) {
        // If BI's parent is the only pred of the successor, fold the two blocks
        // together.
        BasicBlock *Pred = BI->getParent();
        BasicBlock *Succ = BI->getSuccessor(0);
        BasicBlock *SinglePred = Succ->getSinglePredecessor();
        if (!SinglePred) continue;  // Nothing to do.
        assert(SinglePred == Pred && "CFG broken");

        DOUT << "Merging blocks: " << Pred->getName() << " <- " 
             << Succ->getName() << "\n";
        
        // Resolve any single entry PHI nodes in Succ.
        while (PHINode *PN = dyn_cast<PHINode>(Succ->begin()))
          ReplaceUsesOfWith(PN, PN->getIncomingValue(0), Worklist);
        
        // Move all of the successor contents from Succ to Pred.
        Pred->getInstList().splice(BI, Succ->getInstList(), Succ->begin(),
                                   Succ->end());
        BI->eraseFromParent();
        RemoveFromWorklist(BI, Worklist);
        
        // If Succ has any successors with PHI nodes, update them to have
        // entries coming from Pred instead of Succ.
        Succ->replaceAllUsesWith(Pred);
        
        // Remove Succ from the loop tree.
        LI->removeBlock(Succ);
        Succ->eraseFromParent();
        ++NumSimplify;
      } else if (ConstantInt *CB = dyn_cast<ConstantInt>(BI->getCondition())){
        // Conditional branch.  Turn it into an unconditional branch, then
        // remove dead blocks.
        break;  // FIXME: Enable.
        // NOTE: everything below this break is currently unreachable; it is
        // the intended implementation, disabled until the FIXME is resolved.

        DOUT << "Folded branch: " << *BI;
        BasicBlock *DeadSucc = BI->getSuccessor(CB->getZExtValue());
        BasicBlock *LiveSucc = BI->getSuccessor(!CB->getZExtValue());
        DeadSucc->removePredecessor(BI->getParent(), true);
        Worklist.push_back(new BranchInst(LiveSucc, BI));
        BI->eraseFromParent();
        RemoveFromWorklist(BI, Worklist);
        ++NumSimplify;

        RemoveBlockIfDead(DeadSucc, Worklist);
      }
      break;
    }
    }
  }
}
diff --git a/lib/Transforms/Scalar/LowerGC.cpp b/lib/Transforms/Scalar/LowerGC.cpp
new file mode 100644
index 0000000..27cccd5
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerGC.cpp
@@ -0,0 +1,330 @@
+//===-- LowerGC.cpp - Provide GC support for targets that don't -----------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering for the llvm.gc* intrinsics for targets that do
+// not natively support them (which includes the C backend).  Note that the code
+// generated is not as efficient as it would be for targets that natively
+// support the GC intrinsics, but it is useful for getting new targets
+// up-and-running quickly.
+//
+// This pass implements the code transformation described in this paper:
+//   "Accurate Garbage Collection in an Uncooperative Environment"
+//   Fergus Henderson, ISMM, 2002
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "lowergc"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
namespace {
  /// LowerGC - Function pass that lowers the llvm.gcroot/llvm.gcread/
  /// llvm.gcwrite intrinsics to plain LLVM code: read/write barriers become
  /// calls into the llvm_gc_read/llvm_gc_write runtime functions, and each
  /// function's roots are registered on the global llvm_gc_root_chain linked
  /// list of root records.
  class VISIBILITY_HIDDEN LowerGC : public FunctionPass {
    /// GCRootInt, GCReadInt, GCWriteInt - The function prototypes for the
    /// llvm.gcread/llvm.gcwrite/llvm.gcroot intrinsics.
    Function *GCRootInt, *GCReadInt, *GCWriteInt;

    /// GCRead/GCWrite - These are the functions provided by the garbage
    /// collector for read/write barriers.
    Constant *GCRead, *GCWrite;

    /// RootChain - This is the global linked-list that contains the chain of GC
    /// roots.
    GlobalVariable *RootChain;

    /// MainRootRecordType - This is the type for a function root entry if it
    /// had zero roots.
    const Type *MainRootRecordType;
  public:
    static char ID; // Pass identification, replacement for typeid
    LowerGC() : FunctionPass((intptr_t)&ID), 
                GCRootInt(0), GCReadInt(0), GCWriteInt(0),
                GCRead(0), GCWrite(0), RootChain(0), MainRootRecordType(0) {}
    /// doInitialization - Find the GC intrinsics used by the module (if any)
    /// and set up the barrier functions and root-chain global.
    virtual bool doInitialization(Module &M);
    /// runOnFunction - Lower any GC intrinsic calls appearing in F.
    virtual bool runOnFunction(Function &F);

  private:
    /// getRootRecordType - Return the struct type of a root record holding
    /// NumRoots root/metadata pairs.
    const StructType *getRootRecordType(unsigned NumRoots);
  };

  char LowerGC::ID = 0;
  RegisterPass<LowerGC>
  X("lowergc", "Lower GC intrinsics, for GCless code generators");
}
+
+/// createLowerGCPass - This function returns an instance of the "lowergc"
+/// pass, which lowers garbage collection intrinsics to normal LLVM code.
+FunctionPass *llvm::createLowerGCPass() {
+  return new LowerGC();
+}
+
/// getRootRecordType - This function creates and returns the type for a root
/// record containing 'NumRoots' roots.  A record has the shape:
///
///   { RootRecord *Prev, i32 NumElements, [NumRoots x { loc, metadata }] }
///
/// where the pair element types come from the llvm.gcroot prototype.  The
/// first record type ever built (NumRoots == 0) is made self-referential via
/// opaque-type refinement and cached in MainRootRecordType; every later
/// record uses it as the pointee type of its Prev link.
const StructType *LowerGC::getRootRecordType(unsigned NumRoots) {
  // Build a struct that is a type used for meta-data/root pairs.
  std::vector<const Type *> ST;
  ST.push_back(GCRootInt->getFunctionType()->getParamType(0));
  ST.push_back(GCRootInt->getFunctionType()->getParamType(1));
  StructType *PairTy = StructType::get(ST);

  // Build the array of pairs.
  ArrayType *PairArrTy = ArrayType::get(PairTy, NumRoots);

  // Now build the recursive list type.  Before the main record type exists,
  // use an opaque placeholder as the Prev pointer's pointee.
  PATypeHolder RootListH =
    MainRootRecordType ? (Type*)MainRootRecordType : (Type*)OpaqueType::get();
  ST.clear();
  ST.push_back(PointerType::get(RootListH));         // Prev pointer
  ST.push_back(Type::Int32Ty);                       // NumElements in array
  ST.push_back(PairArrTy);                           // The pairs
  StructType *RootList = StructType::get(ST);
  if (MainRootRecordType)
    return RootList;

  assert(NumRoots == 0 && "The main struct type should have zero entries!");
  // Tie the knot: refine the opaque placeholder to the struct itself so the
  // type becomes recursive, then remember it for all future calls.
  cast<OpaqueType>((Type*)RootListH.get())->refineAbstractTypeTo(RootList);
  MainRootRecordType = RootListH;
  return cast<StructType>(RootListH.get());
}
+
/// doInitialization - If this module uses the GC intrinsics, find them now.  If
/// not, this pass does not do anything.  Returns false (no change) when none
/// of the three intrinsics are declared in the module; otherwise sets up the
/// barrier functions and root-chain global as needed and returns true.
bool LowerGC::doInitialization(Module &M) {
  GCRootInt  = M.getFunction("llvm.gcroot");
  GCReadInt  = M.getFunction("llvm.gcread");
  GCWriteInt = M.getFunction("llvm.gcwrite");
  if (!GCRootInt && !GCReadInt && !GCWriteInt) return false;

  PointerType *VoidPtr = PointerType::get(Type::Int8Ty);
  PointerType *VoidPtrPtr = PointerType::get(VoidPtr);

  // If the program is using read/write barriers, find the implementations of
  // them from the GC runtime library.
  if (GCReadInt)        // Make:  sbyte* %llvm_gc_read(sbyte**)
    GCRead = M.getOrInsertFunction("llvm_gc_read", VoidPtr, VoidPtr, VoidPtrPtr,
                                   (Type *)0);
  if (GCWriteInt)       // Make:  void %llvm_gc_write(sbyte*, sbyte**)
    GCWrite = M.getOrInsertFunction("llvm_gc_write", Type::VoidTy,
                                    VoidPtr, VoidPtr, VoidPtrPtr, (Type *)0);

  // If the program has GC roots, get or create the global root list.
  if (GCRootInt) {
    const StructType *RootListTy = getRootRecordType(0);
    const Type *PRLTy = PointerType::get(RootListTy);
    M.addTypeName("llvm_gc_root_ty", RootListTy);

    // Get the root chain if it already exists.
    RootChain = M.getGlobalVariable("llvm_gc_root_chain", PRLTy);
    if (RootChain == 0) {
      // If the root chain does not exist, insert a new one with linkonce
      // linkage!
      RootChain = new GlobalVariable(PRLTy, false,
                                     GlobalValue::LinkOnceLinkage,
                                     Constant::getNullValue(PRLTy),
                                     "llvm_gc_root_chain", &M);
    } else if (RootChain->hasExternalLinkage() && RootChain->isDeclaration()) {
      // An external declaration already exists: give it a null initializer
      // and linkonce linkage so one definition survives linking.
      RootChain->setInitializer(Constant::getNullValue(PRLTy));
      RootChain->setLinkage(GlobalValue::LinkOnceLinkage);
    }
  }
  return true;
}
+
+/// Coerce - If the specified operand number of the specified instruction does
+/// not have the specified type, insert a cast. Note that this only uses BitCast
+/// because the types involved are all pointers.
+static void Coerce(Instruction *I, unsigned OpNum, Type *Ty) {
+  if (I->getOperand(OpNum)->getType() != Ty) {
+    if (Constant *C = dyn_cast<Constant>(I->getOperand(OpNum)))
+      I->setOperand(OpNum, ConstantExpr::getBitCast(C, Ty));
+    else {
+      CastInst *CI = new BitCastInst(I->getOperand(OpNum), Ty, "", I);
+      I->setOperand(OpNum, CI);
+    }
+  }
+}
+
/// runOnFunction - If the program is using GC intrinsics, replace any
/// read/write intrinsics with the appropriate read/write barrier calls, then
/// inline them.  Finally, build the data structures that register this
/// function's GC roots on the global llvm_gc_root_chain and remove the
/// entry again on exit.
+bool LowerGC::runOnFunction(Function &F) {
+  // Quick exit for programs that are not using GC mechanisms.
+  if (!GCRootInt && !GCReadInt && !GCWriteInt) return false;
+
+  PointerType *VoidPtr    = PointerType::get(Type::Int8Ty);
+  PointerType *VoidPtrPtr = PointerType::get(VoidPtr);
+
+  // If there are read/write barriers in the program, perform a quick pass over
+  // the function eliminating them.  While we are at it, remember where we see
+  // calls to llvm.gcroot.
+  std::vector<CallInst*> GCRoots;
+  std::vector<CallInst*> NormalCalls;
+
+  bool MadeChange = false;
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    for (BasicBlock::iterator II = BB->begin(), E = BB->end(); II != E;)
+      if (CallInst *CI = dyn_cast<CallInst>(II++)) {
+        if (!CI->getCalledFunction() ||
+            !CI->getCalledFunction()->getIntrinsicID())
+          NormalCalls.push_back(CI);   // Remember all normal function calls.
+
+        if (Function *F = CI->getCalledFunction())
+          if (F == GCRootInt)
+            GCRoots.push_back(CI);
+          else if (F == GCReadInt || F == GCWriteInt) {
+            if (F == GCWriteInt) {
+              // Change a llvm.gcwrite call to call llvm_gc_write instead.
+              CI->setOperand(0, GCWrite);
+              // Insert casts of the operands as needed.
+              Coerce(CI, 1, VoidPtr);
+              Coerce(CI, 2, VoidPtr);
+              Coerce(CI, 3, VoidPtrPtr);
+            } else {
+              Coerce(CI, 1, VoidPtr);
+              Coerce(CI, 2, VoidPtrPtr);
+              if (CI->getType() == VoidPtr) {
+                CI->setOperand(0, GCRead);
+              } else {
+                // Create a whole new call to replace the old one.
+                CallInst *NC = new CallInst(GCRead, CI->getOperand(1),
+                                            CI->getOperand(2),
+                                            CI->getName(), CI);
+                // These functions only deal with ptr type results so BitCast
+                // is the correct kind of cast (no-op cast).
+                Value *NV = new BitCastInst(NC, CI->getType(), "", CI);
+                CI->replaceAllUsesWith(NV);
+                BB->getInstList().erase(CI);
+                CI = NC;
+              }
+            }
+
+            MadeChange = true;
+          }
+      }
+
+  // If there are no GC roots in this function, then there is no need to create
+  // a GC list record for it.
+  if (GCRoots.empty()) return MadeChange;
+
+  // Okay, there are GC roots in this function.  On entry to the function, add a
+  // record to the llvm_gc_root_chain, and remove it on exit.
+
+  // Create the alloca, and zero it out.
+  const StructType *RootListTy = getRootRecordType(GCRoots.size());
+  AllocaInst *AI = new AllocaInst(RootListTy, 0, "gcroots", F.begin()->begin());
+
+  // Insert the memset call after all of the allocas in the function.
+  BasicBlock::iterator IP = AI;
+  while (isa<AllocaInst>(IP)) ++IP;
+
+  Constant *Zero = ConstantInt::get(Type::Int32Ty, 0);
+  Constant *One  = ConstantInt::get(Type::Int32Ty, 1);
+
+  // Get a pointer to the prev pointer.
+  Value *PrevPtrPtr = new GetElementPtrInst(AI, Zero, Zero, "prevptrptr", IP);
+
+  // Load the previous pointer.
+  Value *PrevPtr = new LoadInst(RootChain, "prevptr", IP);
+  // Store the previous pointer into the prevptrptr
+  new StoreInst(PrevPtr, PrevPtrPtr, IP);
+
+  // Set the number of elements in this record.
+  Value *NumEltsPtr = new GetElementPtrInst(AI, Zero, One, "numeltsptr", IP);
+  new StoreInst(ConstantInt::get(Type::Int32Ty, GCRoots.size()), NumEltsPtr,IP);
+
+  Value* Par[4];
+  Par[0] = Zero;
+  Par[1] = ConstantInt::get(Type::Int32Ty, 2);
+
+  const PointerType *PtrLocTy =
+    cast<PointerType>(GCRootInt->getFunctionType()->getParamType(0));
+  Constant *Null = ConstantPointerNull::get(PtrLocTy);
+
+  // Initialize all of the gcroot records now, and eliminate them as we go.
+  for (unsigned i = 0, e = GCRoots.size(); i != e; ++i) {
+    // Initialize the meta-data pointer.
+    Par[2] = ConstantInt::get(Type::Int32Ty, i);
+    Par[3] = One;
+    Value *MetaDataPtr = new GetElementPtrInst(AI, Par, 4, "MetaDataPtr", IP);
+    assert(isa<Constant>(GCRoots[i]->getOperand(2)) && "Must be a constant");
+    new StoreInst(GCRoots[i]->getOperand(2), MetaDataPtr, IP);
+
+    // Initialize the root pointer to null on entry to the function.
+    Par[3] = Zero;
+    Value *RootPtrPtr = new GetElementPtrInst(AI, Par, 4, "RootEntPtr", IP);
+    new StoreInst(Null, RootPtrPtr, IP);
+
+            // Each occurrence of the llvm.gcroot intrinsic now turns into an
+    // initialization of the slot with the address and a zeroing out of the
+    // address specified.
+    new StoreInst(Constant::getNullValue(PtrLocTy->getElementType()),
+                  GCRoots[i]->getOperand(1), GCRoots[i]);
+    new StoreInst(GCRoots[i]->getOperand(1), RootPtrPtr, GCRoots[i]);
+    GCRoots[i]->getParent()->getInstList().erase(GCRoots[i]);
+  }
+
+  // Now that the record is all initialized, store the pointer into the global
+  // pointer.
+  Value *C = new BitCastInst(AI, PointerType::get(MainRootRecordType), "", IP);
+  new StoreInst(C, RootChain, IP);
+
+  // On exit from the function we have to remove the entry from the GC root
+  // chain.  Doing this is straight-forward for return and unwind instructions:
+  // just insert the appropriate copy.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (isa<UnwindInst>(BB->getTerminator()) ||
+        isa<ReturnInst>(BB->getTerminator())) {
+      // We could reuse the PrevPtr loaded on entry to the function, but this
+      // would make the value live for the whole function, which is probably a
+      // bad idea.  Just reload the value out of our stack entry.
+      PrevPtr = new LoadInst(PrevPtrPtr, "prevptr", BB->getTerminator());
+      new StoreInst(PrevPtr, RootChain, BB->getTerminator());
+    }
+
+  // If an exception is thrown from a callee we have to make sure to
+  // unconditionally take the record off the stack.  For this reason, we turn
+  // all call instructions into invoke whose cleanup pops the entry off the
+  // stack.  We only insert one cleanup block, which is shared by all invokes.
+  if (!NormalCalls.empty()) {
+    // Create the shared cleanup block.
+    BasicBlock *Cleanup = new BasicBlock("gc_cleanup", &F);
+    UnwindInst *UI = new UnwindInst(Cleanup);
+    PrevPtr = new LoadInst(PrevPtrPtr, "prevptr", UI);
+    new StoreInst(PrevPtr, RootChain, UI);
+
+    // Loop over all of the function calls, turning them into invokes.
+    while (!NormalCalls.empty()) {
+      CallInst *CI = NormalCalls.back();
+      BasicBlock *CBB = CI->getParent();
+      NormalCalls.pop_back();
+
+      // Split the basic block containing the function call.
+      BasicBlock *NewBB = CBB->splitBasicBlock(CI, CBB->getName()+".cont");
+
+      // Remove the unconditional branch inserted at the end of the CBB.
+      CBB->getInstList().pop_back();
+      NewBB->getInstList().remove(CI);
+
+      // Create a new invoke instruction.
+      std::vector<Value*> Args(CI->op_begin()+1, CI->op_end());
+
+      Value *II = new InvokeInst(CI->getCalledValue(), NewBB, Cleanup,
+                                 &Args[0], Args.size(), CI->getName(), CBB);
+      CI->replaceAllUsesWith(II);
+      delete CI;
+    }
+  }
+
+  return true;
+}
diff --git a/lib/Transforms/Scalar/LowerPacked.cpp b/lib/Transforms/Scalar/LowerPacked.cpp
new file mode 100644
index 0000000..0530172
--- /dev/null
+++ b/lib/Transforms/Scalar/LowerPacked.cpp
@@ -0,0 +1,462 @@
+//===- LowerPacked.cpp -  Implementation of LowerPacked Transform ---------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Brad Jones and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements lowering Packed datatypes into more primitive
+// Packed datatypes, and finally to scalar operations.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Argument.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Support/Streams.h"
+#include "llvm/ADT/StringExtras.h"
+#include <algorithm>
+#include <map>
+#include <functional>
+using namespace llvm;
+
+namespace {
+
+/// This pass converts packed operators to an
+/// equivalent operations on smaller packed data, to possibly
+/// scalar operations.  Currently it supports lowering
+/// to scalar operations.
+///
+/// @brief Transforms packed instructions to simpler instructions.
+///
/// This pass converts packed operators to an
/// equivalent operations on smaller packed data, to possibly
/// scalar operations.  Currently it supports lowering
/// to scalar operations.
///
/// It visits each instruction, records scalar replacements for vector
/// values in packedToScalarMap, and queues the original vector
/// instructions in instrsToRemove for deletion after the walk.
///
/// @brief Transforms packed instructions to simpler instructions.
///
class VISIBILITY_HIDDEN LowerPacked 
  : public FunctionPass, public InstVisitor<LowerPacked> {
public:
    static char ID; // Pass identification, replacement for typeid
    LowerPacked() : FunctionPass((intptr_t)&ID) {}

   /// @brief Lowers packed operations to scalar operations.
   /// @param F The function to process
   virtual bool runOnFunction(Function &F);

   /// @brief Lowers packed load instructions.
   /// @param LI the load instruction to convert
   void visitLoadInst(LoadInst& LI);

   /// @brief Lowers packed store instructions.
   /// @param SI the store instruction to convert
   void visitStoreInst(StoreInst& SI);

   /// @brief Lowers packed binary operations.
   /// @param BO the binary operator to convert
   void visitBinaryOperator(BinaryOperator& BO);

   /// @brief Lowers packed icmp operations.
   /// @param IC the icmp operator to convert
   void visitICmpInst(ICmpInst& IC);

   /// @brief Lowers packed select instructions.
   /// @param SELI the select operator to convert
   void visitSelectInst(SelectInst& SELI);

   /// @brief Lowers packed extractelement instructions.
   /// @param EE the extractelement operator to convert
   void visitExtractElementInst(ExtractElementInst& EE);

   /// @brief Lowers packed insertelement instructions.
   /// @param IE the insertelement operator to convert
   void visitInsertElementInst(InsertElementInst& IE);

   /// This function asserts if the instruction is a VectorType but
   /// is handled by another function.
   ///
   /// @brief Asserts if VectorType instruction is not handled elsewhere.
   /// @param I the unhandled instruction
   void visitInstruction(Instruction &I) {
     if (isa<VectorType>(I.getType()))
       cerr << "Unhandled Instruction with Packed ReturnType: " << I << '\n';
   }
private:
   /// @brief Retrieves lowered values for a packed value.
   /// @param val the packed value
   /// @return the lowered values
   std::vector<Value*>& getValues(Value* val);

   /// @brief Sets lowered values for a packed value.
   /// @param val the packed value
   /// @param values the corresponding lowered values
   void setValues(Value* val,const std::vector<Value*>& values);

   // Data Members
   /// @brief whether we changed the function or not
   bool Changed;

   /// @brief a map from old packed values to new smaller packed values
   std::map<Value*,std::vector<Value*> > packedToScalarMap;

   /// Instructions in the source program to get rid of
   /// after we do a pass (the old packed instructions)
   std::vector<Instruction*> instrsToRemove;
};
+
+char LowerPacked::ID = 0;
+RegisterPass<LowerPacked>
+X("lower-packed",
+  "lowers packed operations to operations on smaller packed datatypes");
+
+} // end namespace
+
// createLowerPackedPass - Public factory for this pass; the class itself is
// hidden in the anonymous namespace above.
FunctionPass *llvm::createLowerPackedPass() { return new LowerPacked(); }
+
+
+// This function sets lowered values for a corresponding
+// packed value.  Note, in the case of a forward reference
+// getValues(Value*) will have already been called for
+// the packed parameter.  This function will then replace
+// all references in the function to the "dummy"
+// value the previous getValues(Value*) call
+// returned with actual references.
+void LowerPacked::setValues(Value* value,const std::vector<Value*>& values)
+{
+   std::map<Value*,std::vector<Value*> >::iterator it =
+         packedToScalarMap.lower_bound(value);
+   if (it == packedToScalarMap.end() || it->first != value) {
+       // there was not a forward reference to this element
+       packedToScalarMap.insert(it,std::make_pair(value,values));
+   }
+   else {
+      // replace forward declarations with actual definitions
+      assert(it->second.size() == values.size() &&
+             "Error forward refences and actual definition differ in size");
+      for (unsigned i = 0, e = values.size(); i != e; ++i) {
+           // replace and get rid of old forward references
+           it->second[i]->replaceAllUsesWith(values[i]);
+           delete it->second[i];
+           it->second[i] = values[i];
+      }
+   }
+}
+
+// This function will examine the packed value parameter
+// and if it is a packed constant or a forward reference
+// properly create the lowered values needed.  Otherwise
+// it will simply retrieve values from a
+// setValues(Value*,const std::vector<Value*>&)
+// call.  Failing both of these cases, it will abort
+// the program.
std::vector<Value*>& LowerPacked::getValues(Value* value)
{
   assert(isa<VectorType>(value->getType()) &&
          "Value must be VectorType");

   // reject further processing if this one has
   // already been handled
   std::map<Value*,std::vector<Value*> >::iterator it =
      packedToScalarMap.lower_bound(value);
   if (it != packedToScalarMap.end() && it->first == value) {
       return it->second;
   }

   if (ConstantVector* CP = dyn_cast<ConstantVector>(value)) {
       // non-zero constant case: the scalars are simply the vector's
       // constant operands
       std::vector<Value*> results;
       results.reserve(CP->getNumOperands());
       for (unsigned i = 0, e = CP->getNumOperands(); i != e; ++i) {
          results.push_back(CP->getOperand(i));
       }
       // 'it' is a valid insertion hint from the lower_bound above
       return packedToScalarMap.insert(it,
                                       std::make_pair(value,results))->second;
   }
   else if (ConstantAggregateZero* CAZ =
            dyn_cast<ConstantAggregateZero>(value)) {
       // zero constant: one null scalar per element
       const VectorType* PKT = cast<VectorType>(CAZ->getType());
       std::vector<Value*> results;
       results.reserve(PKT->getNumElements());

       Constant* C = Constant::getNullValue(PKT->getElementType());
       for (unsigned i = 0, e = PKT->getNumElements(); i != e; ++i) {
            results.push_back(C);
       }
       return packedToScalarMap.insert(it,
                                       std::make_pair(value,results))->second;
   }
   else if (isa<Instruction>(value)) {
       // forward reference: the defining instruction has not been visited
       // yet, so hand out placeholder Arguments of the element type.  A
       // later setValues() call for this value replaces the placeholders
       // with the real scalar definitions and deletes them.
       const VectorType* PKT = cast<VectorType>(value->getType());
       std::vector<Value*> results;
       results.reserve(PKT->getNumElements());

      for (unsigned i = 0, e = PKT->getNumElements(); i != e; ++i) {
           results.push_back(new Argument(PKT->getElementType()));
      }
      return packedToScalarMap.insert(it,
                                      std::make_pair(value,results))->second;
   }
   else {
       // we don't know what it is, and we are trying to retrieve
       // a value for it
       assert(false && "Unhandled VectorType value");
       abort();
   }
}
+
// Lower a load of a vector into one scalar load per element.  The vector
// pointer is bitcast to a pointer to an equivalent array so that element
// addresses can be formed with getelementptr.
void LowerPacked::visitLoadInst(LoadInst& LI)
{
   // Make sure what we are dealing with is a vector type
   if (const VectorType* PKT = dyn_cast<VectorType>(LI.getType())) {
       // Initialization, Idx is needed for getelementptr needed later
       std::vector<Value*> Idx(2);
       Idx[0] = ConstantInt::get(Type::Int32Ty,0);

       ArrayType* AT = ArrayType::get(PKT->getContainedType(0),
                                      PKT->getNumElements());
       PointerType* APT = PointerType::get(AT);

       // Cast the pointer to vector type to an equivalent array
       Value* array = new BitCastInst(LI.getPointerOperand(), APT, 
                                      LI.getName() + ".a", &LI);

       // Convert this load into num elements number of loads
       std::vector<Value*> values;
       values.reserve(PKT->getNumElements());

       for (unsigned i = 0, e = PKT->getNumElements(); i != e; ++i) {
            // Calculate the second index we will need
            Idx[1] = ConstantInt::get(Type::Int32Ty,i);

            // Get the pointer
            Value* val = new GetElementPtrInst(array,
                                               &Idx[0], Idx.size(),
                                               LI.getName() +
                                               ".ge." + utostr(i),
                                               &LI);

            // generate the new load (preserving volatility) and save the
            // result in the packedToScalar map
            values.push_back(new LoadInst(val, LI.getName()+"."+utostr(i),
                             LI.isVolatile(), &LI));
       }

       setValues(&LI,values);
       Changed = true;
       instrsToRemove.push_back(&LI);
   }
}
+
+void LowerPacked::visitBinaryOperator(BinaryOperator& BO)
+{
+   // Make sure both operands are VectorTypes
+   if (isa<VectorType>(BO.getOperand(0)->getType())) {
+       std::vector<Value*>& op0Vals = getValues(BO.getOperand(0));
+       std::vector<Value*>& op1Vals = getValues(BO.getOperand(1));
+       std::vector<Value*> result;
+       assert((op0Vals.size() == op1Vals.size()) &&
+              "The two packed operand to scalar maps must be equal in size.");
+
+       result.reserve(op0Vals.size());
+
+       // generate the new binary op and save the result
+       for (unsigned i = 0; i != op0Vals.size(); ++i) {
+            result.push_back(BinaryOperator::create(BO.getOpcode(),
+                                                    op0Vals[i],
+                                                    op1Vals[i],
+                                                    BO.getName() +
+                                                    "." + utostr(i),
+                                                    &BO));
+       }
+
+       setValues(&BO,result);
+       Changed = true;
+       instrsToRemove.push_back(&BO);
+   }
+}
+
// Lower a vector icmp into one scalar icmp (same predicate) per element.
void LowerPacked::visitICmpInst(ICmpInst& IC)
{
   // Make sure both operands are VectorTypes
   if (isa<VectorType>(IC.getOperand(0)->getType())) {
       std::vector<Value*>& op0Vals = getValues(IC.getOperand(0));
       std::vector<Value*>& op1Vals = getValues(IC.getOperand(1));
       std::vector<Value*> result;
       assert((op0Vals.size() == op1Vals.size()) &&
              "The two packed operand to scalar maps must be equal in size.");

       result.reserve(op0Vals.size());

       // generate the new scalar icmp per lane and save the result
       for (unsigned i = 0; i != op0Vals.size(); ++i) {
            result.push_back(CmpInst::create(IC.getOpcode(),
                                             IC.getPredicate(),
                                             op0Vals[i],
                                             op1Vals[i],
                                             IC.getName() +
                                             "." + utostr(i),
                                             &IC));
       }

       setValues(&IC,result);
       Changed = true;
       instrsToRemove.push_back(&IC);
   }
}
+
// Lower a store of a vector into one scalar store per element, mirroring
// visitLoadInst: the destination pointer is bitcast to an equivalent array
// pointer and each element is stored through a getelementptr.
void LowerPacked::visitStoreInst(StoreInst& SI)
{
   if (const VectorType* PKT =
       dyn_cast<VectorType>(SI.getOperand(0)->getType())) {
       // We will need this for getelementptr
       std::vector<Value*> Idx(2);
       Idx[0] = ConstantInt::get(Type::Int32Ty,0);

       ArrayType* AT = ArrayType::get(PKT->getContainedType(0),
                                      PKT->getNumElements());
       PointerType* APT = PointerType::get(AT);

       // Cast the pointer to packed to an array of equivalent type
       Value* array = new BitCastInst(SI.getPointerOperand(), APT, 
                                      "store.ge.a.", &SI);

       std::vector<Value*>& values = getValues(SI.getOperand(0));

       assert((values.size() == PKT->getNumElements()) &&
              "Scalar must have the same number of elements as Vector Type");

       for (unsigned i = 0, e = PKT->getNumElements(); i != e; ++i) {
            // Generate the indices for getelementptr
            Idx[1] = ConstantInt::get(Type::Int32Ty,i);
            Value* val = new GetElementPtrInst(array,
                                               &Idx[0], Idx.size(),
                                               "store.ge." +
                                               utostr(i) + ".",
                                               &SI);
            // store each scalar, preserving the original's volatility
            new StoreInst(values[i], val, SI.isVolatile(),&SI);
       }

       Changed = true;
       instrsToRemove.push_back(&SI);
   }
}
+
+void LowerPacked::visitSelectInst(SelectInst& SELI)
+{
+   // Make sure both operands are VectorTypes
+   if (isa<VectorType>(SELI.getType())) {
+       std::vector<Value*>& op0Vals = getValues(SELI.getTrueValue());
+       std::vector<Value*>& op1Vals = getValues(SELI.getFalseValue());
+       std::vector<Value*> result;
+
+      assert((op0Vals.size() == op1Vals.size()) &&
+             "The two packed operand to scalar maps must be equal in size.");
+
+      for (unsigned i = 0; i != op0Vals.size(); ++i) {
+           result.push_back(new SelectInst(SELI.getCondition(),
+                                           op0Vals[i],
+                                           op1Vals[i],
+                                           SELI.getName()+ "." + utostr(i),
+                                           &SELI));
+      }
+
+      setValues(&SELI,result);
+      Changed = true;
+      instrsToRemove.push_back(&SELI);
+   }
+}
+
// Lower an extractelement.  With a constant index the scalar is already
// known and uses are rewritten directly.  With a variable index the elements
// are spilled to a stack array (alloca in the entry block) and the result is
// loaded back through a getelementptr indexed by the runtime value.
void LowerPacked::visitExtractElementInst(ExtractElementInst& EI)
{
  std::vector<Value*>& op0Vals = getValues(EI.getOperand(0));
  const VectorType *PTy = cast<VectorType>(EI.getOperand(0)->getType());
  Value *op1 = EI.getOperand(1);

  if (ConstantInt *C = dyn_cast<ConstantInt>(op1)) {
    // Constant lane: the lowered scalar for that lane is the result.
    EI.replaceAllUsesWith(op0Vals[C->getZExtValue()]);
  } else {
    // Variable lane: spill all elements and index the spill slot at runtime.
    AllocaInst *alloca = 
      new AllocaInst(PTy->getElementType(),
                     ConstantInt::get(Type::Int32Ty, PTy->getNumElements()),
                     EI.getName() + ".alloca", 
                     EI.getParent()->getParent()->getEntryBlock().begin());
    for (unsigned i = 0; i < PTy->getNumElements(); ++i) {
      GetElementPtrInst *GEP = 
        new GetElementPtrInst(alloca, ConstantInt::get(Type::Int32Ty, i),
                              "store.ge", &EI);
      new StoreInst(op0Vals[i], GEP, &EI);
    }
    GetElementPtrInst *GEP = 
      new GetElementPtrInst(alloca, op1, EI.getName() + ".ge", &EI);
    LoadInst *load = new LoadInst(GEP, EI.getName() + ".load", &EI);
    EI.replaceAllUsesWith(load);
  }

  Changed = true;
  instrsToRemove.push_back(&EI);
}
+
+void LowerPacked::visitInsertElementInst(InsertElementInst& IE)
+{
+  std::vector<Value*>& Vals = getValues(IE.getOperand(0));
+  Value *Elt = IE.getOperand(1);
+  Value *Idx = IE.getOperand(2);
+  std::vector<Value*> result;
+  result.reserve(Vals.size());
+
+  if (ConstantInt *C = dyn_cast<ConstantInt>(Idx)) {
+    unsigned idxVal = C->getZExtValue();
+    for (unsigned i = 0; i != Vals.size(); ++i) {
+      result.push_back(i == idxVal ? Elt : Vals[i]);
+    }
+  } else {
+    for (unsigned i = 0; i != Vals.size(); ++i) {
+      ICmpInst *icmp =
+        new ICmpInst(ICmpInst::ICMP_EQ, Idx, 
+                     ConstantInt::get(Type::Int32Ty, i),
+                     "icmp", &IE);
+      SelectInst *select =
+        new SelectInst(icmp, Elt, Vals[i], "select", &IE);
+      result.push_back(select);
+    }
+  }
+
+  setValues(&IE, result);
+  Changed = true;
+  instrsToRemove.push_back(&IE);
+}
+
+bool LowerPacked::runOnFunction(Function& F)
+{
+   // initialize
+   Changed = false;
+
+   // Does three passes:
+   // Pass 1) Converts Packed Operations to
+   //         new Packed Operations on smaller
+   //         datatypes
+   visit(F);
+
+   // Pass 2) Drop all references
+   std::for_each(instrsToRemove.begin(),
+                 instrsToRemove.end(),
+                 std::mem_fun(&Instruction::dropAllReferences));
+
+   // Pass 3) Delete the Instructions to remove aka packed instructions
+   for (std::vector<Instruction*>::iterator i = instrsToRemove.begin(),
+                                            e = instrsToRemove.end();
+        i != e; ++i) {
+        (*i)->getParent()->getInstList().erase(*i);
+   }
+
+   // clean-up
+   packedToScalarMap.clear();
+   instrsToRemove.clear();
+
+   return Changed;
+}
+
diff --git a/lib/Transforms/Scalar/Makefile b/lib/Transforms/Scalar/Makefile
new file mode 100644
index 0000000..79643c4
--- /dev/null
+++ b/lib/Transforms/Scalar/Makefile
@@ -0,0 +1,15 @@
+##===- lib/Transforms/Scalar/Makefile ----------------------*- Makefile -*-===##
+# 
+#                     The LLVM Compiler Infrastructure
+#
+# This file was developed by the LLVM research group and is distributed under
+# the University of Illinois Open Source License. See LICENSE.TXT for details.
+# 
+##===----------------------------------------------------------------------===##
+
+LEVEL = ../../..
+LIBRARYNAME = LLVMScalarOpts
+BUILD_ARCHIVE = 1
+
+include $(LEVEL)/Makefile.common
+
diff --git a/lib/Transforms/Scalar/PredicateSimplifier.cpp b/lib/Transforms/Scalar/PredicateSimplifier.cpp
new file mode 100644
index 0000000..7b41fb2
--- /dev/null
+++ b/lib/Transforms/Scalar/PredicateSimplifier.cpp
@@ -0,0 +1,2640 @@
+//===-- PredicateSimplifier.cpp - Path Sensitive Simplifier ---------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by Nick Lewycky and is distributed under the
+// University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Path-sensitive optimizer. In a branch where x == y, replace uses of
+// x with y. Permits further optimization, such as the elimination of
+// the unreachable call:
+//
+// void test(int *p, int *q)
+// {
+//   if (p != q)
+//     return;
+// 
+//   if (*p != *q)
+//     foo(); // unreachable
+// }
+//
+//===----------------------------------------------------------------------===//
+//
+// The InequalityGraph focusses on four properties; equals, not equals,
+// less-than and less-than-or-equals-to. The greater-than forms are also held
+// just to allow walking from a lesser node to a greater one. These properties
+// are stored in a lattice; LE can become LT or EQ, NE can become LT or GT.
+//
+// These relationships define a graph between values of the same type. Each
+// Value is stored in a map table that retrieves the associated Node. This
+// is how EQ relationships are stored; the map contains pointers from equal
+// Value to the same node. The node contains a most canonical Value* form
+// and the list of known relationships with other nodes.
+//
+// If two nodes are known to be inequal, then they will contain pointers to
+// each other with an "NE" relationship. If node getNode(%x) is less than
+// getNode(%y), then the %x node will contain <%y, GT> and %y will contain
+// <%x, LT>. This allows us to tie nodes together into a graph like this:
+//
+//   %a < %b < %c < %d
+//
+// with four nodes representing the properties. The InequalityGraph provides
+// querying with "isRelatedBy" and mutators "addEquality" and "addInequality".
+// To find a relationship, we start with one of the nodes and binary search
+// through its list to find where the relationships with the second node start.
+// Then we iterate through those to find the first relationship that dominates
+// our context node.
+//
+// To create these properties, we wait until a branch or switch instruction
+// implies that a particular value is true (or false). The VRPSolver is
+// responsible for analyzing the variable and seeing what new inferences
+// can be made from each property. For example:
+//
+//   %P = icmp ne i32* %ptr, null
+//   %a = and i1 %P, %Q
+//   br i1 %a label %cond_true, label %cond_false
+//
+// For the true branch, the VRPSolver will start with %a EQ true and look at
+// the definition of %a and find that it can infer that %P and %Q are both
+// true. From %P being true, it can infer that %ptr NE null. For the false
+// branch it can't infer anything from the "and" instruction.
+//
+// Besides branches, we can also infer properties from instruction that may
+// have undefined behaviour in certain cases. For example, the dividend of
+// a division may never be zero. After the division instruction, we may assume
+// that the dividend is not equal to zero.
+//
+//===----------------------------------------------------------------------===//
+//
+// The ValueRanges class stores the known integer bounds of a Value. When we
+// encounter i8 %a u< %b, the ValueRanges stores that %a = [1, 255] and
+// %b = [0, 254]. Because we store these by Value*, you should always
+// canonicalize through the InequalityGraph first.
+//
+// It never stores an empty range, because that means that the code is
+// unreachable. It never stores a single-element range since that's an equality
+// relationship and better stored in the InequalityGraph, nor an empty range
+// since that is better stored in UnreachableBlocks.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "predsimplify"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/ConstantRange.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include <algorithm>
+#include <deque>
+#include <sstream>
+#include <stack>
+using namespace llvm;
+
+STATISTIC(NumVarsReplaced, "Number of argument substitutions");
+STATISTIC(NumInstruction , "Number of instructions removed");
+STATISTIC(NumSimple      , "Number of simple replacements");
+STATISTIC(NumBlocks      , "Number of blocks marked unreachable");
+STATISTIC(NumSnuggle     , "Number of comparisons snuggled");
+
+namespace {
+  class DomTreeDFS {
+  public:
+    class Node {
+      friend class DomTreeDFS;
+    public:
+      typedef std::vector<Node *>::iterator       iterator;
+      typedef std::vector<Node *>::const_iterator const_iterator;
+
+      unsigned getDFSNumIn()  const { return DFSin;  }
+      unsigned getDFSNumOut() const { return DFSout; }
+
+      BasicBlock *getBlock() const { return BB; }
+
+      iterator begin() { return Children.begin(); }
+      iterator end()   { return Children.end();   }
+
+      const_iterator begin() const { return Children.begin(); }
+      const_iterator end()   const { return Children.end();   }
+
+      bool dominates(const Node *N) const {
+        return DFSin <= N->DFSin && DFSout >= N->DFSout;
+      }
+
+      bool DominatedBy(const Node *N) const {
+        return N->dominates(this);
+      }
+
+      /// Sorts by the number of descendants. With this, you can iterate
+      /// through a sorted list and the first matching entry is the most
+      /// specific match for your basic block. The order provided is stable;
+      /// DomTreeDFS::Nodes with the same number of descendants are sorted by
+      /// DFS in number.
+      bool operator<(const Node &N) const {
+        unsigned   spread =   DFSout -   DFSin;
+        unsigned N_spread = N.DFSout - N.DFSin;
+        if (spread == N_spread) return DFSin < N.DFSin;
+        return spread < N_spread;
+      }
+      bool operator>(const Node &N) const { return N < *this; }
+
+    private:
+      unsigned DFSin, DFSout;
+      BasicBlock *BB;
+
+      std::vector<Node *> Children;
+    };
+
+    // XXX: this may be slow. Instead of using "new" for each node, consider
+    // putting them in a vector to keep them contiguous.
+    explicit DomTreeDFS(DominatorTree *DT) {
+      std::stack<std::pair<Node *, DomTreeNode *> > S;
+
+      Entry = new Node;
+      Entry->BB = DT->getRootNode()->getBlock();
+      S.push(std::make_pair(Entry, DT->getRootNode()));
+
+      NodeMap[Entry->BB] = Entry;
+
+      while (!S.empty()) {
+        std::pair<Node *, DomTreeNode *> &Pair = S.top();
+        Node *N = Pair.first;
+        DomTreeNode *DTNode = Pair.second;
+        S.pop();
+
+        for (DomTreeNode::iterator I = DTNode->begin(), E = DTNode->end();
+             I != E; ++I) {
+          Node *NewNode = new Node;
+          NewNode->BB = (*I)->getBlock();
+          N->Children.push_back(NewNode);
+          S.push(std::make_pair(NewNode, *I));
+
+          NodeMap[NewNode->BB] = NewNode;
+        }
+      }
+
+      renumber();
+
+#ifndef NDEBUG
+      DEBUG(dump());
+#endif
+    }
+
+#ifndef NDEBUG
+    virtual
+#endif
+    ~DomTreeDFS() {
+      std::stack<Node *> S;
+
+      S.push(Entry);
+      while (!S.empty()) {
+        Node *N = S.top(); S.pop();
+
+        for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
+          S.push(*I);
+
+        delete N;
+      }
+    }
+
+    /// getRootNode - This returns the entry node for the CFG of the function.
+    Node *getRootNode() const { return Entry; }
+
+    /// getNodeForBlock - return the node for the specified basic block.
+    Node *getNodeForBlock(BasicBlock *BB) const {
+      if (!NodeMap.count(BB)) return 0;
+      return const_cast<DomTreeDFS*>(this)->NodeMap[BB];
+    }
+
+    /// dominates - returns true if the basic block for I1 dominates that of
+    /// the basic block for I2. If the instructions belong to the same basic
+    /// block, the instruction first instruction sequentially in the block is
+    /// considered dominating.
+    bool dominates(Instruction *I1, Instruction *I2) {
+      BasicBlock *BB1 = I1->getParent(),
+                 *BB2 = I2->getParent();
+      if (BB1 == BB2) {
+        if (isa<TerminatorInst>(I1)) return false;
+        if (isa<TerminatorInst>(I2)) return true;
+        if ( isa<PHINode>(I1) && !isa<PHINode>(I2)) return true;
+        if (!isa<PHINode>(I1) &&  isa<PHINode>(I2)) return false;
+
+        for (BasicBlock::const_iterator I = BB2->begin(), E = BB2->end();
+             I != E; ++I) {
+          if (&*I == I1) return true;
+          else if (&*I == I2) return false;
+        }
+        assert(!"Instructions not found in parent BasicBlock?");
+      } else {
+        Node *Node1 = getNodeForBlock(BB1),
+             *Node2 = getNodeForBlock(BB2);
+        return Node1 && Node2 && Node1->dominates(Node2);
+      }
+    }
+
+  private:
+    /// renumber - calculates the depth first search numberings and applies
+    /// them onto the nodes.
+    void renumber() {
+      std::stack<std::pair<Node *, Node::iterator> > S;
+      unsigned n = 0;
+
+      Entry->DFSin = ++n;
+      S.push(std::make_pair(Entry, Entry->begin()));
+
+      while (!S.empty()) {
+        std::pair<Node *, Node::iterator> &Pair = S.top();
+        Node *N = Pair.first;
+        Node::iterator &I = Pair.second;
+
+        if (I == N->end()) {
+          N->DFSout = ++n;
+          S.pop();
+        } else {
+          Node *Next = *I++;
+          Next->DFSin = ++n;
+          S.push(std::make_pair(Next, Next->begin()));
+        }
+      }
+    }
+
#ifndef NDEBUG
    virtual void dump() const {
      dump(*cerr.stream());
    }

    /// dump - print the whole DFS tree, with DFS in/out numbers, to a stream.
    void dump(std::ostream &os) const {
      os << "Predicate simplifier DomTreeDFS: \n";
      dump(Entry, 0, os);
      os << "\n\n";
    }

    // Recursively prints node N at the given depth, then its children one
    // level deeper.
    void dump(Node *N, int depth, std::ostream &os) const {
      ++depth;
      for (int i = 0; i < depth; ++i) { os << " "; }
      os << "[" << depth << "] ";

      os << N->getBlock()->getName() << " (" << N->getDFSNumIn()
         << ", " << N->getDFSNumOut() << ")\n";

      for (Node::iterator I = N->begin(), E = N->end(); I != E; ++I)
        dump(*I, depth, os);
    }
#endif
+
    Node *Entry; // root of the DFS tree; owned here, freed in ~DomTreeDFS
    std::map<BasicBlock *, Node *> NodeMap; // BasicBlock -> its tree node
  };
+
+  // SLT SGT ULT UGT EQ
+  //   0   1   0   1  0 -- GT                  10
+  //   0   1   0   1  1 -- GE                  11
+  //   0   1   1   0  0 -- SGTULT              12
+  //   0   1   1   0  1 -- SGEULE              13
+  //   0   1   1   1  0 -- SGT                 14
+  //   0   1   1   1  1 -- SGE                 15
+  //   1   0   0   1  0 -- SLTUGT              18
+  //   1   0   0   1  1 -- SLEUGE              19
+  //   1   0   1   0  0 -- LT                  20
+  //   1   0   1   0  1 -- LE                  21
+  //   1   0   1   1  0 -- SLT                 22
+  //   1   0   1   1  1 -- SLE                 23
+  //   1   1   0   1  0 -- UGT                 26
+  //   1   1   0   1  1 -- UGE                 27
+  //   1   1   1   0  0 -- ULT                 28
+  //   1   1   1   0  1 -- ULE                 29
+  //   1   1   1   1  0 -- NE                  30
+  enum LatticeBits {
+    EQ_BIT = 1, UGT_BIT = 2, ULT_BIT = 4, SGT_BIT = 8, SLT_BIT = 16
+  };
+  enum LatticeVal {
+    GT = SGT_BIT | UGT_BIT,
+    GE = GT | EQ_BIT,
+    LT = SLT_BIT | ULT_BIT,
+    LE = LT | EQ_BIT,
+    NE = SLT_BIT | SGT_BIT | ULT_BIT | UGT_BIT,
+    SGTULT = SGT_BIT | ULT_BIT,
+    SGEULE = SGTULT | EQ_BIT,
+    SLTUGT = SLT_BIT | UGT_BIT,
+    SLEUGE = SLTUGT | EQ_BIT,
+    ULT = SLT_BIT | SGT_BIT | ULT_BIT,
+    UGT = SLT_BIT | SGT_BIT | UGT_BIT,
+    SLT = SLT_BIT | ULT_BIT | UGT_BIT,
+    SGT = SGT_BIT | ULT_BIT | UGT_BIT,
+    SLE = SLT | EQ_BIT,
+    SGE = SGT | EQ_BIT,
+    ULE = ULT | EQ_BIT,
+    UGE = UGT | EQ_BIT
+  };
+
+  static bool validPredicate(LatticeVal LV) {
+    switch (LV) {
+      case GT: case GE: case LT: case LE: case NE:
+      case SGTULT: case SGT: case SGEULE:
+      case SLTUGT: case SLT: case SLEUGE:
+      case ULT: case UGT:
+      case SLE: case SGE: case ULE: case UGE:
+        return true;
+      default:
+        return false;
+    }
+  }
+
+  /// reversePredicate - reverse the direction of the inequality
+  static LatticeVal reversePredicate(LatticeVal LV) {
+    unsigned reverse = LV ^ (SLT_BIT|SGT_BIT|ULT_BIT|UGT_BIT); //preserve EQ_BIT
+
+    if ((reverse & (SLT_BIT|SGT_BIT)) == 0)
+      reverse |= (SLT_BIT|SGT_BIT);
+
+    if ((reverse & (ULT_BIT|UGT_BIT)) == 0)
+      reverse |= (ULT_BIT|UGT_BIT);
+
+    LatticeVal Rev = static_cast<LatticeVal>(reverse);
+    assert(validPredicate(Rev) && "Failed reversing predicate.");
+    return Rev;
+  }
+
  /// ValueNumbering stores the scope-specific value numbers for a given Value.
  /// A value number is a 1-based index into Values (0 means "none"); each
  /// VNPair in the sorted VNMap says that Value V carries number "index"
  /// everywhere inside the dominator subtree "Subtree".
  class VISIBILITY_HIDDEN ValueNumbering {
    /// VNPair - one entry of the value-number map.
    class VISIBILITY_HIDDEN VNPair {
    public:
      Value *V;
      unsigned index;
      DomTreeDFS::Node *Subtree;

      VNPair(Value *V, unsigned index, DomTreeDFS::Node *Subtree)
        : V(V), index(index), Subtree(Subtree) {}

      // Equality deliberately ignores index: two entries are "the same"
      // when they describe the same Value in the same subtree.
      bool operator==(const VNPair &RHS) const {
        return V == RHS.V && Subtree == RHS.Subtree;
      }

      // Sort by Value pointer first, then by subtree ordering, so that all
      // entries for one Value form a contiguous, subtree-ordered run.
      bool operator<(const VNPair &RHS) const {
        if (V != RHS.V) return V < RHS.V;
        return *Subtree < *RHS.Subtree;
      }

      bool operator<(Value *RHS) const {
        return V < RHS;
      }
    };

    typedef std::vector<VNPair> VNMapType;
    VNMapType VNMap; // kept sorted; searched with std::lower_bound

    // Values[i-1] is the canonical Value for value number i.
    std::vector<Value *> Values;

    DomTreeDFS *DTDFS;

  public:
#ifndef NDEBUG
    virtual ~ValueNumbering() {}
    virtual void dump() {
      dump(*cerr.stream());
    }

    /// dump - for each value number, print its canonical Value followed by
    /// every (Value, subtree) entry currently assigned that number.
    void dump(std::ostream &os) {
      for (unsigned i = 1; i <= Values.size(); ++i) {
        os << i << " = ";
        WriteAsOperand(os, Values[i-1]);
        os << " {";
        for (unsigned j = 0; j < VNMap.size(); ++j) {
          if (VNMap[j].index == i) {
            WriteAsOperand(os, VNMap[j].V);
            os << " (" << VNMap[j].Subtree->getDFSNumIn() << ")  ";
          }
        }
        os << "}\n";
      }
    }
#endif

    /// compare - returns true if V1 is a better canonical value than V2:
    /// constants beat arguments beat instructions, and of two instructions
    /// the dominating one wins. For other value kinds, fall back to
    /// preferring the one with fewer uses.
    bool compare(Value *V1, Value *V2) const {
      if (isa<Constant>(V1))
        return !isa<Constant>(V2);
      else if (isa<Constant>(V2))
        return false;
      else if (isa<Argument>(V1))
        return !isa<Argument>(V2);
      else if (isa<Argument>(V2))
        return false;

      Instruction *I1 = dyn_cast<Instruction>(V1);
      Instruction *I2 = dyn_cast<Instruction>(V2);

      // Fallback when at least one side is not an instruction.
      if (!I1 || !I2)
        return V1->getNumUses() < V2->getNumUses();

      return DTDFS->dominates(I1, I2);
    }

    ValueNumbering(DomTreeDFS *DTDFS) : DTDFS(DTDFS) {}

    /// valueNumber - finds the value number for V under the Subtree. If
    /// there is no value number, returns zero.
    unsigned valueNumber(Value *V, DomTreeDFS::Node *Subtree) {
      // Only constants, arguments and non-void instructions are numbered.
      if (!(isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V))
          || V->getType() == Type::VoidTy) return 0;

      VNMapType::iterator E = VNMap.end();
      VNPair pair(V, 0, Subtree);
      VNMapType::iterator I = std::lower_bound(VNMap.begin(), E, pair);
      // Scan the run of entries for V for one whose scope covers Subtree.
      while (I != E && I->V == V) {
        if (I->Subtree->dominates(Subtree))
          return I->index;
        ++I;
      }
      return 0;
    }

    /// getOrInsertVN - always returns a value number, creating it if necessary.
    unsigned getOrInsertVN(Value *V, DomTreeDFS::Node *Subtree) {
      if (unsigned n = valueNumber(V, Subtree))
        return n;
      else
        return newVN(V);
    }

    /// newVN - creates a new value number. Value V must not already have a
    /// value number assigned.
    unsigned newVN(Value *V) {
      assert((isa<Constant>(V) || isa<Argument>(V) || isa<Instruction>(V)) &&
             "Bad Value for value numbering.");
      assert(V->getType() != Type::VoidTy && "Won't value number a void value");

      Values.push_back(V);

      // The new entry is scoped to the whole function (root subtree).
      VNPair pair = VNPair(V, Values.size(), DTDFS->getRootNode());
      VNMapType::iterator I = std::lower_bound(VNMap.begin(), VNMap.end(), pair);
      assert((I == VNMap.end() || value(I->index) != V) &&
             "Attempt to create a duplicate value number.");
      VNMap.insert(I, pair);

      return Values.size();
    }

    /// value - returns the Value associated with a value number.
    Value *value(unsigned index) const {
      assert(index != 0 && "Zero index is reserved for not found.");
      assert(index <= Values.size() && "Index out of range.");
      return Values[index-1];
    }

    /// canonicalize - return a Value that is equal to V under Subtree.
    Value *canonicalize(Value *V, DomTreeDFS::Node *Subtree) {
      // Constants are already canonical.
      if (isa<Constant>(V)) return V;

      if (unsigned n = valueNumber(V, Subtree))
        return value(n);
      else
        return V;
    }

    /// addEquality - adds that value V belongs to the set of equivalent
    /// values defined by value number n under Subtree.
    void addEquality(unsigned n, Value *V, DomTreeDFS::Node *Subtree) {
      assert(canonicalize(value(n), Subtree) == value(n) &&
             "Node's 'canonical' choice isn't best within this subtree.");

      // Suppose that we are given "%x -> node #1 (%y)". The problem is that
      // we may already have "%z -> node #2 (%x)" somewhere above us in the
      // graph. We need to find those edges and add "%z -> node #1 (%y)"
      // to keep the lookups canonical.

      std::vector<Value *> ToRepoint(1, V);

      // Collect every value currently mapped to the conflicting number in a
      // scope that covers Subtree; they must all be repointed to n.
      if (unsigned Conflict = valueNumber(V, Subtree)) {
        for (VNMapType::iterator I = VNMap.begin(), E = VNMap.end();
             I != E; ++I) {
          if (I->index == Conflict && I->Subtree->dominates(Subtree))
            ToRepoint.push_back(I->V);
        }
      }

      for (std::vector<Value *>::iterator VI = ToRepoint.begin(),
           VE = ToRepoint.end(); VI != VE; ++VI) {
        Value *V = *VI;

        VNPair pair(V, n, Subtree);
        VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
        VNMapType::iterator I = std::lower_bound(B, E, pair);
        if (I != E && I->V == V && I->Subtree == Subtree)
          I->index = n; // Update best choice
        else
          VNMap.insert(I, pair); // New Value

        // XXX: we currently don't have to worry about updating values with
        // more specific Subtrees, but we will need to for PHI node support.

#ifndef NDEBUG
        Value *V_n = value(n);
        if (isa<Constant>(V) && isa<Constant>(V_n)) {
          assert(V == V_n && "Constant equals different constant?");
        }
#endif
      }
    }

    /// remove - removes all references to value V.
    void remove(Value *V) {
      VNMapType::iterator B = VNMap.begin(), E = VNMap.end();
      VNPair pair(V, 0, DTDFS->getRootNode());
      VNMapType::iterator J = std::upper_bound(B, E, pair);
      VNMapType::iterator I = J;

      // NOTE(review): when this backward walk stops because I->V != V
      // (rather than because I reached B), I is left on the entry *before*
      // the run of V entries, and erase(I, J) would also remove that
      // unrelated entry. Whether that situation can arise depends on
      // DomTreeDFS::Node::operator< (not visible here) and on where
      // upper_bound lands relative to the run -- verify against the Node
      // ordering before changing this.
      while (I != B && (I == E || I->V == V)) --I;

      VNMap.erase(I, J);
    }
  };
+
+  /// The InequalityGraph stores the relationships between values.
+  /// Each Value in the graph is assigned to a Node. Nodes are pointer
+  /// comparable for equality. The caller is expected to maintain the logical
+  /// consistency of the system.
+  ///
+  /// The InequalityGraph class may invalidate Node*s after any mutator call.
+  /// @brief The InequalityGraph stores the relationships between values.
+  class VISIBILITY_HIDDEN InequalityGraph {
+    ValueNumbering &VN;
+    DomTreeDFS::Node *TreeRoot;
+
+    InequalityGraph();                  // DO NOT IMPLEMENT
+    InequalityGraph(InequalityGraph &); // DO NOT IMPLEMENT
+  public:
+    InequalityGraph(ValueNumbering &VN, DomTreeDFS::Node *TreeRoot)
+      : VN(VN), TreeRoot(TreeRoot) {}
+
+    class Node;
+
+    /// An Edge is contained inside a Node making one end of the edge implicit
+    /// and contains a pointer to the other end. The edge contains a lattice
+    /// value specifying the relationship and an DomTreeDFS::Node specifying
+    /// the root in the dominator tree to which this edge applies.
+    class VISIBILITY_HIDDEN Edge {
+    public:
+      Edge(unsigned T, LatticeVal V, DomTreeDFS::Node *ST)
+        : To(T), LV(V), Subtree(ST) {}
+
+      unsigned To;
+      LatticeVal LV;
+      DomTreeDFS::Node *Subtree;
+
+      bool operator<(const Edge &edge) const {
+        if (To != edge.To) return To < edge.To;
+        return *Subtree < *edge.Subtree;
+      }
+
+      bool operator<(unsigned to) const {
+        return To < to;
+      }
+
+      bool operator>(unsigned to) const {
+        return To > to;
+      }
+
+      friend bool operator<(unsigned to, const Edge &edge) {
+        return edge.operator>(to);
+      }
+    };
+
+    /// A single node in the InequalityGraph. This stores the canonical Value
+    /// for the node, as well as the relationships with the neighbours.
+    ///
+    /// @brief A single node in the InequalityGraph.
+    class VISIBILITY_HIDDEN Node {
+      friend class InequalityGraph;
+
+      typedef SmallVector<Edge, 4> RelationsType;
+      RelationsType Relations;
+
+      // TODO: can this idea improve performance?
+      //friend class std::vector<Node>;
+      //Node(Node &N) { RelationsType.swap(N.RelationsType); }
+
+    public:
+      typedef RelationsType::iterator       iterator;
+      typedef RelationsType::const_iterator const_iterator;
+
+#ifndef NDEBUG
+      virtual ~Node() {}
+      virtual void dump() const {
+        dump(*cerr.stream());
+      }
+    private:
+      void dump(std::ostream &os) const {
+        static const std::string names[32] =
+          { "000000", "000001", "000002", "000003", "000004", "000005",
+            "000006", "000007", "000008", "000009", "     >", "    >=",
+            "  s>u<", "s>=u<=", "    s>", "   s>=", "000016", "000017",
+            "  s<u>", "s<=u>=", "     <", "    <=", "    s<", "   s<=",
+            "000024", "000025", "    u>", "   u>=", "    u<", "   u<=",
+            "    !=", "000031" };
+        for (Node::const_iterator NI = begin(), NE = end(); NI != NE; ++NI) {
+          os << names[NI->LV] << " " << NI->To
+             << " (" << NI->Subtree->getDFSNumIn() << "), ";
+        }
+      }
+    public:
+#endif
+
+      iterator begin()             { return Relations.begin(); }
+      iterator end()               { return Relations.end();   }
+      const_iterator begin() const { return Relations.begin(); }
+      const_iterator end()   const { return Relations.end();   }
+
+      iterator find(unsigned n, DomTreeDFS::Node *Subtree) {
+        iterator E = end();
+        for (iterator I = std::lower_bound(begin(), E, n);
+             I != E && I->To == n; ++I) {
+          if (Subtree->DominatedBy(I->Subtree))
+            return I;
+        }
+        return E;
+      }
+
+      const_iterator find(unsigned n, DomTreeDFS::Node *Subtree) const {
+        const_iterator E = end();
+        for (const_iterator I = std::lower_bound(begin(), E, n);
+             I != E && I->To == n; ++I) {
+          if (Subtree->DominatedBy(I->Subtree))
+            return I;
+        }
+        return E;
+      }
+
+      /// Updates the lattice value for a given node. Create a new entry if
+      /// one doesn't exist, otherwise it merges the values. The new lattice
+      /// value must not be inconsistent with any previously existing value.
+      void update(unsigned n, LatticeVal R, DomTreeDFS::Node *Subtree) {
+        assert(validPredicate(R) && "Invalid predicate.");
+        iterator I = find(n, Subtree);
+        if (I == end()) {
+          Edge edge(n, R, Subtree);
+          iterator Insert = std::lower_bound(begin(), end(), edge);
+          Relations.insert(Insert, edge);
+        } else {
+          LatticeVal LV = static_cast<LatticeVal>(I->LV & R);
+          assert(validPredicate(LV) && "Invalid union of lattice values.");
+          if (LV != I->LV) {
+            if (Subtree != I->Subtree) {
+              assert(Subtree->DominatedBy(I->Subtree) &&
+                     "Find returned subtree that doesn't apply.");
+
+              Edge edge(n, R, Subtree);
+              iterator Insert = std::lower_bound(begin(), end(), edge);
+              Relations.insert(Insert, edge); // invalidates I
+              I = find(n, Subtree);
+            }
+
+            // Also, we have to tighten any edge that Subtree dominates.
+            for (iterator B = begin(); I->To == n; --I) {
+              if (I->Subtree->DominatedBy(Subtree)) {
+                LatticeVal LV = static_cast<LatticeVal>(I->LV & R);
+                assert(validPredicate(LV) && "Invalid union of lattice values");
+                I->LV = LV;
+              }
+              if (I == B) break;
+            }
+          }
+        }
+      }
+    };
+
+  private:
+
+    std::vector<Node> Nodes;
+
+  public:
+    /// node - returns the node object at a given value number. The pointer
+    /// returned may be invalidated on the next call to node().
+    Node *node(unsigned index) {
+      assert(VN.value(index)); // This triggers the necessary checks.
+      if (Nodes.size() < index) Nodes.resize(index);
+      return &Nodes[index-1];
+    }
+
+    /// isRelatedBy - true iff n1 op n2
+    bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                     LatticeVal LV) {
+      if (n1 == n2) return LV & EQ_BIT;
+
+      Node *N1 = node(n1);
+      Node::iterator I = N1->find(n2, Subtree), E = N1->end();
+      if (I != E) return (I->LV & LV) == I->LV;
+
+      return false;
+    }
+
+    // The add* methods assume that your input is logically valid and may 
+    // assertion-fail or infinitely loop if you attempt a contradiction.
+
+    /// addInequality - Sets n1 op n2.
+    /// It is also an error to call this on an inequality that is already true.
+    void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                       LatticeVal LV1) {
+      assert(n1 != n2 && "A node can't be inequal to itself.");
+
+      if (LV1 != NE)
+        assert(!isRelatedBy(n1, n2, Subtree, reversePredicate(LV1)) &&
+               "Contradictory inequality.");
+
+      // Suppose we're adding %n1 < %n2. Find all the %a < %n1 and
+      // add %a < %n2 too. This keeps the graph fully connected.
+      if (LV1 != NE) {
+        // Break up the relationship into signed and unsigned comparison parts.
+        // If the signed parts of %a op1 %n1 match that of %n1 op2 %n2, and
+        // op1 and op2 aren't NE, then add %a op3 %n2. The new relationship
+        // should have the EQ_BIT iff it's set for both op1 and op2.
+
+        unsigned LV1_s = LV1 & (SLT_BIT|SGT_BIT);
+        unsigned LV1_u = LV1 & (ULT_BIT|UGT_BIT);
+
+        for (Node::iterator I = node(n1)->begin(), E = node(n1)->end(); I != E; ++I) {
+          if (I->LV != NE && I->To != n2) {
+
+            DomTreeDFS::Node *Local_Subtree = NULL;
+            if (Subtree->DominatedBy(I->Subtree))
+              Local_Subtree = Subtree;
+            else if (I->Subtree->DominatedBy(Subtree))
+              Local_Subtree = I->Subtree;
+
+            if (Local_Subtree) {
+              unsigned new_relationship = 0;
+              LatticeVal ILV = reversePredicate(I->LV);
+              unsigned ILV_s = ILV & (SLT_BIT|SGT_BIT);
+              unsigned ILV_u = ILV & (ULT_BIT|UGT_BIT);
+
+              if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+                new_relationship |= ILV_s;
+              if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+                new_relationship |= ILV_u;
+
+              if (new_relationship) {
+                if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+                  new_relationship |= (SLT_BIT|SGT_BIT);
+                if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+                  new_relationship |= (ULT_BIT|UGT_BIT);
+                if ((LV1 & EQ_BIT) && (ILV & EQ_BIT))
+                  new_relationship |= EQ_BIT;
+
+                LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+                node(I->To)->update(n2, NewLV, Local_Subtree);
+                node(n2)->update(I->To, reversePredicate(NewLV), Local_Subtree);
+              }
+            }
+          }
+        }
+
+        for (Node::iterator I = node(n2)->begin(), E = node(n2)->end(); I != E; ++I) {
+          if (I->LV != NE && I->To != n1) {
+            DomTreeDFS::Node *Local_Subtree = NULL;
+            if (Subtree->DominatedBy(I->Subtree))
+              Local_Subtree = Subtree;
+            else if (I->Subtree->DominatedBy(Subtree))
+              Local_Subtree = I->Subtree;
+
+            if (Local_Subtree) {
+              unsigned new_relationship = 0;
+              unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
+              unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
+
+              if (LV1_s != (SLT_BIT|SGT_BIT) && ILV_s == LV1_s)
+                new_relationship |= ILV_s;
+
+              if (LV1_u != (ULT_BIT|UGT_BIT) && ILV_u == LV1_u)
+                new_relationship |= ILV_u;
+
+              if (new_relationship) {
+                if ((new_relationship & (SLT_BIT|SGT_BIT)) == 0)
+                  new_relationship |= (SLT_BIT|SGT_BIT);
+                if ((new_relationship & (ULT_BIT|UGT_BIT)) == 0)
+                  new_relationship |= (ULT_BIT|UGT_BIT);
+                if ((LV1 & EQ_BIT) && (I->LV & EQ_BIT))
+                  new_relationship |= EQ_BIT;
+
+                LatticeVal NewLV = static_cast<LatticeVal>(new_relationship);
+
+                node(n1)->update(I->To, NewLV, Local_Subtree);
+                node(I->To)->update(n1, reversePredicate(NewLV), Local_Subtree);
+              }
+            }
+          }
+        }
+      }
+
+      node(n1)->update(n2, LV1, Subtree);
+      node(n2)->update(n1, reversePredicate(LV1), Subtree);
+    }
+
+    /// remove - removes a node from the graph by removing all references to
+    /// and from it.
+    void remove(unsigned n) {
+      Node *N = node(n);
+      for (Node::iterator NI = N->begin(), NE = N->end(); NI != NE; ++NI) {
+        Node::iterator Iter = node(NI->To)->find(n, TreeRoot);
+        do {
+          node(NI->To)->Relations.erase(Iter);
+          Iter = node(NI->To)->find(n, TreeRoot);
+        } while (Iter != node(NI->To)->end());
+      }
+      N->Relations.clear();
+    }
+
+#ifndef NDEBUG
+    virtual ~InequalityGraph() {}
+    virtual void dump() {
+      dump(*cerr.stream());
+    }
+
+    void dump(std::ostream &os) {
+      for (unsigned i = 1; i <= Nodes.size(); ++i) {
+        os << i << " = {";
+        node(i)->dump(os);
+        os << "}\n";
+      }
+    }
+#endif
+  };
+
+  class VRPSolver;
+
+  /// ValueRanges tracks the known integer ranges and anti-ranges of the nodes
+  /// in the InequalityGraph.
+  class VISIBILITY_HIDDEN ValueRanges {
+    ValueNumbering &VN;
+    TargetData *TD;
+
    /// ScopedRange - the known ConstantRanges for a single value number,
    /// scoped by dominator subtree: each entry constrains the value only
    /// within its DomTreeDFS::Node's region.
    class VISIBILITY_HIDDEN ScopedRange {
      typedef std::vector<std::pair<DomTreeDFS::Node *, ConstantRange> >
              RangeListType;
      RangeListType RangeList; // kept sorted by swo (subtree ordering)

      // swo - strict weak ordering over entries, comparing subtrees only.
      static bool swo(const std::pair<DomTreeDFS::Node *, ConstantRange> &LHS,
                      const std::pair<DomTreeDFS::Node *, ConstantRange> &RHS) {
        return *LHS.first < *RHS.first;
      }

    public:
#ifndef NDEBUG
      virtual ~ScopedRange() {}
      virtual void dump() const {
        dump(*cerr.stream());
      }

      // Prints each entry as "<range> (<scope DFS number>)".
      void dump(std::ostream &os) const {
        os << "{";
        for (const_iterator I = begin(), E = end(); I != E; ++I) {
          os << I->second << " (" << I->first->getDFSNumIn() << "), ";
        }
        os << "}";
      }
#endif

      typedef RangeListType::iterator       iterator;
      typedef RangeListType::const_iterator const_iterator;

      iterator begin() { return RangeList.begin(); }
      iterator end()   { return RangeList.end(); }
      const_iterator begin() const { return RangeList.begin(); }
      const_iterator end()   const { return RangeList.end(); }

      // find - return the first entry whose scope dominates (applies to)
      // Subtree, or end() if none does. "empty" is only a dummy payload so
      // lower_bound can build a comparable pair for swo.
      iterator find(DomTreeDFS::Node *Subtree) {
        static ConstantRange empty(1, false);
        iterator E = end();
        iterator I = std::lower_bound(begin(), E,
                                      std::make_pair(Subtree, empty), swo);

        while (I != E && !I->first->dominates(Subtree)) ++I;
        return I;
      }

      const_iterator find(DomTreeDFS::Node *Subtree) const {
        static const ConstantRange empty(1, false);
        const_iterator E = end();
        const_iterator I = std::lower_bound(begin(), E,
                                            std::make_pair(Subtree, empty), swo);

        while (I != E && !I->first->dominates(Subtree)) ++I;
        return I;
      }

      // update - intersect CR into the entry for exactly this Subtree,
      // creating the entry if it doesn't exist yet. Callers filter out
      // full-set ranges before calling (see ValueRanges::update).
      void update(const ConstantRange &CR, DomTreeDFS::Node *Subtree) {
        assert(!CR.isEmptySet() && "Empty ConstantRange.");
        assert(!CR.isSingleElement() && "Won't store single element.");

        static ConstantRange empty(1, false);
        iterator E = end();
        iterator I =
            std::lower_bound(begin(), E, std::make_pair(Subtree, empty), swo);

        if (I != end() && I->first == Subtree) {
          ConstantRange CR2 = I->second.maximalIntersectWith(CR);
          assert(!CR2.isEmptySet() && !CR2.isSingleElement() &&
                 "Invalid union of ranges.");
          I->second = CR2;
        } else
          RangeList.insert(I, std::make_pair(Subtree, CR));
      }
    };
+
    // Ranges[n-1] holds the scoped ranges for value number n.
    std::vector<ScopedRange> Ranges;
+
+    void update(unsigned n, const ConstantRange &CR, DomTreeDFS::Node *Subtree){
+      if (CR.isFullSet()) return;
+      if (Ranges.size() < n) Ranges.resize(n);
+      Ranges[n-1].update(CR, Subtree);
+    }
+
+    /// create - Creates a ConstantRange that matches the given LatticeVal
+    /// relation with a given integer.
+    ConstantRange create(LatticeVal LV, const ConstantRange &CR) {
+      assert(!CR.isEmptySet() && "Can't deal with empty set.");
+
+      if (LV == NE)
+        return makeConstantRange(ICmpInst::ICMP_NE, CR);
+
+      unsigned LV_s = LV & (SGT_BIT|SLT_BIT);
+      unsigned LV_u = LV & (UGT_BIT|ULT_BIT);
+      bool hasEQ = LV & EQ_BIT;
+
+      ConstantRange Range(CR.getBitWidth());
+
+      if (LV_s == SGT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_SGE : ICmpInst::ICMP_SGT, CR));
+      } else if (LV_s == SLT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_SLE : ICmpInst::ICMP_SLT, CR));
+      }
+
+      if (LV_u == UGT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_UGE : ICmpInst::ICMP_UGT, CR));
+      } else if (LV_u == ULT_BIT) {
+        Range = Range.maximalIntersectWith(makeConstantRange(
+                    hasEQ ? ICmpInst::ICMP_ULE : ICmpInst::ICMP_ULT, CR));
+      }
+
+      return Range;
+    }
+
    /// makeConstantRange - Creates a ConstantRange representing the set of all
    /// value that match the ICmpInst::Predicate with any of the values in CR.
    ConstantRange makeConstantRange(ICmpInst::Predicate ICmpOpcode,
                                    const ConstantRange &CR) {
      uint32_t W = CR.getBitWidth();
      switch (ICmpOpcode) {
        default: assert(!"Invalid ICmp opcode to makeConstantRange()");
          // note: falls through to ICMP_EQ when asserts are compiled out
        case ICmpInst::ICMP_EQ:
          return ConstantRange(CR.getLower(), CR.getUpper());
        case ICmpInst::ICMP_NE:
          // Only a single excluded element is representable (as the wrapped
          // range around it); otherwise conservatively return the full set.
          if (CR.isSingleElement())
            return ConstantRange(CR.getUpper(), CR.getLower());
          return ConstantRange(W);
        case ICmpInst::ICMP_ULT:
          // [0, umax(CR)) -- everything unsigned-below some element of CR.
          return ConstantRange(APInt::getMinValue(W), CR.getUnsignedMax());
        case ICmpInst::ICMP_SLT:
          return ConstantRange(APInt::getSignedMinValue(W), CR.getSignedMax());
        case ICmpInst::ICMP_ULE: {
          APInt UMax(CR.getUnsignedMax());
          // x <=u UINT_MAX is vacuously true; avoid the wrap in UMax + 1.
          if (UMax.isMaxValue())
            return ConstantRange(W);
          return ConstantRange(APInt::getMinValue(W), UMax + 1);
        }
        case ICmpInst::ICMP_SLE: {
          APInt SMax(CR.getSignedMax());
          // NOTE(review): the second test (SMax+1 == SINT_MAX) widens to the
          // full set one step early, which looks conservative rather than
          // exact -- confirm whether it works around a ConstantRange corner
          // case before tightening it.
          if (SMax.isMaxSignedValue() || (SMax+1).isMaxSignedValue())
            return ConstantRange(W);
          return ConstantRange(APInt::getSignedMinValue(W), SMax + 1);
        }
        case ICmpInst::ICMP_UGT:
          // (umin(CR), UINT_MAX] expressed as the wrapped range [umin+1, 0).
          return ConstantRange(CR.getUnsignedMin() + 1, APInt::getNullValue(W));
        case ICmpInst::ICMP_SGT:
          return ConstantRange(CR.getSignedMin() + 1,
                               APInt::getSignedMinValue(W));
        case ICmpInst::ICMP_UGE: {
          APInt UMin(CR.getUnsignedMin());
          // x >=u 0 is vacuously true.
          if (UMin.isMinValue())
            return ConstantRange(W);
          return ConstantRange(UMin, APInt::getNullValue(W));
        }
        case ICmpInst::ICMP_SGE: {
          APInt SMin(CR.getSignedMin());
          // x >=s SINT_MIN is vacuously true.
          if (SMin.isMinSignedValue())
            return ConstantRange(W);
          return ConstantRange(SMin, APInt::getSignedMinValue(W));
        }
      }
    }
+
#ifndef NDEBUG
    // isCanonical - debug-only check that V is already its own canonical
    // form within Subtree (used in assertions elsewhere).
    bool isCanonical(Value *V, DomTreeDFS::Node *Subtree) {
      return V == VN.canonicalize(V, Subtree);
    }
#endif
+
  public:

    // TD may be null; typeToWidth then falls back to IntegerType widths.
    ValueRanges(ValueNumbering &VN, TargetData *TD) : VN(VN), TD(TD) {}

#ifndef NDEBUG
    virtual ~ValueRanges() {}

    virtual void dump() const {
      dump(*cerr.stream());
    }

    /// dump - print the scoped range list for every value number.
    void dump(std::ostream &os) const {
      for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
        os << (i+1) << " = ";
        Ranges[i].dump(os);
        os << "\n";
      }
    }
#endif
+
+    /// range - looks up the ConstantRange associated with a value number.
+    ConstantRange range(unsigned n, DomTreeDFS::Node *Subtree) {
+      assert(VN.value(n)); // performs range checks
+
+      if (n <= Ranges.size()) {
+        ScopedRange::iterator I = Ranges[n-1].find(Subtree);
+        if (I != Ranges[n-1].end()) return I->second;
+      }
+
+      Value *V = VN.value(n);
+      ConstantRange CR = range(V);
+      return CR;
+    }
+
+    /// range - determine a range from a Value without performing any lookups.
+    ConstantRange range(Value *V) const {
+      if (ConstantInt *C = dyn_cast<ConstantInt>(V))
+        return ConstantRange(C->getValue());
+      else if (isa<ConstantPointerNull>(V))
+        return ConstantRange(APInt::getNullValue(typeToWidth(V->getType())));
+      else
+        return typeToWidth(V->getType());
+    }
+
+    // typeToWidth - returns the number of bits necessary to store a value of
+    // this type, or zero if unknown.
+    uint32_t typeToWidth(const Type *Ty) const {
+      if (TD)
+        return TD->getTypeSizeInBits(Ty);
+
+      if (const IntegerType *ITy = dyn_cast<IntegerType>(Ty))
+        return ITy->getBitWidth();
+
+      return 0;
+    }
+
+    static bool isRelatedBy(const ConstantRange &CR1, const ConstantRange &CR2,
+                            LatticeVal LV) {
+      switch (LV) {
+      default: assert(!"Impossible lattice value!");
+      case NE:
+        return CR1.maximalIntersectWith(CR2).isEmptySet();
+      case ULT:
+        return CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+      case ULE:
+        return CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+      case UGT:
+        return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+      case UGE:
+        return CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+      case SLT:
+        return CR1.getSignedMax().slt(CR2.getSignedMin());
+      case SLE:
+        return CR1.getSignedMax().sle(CR2.getSignedMin());
+      case SGT:
+        return CR1.getSignedMin().sgt(CR2.getSignedMax());
+      case SGE:
+        return CR1.getSignedMin().sge(CR2.getSignedMax());
+      case LT:
+        return CR1.getUnsignedMax().ult(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().slt(CR2.getUnsignedMin());
+      case LE:
+        return CR1.getUnsignedMax().ule(CR2.getUnsignedMin()) &&
+               CR1.getSignedMax().sle(CR2.getUnsignedMin());
+      case GT:
+        return CR1.getUnsignedMin().ugt(CR2.getUnsignedMax()) &&
+               CR1.getSignedMin().sgt(CR2.getSignedMax());
+      case GE:
+        return CR1.getUnsignedMin().uge(CR2.getUnsignedMax()) &&
+               CR1.getSignedMin().sge(CR2.getSignedMax());
+      case SLTUGT:
+        return CR1.getSignedMax().slt(CR2.getSignedMin()) &&
+               CR1.getUnsignedMin().ugt(CR2.getUnsignedMax());
+      case SLEUGE:
+        return CR1.getSignedMax().sle(CR2.getSignedMin()) &&
+               CR1.getUnsignedMin().uge(CR2.getUnsignedMax());
+      case SGTULT:
+        return CR1.getSignedMin().sgt(CR2.getSignedMax()) &&
+               CR1.getUnsignedMax().ult(CR2.getUnsignedMin());
+      case SGEULE:
+        return CR1.getSignedMin().sge(CR2.getSignedMax()) &&
+               CR1.getUnsignedMax().ule(CR2.getUnsignedMin());
+      }
+    }
+
+    bool isRelatedBy(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                     LatticeVal LV) {
+      ConstantRange CR1 = range(n1, Subtree);
+      ConstantRange CR2 = range(n2, Subtree);
+
+      // True iff all values in CR1 are LV to all values in CR2.
+      return isRelatedBy(CR1, CR2, LV);
+    }
+
    // addToWorklist / markBlock - declared here, defined out-of-line (not
    // visible in this chunk) because they need VRPSolver's full definition.
    // addToWorklist queues the fact "V Pred C" with the solver; markBlock
    // presumably records the solver's current context as contradictory /
    // unreachable (it is called from applyRange's empty-set path) -- confirm
    // at the definitions.
    void addToWorklist(Value *V, Constant *C, ICmpInst::Predicate Pred,
                       VRPSolver *VRP);
    void markBlock(VRPSolver *VRP);
+
+    void mergeInto(Value **I, unsigned n, unsigned New,
+                   DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+      ConstantRange CR_New = range(New, Subtree);
+      ConstantRange Merged = CR_New;
+
+      for (; n != 0; ++I, --n) {
+        unsigned i = VN.valueNumber(*I, Subtree);
+        ConstantRange CR_Kill = i ? range(i, Subtree) : range(*I);
+        if (CR_Kill.isFullSet()) continue;
+        Merged = Merged.maximalIntersectWith(CR_Kill);
+      }
+
+      if (Merged.isFullSet() || Merged == CR_New) return;
+
+      applyRange(New, Merged, Subtree, VRP);
+    }
+
+    void applyRange(unsigned n, const ConstantRange &CR,
+                    DomTreeDFS::Node *Subtree, VRPSolver *VRP) {
+      ConstantRange Merged = CR.maximalIntersectWith(range(n, Subtree));
+      if (Merged.isEmptySet()) {
+        markBlock(VRP);
+        return;
+      }
+
+      if (const APInt *I = Merged.getSingleElement()) {
+        Value *V = VN.value(n); // XXX: redesign worklist.
+        const Type *Ty = V->getType();
+        if (Ty->isInteger()) {
+          addToWorklist(V, ConstantInt::get(*I), ICmpInst::ICMP_EQ, VRP);
+          return;
+        } else if (const PointerType *PTy = dyn_cast<PointerType>(Ty)) {
+          assert(*I == 0 && "Pointer is null but not zero?");
+          addToWorklist(V, ConstantPointerNull::get(PTy),
+                        ICmpInst::ICMP_EQ, VRP);
+          return;
+        }
+      }
+
+      update(n, Merged, Subtree);
+    }
+
    /// addNotEquals - propagate the fact that value numbers n1 and n2 are
    /// never equal. Range information is only extractable when one range is a
    /// single element: it punches a hole in a full set, or trims one element
    /// off the other range when it sits exactly on an endpoint.
    void addNotEquals(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
                      VRPSolver *VRP) {
      ConstantRange CR1 = range(n1, Subtree);
      ConstantRange CR2 = range(n2, Subtree);

      uint32_t W = CR1.getBitWidth();

      if (const APInt *I = CR1.getSingleElement()) {
        if (CR2.isFullSet()) {
          // CR1 is [I, I+1); excluding I from a full set gives the wrapped
          // range [I+1, I).
          ConstantRange NewCR2(CR1.getUpper(), CR1.getLower());
          applyRange(n2, NewCR2, Subtree, VRP);
        } else if (*I == CR2.getLower()) {
          // Trim the excluded element off the bottom of CR2.
          APInt NewLower(CR2.getLower() + 1),
                NewUpper(CR2.getUpper());
          // NOTE(review): Lower == Upper appears to be a reserved encoding in
          // this era's ConstantRange; substituting [min, min) presumably
          // keeps the representation valid -- confirm against ConstantRange.
          if (NewLower == NewUpper)
            NewLower = NewUpper = APInt::getMinValue(W);

          ConstantRange NewCR2(NewLower, NewUpper);
          applyRange(n2, NewCR2, Subtree, VRP);
        } else if (*I == CR2.getUpper() - 1) {
          // Trim the excluded element off the top of CR2.
          APInt NewLower(CR2.getLower()),
                NewUpper(CR2.getUpper() - 1);
          if (NewLower == NewUpper)
            NewLower = NewUpper = APInt::getMinValue(W);

          ConstantRange NewCR2(NewLower, NewUpper);
          applyRange(n2, NewCR2, Subtree, VRP);
        }
      }

      // Symmetric case: CR2 is the singleton, narrow CR1.
      if (const APInt *I = CR2.getSingleElement()) {
        if (CR1.isFullSet()) {
          ConstantRange NewCR1(CR2.getUpper(), CR2.getLower());
          applyRange(n1, NewCR1, Subtree, VRP);
        } else if (*I == CR1.getLower()) {
          APInt NewLower(CR1.getLower() + 1),
                NewUpper(CR1.getUpper());
          if (NewLower == NewUpper)
            NewLower = NewUpper = APInt::getMinValue(W);

          ConstantRange NewCR1(NewLower, NewUpper);
          applyRange(n1, NewCR1, Subtree, VRP);
        } else if (*I == CR1.getUpper() - 1) {
          APInt NewLower(CR1.getLower()),
                NewUpper(CR1.getUpper() - 1);
          if (NewLower == NewUpper)
            NewLower = NewUpper = APInt::getMinValue(W);

          ConstantRange NewCR1(NewLower, NewUpper);
          applyRange(n1, NewCR1, Subtree, VRP);
        }
      }
    }
+
+    void addInequality(unsigned n1, unsigned n2, DomTreeDFS::Node *Subtree,
+                       LatticeVal LV, VRPSolver *VRP) {
+      assert(!isRelatedBy(n1, n2, Subtree, LV) && "Asked to do useless work.");
+
+      if (LV == NE) {
+        addNotEquals(n1, n2, Subtree, VRP);
+        return;
+      }
+
+      ConstantRange CR1 = range(n1, Subtree);
+      ConstantRange CR2 = range(n2, Subtree);
+
+      if (!CR1.isSingleElement()) {
+        ConstantRange NewCR1 = CR1.maximalIntersectWith(create(LV, CR2));
+        if (NewCR1 != CR1)
+          applyRange(n1, NewCR1, Subtree, VRP);
+      }
+
+      if (!CR2.isSingleElement()) {
+        ConstantRange NewCR2 = CR2.maximalIntersectWith(
+                                       create(reversePredicate(LV), CR1));
+        if (NewCR2 != CR2)
+          applyRange(n2, NewCR2, Subtree, VRP);
+      }
+    }
+  };
+
+  /// UnreachableBlocks keeps tracks of blocks that are for one reason or
+  /// another discovered to be unreachable. This is used to cull the graph when
+  /// analyzing instructions, and to mark blocks with the "unreachable"
+  /// terminator instruction after the function has executed.
+  class VISIBILITY_HIDDEN UnreachableBlocks {
+  private:
+    std::vector<BasicBlock *> DeadBlocks;
+
+  public:
+    /// mark - mark a block as dead
+    void mark(BasicBlock *BB) {
+      std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+      std::vector<BasicBlock *>::iterator I =
+        std::lower_bound(DeadBlocks.begin(), E, BB);
+
+      if (I == E || *I != BB) DeadBlocks.insert(I, BB);
+    }
+
+    /// isDead - returns whether a block is known to be dead already
+    bool isDead(BasicBlock *BB) {
+      std::vector<BasicBlock *>::iterator E = DeadBlocks.end();
+      std::vector<BasicBlock *>::iterator I =
+        std::lower_bound(DeadBlocks.begin(), E, BB);
+
+      return I != E && *I == BB;
+    }
+
+    /// kill - replace the dead blocks' terminator with an UnreachableInst.
+    bool kill() {
+      bool modified = false;
+      for (std::vector<BasicBlock *>::iterator I = DeadBlocks.begin(),
+           E = DeadBlocks.end(); I != E; ++I) {
+        BasicBlock *BB = *I;
+
+        DOUT << "unreachable block: " << BB->getName() << "\n";
+
+        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB);
+             SI != SE; ++SI) {
+          BasicBlock *Succ = *SI;
+          Succ->removePredecessor(BB);
+        }
+
+        TerminatorInst *TI = BB->getTerminator();
+        TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        TI->eraseFromParent();
+        new UnreachableInst(BB);
+        ++NumBlocks;
+        modified = true;
+      }
+      DeadBlocks.clear();
+      return modified;
+    }
+  };
+
+  /// VRPSolver keeps track of how changes to one variable affect other
+  /// variables, and forwards changes along to the InequalityGraph. It
+  /// also maintains the correct choice for "canonical" in the IG.
+  /// @brief VRPSolver calculates inferences from a new relationship.
+  class VISIBILITY_HIDDEN VRPSolver {
+  private:
+    friend class ValueRanges;
+
    // Operation - a single pending fact "LHS Op RHS", valid from the given
    // context onward.
    struct Operation {
      Value *LHS, *RHS;          // the two values being related
      ICmpInst::Predicate Op;    // the relationship between LHS and RHS

      BasicBlock *ContextBB; // XXX use a DomTreeDFS::Node instead
      Instruction *ContextInst;  // instruction context, or NULL
    };
    std::deque<Operation> WorkList;  // facts waiting to be propagated

    ValueNumbering &VN;    // maps Values to small integers and back
    InequalityGraph &IG;   // pairwise orderings between value numbers
    UnreachableBlocks &UB; // blocks discovered to be dead
    ValueRanges &VR;       // per-value-number constant ranges
    DomTreeDFS *DTDFS;     // dominator tree with DFS in/out numbering
    DomTreeDFS::Node *Top; // current context in the dominator tree
    BasicBlock *TopBB;     // block of the current context
    Instruction *TopInst;  // context instruction, or NULL for block context
    bool &modified;        // set to true whenever the IR is changed

    typedef InequalityGraph::Node Node;
+
+    // below - true if the Instruction is dominated by the current context
+    // block or instruction
+    bool below(Instruction *I) {
+      BasicBlock *BB = I->getParent();
+      if (TopInst && TopInst->getParent() == BB) {
+        if (isa<TerminatorInst>(TopInst)) return false;
+        if (isa<TerminatorInst>(I)) return true;
+        if ( isa<PHINode>(TopInst) && !isa<PHINode>(I)) return true;
+        if (!isa<PHINode>(TopInst) &&  isa<PHINode>(I)) return false;
+
+        for (BasicBlock::const_iterator Iter = BB->begin(), E = BB->end();
+             Iter != E; ++Iter) {
+          if (&*Iter == TopInst) return true;
+          else if (&*Iter == I) return false;
+        }
+        assert(!"Instructions not found in parent BasicBlock?");
+      } else {
+        DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+        if (!Node) return false;
+        return Top->dominates(Node);
+      }
+    }
+
+    // aboveOrBelow - true if the Instruction either dominates or is dominated
+    // by the current context block or instruction
+    bool aboveOrBelow(Instruction *I) {
+      BasicBlock *BB = I->getParent();
+      DomTreeDFS::Node *Node = DTDFS->getNodeForBlock(BB);
+      if (!Node) return false;
+
+      return Top == Node || Top->dominates(Node) || Node->dominates(Top);
+    }
+
    /// makeEqual - record the fact that V1 and V2 hold the same value.
    /// Replaces dominated uses of V2 (and of any nodes merged away with it)
    /// with V1, merges their ranges in VR and, when needed, their nodes in
    /// the inequality graph. Returns false when the equality is provably
    /// impossible (two distinct constants, or values already known NE);
    /// true otherwise. Callers must order the arguments so that
    /// VN.compare(V2, V1) is false.
    bool makeEqual(Value *V1, Value *V2) {
      DOUT << "makeEqual(" << *V1 << ", " << *V2 << ")\n";
      DOUT << "context is ";
      if (TopInst) DOUT << "I: " << *TopInst << "\n";
      else DOUT << "BB: " << TopBB->getName()
                << "(" << Top->getDFSNumIn() << ")\n";

      assert(V1->getType() == V2->getType() &&
             "Can't make two values with different types equal.");

      if (V1 == V2) return true;

      // Two distinct constants can never be made equal.
      if (isa<Constant>(V1) && isa<Constant>(V2))
        return false;

      unsigned n1 = VN.valueNumber(V1, Top), n2 = VN.valueNumber(V2, Top);

      if (n1 && n2) {
        if (n1 == n2) return true;
        // Already proven not-equal: the new fact is a contradiction.
        if (IG.isRelatedBy(n1, n2, Top, NE)) return false;
      }

      if (n1) assert(V1 == VN.value(n1) && "Value isn't canonical.");
      if (n2) assert(V2 == VN.value(n2) && "Value isn't canonical.");

      assert(!VN.compare(V2, V1) && "Please order parameters to makeEqual.");

      assert(!isa<Constant>(V2) && "Tried to remove a constant.");

      // Remove collects the value numbers that will be folded into n1.
      SetVector<unsigned> Remove;
      if (n2) Remove.insert(n2);

      if (n1 && n2) {
        // Suppose we're being told that %x == %y, and %x <= %z and %y >= %z.
        // We can't just merge %x and %y because the relationship with %z would
        // be EQ and that's invalid. What we're doing is looking for any nodes
        // %z such that %x <= %z and %y >= %z, and vice versa.

        Node::iterator end = IG.node(n2)->end();

        // Find the intersection between N1 and N2 which is dominated by
        // Top. If we find %x where N1 <= %x <= N2 (or >=) then add %x to
        // Remove.
        for (Node::iterator I = IG.node(n1)->begin(), E = IG.node(n1)->end();
             I != E; ++I) {
          if (!(I->LV & EQ_BIT) || !Top->DominatedBy(I->Subtree)) continue;

          unsigned ILV_s = I->LV & (SLT_BIT|SGT_BIT);
          unsigned ILV_u = I->LV & (ULT_BIT|UGT_BIT);
          Node::iterator NI = IG.node(n2)->find(I->To, Top);
          if (NI != end) {
            LatticeVal NILV = reversePredicate(NI->LV);
            unsigned NILV_s = NILV & (SLT_BIT|SGT_BIT);
            unsigned NILV_u = NILV & (ULT_BIT|UGT_BIT);

            if ((ILV_s != (SLT_BIT|SGT_BIT) && ILV_s == NILV_s) ||
                (ILV_u != (ULT_BIT|UGT_BIT) && ILV_u == NILV_u))
              Remove.insert(I->To);
          }
        }

        // See if one of the nodes about to be removed is actually a better
        // canonical choice than n1.
        unsigned orig_n1 = n1;
        SetVector<unsigned>::iterator DontRemove = Remove.end();
        for (SetVector<unsigned>::iterator I = Remove.begin()+1 /* skip n2 */,
             E = Remove.end(); I != E; ++I) {
          unsigned n = *I;
          Value *V = VN.value(n);
          if (VN.compare(V, V1)) {
            V1 = V;
            n1 = n;
            DontRemove = I;
          }
        }
        // Swap the roles: the better canonical survives, old n1 is removed.
        if (DontRemove != Remove.end()) {
          unsigned n = *DontRemove;
          Remove.remove(n);
          Remove.insert(orig_n1);
        }
      }

      // We'd like to allow makeEqual on two values to perform a simple
      // substitution without every creating nodes in the IG whenever possible.
      //
      // The first iteration through this loop operates on V2 before going
      // through the Remove list and operating on those too. If all of the
      // iterations performed simple replacements then we exit early.
      bool mergeIGNode = false;
      unsigned i = 0;
      for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
        if (i) R = VN.value(Remove[i]); // skip n2.

        // Try to replace the whole instruction. If we can, we're done.
        Instruction *I2 = dyn_cast<Instruction>(R);
        if (I2 && below(I2)) {
          // Collect the users before replaceAllUsesWith rewrites the use
          // list we would otherwise be iterating over.
          std::vector<Instruction *> ToNotify;
          for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
               UI != UE;) {
            Use &TheUse = UI.getUse();
            ++UI;
            if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser()))
              ToNotify.push_back(I);
          }

          DOUT << "Simply removing " << *I2
               << ", replacing with " << *V1 << "\n";
          I2->replaceAllUsesWith(V1);
          // leave it dead; it'll get erased later.
          ++NumInstruction;
          modified = true;

          for (std::vector<Instruction *>::iterator II = ToNotify.begin(),
               IE = ToNotify.end(); II != IE; ++II) {
            opsToDef(*II);
          }

          continue;
        }

        // Otherwise, replace all dominated uses.
        for (Value::use_iterator UI = R->use_begin(), UE = R->use_end();
             UI != UE;) {
          Use &TheUse = UI.getUse();
          ++UI;
          if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
            if (below(I)) {
              TheUse.set(V1);
              modified = true;
              ++NumVarsReplaced;
              opsToDef(I);
            }
          }
        }

        // If that killed the instruction, stop here.
        if (I2 && isInstructionTriviallyDead(I2)) {
          DOUT << "Killed all uses of " << *I2
               << ", replacing with " << *V1 << "\n";
          continue;
        }

        // If we make it to here, then we will need to create a node for N1.
        // Otherwise, we can skip out early!
        mergeIGNode = true;
      }

      // Fold the range information of everything being removed into V1.
      if (!isa<Constant>(V1)) {
        if (Remove.empty()) {
          VR.mergeInto(&V2, 1, VN.getOrInsertVN(V1, Top), Top, this);
        } else {
          std::vector<Value*> RemoveVals;
          RemoveVals.reserve(Remove.size());

          for (SetVector<unsigned>::iterator I = Remove.begin(),
               E = Remove.end(); I != E; ++I) {
            Value *V = VN.value(*I);
            if (!V->use_empty())
              RemoveVals.push_back(V);
          }
          VR.mergeInto(&RemoveVals[0], RemoveVals.size(), 
                       VN.getOrInsertVN(V1, Top), Top, this);
        }
      }

      if (mergeIGNode) {
        // Create N1.
        if (!n1) n1 = VN.getOrInsertVN(V1, Top);

        // Migrate relationships from removed nodes to N1.
        for (SetVector<unsigned>::iterator I = Remove.begin(), E = Remove.end();
             I != E; ++I) {
          unsigned n = *I;
          for (Node::iterator NI = IG.node(n)->begin(), NE = IG.node(n)->end();
               NI != NE; ++NI) {
            if (NI->Subtree->DominatedBy(Top)) {
              if (NI->To == n1) {
                assert((NI->LV & EQ_BIT) && "Node inequal to itself.");
                continue;
              }
              if (Remove.count(NI->To))
                continue;

              IG.node(NI->To)->update(n1, reversePredicate(NI->LV), Top);
              IG.node(n1)->update(NI->To, NI->LV, Top);
            }
          }
        }

        // Point V2 (and all items in Remove) to N1.
        if (!n2)
          VN.addEquality(n1, V2, Top);
        else {
          for (SetVector<unsigned>::iterator I = Remove.begin(),
               E = Remove.end(); I != E; ++I) {
            VN.addEquality(n1, VN.value(*I), Top);
          }
        }

        // If !Remove.empty() then V2 = Remove[0]->getValue().
        // Even when Remove is empty, we still want to process V2.
        i = 0;
        for (Value *R = V2; i == 0 || i < Remove.size(); ++i) {
          if (i) R = VN.value(Remove[i]); // skip n2.

          if (Instruction *I2 = dyn_cast<Instruction>(R)) {
            if (aboveOrBelow(I2))
            defToOps(I2);
          }
          // NOTE(review): this walks V2's use list on every pass, even when
          // R != V2 -- R->use_begin() looks like the intended iteration;
          // confirm before changing.
          for (Value::use_iterator UI = V2->use_begin(), UE = V2->use_end();
               UI != UE;) {
            Use &TheUse = UI.getUse();
            ++UI;
            if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
              if (aboveOrBelow(I))
                opsToDef(I);
            }
          }
        }
      }

      // re-opsToDef all dominated users of V1.
      if (Instruction *I = dyn_cast<Instruction>(V1)) {
        for (Value::use_iterator UI = I->use_begin(), UE = I->use_end();
             UI != UE;) {
          Use &TheUse = UI.getUse();
          ++UI;
          Value *V = TheUse.getUser();
          if (!V->use_empty()) {
            if (Instruction *Inst = dyn_cast<Instruction>(V)) {
              if (aboveOrBelow(Inst))
                opsToDef(Inst);
            }
          }
        }
      }

      return true;
    }
+
+    /// cmpInstToLattice - converts an CmpInst::Predicate to lattice value
+    /// Requires that the lattice value be valid; does not accept ICMP_EQ.
+    static LatticeVal cmpInstToLattice(ICmpInst::Predicate Pred) {
+      switch (Pred) {
+        case ICmpInst::ICMP_EQ:
+          assert(!"No matching lattice value.");
+          return static_cast<LatticeVal>(EQ_BIT);
+        default:
+          assert(!"Invalid 'icmp' predicate.");
+        case ICmpInst::ICMP_NE:
+          return NE;
+        case ICmpInst::ICMP_UGT:
+          return UGT;
+        case ICmpInst::ICMP_UGE:
+          return UGE;
+        case ICmpInst::ICMP_ULT:
+          return ULT;
+        case ICmpInst::ICMP_ULE:
+          return ULE;
+        case ICmpInst::ICMP_SGT:
+          return SGT;
+        case ICmpInst::ICMP_SGE:
+          return SGE;
+        case ICmpInst::ICMP_SLT:
+          return SLT;
+        case ICmpInst::ICMP_SLE:
+          return SLE;
+      }
+    }
+
  public:
    /// Creates a solver whose context is an entire basic block: facts added
    /// through it hold throughout TopBB.
    VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
              ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
              BasicBlock *TopBB)
      : VN(VN),
        IG(IG),
        UB(UB),
        VR(VR),
        DTDFS(DTDFS),
        Top(DTDFS->getNodeForBlock(TopBB)),
        TopBB(TopBB),
        TopInst(NULL),  // no instruction context: block-level facts only
        modified(modified)
    {
      assert(Top && "VRPSolver created for unreachable basic block.");
    }
+
    /// Creates a solver whose context is a single instruction: facts added
    /// through it hold from TopInst onward within its block and dominatees.
    VRPSolver(ValueNumbering &VN, InequalityGraph &IG, UnreachableBlocks &UB,
              ValueRanges &VR, DomTreeDFS *DTDFS, bool &modified,
              Instruction *TopInst)
      : VN(VN),
        IG(IG),
        UB(UB),
        VR(VR),
        DTDFS(DTDFS),
        Top(DTDFS->getNodeForBlock(TopInst->getParent())),
        TopBB(TopInst->getParent()),
        TopInst(TopInst),
        modified(modified)
    {
      assert(Top && "VRPSolver created for unreachable basic block.");
      assert(Top->getBlock() == TopInst->getParent() && "Context mismatch.");
    }
+
+    bool isRelatedBy(Value *V1, Value *V2, ICmpInst::Predicate Pred) const {
+      if (Constant *C1 = dyn_cast<Constant>(V1))
+        if (Constant *C2 = dyn_cast<Constant>(V2))
+          return ConstantExpr::getCompare(Pred, C1, C2) ==
+                 ConstantInt::getTrue();
+
+      unsigned n1 = VN.valueNumber(V1, Top);
+      unsigned n2 = VN.valueNumber(V2, Top);
+
+      if (n1 && n2) {
+        if (n1 == n2) return Pred == ICmpInst::ICMP_EQ ||
+                             Pred == ICmpInst::ICMP_ULE ||
+                             Pred == ICmpInst::ICMP_UGE ||
+                             Pred == ICmpInst::ICMP_SLE ||
+                             Pred == ICmpInst::ICMP_SGE;
+        if (Pred == ICmpInst::ICMP_EQ) return false;
+        if (IG.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+        if (VR.isRelatedBy(n1, n2, Top, cmpInstToLattice(Pred))) return true;
+      }
+
+      if ((n1 && !n2 && isa<Constant>(V2)) ||
+          (n2 && !n1 && isa<Constant>(V1))) {
+        ConstantRange CR1 = n1 ? VR.range(n1, Top) : VR.range(V1);
+        ConstantRange CR2 = n2 ? VR.range(n2, Top) : VR.range(V2);
+
+        if (Pred == ICmpInst::ICMP_EQ)
+          return CR1.isSingleElement() &&
+                 CR1.getSingleElement() == CR2.getSingleElement();
+
+        return VR.isRelatedBy(CR1, CR2, cmpInstToLattice(Pred));
+      }
+      if (Pred == ICmpInst::ICMP_EQ) return V1 == V2;
+      return false;
+    }
+
+    /// add - adds a new property to the work queue
+    void add(Value *V1, Value *V2, ICmpInst::Predicate Pred,
+             Instruction *I = NULL) {
+      DOUT << "adding " << *V1 << " " << Pred << " " << *V2;
+      if (I) DOUT << " context: " << *I;
+      else DOUT << " default context (" << Top->getDFSNumIn() << ")";
+      DOUT << "\n";
+
+      assert(V1->getType() == V2->getType() &&
+             "Can't relate two values with different types.");
+
+      WorkList.push_back(Operation());
+      Operation &O = WorkList.back();
+      O.LHS = V1, O.RHS = V2, O.Op = Pred, O.ContextInst = I;
+      O.ContextBB = I ? I->getParent() : TopBB;
+    }
+
    /// defToOps - Given an instruction definition that we've learned something
    /// new about, find any new relationships between its operands.
    void defToOps(Instruction *I) {
      // Facts derived here hold from I onward when I is dominated by the
      // current context, otherwise from the context itself.
      Instruction *NewContext = below(I) ? I : TopInst;
      Value *Canonical = VN.canonicalize(I, Top);

      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
        const Type *Ty = BO->getType();
        assert(!Ty->isFPOrFPVector() && "Float in work queue!");

        Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
        Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);

        // TODO: "and i32 -1, %x" EQ %y then %x EQ %y.

        switch (BO->getOpcode()) {
          case Instruction::And: {
            // "and i32 %a, %b" EQ -1 then %a EQ -1 and %b EQ -1
            ConstantInt *CI = ConstantInt::getAllOnesValue(Ty);
            if (Canonical == CI) {
              add(CI, Op0, ICmpInst::ICMP_EQ, NewContext);
              add(CI, Op1, ICmpInst::ICMP_EQ, NewContext);
            }
          } break;
          case Instruction::Or: {
            // "or i32 %a, %b" EQ 0 then %a EQ 0 and %b EQ 0
            Constant *Zero = Constant::getNullValue(Ty);
            if (Canonical == Zero) {
              add(Zero, Op0, ICmpInst::ICMP_EQ, NewContext);
              add(Zero, Op1, ICmpInst::ICMP_EQ, NewContext);
            }
          } break;
          case Instruction::Xor: {
            // "xor i32 %c, %a" EQ %b then %a EQ %c ^ %b
            // "xor i32 %c, %a" EQ %c then %a EQ 0
            // "xor i32 %c, %a" NE %c then %a NE 0
            // Repeat the above, with order of operands reversed.
            // Normalize so that any constant operand ends up in LHS.
            Value *LHS = Op0;
            Value *RHS = Op1;
            if (!isa<Constant>(LHS)) std::swap(LHS, RHS);

            if (ConstantInt *CI = dyn_cast<ConstantInt>(Canonical)) {
              if (ConstantInt *Arg = dyn_cast<ConstantInt>(LHS)) {
                add(RHS, ConstantInt::get(CI->getValue() ^ Arg->getValue()),
                    ICmpInst::ICMP_EQ, NewContext);
              }
            }
            if (Canonical == LHS) {
              if (isa<ConstantInt>(Canonical))
                add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_EQ,
                    NewContext);
            } else if (isRelatedBy(LHS, Canonical, ICmpInst::ICMP_NE)) {
              add(RHS, Constant::getNullValue(Ty), ICmpInst::ICMP_NE,
                  NewContext);
            }
          } break;
          default:
            break;
        }
      } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
        // "icmp ult i32 %a, %y" EQ true then %a u< y
        // etc.

        if (Canonical == ConstantInt::getTrue()) {
          add(IC->getOperand(0), IC->getOperand(1), IC->getPredicate(),
              NewContext);
        } else if (Canonical == ConstantInt::getFalse()) {
          // A known-false compare asserts the inverse predicate.
          add(IC->getOperand(0), IC->getOperand(1),
              ICmpInst::getInversePredicate(IC->getPredicate()), NewContext);
        }
      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
        if (I->getType()->isFPOrFPVector()) return;

        // Given: "%a = select i1 %x, i32 %b, i32 %c"
        // %a EQ %b and %b NE %c then %x EQ true
        // %a EQ %c and %b NE %c then %x EQ false

        Value *True  = SI->getTrueValue();
        Value *False = SI->getFalseValue();
        if (isRelatedBy(True, False, ICmpInst::ICMP_NE)) {
          if (Canonical == VN.canonicalize(True, Top) ||
              isRelatedBy(Canonical, False, ICmpInst::ICMP_NE))
            add(SI->getCondition(), ConstantInt::getTrue(),
                ICmpInst::ICMP_EQ, NewContext);
          else if (Canonical == VN.canonicalize(False, Top) ||
                   isRelatedBy(Canonical, True, ICmpInst::ICMP_NE))
            add(SI->getCondition(), ConstantInt::getFalse(),
                ICmpInst::ICMP_EQ, NewContext);
        }
      } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
        // Only handle GEPs whose indices all canonicalize to zero, i.e.
        // the result aliases the base pointer.
        for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
             OE = GEPI->idx_end(); OI != OE; ++OI) {
          ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
          if (!Op || !Op->isZero()) return;
        }
        // TODO: The GEPI indices are all zero. Copy from definition to operand,
        // jumping the type plane as needed.
        if (isRelatedBy(GEPI, Constant::getNullValue(GEPI->getType()),
                        ICmpInst::ICMP_NE)) {
          Value *Ptr = GEPI->getPointerOperand();
          add(Ptr, Constant::getNullValue(Ptr->getType()), ICmpInst::ICMP_NE,
              NewContext);
        }
      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
        const Type *SrcTy = CI->getSrcTy();

        unsigned ci = VN.getOrInsertVN(CI, Top);
        uint32_t W = VR.typeToWidth(SrcTy);
        if (!W) return;  // unknown source width; nothing to propagate
        ConstantRange CR = VR.range(ci, Top);

        if (CR.isFullSet()) return;

        switch (CI->getOpcode()) {
          default: break;
          case Instruction::ZExt:
          case Instruction::SExt:
            // Map the destination's range back to the source width.
            VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
                          CR.truncate(W), Top, this);
            break;
          case Instruction::BitCast:
            // A bitcast leaves the bits unchanged, so the range carries over.
            VR.applyRange(VN.getOrInsertVN(CI->getOperand(0), Top),
                          CR, Top, this);
            break;
        }
      }
    }
+
+    /// opsToDef - A new relationship was discovered involving one of this
+    /// instruction's operands. Find any new relationship involving the
+    /// definition, or another operand.
+    void opsToDef(Instruction *I) {
+      Instruction *NewContext = below(I) ? I : TopInst;
+
+      if (BinaryOperator *BO = dyn_cast<BinaryOperator>(I)) {
+        Value *Op0 = VN.canonicalize(BO->getOperand(0), Top);
+        Value *Op1 = VN.canonicalize(BO->getOperand(1), Top);
+
+        if (ConstantInt *CI0 = dyn_cast<ConstantInt>(Op0))
+          if (ConstantInt *CI1 = dyn_cast<ConstantInt>(Op1)) {
+            add(BO, ConstantExpr::get(BO->getOpcode(), CI0, CI1),
+                ICmpInst::ICMP_EQ, NewContext);
+            return;
+          }
+
+        // "%y = and i1 true, %x" then %x EQ %y
+        // "%y = or i1 false, %x" then %x EQ %y
+        // "%x = add i32 %y, 0" then %x EQ %y
+        // "%x = mul i32 %y, 0" then %x EQ 0
+
+        Instruction::BinaryOps Opcode = BO->getOpcode();
+        const Type *Ty = BO->getType();
+        assert(!Ty->isFPOrFPVector() && "Float in work queue!");
+
+        Constant *Zero = Constant::getNullValue(Ty);
+        ConstantInt *AllOnes = ConstantInt::getAllOnesValue(Ty);
+
+        switch (Opcode) {
+          default: break;
+          case Instruction::LShr:
+          case Instruction::AShr:
+          case Instruction::Shl:
+          case Instruction::Sub:
+            if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::Or:
+            if (Op0 == AllOnes || Op1 == AllOnes) {
+              add(BO, AllOnes, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } // fall-through
+          case Instruction::Xor:
+          case Instruction::Add:
+            if (Op0 == Zero) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == Zero) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+          case Instruction::And:
+            if (Op0 == AllOnes) {
+              add(BO, Op1, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            } else if (Op1 == AllOnes) {
+              add(BO, Op0, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            // fall-through
+          case Instruction::Mul:
+            if (Op0 == Zero || Op1 == Zero) {
+              add(BO, Zero, ICmpInst::ICMP_EQ, NewContext);
+              return;
+            }
+            break;
+        }
+
+        // "%x = add i32 %y, %z" and %x EQ %y then %z EQ 0
+        // "%x = add i32 %y, %z" and %x EQ %z then %y EQ 0
+        // "%x = shl i32 %y, %z" and %x EQ %y and %y NE 0 then %z EQ 0
+        // "%x = udiv i32 %y, %z" and %x EQ %y then %z EQ 1
+
+        Value *Known = Op0, *Unknown = Op1,
+              *TheBO = VN.canonicalize(BO, Top);
+        if (Known != TheBO) std::swap(Known, Unknown);
+        if (Known == TheBO) {
+          switch (Opcode) {
+            default: break;
+            case Instruction::LShr:
+            case Instruction::AShr:
+            case Instruction::Shl:
+              if (!isRelatedBy(Known, Zero, ICmpInst::ICMP_NE)) break;
+              // otherwise, fall-through.
+            case Instruction::Sub:
+              if (Unknown == Op1) break;
+              // otherwise, fall-through.
+            case Instruction::Xor:
+            case Instruction::Add:
+              add(Unknown, Zero, ICmpInst::ICMP_EQ, NewContext);
+              break;
+            case Instruction::UDiv:
+            case Instruction::SDiv:
+              if (Unknown == Op1) break;
+              if (isRelatedBy(Known, Zero, ICmpInst::ICMP_NE)) {
+                Constant *One = ConstantInt::get(Ty, 1);
+                add(Unknown, One, ICmpInst::ICMP_EQ, NewContext);
+              }
+              break;
+          }
+        }
+
+        // TODO: "%a = add i32 %b, 1" and %b > %z then %a >= %z.
+
+      } else if (ICmpInst *IC = dyn_cast<ICmpInst>(I)) {
+        // "%a = icmp ult i32 %b, %c" and %b u<  %c then %a EQ true
+        // "%a = icmp ult i32 %b, %c" and %b u>= %c then %a EQ false
+        // etc.
+
+        Value *Op0 = VN.canonicalize(IC->getOperand(0), Top);
+        Value *Op1 = VN.canonicalize(IC->getOperand(1), Top);
+
+        ICmpInst::Predicate Pred = IC->getPredicate();
+        if (isRelatedBy(Op0, Op1, Pred))
+          add(IC, ConstantInt::getTrue(), ICmpInst::ICMP_EQ, NewContext);
+        else if (isRelatedBy(Op0, Op1, ICmpInst::getInversePredicate(Pred)))
+          add(IC, ConstantInt::getFalse(), ICmpInst::ICMP_EQ, NewContext);
+
+      } else if (SelectInst *SI = dyn_cast<SelectInst>(I)) {
+        if (I->getType()->isFPOrFPVector()) return;
+
+        // Given: "%a = select i1 %x, i32 %b, i32 %c"
+        // %x EQ true  then %a EQ %b
+        // %x EQ false then %a EQ %c
+        // %b EQ %c then %a EQ %b
+
+        Value *Canonical = VN.canonicalize(SI->getCondition(), Top);
+        if (Canonical == ConstantInt::getTrue()) {
+          add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+        } else if (Canonical == ConstantInt::getFalse()) {
+          add(SI, SI->getFalseValue(), ICmpInst::ICMP_EQ, NewContext);
+        } else if (VN.canonicalize(SI->getTrueValue(), Top) ==
+                   VN.canonicalize(SI->getFalseValue(), Top)) {
+          add(SI, SI->getTrueValue(), ICmpInst::ICMP_EQ, NewContext);
+        }
+      } else if (CastInst *CI = dyn_cast<CastInst>(I)) {
+        const Type *DestTy = CI->getDestTy();
+        if (DestTy->isFPOrFPVector()) return;
+
+        Value *Op = VN.canonicalize(CI->getOperand(0), Top);
+        Instruction::CastOps Opcode = CI->getOpcode();
+
+        if (Constant *C = dyn_cast<Constant>(Op)) {
+          add(CI, ConstantExpr::getCast(Opcode, C, DestTy),
+              ICmpInst::ICMP_EQ, NewContext);
+        }
+
+        uint32_t W = VR.typeToWidth(DestTy);
+        unsigned ci = VN.getOrInsertVN(CI, Top);
+        ConstantRange CR = VR.range(VN.getOrInsertVN(Op, Top), Top);
+
+        if (!CR.isFullSet()) {
+          switch (Opcode) {
+            default: break;
+            case Instruction::ZExt:
+              VR.applyRange(ci, CR.zeroExtend(W), Top, this);
+              break;
+            case Instruction::SExt:
+              VR.applyRange(ci, CR.signExtend(W), Top, this);
+              break;
+            case Instruction::Trunc: {
+              ConstantRange Result = CR.truncate(W);
+              if (!Result.isFullSet())
+                VR.applyRange(ci, Result, Top, this);
+            } break;
+            case Instruction::BitCast:
+              VR.applyRange(ci, CR, Top, this);
+              break;
+            // TODO: other casts?
+          }
+        }
+      } else if (GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(I)) {
+        for (GetElementPtrInst::op_iterator OI = GEPI->idx_begin(),
+             OE = GEPI->idx_end(); OI != OE; ++OI) {
+          ConstantInt *Op = dyn_cast<ConstantInt>(VN.canonicalize(*OI, Top));
+          if (!Op || !Op->isZero()) return;
+        }
+        // TODO: The GEPI indices are all zero. Copy from operand to definition,
+        // jumping the type plane as needed.
+        Value *Ptr = GEPI->getPointerOperand();
+        if (isRelatedBy(Ptr, Constant::getNullValue(Ptr->getType()),
+                        ICmpInst::ICMP_NE)) {
+          add(GEPI, Constant::getNullValue(GEPI->getType()), ICmpInst::ICMP_NE,
+              NewContext);
+        }
+      }
+    }
+
    /// solve - process the work queue
    /// Drains WorkList, integrating each queued fact ("LHS Op RHS", valid in
    /// a given dominator-tree context) into the value numbering (VN),
    /// inequality graph (IG) and value ranges (VR). A contradictory fact
    /// marks its context block unreachable via UB.
    void solve() {
      //DOUT << "WorkList entry, size: " << WorkList.size() << "\n";
      while (!WorkList.empty()) {
        //DOUT << "WorkList size: " << WorkList.size() << "\n";

        Operation &O = WorkList.front();
        // Establish the dominator-tree context this fact holds in.
        TopInst = O.ContextInst;
        TopBB = O.ContextBB;
        Top = DTDFS->getNodeForBlock(TopBB); // XXX move this into Context

        O.LHS = VN.canonicalize(O.LHS, Top);
        O.RHS = VN.canonicalize(O.RHS, Top);

        // Canonicalization must be idempotent.
        assert(O.LHS == VN.canonicalize(O.LHS, Top) && "Canonicalize isn't.");
        assert(O.RHS == VN.canonicalize(O.RHS, Top) && "Canonicalize isn't.");

        DOUT << "solving " << *O.LHS << " " << O.Op << " " << *O.RHS;
        if (O.ContextInst) DOUT << " context inst: " << *O.ContextInst;
        else DOUT << " context block: " << O.ContextBB->getName();
        DOUT << "\n";

        DEBUG(VN.dump());
        DEBUG(IG.dump());
        DEBUG(VR.dump());

        // If they're both Constant, skip it. Check for contradiction and mark
        // the BB as unreachable if so.
        if (Constant *CI_L = dyn_cast<Constant>(O.LHS)) {
          if (Constant *CI_R = dyn_cast<Constant>(O.RHS)) {
            if (ConstantExpr::getCompare(O.Op, CI_L, CI_R) ==
                ConstantInt::getFalse())
              UB.mark(TopBB);

            WorkList.pop_front();
            continue;
          }
        }

        // Normalize operand order (VN.compare defines the canonical order),
        // swapping the predicate to match.
        if (VN.compare(O.LHS, O.RHS)) {
          std::swap(O.LHS, O.RHS);
          O.Op = ICmpInst::getSwappedPredicate(O.Op);
        }

        if (O.Op == ICmpInst::ICMP_EQ) {
          // Failure to merge the two values means the fact is contradictory.
          if (!makeEqual(O.RHS, O.LHS))
            UB.mark(TopBB);
        } else {
          LatticeVal LV = cmpInstToLattice(O.Op);

          // A non-strict predicate whose swapped form already holds collapses
          // to equality (eg. "x <= y" when "x >= y" is known).
          if ((LV & EQ_BIT) &&
              isRelatedBy(O.LHS, O.RHS, ICmpInst::getSwappedPredicate(O.Op))) {
            if (!makeEqual(O.RHS, O.LHS))
              UB.mark(TopBB);
          } else {
            // If the inverse predicate is already known, this fact is a
            // contradiction: the context block is unreachable.
            if (isRelatedBy(O.LHS, O.RHS, ICmpInst::getInversePredicate(O.Op))){
              UB.mark(TopBB);
              WorkList.pop_front();
              continue;
            }

            unsigned n1 = VN.getOrInsertVN(O.LHS, Top);
            unsigned n2 = VN.getOrInsertVN(O.RHS, Top);

            if (n1 == n2) {
              // Same value number: only the reflexive predicates can hold.
              if (O.Op != ICmpInst::ICMP_UGE && O.Op != ICmpInst::ICMP_ULE &&
                  O.Op != ICmpInst::ICMP_SGE && O.Op != ICmpInst::ICMP_SLE)
                UB.mark(TopBB);

              WorkList.pop_front();
              continue;
            }

            // Nothing to do if either database already knows this relation.
            if (VR.isRelatedBy(n1, n2, Top, LV) ||
                IG.isRelatedBy(n1, n2, Top, LV)) {
              WorkList.pop_front();
              continue;
            }

            VR.addInequality(n1, n2, Top, LV, this);
            // The inequality graph only stores variable-variable relations,
            // except that NE is always recorded.
            if ((!isa<ConstantInt>(O.RHS) && !isa<ConstantInt>(O.LHS)) ||
                LV == NE)
              IG.addInequality(n1, n2, Top, LV);

            // Propagate the new fact from each operand to its defining
            // instruction and to every instruction using it. Note that
            // opsToDef/defToOps may mutate use lists, hence the iterator is
            // advanced before the call.
            if (Instruction *I1 = dyn_cast<Instruction>(O.LHS)) {
              if (aboveOrBelow(I1))
                defToOps(I1);
            }
            if (isa<Instruction>(O.LHS) || isa<Argument>(O.LHS)) {
              for (Value::use_iterator UI = O.LHS->use_begin(),
                   UE = O.LHS->use_end(); UI != UE;) {
                Use &TheUse = UI.getUse();
                ++UI;
                if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
                  if (aboveOrBelow(I))
                    opsToDef(I);
                }
              }
            }
            if (Instruction *I2 = dyn_cast<Instruction>(O.RHS)) {
              if (aboveOrBelow(I2))
              defToOps(I2);
            }
            if (isa<Instruction>(O.RHS) || isa<Argument>(O.RHS)) {
              for (Value::use_iterator UI = O.RHS->use_begin(),
                   UE = O.RHS->use_end(); UI != UE;) {
                Use &TheUse = UI.getUse();
                ++UI;
                if (Instruction *I = dyn_cast<Instruction>(TheUse.getUser())) {
                  if (aboveOrBelow(I))
                    opsToDef(I);
                }
              }
            }
          }
        }
        WorkList.pop_front();
      }
    }
+  };
+
  // addToWorklist - Out-of-line because VRPSolver is declared after
  // ValueRanges; forwards the fact "V Pred C" to the solver's queue using
  // the solver's current instruction context.
  void ValueRanges::addToWorklist(Value *V, Constant *C,
                                  ICmpInst::Predicate Pred, VRPSolver *VRP) {
    VRP->add(V, C, Pred, VRP->TopInst);
  }
+
  // markBlock - Out-of-line for the same reason as addToWorklist; marks the
  // solver's current context block as unreachable.
  void ValueRanges::markBlock(VRPSolver *VRP) {
    VRP->UB.mark(VRP->TopBB);
  }
+
  /// PredicateSimplifier - This class is a simplifier that replaces
  /// one equivalent variable with another. It also tracks what
  /// can't be equal and will solve setcc instructions when possible.
  /// @brief Root of the predicate simplifier optimization.
  class VISIBILITY_HIDDEN PredicateSimplifier : public FunctionPass {
    DomTreeDFS *DTDFS;     // DFS-numbered dominator tree; owned by this pass.
    bool modified;         // Set whenever any IR transformation is performed.
    ValueNumbering *VN;    // Owned; shared with IG and VR by reference.
    InequalityGraph *IG;   // Owned.
    UnreachableBlocks UB;
    ValueRanges *VR;       // Owned.

    // Dominator-tree nodes whose blocks still await a visit.
    std::vector<DomTreeDFS::Node *> WorkList;

  public:
    static char ID; // Pass identification, replacement for typeid
    PredicateSimplifier() : FunctionPass((intptr_t)&ID) {}

    bool runOnFunction(Function &F);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      // Critical edges are split so facts implied by a branch can be attached
      // to a block dominated by exactly that edge.
      AU.addRequiredID(BreakCriticalEdgesID);
      AU.addRequired<DominatorTree>();
      AU.addRequired<TargetData>();
      AU.addPreserved<TargetData>();
    }

  private:
    /// Forwards - Adds new properties to VRPSolver and uses them to
    /// simplify instructions. Because new properties sometimes apply to
    /// a transition from one BasicBlock to another, this will use the
    /// PredicateSimplifier::proceedToSuccessor(s) interface to enter the
    /// basic block.
    /// @brief Performs abstract execution of the program.
    class VISIBILITY_HIDDEN Forwards : public InstVisitor<Forwards> {
      friend class InstVisitor<Forwards>;
      PredicateSimplifier *PS;
      DomTreeDFS::Node *DTNode;

    public:
      // References into the owning pass's databases, for convenience.
      ValueNumbering &VN;
      InequalityGraph &IG;
      UnreachableBlocks &UB;
      ValueRanges &VR;

      Forwards(PredicateSimplifier *PS, DomTreeDFS::Node *DTNode)
        : PS(PS), DTNode(DTNode), VN(*PS->VN), IG(*PS->IG), UB(PS->UB),
          VR(*PS->VR) {}

      void visitTerminatorInst(TerminatorInst &TI);
      void visitBranchInst(BranchInst &BI);
      void visitSwitchInst(SwitchInst &SI);

      void visitAllocaInst(AllocaInst &AI);
      void visitLoadInst(LoadInst &LI);
      void visitStoreInst(StoreInst &SI);

      void visitSExtInst(SExtInst &SI);
      void visitZExtInst(ZExtInst &ZI);

      void visitBinaryOperator(BinaryOperator &BO);
      void visitICmpInst(ICmpInst &IC);
    };
  
    // Used by terminator instructions to proceed from the current basic
    // block to the next. Verifies that "current" dominates "next",
    // then calls visitBasicBlock.
    void proceedToSuccessors(DomTreeDFS::Node *Current) {
      for (DomTreeDFS::Node::iterator I = Current->begin(),
           E = Current->end(); I != E; ++I) {
        WorkList.push_back(*I);
      }
    }

    void proceedToSuccessor(DomTreeDFS::Node *Next) {
      WorkList.push_back(Next);
    }

    // Visits each instruction in the basic block.
    void visitBasicBlock(DomTreeDFS::Node *Node) {
      BasicBlock *BB = Node->getBlock();
      DOUT << "Entering Basic Block: " << BB->getName()
           << " (" << Node->getDFSNumIn() << ")\n";
      // Advance the iterator before visiting: visitInstruction may erase I.
      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
        visitInstruction(I++, Node);
      }
    }

    // Tries to simplify each Instruction and add new properties.
    void visitInstruction(Instruction *I, DomTreeDFS::Node *DT) {
      DOUT << "Considering instruction " << *I << "\n";
      DEBUG(VN->dump());
      DEBUG(IG->dump());
      DEBUG(VR->dump());

      // Sometimes instructions are killed in earlier analysis.
      if (isInstructionTriviallyDead(I)) {
        ++NumSimple;
        modified = true;
        if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
          if (VN->value(n) == I) IG->remove(n);
        VN->remove(I);
        I->eraseFromParent();
        return;
      }

#ifndef NDEBUG
      // NOTE: this whole section is a debug-build consistency check: the
      // asserts require that canonicalization has nothing left to do here,
      // so the replacement code below each assert never runs in asserting
      // builds and is compiled out entirely in release builds.
      // Try to replace the whole instruction.
      Value *V = VN->canonicalize(I, DT);
      assert(V == I && "Late instruction canonicalization.");
      if (V != I) {
        modified = true;
        ++NumInstruction;
        DOUT << "Removing " << *I << ", replacing with " << *V << "\n";
        if (unsigned n = VN->valueNumber(I, DTDFS->getRootNode()))
          if (VN->value(n) == I) IG->remove(n);
        VN->remove(I);
        I->replaceAllUsesWith(V);
        I->eraseFromParent();
        return;
      }

      // Try to substitute operands.
      for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i) {
        Value *Oper = I->getOperand(i);
        Value *V = VN->canonicalize(Oper, DT);
        assert(V == Oper && "Late operand canonicalization.");
        if (V != Oper) {
          modified = true;
          ++NumVarsReplaced;
          DOUT << "Resolving " << *I;
          I->setOperand(i, V);
          DOUT << " into " << *I;
        }
      }
#endif

      // Copy the name: the Forwards visitor below may erase I.
      std::string name = I->getParent()->getName();
      DOUT << "push (%" << name << ")\n";
      Forwards visit(this, DT);
      visit.visit(*I);
      DOUT << "pop (%" << name << ")\n";
    }
  };
+
+  bool PredicateSimplifier::runOnFunction(Function &F) {
+    DominatorTree *DT = &getAnalysis<DominatorTree>();
+    DTDFS = new DomTreeDFS(DT);
+    TargetData *TD = &getAnalysis<TargetData>();
+
+    DOUT << "Entering Function: " << F.getName() << "\n";
+
+    modified = false;
+    DomTreeDFS::Node *Root = DTDFS->getRootNode();
+    VN = new ValueNumbering(DTDFS);
+    IG = new InequalityGraph(*VN, Root);
+    VR = new ValueRanges(*VN, TD);
+    WorkList.push_back(Root);
+
+    do {
+      DomTreeDFS::Node *DTNode = WorkList.back();
+      WorkList.pop_back();
+      if (!UB.isDead(DTNode->getBlock())) visitBasicBlock(DTNode);
+    } while (!WorkList.empty());
+
+    delete DTDFS;
+    delete VR;
+    delete IG;
+
+    modified |= UB.kill();
+
+    return modified;
+  }
+
  // Generic terminators contribute no predicate information; just queue the
  // dominated blocks for visiting.
  void PredicateSimplifier::Forwards::visitTerminatorInst(TerminatorInst &TI) {
    PS->proceedToSuccessors(DTNode);
  }
+
+  void PredicateSimplifier::Forwards::visitBranchInst(BranchInst &BI) {
+    if (BI.isUnconditional()) {
+      PS->proceedToSuccessors(DTNode);
+      return;
+    }
+
+    Value *Condition = BI.getCondition();
+    BasicBlock *TrueDest  = BI.getSuccessor(0);
+    BasicBlock *FalseDest = BI.getSuccessor(1);
+
+    if (isa<Constant>(Condition) || TrueDest == FalseDest) {
+      PS->proceedToSuccessors(DTNode);
+      return;
+    }
+
+    for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
+         I != E; ++I) {
+      BasicBlock *Dest = (*I)->getBlock();
+      DOUT << "Branch thinking about %" << Dest->getName()
+           << "(" << PS->DTDFS->getNodeForBlock(Dest)->getDFSNumIn() << ")\n";
+
+      if (Dest == TrueDest) {
+        DOUT << "(" << DTNode->getBlock()->getName() << ") true set:\n";
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+        VRP.add(ConstantInt::getTrue(), Condition, ICmpInst::ICMP_EQ);
+        VRP.solve();
+        DEBUG(VN.dump());
+        DEBUG(IG.dump());
+        DEBUG(VR.dump());
+      } else if (Dest == FalseDest) {
+        DOUT << "(" << DTNode->getBlock()->getName() << ") false set:\n";
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, Dest);
+        VRP.add(ConstantInt::getFalse(), Condition, ICmpInst::ICMP_EQ);
+        VRP.solve();
+        DEBUG(VN.dump());
+        DEBUG(IG.dump());
+        DEBUG(VR.dump());
+      }
+
+      PS->proceedToSuccessor(*I);
+    }
+  }
+
  // visitSwitchInst - A switch tells us the condition's value in each case
  // destination, and what it is NOT in the default destination.
  void PredicateSimplifier::Forwards::visitSwitchInst(SwitchInst &SI) {
    Value *Condition = SI.getCondition();

    // Set the EQProperty in each of the cases BBs, and the NEProperties
    // in the default BB.

    for (DomTreeDFS::Node::iterator I = DTNode->begin(), E = DTNode->end();
         I != E; ++I) {
      BasicBlock *BB = (*I)->getBlock();
      DOUT << "Switch thinking about BB %" << BB->getName()
           << "(" << PS->DTDFS->getNodeForBlock(BB)->getDFSNumIn() << ")\n";

      VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, BB);
      if (BB == SI.getDefaultDest()) {
        // In the default destination, the condition equals none of the case
        // values -- except cases that also branch here (i starts at 1; slot
        // 0 is the default destination in this SwitchInst layout).
        for (unsigned i = 1, e = SI.getNumCases(); i < e; ++i)
          if (SI.getSuccessor(i) != BB)
            VRP.add(Condition, SI.getCaseValue(i), ICmpInst::ICMP_NE);
        VRP.solve();
      } else if (ConstantInt *CI = SI.findCaseDest(BB)) {
        // BB is reached by exactly one case: the condition equals its value.
        VRP.add(Condition, CI, ICmpInst::ICMP_EQ);
        VRP.solve();
      }
      PS->proceedToSuccessor(*I);
    }
  }
+
+  void PredicateSimplifier::Forwards::visitAllocaInst(AllocaInst &AI) {
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &AI);
+    VRP.add(Constant::getNullValue(AI.getType()), &AI, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitLoadInst(LoadInst &LI) {
+    Value *Ptr = LI.getPointerOperand();
+    // avoid "load uint* null" -> null NE null.
+    if (isa<Constant>(Ptr)) return;
+
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &LI);
+    VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitStoreInst(StoreInst &SI) {
+    Value *Ptr = SI.getPointerOperand();
+    if (isa<Constant>(Ptr)) return;
+
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
+    VRP.add(Constant::getNullValue(Ptr->getType()), Ptr, ICmpInst::ICMP_NE);
+    VRP.solve();
+  }
+
  // visitSExtInst - A sign extension from N source bits yields a value in
  // [-2^(N-1), 2^(N-1)-1] at the destination width; bound the result by
  // both ends of that range.
  void PredicateSimplifier::Forwards::visitSExtInst(SExtInst &SI) {
    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &SI);
    uint32_t SrcBitWidth = cast<IntegerType>(SI.getSrcTy())->getBitWidth();
    uint32_t DstBitWidth = cast<IntegerType>(SI.getDestTy())->getBitWidth();
    // Min: top (Dst-Src+1) bits set == -2^(Src-1) sign-extended to Dst bits.
    APInt Min(APInt::getHighBitsSet(DstBitWidth, DstBitWidth-SrcBitWidth+1));
    // Max: low (Src-1) bits set == 2^(Src-1)-1.
    APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth-1));
    VRP.add(ConstantInt::get(Min), &SI, ICmpInst::ICMP_SLE);
    VRP.add(ConstantInt::get(Max), &SI, ICmpInst::ICMP_SGE);
    VRP.solve();
  }
+
+  void PredicateSimplifier::Forwards::visitZExtInst(ZExtInst &ZI) {
+    VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &ZI);
+    uint32_t SrcBitWidth = cast<IntegerType>(ZI.getSrcTy())->getBitWidth();
+    uint32_t DstBitWidth = cast<IntegerType>(ZI.getDestTy())->getBitWidth();
+    APInt Max(APInt::getLowBitsSet(DstBitWidth, SrcBitWidth));
+    VRP.add(ConstantInt::get(Max), &ZI, ICmpInst::ICMP_UGE);
+    VRP.solve();
+  }
+
+  void PredicateSimplifier::Forwards::visitBinaryOperator(BinaryOperator &BO) {
+    Instruction::BinaryOps ops = BO.getOpcode();
+
+    switch (ops) {
+    default: break;
+      case Instruction::URem:
+      case Instruction::SRem:
+      case Instruction::UDiv:
+      case Instruction::SDiv: {
+        Value *Divisor = BO.getOperand(1);
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(Constant::getNullValue(Divisor->getType()), Divisor,
+                ICmpInst::ICMP_NE);
+        VRP.solve();
+        break;
+      }
+    }
+
+    switch (ops) {
+      default: break;
+      case Instruction::Shl: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+        VRP.solve();
+      } break;
+      case Instruction::AShr: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_SLE);
+        VRP.solve();
+      } break;
+      case Instruction::LShr:
+      case Instruction::UDiv: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::URem: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::And: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_ULE);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_ULE);
+        VRP.solve();
+      } break;
+      case Instruction::Or: {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &BO);
+        VRP.add(&BO, BO.getOperand(0), ICmpInst::ICMP_UGE);
+        VRP.add(&BO, BO.getOperand(1), ICmpInst::ICMP_UGE);
+        VRP.solve();
+      } break;
+    }
+  }
+
+  void PredicateSimplifier::Forwards::visitICmpInst(ICmpInst &IC) {
+    // If possible, squeeze the ICmp predicate into something simpler.
+    // Eg., if x = [0, 4) and we're being asked icmp uge %x, 3 then change
+    // the predicate to eq.
+
+    // XXX: once we do full PHI handling, modifying the instruction in the
+    // Forwards visitor will cause missed optimizations.
+
+    ICmpInst::Predicate Pred = IC.getPredicate();
+
+    switch (Pred) {
+      default: break;
+      case ICmpInst::ICMP_ULE: Pred = ICmpInst::ICMP_ULT; break;
+      case ICmpInst::ICMP_UGE: Pred = ICmpInst::ICMP_UGT; break;
+      case ICmpInst::ICMP_SLE: Pred = ICmpInst::ICMP_SLT; break;
+      case ICmpInst::ICMP_SGE: Pred = ICmpInst::ICMP_SGT; break;
+    }
+    if (Pred != IC.getPredicate()) {
+      VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+      if (VRP.isRelatedBy(IC.getOperand(1), IC.getOperand(0),
+                          ICmpInst::ICMP_NE)) {
+        ++NumSnuggle;
+        PS->modified = true;
+        IC.setPredicate(Pred);
+      }
+    }
+
+    Pred = IC.getPredicate();
+
+    if (ConstantInt *Op1 = dyn_cast<ConstantInt>(IC.getOperand(1))) {
+      ConstantInt *NextVal = 0;
+      switch (Pred) {
+        default: break;
+        case ICmpInst::ICMP_SLT:
+        case ICmpInst::ICMP_ULT:
+          if (Op1->getValue() != 0)
+            NextVal = ConstantInt::get(Op1->getValue()-1);
+         break;
+        case ICmpInst::ICMP_SGT:
+        case ICmpInst::ICMP_UGT:
+          if (!Op1->getValue().isAllOnesValue())
+            NextVal = ConstantInt::get(Op1->getValue()+1);
+         break;
+
+      }
+      if (NextVal) {
+        VRPSolver VRP(VN, IG, UB, VR, PS->DTDFS, PS->modified, &IC);
+        if (VRP.isRelatedBy(IC.getOperand(0), NextVal,
+                            ICmpInst::getInversePredicate(Pred))) {
+          ICmpInst *NewIC = new ICmpInst(ICmpInst::ICMP_EQ, IC.getOperand(0),
+                                         NextVal, "", &IC);
+          NewIC->takeName(&IC);
+          IC.replaceAllUsesWith(NewIC);
+
+          // XXX: prove this isn't necessary
+          if (unsigned n = VN.valueNumber(&IC, PS->DTDFS->getRootNode()))
+            if (VN.value(n) == &IC) IG.remove(n);
+          VN.remove(&IC);
+
+          IC.eraseFromParent();
+          ++NumSnuggle;
+          PS->modified = true;
+        }
+      }
+    }
+  }
+
  char PredicateSimplifier::ID = 0;
  // Register the pass so it can be requested as -predsimplify on the
  // opt command line.
  RegisterPass<PredicateSimplifier> X("predsimplify",
                                      "Predicate Simplifier");
+}
+
// createPredicateSimplifierPass - Public factory for the pass; the
// implementation class lives in an anonymous namespace and is not visible
// to callers.
FunctionPass *llvm::createPredicateSimplifierPass() {
  return new PredicateSimplifier();
}
diff --git a/lib/Transforms/Scalar/Reassociate.cpp b/lib/Transforms/Scalar/Reassociate.cpp
new file mode 100644
index 0000000..95f9e7b
--- /dev/null
+++ b/lib/Transforms/Scalar/Reassociate.cpp
@@ -0,0 +1,868 @@
+//===- Reassociate.cpp - Reassociate binary expressions -------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass reassociates commutative expressions in an order that is designed
+// to promote better constant propagation, GCSE, LICM, PRE...
+//
+// For example: 4 + (x + 5) -> x + (4 + 5)
+//
+// In the implementation of this algorithm, constants are assigned rank = 0,
+// function arguments are rank = 1, and other values are assigned ranks
+// corresponding to the reverse post order traversal of current function
+// (starting at 2), which effectively gives values in deep loops higher rank
+// than values not in loops.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reassociate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Assembly/Writer.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include <algorithm>
+using namespace llvm;
+
// Pass statistics, reported by opt with -stats.
STATISTIC(NumLinear , "Number of insts linearized");
STATISTIC(NumChanged, "Number of insts reassociated");
STATISTIC(NumAnnihil, "Number of expr tree annihilated");
STATISTIC(NumFactor , "Number of multiplies factored");
+
+namespace {
+  struct VISIBILITY_HIDDEN ValueEntry {
+    unsigned Rank;
+    Value *Op;
+    ValueEntry(unsigned R, Value *O) : Rank(R), Op(O) {}
+  };
+  inline bool operator<(const ValueEntry &LHS, const ValueEntry &RHS) {
+    return LHS.Rank > RHS.Rank;   // Sort so that highest rank goes to start.
+  }
+}
+
+/// PrintOps - Print out the expression identified in the Ops list.
+///
+static void PrintOps(Instruction *I, const std::vector<ValueEntry> &Ops) {
+  Module *M = I->getParent()->getParent()->getParent();
+  cerr << Instruction::getOpcodeName(I->getOpcode()) << " "
+  << *Ops[0].Op->getType();
+  for (unsigned i = 0, e = Ops.size(); i != e; ++i)
+    WriteAsOperand(*cerr.stream() << " ", Ops[i].Op, false, M)
+      << "," << Ops[i].Rank;
+}
+  
namespace {  
  /// Reassociate - Rewrites commutative expression trees into an order that
  /// promotes better constant propagation, GCSE, LICM, PRE (see the file
  /// header comment for the ranking scheme).
  class VISIBILITY_HIDDEN Reassociate : public FunctionPass {
    std::map<BasicBlock*, unsigned> RankMap;   // Base rank for each block.
    std::map<Value*, unsigned> ValueRankMap;   // Cached rank for each value.
    bool MadeChange;                           // Did this run change the IR?
  public:
    static char ID; // Pass identification, replacement for typeid
    Reassociate() : FunctionPass((intptr_t)&ID) {}

    bool runOnFunction(Function &F);

    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
      AU.setPreservesCFG();
    }
  private:
    // Assign ranks to blocks (RPO order) and to unmovable instructions.
    void BuildRankMap(Function &F);
    // Compute (and cache) the rank of an arbitrary value.
    unsigned getRank(Value *V);
    void ReassociateExpression(BinaryOperator *I);
    void RewriteExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops,
                         unsigned Idx = 0);
    Value *OptimizeExpression(BinaryOperator *I, std::vector<ValueEntry> &Ops);
    void LinearizeExprTree(BinaryOperator *I, std::vector<ValueEntry> &Ops);
    void LinearizeExpr(BinaryOperator *I);
    Value *RemoveFactorFromExpression(Value *V, Value *Factor);
    void ReassociateBB(BasicBlock *BB);
    
    void RemoveDeadBinaryOp(Value *V);
  };

  char Reassociate::ID = 0;
  RegisterPass<Reassociate> X("reassociate", "Reassociate expressions");
}
+
+// Public interface to the Reassociate pass
+FunctionPass *llvm::createReassociatePass() { return new Reassociate(); }
+
+void Reassociate::RemoveDeadBinaryOp(Value *V) {
+  Instruction *Op = dyn_cast<Instruction>(V);
+  if (!Op || !isa<BinaryOperator>(Op) || !isa<CmpInst>(Op) || !Op->use_empty())
+    return;
+  
+  Value *LHS = Op->getOperand(0), *RHS = Op->getOperand(1);
+  RemoveDeadBinaryOp(LHS);
+  RemoveDeadBinaryOp(RHS);
+}
+
+
+static bool isUnmovableInstruction(Instruction *I) {
+  if (I->getOpcode() == Instruction::PHI ||
+      I->getOpcode() == Instruction::Alloca ||
+      I->getOpcode() == Instruction::Load ||
+      I->getOpcode() == Instruction::Malloc ||
+      I->getOpcode() == Instruction::Invoke ||
+      I->getOpcode() == Instruction::Call ||
+      I->getOpcode() == Instruction::UDiv || 
+      I->getOpcode() == Instruction::SDiv ||
+      I->getOpcode() == Instruction::FDiv ||
+      I->getOpcode() == Instruction::URem ||
+      I->getOpcode() == Instruction::SRem ||
+      I->getOpcode() == Instruction::FRem)
+    return true;
+  return false;
+}
+
+void Reassociate::BuildRankMap(Function &F) {
+  unsigned i = 2;
+
+  // Assign distinct ranks to function arguments
+  for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I)
+    ValueRankMap[I] = ++i;
+
+  ReversePostOrderTraversal<Function*> RPOT(&F);
+  for (ReversePostOrderTraversal<Function*>::rpo_iterator I = RPOT.begin(),
+         E = RPOT.end(); I != E; ++I) {
+    BasicBlock *BB = *I;
+    unsigned BBRank = RankMap[BB] = ++i << 16;
+
+    // Walk the basic block, adding precomputed ranks for any instructions that
+    // we cannot move.  This ensures that the ranks for these instructions are
+    // all different in the block.
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+      if (isUnmovableInstruction(I))
+        ValueRankMap[I] = ++BBRank;
+  }
+}
+
// getRank - Return the rank of V: 0 for constants/globals, the precomputed
// rank for arguments, and for instructions 1+max(rank of operands), capped
// at the block's base rank and cached in ValueRankMap.
unsigned Reassociate::getRank(Value *V) {
  if (isa<Argument>(V)) return ValueRankMap[V];   // Function argument...

  Instruction *I = dyn_cast<Instruction>(V);
  if (I == 0) return 0;  // Otherwise it's a global or constant, rank 0.

  // Note: the recursive getRank calls below may insert into ValueRankMap;
  // this is safe because std::map references stay valid across insertions.
  unsigned &CachedRank = ValueRankMap[I];
  if (CachedRank) return CachedRank;    // Rank already known?

  // If this is an expression, return the 1+MAX(rank(LHS), rank(RHS)) so that
  // we can reassociate expressions for code motion!  Since we do not recurse
  // for PHI nodes, we cannot have infinite recursion here, because there
  // cannot be loops in the value graph that do not go through PHI nodes.
  unsigned Rank = 0, MaxRank = RankMap[I->getParent()];
  for (unsigned i = 0, e = I->getNumOperands();
       i != e && Rank != MaxRank; ++i)
    Rank = std::max(Rank, getRank(I->getOperand(i)));

  // If this is a not or neg instruction, do not count it for rank.  This
  // assures us that X and ~X will have the same rank.
  if (!I->getType()->isInteger() ||
      (!BinaryOperator::isNot(I) && !BinaryOperator::isNeg(I)))
    ++Rank;

  //DOUT << "Calculated Rank[" << V->getName() << "] = "
  //     << Rank << "\n";

  return CachedRank = Rank;
}
+
+/// isReassociableOp - Return true if V is an instruction of the specified
+/// opcode and if it only has one use.
+static BinaryOperator *isReassociableOp(Value *V, unsigned Opcode) {
+  if ((V->hasOneUse() || V->use_empty()) && isa<Instruction>(V) &&
+      cast<Instruction>(V)->getOpcode() == Opcode)
+    return cast<BinaryOperator>(V);
+  return 0;
+}
+
+/// LowerNegateToMultiply - Replace 0-X with X*-1.
+///
+static Instruction *LowerNegateToMultiply(Instruction *Neg) {
+  Constant *Cst = ConstantInt::getAllOnesValue(Neg->getType());
+
+  Instruction *Res = BinaryOperator::createMul(Neg->getOperand(1), Cst, "",Neg);
+  Res->takeName(Neg);
+  Neg->replaceAllUsesWith(Res);
+  Neg->eraseFromParent();
+  return Res;
+}
+
// Given an expression of the form '(A+B)+(D+C)', turn it into '(((A+B)+C)+D)'.
// Note that if D is also part of the expression tree that we recurse to
// linearize it as well.  Besides that case, this does not recurse into A,B, or
// C.
void Reassociate::LinearizeExpr(BinaryOperator *I) {
  // Precondition (asserted below): both operands are reassociable nodes of
  // the same opcode as I, i.e. we really have the ((A+B)+(D+C)) shape.
  BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
  BinaryOperator *RHS = cast<BinaryOperator>(I->getOperand(1));
  assert(isReassociableOp(LHS, I->getOpcode()) &&
         isReassociableOp(RHS, I->getOpcode()) &&
         "Not an expression that needs linearization?");

  DOUT << "Linear" << *LHS << *RHS << *I;

  // Move the RHS instruction to live immediately before I, avoiding breaking
  // dominator properties.
  RHS->moveBefore(I);

  // Move operands around to do the linearization.
  // After these three writes: I = (RHS' , C) where RHS' = (LHS, D),
  // turning (A+B)+(D+C) into ((A+B)+D)+C. The order of the writes matters.
  I->setOperand(1, RHS->getOperand(0));
  RHS->setOperand(0, LHS);
  I->setOperand(0, RHS);

  ++NumLinear;
  MadeChange = true;
  DOUT << "Linearized: " << *I;

  // If D is part of this expression tree, tail recurse.
  if (isReassociableOp(I->getOperand(1), I->getOpcode()))
    LinearizeExpr(I);
}
+
+
+/// LinearizeExprTree - Given an associative binary expression tree, traverse
+/// all of the uses putting it into canonical form.  This forces a left-linear
+/// form of the expression (((a+b)+c)+d), and collects information about the
+/// rank of the non-tree operands into Ops.
+///
+/// NOTE: This intentionally destroys the expression tree operands (turning
+/// them into undef values) to reduce #uses of the values.  This means that the
+/// caller MUST use something like RewriteExprTree to put the values back in.
+///
+void Reassociate::LinearizeExprTree(BinaryOperator *I,
+                                    std::vector<ValueEntry> &Ops) {
+  Value *LHS = I->getOperand(0), *RHS = I->getOperand(1);
+  unsigned Opcode = I->getOpcode();
+
+  // First step, linearize the expression if it is in ((A+B)+(C+D)) form.
+  BinaryOperator *LHSBO = isReassociableOp(LHS, Opcode);
+  BinaryOperator *RHSBO = isReassociableOp(RHS, Opcode);
+
+  // If this is a multiply expression tree and it contains internal negations,
+  // transform them into multiplies by -1 so they can be reassociated.
+  if (I->getOpcode() == Instruction::Mul) {
+    if (!LHSBO && LHS->hasOneUse() && BinaryOperator::isNeg(LHS)) {
+      LHS = LowerNegateToMultiply(cast<Instruction>(LHS));
+      LHSBO = isReassociableOp(LHS, Opcode);
+    }
+    if (!RHSBO && RHS->hasOneUse() && BinaryOperator::isNeg(RHS)) {
+      RHS = LowerNegateToMultiply(cast<Instruction>(RHS));
+      RHSBO = isReassociableOp(RHS, Opcode);
+    }
+  }
+
+  if (!LHSBO) {
+    if (!RHSBO) {
+      // Neither the LHS nor RHS is part of the tree, thus this is a leaf.  As
+      // such, just remember these operands and their rank.
+      Ops.push_back(ValueEntry(getRank(LHS), LHS));
+      Ops.push_back(ValueEntry(getRank(RHS), RHS));
+      
+      // Clear the leaves out.
+      I->setOperand(0, UndefValue::get(I->getType()));
+      I->setOperand(1, UndefValue::get(I->getType()));
+      return;
+    } else {
+      // Turn X+(Y+Z) -> (Y+Z)+X
+      std::swap(LHSBO, RHSBO);
+      std::swap(LHS, RHS);
+      bool Success = !I->swapOperands();
+      assert(Success && "swapOperands failed");
+      MadeChange = true;
+    }
+  } else if (RHSBO) {
+    // Turn (A+B)+(C+D) -> (((A+B)+C)+D).  This guarantees that the RHS is not
+    // part of the expression tree.
+    LinearizeExpr(I);
+    LHS = LHSBO = cast<BinaryOperator>(I->getOperand(0));
+    RHS = I->getOperand(1);
+    RHSBO = 0;
+  }
+
+  // Okay, now we know that the LHS is a nested expression and that the RHS is
+  // not.  Perform reassociation.
+  assert(!isReassociableOp(RHS, Opcode) && "LinearizeExpr failed!");
+
+  // Move LHS right before I to make sure that the tree expression dominates all
+  // values.
+  LHSBO->moveBefore(I);
+
+  // Linearize the expression tree on the LHS.
+  LinearizeExprTree(LHSBO, Ops);
+
+  // Remember the RHS operand and its rank.
+  Ops.push_back(ValueEntry(getRank(RHS), RHS));
+  
+  // Clear the RHS leaf out.
+  I->setOperand(1, UndefValue::get(I->getType()));
+}
+
+// RewriteExprTree - Now that the operands for this expression tree are
+// linearized and optimized, emit them in-order.  This function is written to be
+// tail recursive.  Ops[i..] are installed into the left-linear chain rooted at
+// I, repairing the undef operands left by LinearizeExprTree.
+void Reassociate::RewriteExprTree(BinaryOperator *I,
+                                  std::vector<ValueEntry> &Ops,
+                                  unsigned i) {
+  // Base case: exactly two operands remain, so they become I's two operands.
+  if (i+2 == Ops.size()) {
+    if (I->getOperand(0) != Ops[i].Op ||
+        I->getOperand(1) != Ops[i+1].Op) {
+      Value *OldLHS = I->getOperand(0);
+      DOUT << "RA: " << *I;
+      I->setOperand(0, Ops[i].Op);
+      I->setOperand(1, Ops[i+1].Op);
+      DOUT << "TO: " << *I;
+      MadeChange = true;
+      ++NumChanged;
+      
+      // If we reassociated a tree to fewer operands (e.g. (1+a+2) -> (a+3))
+      // delete the extra, now dead, nodes.
+      RemoveDeadBinaryOp(OldLHS);
+    }
+    return;
+  }
+  assert(i+2 < Ops.size() && "Ops index out of range!");
+
+  // Interior node: install the next operand as this node's RHS.
+  if (I->getOperand(1) != Ops[i].Op) {
+    DOUT << "RA: " << *I;
+    I->setOperand(1, Ops[i].Op);
+    DOUT << "TO: " << *I;
+    MadeChange = true;
+    ++NumChanged;
+  }
+  
+  BinaryOperator *LHS = cast<BinaryOperator>(I->getOperand(0));
+  assert(LHS->getOpcode() == I->getOpcode() &&
+         "Improper expression tree!");
+  
+  // Compactify the tree instructions together with each other to guarantee
+  // that the expression tree is dominated by all of Ops.
+  LHS->moveBefore(I);
+  RewriteExprTree(LHS, Ops, i+1);
+}
+
+
+
+// NegateValue - Insert instructions before the instruction pointed to by BI,
+// that computes the negative version of the value specified.  The negative
+// version of the value is returned, and BI is left pointing at the instruction
+// that should be processed next by the reassociation pass.
+//
+static Value *NegateValue(Value *V, Instruction *BI) {
+  // We are trying to expose opportunity for reassociation.  One of the things
+  // that we want to do to achieve this is to push a negation as deep into an
+  // expression chain as possible, to expose the add instructions.  In practice,
+  // this means that we turn this:
+  //   X = -(A+12+C+D)   into    X = -A + -12 + -C + -D = -12 + -A + -C + -D
+  // so that later, a: Y = 12+X could get reassociated with the -12 to eliminate
+  // the constants.  We assume that instcombine will clean up the mess later if
+  // we introduce tons of unnecessary negation instructions...
+  //
+  if (Instruction *I = dyn_cast<Instruction>(V))
+    if (I->getOpcode() == Instruction::Add && I->hasOneUse()) {
+      // Push the negates through the add.  Only done for single-use adds:
+      // otherwise other users of I would observe the mutated (negated) value.
+      I->setOperand(0, NegateValue(I->getOperand(0), BI));
+      I->setOperand(1, NegateValue(I->getOperand(1), BI));
+
+      // We must move the add instruction here, because the neg instructions do
+      // not dominate the old add instruction in general.  By moving it, we are
+      // assured that the neg instructions we just inserted dominate the 
+      // instruction we are about to insert after them.
+      //
+      I->moveBefore(BI);
+      I->setName(I->getName()+".neg");
+      return I;
+    }
+
+  // Insert a 'neg' instruction that subtracts the value from zero to get the
+  // negation.
+  //
+  return BinaryOperator::createNeg(V, V->getName() + ".neg", BI);
+}
+
+/// BreakUpSubtract - If we have (X-Y), and if either X is an add, or if this is
+/// only used by an add, transform this into (X+(0-Y)) to promote better
+/// reassociation.  Returns the replacement add, or null if the subtract was
+/// left alone.
+static Instruction *BreakUpSubtract(Instruction *Sub) {
+  // Don't bother to break this up unless either the LHS is an associable add or
+  // if this is only used by one.
+  if (!isReassociableOp(Sub->getOperand(0), Instruction::Add) &&
+      !isReassociableOp(Sub->getOperand(1), Instruction::Add) &&
+      !(Sub->hasOneUse() &&isReassociableOp(Sub->use_back(), Instruction::Add)))
+    return 0;
+
+  // Convert a subtract into an add and a neg instruction... so that sub
+  // instructions can be commuted with other add instructions...
+  //
+  // Calculate the negative value of Operand 1 of the sub instruction...
+  // and set it as the RHS of the add instruction we just made...
+  //
+  Value *NegVal = NegateValue(Sub->getOperand(1), Sub);
+  Instruction *New =
+    BinaryOperator::createAdd(Sub->getOperand(0), NegVal, "", Sub);
+  New->takeName(Sub);
+
+  // Everyone now refers to the add instruction.
+  Sub->replaceAllUsesWith(New);
+  Sub->eraseFromParent();
+
+  DOUT << "Negated: " << *New;
+  return New;
+}
+
<test></test>
+/// ConvertShiftToMul - If this is a shift of a reassociable multiply or is used
+/// by one, change this into a multiply by a constant to assist with further
+/// reassociation.  Returns the new multiply, or null if no change was made.
+static Instruction *ConvertShiftToMul(Instruction *Shl) {
+  // If an operand of this shift is a reassociable multiply, or if the shift
+  // is used by a reassociable multiply or add, turn into a multiply.
+  if (isReassociableOp(Shl->getOperand(0), Instruction::Mul) ||
+      (Shl->hasOneUse() && 
+       (isReassociableOp(Shl->use_back(), Instruction::Mul) ||
+        isReassociableOp(Shl->use_back(), Instruction::Add)))) {
+    // MulCst becomes (1 << ShAmt), folded to a constant at compile time.
+    Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
+    MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+    
+    Instruction *Mul = BinaryOperator::createMul(Shl->getOperand(0), MulCst,
+                                                 "", Shl);
+    Mul->takeName(Shl);
+    Shl->replaceAllUsesWith(Mul);
+    Shl->eraseFromParent();
+    return Mul;
+  }
+  return 0;
+}
+
+// Scan backwards and forwards among values with the same rank as element i to
+// see if X exists.  If X does not exist, return i.  This relies on Ops being
+// sorted by rank (see the stable_sort in ReassociateExpression) so that
+// equal-rank entries are contiguous.
+static unsigned FindInOperandList(std::vector<ValueEntry> &Ops, unsigned i,
+                                  Value *X) {
+  unsigned XRank = Ops[i].Rank;
+  unsigned e = Ops.size();
+  for (unsigned j = i+1; j != e && Ops[j].Rank == XRank; ++j)
+    if (Ops[j].Op == X)
+      return j;
+  // Scan backwards; unsigned wrap-around of i-1 to ~0U terminates the loop
+  // safely when i == 0.
+  for (unsigned j = i-1; j != ~0U && Ops[j].Rank == XRank; --j)
+    if (Ops[j].Op == X)
+      return j;
+  return i;
+}
+
+/// EmitAddTreeOfValues - Emit a tree of add instructions, summing Ops together
+/// and returning the result.  Insert the tree before I.  Note that Ops is
+/// consumed: the recursion pops entries until a single value remains.
+static Value *EmitAddTreeOfValues(Instruction *I, std::vector<Value*> &Ops) {
+  if (Ops.size() == 1) return Ops.back();
+  
+  Value *V1 = Ops.back();
+  Ops.pop_back();
+  Value *V2 = EmitAddTreeOfValues(I, Ops);
+  return BinaryOperator::createAdd(V2, V1, "tmp", I);
+}
+
+/// RemoveFactorFromExpression - If V is an expression tree that is a 
+/// multiplication sequence, and if this sequence contains a multiply by Factor,
+/// remove Factor from the tree and return the new tree.  Returns null (with
+/// the tree restored to its original form) if V is not a multiply tree or
+/// Factor does not occur in it.
+Value *Reassociate::RemoveFactorFromExpression(Value *V, Value *Factor) {
+  BinaryOperator *BO = isReassociableOp(V, Instruction::Mul);
+  if (!BO) return 0;
+  
+  // Flatten the multiply tree into a factor list (this undef's the tree
+  // operands, so we must rewrite before returning on every path).
+  std::vector<ValueEntry> Factors;
+  LinearizeExprTree(BO, Factors);
+
+  bool FoundFactor = false;
+  for (unsigned i = 0, e = Factors.size(); i != e; ++i)
+    if (Factors[i].Op == Factor) {
+      FoundFactor = true;
+      Factors.erase(Factors.begin()+i);
+      break;
+    }
+  if (!FoundFactor) {
+    // Make sure to restore the operands to the expression tree.
+    RewriteExprTree(BO, Factors);
+    return 0;
+  }
+  
+  // A single remaining factor needs no multiply at all.
+  if (Factors.size() == 1) return Factors[0].Op;
+  
+  RewriteExprTree(BO, Factors);
+  return BO;
+}
+
+/// FindSingleUseMultiplyFactors - If V is a single-use multiply, recursively
+/// add its operands as factors, otherwise add V to the list of factors.
+static void FindSingleUseMultiplyFactors(Value *V,
+                                         std::vector<Value*> &Factors) {
+  BinaryOperator *BO;
+  // Leaf case: not a multiply, or shared by multiple users (use_empty is
+  // allowed because linearized tree nodes have had their uses undef'd).
+  if ((!V->hasOneUse() && !V->use_empty()) ||
+      !(BO = dyn_cast<BinaryOperator>(V)) ||
+      BO->getOpcode() != Instruction::Mul) {
+    Factors.push_back(V);
+    return;
+  }
+  
+  // Otherwise, add the LHS and RHS to the list of factors.
+  FindSingleUseMultiplyFactors(BO->getOperand(1), Factors);
+  FindSingleUseMultiplyFactors(BO->getOperand(0), Factors);
+}
+
+
+
+/// OptimizeExpression - Given the sorted, linearized operand list for I
+/// (highest rank first, constants at the end), simplify it: fold trailing
+/// constants, apply identity/annihilation rules for the opcode, and (for
+/// adds) factor out common multiplicands.  Returns a replacement value if the
+/// whole expression collapsed to a single value, or null if the caller should
+/// rewrite Ops back into the tree.
+Value *Reassociate::OptimizeExpression(BinaryOperator *I,
+                                       std::vector<ValueEntry> &Ops) {
+  // Now that we have the linearized expression tree, try to optimize it.
+  // Start by folding any constants that we found.
+  bool IterateOptimization = false;
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  unsigned Opcode = I->getOpcode();
+  
+  // Constants sort to the end of the list, so any two trailing constants can
+  // be folded into one.
+  if (Constant *V1 = dyn_cast<Constant>(Ops[Ops.size()-2].Op))
+    if (Constant *V2 = dyn_cast<Constant>(Ops.back().Op)) {
+      Ops.pop_back();
+      Ops.back().Op = ConstantExpr::get(Opcode, V1, V2);
+      return OptimizeExpression(I, Ops);
+    }
+
+  // Check for destructive annihilation due to a constant being used.
+  if (ConstantInt *CstVal = dyn_cast<ConstantInt>(Ops.back().Op))
+    switch (Opcode) {
+    default: break;
+    case Instruction::And:
+      if (CstVal->isZero()) {                // ... & 0 -> 0
+        ++NumAnnihil;
+        return CstVal;
+      } else if (CstVal->isAllOnesValue()) { // ... & -1 -> ...
+        Ops.pop_back();
+      }
+      break;
+    case Instruction::Mul:
+      if (CstVal->isZero()) {                // ... * 0 -> 0
+        ++NumAnnihil;
+        return CstVal;
+      } else if (cast<ConstantInt>(CstVal)->isOne()) {
+        Ops.pop_back();                      // ... * 1 -> ...
+      }
+      break;
+    case Instruction::Or:
+      if (CstVal->isAllOnesValue()) {        // ... | -1 -> -1
+        ++NumAnnihil;
+        return CstVal;
+      }
+      // FALLTHROUGH!
+    case Instruction::Add:
+    case Instruction::Xor:
+      if (CstVal->isZero())                  // ... [|^+] 0 -> ...
+        Ops.pop_back();
+      break;
+    }
+  if (Ops.size() == 1) return Ops[0].Op;
+
+  // Handle destructive annihilation due to identities between elements in the
+  // argument list here.
+  switch (Opcode) {
+  default: break;
+  case Instruction::And:
+  case Instruction::Or:
+  case Instruction::Xor:
+    // Scan the operand lists looking for X and ~X pairs, along with X,X pairs.
+    // If we find any, we can simplify the expression. X&~X == 0, X|~X == -1.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      // First, check for X and ~X in the operand list.
+      assert(i < Ops.size());
+      if (BinaryOperator::isNot(Ops[i].Op)) {    // Cannot occur for ^.
+        Value *X = BinaryOperator::getNotArgument(Ops[i].Op);
+        unsigned FoundX = FindInOperandList(Ops, i, X);
+        if (FoundX != i) {
+          if (Opcode == Instruction::And) {   // ...&X&~X = 0
+            ++NumAnnihil;
+            return Constant::getNullValue(X->getType());
+          } else if (Opcode == Instruction::Or) {   // ...|X|~X = -1
+            ++NumAnnihil;
+            return ConstantInt::getAllOnesValue(X->getType());
+          }
+        }
+      }
+
+      // Next, check for duplicate pairs of values, which we assume are next to
+      // each other, due to our sorting criteria.
+      assert(i < Ops.size());
+      if (i+1 != Ops.size() && Ops[i+1].Op == Ops[i].Op) {
+        if (Opcode == Instruction::And || Opcode == Instruction::Or) {
+          // Drop duplicate values.
+          Ops.erase(Ops.begin()+i);
+          --i; --e;
+          IterateOptimization = true;
+          ++NumAnnihil;
+        } else {
+          assert(Opcode == Instruction::Xor);
+          if (e == 2) {
+            ++NumAnnihil;
+            return Constant::getNullValue(Ops[0].Op->getType());
+          }
+          // ... X^X -> ...
+          Ops.erase(Ops.begin()+i, Ops.begin()+i+2);
+          i -= 1; e -= 2;
+          IterateOptimization = true;
+          ++NumAnnihil;
+        }
+      }
+    }
+    break;
+
+  case Instruction::Add:
+    // Scan the operand lists looking for X and -X pairs.  If we find any, we
+    // can simplify the expression. X+-X == 0.
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      assert(i < Ops.size());
+      // Check for X and -X in the operand list.
+      if (BinaryOperator::isNeg(Ops[i].Op)) {
+        Value *X = BinaryOperator::getNegArgument(Ops[i].Op);
+        unsigned FoundX = FindInOperandList(Ops, i, X);
+        if (FoundX != i) {
+          // Remove X and -X from the operand list.
+          if (Ops.size() == 2) {
+            ++NumAnnihil;
+            return Constant::getNullValue(X->getType());
+          } else {
+            Ops.erase(Ops.begin()+i);
+            if (i < FoundX)
+              --FoundX;
+            else
+              --i;   // Need to back up an extra one.
+            Ops.erase(Ops.begin()+FoundX);
+            IterateOptimization = true;
+            ++NumAnnihil;
+            --i;     // Revisit element.
+            e -= 2;  // Removed two elements.
+          }
+        }
+      }
+    }
+    
+
+    // Scan the operand list, checking to see if there are any common factors
+    // between operands.  Consider something like A*A+A*B*C+D.  We would like to
+    // reassociate this to A*(A+B*C)+D, which reduces the number of multiplies.
+    // To efficiently find this, we count the number of times a factor occurs
+    // for any ADD operands that are MULs.
+    std::map<Value*, unsigned> FactorOccurrences;
+    unsigned MaxOcc = 0;
+    Value *MaxOccVal = 0;
+    for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+      if (BinaryOperator *BOp = dyn_cast<BinaryOperator>(Ops[i].Op)) {
+        if (BOp->getOpcode() == Instruction::Mul && BOp->use_empty()) {
+          // Compute all of the factors of this added value.
+          std::vector<Value*> Factors;
+          FindSingleUseMultiplyFactors(BOp, Factors);
+          assert(Factors.size() > 1 && "Bad linearize!");
+
+          // Add one to FactorOccurrences for each unique factor in this op.
+          if (Factors.size() == 2) {
+            unsigned Occ = ++FactorOccurrences[Factors[0]];
+            if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[0]; }
+            if (Factors[0] != Factors[1]) {   // Don't double count A*A.
+              Occ = ++FactorOccurrences[Factors[1]];
+              if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[1]; }
+            }
+          } else {
+            std::set<Value*> Duplicates;
+            for (unsigned i = 0, e = Factors.size(); i != e; ++i) {
+              if (Duplicates.insert(Factors[i]).second) {
+                unsigned Occ = ++FactorOccurrences[Factors[i]];
+                if (Occ > MaxOcc) { MaxOcc = Occ; MaxOccVal = Factors[i]; }
+              }
+            }
+          }
+        }
+      }
+    }
+
+    // If any factor occurred more than one time, we can pull it out.
+    if (MaxOcc > 1) {
+      DOUT << "\nFACTORING [" << MaxOcc << "]: " << *MaxOccVal << "\n";
+      
+      // Create a new instruction that uses the MaxOccVal twice.  If we don't do
+      // this, we could otherwise run into situations where removing a factor
+      // from an expression will drop a use of maxocc, and this can cause 
+      // RemoveFactorFromExpression on successive values to behave differently.
+      Instruction *DummyInst = BinaryOperator::createAdd(MaxOccVal, MaxOccVal);
+      std::vector<Value*> NewMulOps;
+      for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
+        if (Value *V = RemoveFactorFromExpression(Ops[i].Op, MaxOccVal)) {
+          NewMulOps.push_back(V);
+          Ops.erase(Ops.begin()+i);
+          --i; --e;
+        }
+      }
+      
+      // No need for extra uses anymore.
+      delete DummyInst;
+
+      unsigned NumAddedValues = NewMulOps.size();
+      Value *V = EmitAddTreeOfValues(I, NewMulOps);
+      Value *V2 = BinaryOperator::createMul(V, MaxOccVal, "tmp", I);
+
+      // Now that we have inserted V and its sole use, optimize it. This allows
+      // us to handle cases that require multiple factoring steps, such as this:
+      // A*A*B + A*A*C   -->   A*(A*B+A*C)   -->   A*(A*(B+C))
+      if (NumAddedValues > 1)
+        ReassociateExpression(cast<BinaryOperator>(V));
+      
+      ++NumFactor;
+      
+      if (Ops.size() == 0)
+        return V2;
+
+      // Add the new value to the list of things being added.
+      Ops.insert(Ops.begin(), ValueEntry(getRank(V2), V2));
+      
+      // Rewrite the tree so that there is now a use of V.
+      RewriteExprTree(I, Ops);
+      return OptimizeExpression(I, Ops);
+    }
+    break;
+  //case Instruction::Mul:
+  }
+
+  if (IterateOptimization)
+    return OptimizeExpression(I, Ops);
+  return 0;
+}
+
+
+/// ReassociateBB - Inspect all of the instructions in this basic block,
+/// reassociating them as we go.
+void Reassociate::ReassociateBB(BasicBlock *BB) {
+  // BBI is advanced before BI may be replaced/erased, so iteration is safe.
+  for (BasicBlock::iterator BBI = BB->begin(); BBI != BB->end(); ) {
+    Instruction *BI = BBI++;
+    if (BI->getOpcode() == Instruction::Shl &&
+        isa<ConstantInt>(BI->getOperand(1)))
+      if (Instruction *NI = ConvertShiftToMul(BI)) {
+        MadeChange = true;
+        BI = NI;
+      }
+
+    // Reject cases where it is pointless to do this.
+    if (!isa<BinaryOperator>(BI) || BI->getType()->isFloatingPoint() || 
+        isa<VectorType>(BI->getType()))
+      continue;  // Floating point ops are not associative.
+
+    // If this is a subtract instruction which is not already in negate form,
+    // see if we can convert it to X+-Y.
+    if (BI->getOpcode() == Instruction::Sub) {
+      if (!BinaryOperator::isNeg(BI)) {
+        if (Instruction *NI = BreakUpSubtract(BI)) {
+          MadeChange = true;
+          BI = NI;
+        }
+      } else {
+        // Otherwise, this is a negation.  See if the operand is a multiply tree
+        // and if this is not an inner node of a multiply tree.
+        if (isReassociableOp(BI->getOperand(1), Instruction::Mul) &&
+            (!BI->hasOneUse() ||
+             !isReassociableOp(BI->use_back(), Instruction::Mul))) {
+          BI = LowerNegateToMultiply(BI);
+          MadeChange = true;
+        }
+      }
+    }
+
+    // If this instruction is a commutative binary operator, process it.
+    if (!BI->isAssociative()) continue;
+    BinaryOperator *I = cast<BinaryOperator>(BI);
+
+    // If this is an interior node of a reassociable tree, ignore it until we
+    // get to the root of the tree, to avoid N^2 analysis.
+    if (I->hasOneUse() && isReassociableOp(I->use_back(), I->getOpcode()))
+      continue;
+
+    // If this is an add tree that is used by a sub instruction, ignore it 
+    // until we process the subtract.
+    if (I->hasOneUse() && I->getOpcode() == Instruction::Add &&
+        cast<Instruction>(I->use_back())->getOpcode() == Instruction::Sub)
+      continue;
+
+    ReassociateExpression(I);
+  }
+}
+
+/// ReassociateExpression - Reassociate the expression tree rooted at I:
+/// linearize it, sort its operands by rank, optimize the resulting list, and
+/// rewrite (or replace) the tree.
+void Reassociate::ReassociateExpression(BinaryOperator *I) {
+  
+  // First, walk the expression tree, linearizing the tree, collecting the
+  // operand list and the rank of each operand.
+  std::vector<ValueEntry> Ops;
+  LinearizeExprTree(I, Ops);
+  
+  DOUT << "RAIn:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+  
+  // Now that we have linearized the tree to a list and have gathered all of
+  // the operands and their ranks, sort the operands by their rank.  Use a
+  // stable_sort so that values with equal ranks will have their relative
+  // positions maintained (and so the compiler is deterministic).  Note that
+  // this sorts so that the highest ranking values end up at the beginning of
+  // the vector.
+  std::stable_sort(Ops.begin(), Ops.end());
+  
+  // OptimizeExpression - Now that we have the expression tree in a convenient
+  // sorted form, optimize it globally if possible.
+  if (Value *V = OptimizeExpression(I, Ops)) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    DOUT << "Reassoc to scalar: " << *V << "\n";
+    I->replaceAllUsesWith(V);
+    RemoveDeadBinaryOp(I);
+    return;
+  }
+  
+  // We want to sink immediates as deeply as possible except in the case where
+  // this is a multiply tree used only by an add, and the immediate is a -1.
+  // In this case we reassociate to put the negation on the outside so that we
+  // can fold the negation into the add: (-X)*Y + Z -> Z-X*Y
+  if (I->getOpcode() == Instruction::Mul && I->hasOneUse() &&
+      cast<Instruction>(I->use_back())->getOpcode() == Instruction::Add &&
+      isa<ConstantInt>(Ops.back().Op) &&
+      cast<ConstantInt>(Ops.back().Op)->isAllOnesValue()) {
+    Ops.insert(Ops.begin(), Ops.back());
+    Ops.pop_back();
+  }
+  
+  DOUT << "RAOut:\t"; DEBUG(PrintOps(I, Ops)); DOUT << "\n";
+  
+  if (Ops.size() == 1) {
+    // This expression tree simplified to something that isn't a tree,
+    // eliminate it.
+    I->replaceAllUsesWith(Ops[0].Op);
+    RemoveDeadBinaryOp(I);
+  } else {
+    // Now that we ordered and optimized the expressions, splat them back into
+    // the expression tree, removing any unneeded nodes.
+    RewriteExprTree(I, Ops);
+  }
+}
+
+
+/// runOnFunction - Top-level driver: build the rank map for F, reassociate
+/// every basic block, then discard the rank data.  Returns true if the
+/// function was modified.
+bool Reassociate::runOnFunction(Function &F) {
+  // Recalculate the rank map for F
+  BuildRankMap(F);
+
+  MadeChange = false;
+  for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI)
+    ReassociateBB(FI);
+
+  // We are done with the rank map...
+  RankMap.clear();
+  ValueRankMap.clear();
+  return MadeChange;
+}
+
diff --git a/lib/Transforms/Scalar/Reg2Mem.cpp b/lib/Transforms/Scalar/Reg2Mem.cpp
new file mode 100644
index 0000000..ef7411a
--- /dev/null
+++ b/lib/Transforms/Scalar/Reg2Mem.cpp
@@ -0,0 +1,91 @@
+//===- Reg2Mem.cpp - Convert registers to allocas -------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file demotes all registers to memory references.  It is intended to be
+// the inverse of PromoteMemoryToRegister.  By converting to loads, the only
+// values live across basic blocks are allocas and loads before phi nodes.
+// It is intended that this should make CFG hacking much easier.
+// To make later hacking easier, the entry block is split into two, such that
+// all introduced allocas and nothing else are in the entry block.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "reg2mem"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Pass.h"
+#include "llvm/Function.h"
+#include "llvm/Module.h"
+#include "llvm/BasicBlock.h"
+#include "llvm/Instructions.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+#include <list>
+using namespace llvm;
+
+STATISTIC(NumDemoted, "Number of registers demoted");
+
+namespace {
+  /// RegToMem - FunctionPass that demotes every SSA value used outside its
+  /// defining block (or by a phi) into a stack slot, after splitting off a
+  /// dedicated entry block to hold the introduced allocas.
+  struct VISIBILITY_HIDDEN RegToMem : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    RegToMem() : FunctionPass((intptr_t)&ID) {}
+
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequiredID(BreakCriticalEdgesID);
+      AU.addPreservedID(BreakCriticalEdgesID);
+    }
+
+   // valueEscapes - Return true if instruction i has a use outside of its own
+   // basic block, or any use by a phi node (phis read values on CFG edges);
+   // such values must be demoted to stack slots.
+   bool valueEscapes(Instruction* i) {
+      BasicBlock* bb = i->getParent();
+      for(Value::use_iterator ii = i->use_begin(), ie = i->use_end();
+          ii != ie; ++ii)
+        if (cast<Instruction>(*ii)->getParent() != bb ||
+            isa<PHINode>(*ii))
+          return true;
+      return false;
+    }
+
+    virtual bool runOnFunction(Function &F) {
+      if (!F.isDeclaration()) {
+        //give us a clean block: split a fresh entry block that branches to the
+        //old one, so all allocas introduced by DemoteRegToStack land there.
+        BasicBlock* bbold = &F.getEntryBlock();
+        BasicBlock* bbnew = new BasicBlock("allocablock", &F, 
+                                           &F.getEntryBlock());
+        new BranchInst(bbold, bbnew);
+
+        //find the instructions whose values escape their defining block
+        std::list<Instruction*> worklist;
+        for (Function::iterator ibb = F.begin(), ibe = F.end();
+             ibb != ibe; ++ibb)
+          for (BasicBlock::iterator iib = ibb->begin(), iie = ibb->end();
+               iib != iie; ++iib) {
+            if(valueEscapes(iib))
+              worklist.push_front(&*iib);
+          }
+        //demote escaped instructions
+        NumDemoted += worklist.size();
+        for (std::list<Instruction*>::iterator ilb = worklist.begin(), 
+               ile = worklist.end(); ilb != ile; ++ilb)
+          DemoteRegToStack(**ilb, false);
+        return true;
+      }
+      return false;
+    }
+  };
+  
+  char RegToMem::ID = 0;
+  RegisterPass<RegToMem> X("reg2mem", "Demote all values to stack slots");
+}
+
+// createDemoteRegisterToMemory - Provide an entry point to create this pass.
+//
+// DemoteRegisterToMemoryID exposes the registered PassInfo so other passes
+// can depend on this one by ID.
+const PassInfo *llvm::DemoteRegisterToMemoryID = X.getPassInfo();
+FunctionPass *llvm::createDemoteRegisterToMemoryPass() {
+  return new RegToMem();
+}
diff --git a/lib/Transforms/Scalar/SCCP.cpp b/lib/Transforms/Scalar/SCCP.cpp
new file mode 100644
index 0000000..0e4fe8f
--- /dev/null
+++ b/lib/Transforms/Scalar/SCCP.cpp
@@ -0,0 +1,1691 @@
+//===- SCCP.cpp - Sparse Conditional Constant Propagation -----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sparse conditional constant propagation and merging:
+//
+// Specifically, this:
+//   * Assumes values are constant unless proven otherwise
+//   * Assumes BasicBlocks are dead unless proven otherwise
+//   * Proves values to be constant, and replaces them with constants
+//   * Proves conditional branches to be unconditional
+//
+// Notice that:
+//   * This pass has a habit of making definitions be dead.  It is a good idea
+//     to run a DCE pass sometime after running this pass.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "sccp"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CallSite.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/InstVisitor.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+using namespace llvm;
+
+// Statistics for the function-level SCCP pass.
+STATISTIC(NumInstRemoved, "Number of instructions removed");
+STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+
+// Statistics for the interprocedural (IPSCCP) variant.
+STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
+STATISTIC(IPNumDeadBlocks , "Number of basic blocks unreachable by IPSCCP");
+STATISTIC(IPNumArgsElimed , "Number of arguments constant propagated by IPSCCP");
+STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
+
+namespace {
+/// LatticeVal class - This class represents the different lattice values that
+/// an LLVM value may occupy.  It is a simple class with value semantics.
+///
+/// LatticeVal - A single value's position in the SCCP lattice, ordered
+/// undefined < {constant, forcedconstant} < overdefined.  Transitions only
+/// move downward in the lattice, which guarantees solver termination.
+class VISIBILITY_HIDDEN LatticeVal {
+  enum {
+    /// undefined - This LLVM Value has no known value yet.
+    undefined,
+    
+    /// constant - This LLVM Value has a specific constant value.
+    constant,
+
+    /// forcedconstant - This LLVM Value was thought to be undef until
+    /// ResolvedUndefsIn.  This is treated just like 'constant', but if merged
+    /// with another (different) constant, it goes to overdefined, instead of
+    /// asserting.
+    forcedconstant,
+    
+    /// overdefined - This instruction is not known to be constant, and we know
+    /// it has a value.
+    overdefined
+  } LatticeValue;    // The current lattice position
+  
+  Constant *ConstantVal; // If Constant value, the current value
+public:
+  inline LatticeVal() : LatticeValue(undefined), ConstantVal(0) {}
+  
+  // markOverdefined - Return true if this is a new status to be in...
+  inline bool markOverdefined() {
+    if (LatticeValue != overdefined) {
+      LatticeValue = overdefined;
+      return true;
+    }
+    return false;
+  }
+
+  // markConstant - Return true if this is a new status for us.  From
+  // 'undefined' we move to 'constant'; from 'forcedconstant' a conflicting
+  // constant drops us to 'overdefined' (the forced guess was wrong), while
+  // the same constant is a no-op.  Marking a different constant over an
+  // existing 'constant' state is a solver bug and asserts.
+  inline bool markConstant(Constant *V) {
+    if (LatticeValue != constant) {
+      if (LatticeValue == undefined) {
+        LatticeValue = constant;
+        assert(V && "Marking constant with NULL");
+        ConstantVal = V;
+      } else {
+        assert(LatticeValue == forcedconstant && 
+               "Cannot move from overdefined to constant!");
+        // Stay at forcedconstant if the constant is the same.
+        if (V == ConstantVal) return false;
+        
+        // Otherwise, we go to overdefined.  Assumptions made based on the
+        // forced value are possibly wrong.  Assuming this is another constant
+        // could expose a contradiction.
+        LatticeValue = overdefined;
+      }
+      return true;
+    } else {
+      assert(ConstantVal == V && "Marking constant with different value");
+    }
+    return false;
+  }
+
+  // markForcedConstant - Force an undefined value to a specific constant;
+  // only legal from the undefined state (used by ResolvedUndefsIn).
+  inline void markForcedConstant(Constant *V) {
+    assert(LatticeValue == undefined && "Can't force a defined value!");
+    LatticeValue = forcedconstant;
+    ConstantVal = V;
+  }
+  
+  inline bool isUndefined() const { return LatticeValue == undefined; }
+  // isConstant - Both normal and forced constants report as constant.
+  inline bool isConstant() const {
+    return LatticeValue == constant || LatticeValue == forcedconstant;
+  }
+  inline bool isOverdefined() const { return LatticeValue == overdefined; }
+
+  inline Constant *getConstant() const {
+    assert(isConstant() && "Cannot get the constant of a non-constant!");
+    return ConstantVal;
+  }
+};
+
+} // end anonymous namespace
+
+
+//===----------------------------------------------------------------------===//
+//
+/// SCCPSolver - This class is a general purpose solver for Sparse Conditional
+/// Constant Propagation.
+///
+class SCCPSolver : public InstVisitor<SCCPSolver> {
+  SmallSet<BasicBlock*, 16> BBExecutable;// The basic blocks that are executable
+  std::map<Value*, LatticeVal> ValueState;  // The state each value is in.
+
+  /// TrackedGlobals - If we are tracking any values for the contents of a
+  /// global variable, we keep a mapping from the constant accessor to the
+  /// element of the global, to the currently known value.  If the value
+  /// becomes overdefined, its entry is simply removed from this map.
+  DenseMap<GlobalVariable*, LatticeVal> TrackedGlobals;
+
+  /// TrackedFunctionRetVals - If we are tracking arguments into and the return
+  /// value out of a function, it will have an entry in this map, indicating
+  /// what the known return value for the function is.
+  DenseMap<Function*, LatticeVal> TrackedFunctionRetVals;
+
+  // The reason for two worklists is that overdefined is the lowest state
+  // on the lattice, and moving things to overdefined as fast as possible
+  // makes SCCP converge much faster.
+  // By having a separate worklist, we accomplish this because everything
+  // possibly overdefined will become overdefined at the soonest possible
+  // point.
+  std::vector<Value*> OverdefinedInstWorkList;
+  std::vector<Value*> InstWorkList;
+
+
+  std::vector<BasicBlock*>  BBWorkList;  // The BasicBlock work list
+
+  /// UsersOfOverdefinedPHIs - Keep track of any users of PHI nodes that are not
+  /// overdefined, despite the fact that the PHI node is overdefined.
+  std::multimap<PHINode*, Instruction*> UsersOfOverdefinedPHIs;
+
+  /// KnownFeasibleEdges - Entries in this set are edges which have already had
+  /// PHI nodes retriggered.
+  typedef std::pair<BasicBlock*,BasicBlock*> Edge;
+  std::set<Edge> KnownFeasibleEdges;
+public:
+
+  /// MarkBlockExecutable - This method can be used by clients to mark all of
+  /// the blocks that are known to be intrinsically live in the processed unit.
+  void MarkBlockExecutable(BasicBlock *BB) {
+    DOUT << "Marking Block Executable: " << BB->getName() << "\n";
+    BBExecutable.insert(BB);   // Basic block is executable!
+    BBWorkList.push_back(BB);  // Add the block to the work list!
+  }
+
+  /// TrackValueOfGlobalVariable - Clients can use this method to
+  /// inform the SCCPSolver that it should track loads and stores to the
+  /// specified global variable if it can.  This is only legal to call if
+  /// performing Interprocedural SCCP.
+  void TrackValueOfGlobalVariable(GlobalVariable *GV) {
+    // Only first-class (scalar-like) element types can be tracked.
+    const Type *ElTy = GV->getType()->getElementType();
+    if (ElTy->isFirstClassType()) {
+      LatticeVal &IV = TrackedGlobals[GV];
+      // An undef initializer leaves the entry in the 'undefined' state.
+      if (!isa<UndefValue>(GV->getInitializer()))
+        IV.markConstant(GV->getInitializer());
+    }
+  }
+
+  /// AddTrackedFunction - If the SCCP solver is supposed to track calls into
+  /// and out of the specified function (which cannot have its address taken),
+  /// this method must be called.
+  void AddTrackedFunction(Function *F) {
+    assert(F->hasInternalLinkage() && "Can only track internal functions!");
+    // Add an entry, F -> undef.
+    TrackedFunctionRetVals[F];
+  }
+
+  /// Solve - Solve for constants and executable blocks.
+  ///
+  void Solve();
+
+  /// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+  /// that branches on undef values cannot reach any of their successors.
+  /// However, this is not a safe assumption.  After we solve dataflow, this
+  /// method should be used to handle this.  If this returns true, the solver
+  /// should be rerun.
+  bool ResolvedUndefsIn(Function &F);
+
+  /// getExecutableBlocks - Once we have solved for constants, return the set of
+  /// blocks that is known to be executable.
+  SmallSet<BasicBlock*, 16> &getExecutableBlocks() {
+    return BBExecutable;
+  }
+
+  /// getValueMapping - Once we have solved for constants, return the mapping of
+  /// LLVM values to LatticeVals.
+  std::map<Value*, LatticeVal> &getValueMapping() {
+    return ValueState;
+  }
+
+  /// getTrackedFunctionRetVals - Get the inferred return value map.
+  ///
+  const DenseMap<Function*, LatticeVal> &getTrackedFunctionRetVals() {
+    return TrackedFunctionRetVals;
+  }
+
+  /// getTrackedGlobals - Get and return the set of inferred initializers for
+  /// global variables.
+  const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+    return TrackedGlobals;
+  }
+
+  inline void markOverdefined(Value *V) {
+    markOverdefined(ValueState[V], V);
+  }
+
+private:
+  // markConstant - Make a value be marked as "constant".  If the value
+  // is not already a constant, add it to the instruction work list so that
+  // the users of the instruction are updated later.
+  //
+  inline void markConstant(LatticeVal &IV, Value *V, Constant *C) {
+    if (IV.markConstant(C)) {
+      DOUT << "markConstant: " << *C << ": " << *V;
+      InstWorkList.push_back(V);
+    }
+  }
+  
+  // markForcedConstant - Force an undefined value to a constant and queue its
+  // users for revisiting (see LatticeVal::markForcedConstant).
+  inline void markForcedConstant(LatticeVal &IV, Value *V, Constant *C) {
+    IV.markForcedConstant(C);
+    DOUT << "markForcedConstant: " << *C << ": " << *V;
+    InstWorkList.push_back(V);
+  }
+  
+  inline void markConstant(Value *V, Constant *C) {
+    markConstant(ValueState[V], V, C);
+  }
+
+  // markOverdefined - Make a value be marked as "overdefined". If the
+  // value is not already overdefined, add it to the overdefined instruction
+  // work list so that the users of the instruction are updated later.
+
+  inline void markOverdefined(LatticeVal &IV, Value *V) {
+    if (IV.markOverdefined()) {
+      DEBUG(DOUT << "markOverdefined: ";
+            if (Function *F = dyn_cast<Function>(V))
+              DOUT << "Function '" << F->getName() << "'\n";
+            else
+              DOUT << *V);
+      // Only instructions go on the work list
+      OverdefinedInstWorkList.push_back(V);
+    }
+  }
+
+  // mergeInValue - Lower IV to the lattice meet of IV and MergeWithV,
+  // queueing V's users if the state changed.
+  inline void mergeInValue(LatticeVal &IV, Value *V, LatticeVal &MergeWithV) {
+    if (IV.isOverdefined() || MergeWithV.isUndefined())
+      return;  // Noop.
+    if (MergeWithV.isOverdefined())
+      markOverdefined(IV, V);
+    else if (IV.isUndefined())
+      markConstant(IV, V, MergeWithV.getConstant());
+    else if (IV.getConstant() != MergeWithV.getConstant())
+      markOverdefined(IV, V);
+  }
+  
+  inline void mergeInValue(Value *V, LatticeVal &MergeWithV) {
+    return mergeInValue(ValueState[V], V, MergeWithV);
+  }
+
+
+  // getValueState - Return the LatticeVal object that corresponds to the value.
+  // This function is necessary because not all values should start out in the
+  // undefined state... Arguments should be overdefined, and
+  // constants should be marked as constants.  If a value is not known to be an
+  // Instruction object, then use this accessor to get its value from the map.
+  //
+  inline LatticeVal &getValueState(Value *V) {
+    std::map<Value*, LatticeVal>::iterator I = ValueState.find(V);
+    if (I != ValueState.end()) return I->second;  // Common case, in the map
+
+    if (Constant *C = dyn_cast<Constant>(V)) {
+      if (isa<UndefValue>(V)) {
+        // Nothing to do, remain undefined.
+      } else {
+        LatticeVal &LV = ValueState[C];
+        LV.markConstant(C);          // Constants are constant
+        return LV;
+      }
+    }
+    // All others are undefined by default...
+    return ValueState[V];
+  }
+
+  // markEdgeExecutable - Mark a basic block as executable, adding it to the BB
+  // work list if it is not already executable...
+  //
+  void markEdgeExecutable(BasicBlock *Source, BasicBlock *Dest) {
+    if (!KnownFeasibleEdges.insert(Edge(Source, Dest)).second)
+      return;  // This edge is already known to be executable!
+
+    if (BBExecutable.count(Dest)) {
+      DOUT << "Marking Edge Executable: " << Source->getName()
+           << " -> " << Dest->getName() << "\n";
+
+      // The destination is already executable, but we just made an edge
+      // feasible that wasn't before.  Revisit the PHI nodes in the block
+      // because they have potentially new operands.
+      for (BasicBlock::iterator I = Dest->begin(); isa<PHINode>(I); ++I)
+        visitPHINode(*cast<PHINode>(I));
+
+    } else {
+      MarkBlockExecutable(Dest);
+    }
+  }
+
+  // getFeasibleSuccessors - Return a vector of booleans to indicate which
+  // successors are reachable from a given terminator instruction.
+  //
+  void getFeasibleSuccessors(TerminatorInst &TI, SmallVector<bool, 16> &Succs);
+
+  // isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+  // block to the 'To' basic block is currently feasible...
+  //
+  bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
+
+  // OperandChangedState - This method is invoked on all of the users of an
+  // instruction that was just changed state somehow....  Based on this
+  // information, we need to update the specified user of this instruction.
+  //
+  void OperandChangedState(User *U) {
+    // Only instructions use other variable values!
+    Instruction &I = cast<Instruction>(*U);
+    if (BBExecutable.count(I.getParent()))   // Inst is executable?
+      visit(I);
+  }
+
+private:
+  friend class InstVisitor<SCCPSolver>;
+
+  // visit implementations - Something changed in this instruction... Either an
+  // operand made a transition, or the instruction is newly executable.  Change
+  // the value type of I to reflect these changes if appropriate.
+  //
+  void visitPHINode(PHINode &I);
+
+  // Terminators
+  void visitReturnInst(ReturnInst &I);
+  void visitTerminatorInst(TerminatorInst &TI);
+
+  void visitCastInst(CastInst &I);
+  void visitSelectInst(SelectInst &I);
+  void visitBinaryOperator(Instruction &I);
+  void visitCmpInst(CmpInst &I);
+  void visitExtractElementInst(ExtractElementInst &I);
+  void visitInsertElementInst(InsertElementInst &I);
+  void visitShuffleVectorInst(ShuffleVectorInst &I);
+
+  // Instructions that cannot be folded away...
+  void visitStoreInst     (Instruction &I);
+  void visitLoadInst      (LoadInst &I);
+  void visitGetElementPtrInst(GetElementPtrInst &I);
+  void visitCallInst      (CallInst &I) { visitCallSite(CallSite::get(&I)); }
+  void visitInvokeInst    (InvokeInst &II) {
+    visitCallSite(CallSite::get(&II));
+    visitTerminatorInst(II);
+  }
+  void visitCallSite      (CallSite CS);
+  void visitUnwindInst    (TerminatorInst &I) { /*returns void*/ }
+  void visitUnreachableInst(TerminatorInst &I) { /*returns void*/ }
+  void visitAllocationInst(Instruction &I) { markOverdefined(&I); }
+  void visitVANextInst    (Instruction &I) { markOverdefined(&I); }
+  void visitVAArgInst     (Instruction &I) { markOverdefined(&I); }
+  void visitFreeInst      (Instruction &I) { /*returns void*/ }
+
+  void visitInstruction(Instruction &I) {
+    // If a new instruction is added to LLVM that we don't handle...
+    cerr << "SCCP: Don't know how to handle: " << I;
+    markOverdefined(&I);   // Just in case
+  }
+};
+
+// getFeasibleSuccessors - Return a vector of booleans to indicate which
+// successors are reachable from a given terminator instruction.  Succs is
+// resized to the successor count; unset entries mean "not (yet) feasible",
+// which includes the case of a still-undefined branch condition.
+//
+void SCCPSolver::getFeasibleSuccessors(TerminatorInst &TI,
+                                       SmallVector<bool, 16> &Succs) {
+  Succs.resize(TI.getNumSuccessors());
+  if (BranchInst *BI = dyn_cast<BranchInst>(&TI)) {
+    if (BI->isUnconditional()) {
+      Succs[0] = true;
+    } else {
+      LatticeVal &BCValue = getValueState(BI->getCondition());
+      if (BCValue.isOverdefined() ||
+          (BCValue.isConstant() && !isa<ConstantInt>(BCValue.getConstant()))) {
+        // Overdefined condition variables, and branches on unfoldable constant
+        // conditions, mean the branch could go either way.
+        Succs[0] = Succs[1] = true;
+      } else if (BCValue.isConstant()) {
+        // Constant condition variables mean the branch can only go a single way
+        // (successor 0 is the true edge, successor 1 the false edge).
+        Succs[BCValue.getConstant() == ConstantInt::getFalse()] = true;
+      }
+    }
+  } else if (isa<InvokeInst>(&TI)) {
+    // Invoke instructions successors are always executable.
+    Succs[0] = Succs[1] = true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(&TI)) {
+    LatticeVal &SCValue = getValueState(SI->getCondition());
+    if (SCValue.isOverdefined() ||   // Overdefined condition?
+        (SCValue.isConstant() && !isa<ConstantInt>(SCValue.getConstant()))) {
+      // All destinations are executable!
+      Succs.assign(TI.getNumSuccessors(), true);
+    } else if (SCValue.isConstant()) {
+      Constant *CPV = SCValue.getConstant();
+      // Make sure to skip the "default value" which isn't a value
+      for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i) {
+        if (SI->getSuccessorValue(i) == CPV) {// Found the right branch...
+          Succs[i] = true;
+          return;
+        }
+      }
+
+      // Constant value not equal to any of the branches... must execute
+      // default branch then...
+      Succs[0] = true;
+    }
+  } else {
+    assert(0 && "SCCP: Don't know how to handle this terminator!");
+  }
+}
+
+
+// isEdgeFeasible - Return true if the control flow edge from the 'From' basic
+// block to the 'To' basic block is currently feasible...  This mirrors the
+// logic in getFeasibleSuccessors, but tests a single edge rather than
+// computing the whole successor vector.
+//
+bool SCCPSolver::isEdgeFeasible(BasicBlock *From, BasicBlock *To) {
+  assert(BBExecutable.count(To) && "Dest should always be alive!");
+
+  // Make sure the source basic block is executable!!
+  if (!BBExecutable.count(From)) return false;
+
+  // Check to make sure this edge itself is actually feasible now...
+  TerminatorInst *TI = From->getTerminator();
+  if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+    if (BI->isUnconditional())
+      return true;
+    else {
+      LatticeVal &BCValue = getValueState(BI->getCondition());
+      if (BCValue.isOverdefined()) {
+        // Overdefined condition variables mean the branch could go either way.
+        return true;
+      } else if (BCValue.isConstant()) {
+        // Not branching on an evaluatable constant?
+        if (!isa<ConstantInt>(BCValue.getConstant())) return true;
+
+        // Constant condition variables mean the branch can only go a single way
+        return BI->getSuccessor(BCValue.getConstant() ==
+                                       ConstantInt::getFalse()) == To;
+      }
+      // Condition still undefined: assume the edge is not taken.
+      return false;
+    }
+  } else if (isa<InvokeInst>(TI)) {
+    // Invoke instructions successors are always executable.
+    return true;
+  } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+    LatticeVal &SCValue = getValueState(SI->getCondition());
+    if (SCValue.isOverdefined()) {  // Overdefined condition?
+      // All destinations are executable!
+      return true;
+    } else if (SCValue.isConstant()) {
+      Constant *CPV = SCValue.getConstant();
+      if (!isa<ConstantInt>(CPV))
+        return true;  // not a foldable constant?
+
+      // Make sure to skip the "default value" which isn't a value
+      for (unsigned i = 1, E = SI->getNumSuccessors(); i != E; ++i)
+        if (SI->getSuccessorValue(i) == CPV) // Found the taken branch...
+          return SI->getSuccessor(i) == To;
+
+      // Constant value not equal to any of the branches... must execute
+      // default branch then...
+      return SI->getDefaultDest() == To;
+    }
+    return false;
+  } else {
+    cerr << "Unknown terminator instruction: " << *TI;
+    abort();
+  }
+}
+
+// visit Implementations - Something changed in this instruction... Either an
+// operand made a transition, or the instruction is newly executable.  Change
+// the value type of I to reflect these changes if appropriate.  This method
+// makes sure to do the following actions:
+//
+// 1. If a phi node merges two constants in, and has conflicting value coming
+//    from different branches, or if the PHI node merges in an overdefined
+//    value, then the PHI node becomes overdefined.
+// 2. If a phi node merges only constants in, and they all agree on value, the
+//    PHI node becomes a constant value equal to that.
+// 3. If V <- x (op) y && isConstant(x) && isConstant(y) V = Constant
+// 4. If V <- x (op) y && (isOverdefined(x) || isOverdefined(y)) V = Overdefined
+// 5. If V <- MEM or V <- CALL or V <- (unknown) then V = Overdefined
+// 6. If a conditional branch has a value that is constant, make the selected
+//    destination executable
+// 7. If a conditional branch has a value that is overdefined, make all
+//    successors executable.
+//
+void SCCPSolver::visitPHINode(PHINode &PN) {
+  LatticeVal &PNIV = getValueState(&PN);
+  if (PNIV.isOverdefined()) {
+    // There may be instructions using this PHI node that are not overdefined
+    // themselves.  If so, make sure that they know that the PHI node operand
+    // changed.
+    std::multimap<PHINode*, Instruction*>::iterator I, E;
+    tie(I, E) = UsersOfOverdefinedPHIs.equal_range(&PN);
+    if (I != E) {
+      // Copy the users out first: visiting them may mutate the multimap.
+      SmallVector<Instruction*, 16> Users;
+      for (; I != E; ++I) Users.push_back(I->second);
+      while (!Users.empty()) {
+        visit(Users.back());
+        Users.pop_back();
+      }
+    }
+    return;  // Quick exit
+  }
+
+  // Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
+  // and slow us down a lot.  Just mark them overdefined.
+  if (PN.getNumIncomingValues() > 64) {
+    markOverdefined(PNIV, &PN);
+    return;
+  }
+
+  // Look at all of the executable operands of the PHI node.  If any of them
+  // are overdefined, the PHI becomes overdefined as well.  If they are all
+  // constant, and they agree with each other, the PHI becomes the identical
+  // constant.  If they are constant and don't agree, the PHI is overdefined.
+  // If there are no executable operands, the PHI remains undefined.
+  //
+  Constant *OperandVal = 0;
+  for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
+    LatticeVal &IV = getValueState(PN.getIncomingValue(i));
+    if (IV.isUndefined()) continue;  // Doesn't influence PHI node.
+
+    if (isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent())) {
+      if (IV.isOverdefined()) {   // PHI node becomes overdefined!
+        markOverdefined(PNIV, &PN);
+        return;
+      }
+
+      if (OperandVal == 0) {   // Grab the first value...
+        OperandVal = IV.getConstant();
+      } else {                // Another value is being merged in!
+        // There is already a reachable operand.  If we conflict with it,
+        // then the PHI node becomes overdefined.  If we agree with it, we
+        // can continue on.
+
+        // Check to see if there are two different constants merging...
+        if (IV.getConstant() != OperandVal) {
+          // Yes there is.  This means the PHI node is not constant.
+          // You must be overdefined poor PHI.
+          //
+          markOverdefined(PNIV, &PN);    // The PHI node now becomes overdefined
+          return;    // I'm done analyzing you
+        }
+      }
+    }
+  }
+
+  // If we exited the loop, this means that the PHI node only has constant
+  // arguments that agree with each other (and OperandVal is the constant) or
+  // OperandVal is null because there are no defined incoming arguments.  If
+  // this is the case, the PHI remains undefined.
+  //
+  if (OperandVal)
+    markConstant(PNIV, &PN, OperandVal);      // Acquire operand value
+}
+
+// visitReturnInst - If the enclosing function's return value is being
+// tracked, merge the lattice state of the returned operand into the tracked
+// return value entry.  Returns themselves produce no value to propagate.
+void SCCPSolver::visitReturnInst(ReturnInst &I) {
+  if (I.getNumOperands() == 0) return;  // Ret void
+
+  // If we are tracking the return value of this function, merge it in.
+  Function *F = I.getParent()->getParent();
+  if (F->hasInternalLinkage() && !TrackedFunctionRetVals.empty()) {
+    DenseMap<Function*, LatticeVal>::iterator TFRVI =
+      TrackedFunctionRetVals.find(F);
+    if (TFRVI != TrackedFunctionRetVals.end() &&
+        !TFRVI->second.isOverdefined()) {
+      LatticeVal &IV = getValueState(I.getOperand(0));
+      // NOTE(review): F is passed as the 'value' argument here, apparently so
+      // state changes are attributed to the function in debug output/worklists
+      // rather than to the return instruction -- confirm against mergeInValue.
+      mergeInValue(TFRVI->second, F, IV);
+    }
+  }
+}
+
+
+// visitTerminatorInst - Recompute which successor edges of this terminator
+// are feasible, and mark every feasible edge executable.
+void SCCPSolver::visitTerminatorInst(TerminatorInst &TI) {
+  // Ask the solver which successors are currently reachable.
+  SmallVector<bool, 16> Feasible;
+  getFeasibleSuccessors(TI, Feasible);
+
+  BasicBlock *Source = TI.getParent();
+
+  // Every feasible successor edge becomes executable.
+  for (unsigned Idx = 0, NumSuccs = Feasible.size(); Idx != NumSuccs; ++Idx) {
+    if (Feasible[Idx])
+      markEdgeExecutable(Source, TI.getSuccessor(Idx));
+  }
+}
+
+// visitCastInst - A cast is overdefined exactly when its operand is, and
+// folds to a constant-expression cast when the operand is a known constant.
+void SCCPSolver::visitCastInst(CastInst &I) {
+  LatticeVal &OpState = getValueState(I.getOperand(0));
+  if (OpState.isOverdefined()) {
+    // The operand can be anything, so the cast result can be too.
+    markOverdefined(&I);
+  } else if (OpState.isConstant()) {
+    // Fold the cast of the known-constant operand.
+    Constant *Folded = ConstantExpr::getCast(I.getOpcode(),
+                                             OpState.getConstant(),
+                                             I.getType());
+    markConstant(&I, Folded);
+  }
+  // Otherwise the operand is still undefined; leave the cast undefined.
+}
+
+// visitSelectInst - A select with a known boolean condition forwards the
+// state of the chosen operand.  Otherwise we still try to do better than
+// overdefined by comparing the two operands.
+void SCCPSolver::visitSelectInst(SelectInst &I) {
+  LatticeVal &CondValue = getValueState(I.getCondition());
+  if (CondValue.isUndefined())
+    return;
+  if (CondValue.isConstant()) {
+    if (ConstantInt *CondCB = dyn_cast<ConstantInt>(CondValue.getConstant())){
+      // Known condition: the select takes the state of the selected operand.
+      mergeInValue(&I, getValueState(CondCB->getZExtValue() ? I.getTrueValue()
+                                                          : I.getFalseValue()));
+      return;
+    }
+  }
+  
+  // Otherwise, the condition is overdefined or a constant we can't evaluate.
+  // See if we can produce something better than overdefined based on the T/F
+  // value.
+  LatticeVal &TVal = getValueState(I.getTrueValue());
+  LatticeVal &FVal = getValueState(I.getFalseValue());
+  
+  // select ?, C, C -> C.
+  if (TVal.isConstant() && FVal.isConstant() && 
+      TVal.getConstant() == FVal.getConstant()) {
+    markConstant(&I, FVal.getConstant());
+    return;
+  }
+
+  if (TVal.isUndefined()) {  // select ?, undef, X -> X.
+    mergeInValue(&I, FVal);
+  } else if (FVal.isUndefined()) {  // select ?, X, undef -> X.
+    mergeInValue(&I, TVal);
+  } else {
+    markOverdefined(&I);
+  }
+}
+
+// Handle BinaryOperators and Shift Instructions...  Constant operands fold
+// directly; overdefined operands are handled with two refinements: AND/OR
+// identities that fix the result regardless of the other operand, and
+// pairwise evaluation over same-block PHI operands.
+void SCCPSolver::visitBinaryOperator(Instruction &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  LatticeVal &V1State = getValueState(I.getOperand(0));
+  LatticeVal &V2State = getValueState(I.getOperand(1));
+
+  if (V1State.isOverdefined() || V2State.isOverdefined()) {
+    // If this is an AND or OR with 0 or -1, it doesn't matter that the other
+    // operand is overdefined.
+    if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Or) {
+      LatticeVal *NonOverdefVal = 0;
+      if (!V1State.isOverdefined()) {
+        NonOverdefVal = &V1State;
+      } else if (!V2State.isOverdefined()) {
+        NonOverdefVal = &V2State;
+      }
+
+      if (NonOverdefVal) {
+        if (NonOverdefVal->isUndefined()) {
+          // Could annihilate value: pick the annihilator (0 for AND, -1 for
+          // OR) as the assumed value of the undefined operand.
+          if (I.getOpcode() == Instruction::And)
+            markConstant(IV, &I, Constant::getNullValue(I.getType()));
+          else if (const VectorType *PT = dyn_cast<VectorType>(I.getType()))
+            markConstant(IV, &I, ConstantVector::getAllOnesValue(PT));
+          else
+            markConstant(IV, &I, ConstantInt::getAllOnesValue(I.getType()));
+          return;
+        } else {
+          if (I.getOpcode() == Instruction::And) {
+            if (NonOverdefVal->getConstant()->isNullValue()) {
+              markConstant(IV, &I, NonOverdefVal->getConstant());
+              return;      // X and 0 = 0
+            }
+          } else {
+            if (ConstantInt *CI =
+                     dyn_cast<ConstantInt>(NonOverdefVal->getConstant()))
+              if (CI->isAllOnesValue()) {
+                markConstant(IV, &I, NonOverdefVal->getConstant());
+                return;    // X or -1 = -1
+              }
+          }
+        }
+      }
+    }
+
+
+    // If both operands are PHI nodes, it is possible that this instruction has
+    // a constant value, despite the fact that the PHI node doesn't.  Check for
+    // this condition now.
+    if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+      if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+        if (PN1->getParent() == PN2->getParent()) {
+          // Since the two PHI nodes are in the same basic block, they must have
+          // entries for the same predecessors.  Walk the predecessor list, and
+          // if all of the incoming values are constants, and the result of
+          // evaluating this expression with all incoming value pairs is the
+          // same, then this expression is a constant even though the PHI node
+          // is not a constant!
+          LatticeVal Result;
+          for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+            LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+            BasicBlock *InBlock = PN1->getIncomingBlock(i);
+            LatticeVal &In2 =
+              getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+            if (In1.isOverdefined() || In2.isOverdefined()) {
+              Result.markOverdefined();
+              break;  // Cannot fold this operation over the PHI nodes!
+            } else if (In1.isConstant() && In2.isConstant()) {
+              Constant *V = ConstantExpr::get(I.getOpcode(), In1.getConstant(),
+                                              In2.getConstant());
+              if (Result.isUndefined())
+                Result.markConstant(V);
+              else if (Result.isConstant() && Result.getConstant() != V) {
+                Result.markOverdefined();
+                break;
+              }
+            }
+          }
+
+          // If we found a constant value here, then we know the instruction is
+          // constant despite the fact that the PHI nodes are overdefined.
+          if (Result.isConstant()) {
+            markConstant(IV, &I, Result.getConstant());
+            // Remember that this instruction is virtually using the PHI node
+            // operands.
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+            return;
+          } else if (Result.isUndefined()) {
+            return;
+          }
+
+          // Okay, this really is overdefined now.  Since we might have
+          // speculatively thought that this was not overdefined before, and
+          // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+          // make sure to clean out any entries that we put there, for
+          // efficiency.
+          std::multimap<PHINode*, Instruction*>::iterator It, E;
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+        }
+
+    markOverdefined(IV, &I);
+  } else if (V1State.isConstant() && V2State.isConstant()) {
+    // Both operands known constant: fold the operation outright.
+    markConstant(IV, &I, ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
+                                           V2State.getConstant()));
+  }
+}
+
+// Handle compare instructions (icmp/fcmp).  The compare folds to a constant
+// only when both inputs fold to constants.  There is also a special case,
+// handled below, where both inputs are PHI nodes in the same block and every
+// pair of incoming values compares to the same constant.
+void SCCPSolver::visitCmpInst(CmpInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;  // Already at lattice bottom; cannot improve.
+
+  LatticeVal &V1State = getValueState(I.getOperand(0));
+  LatticeVal &V2State = getValueState(I.getOperand(1));
+
+  if (V1State.isOverdefined() || V2State.isOverdefined()) {
+    // If both operands are PHI nodes, it is possible that this instruction has
+    // a constant value, despite the fact that the PHI node doesn't.  Check for
+    // this condition now.
+    if (PHINode *PN1 = dyn_cast<PHINode>(I.getOperand(0)))
+      if (PHINode *PN2 = dyn_cast<PHINode>(I.getOperand(1)))
+        if (PN1->getParent() == PN2->getParent()) {
+          // Since the two PHI nodes are in the same basic block, they must have
+          // entries for the same predecessors.  Walk the predecessor list, and
+          // if all of the incoming values are constants, and the result of
+          // evaluating this expression with all incoming value pairs is the
+          // same, then this expression is a constant even though the PHI node
+          // is not a constant!
+          LatticeVal Result;
+          for (unsigned i = 0, e = PN1->getNumIncomingValues(); i != e; ++i) {
+            LatticeVal &In1 = getValueState(PN1->getIncomingValue(i));
+            BasicBlock *InBlock = PN1->getIncomingBlock(i);
+            LatticeVal &In2 =
+              getValueState(PN2->getIncomingValueForBlock(InBlock));
+
+            if (In1.isOverdefined() || In2.isOverdefined()) {
+              Result.markOverdefined();
+              break;  // Cannot fold this operation over the PHI nodes!
+            } else if (In1.isConstant() && In2.isConstant()) {
+              // Fold the compare for this pair of incoming values.  The whole
+              // compare is constant only if every pair folds to the same value.
+              Constant *V = ConstantExpr::getCompare(I.getPredicate(), 
+                                                     In1.getConstant(), 
+                                                     In2.getConstant());
+              if (Result.isUndefined())
+                Result.markConstant(V);
+              else if (Result.isConstant() && Result.getConstant() != V) {
+                Result.markOverdefined();
+                break;
+              }
+            }
+            // Note: a pair with an undefined incoming value is skipped; it can
+            // still resolve to whatever keeps the result consistent.
+          }
+
+          // If we found a constant value here, then we know the instruction is
+          // constant despite the fact that the PHI nodes are overdefined.
+          if (Result.isConstant()) {
+            markConstant(IV, &I, Result.getConstant());
+            // Remember that this instruction is virtually using the PHI node
+            // operands.
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN1, &I));
+            UsersOfOverdefinedPHIs.insert(std::make_pair(PN2, &I));
+            return;
+          } else if (Result.isUndefined()) {
+            return;
+          }
+
+          // Okay, this really is overdefined now.  Since we might have
+          // speculatively thought that this was not overdefined before, and
+          // added ourselves to the UsersOfOverdefinedPHIs list for the PHIs,
+          // make sure to clean out any entries that we put there, for
+          // efficiency.
+          std::multimap<PHINode*, Instruction*>::iterator It, E;
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN1);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+          tie(It, E) = UsersOfOverdefinedPHIs.equal_range(PN2);
+          while (It != E) {
+            if (It->second == &I) {
+              UsersOfOverdefinedPHIs.erase(It++);
+            } else
+              ++It;
+          }
+        }
+
+    markOverdefined(IV, &I);
+  } else if (V1State.isConstant() && V2State.isConstant()) {
+    // Both operands are lattice constants: fold the compare itself.
+    markConstant(IV, &I, ConstantExpr::getCompare(I.getPredicate(), 
+                                                  V1State.getConstant(), 
+                                                  V2State.getConstant()));
+  }
+}
+
+// extractelement: SCCP does not currently reason about vector values, so the
+// result is conservatively marked overdefined.  The disabled code below
+// sketches the intended constant folding once vectors are supported.
+void SCCPSolver::visitExtractElementInst(ExtractElementInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &IdxState = getValueState(I.getOperand(1));
+
+  if (ValState.isOverdefined() || IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getExtractElement(ValState.getConstant(),
+                                                     IdxState.getConstant()));
+#endif
+}
+
+// insertelement: SCCP does not currently reason about vector values, so the
+// result is conservatively marked overdefined.  The disabled code below
+// sketches the intended constant folding once vectors are supported.
+void SCCPSolver::visitInsertElementInst(InsertElementInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+#if 0
+  LatticeVal &ValState = getValueState(I.getOperand(0));
+  LatticeVal &EltState = getValueState(I.getOperand(1));
+  LatticeVal &IdxState = getValueState(I.getOperand(2));
+
+  if (ValState.isOverdefined() || EltState.isOverdefined() ||
+      IdxState.isOverdefined())
+    markOverdefined(&I);
+  else if(ValState.isConstant() && EltState.isConstant() &&
+          IdxState.isConstant())
+    markConstant(&I, ConstantExpr::getInsertElement(ValState.getConstant(),
+                                                    EltState.getConstant(),
+                                                    IdxState.getConstant()));
+  else if (ValState.isUndefined() && EltState.isConstant() &&
+           IdxState.isConstant()) 
+    markConstant(&I,ConstantExpr::getInsertElement(UndefValue::get(I.getType()),
+                                                   EltState.getConstant(),
+                                                   IdxState.getConstant()));
+#endif
+}
+
+// shufflevector: SCCP does not currently reason about vector values, so the
+// result is conservatively marked overdefined.  The disabled code below
+// sketches the intended constant folding once vectors are supported.
+void SCCPSolver::visitShuffleVectorInst(ShuffleVectorInst &I) {
+  // FIXME : SCCP does not handle vectors properly.
+  markOverdefined(&I);
+  return;
+#if 0
+  LatticeVal &V1State   = getValueState(I.getOperand(0));
+  LatticeVal &V2State   = getValueState(I.getOperand(1));
+  LatticeVal &MaskState = getValueState(I.getOperand(2));
+
+  if (MaskState.isUndefined() ||
+      (V1State.isUndefined() && V2State.isUndefined()))
+    return;  // Undefined output if mask or both inputs undefined.
+  
+  if (V1State.isOverdefined() || V2State.isOverdefined() ||
+      MaskState.isOverdefined()) {
+    markOverdefined(&I);
+  } else {
+    // A mix of constant/undef inputs.
+    Constant *V1 = V1State.isConstant() ? 
+        V1State.getConstant() : UndefValue::get(I.getType());
+    Constant *V2 = V2State.isConstant() ? 
+        V2State.getConstant() : UndefValue::get(I.getType());
+    Constant *Mask = MaskState.isConstant() ? 
+      MaskState.getConstant() : UndefValue::get(I.getOperand(2)->getType());
+    markConstant(&I, ConstantExpr::getShuffleVector(V1, V2, Mask));
+  }
+#endif
+}
+
+// Handle getelementptr instructions: when every operand (pointer and indices)
+// has folded to a lattice constant, the whole instruction folds to a
+// getelementptr ConstantExpr.
+//
+void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  // Gather the constant for every operand.  Any still-undefined operand means
+  // we revisit later; any overdefined operand forces the result overdefined.
+  SmallVector<Constant*, 8> Ops;
+  Ops.reserve(I.getNumOperands());
+
+  for (unsigned Idx = 0, NumOps = I.getNumOperands(); Idx != NumOps; ++Idx) {
+    LatticeVal &OpState = getValueState(I.getOperand(Idx));
+    if (OpState.isUndefined())
+      return;  // Operands are not resolved yet...
+    if (OpState.isOverdefined()) {
+      markOverdefined(IV, &I);
+      return;
+    }
+    assert(OpState.isConstant() && "Unknown state!");
+    Ops.push_back(OpState.getConstant());
+  }
+
+  // The first operand is the base pointer; the remainder are the indices.
+  Constant *BasePtr = Ops[0];
+  Ops.erase(Ops.begin());
+
+  markConstant(IV, &I,
+               ConstantExpr::getGetElementPtr(BasePtr, &Ops[0], Ops.size()));
+}
+
+// Stores are only interesting when they write to a global variable whose
+// value the solver is tracking; in that case the stored lattice value is
+// merged into the global's state.
+void SCCPSolver::visitStoreInst(Instruction &SI) {
+  if (TrackedGlobals.empty())
+    return;
+  GlobalVariable *GV = dyn_cast<GlobalVariable>(SI.getOperand(1));
+  if (!GV)
+    return;
+  DenseMap<GlobalVariable*, LatticeVal>::iterator TGI = TrackedGlobals.find(GV);
+  if (TGI == TrackedGlobals.end() || TGI->second.isOverdefined())
+    return;
+
+  // Merge the lattice state of the stored operand into the global's state.
+  LatticeVal &StoredVal = getValueState(SI.getOperand(0));
+  mergeInValue(TGI->second, GV, StoredVal);
+
+  // Once the global goes overdefined it can never improve, so stop tracking.
+  if (TGI->second.isOverdefined())
+    TrackedGlobals.erase(TGI);
+}
+
+
+// Handle load instructions.  If the operand is a constant pointer to a constant
+// global, we can replace the load with the loaded constant value!
+void SCCPSolver::visitLoadInst(LoadInst &I) {
+  LatticeVal &IV = ValueState[&I];
+  if (IV.isOverdefined()) return;
+
+  LatticeVal &PtrVal = getValueState(I.getOperand(0));
+  if (PtrVal.isUndefined()) return;   // The pointer is not resolved yet!
+  if (PtrVal.isConstant() && !I.isVolatile()) {
+    Value *Ptr = PtrVal.getConstant();
+    if (isa<ConstantPointerNull>(Ptr)) {
+      // load null -> null
+      markConstant(IV, &I, Constant::getNullValue(I.getType()));
+      return;
+    }
+
+    // Transform load (constant global) into the value loaded.
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Ptr)) {
+      if (GV->isConstant()) {
+        if (!GV->isDeclaration()) {
+          markConstant(IV, &I, GV->getInitializer());
+          return;
+        }
+      } else if (!TrackedGlobals.empty()) {
+        // If we are tracking this global, merge in the known value for it.
+        DenseMap<GlobalVariable*, LatticeVal>::iterator It =
+          TrackedGlobals.find(GV);
+        if (It != TrackedGlobals.end()) {
+          mergeInValue(IV, &I, It->second);
+          return;
+        }
+      }
+    }
+
+    // Transform load (constantexpr_GEP global, 0, ...) into the value loaded.
+    if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Ptr))
+      if (CE->getOpcode() == Instruction::GetElementPtr)
+    if (GlobalVariable *GV = dyn_cast<GlobalVariable>(CE->getOperand(0)))
+      if (GV->isConstant() && !GV->isDeclaration())
+        if (Constant *V =
+             ConstantFoldLoadThroughGEPConstantExpr(GV->getInitializer(), CE)) {
+          markConstant(IV, &I, V);
+          return;
+        }
+  }
+
+  // Otherwise we cannot say for certain what value this load will produce.
+  // Bail out.
+  markOverdefined(IV, &I);
+}
+
+// visitCallSite - Handle a call or invoke.  For internal functions whose
+// return value is tracked, the actual-argument lattice states are merged into
+// the callee's formals and the tracked return value is merged into the call's
+// result.  Calls to foldable external declarations are constant folded when
+// every argument is a lattice constant; everything else goes overdefined.
+void SCCPSolver::visitCallSite(CallSite CS) {
+  Function *F = CS.getCalledFunction();
+
+  // If we are tracking this function, we must make sure to bind arguments as
+  // appropriate.
+  DenseMap<Function*, LatticeVal>::iterator TFRVI =TrackedFunctionRetVals.end();
+  if (F && F->hasInternalLinkage())
+    TFRVI = TrackedFunctionRetVals.find(F);
+
+  if (TFRVI != TrackedFunctionRetVals.end()) {
+    // If this is the first call to the function hit, mark its entry block
+    // executable.
+    if (!BBExecutable.count(F->begin()))
+      MarkBlockExecutable(F->begin());
+
+    // Merge the state of each actual argument into the matching formal.
+    CallSite::arg_iterator CAI = CS.arg_begin();
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI, ++CAI) {
+      LatticeVal &IV = ValueState[AI];
+      if (!IV.isOverdefined())
+        mergeInValue(IV, AI, getValueState(*CAI));
+    }
+  }
+  Instruction *I = CS.getInstruction();
+  if (I->getType() == Type::VoidTy) return;  // No result to propagate.
+
+  LatticeVal &IV = ValueState[I];
+  if (IV.isOverdefined()) return;
+
+  // Propagate the return value of the function to the value of the instruction.
+  if (TFRVI != TrackedFunctionRetVals.end()) {
+    mergeInValue(IV, I, TFRVI->second);
+    return;
+  }
+
+  // Indirect calls, defined functions we do not track, and declarations that
+  // cannot be constant folded all produce an unknown result.
+  if (F == 0 || !F->isDeclaration() || !canConstantFoldCallTo(F)) {
+    markOverdefined(IV, I);
+    return;
+  }
+
+  // Collect the constant arguments; bail if any is unresolved or overdefined.
+  SmallVector<Constant*, 8> Operands;
+  Operands.reserve(I->getNumOperands()-1);
+
+  for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
+       AI != E; ++AI) {
+    LatticeVal &State = getValueState(*AI);
+    if (State.isUndefined())
+      return;  // Operands are not resolved yet...
+    else if (State.isOverdefined()) {
+      markOverdefined(IV, I);
+      return;
+    }
+    assert(State.isConstant() && "Unknown state!");
+    Operands.push_back(State.getConstant());
+  }
+
+  if (Constant *C = ConstantFoldCall(F, &Operands[0], Operands.size()))
+    markConstant(IV, I, C);
+  else
+    markOverdefined(IV, I);
+}
+
+
+// Solve - Run the solver to a fixed point: drain the overdefined-value,
+// changed-value, and newly-executable-block worklists until all are empty.
+void SCCPSolver::Solve() {
+  // Process the work lists until they are empty!
+  while (!BBWorkList.empty() || !InstWorkList.empty() ||
+         !OverdefinedInstWorkList.empty()) {
+    // Process the overdefined instruction work list...
+    while (!OverdefinedInstWorkList.empty()) {
+      Value *I = OverdefinedInstWorkList.back();
+      OverdefinedInstWorkList.pop_back();
+
+      DOUT << "\nPopped off OI-WL: " << *I;
+
+      // "I" got into the work list because it made the transition to
+      // overdefined.  Notify all of its users so they can re-evaluate their
+      // own lattice state.
+      //
+      for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+           UI != E; ++UI)
+        OperandChangedState(*UI);
+    }
+    // Process the instruction work list...
+    while (!InstWorkList.empty()) {
+      Value *I = InstWorkList.back();
+      InstWorkList.pop_back();
+
+      DOUT << "\nPopped off I-WL: " << *I;
+
+      // "I" got into the work list because it made the transition from
+      // undefined to constant.
+      //
+      // Anything on this worklist that is overdefined need not be visited
+      // since all of its users will have already been marked as overdefined.
+      // Update all of the users of this instruction's value...
+      //
+      if (!getValueState(I).isOverdefined())
+        for (Value::use_iterator UI = I->use_begin(), E = I->use_end();
+             UI != E; ++UI)
+          OperandChangedState(*UI);
+    }
+
+    // Process the basic block work list...
+    while (!BBWorkList.empty()) {
+      BasicBlock *BB = BBWorkList.back();
+      BBWorkList.pop_back();
+
+      DOUT << "\nPopped off BBWL: " << *BB;
+
+      // Notify all instructions in this basic block that they are newly
+      // executable.
+      visit(BB);
+    }
+  }
+}
+
+/// ResolvedUndefsIn - While solving the dataflow for a function, we assume
+/// that branches on undef values cannot reach any of their successors.
+/// However, this is not a safe assumption.  After we solve dataflow, this
+/// method should be used to handle this.  If this returns true, the solver
+/// should be rerun.
+///
+/// This method handles this by finding an unresolved branch and marking one
+/// of the edges from the block as being feasible, even though the condition
+/// doesn't say it would otherwise be.  This allows SCCP to find the rest of the
+/// CFG and only slightly pessimizes the analysis results (by marking one,
+/// potentially infeasible, edge feasible).  This cannot usefully modify the
+/// constraints on the condition of the branch, as that would impact other users
+/// of the value.
+///
+/// This scan also checks for values that use undefs, whose results are actually
+/// defined.  For example, 'zext i8 undef to i32' should produce all zeros
+/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
+/// even if X isn't defined.
+bool SCCPSolver::ResolvedUndefsIn(Function &F) {
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    if (!BBExecutable.count(BB))
+      continue;  // Only reachable blocks matter.
+    
+    for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) {
+      // Look for instructions which produce undef values.
+      if (I->getType() == Type::VoidTy) continue;
+      
+      LatticeVal &LV = getValueState(I);
+      if (!LV.isUndefined()) continue;
+
+      // Get the lattice values of the first two operands for use below.
+      LatticeVal &Op0LV = getValueState(I->getOperand(0));
+      LatticeVal Op1LV;
+      if (I->getNumOperands() == 2) {
+        // If this is a two-operand instruction, and if both operands are
+        // undefs, the result stays undef.
+        Op1LV = getValueState(I->getOperand(1));
+        if (Op0LV.isUndefined() && Op1LV.isUndefined())
+          continue;
+      }
+      
+      // If this is an instructions whose result is defined even if the input is
+      // not fully defined, propagate the information.
+      const Type *ITy = I->getType();
+      switch (I->getOpcode()) {
+      default: break;          // Leave the instruction as an undef.
+      case Instruction::ZExt:
+        // After a zero extend, we know the top part is zero.  SExt doesn't have
+        // to be handled here, because we don't know whether the top part is 1's
+        // or 0's.
+        assert(Op0LV.isUndefined());
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Mul:
+      case Instruction::And:
+        // undef * X -> 0.   X could be zero.
+        // undef & X -> 0.   X could be zero.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+
+      case Instruction::Or:
+        // undef | X -> -1.   X could be -1.
+        if (const VectorType *PTy = dyn_cast<VectorType>(ITy))
+          markForcedConstant(LV, I, ConstantVector::getAllOnesValue(PTy));
+        else          
+          markForcedConstant(LV, I, ConstantInt::getAllOnesValue(ITy));
+        return true;
+
+      case Instruction::SDiv:
+      case Instruction::UDiv:
+      case Instruction::SRem:
+      case Instruction::URem:
+        // X / undef -> undef.  No change.
+        // X % undef -> undef.  No change.
+        if (Op1LV.isUndefined()) break;
+        
+        // undef / X -> 0.   X could be maxint.
+        // undef % X -> 0.   X could be 1.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+        
+      case Instruction::AShr:
+        // undef >>s X -> undef.  No change.
+        if (Op0LV.isUndefined()) break;
+        
+        // X >>s undef -> X.  X could be 0, X could have the high-bit known set.
+        if (Op0LV.isConstant())
+          markForcedConstant(LV, I, Op0LV.getConstant());
+        else
+          markOverdefined(LV, I);
+        return true;
+      case Instruction::LShr:
+      case Instruction::Shl:
+        // undef >> X -> undef.  No change.
+        // undef << X -> undef.  No change.
+        if (Op0LV.isUndefined()) break;
+        
+        // X >> undef -> 0.  X could be 0.
+        // X << undef -> 0.  X could be 0.
+        markForcedConstant(LV, I, Constant::getNullValue(ITy));
+        return true;
+      case Instruction::Select:
+        // undef ? X : Y  -> X or Y.  There could be commonality between X/Y.
+        if (Op0LV.isUndefined()) {
+          if (!Op1LV.isConstant())  // Pick the constant one if there is any.
+            Op1LV = getValueState(I->getOperand(2));
+        } else if (Op1LV.isUndefined()) {
+          // c ? undef : undef -> undef.  No change.
+          Op1LV = getValueState(I->getOperand(2));
+          if (Op1LV.isUndefined())
+            break;
+          // Otherwise, c ? undef : x -> x.
+        } else {
+          // Leave Op1LV as Operand(1)'s LatticeValue.
+        }
+        
+        if (Op1LV.isConstant())
+          markForcedConstant(LV, I, Op1LV.getConstant());
+        else
+          markOverdefined(LV, I);
+        return true;
+      }
+    }
+  
+    // Look for a conditional branch or switch on an unresolved undef
+    // condition; all other terminators (or resolved conditions) are skipped.
+    TerminatorInst *TI = BB->getTerminator();
+    if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
+      if (!BI->isConditional()) continue;
+      if (!getValueState(BI->getCondition()).isUndefined())
+        continue;
+    } else if (SwitchInst *SI = dyn_cast<SwitchInst>(TI)) {
+      if (!getValueState(SI->getCondition()).isUndefined())
+        continue;
+    } else {
+      continue;
+    }
+    
+    // If the edge to the first successor isn't thought to be feasible yet, mark
+    // it so now.
+    if (KnownFeasibleEdges.count(Edge(BB, TI->getSuccessor(0))))
+      continue;
+    
+    // Otherwise, it isn't already thought to be feasible.  Mark it as such now
+    // and return.  This will make other blocks reachable, which will allow new
+    // values to be discovered and existing ones to be moved in the lattice.
+    markEdgeExecutable(BB, TI->getSuccessor(0));
+    return true;
+  }
+
+  return false;
+}
+
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// SCCP Class - This class uses the SCCPSolver to implement a per-function
+  /// Sparse Conditional Constant Propagator.
+  ///
+  struct VISIBILITY_HIDDEN SCCP : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    SCCP() : FunctionPass((intptr_t)&ID) {}
+
+    // runOnFunction - Run the Sparse Conditional Constant Propagation
+    // algorithm, and return true if the function was modified.
+    //
+    bool runOnFunction(Function &F);
+
+    // This pass only removes or replaces instructions; it never alters the
+    // shape of the CFG.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.setPreservesCFG();
+    }
+  };
+
+  char SCCP::ID = 0;
+  // Register the pass so it is selectable as -sccp.
+  RegisterPass<SCCP> X("sccp", "Sparse Conditional Constant Propagation");
+} // end anonymous namespace
+
+
+// createSCCPPass - This is the public interface to this file: a factory for
+// the function-level SCCP pass declared in the anonymous namespace above.
+FunctionPass *llvm::createSCCPPass() {
+  return new SCCP();
+}
+
+
+// runOnFunction() - Run the Sparse Conditional Constant Propagation algorithm,
+// and return true if the function was modified.
+//
+bool SCCP::runOnFunction(Function &F) {
+  DOUT << "SCCP on function '" << F.getName() << "'\n";
+  SCCPSolver Solver;
+
+  // Mark the first block of the function as being executable.
+  Solver.MarkBlockExecutable(F.begin());
+
+  // Mark all arguments to the function as being overdefined.
+  for (Function::arg_iterator AI = F.arg_begin(), E = F.arg_end(); AI != E;++AI)
+    Solver.markOverdefined(AI);
+
+  // Solve for constants.  Whenever the solver relied on an unresolved undef,
+  // force it to a concrete value and re-solve until a true fixed point.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+    DOUT << "RESOLVING UNDEFs\n";
+    ResolvedUndefs = Solver.ResolvedUndefsIn(F);
+  }
+
+  bool MadeChanges = false;
+
+  // If we decided that there are basic blocks that are dead in this function,
+  // delete their contents now.  Note that we cannot actually delete the blocks,
+  // as we cannot modify the CFG of the function.
+  //
+  SmallSet<BasicBlock*, 16> &ExecutableBBs = Solver.getExecutableBlocks();
+  SmallVector<Instruction*, 32> Insts;
+  std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (!ExecutableBBs.count(BB)) {
+      DOUT << "  BasicBlock Dead:" << *BB;
+      ++NumDeadBlocks;
+
+      // Delete the instructions backwards, as it has a reduced likelihood of
+      // having to update as many def-use and use-def chains.  The terminator
+      // is excluded, keeping the CFG intact.
+      for (BasicBlock::iterator I = BB->begin(), E = BB->getTerminator();
+           I != E; ++I)
+        Insts.push_back(I);
+      while (!Insts.empty()) {
+        Instruction *I = Insts.back();
+        Insts.pop_back();
+        if (!I->use_empty())
+          I->replaceAllUsesWith(UndefValue::get(I->getType()));
+        BB->getInstList().erase(I);
+        MadeChanges = true;
+        ++NumInstRemoved;
+      }
+    } else {
+      // Iterate over all of the instructions in a function, replacing them with
+      // constants if we have found them to be of constant values.
+      //
+      for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+        Instruction *Inst = BI++;  // Advance first: Inst may be erased below.
+        if (Inst->getType() != Type::VoidTy) {
+          LatticeVal &IV = Values[Inst];
+          if ((IV.isConstant() || IV.isUndefined()) &&
+              !isa<TerminatorInst>(Inst)) {
+            Constant *Const = IV.isConstant()
+              ? IV.getConstant() : UndefValue::get(Inst->getType());
+            DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+            // Replaces all of the uses of a variable with uses of the constant.
+            Inst->replaceAllUsesWith(Const);
+
+            // Delete the instruction.
+            BB->getInstList().erase(Inst);
+
+            // Hey, we just changed something!
+            MadeChanges = true;
+            ++NumInstRemoved;
+          }
+        }
+      }
+    }
+
+  return MadeChanges;
+}
+
+namespace {
+  //===--------------------------------------------------------------------===//
+  //
+  /// IPSCCP Class - This class implements interprocedural Sparse Conditional
+  /// Constant Propagation.
+  ///
+  struct VISIBILITY_HIDDEN IPSCCP : public ModulePass {
+    static char ID; // Pass identification, replacement for typeid
+    IPSCCP() : ModulePass((intptr_t)&ID) {}
+
+    // runOnModule - Solve the module-wide lattice and rewrite what was proven
+    // constant; returns true if the module was modified.
+    bool runOnModule(Module &M);
+  };
+
+  char IPSCCP::ID = 0;
+  // Register the pass so it is selectable as -ipsccp.
+  RegisterPass<IPSCCP>
+  Y("ipsccp", "Interprocedural Sparse Conditional Constant Propagation");
+} // end anonymous namespace
+
+// createIPSCCPPass - This is the public interface to this file: a factory for
+// the interprocedural SCCP module pass declared above.
+ModulePass *llvm::createIPSCCPPass() {
+  return new IPSCCP();
+}
+
+
+// AddressIsTaken - Return true if the address of the given global escapes:
+// any use other than a direct (non-volatile) load, a non-volatile store *to*
+// it, or being the callee of a call/invoke counts as taking the address.
+static bool AddressIsTaken(GlobalValue *GV) {
+  // Strip off any dead constant-expression users first so they don't count.
+  GV->removeDeadConstantUsers();
+
+  for (Value::use_iterator UI = GV->use_begin(), E = GV->use_end();
+       UI != E; ++UI) {
+    User *U = *UI;
+    if (StoreInst *SI = dyn_cast<StoreInst>(U)) {
+      // Storing the global's address (or any volatile store) escapes it.
+      if (SI->getOperand(0) == GV || SI->isVolatile())
+        return true;
+    } else if (LoadInst *LI = dyn_cast<LoadInst>(U)) {
+      if (LI->isVolatile())
+        return true;
+    } else if (isa<CallInst>(U) || isa<InvokeInst>(U)) {
+      // Make sure we are calling the function, not passing the address.
+      CallSite CS = CallSite::get(cast<Instruction>(U));
+      for (CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+           AI != AE; ++AI)
+        if (*AI == GV)
+          return true;
+    } else {
+      // Any other kind of user escapes the address.
+      return true;
+    }
+  }
+  return false;
+}
+
+// runOnModule - Run interprocedural SCCP: solve the lattice across the whole
+// module (tracking internal functions' arguments/return values and internal,
+// non-address-taken globals), then rewrite everything proven constant.
+// Returns true if the module was modified.
+bool IPSCCP::runOnModule(Module &M) {
+  SCCPSolver Solver;
+
+  // Loop over all functions, marking arguments to those with their addresses
+  // taken or that are external as overdefined.
+  //
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+    if (!F->hasInternalLinkage() || AddressIsTaken(F)) {
+      // Externally visible or indirectly callable: anything may be passed in,
+      // so the entry is reachable and the arguments are unknown.
+      if (!F->isDeclaration())
+        Solver.MarkBlockExecutable(F->begin());
+      for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+           AI != E; ++AI)
+        Solver.markOverdefined(AI);
+    } else {
+      Solver.AddTrackedFunction(F);
+    }
+
+  // Loop over global variables.  We inform the solver about any internal global
+  // variables that do not have their 'addresses taken'.  If they don't have
+  // their addresses taken, we can propagate constants through them.
+  for (Module::global_iterator G = M.global_begin(), E = M.global_end();
+       G != E; ++G)
+    if (!G->isConstant() && G->hasInternalLinkage() && !AddressIsTaken(G))
+      Solver.TrackValueOfGlobalVariable(G);
+
+  // Solve for constants.  Whenever the solver relied on an unresolved undef,
+  // force it to a concrete value and re-solve until a true fixed point.
+  bool ResolvedUndefs = true;
+  while (ResolvedUndefs) {
+    Solver.Solve();
+
+    DOUT << "RESOLVING UNDEFS\n";
+    ResolvedUndefs = false;
+    for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F)
+      ResolvedUndefs |= Solver.ResolvedUndefsIn(*F);
+  }
+
+  bool MadeChanges = false;
+
+  // Iterate over all of the instructions in the module, replacing them with
+  // constants if we have found them to be of constant values.
+  //
+  SmallSet<BasicBlock*, 16> &ExecutableBBs = Solver.getExecutableBlocks();
+  SmallVector<Instruction*, 32> Insts;
+  SmallVector<BasicBlock*, 32> BlocksToErase;
+  std::map<Value*, LatticeVal> &Values = Solver.getValueMapping();
+
+  for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) {
+    // Replace arguments proven constant (or never given a defined value).
+    for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
+         AI != E; ++AI)
+      if (!AI->use_empty()) {
+        LatticeVal &IV = Values[AI];
+        if (IV.isConstant() || IV.isUndefined()) {
+          Constant *CST = IV.isConstant() ?
+            IV.getConstant() : UndefValue::get(AI->getType());
+          DOUT << "***  Arg " << *AI << " = " << *CST <<"\n";
+
+          // Replaces all of the uses of a variable with uses of the
+          // constant.
+          AI->replaceAllUsesWith(CST);
+          ++IPNumArgsElimed;
+        }
+      }
+
+    for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+      if (!ExecutableBBs.count(BB)) {
+        DOUT << "  BasicBlock Dead:" << *BB;
+        ++IPNumDeadBlocks;
+
+        // Delete the instructions backwards, as it has a reduced likelihood of
+        // having to update as many def-use and use-def chains.
+        TerminatorInst *TI = BB->getTerminator();
+        for (BasicBlock::iterator I = BB->begin(), E = TI; I != E; ++I)
+          Insts.push_back(I);
+
+        while (!Insts.empty()) {
+          Instruction *I = Insts.back();
+          Insts.pop_back();
+          if (!I->use_empty())
+            I->replaceAllUsesWith(UndefValue::get(I->getType()));
+          BB->getInstList().erase(I);
+          MadeChanges = true;
+          ++IPNumInstRemoved;
+        }
+
+        // Drop this dead block's entries from successor PHI nodes before
+        // removing its terminator.
+        for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
+          BasicBlock *Succ = TI->getSuccessor(i);
+          if (Succ->begin() != Succ->end() && isa<PHINode>(Succ->begin()))
+            TI->getSuccessor(i)->removePredecessor(BB);
+        }
+        if (!TI->use_empty())
+          TI->replaceAllUsesWith(UndefValue::get(TI->getType()));
+        BB->getInstList().erase(TI);
+
+        // The entry block cannot be erased; make it unreachable instead.
+        if (&*BB != &F->front())
+          BlocksToErase.push_back(BB);
+        else
+          new UnreachableInst(BB);
+
+      } else {
+        for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
+          Instruction *Inst = BI++;  // Advance first: Inst may be erased below.
+          if (Inst->getType() != Type::VoidTy) {
+            LatticeVal &IV = Values[Inst];
+            // Note the parentheses: without them this parses as
+            // "constant || (undefined && !terminator)" due to precedence, which
+            // would let constant terminators (e.g. invokes) through.  This must
+            // match the equivalent test in SCCP::runOnFunction.
+            if ((IV.isConstant() || IV.isUndefined()) &&
+                !isa<TerminatorInst>(Inst)) {
+              Constant *Const = IV.isConstant()
+                ? IV.getConstant() : UndefValue::get(Inst->getType());
+              DOUT << "  Constant: " << *Const << " = " << *Inst;
+
+              // Replaces all of the uses of a variable with uses of the
+              // constant.
+              Inst->replaceAllUsesWith(Const);
+
+              // Delete the instruction.  Calls are kept: they may have side
+              // effects even when their return value is known.
+              if (!isa<TerminatorInst>(Inst) && !isa<CallInst>(Inst))
+                BB->getInstList().erase(Inst);
+
+              // Hey, we just changed something!
+              MadeChanges = true;
+              ++IPNumInstRemoved;
+            }
+          }
+        }
+      }
+
+    // Now that all instructions in the function are constant folded, erase dead
+    // blocks, because we can now use ConstantFoldTerminator to get rid of
+    // in-edges.
+    for (unsigned i = 0, e = BlocksToErase.size(); i != e; ++i) {
+      // If there are any PHI nodes in this successor, drop entries for BB now.
+      BasicBlock *DeadBB = BlocksToErase[i];
+      while (!DeadBB->use_empty()) {
+        Instruction *I = cast<Instruction>(DeadBB->use_back());
+        bool Folded = ConstantFoldTerminator(I->getParent());
+        if (!Folded) {
+          // The constant folder may not have been able to fold the terminator
+          // if this is a branch or switch on undef.  Fold it manually as a
+          // branch to the first successor.
+          if (BranchInst *BI = dyn_cast<BranchInst>(I)) {
+            assert(BI->isConditional() && isa<UndefValue>(BI->getCondition()) &&
+                   "Branch should be foldable!");
+          } else if (SwitchInst *SI = dyn_cast<SwitchInst>(I)) {
+            assert(isa<UndefValue>(SI->getCondition()) && "Switch should fold");
+          } else {
+            assert(0 && "Didn't fold away reference to block!");
+          }
+
+          // Make this an uncond branch to the first successor.
+          TerminatorInst *TI = I->getParent()->getTerminator();
+          new BranchInst(TI->getSuccessor(0), TI);
+
+          // Remove entries in successor phi nodes to remove edges.
+          for (unsigned i = 1, e = TI->getNumSuccessors(); i != e; ++i)
+            TI->getSuccessor(i)->removePredecessor(TI->getParent());
+
+          // Remove the old terminator.
+          TI->eraseFromParent();
+        }
+      }
+
+      // Finally, delete the basic block.
+      F->getBasicBlockList().erase(DeadBB);
+    }
+    BlocksToErase.clear();
+  }
+
+  // If we inferred constant or undef return values for a function, we replaced
+  // all call uses with the inferred value.  This means we don't need to bother
+  // actually returning anything from the function.  Replace all return
+  // instructions with return undef.
+  const DenseMap<Function*, LatticeVal> &RV =Solver.getTrackedFunctionRetVals();
+  for (DenseMap<Function*, LatticeVal>::const_iterator I = RV.begin(),
+         E = RV.end(); I != E; ++I)
+    if (!I->second.isOverdefined() &&
+        I->first->getReturnType() != Type::VoidTy) {
+      Function *F = I->first;
+      for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB)
+        if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator()))
+          if (!isa<UndefValue>(RI->getOperand(0)))
+            RI->setOperand(0, UndefValue::get(F->getReturnType()));
+    }
+
+  // If we inferred constant or undef values for global variables, we can delete
+  // the global and any stores that remain to it.
+  const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
+  for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
+         E = TG.end(); I != E; ++I) {
+    GlobalVariable *GV = I->first;
+    assert(!I->second.isOverdefined() &&
+           "Overdefined values should have been taken out of the map!");
+    DOUT << "Found that GV '" << GV->getName()<< "' is constant!\n";
+    while (!GV->use_empty()) {
+      StoreInst *SI = cast<StoreInst>(GV->use_back());
+      SI->eraseFromParent();
+    }
+    M.getGlobalList().erase(GV);
+    ++IPNumGlobalConst;
+  }
+
+  return MadeChanges;
+}
diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
new file mode 100644
index 0000000..e303468
--- /dev/null
+++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp
@@ -0,0 +1,1335 @@
+//===- ScalarReplAggregates.cpp - Scalar Replacement of Aggregates --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This transformation implements the well known scalar replacement of
+// aggregates transformation.  This xform breaks up alloca instructions of
+// aggregate type (structure or array) into individual alloca instructions for
+// each member (if possible).  Then, if possible, it transforms the individual
+// alloca instructions into nice clean scalar SSA form.
+//
+// This combines a simple SRoA algorithm with the Mem2Reg algorithm because the
+// two often interact, especially for C++ programs.  As such, iterating between
+// SRoA, then Mem2Reg until we run out of things to promote works well.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "scalarrepl"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/GlobalVariable.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Analysis/Dominators.h"
+#include "llvm/Target/TargetData.h"
+#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/GetElementPtrTypeIterator.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringExtras.h"
+using namespace llvm;
+
+STATISTIC(NumReplaced,  "Number of allocas broken up");
+STATISTIC(NumPromoted,  "Number of allocas promoted");
+STATISTIC(NumConverted, "Number of aggregates converted to scalar");
+STATISTIC(NumGlobals,   "Number of allocas copied from constant global");
+
+namespace {
+  /// SROA - Scalar Replacement of Aggregates.  Breaks struct/array allocas
+  /// into one alloca per element (performScalarRepl) and promotes promotable
+  /// allocas to SSA registers (performPromotion), iterating between the two.
+  struct VISIBILITY_HIDDEN SROA : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    /// SROA - Constructor.  T is the scalar-replacement size threshold; the
+    /// sentinel -1 selects the default threshold of 128.
+    SROA(signed T = -1) : FunctionPass((intptr_t)&ID) {
+      if (T == -1)
+        SRThreshold = 128;
+      else
+        SRThreshold = T;
+    }
+
+    /// runOnFunction - Entry point; returns true if the function was changed.
+    bool runOnFunction(Function &F);
+
+    /// performScalarRepl - One round of splitting aggregate allocas apart.
+    bool performScalarRepl(Function &F);
+    /// performPromotion - Run mem2reg over promotable entry-block allocas.
+    bool performPromotion(Function &F);
+
+    // getAnalysisUsage - This pass does not require any passes, but we know it
+    // will not alter the CFG, so say so.
+    virtual void getAnalysisUsage(AnalysisUsage &AU) const {
+      AU.addRequired<DominatorTree>();
+      AU.addRequired<DominanceFrontier>();
+      AU.addRequired<TargetData>();
+      AU.setPreservesCFG();
+    }
+
+  private:
+    /// AllocaInfo - When analyzing uses of an alloca instruction, this captures
+    /// information about the uses.  All these fields are initialized to false
+    /// and set to true when something is learned.
+    struct AllocaInfo {
+      /// isUnsafe - This is set to true if the alloca cannot be SROA'd.
+      bool isUnsafe : 1;
+      
+      /// needsCanon - This is set to true if there is some use of the alloca
+      /// that requires canonicalization.
+      bool needsCanon : 1;
+      
+      /// isMemCpySrc - This is true if this aggregate is memcpy'd from.
+      bool isMemCpySrc : 1;
+
+      /// isMemCpyDst - This is true if this aggregate is memcpy'd into.
+      bool isMemCpyDst : 1;
+
+      AllocaInfo()
+        : isUnsafe(false), needsCanon(false), 
+          isMemCpySrc(false), isMemCpyDst(false) {}
+    };
+    
+    /// SRThreshold - Aggregates whose TargetData::getTypeSize is not below
+    /// this value are not scalar-replaced (see performScalarRepl).
+    unsigned SRThreshold;
+
+    /// MarkUnsafe - Record that the alloca under analysis cannot be SROA'd.
+    void MarkUnsafe(AllocaInfo &I) { I.isUnsafe = true; }
+
+    int isSafeAllocaToScalarRepl(AllocationInst *AI);
+
+    void isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+                               AllocaInfo &Info);
+    void isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+                         AllocaInfo &Info);
+    void isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+                                        unsigned OpNo, AllocaInfo &Info);
+    void isSafeUseOfBitCastedAllocation(BitCastInst *User, AllocationInst *AI,
+                                        AllocaInfo &Info);
+    
+    void DoScalarReplacement(AllocationInst *AI, 
+                             std::vector<AllocationInst*> &WorkList);
+    void CanonicalizeAllocaUsers(AllocationInst *AI);
+    AllocaInst *AddNewAlloca(Function &F, const Type *Ty, AllocationInst *Base);
+    
+    void RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+                                    SmallVector<AllocaInst*, 32> &NewElts);
+    
+    const Type *CanConvertToScalar(Value *V, bool &IsNotTrivial);
+    void ConvertToScalar(AllocationInst *AI, const Type *Ty);
+    void ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset);
+    static Instruction *isOnlyCopiedFromConstantGlobal(AllocationInst *AI);
+  };
+
+  char SROA::ID = 0;
+  RegisterPass<SROA> X("scalarrepl", "Scalar Replacement of Aggregates");
+}
+
+// Public interface to the ScalarReplAggregates pass
+FunctionPass *llvm::createScalarReplAggregatesPass(signed int Threshold) { 
+  // Hand back a fresh SROA instance configured with the requested threshold.
+  SROA *Result = new SROA(Threshold);
+  return Result;
+}
+
+
+/// runOnFunction - Start by promoting any trivially-promotable allocas, then
+/// alternate between scalar replacement and promotion until neither makes
+/// further progress.  Returns true if the function was modified at all.
+bool SROA::runOnFunction(Function &F) {
+  bool MadeChange = performPromotion(F);
+  for (;;) {
+    if (!performScalarRepl(F))
+      break;               // Nothing was split; no need to repromote.
+    MadeChange = true;
+    if (!performPromotion(F))
+      break;               // Nothing was promoted; no need to re-split.
+  }
+
+  return MadeChange;
+}
+
+
+/// performPromotion - Repeatedly run mem2reg over the promotable allocas in
+/// the entry block until none remain.  Returns true if anything was promoted.
+bool SROA::performPromotion(Function &F) {
+  DominatorTree         &DT = getAnalysis<DominatorTree>();
+  DominanceFrontier &DF = getAnalysis<DominanceFrontier>();
+
+  // All candidate allocas live in the function's entry block.
+  BasicBlock &EntryBB = F.getEntryBlock();
+
+  bool MadeChange = false;
+  std::vector<AllocaInst*> PromotableAllocas;
+
+  for (;;) {
+    PromotableAllocas.clear();
+
+    // Collect every alloca in the entry block (stopping short of the
+    // terminator) that mem2reg is able to handle.
+    for (BasicBlock::iterator It = EntryBB.begin(), End = --EntryBB.end();
+         It != End; ++It)
+      if (AllocaInst *AI = dyn_cast<AllocaInst>(It))
+        if (isAllocaPromotable(AI))
+          PromotableAllocas.push_back(AI);
+
+    if (PromotableAllocas.empty())
+      break;
+
+    PromoteMemToReg(PromotableAllocas, DT, DF);
+    NumPromoted += PromotableAllocas.size();
+    MadeChange = true;
+  }
+
+  return MadeChange;
+}
+
+// performScalarRepl - This algorithm is a simple worklist driven algorithm,
+// which runs on all of the malloc/alloca instructions in the function, removing
+// them if they are only used by getelementptr instructions.
+//
+// Returns true if any allocation was rewritten or removed.
+bool SROA::performScalarRepl(Function &F) {
+  std::vector<AllocationInst*> WorkList;
+
+  // Scan the entry basic block, adding any alloca's and mallocs to the worklist
+  BasicBlock &BB = F.getEntryBlock();
+  for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ++I)
+    if (AllocationInst *A = dyn_cast<AllocationInst>(I))
+      WorkList.push_back(A);
+
+  const TargetData &TD = getAnalysis<TargetData>();
+  
+  // Process the worklist
+  bool Changed = false;
+  while (!WorkList.empty()) {
+    AllocationInst *AI = WorkList.back();
+    WorkList.pop_back();
+    
+    // Handle dead allocas trivially.  These can be formed by SROA'ing arrays
+    // with unused elements.
+    if (AI->use_empty()) {
+      AI->eraseFromParent();
+      continue;
+    }
+    
+    // If we can turn this aggregate value (potentially with casts) into a
+    // simple scalar value that can be mem2reg'd into a register value.
+    bool IsNotTrivial = false;
+    if (const Type *ActualType = CanConvertToScalar(AI, IsNotTrivial))
+      if (IsNotTrivial && ActualType != Type::VoidTy) {
+        ConvertToScalar(AI, ActualType);
+        Changed = true;
+        continue;
+      }
+
+    // Check to see if we can perform the core SROA transformation.  We cannot
+    // transform the allocation instruction if it is an array allocation
+    // (allocations OF arrays are ok though), and an allocation of a scalar
+    // value cannot be decomposed at all.  Aggregates at or above SRThreshold
+    // bytes are also left alone.
+    if (!AI->isArrayAllocation() &&
+        (isa<StructType>(AI->getAllocatedType()) ||
+         isa<ArrayType>(AI->getAllocatedType())) &&
+        AI->getAllocatedType()->isSized() &&
+        TD.getTypeSize(AI->getAllocatedType()) < SRThreshold) {
+      // Check that all of the users of the allocation are capable of being
+      // transformed.
+      switch (isSafeAllocaToScalarRepl(AI)) {
+      default: assert(0 && "Unexpected value!");
+      case 0:  // Not safe to scalar replace.
+        break;
+      case 1:  // Safe, but requires cleanup/canonicalizations first
+        CanonicalizeAllocaUsers(AI);
+        // FALL THROUGH.
+      case 3:  // Safe to scalar replace.
+        DoScalarReplacement(AI, WorkList);
+        Changed = true;
+        continue;
+      }
+    }
+    
+    // Check to see if this allocation is only modified by a memcpy/memmove from
+    // a constant global.  If this is the case, we can change all users to use
+    // the constant global instead.  This is commonly produced by the CFE by
+    // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
+    // is only subsequently read.
+    if (Instruction *TheCopy = isOnlyCopiedFromConstantGlobal(AI)) {
+      DOUT << "Found alloca equal to global: " << *AI;
+      DOUT << "  memcpy = " << *TheCopy;
+      // Operand 2 of the memcpy is its source; redirect all alloca users at
+      // the (read-only) global and drop both the copy and the alloca.
+      Constant *TheSrc = cast<Constant>(TheCopy->getOperand(2));
+      AI->replaceAllUsesWith(ConstantExpr::getBitCast(TheSrc, AI->getType()));
+      TheCopy->eraseFromParent();  // Don't mutate the global.
+      AI->eraseFromParent();
+      ++NumGlobals;
+      Changed = true;
+      continue;
+    }
+        
+    // Otherwise, couldn't process this.
+  }
+
+  return Changed;
+}
+
+/// DoScalarReplacement - This alloca satisfied the isSafeAllocaToScalarRepl
+/// predicate, do SROA now.  Creates one new alloca per element of the
+/// aggregate, rewrites every user of AI onto the element allocas, then
+/// deletes AI.  The new allocas are pushed onto WorkList so they can be
+/// recursively split in a later iteration.
+void SROA::DoScalarReplacement(AllocationInst *AI, 
+                               std::vector<AllocationInst*> &WorkList) {
+  DOUT << "Found inst to SROA: " << *AI;
+  SmallVector<AllocaInst*, 32> ElementAllocas;
+  if (const StructType *ST = dyn_cast<StructType>(AI->getAllocatedType())) {
+    // Struct: one alloca per field, inheriting AI's alignment and name.
+    ElementAllocas.reserve(ST->getNumContainedTypes());
+    for (unsigned i = 0, e = ST->getNumContainedTypes(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ST->getContainedType(i), 0, 
+                                      AI->getAlignment(),
+                                      AI->getName() + "." + utostr(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  } else {
+    // Array: one alloca per element, all of the same element type.
+    const ArrayType *AT = cast<ArrayType>(AI->getAllocatedType());
+    ElementAllocas.reserve(AT->getNumElements());
+    const Type *ElTy = AT->getElementType();
+    for (unsigned i = 0, e = AT->getNumElements(); i != e; ++i) {
+      AllocaInst *NA = new AllocaInst(ElTy, 0, AI->getAlignment(),
+                                      AI->getName() + "." + utostr(i), AI);
+      ElementAllocas.push_back(NA);
+      WorkList.push_back(NA);  // Add to worklist for recursive processing
+    }
+  }
+
+  // Now that we have created the alloca instructions that we want to use,
+  // expand the getelementptr instructions to use them.
+  //
+  while (!AI->use_empty()) {
+    Instruction *User = cast<Instruction>(AI->use_back());
+    if (BitCastInst *BCInst = dyn_cast<BitCastInst>(User)) {
+      // Whole-aggregate bitcast: rewrite its users element-wise, then zap it.
+      RewriteBitCastUserOfAlloca(BCInst, AI, ElementAllocas);
+      BCInst->eraseFromParent();
+      continue;
+    }
+    
+    GetElementPtrInst *GEPI = cast<GetElementPtrInst>(User);
+    // We now know that the GEP is of the form: GEP <ptr>, 0, <cst>
+    unsigned Idx =
+       (unsigned)cast<ConstantInt>(GEPI->getOperand(2))->getZExtValue();
+
+    assert(Idx < ElementAllocas.size() && "Index out of range?");
+    AllocaInst *AllocaToUse = ElementAllocas[Idx];
+
+    Value *RepValue;
+    if (GEPI->getNumOperands() == 3) {
+      // Do not insert a new getelementptr instruction with zero indices, only
+      // to have it optimized out later.
+      RepValue = AllocaToUse;
+    } else {
+      // We are indexing deeply into the structure, so we still need a
+      // getelement ptr instruction to finish the indexing.  This may be
+      // expanded itself once the worklist is rerun.
+      //
+      SmallVector<Value*, 8> NewArgs;
+      NewArgs.push_back(Constant::getNullValue(Type::Int32Ty));
+      NewArgs.append(GEPI->op_begin()+3, GEPI->op_end());
+      RepValue = new GetElementPtrInst(AllocaToUse, &NewArgs[0],
+                                       NewArgs.size(), "", GEPI);
+      RepValue->takeName(GEPI);
+    }
+    
+    // If this GEP is to the start of the aggregate, check for memcpys.
+    if (Idx == 0) {
+      bool IsStartOfAggregateGEP = true;
+      // The GEP points at the very start only if every trailing index is a
+      // constant zero.
+      for (unsigned i = 3, e = GEPI->getNumOperands(); i != e; ++i) {
+        if (!isa<ConstantInt>(GEPI->getOperand(i))) {
+          IsStartOfAggregateGEP = false;
+          break;
+        }
+        if (!cast<ConstantInt>(GEPI->getOperand(i))->isZero()) {
+          IsStartOfAggregateGEP = false;
+          break;
+        }
+      }
+      
+      if (IsStartOfAggregateGEP)
+        RewriteBitCastUserOfAlloca(GEPI, AI, ElementAllocas);
+    }
+    
+
+    // Move all of the users over to the new GEP.
+    GEPI->replaceAllUsesWith(RepValue);
+    // Delete the old GEP
+    GEPI->eraseFromParent();
+  }
+
+  // Finally, delete the Alloca instruction
+  AI->eraseFromParent();
+  NumReplaced++;
+}
+
+
+/// isSafeElementUse - Check to see if this use is an allowed use for a
+/// getelementptr instruction of an array aggregate allocation.  isFirstElt
+/// indicates whether Ptr is known to point to the start of the aggregate.
+/// On any disallowed use, marks Info unsafe and returns.
+///
+void SROA::isSafeElementUse(Value *Ptr, bool isFirstElt, AllocationInst *AI,
+                            AllocaInfo &Info) {
+  for (Value::use_iterator I = Ptr->use_begin(), E = Ptr->use_end();
+       I != E; ++I) {
+    Instruction *User = cast<Instruction>(*I);
+    switch (User->getOpcode()) {
+    case Instruction::Load:  break;  // Loading the element is always fine.
+    case Instruction::Store:
+      // Store is ok if storing INTO the pointer, not storing the pointer
+      if (User->getOperand(0) == Ptr) return MarkUnsafe(Info);
+      break;
+    case Instruction::GetElementPtr: {
+      GetElementPtrInst *GEP = cast<GetElementPtrInst>(User);
+      // Track whether the chained GEP still addresses the aggregate's start.
+      bool AreAllZeroIndices = isFirstElt;
+      if (GEP->getNumOperands() > 1) {
+        if (!isa<ConstantInt>(GEP->getOperand(1)) ||
+            !cast<ConstantInt>(GEP->getOperand(1))->isZero())
+          // Using pointer arithmetic to navigate the array.
+          return MarkUnsafe(Info);
+       
+        if (AreAllZeroIndices) {
+          // Any non-constant or non-zero trailing index means we no longer
+          // point at the first element.
+          for (unsigned i = 2, e = GEP->getNumOperands(); i != e; ++i) {
+            if (!isa<ConstantInt>(GEP->getOperand(i)) ||    
+                !cast<ConstantInt>(GEP->getOperand(i))->isZero()) {
+              AreAllZeroIndices = false;
+              break;
+            }
+          }
+        }
+      }
+      isSafeElementUse(GEP, AreAllZeroIndices, AI, Info);
+      if (Info.isUnsafe) return;
+      break;
+    }
+    case Instruction::BitCast:
+      // Bitcasts are only tolerated when pointing at the aggregate start.
+      if (isFirstElt) {
+        isSafeUseOfBitCastedAllocation(cast<BitCastInst>(User), AI, Info);
+        if (Info.isUnsafe) return;
+        break;
+      }
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    case Instruction::Call:
+      // Mem intrinsics on the aggregate start may be splittable; all other
+      // calls are unsafe.
+      if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(User)) {
+        if (isFirstElt) {
+          isSafeMemIntrinsicOnAllocation(MI, AI, I.getOperandNo(), Info);
+          if (Info.isUnsafe) return;
+          break;
+        }
+      }
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    default:
+      DOUT << "  Transformation preventing inst: " << *User;
+      return MarkUnsafe(Info);
+    }
+  }
+  return;  // All users look ok :)
+}
+
+/// AllUsersAreLoads - Return true if all users of this value are loads.
+static bool AllUsersAreLoads(Value *Ptr) {
+  Value::use_iterator UI = Ptr->use_begin(), UE = Ptr->use_end();
+  for (; UI != UE; ++UI) {
+    Instruction *UserInst = cast<Instruction>(*UI);
+    if (UserInst->getOpcode() != Instruction::Load)
+      return false;
+  }
+  return true;
+}
+
+/// isSafeUseOfAllocation - Check to see if this user is an allowed use for an
+/// aggregate allocation.  Marks Info unsafe on any disallowed use; may set
+/// Info.needsCanon for variable indices that can be canonicalized away.
+///
+void SROA::isSafeUseOfAllocation(Instruction *User, AllocationInst *AI,
+                                 AllocaInfo &Info) {
+  // Whole-alloca bitcasts get their own checker.
+  if (BitCastInst *C = dyn_cast<BitCastInst>(User))
+    return isSafeUseOfBitCastedAllocation(C, AI, Info);
+
+  // Anything that is neither a bitcast nor a GEP is unsafe.
+  GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(User);
+  if (GEPI == 0)
+    return MarkUnsafe(Info);
+
+  gep_type_iterator I = gep_type_begin(GEPI), E = gep_type_end(GEPI);
+
+  // The GEP is not safe to transform if not of the form "GEP <ptr>, 0, <cst>".
+  if (I == E ||
+      I.getOperand() != Constant::getNullValue(I.getOperand()->getType())) {
+    return MarkUnsafe(Info);
+  }
+
+  ++I;
+  if (I == E) return MarkUnsafe(Info);  // ran out of GEP indices??
+
+  bool IsAllZeroIndices = true;
+  
+  // If this is a use of an array allocation, do a bit more checking for sanity.
+  if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+    uint64_t NumElements = AT->getNumElements();
+
+    if (ConstantInt *Idx = dyn_cast<ConstantInt>(I.getOperand())) {
+      IsAllZeroIndices &= Idx->isZero();
+      
+      // Check to make sure that index falls within the array.  If not,
+      // something funny is going on, so we won't do the optimization.
+      //
+      if (Idx->getZExtValue() >= NumElements)
+        return MarkUnsafe(Info);
+
+      // We cannot scalar repl this level of the array unless any array
+      // sub-indices are in-range constants.  In particular, consider:
+      // A[0][i].  We cannot know that the user isn't doing invalid things like
+      // allowing i to index an out-of-range subscript that accesses A[1].
+      //
+      // Scalar replacing *just* the outer index of the array is probably not
+      // going to be a win anyway, so just give up.
+      for (++I; I != E && (isa<ArrayType>(*I) || isa<VectorType>(*I)); ++I) {
+        // Use a distinct name so we don't shadow the outer NumElements.
+        uint64_t SubNumElements;
+        if (const ArrayType *SubArrayTy = dyn_cast<ArrayType>(*I))
+          SubNumElements = SubArrayTy->getNumElements();
+        else
+          SubNumElements = cast<VectorType>(*I)->getNumElements();
+        
+        ConstantInt *IdxVal = dyn_cast<ConstantInt>(I.getOperand());
+        if (!IdxVal) return MarkUnsafe(Info);
+        if (IdxVal->getZExtValue() >= SubNumElements)
+          return MarkUnsafe(Info);
+        IsAllZeroIndices &= IdxVal->isZero();
+      }
+      
+    } else {
+      IsAllZeroIndices = false;
+      
+      // If this is an array index and the index is not constant, we cannot
+      // promote... that is unless the array has exactly one or two elements in
+      // it, in which case we CAN promote it, but we have to canonicalize this
+      // out if this is the only problem.
+      if ((NumElements == 1 || NumElements == 2) &&
+          AllUsersAreLoads(GEPI)) {
+        Info.needsCanon = true;
+        return;  // Canonicalization required!
+      }
+      return MarkUnsafe(Info);
+    }
+  }
+
+  // If there are any non-simple uses of this getelementptr, make sure to reject
+  // them.
+  return isSafeElementUse(GEPI, IsAllZeroIndices, AI, Info);
+}
+
+/// isSafeMemIntrinsicOnAllocation - Return true if the specified memory
+/// intrinsic can be promoted by SROA.  At this point, we know that the operand
+/// of the memintrinsic is a pointer to the beginning of the allocation.
+void SROA::isSafeMemIntrinsicOnAllocation(MemIntrinsic *MI, AllocationInst *AI,
+                                          unsigned OpNo, AllocaInfo &Info) {
+  // Variable-length operations cannot be split element-by-element.
+  ConstantInt *Len = dyn_cast<ConstantInt>(MI->getLength());
+  if (Len == 0)
+    return MarkUnsafe(Info);
+  
+  // The intrinsic must cover exactly the whole aggregate.
+  const TargetData &TD = getAnalysis<TargetData>();
+  if (Len->getZExtValue() != TD.getTypeSize(AI->getType()->getElementType()))
+    return MarkUnsafe(Info);
+  
+  // We only know about memcpy/memset/memmove.
+  bool Known = isa<MemCpyInst>(MI) || isa<MemSetInst>(MI) ||
+               isa<MemMoveInst>(MI);
+  if (!Known)
+    return MarkUnsafe(Info);
+  
+  // Otherwise, we can transform it.  Record whether the aggregate is the
+  // destination (operand 1) or the source (operand 2) of the transfer.
+  if (OpNo == 1) {
+    Info.isMemCpyDst = true;
+  } else {
+    assert(OpNo == 2);
+    Info.isMemCpySrc = true;
+  }
+}
+
+/// isSafeUseOfBitCastedAllocation - Check that every (transitive) user of this
+/// bitcast of the alloca is either another bitcast or a promotable mem
+/// intrinsic; anything else marks the alloca unsafe.
+void SROA::isSafeUseOfBitCastedAllocation(BitCastInst *BC, AllocationInst *AI,
+                                          AllocaInfo &Info) {
+  Value::use_iterator UI = BC->use_begin(), UE = BC->use_end();
+  for (; UI != UE; ++UI) {
+    if (BitCastInst *NestedBC = dyn_cast<BitCastInst>(UI))
+      isSafeUseOfBitCastedAllocation(NestedBC, AI, Info);
+    else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(UI))
+      isSafeMemIntrinsicOnAllocation(MI, AI, UI.getOperandNo(), Info);
+    else
+      return MarkUnsafe(Info);
+    // Stop as soon as anything below reported the alloca unsafe.
+    if (Info.isUnsafe)
+      return;
+  }
+}
+
+/// RewriteBitCastUserOfAlloca - BCInst (transitively) bitcasts AI, or indexes
+/// to its first element.  Transform users of the cast to use the new values
+/// instead.  Whole-aggregate memcpy/memmove/memset users are split into one
+/// operation per element in NewElts; other users are left alone.
+void SROA::RewriteBitCastUserOfAlloca(Instruction *BCInst, AllocationInst *AI,
+                                      SmallVector<AllocaInst*, 32> &NewElts) {
+  Constant *Zero = Constant::getNullValue(Type::Int32Ty);
+  const TargetData &TD = getAnalysis<TargetData>();
+  
+  Value::use_iterator UI = BCInst->use_begin(), UE = BCInst->use_end();
+  while (UI != UE) {
+    if (BitCastInst *BCU = dyn_cast<BitCastInst>(*UI)) {
+      // A bitcast of the bitcast: recursively rewrite its users, then zap it.
+      // Advance UI before erasing so the iterator stays valid.
+      RewriteBitCastUserOfAlloca(BCU, AI, NewElts);
+      ++UI;
+      BCU->eraseFromParent();
+      continue;
+    }
+
+    // Otherwise, must be memcpy/memmove/memset of the entire aggregate.  Split
+    // into one per element.
+    MemIntrinsic *MI = dyn_cast<MemIntrinsic>(*UI);
+    
+    // If it's not a mem intrinsic, it must be some other user of a gep of the
+    // first pointer.  Just leave these alone.
+    if (!MI) {
+      ++UI;
+      continue;
+    }
+    
+    // If this is a memcpy/memmove, construct the other pointer as the
+    // appropriate type.
+    Value *OtherPtr = 0;
+    if (MemCpyInst *MCI = dyn_cast<MemCpyInst>(MI)) {
+      if (BCInst == MCI->getRawDest())
+        OtherPtr = MCI->getRawSource();
+      else {
+        assert(BCInst == MCI->getRawSource());
+        OtherPtr = MCI->getRawDest();
+      }
+    } else if (MemMoveInst *MMI = dyn_cast<MemMoveInst>(MI)) {
+      if (BCInst == MMI->getRawDest())
+        OtherPtr = MMI->getRawSource();
+      else {
+        assert(BCInst == MMI->getRawSource());
+        OtherPtr = MMI->getRawDest();
+      }
+    }
+    
+    // If there is an other pointer, we want to convert it to the same pointer
+    // type as AI has, so we can GEP through it.
+    if (OtherPtr) {
+      // It is likely that OtherPtr is a bitcast, if so, remove it.
+      if (BitCastInst *BC = dyn_cast<BitCastInst>(OtherPtr))
+        OtherPtr = BC->getOperand(0);
+      if (ConstantExpr *BCE = dyn_cast<ConstantExpr>(OtherPtr))
+        if (BCE->getOpcode() == Instruction::BitCast)
+          OtherPtr = BCE->getOperand(0);
+      
+      // If the pointer is not the right type, insert a bitcast to the right
+      // type.
+      if (OtherPtr->getType() != AI->getType())
+        OtherPtr = new BitCastInst(OtherPtr, AI->getType(), OtherPtr->getName(),
+                                   MI);
+    }
+
+    // Process each element of the aggregate.
+    Value *TheFn = MI->getOperand(0);
+    const Type *BytePtrTy = MI->getRawDest()->getType();
+    bool SROADest = MI->getRawDest() == BCInst;
+
+    for (unsigned i = 0, e = NewElts.size(); i != e; ++i) {
+      // If this is a memcpy/memmove, emit a GEP of the other element address.
+      Value *OtherElt = 0;
+      if (OtherPtr) {
+        OtherElt = new GetElementPtrInst(OtherPtr, Zero,
+                                         ConstantInt::get(Type::Int32Ty, i),
+                                         OtherPtr->getNameStr()+"."+utostr(i),
+                                         MI);
+      }
+
+      Value *EltPtr = NewElts[i];
+      const Type *EltTy =cast<PointerType>(EltPtr->getType())->getElementType();
+      
+      // If we got down to a scalar, insert a load or store as appropriate.
+      if (EltTy->isFirstClassType()) {
+        if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
+          // memcpy/memmove of a scalar element becomes a load + store pair.
+          Value *Elt = new LoadInst(SROADest ? OtherElt : EltPtr, "tmp",
+                                    MI);
+          new StoreInst(Elt, SROADest ? EltPtr : OtherElt, MI);
+          continue;
+        } else {
+          assert(isa<MemSetInst>(MI));
+
+          // If the stored element is zero (common case), just store a null
+          // constant.
+          Constant *StoreVal;
+          if (ConstantInt *CI = dyn_cast<ConstantInt>(MI->getOperand(2))) {
+            if (CI->isZero()) {
+              StoreVal = Constant::getNullValue(EltTy);  // 0.0, null, 0, <0,0>
+            } else {
+              // If EltTy is a vector type, get the element type.
+              const Type *ValTy = EltTy;
+              if (const VectorType *VTy = dyn_cast<VectorType>(ValTy))
+                ValTy = VTy->getElementType();
+              
+              // Construct an integer with the memset byte replicated into
+              // every byte position of the element.
+              unsigned EltSize = TD.getTypeSize(ValTy);
+              APInt OneVal(EltSize*8, CI->getZExtValue());
+              APInt TotalVal(OneVal);
+              // Set each byte.
+              for (unsigned b = 0; b != EltSize-1; ++b) {
+                TotalVal = TotalVal.shl(8);
+                TotalVal |= OneVal;
+              }
+              
+              // Convert the integer value to the appropriate type.
+              StoreVal = ConstantInt::get(TotalVal);
+              if (isa<PointerType>(ValTy))
+                StoreVal = ConstantExpr::getIntToPtr(StoreVal, ValTy);
+              else if (ValTy->isFloatingPoint())
+                StoreVal = ConstantExpr::getBitCast(StoreVal, ValTy);
+              assert(StoreVal->getType() == ValTy && "Type mismatch!");
+              
+              // If the requested value was a vector constant, create it.
+              // Note: when EltTy != ValTy, EltTy is the vector type and ValTy
+              // its scalar element, so the element count comes from EltTy.
+              if (EltTy != ValTy) {
+                unsigned NumElts = cast<VectorType>(EltTy)->getNumElements();
+                SmallVector<Constant*, 16> Elts(NumElts, StoreVal);
+                StoreVal = ConstantVector::get(&Elts[0], NumElts);
+              }
+            }
+            new StoreInst(StoreVal, EltPtr, MI);
+            continue;
+          }
+          // Otherwise, if we're storing a byte variable, use a memset call for
+          // this element.
+        }
+      }
+      
+      // Cast the element pointer to BytePtrTy.
+      if (EltPtr->getType() != BytePtrTy)
+        EltPtr = new BitCastInst(EltPtr, BytePtrTy, EltPtr->getNameStr(), MI);
+    
+      // Cast the other pointer (if we have one) to BytePtrTy. 
+      if (OtherElt && OtherElt->getType() != BytePtrTy)
+        OtherElt = new BitCastInst(OtherElt, BytePtrTy,OtherElt->getNameStr(),
+                                   MI);
+    
+      unsigned EltSize = TD.getTypeSize(EltTy);
+
+      // Finally, insert the meminst for this element.
+      if (isa<MemCpyInst>(MI) || isa<MemMoveInst>(MI)) {
+        Value *Ops[] = {
+          SROADest ? EltPtr : OtherElt,  // Dest ptr
+          SROADest ? OtherElt : EltPtr,  // Src ptr
+          ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+          Zero  // Align
+        };
+        new CallInst(TheFn, Ops, 4, "", MI);
+      } else {
+        assert(isa<MemSetInst>(MI));
+        Value *Ops[] = {
+          EltPtr, MI->getOperand(2),  // Dest, Value,
+          ConstantInt::get(MI->getOperand(3)->getType(), EltSize), // Size
+          Zero  // Align
+        };
+        new CallInst(TheFn, Ops, 4, "", MI);
+      }
+    }
+
+    // Finally, MI is now dead, as we've modified its actions to occur on all of
+    // the elements of the aggregate.
+    ++UI;
+    MI->eraseFromParent();
+  }
+}
+
+/// HasStructPadding - Return true if the specified type has any structure
+/// padding, false otherwise.  Recurses through nested structs and arrays;
+/// scalar types never have padding.
+static bool HasStructPadding(const Type *Ty, const TargetData &TD) {
+  // Arrays pad only if their element type does.
+  if (const ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+    return HasStructPadding(ATy->getElementType(), TD);
+
+  const StructType *STy = dyn_cast<StructType>(Ty);
+  if (STy == 0)
+    return false;
+
+  const StructLayout *SL = TD.getStructLayout(STy);
+  unsigned NumFields = STy->getNumElements();
+  unsigned LastFieldEndBit = 0;
+
+  for (unsigned Field = 0; Field != NumFields; ++Field) {
+    // Padding inside the field itself?
+    if (HasStructPadding(STy->getElementType(Field), TD))
+      return true;
+
+    // Gap between the previous field's end and this field's start?
+    unsigned FieldStartBit = SL->getElementOffset(Field)*8;
+    if (Field && LastFieldEndBit < FieldStartBit)
+      return true;
+
+    LastFieldEndBit = FieldStartBit +
+                      TD.getTypeSizeInBits(STy->getElementType(Field));
+  }
+
+  // Tail padding after the final field?
+  if (NumFields && LastFieldEndBit < SL->getSizeInBytes()*8)
+    return true;
+
+  return false;
+}
+
+/// isSafeAllocaToScalarRepl - Check to see if the specified allocation of
+/// an aggregate can be broken down into elements.  Return 0 if not, 3 if safe,
+/// or 1 if safe after canonicalization has been performed.
+///
+int SROA::isSafeAllocaToScalarRepl(AllocationInst *AI) {
+  AllocaInfo Info;
+  
+  // Every user of the alloca must itself be safe to transform.
+  for (Value::use_iterator UI = AI->use_begin(), UE = AI->use_end();
+       UI != UE; ++UI) {
+    isSafeUseOfAllocation(cast<Instruction>(*UI), AI, Info);
+    if (Info.isUnsafe) {
+      DOUT << "Cannot transform: " << *AI << "  due to user: " << **UI;
+      return 0;
+    }
+  }
+  
+  // Okay, we know all the users are promotable.  If the aggregate is both a
+  // memcpy source and destination, bytes living in structure padding of the
+  // LLVM type may actually carry data the copy preserves; splitting into
+  // elements would drop them, so refuse to promote such a struct.
+  if (Info.isMemCpySrc && Info.isMemCpyDst &&
+      HasStructPadding(AI->getType()->getElementType(), 
+                       getAnalysis<TargetData>()))
+    return 0;
+  
+  // If we require cleanup, return 1, otherwise return 3.
+  return Info.needsCanon ? 1 : 3;
+}
+
+/// CanonicalizeAllocaUsers - If SROA reported that it can promote the specified
+/// allocation, but only if cleaned up, perform the cleanups required.  This
+/// rewrites variable-index GEPs into one- or two-element arrays (the only
+/// cases isSafeUseOfAllocation accepts) into constant-index forms.
+void SROA::CanonicalizeAllocaUsers(AllocationInst *AI) {
+  // At this point, we know that the end result will be SROA'd and promoted, so
+  // we can insert ugly code if required so long as sroa+mem2reg will clean it
+  // up.
+  for (Value::use_iterator UI = AI->use_begin(), E = AI->use_end();
+       UI != E; ) {
+    GetElementPtrInst *GEPI = dyn_cast<GetElementPtrInst>(*UI++);
+    if (!GEPI) continue;
+    // Skip the leading zero index; examine the first real aggregate index.
+    gep_type_iterator I = gep_type_begin(GEPI);
+    ++I;
+
+    if (const ArrayType *AT = dyn_cast<ArrayType>(*I)) {
+      uint64_t NumElements = AT->getNumElements();
+
+      // Only variable indices need canonicalization.
+      if (!isa<ConstantInt>(I.getOperand())) {
+        if (NumElements == 1) {
+          // A variable index into a one-element array can only be zero.
+          GEPI->setOperand(2, Constant::getNullValue(Type::Int32Ty));
+        } else {
+          assert(NumElements == 2 && "Unhandled case!");
+          // All users of the GEP must be loads.  At each use of the GEP, insert
+          // two loads of the appropriate indexed GEP and select between them.
+          Value *IsOne = new ICmpInst(ICmpInst::ICMP_NE, I.getOperand(), 
+                              Constant::getNullValue(I.getOperand()->getType()),
+             "isone", GEPI);
+          // Insert the new GEP instructions, which are properly indexed.
+          SmallVector<Value*, 8> Indices(GEPI->op_begin()+1, GEPI->op_end());
+          Indices[1] = Constant::getNullValue(Type::Int32Ty);
+          Value *ZeroIdx = new GetElementPtrInst(GEPI->getOperand(0),
+                                                 &Indices[0], Indices.size(),
+                                                 GEPI->getName()+".0", GEPI);
+          Indices[1] = ConstantInt::get(Type::Int32Ty, 1);
+          Value *OneIdx = new GetElementPtrInst(GEPI->getOperand(0),
+                                                &Indices[0], Indices.size(),
+                                                GEPI->getName()+".1", GEPI);
+          // Replace all loads of the variable index GEP with loads from both
+          // indexes and a select.
+          while (!GEPI->use_empty()) {
+            LoadInst *LI = cast<LoadInst>(GEPI->use_back());
+            Value *Zero = new LoadInst(ZeroIdx, LI->getName()+".0", LI);
+            Value *One  = new LoadInst(OneIdx , LI->getName()+".1", LI);
+            Value *R = new SelectInst(IsOne, One, Zero, LI->getName(), LI);
+            LI->replaceAllUsesWith(R);
+            LI->eraseFromParent();
+          }
+          GEPI->eraseFromParent();
+        }
+      }
+    }
+  }
+}
+
/// MergeInType - Add the 'In' type to the accumulated type so far.  If the
/// types are incompatible, return true, otherwise update Accum and return
/// false.
///
/// There are three cases we handle here:
///   1) An effectively-integer union, where the pieces are stored into as
///      smaller integers (common with byte swap and other idioms).
///   2) A union of vector types of the same size and potentially its elements.
///      Here we turn element accesses into insert/extract element operations.
///   3) A union of scalar types, such as int/float or int/pointer.  Here we
///      merge together into integers, allowing the xform to work with #1 as
///      well.
static bool MergeInType(const Type *In, const Type *&Accum,
                        const TargetData &TD) {
  // If this is our first type, just use it.
  const VectorType *PTy;
  if (Accum == Type::VoidTy || In == Accum) {
    // VoidTy means "nothing accumulated yet"; identical types merge trivially.
    Accum = In;
  } else if (In == Type::VoidTy) {
    // Noop.
  } else if (In->isInteger() && Accum->isInteger()) {   // integer union.
    // Otherwise pick whichever type is larger.
    if (cast<IntegerType>(In)->getBitWidth() > 
        cast<IntegerType>(Accum)->getBitWidth())
      Accum = In;
  } else if (isa<PointerType>(In) && isa<PointerType>(Accum)) {
    // Pointer unions just stay as one of the pointers.
  } else if (isa<VectorType>(In) || isa<VectorType>(Accum)) {
    if ((PTy = dyn_cast<VectorType>(Accum)) && 
        PTy->getElementType() == In) {
      // Accum is a vector, and we are accessing an element: ok.
    } else if ((PTy = dyn_cast<VectorType>(In)) && 
               PTy->getElementType() == Accum) {
      // In is a vector, and accum is an element: ok, remember In.
      Accum = In;
    } else if ((PTy = dyn_cast<VectorType>(In)) && isa<VectorType>(Accum) &&
               PTy->getBitWidth() == cast<VectorType>(Accum)->getBitWidth()) {
      // Two vectors of the same size: keep Accum.
    } else {
      // Cannot insert an short into a <4 x int> or handle
      // <2 x int> -> <4 x int>
      return true;
    }
  } else {
    // Pointer/FP/Integer unions merge together as integers.
    // Normalize Accum to an integer type of the same size...
    switch (Accum->getTypeID()) {
    case Type::PointerTyID: Accum = TD.getIntPtrType(); break;
    case Type::FloatTyID:   Accum = Type::Int32Ty; break;
    case Type::DoubleTyID:  Accum = Type::Int64Ty; break;
    default:
      assert(Accum->isInteger() && "Unknown FP type!");
      break;
    }
    
    // ...normalize In the same way...
    switch (In->getTypeID()) {
    case Type::PointerTyID: In = TD.getIntPtrType(); break;
    case Type::FloatTyID:   In = Type::Int32Ty; break;
    case Type::DoubleTyID:  In = Type::Int64Ty; break;
    default:
      assert(In->isInteger() && "Unknown FP type!");
      break;
    }
    // ...then retry the merge with both sides as integers, which hits the
    // integer-union case above.
    return MergeInType(In, Accum, TD);
  }
  return false;
}
+
+/// getUIntAtLeastAsBitAs - Return an unsigned integer type that is at least
+/// as big as the specified type.  If there is no suitable type, this returns
+/// null.
+const Type *getUIntAtLeastAsBitAs(unsigned NumBits) {
+  if (NumBits > 64) return 0;
+  if (NumBits > 32) return Type::Int64Ty;
+  if (NumBits > 16) return Type::Int32Ty;
+  if (NumBits > 8) return Type::Int16Ty;
+  return Type::Int8Ty;    
+}
+
/// CanConvertToScalar - V is a pointer.  If we can convert the pointee to a
/// single scalar integer type, return that type.  Further, if the use is not
/// a completely trivial use that mem2reg could promote, set IsNotTrivial.  If
/// there are no uses of this pointer, return Type::VoidTy to differentiate from
/// failure.  Returns null (0) if the pointee cannot be converted.
///
const Type *SROA::CanConvertToScalar(Value *V, bool &IsNotTrivial) {
  const Type *UsedType = Type::VoidTy; // No uses, no forced type.
  const TargetData &TD = getAnalysis<TargetData>();
  const PointerType *PTy = cast<PointerType>(V->getType());

  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
    Instruction *User = cast<Instruction>(*UI);
    
    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
      // A load is ok as long as the loaded type merges into the accumulator.
      if (MergeInType(LI->getType(), UsedType, TD))
        return 0;
      
    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
      // Storing the pointer, not into the value?
      if (SI->getOperand(0) == V) return 0;
      
      // NOTE: We could handle storing of FP imms into integers here!
      
      if (MergeInType(SI->getOperand(0)->getType(), UsedType, TD))
        return 0;
    } else if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
      // Look through bitcasts; mem2reg can't handle these, so the promotion
      // is no longer trivial.
      IsNotTrivial = true;
      const Type *SubTy = CanConvertToScalar(CI, IsNotTrivial);
      if (!SubTy || MergeInType(SubTy, UsedType, TD)) return 0;
    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
      // Check to see if this is stepping over an element: GEP Ptr, int C
      if (GEP->getNumOperands() == 2 && isa<ConstantInt>(GEP->getOperand(1))) {
        unsigned Idx = cast<ConstantInt>(GEP->getOperand(1))->getZExtValue();
        unsigned ElSize = TD.getTypeSize(PTy->getElementType());
        unsigned BitOffset = Idx*ElSize*8;
        // Only offsets representable in a 64-bit scalar with a power-of-two
        // element size are handled.
        if (BitOffset > 64 || !isPowerOf2_32(ElSize)) return 0;
        
        IsNotTrivial = true;
        const Type *SubElt = CanConvertToScalar(GEP, IsNotTrivial);
        if (SubElt == 0) return 0;
        if (SubElt != Type::VoidTy && SubElt->isInteger()) {
          // The scalar must be wide enough to hold the sub-element at its
          // shifted bit position.
          const Type *NewTy = 
            getUIntAtLeastAsBitAs(TD.getTypeSize(SubElt)*8+BitOffset);
          if (NewTy == 0 || MergeInType(NewTy, UsedType, TD)) return 0;
          continue;
        }
      } else if (GEP->getNumOperands() == 3 && 
                 isa<ConstantInt>(GEP->getOperand(1)) &&
                 isa<ConstantInt>(GEP->getOperand(2)) &&
                 cast<ConstantInt>(GEP->getOperand(1))->isZero()) {
        // We are stepping into an element, e.g. a structure or an array:
        // GEP Ptr, int 0, uint C
        const Type *AggTy = PTy->getElementType();
        unsigned Idx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
        
        if (const ArrayType *ATy = dyn_cast<ArrayType>(AggTy)) {
          if (Idx >= ATy->getNumElements()) return 0;  // Out of range.
        } else if (const VectorType *VectorTy = dyn_cast<VectorType>(AggTy)) {
          // Getting an element of the vector.
          if (Idx >= VectorTy->getNumElements()) return 0;  // Out of range.

          // Merge in the vector type.
          if (MergeInType(VectorTy, UsedType, TD)) return 0;
          
          const Type *SubTy = CanConvertToScalar(GEP, IsNotTrivial);
          if (SubTy == 0) return 0;
          
          if (SubTy != Type::VoidTy && MergeInType(SubTy, UsedType, TD))
            return 0;

          // We'll need to change this to an insert/extract element operation.
          IsNotTrivial = true;
          continue;    // Everything looks ok
          
        } else if (isa<StructType>(AggTy)) {
          // Structs are always ok.
        } else {
          return 0;
        }
        // For struct/array element access, require an integer scalar big
        // enough for the whole aggregate, and check the GEP's own uses too.
        const Type *NTy = getUIntAtLeastAsBitAs(TD.getTypeSize(AggTy)*8);
        if (NTy == 0 || MergeInType(NTy, UsedType, TD)) return 0;
        const Type *SubTy = CanConvertToScalar(GEP, IsNotTrivial);
        if (SubTy == 0) return 0;
        if (SubTy != Type::VoidTy && MergeInType(SubTy, UsedType, TD))
          return 0;
        continue;    // Everything looks ok
      }
      // Any other GEP form is not convertible.
      return 0;
    } else {
      // Cannot handle this!
      return 0;
    }
  }
  
  return UsedType;
}
+
+/// ConvertToScalar - The specified alloca passes the CanConvertToScalar
+/// predicate and is non-trivial.  Convert it to something that can be trivially
+/// promoted into a register by mem2reg.
+void SROA::ConvertToScalar(AllocationInst *AI, const Type *ActualTy) {
+  DOUT << "CONVERT TO SCALAR: " << *AI << "  TYPE = "
+       << *ActualTy << "\n";
+  ++NumConverted;
+  
+  BasicBlock *EntryBlock = AI->getParent();
+  assert(EntryBlock == &EntryBlock->getParent()->getEntryBlock() &&
+         "Not in the entry block!");
+  EntryBlock->getInstList().remove(AI);  // Take the alloca out of the program.
+  
+  // Create and insert the alloca.
+  AllocaInst *NewAI = new AllocaInst(ActualTy, 0, AI->getName(),
+                                     EntryBlock->begin());
+  ConvertUsesToScalar(AI, NewAI, 0);
+  delete AI;
+}
+
+
/// ConvertUsesToScalar - Convert all of the users of Ptr to use the new alloca
/// directly.  This happens when we are converting an "integer union" to a
/// single integer scalar, or when we are converting a "vector union" to a
/// vector with insert/extractelement instructions.
///
/// Offset is an offset from the original alloca, in bits that need to be
/// shifted to the right.  By the end of this, there should be no uses of Ptr.
void SROA::ConvertUsesToScalar(Value *Ptr, AllocaInst *NewAI, unsigned Offset) {
  const TargetData &TD = getAnalysis<TargetData>();
  // Process users until the use list is drained; each iteration erases the
  // user it handles, so use_back() always yields a fresh one.
  while (!Ptr->use_empty()) {
    Instruction *User = cast<Instruction>(Ptr->use_back());
    
    if (LoadInst *LI = dyn_cast<LoadInst>(User)) {
      // The load is a bit extract from NewAI shifted right by Offset bits.
      Value *NV = new LoadInst(NewAI, LI->getName(), LI);
      if (NV->getType() == LI->getType()) {
        // We win, no conversion needed.
      } else if (const VectorType *PTy = dyn_cast<VectorType>(NV->getType())) {
        // If the result alloca is a vector type, this is either an element
        // access or a bitcast to another vector type.
        if (isa<VectorType>(LI->getType())) {
          NV = new BitCastInst(NV, LI->getType(), LI->getName(), LI);
        } else {
          // Must be an element access.  Compute the element index from the
          // bit offset and element size.
          unsigned Elt = Offset/(TD.getTypeSize(PTy->getElementType())*8);
          NV = new ExtractElementInst(
                         NV, ConstantInt::get(Type::Int32Ty, Elt), "tmp", LI);
        }
      } else if (isa<PointerType>(NV->getType())) {
        assert(isa<PointerType>(LI->getType()));
        // Must be ptr->ptr cast.  Anything else would result in NV being
        // an integer.
        NV = new BitCastInst(NV, LI->getType(), LI->getName(), LI);
      } else {
        // Integer scalar alloca: shift the wanted bits down, truncate, then
        // cast to the loaded type.
        const IntegerType *NTy = cast<IntegerType>(NV->getType());
        unsigned LIBitWidth = TD.getTypeSizeInBits(LI->getType());
        
        // If this is a big-endian system and the load is narrower than the
        // full alloca type, we need to do a shift to get the right bits.
        int ShAmt = 0;
        if (TD.isBigEndian()) {
          ShAmt = NTy->getBitWidth()-LIBitWidth-Offset;
        } else {
          ShAmt = Offset;
        }
        
        // Note: we support negative bitwidths (with shl) which are not defined.
        // We do this to support (f.e.) loads off the end of a structure where
        // only some bits are used.
        if (ShAmt > 0 && (unsigned)ShAmt < NTy->getBitWidth())
          NV = BinaryOperator::createLShr(NV, 
                                          ConstantInt::get(NV->getType(),ShAmt),
                                          LI->getName(), LI);
        else if (ShAmt < 0 && (unsigned)-ShAmt < NTy->getBitWidth())
          NV = BinaryOperator::createShl(NV, 
                                         ConstantInt::get(NV->getType(),-ShAmt),
                                         LI->getName(), LI);
        
        // Finally, unconditionally truncate the integer to the right width.
        if (LIBitWidth < NTy->getBitWidth())
          NV = new TruncInst(NV, IntegerType::get(LIBitWidth),
                             LI->getName(), LI);
        
        // If the result is an integer, this is a trunc or bitcast.
        if (isa<IntegerType>(LI->getType())) {
          assert(NV->getType() == LI->getType() && "Truncate wasn't enough?");
        } else if (LI->getType()->isFloatingPoint()) {
          // Just do a bitcast, we know the sizes match up.
          NV = new BitCastInst(NV, LI->getType(), LI->getName(), LI);
        } else {
          // Otherwise must be a pointer.
          NV = new IntToPtrInst(NV, LI->getType(), LI->getName(), LI);
        }
      }
      LI->replaceAllUsesWith(NV);
      LI->eraseFromParent();
    } else if (StoreInst *SI = dyn_cast<StoreInst>(User)) {
      assert(SI->getOperand(0) != Ptr && "Consistency error!");

      // Convert the stored type to the actual type, shift it left to insert
      // then 'or' into place.
      Value *SV = SI->getOperand(0);
      const Type *AllocaType = NewAI->getType()->getElementType();
      if (SV->getType() == AllocaType) {
        // All is well.
      } else if (const VectorType *PTy = dyn_cast<VectorType>(AllocaType)) {
        // Load the old value so the untouched elements are preserved.
        Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI);

        // If the result alloca is a vector type, this is either an element
        // access or a bitcast to another vector type.
        if (isa<VectorType>(SV->getType())) {
          SV = new BitCastInst(SV, AllocaType, SV->getName(), SI);
        } else {            
          // Must be an element insertion.
          unsigned Elt = Offset/(TD.getTypeSize(PTy->getElementType())*8);
          SV = new InsertElementInst(Old, SV,
                                     ConstantInt::get(Type::Int32Ty, Elt),
                                     "tmp", SI);
        }
      } else if (isa<PointerType>(AllocaType)) {
        // If the alloca type is a pointer, then all the elements must be
        // pointers.
        if (SV->getType() != AllocaType)
          SV = new BitCastInst(SV, AllocaType, SV->getName(), SI);
      } else {
        // Integer scalar alloca: read-modify-write to merge the stored bits
        // into the existing value.
        Value *Old = new LoadInst(NewAI, NewAI->getName()+".in", SI);

        // If SV is a float, convert it to the appropriate integer type.
        // If it is a pointer, do the same, and also handle ptr->ptr casts
        // here.
        unsigned SrcWidth = TD.getTypeSizeInBits(SV->getType());
        unsigned DestWidth = AllocaType->getPrimitiveSizeInBits();
        if (SV->getType()->isFloatingPoint())
          SV = new BitCastInst(SV, IntegerType::get(SrcWidth),
                               SV->getName(), SI);
        else if (isa<PointerType>(SV->getType()))
          SV = new PtrToIntInst(SV, TD.getIntPtrType(), SV->getName(), SI);
                 
        // Always zero extend the value if needed.
        if (SV->getType() != AllocaType)
          SV = new ZExtInst(SV, AllocaType, SV->getName(), SI);
        
        // If this is a big-endian system and the store is narrower than the
        // full alloca type, we need to do a shift to get the right bits.
        int ShAmt = 0;
        if (TD.isBigEndian()) {
          ShAmt = DestWidth-SrcWidth-Offset;
        } else {
          ShAmt = Offset;
        }
        
        // Note: we support negative bitwidths (with shr) which are not defined.
        // We do this to support (f.e.) stores off the end of a structure where
        // only some bits in the structure are set.
        APInt Mask(APInt::getLowBitsSet(DestWidth, SrcWidth));
        if (ShAmt > 0 && (unsigned)ShAmt < DestWidth) {
          SV = BinaryOperator::createShl(SV, 
                                         ConstantInt::get(SV->getType(), ShAmt),
                                         SV->getName(), SI);
          Mask <<= ShAmt;
        } else if (ShAmt < 0 && (unsigned)-ShAmt < DestWidth) {
          SV = BinaryOperator::createLShr(SV,
                                         ConstantInt::get(SV->getType(),-ShAmt),
                                          SV->getName(), SI);
          Mask = Mask.lshr(ShAmt);
        }
        
        // Mask out the bits we are about to insert from the old value, and or
        // in the new bits.
        if (SrcWidth != DestWidth) {
          assert(DestWidth > SrcWidth);
          Old = BinaryOperator::createAnd(Old, ConstantInt::get(~Mask),
                                          Old->getName()+".mask", SI);
          SV = BinaryOperator::createOr(Old, SV, SV->getName()+".ins", SI);
        }
      }
      new StoreInst(SV, NewAI, SI);
      SI->eraseFromParent();
      
    } else if (BitCastInst *CI = dyn_cast<BitCastInst>(User)) {
      // A bitcast of the pointer changes no bits: recurse with the same
      // offset and then drop the cast.
       ConvertUsesToScalar(CI, NewAI, Offset);
      CI->eraseFromParent();
    } else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(User)) {
      // A GEP adjusts the bit offset; compute the new offset, recurse, then
      // drop the GEP.
      const PointerType *AggPtrTy = 
        cast<PointerType>(GEP->getOperand(0)->getType());
      const TargetData &TD = getAnalysis<TargetData>();
      unsigned AggSizeInBits = TD.getTypeSize(AggPtrTy->getElementType())*8;
      
      // Check to see if this is stepping over an element: GEP Ptr, int C
      unsigned NewOffset = Offset;
      if (GEP->getNumOperands() == 2) {
        unsigned Idx = cast<ConstantInt>(GEP->getOperand(1))->getZExtValue();
        unsigned BitOffset = Idx*AggSizeInBits;
        
        NewOffset += BitOffset;
      } else if (GEP->getNumOperands() == 3) {
        // We know that operand #2 is zero.
        unsigned Idx = cast<ConstantInt>(GEP->getOperand(2))->getZExtValue();
        const Type *AggTy = AggPtrTy->getElementType();
        if (const SequentialType *SeqTy = dyn_cast<SequentialType>(AggTy)) {
          unsigned ElSizeBits = TD.getTypeSize(SeqTy->getElementType())*8;

          NewOffset += ElSizeBits*Idx;
        } else if (const StructType *STy = dyn_cast<StructType>(AggTy)) {
          // Struct field: use the target layout to find the field's offset.
          unsigned EltBitOffset =
            TD.getStructLayout(STy)->getElementOffset(Idx)*8;
          
          NewOffset += EltBitOffset;
        } else {
          assert(0 && "Unsupported operation!");
          abort();
        }
      } else {
        assert(0 && "Unsupported operation!");
        abort();
      }
      ConvertUsesToScalar(GEP, NewAI, NewOffset);
      GEP->eraseFromParent();
    } else {
      assert(0 && "Unsupported operation!");
      abort();
    }
  }
}
+
+
+/// PointsToConstantGlobal - Return true if V (possibly indirectly) points to
+/// some part of a constant global variable.  This intentionally only accepts
+/// constant expressions because we don't can't rewrite arbitrary instructions.
+static bool PointsToConstantGlobal(Value *V) {
+  if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
+    return GV->isConstant();
+  if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V))
+    if (CE->getOpcode() == Instruction::BitCast || 
+        CE->getOpcode() == Instruction::GetElementPtr)
+      return PointsToConstantGlobal(CE->getOperand(0));
+  return false;
+}
+
/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
/// pointer to an alloca.  Ignore any reads of the pointer, return false if we
/// see any stores or other unknown uses.  If we see pointer arithmetic, keep
/// track of whether it moves the pointer (with isOffset) but otherwise traverse
/// the uses.  If we see a memcpy/memmove that targets an unoffseted pointer to
/// the alloca, and if the source pointer is a pointer to a constant global, we
/// can optimize this.
static bool isOnlyCopiedFromConstantGlobal(Value *V, Instruction *&TheCopy,
                                           bool isOffset) {
  for (Value::use_iterator UI = V->use_begin(), E = V->use_end(); UI!=E; ++UI) {
    if (isa<LoadInst>(*UI)) {
      // Ignore loads, they are always ok.
      continue;
    }
    if (BitCastInst *BCI = dyn_cast<BitCastInst>(*UI)) {
      // If uses of the bitcast are ok, we are ok.
      if (!isOnlyCopiedFromConstantGlobal(BCI, TheCopy, isOffset))
        return false;
      continue;
    }
    if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(*UI)) {
      // If the GEP has all zero indices, it doesn't offset the pointer.  If it
      // doesn't, it does.
      if (!isOnlyCopiedFromConstantGlobal(GEP, TheCopy,
                                         isOffset || !GEP->hasAllZeroIndices()))
        return false;
      continue;
    }
    
    // If this isn't our memcpy/memmove, reject it as something we can't
    // handle.
    if (!isa<MemCpyInst>(*UI) && !isa<MemMoveInst>(*UI))
      return false;

    // If we already have seen a copy, reject the second one.
    if (TheCopy) return false;
    
    // If the pointer has been offset from the start of the alloca, we can't
    // safely handle this.
    if (isOffset) return false;

    // If the memintrinsic isn't using the alloca as the dest, reject it.
    // (Operand #1 of the intrinsic is the destination pointer.)
    if (UI.getOperandNo() != 1) return false;
    
    MemIntrinsic *MI = cast<MemIntrinsic>(*UI);
    
    // If the source of the memcpy/move is not a constant global, reject it.
    if (!PointsToConstantGlobal(MI->getOperand(2)))
      return false;
    
    // Otherwise, the transform is safe.  Remember the copy instruction.
    TheCopy = MI;
  }
  return true;
}
+
+/// isOnlyCopiedFromConstantGlobal - Return true if the specified alloca is only
+/// modified by a copy from a constant global.  If we can prove this, we can
+/// replace any uses of the alloca with uses of the global directly.
+Instruction *SROA::isOnlyCopiedFromConstantGlobal(AllocationInst *AI) {
+  Instruction *TheCopy = 0;
+  if (::isOnlyCopiedFromConstantGlobal(AI, TheCopy, false))
+    return TheCopy;
+  return 0;
+}
diff --git a/lib/Transforms/Scalar/SimplifyCFG.cpp b/lib/Transforms/Scalar/SimplifyCFG.cpp
new file mode 100644
index 0000000..6b47ef7
--- /dev/null
+++ b/lib/Transforms/Scalar/SimplifyCFG.cpp
@@ -0,0 +1,145 @@
+//===- SimplifyCFG.cpp - CFG Simplification Pass --------------------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements dead code elimination and basic block merging.
+//
+// Specifically, this:
+//   * removes basic blocks with no predecessors
+//   * merges a basic block into its predecessor if there is only one and the
+//     predecessor only has one successor.
+//   * Eliminates PHI nodes for basic blocks with a single predecessor
+//   * Eliminates a basic block that only contains an unconditional branch
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "simplifycfg"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Constants.h"
+#include "llvm/Instructions.h"
+#include "llvm/Module.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Pass.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumSimpl, "Number of blocks simplified");
+
namespace {
  /// CFGSimplifyPass - A FunctionPass that removes unreachable blocks and
  /// repeatedly applies local CFG simplifications (via SimplifyCFG) until the
  /// function reaches a fixed point.
  struct VISIBILITY_HIDDEN CFGSimplifyPass : public FunctionPass {
    static char ID; // Pass identification, replacement for typeid
    CFGSimplifyPass() : FunctionPass((intptr_t)&ID) {}

    virtual bool runOnFunction(Function &F);
  };
  char CFGSimplifyPass::ID = 0;
  // Register the pass so it is available under -simplifycfg.
  RegisterPass<CFGSimplifyPass> X("simplifycfg", "Simplify the CFG");
}
+
// Public interface to the CFGSimplification pass.  The returned pass object
// is heap-allocated; it is normally handed straight to a PassManager, which
// takes ownership.
FunctionPass *llvm::createCFGSimplificationPass() {
  return new CFGSimplifyPass();
}
+
/// MarkAliveBlocks - Do an iterative depth-first walk of the CFG starting at
/// BB, recording every reachable block in 'Reachable'.  Along the way,
/// constant-fold terminators and truncate blocks at obviously-unreachable
/// instructions.  Returns true if any change was made to the function.
static bool MarkAliveBlocks(BasicBlock *BB,
                            SmallPtrSet<BasicBlock*, 16> &Reachable) {
  
  std::vector<BasicBlock*> Worklist;
  Worklist.push_back(BB);
  bool Changed = false;
  while (!Worklist.empty()) {
    BB = Worklist.back();
    Worklist.pop_back();
    
    // insert() is false if the block was already visited; skip it then.
    if (!Reachable.insert(BB))
      continue;

    // Do a quick scan of the basic block, turning any obviously unreachable
    // instructions into LLVM unreachable insts.  The instruction combining pass
    // canonicalizes unreachable insts into stores to null or undef.
    for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ++BBI)
      if (StoreInst *SI = dyn_cast<StoreInst>(BBI))
        if (isa<ConstantPointerNull>(SI->getOperand(1)) ||
            isa<UndefValue>(SI->getOperand(1))) {
          // Loop over all of the successors, removing BB's entry from any PHI
          // nodes.
          for (succ_iterator I = succ_begin(BB), SE = succ_end(BB); I != SE;++I)
            (*I)->removePredecessor(BB);

          // Terminate the block with 'unreachable' right before the store.
          new UnreachableInst(SI);

          // All instructions after this are dead.
          while (BBI != E) {
            if (!BBI->use_empty())
              BBI->replaceAllUsesWith(UndefValue::get(BBI->getType()));
            BB->getInstList().erase(BBI++);
          }
          break;
        }


    // Fold constant-condition terminators, then queue the (possibly reduced)
    // successor set for the DFS.
    Changed |= ConstantFoldTerminator(BB);
    for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI != SE; ++SI)
      Worklist.push_back(*SI);
  }
  return Changed;
}
+
+
// It is possible that we may require multiple passes over the code to fully
// simplify the CFG.
//
bool CFGSimplifyPass::runOnFunction(Function &F) {
  // Phase 1: mark all blocks reachable from the entry block, constant-folding
  // terminators along the way.
  SmallPtrSet<BasicBlock*, 16> Reachable;
  bool Changed = MarkAliveBlocks(F.begin(), Reachable);

  // If there are unreachable blocks in the CFG...
  if (Reachable.size() != F.size()) {
    assert(Reachable.size() < F.size());
    NumSimpl += F.size()-Reachable.size();

    // Loop over all of the basic blocks that are not reachable, dropping all of
    // their internal references...  The entry block is always reachable, so it
    // is skipped by starting at ++F.begin().
    for (Function::iterator BB = ++F.begin(), E = F.end(); BB != E; ++BB)
      if (!Reachable.count(BB)) {
        for (succ_iterator SI = succ_begin(BB), SE = succ_end(BB); SI!=SE; ++SI)
          if (Reachable.count(*SI))
            (*SI)->removePredecessor(BB);
        BB->dropAllReferences();
      }

    // Now that no cross-references remain, actually erase the dead blocks.
    for (Function::iterator I = ++F.begin(); I != F.end();)
      if (!Reachable.count(I))
        I = F.getBasicBlockList().erase(I);
      else
        ++I;

    Changed = true;
  }

  // Phase 2: iterate local simplification over every non-entry block until no
  // more changes are possible.
  bool LocalChange = true;
  while (LocalChange) {
    LocalChange = false;

    // Loop over all of the basic blocks (except the first one) and remove them
    // if they are unneeded...
    //
    for (Function::iterator BBIt = ++F.begin(); BBIt != F.end(); ) {
      // SimplifyCFG may delete the block, so advance the iterator first.
      if (SimplifyCFG(BBIt++)) {
        LocalChange = true;
        ++NumSimpl;
      }
    }
    Changed |= LocalChange;
  }

  return Changed;
}
diff --git a/lib/Transforms/Scalar/TailDuplication.cpp b/lib/Transforms/Scalar/TailDuplication.cpp
new file mode 100644
index 0000000..22d8157
--- /dev/null
+++ b/lib/Transforms/Scalar/TailDuplication.cpp
@@ -0,0 +1,364 @@
+//===- TailDuplication.cpp - Simplify CFG through tail duplication --------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass performs a limited form of tail duplication, intended to simplify
+// CFGs by removing some unconditional branches.  This pass is necessary to
+// straighten out loops created by the C front-end, but also is capable of
+// making other code nicer.  After this pass is run, the CFG simplify pass
+// should be run to clean up the mess.
+//
+// This pass could be enhanced in the future to use profile information to be
+// more aggressive.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailduplicate"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constant.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/IntrinsicInst.h"
+#include "llvm/Pass.h"
+#include "llvm/Type.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/ADT/Statistic.h"
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of unconditional branches eliminated");
+
+namespace {
+  // Maximum number of non-PHI, non-debug instructions a destination block may
+  // contain and still be considered for duplication into its predecessor.
+  cl::opt<unsigned>
+  Threshold("taildup-threshold", cl::desc("Max block size to tail duplicate"),
+            cl::init(6), cl::Hidden);
+
+  /// TailDup - A limited tail-duplication pass: clones small branch-target
+  /// blocks into their predecessors in order to remove unconditional branches.
+  class VISIBILITY_HIDDEN TailDup : public FunctionPass {
+    bool runOnFunction(Function &F);
+  public:
+    static char ID; // Pass identification, replacement for typeid
+    TailDup() : FunctionPass((intptr_t)&ID) {}
+
+  private:
+    // Returns true if eliminating this unconditional branch (by duplicating
+    // its target block) looks profitable.
+    inline bool shouldEliminateUnconditionalBranch(TerminatorInst *TI);
+    // Clones the successor's instructions into the branch's parent block.
+    inline void eliminateUnconditionalBranch(BranchInst *BI);
+  };
+  char TailDup::ID = 0;
+  RegisterPass<TailDup> X("tailduplicate", "Tail Duplication");
+}
+
+/// createTailDuplicationPass - Public interface to the Tail Duplication pass;
+/// callers own the returned pass object.
+FunctionPass *llvm::createTailDuplicationPass() {
+  return new TailDup();
+}
+
+/// runOnFunction - Top level algorithm - Loop over each unconditional branch in
+/// the function, eliminating it if it looks attractive enough.
+///
+bool TailDup::runOnFunction(Function &F) {
+  bool Changed = false;
+  for (Function::iterator I = F.begin(), E = F.end(); I != E; )
+    if (shouldEliminateUnconditionalBranch(I->getTerminator())) {
+      eliminateUnconditionalBranch(cast<BranchInst>(I->getTerminator()));
+      // Note: 'I' is deliberately NOT advanced here.  The block just acquired
+      // a new terminator (cloned from the old successor), which may itself be
+      // an eliminable unconditional branch, so re-examine the same block.
+      Changed = true;
+    } else {
+      ++I;
+    }
+  return Changed;
+}
+
+/// shouldEliminateUnconditionalBranch - Return true if this branch looks
+/// attractive to eliminate.  We eliminate the branch if the destination basic
+/// block contains at most 'Threshold' instructions (default 6, controlled by
+/// -taildup-threshold), not counting PHI nodes or debugger intrinsics.  Since
+/// one of the counted instructions is the terminator, this bounds the number
+/// of instructions added to the predecessor block.
+///
+/// We don't count PHI nodes in the count since they will be removed when the
+/// contents of the block are copied over.
+///
+bool TailDup::shouldEliminateUnconditionalBranch(TerminatorInst *TI) {
+  BranchInst *BI = dyn_cast<BranchInst>(TI);
+  if (!BI || !BI->isUnconditional()) return false;  // Not an uncond branch!
+
+  BasicBlock *Dest = BI->getSuccessor(0);
+  if (Dest == BI->getParent()) return false;        // Do not loop infinitely!
+
+  // Do not inline a block if we will just get another branch to the same block!
+  TerminatorInst *DTI = Dest->getTerminator();
+  if (BranchInst *DBI = dyn_cast<BranchInst>(DTI))
+    if (DBI->isUnconditional() && DBI->getSuccessor(0) == Dest)
+      return false;                                 // Do not loop infinitely!
+
+  // FIXME: DemoteRegToStack cannot yet demote invoke instructions to the stack,
+  // because doing so would require breaking critical edges.  This should be
+  // fixed eventually.
+  if (!DTI->use_empty())
+    return false;
+
+  // Do not bother working on dead blocks...
+  pred_iterator PI = pred_begin(Dest), PE = pred_end(Dest);
+  if (PI == PE && Dest != Dest->getParent()->begin())
+    return false;   // It's just a dead block, ignore it...
+
+  // Also, do not bother with blocks with only a single predecessor: simplify
+  // CFG will fold these two blocks together!
+  ++PI;
+  if (PI == PE) return false;  // Exactly one predecessor!
+
+  // Skip leading PHI nodes; they are not counted against the size limit.
+  BasicBlock::iterator I = Dest->begin();
+  while (isa<PHINode>(*I)) ++I;
+
+  for (unsigned Size = 0; I != Dest->end(); ++I) {
+    if (Size == Threshold) return false;  // The block is too large.
+    // Only count instructions that are not debugger intrinsics.
+    if (!isa<DbgInfoIntrinsic>(I)) ++Size;
+  }
+
+  // Do not tail duplicate a block that has thousands of successors into a block
+  // with a single successor if the block has many other predecessors.  This can
+  // cause an N^2 explosion in CFG edges (and PHI node entries), as seen in
+  // cases that have a large number of indirect gotos.
+  unsigned NumSuccs = DTI->getNumSuccessors();
+  if (NumSuccs > 8) {
+    unsigned TooMany = 128;
+    if (NumSuccs >= TooMany) return false;
+    // Cap the predecessor count so that (succs * preds) stays below ~128
+    // new CFG edges.
+    TooMany = TooMany/NumSuccs;
+    for (; PI != PE; ++PI)
+      if (TooMany-- == 0) return false;
+  }
+  
+  // Finally, if this unconditional branch is a fall-through, be careful about
+  // tail duplicating it.  In particular, we don't want to taildup it if the
+  // original block will still be there after taildup is completed: doing so
+  // would eliminate the fall-through, requiring unconditional branches.
+  Function::iterator DestI = Dest;
+  if (&*--DestI == BI->getParent()) {
+    // The uncond branch is a fall-through.  Tail duplication of the block
+    // will eliminate the fall-through-ness and end up cloning the terminator
+    // at the end of the Dest block.  Since the original Dest block will
+    // continue to exist, this means that one or the other will not be able to
+    // fall through.  One typical example that this helps with is code like:
+    // if (a)
+    //   foo();
+    // if (b)
+    //   foo();
+    // Cloning the 'if b' block into the end of the first foo block is messy.
+    
+    // The messy case is when the fall-through block falls through to other
+    // blocks.  This is what we would be preventing if we cloned the block.
+    DestI = Dest;
+    if (++DestI != Dest->getParent()->end()) {
+      BasicBlock *DestSucc = DestI;
+      // If any of Dest's successors are fall-throughs, don't do this xform.
+      for (succ_iterator SI = succ_begin(Dest), SE = succ_end(Dest);
+           SI != SE; ++SI)
+        if (*SI == DestSucc)
+          return false;
+    }
+  }
+
+  return true;
+}
+
+/// FindObviousSharedDomOf - We know there is a branch from SrcBlock to
+/// DstBlock, and that SrcBlock is not the only predecessor of DstBlock.  If we
+/// can find a predecessor of SrcBlock that is a dominator of both SrcBlock and
+/// DstBlock, return it.  Otherwise return null.
+static BasicBlock *FindObviousSharedDomOf(BasicBlock *SrcBlock,
+                                          BasicBlock *DstBlock) {
+  // SrcBlock must have a single predecessor.
+  pred_iterator PI = pred_begin(SrcBlock), PE = pred_end(SrcBlock);
+  if (PI == PE || ++PI != PE) return 0;
+
+  BasicBlock *SrcPred = *pred_begin(SrcBlock);
+
+  // Look at the predecessors of DstBlock.  One of them will be SrcBlock.  If
+  // there is only one other pred, get it, otherwise we can't handle it.
+  PI = pred_begin(DstBlock); PE = pred_end(DstBlock);
+  BasicBlock *DstOtherPred = 0;
+  if (*PI == SrcBlock) {
+    if (++PI == PE) return 0;
+    DstOtherPred = *PI;
+    if (++PI != PE) return 0;
+  } else {
+    DstOtherPred = *PI;
+    if (++PI == PE || *PI != SrcBlock || ++PI != PE) return 0;
+  }
+
+  // We can handle two situations here: "if then" and "if then else" blocks.  An
+  // 'if then' situation is just where DstOtherPred == SrcPred.
+  if (DstOtherPred == SrcPred)
+    return SrcPred;
+
+  // Check to see if we have an "if then else" situation, which means that
+  // DstOtherPred will have a single predecessor and it will be SrcPred.
+  PI = pred_begin(DstOtherPred); PE = pred_end(DstOtherPred);
+  if (PI != PE && *PI == SrcPred) {
+    if (++PI != PE) return 0;  // Not a single pred.
+    return SrcPred;  // It's an "if then else" situation.  Return the if.
+  }
+
+  // Otherwise, this is something we can't handle.
+  return 0;
+}
+
+
+/// eliminateUnconditionalBranch - Clone the instructions from the destination
+/// block into the source block, eliminating the specified unconditional branch.
+/// If the destination block defines values used by successors of the dest
+/// block, we may need to insert PHI nodes.
+///
+void TailDup::eliminateUnconditionalBranch(BranchInst *Branch) {
+  BasicBlock *SourceBlock = Branch->getParent();
+  BasicBlock *DestBlock = Branch->getSuccessor(0);
+  assert(SourceBlock != DestBlock && "Our predicate is broken!");
+
+  DOUT << "TailDuplication[" << SourceBlock->getParent()->getName()
+       << "]: Eliminating branch: " << *Branch;
+
+  // See if we can avoid duplicating code by moving it up to a dominator of both
+  // blocks.
+  if (BasicBlock *DomBlock = FindObviousSharedDomOf(SourceBlock, DestBlock)) {
+    DOUT << "Found shared dominator: " << DomBlock->getName() << "\n";
+
+    // If there are non-phi instructions in DestBlock that have no operands
+    // defined in DestBlock, and if the instruction has no side effects, we can
+    // move the instruction to DomBlock instead of duplicating it.
+    BasicBlock::iterator BBI = DestBlock->begin();
+    while (isa<PHINode>(BBI)) ++BBI;
+    while (!isa<TerminatorInst>(BBI)) {
+      Instruction *I = BBI++;
+
+      bool CanHoist = !I->isTrapping() && !I->mayWriteToMemory();
+      if (CanHoist) {
+        // Cannot hoist if an operand is defined in DestBlock itself, or is
+        // the result of an invoke in DomBlock (the invoke is DomBlock's
+        // terminator, so nothing can be inserted after it there).
+        for (unsigned op = 0, e = I->getNumOperands(); op != e; ++op)
+          if (Instruction *OpI = dyn_cast<Instruction>(I->getOperand(op)))
+            if (OpI->getParent() == DestBlock ||
+                (isa<InvokeInst>(OpI) && OpI->getParent() == DomBlock)) {
+              CanHoist = false;
+              break;
+            }
+        if (CanHoist) {
+          // Remove from DestBlock, move right before the term in DomBlock.
+          DestBlock->getInstList().remove(I);
+          DomBlock->getInstList().insert(DomBlock->getTerminator(), I);
+          DOUT << "Hoisted: " << *I;
+        }
+      }
+    }
+  }
+
+  // Tail duplication can not update SSA properties correctly if the values
+  // defined in the duplicated tail are used outside of the tail itself.  For
+  // this reason, we spill all values that are used outside of the tail to the
+  // stack.
+  for (BasicBlock::iterator I = DestBlock->begin(); I != DestBlock->end(); ++I)
+    for (Value::use_iterator UI = I->use_begin(), E = I->use_end(); UI != E;
+         ++UI) {
+      bool ShouldDemote = false;
+      if (cast<Instruction>(*UI)->getParent() != DestBlock) {
+        // We must allow our successors to use tail values in their PHI nodes
+        // (if the incoming value corresponds to the tail block).
+        if (PHINode *PN = dyn_cast<PHINode>(*UI)) {
+          for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+            if (PN->getIncomingValue(i) == I &&
+                PN->getIncomingBlock(i) != DestBlock) {
+              ShouldDemote = true;
+              break;
+            }
+
+        } else {
+          ShouldDemote = true;
+        }
+      } else if (PHINode *PN = dyn_cast<PHINode>(cast<Instruction>(*UI))) {
+        // If the user of this instruction is a PHI node in the current block,
+        // which has an entry from another block using the value, spill it.
+        for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i)
+          if (PN->getIncomingValue(i) == I &&
+              PN->getIncomingBlock(i) != DestBlock) {
+            ShouldDemote = true;
+            break;
+          }
+      }
+
+      if (ShouldDemote) {
+        // We found a use outside of the tail.  Create a new stack slot to
+        // break this inter-block usage pattern.
+        DemoteRegToStack(*I);
+        break;
+      }
+    }
+
+  // We are going to have to map operands from the original block B to the new
+  // copy of the block B'.  If there are PHI nodes in the DestBlock, these PHI
+  // nodes also define part of this mapping.  Loop over these PHI nodes, adding
+  // them to our mapping.
+  //
+  std::map<Value*, Value*> ValueMapping;
+
+  BasicBlock::iterator BI = DestBlock->begin();
+  bool HadPHINodes = isa<PHINode>(BI);
+  for (; PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+    ValueMapping[PN] = PN->getIncomingValueForBlock(SourceBlock);
+
+  // Clone the non-phi instructions of the dest block into the source block,
+  // keeping track of the mapping...
+  //
+  for (; BI != DestBlock->end(); ++BI) {
+    Instruction *New = BI->clone();
+    New->setName(BI->getName());
+    SourceBlock->getInstList().push_back(New);
+    ValueMapping[BI] = New;
+  }
+
+  // Now that we have built the mapping information and cloned all of the
+  // instructions (giving us a new terminator, among other things), walk the new
+  // instructions, rewriting references of old instructions to use new
+  // instructions.
+  //
+  BI = Branch; ++BI;  // Get an iterator to the first new instruction
+  for (; BI != SourceBlock->end(); ++BI)
+    for (unsigned i = 0, e = BI->getNumOperands(); i != e; ++i)
+      if (Value *Remapped = ValueMapping[BI->getOperand(i)])
+        BI->setOperand(i, Remapped);
+
+  // Next we check to see if any of the successors of DestBlock had PHI nodes.
+  // If so, we need to add entries to the PHI nodes for SourceBlock now.
+  for (succ_iterator SI = succ_begin(DestBlock), SE = succ_end(DestBlock);
+       SI != SE; ++SI) {
+    BasicBlock *Succ = *SI;
+    for (BasicBlock::iterator PNI = Succ->begin(); isa<PHINode>(PNI); ++PNI) {
+      PHINode *PN = cast<PHINode>(PNI);
+      // Ok, we have a PHI node.  Figure out what the incoming value was for the
+      // DestBlock.
+      Value *IV = PN->getIncomingValueForBlock(DestBlock);
+
+      // Remap the value if necessary...
+      if (Value *MappedIV = ValueMapping[IV])
+        IV = MappedIV;
+      PN->addIncoming(IV, SourceBlock);
+    }
+  }
+
+  // Next, remove the old branch instruction, and any PHI node entries that we
+  // had.  'BI' remains valid after the erase: it points at the first cloned
+  // instruction, which follows Branch in the list.
+  BI = Branch; ++BI;  // Get an iterator to the first new instruction
+  DestBlock->removePredecessor(SourceBlock); // Remove entries in PHI nodes...
+  SourceBlock->getInstList().erase(Branch);  // Destroy the uncond branch...
+
+  // Final step: now that we have finished everything up, walk the cloned
+  // instructions one last time, constant propagating and DCE'ing them, because
+  // they may not be needed anymore.
+  //
+  if (HadPHINodes)
+    while (BI != SourceBlock->end())
+      if (!dceInstruction(BI) && !doConstantPropagation(BI))
+        ++BI;
+
+  ++NumEliminated;  // We just killed a branch!
+}
diff --git a/lib/Transforms/Scalar/TailRecursionElimination.cpp b/lib/Transforms/Scalar/TailRecursionElimination.cpp
new file mode 100644
index 0000000..497b81f
--- /dev/null
+++ b/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -0,0 +1,462 @@
+//===- TailRecursionElimination.cpp - Eliminate Tail Calls ----------------===//
+//
+//                     The LLVM Compiler Infrastructure
+//
+// This file was developed by the LLVM research group and is distributed under
+// the University of Illinois Open Source License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file transforms calls of the current function (self recursion) followed
+// by a return instruction with a branch to the entry of the function, creating
+// a loop.  This pass also implements the following extensions to the basic
+// algorithm:
+//
+//  1. Trivial instructions between the call and return do not prevent the
+//     transformation from taking place, though currently the analysis cannot
+//     support moving any really useful instructions (only dead ones).
+//  2. This pass transforms functions that are prevented from being tail
+//     recursive by an associative expression to use an accumulator variable,
+//     thus compiling the typical naive factorial or 'fib' implementation into
+//     efficient code.
+//  3. TRE is performed if the function returns void, if the return
+//     returns the result returned by the call, or if the function returns a
+//     run-time constant on all exits from the function.  It is possible, though
+//     unlikely, that the return returns something else (like constant 0), and
+//     can still be TRE'd.  It can be TRE'd if ALL OTHER return instructions in
+//     the function return the exact same value.
+//  4. If it can prove that callees do not access their caller stack frame,
+//     they are marked as eligible for tail call elimination (by the code
+//     generator).
+//
+// There are several improvements that could be made:
+//
+//  1. If the function has any alloca instructions, these instructions will be
+//     moved out of the entry block of the function, causing them to be
+//     evaluated each time through the tail recursion.  Safely keeping allocas
+//     in the entry block requires analysis to prove that the tail-called
+//     function does not read or write the stack object.
+//  2. Tail recursion is only performed if the call immediately precedes the
+//     return instruction.  It's possible that there could be a jump between
+//     the call and the return.
+//  3. There can be intervening operations between the call and the return that
+//     prevent the TRE from occurring.  For example, there could be GEP's and
+//     stores to memory that will not be read or written by the call.  This
+//     requires some substantial analysis (such as with DSA) to prove safe to
+//     move ahead of the call, but doing so could allow many more TREs to be
+//     performed, for example in TreeAdd/TreeAlloc from the treeadd benchmark.
+//  4. The algorithm we use to detect if callees access their caller stack
+//     frames is very primitive.
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "tailcallelim"
+#include "llvm/Transforms/Scalar.h"
+#include "llvm/Constants.h"
+#include "llvm/DerivedTypes.h"
+#include "llvm/Function.h"
+#include "llvm/Instructions.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CFG.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Support/Compiler.h"
+using namespace llvm;
+
+STATISTIC(NumEliminated, "Number of tail calls removed");
+STATISTIC(NumAccumAdded, "Number of accumulators introduced");
+
+namespace {
+  /// TailCallElim - Transform self-recursive tail calls into loops, optionally
+  /// introducing an accumulator variable for associative operations that
+  /// follow the call.
+  struct VISIBILITY_HIDDEN TailCallElim : public FunctionPass {
+    static char ID; // Pass identification, replacement for typeid
+    TailCallElim() : FunctionPass((intptr_t)&ID) {}
+
+    virtual bool runOnFunction(Function &F);
+
+  private:
+    // Attempt to turn the tail call (if any) ending in return 'RI' into a
+    // branch back to the entry, creating OldEntry/ArgumentPHIs on first use.
+    bool ProcessReturningBlock(ReturnInst *RI, BasicBlock *&OldEntry,
+                               bool &TailCallsAreMarkedTail,
+                               std::vector<PHINode*> &ArgumentPHIs,
+                               bool CannotTailCallElimCallsMarkedTail);
+    // True if 'I' can safely be hoisted above call 'CI'.
+    bool CanMoveAboveCall(Instruction *I, CallInst *CI);
+    // If 'I' is an accumulator-style associative op on CI's result, return the
+    // accumulator's initial value; otherwise null.
+    Value *CanTransformAccumulatorRecursion(Instruction *I, CallInst *CI);
+  };
+  char TailCallElim::ID = 0;
+  RegisterPass<TailCallElim> X("tailcallelim", "Tail Call Elimination");
+}
+
+// Public interface to the TailCallElimination pass; callers own the returned
+// pass object.
+FunctionPass *llvm::createTailCallEliminationPass() {
+  return new TailCallElim();
+}
+
+
+/// AllocaMightEscapeToCalls - Return true if this alloca may be accessed by
+/// callees of this function.  We only do very simple analysis right now, this
+/// could be expanded in the future to use mod/ref information for particular
+/// call sites if desired.
+static bool AllocaMightEscapeToCalls(AllocaInst *AI) {
+  // FIXME: do simple 'address taken' analysis.
+  // Conservatively assume every alloca escapes until such analysis exists.
+  return true;
+}
+
+/// CheckForEscapingAllocas - Scan the specified basic block for alloca
+/// instructions.  If it contains any that might be accessed by calls, return
+/// true.  Also sets CannotTCETailMarkedCall when an alloca is found outside
+/// the entry block or has a non-constant size.
+static bool CheckForEscapingAllocas(BasicBlock *BB,
+                                    bool &CannotTCETailMarkedCall) {
+  bool RetVal = false;
+  for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+    if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+      RetVal |= AllocaMightEscapeToCalls(AI);
+
+      // If this alloca is in the body of the function, or if it is a variable
+      // sized allocation, we cannot tail call eliminate calls marked 'tail'
+      // with this mechanism.
+      if (BB != &BB->getParent()->getEntryBlock() ||
+          !isa<ConstantInt>(AI->getArraySize()))
+        CannotTCETailMarkedCall = true;
+    }
+  return RetVal;
+}
+
+bool TailCallElim::runOnFunction(Function &F) {
+  // If this function is a varargs function, we won't be able to PHI the args
+  // right, so don't even try to convert it...
+  if (F.getFunctionType()->isVarArg()) return false;
+
+  BasicBlock *OldEntry = 0;               // Lazily-created loop header.
+  bool TailCallsAreMarkedTail = false;
+  std::vector<PHINode*> ArgumentPHIs;     // One PHI per function argument.
+  bool MadeChange = false;
+
+  bool FunctionContainsEscapingAllocas = false;
+
+  // CannotTCETailMarkedCall - If true, we cannot perform TCE on tail calls
+  // marked with the 'tail' attribute, because doing so would cause the stack
+  // size to increase (real TCE would deallocate variable sized allocas, TCE
+  // doesn't).
+  bool CannotTCETailMarkedCall = false;
+
+  // Loop over the function, looking for any returning blocks, and keeping track
+  // of whether this function has any non-trivially used allocas.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
+    // Stop scanning once both facts are established.
+    if (FunctionContainsEscapingAllocas && CannotTCETailMarkedCall)
+      break;
+
+    FunctionContainsEscapingAllocas |=
+      CheckForEscapingAllocas(BB, CannotTCETailMarkedCall);
+  }
+  
+  /// FIXME: The code generator produces really bad code when an 'escaping
+  /// alloca' is changed from being a static alloca to being a dynamic alloca.
+  /// Until this is resolved, disable this transformation if that would ever
+  /// happen.  This bug is PR962.
+  if (FunctionContainsEscapingAllocas)
+    return false;
+  
+
+  // Second pass, change any tail calls to loops.
+  for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+    if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator()))
+      MadeChange |= ProcessReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
+                                          ArgumentPHIs,CannotTCETailMarkedCall);
+
+  // If we eliminated any tail recursions, it's possible that we inserted some
+  // silly PHI nodes which just merge an initial value (the incoming operand)
+  // with themselves.  Check to see if we did and clean up our mess if so.  This
+  // occurs when a function passes an argument straight through to its tail
+  // call.
+  if (!ArgumentPHIs.empty()) {
+    for (unsigned i = 0, e = ArgumentPHIs.size(); i != e; ++i) {
+      PHINode *PN = ArgumentPHIs[i];
+
+      // If the PHI Node is a dynamic constant, replace it with the value it is.
+      if (Value *PNV = PN->hasConstantValue()) {
+        PN->replaceAllUsesWith(PNV);
+        PN->eraseFromParent();
+      }
+    }
+  }
+
+  // Finally, if this function contains no non-escaping allocas, mark all calls
+  // in the function as eligible for tail calls (there is no stack memory for
+  // them to access).  (Always true at this point due to the PR962 early-exit
+  // above; the guard is kept for when that workaround is removed.)
+  if (!FunctionContainsEscapingAllocas)
+    for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB)
+      for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I)
+        if (CallInst *CI = dyn_cast<CallInst>(I))
+          CI->setTailCall();
+
+  return MadeChange;
+}
+
+
+/// CanMoveAboveCall - Return true if it is safe to move the specified
+/// instruction from after the call to before the call, assuming that all
+/// instructions between the call and this instruction are movable.
+///
+bool TailCallElim::CanMoveAboveCall(Instruction *I, CallInst *CI) {
+  // FIXME: We can move load/store/call/free instructions above the call if the
+  // call does not mod/ref the memory location being processed.
+  // Loads are rejected here because the call may write the memory they read.
+  if (I->mayWriteToMemory() || isa<LoadInst>(I))
+    return false;
+
+  // Otherwise, if this is a side-effect free instruction, check to make sure
+  // that it does not use the return value of the call.  If it doesn't use the
+  // return value of the call, it must only use things that are defined before
+  // the call, or movable instructions between the call and the instruction
+  // itself.
+  for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
+    if (I->getOperand(i) == CI)
+      return false;
+  return true;
+}
+
+// isDynamicConstant - Return true if the specified value is the same when the
+// return would exit as it was when the initial iteration of the recursive
+// function was executed.
+//
+// We currently handle static constants and arguments that are not modified as
+// part of the recursion.
+//
+static bool isDynamicConstant(Value *V, CallInst *CI) {
+  if (isa<Constant>(V)) return true; // Static constants are always dyn consts
+
+  // Check to see if this is an immutable argument, if so, the value
+  // will be available to initialize the accumulator.
+  if (Argument *Arg = dyn_cast<Argument>(V)) {
+    // Figure out which argument number this is...
+    unsigned ArgNo = 0;
+    Function *F = CI->getParent()->getParent();
+    for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
+      ++ArgNo;
+
+    // If we are passing this argument into call as the corresponding
+    // argument operand, then the argument is dynamically constant.
+    // Otherwise, we cannot transform this function safely.
+    // (Operand 0 of the call is the callee, so argument i is operand i+1.)
+    if (CI->getOperand(ArgNo+1) == Arg)
+      return true;
+  }
+  // Not a constant or immutable argument, we can't safely transform.
+  return false;
+}
+
+// getCommonReturnValue - Check to see if the function containing the specified
+// return instruction and tail call consistently returns the same
+// runtime-constant value at all exit points.  If so, return the returned value.
+//
+// NOTE(review): this reads RI->getOperand(0) for every other return, so it
+// assumes the function does not return void — callers appear to ensure this;
+// confirm before reusing elsewhere.
+//
+static Value *getCommonReturnValue(ReturnInst *TheRI, CallInst *CI) {
+  Function *F = TheRI->getParent()->getParent();
+  Value *ReturnedValue = 0;
+
+  for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+    if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+      if (RI != TheRI) {
+        Value *RetOp = RI->getOperand(0);
+
+        // We can only perform this transformation if the value returned is
+        // evaluatable at the start of the initial invocation of the function,
+        // instead of at the end of the evaluation.
+        //
+        if (!isDynamicConstant(RetOp, CI))
+          return 0;
+
+        if (ReturnedValue && RetOp != ReturnedValue)
+          return 0;     // Cannot transform if differing values are returned.
+        ReturnedValue = RetOp;
+      }
+  return ReturnedValue;
+}
+
+/// CanTransformAccumulatorRecursion - If the specified instruction can be
+/// transformed using accumulator recursion elimination, return the constant
+/// which is the start of the accumulator value.  Otherwise return null.
+///
+Value *TailCallElim::CanTransformAccumulatorRecursion(Instruction *I,
+                                                      CallInst *CI) {
+  // Only associative operations can be reassociated through an accumulator.
+  if (!I->isAssociative()) return 0;
+  assert(I->getNumOperands() == 2 &&
+         "Associative operations should have 2 args!");
+
+  // Exactly one operand should be the result of the call instruction...
+  // (Parenthesized explicitly; '&&' binds tighter than '||'.)
+  if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
+      (I->getOperand(0) != CI && I->getOperand(1) != CI))
+    return 0;
+
+  // The only user of this instruction we allow is a single return instruction.
+  if (!I->hasOneUse() || !isa<ReturnInst>(I->use_back()))
+    return 0;
+
+  // Ok, now we have to check all of the other return instructions in this
+  // function.  If they return non-constants or differing values, then we cannot
+  // transform the function safely.
+  return getCommonReturnValue(cast<ReturnInst>(I->use_back()), CI);
+}
+
+bool TailCallElim::ProcessReturningBlock(ReturnInst *Ret, BasicBlock *&OldEntry,
+                                         bool &TailCallsAreMarkedTail,
+                                         std::vector<PHINode*> &ArgumentPHIs,
+                                       bool CannotTailCallElimCallsMarkedTail) {
+  BasicBlock *BB = Ret->getParent();
+  Function *F = BB->getParent();
+
+  if (&BB->front() == Ret) // Make sure there is something before the ret...
+    return false;
+
+  // Scan backwards from the return, checking to see if there is a tail call in
+  // this block.  If so, set CI to it.
+  CallInst *CI;
+  BasicBlock::iterator BBI = Ret;
+  while (1) {
+    CI = dyn_cast<CallInst>(BBI);
+    if (CI && CI->getCalledFunction() == F)
+      break;
+
+    if (BBI == BB->begin())
+      return false;          // Didn't find a potential tail call.
+    --BBI;
+  }
+
+  // If this call is marked as a tail call, and if there are dynamic allocas in
+  // the function, we cannot perform this optimization.
+  if (CI->isTailCall() && CannotTailCallElimCallsMarkedTail)
+    return false;
+
+  // If we are introducing accumulator recursion to eliminate associative
+  // operations after the call instruction, this variable contains the initial
+  // value for the accumulator.  If this value is set, we actually perform
+  // accumulator recursion elimination instead of simple tail recursion
+  // elimination.
+  Value *AccumulatorRecursionEliminationInitVal = 0;
+  Instruction *AccumulatorRecursionInstr = 0;
+
+  // Ok, we found a potential tail call.  We can currently only transform the
+  // tail call if all of the instructions between the call and the return are
+  // movable to above the call itself, leaving the call next to the return.
+  // Check that this is the case now.
+  for (BBI = CI, ++BBI; &*BBI != Ret; ++BBI)
+    if (!CanMoveAboveCall(BBI, CI)) {
+      // If we can't move the instruction above the call, it might be because it
+      // is an associative operation that could be tranformed using accumulator
+      // recursion elimination.  Check to see if this is the case, and if so,
+      // remember the initial accumulator value for later.
+      if ((AccumulatorRecursionEliminationInitVal =
+                             CanTransformAccumulatorRecursion(BBI, CI))) {
+        // Yes, this is accumulator recursion.  Remember which instruction
+        // accumulates.
+        AccumulatorRecursionInstr = BBI;
+      } else {
+        return false;   // Otherwise, we cannot eliminate the tail recursion!
+      }
+    }
+
+  // We can only transform call/return pairs that either ignore the return value
+  // of the call and return void, ignore the value of the call and return a
+  // constant, return the value returned by the tail call, or that are being
+  // accumulator recursion variable eliminated.
+  if (Ret->getNumOperands() != 0 && Ret->getReturnValue() != CI &&
+      !isa<UndefValue>(Ret->getReturnValue()) &&
+      AccumulatorRecursionEliminationInitVal == 0 &&
+      !getCommonReturnValue(Ret, CI))
+    return false;
+
+  // OK! We can transform this tail call.  If this is the first one found,
+  // create the new entry block, allowing us to branch back to the old entry.
+  if (OldEntry == 0) {
+    OldEntry = &F->getEntryBlock();
+    BasicBlock *NewEntry = new BasicBlock("", F, OldEntry);
+    NewEntry->takeName(OldEntry);
+    OldEntry->setName("tailrecurse");
+    new BranchInst(OldEntry, NewEntry);
+
+    // If this tail call is marked 'tail' and if there are any allocas in the
+    // entry block, move them up to the new entry block.
+    TailCallsAreMarkedTail = CI->isTailCall();
+    if (TailCallsAreMarkedTail)
+      // Move all fixed sized allocas from OldEntry to NewEntry.
+      for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
+             NEBI = NewEntry->begin(); OEBI != E; )
+        if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+          if (isa<ConstantInt>(AI->getArraySize()))
+            AI->moveBefore(NEBI);
+
+    // Now that we have created a new block, which jumps to the entry
+    // block, insert a PHI node for each argument of the function.
+    // For now, we initialize each PHI to only have the real arguments
+    // which are passed in.
+    Instruction *InsertPos = OldEntry->begin();
+    for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
+         I != E; ++I) {
+      PHINode *PN = new PHINode(I->getType(), I->getName()+".tr", InsertPos);
+      I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+      PN->addIncoming(I, NewEntry);
+      ArgumentPHIs.push_back(PN);
+    }
+  }
+
+  // If this function has self recursive calls in the tail position where some
+  // are marked tail and some are not, only transform one flavor or another.  We
+  // have to choose whether we move allocas in the entry block to the new entry
+  // block or not, so we can't make a good choice for both.  NOTE: We could do
+  // slightly better here in the case that the function has no entry block
+  // allocas.
+  if (TailCallsAreMarkedTail && !CI->isTailCall())
+    return false;
+
+  // Ok, now that we know we have a pseudo-entry block WITH all of the
+  // required PHI nodes, add entries into the PHI node for the actual
+  // parameters passed into the tail-recursive call.
+  for (unsigned i = 0, e = CI->getNumOperands()-1; i != e; ++i)
+    ArgumentPHIs[i]->addIncoming(CI->getOperand(i+1), BB);
+
+  // If we are introducing an accumulator variable to eliminate the recursion,
+  // do so now.  Note that we _know_ that no subsequent tail recursion
+  // eliminations will happen on this function because of the way the
+  // accumulator recursion predicate is set up.
+  //
+  if (AccumulatorRecursionEliminationInitVal) {
+    Instruction *AccRecInstr = AccumulatorRecursionInstr;
+    // Start by inserting a new PHI node for the accumulator.
+    PHINode *AccPN = new PHINode(AccRecInstr->getType(), "accumulator.tr",
+                                 OldEntry->begin());
+
+    // Loop over all of the predecessors of the tail recursion block.  For the
+    // real entry into the function we seed the PHI with the initial value,
+    // computed earlier.  For any other existing branches to this block (due to
+    // other tail recursions eliminated) the accumulator is not modified.
+    // Because we haven't added the branch in the current block to OldEntry yet,
+    // it will not show up as a predecessor.
+    for (pred_iterator PI = pred_begin(OldEntry), PE = pred_end(OldEntry);
+         PI != PE; ++PI) {
+      if (*PI == &F->getEntryBlock())
+        AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, *PI);
+      else
+        AccPN->addIncoming(AccPN, *PI);
+    }
+
+    // Add an incoming argument for the current block, which is computed by our
+    // associative accumulator instruction.
+    AccPN->addIncoming(AccRecInstr, BB);
+
+    // Next, rewrite the accumulator recursion instruction so that it does not
+    // use the result of the call anymore, instead, use the PHI node we just
+    // inserted.
+    AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+
+    // Finally, rewrite any return instructions in the program to return the PHI
+    // node instead of the "initval" that they do currently.  This loop will
+    // actually rewrite the return value we are destroying, but that's ok.
+    for (Function::iterator BBI = F->begin(), E = F->end(); BBI != E; ++BBI)
+      if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI->getTerminator()))
+        RI->setOperand(0, AccPN);
+    ++NumAccumAdded;
+  }
+
+  // Now that all of the PHI nodes are in place, remove the call and
+  // ret instructions, replacing them with an unconditional branch.
+  new BranchInst(OldEntry, Ret);
+  BB->getInstList().erase(Ret);  // Remove return.
+  BB->getInstList().erase(CI);   // Remove call.
+  ++NumEliminated;
+  return true;
+}