| //===- LoopStrengthReduce.cpp - Strength Reduce IVs in Loops --------------===// |
| // |
| // The LLVM Compiler Infrastructure |
| // |
| // This file is distributed under the University of Illinois Open Source |
| // License. See LICENSE.TXT for details. |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This transformation analyzes and transforms the induction variables (and |
| // computations derived from them) into forms suitable for efficient execution |
| // on the target. |
| // |
// This pass performs strength reduction on array references inside loops that
// involve the loop induction variable in one or more of their components. It
// rewrites expressions to take advantage of scaled-index addressing modes
// available on the target, and it performs a variety of other optimizations
// related to loop induction variables.
| // |
| // Terminology note: this code has a lot of handling for "post-increment" or |
| // "post-inc" users. This is not talking about post-increment addressing modes; |
| // it is instead talking about code like this: |
| // |
| // %i = phi [ 0, %entry ], [ %i.next, %latch ] |
| // ... |
| // %i.next = add %i, 1 |
| // %c = icmp eq %i.next, %n |
| // |
| // The SCEV for %i is {0,+,1}<%L>. The SCEV for %i.next is {1,+,1}<%L>, however |
| // it's useful to think about these as the same register, with some uses using |
// the value of the register before the add and some using it after. In this
| // example, the icmp is a post-increment user, since it uses %i.next, which is |
| // the value of the induction variable after the increment. The other common |
| // case of post-increment users is users outside the loop. |
| // |
| // TODO: More sophistication in the way Formulae are generated. |
| // |
| // TODO: Handle multiple loops at a time. |
| // |
| // TODO: test/CodeGen/X86/full-lsr.ll should get full lsr. The problem is |
| // that {0,+,1}<%bb> is getting picked first because all 7 uses can |
| // use it, and while it's a pretty good solution, it means that LSR |
| // doesn't look further to find an even better solution. |
| // |
| // TODO: Should TargetLowering::AddrMode::BaseGV be changed to a ConstantExpr |
| // instead of a GlobalValue? |
| // |
| // TODO: When truncation is free, truncate ICmp users' operands to make it a |
| // smaller encoding (on x86 at least). |
| // |
| // TODO: When a negated register is used by an add (such as in a list of |
| // multiple base registers, or as the increment expression in an addrec), |
| // we may not actually need both reg and (-1 * reg) in registers; the |
| // negation can be implemented by using a sub instead of an add. The |
| // lack of support for taking this into consideration when making |
| // register pressure decisions is partly worked around by the "Special" |
| // use kind. |
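//       For illustration (hypothetical IR), instead of materializing the
//       negation in a separate register:
//
//         %neg = mul i64 %reg, -1
//         %val = add i64 %base, %neg
//
//       the same value can be computed as:
//
//         %val = sub i64 %base, %reg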
| // |
| //===----------------------------------------------------------------------===// |
| |
| #define DEBUG_TYPE "loop-reduce" |
| #include "llvm/Transforms/Scalar.h" |
| #include "llvm/Constants.h" |
| #include "llvm/Instructions.h" |
| #include "llvm/IntrinsicInst.h" |
| #include "llvm/DerivedTypes.h" |
| #include "llvm/Analysis/IVUsers.h" |
| #include "llvm/Analysis/Dominators.h" |
| #include "llvm/Analysis/LoopPass.h" |
| #include "llvm/Analysis/ScalarEvolutionExpander.h" |
| #include "llvm/Transforms/Utils/BasicBlockUtils.h" |
| #include "llvm/Transforms/Utils/Local.h" |
| #include "llvm/ADT/SmallBitVector.h" |
| #include "llvm/ADT/SetVector.h" |
| #include "llvm/Support/Debug.h" |
| #include "llvm/Support/ValueHandle.h" |
| #include "llvm/Support/raw_ostream.h" |
| #include "llvm/Target/TargetLowering.h" |
| #include <algorithm> |
| using namespace llvm; |
| |
| namespace { |
| |
// Constant strides come first and are sorted in turn by their absolute
// values. If the absolute values are equal, positive strides come first.
| // e.g. |
| // 4, -1, X, 1, 2 ==> 1, -1, 2, 4, X |
| struct StrideCompare { |
| const ScalarEvolution &SE; |
| explicit StrideCompare(const ScalarEvolution &se) : SE(se) {} |
| |
| bool operator()(const SCEV *const &LHS, const SCEV *const &RHS) const { |
| const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(LHS); |
| const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS); |
| if (LHSC && RHSC) { |
| unsigned BitWidth = std::max(SE.getTypeSizeInBits(LHS->getType()), |
| SE.getTypeSizeInBits(RHS->getType())); |
| APInt LV = LHSC->getValue()->getValue(); |
| APInt RV = RHSC->getValue()->getValue(); |
| LV.sextOrTrunc(BitWidth); |
| RV.sextOrTrunc(BitWidth); |
| APInt ALV = LV.abs(); |
| APInt ARV = RV.abs(); |
| if (ALV == ARV) { |
| if (LV != RV) |
| return LV.sgt(RV); |
| } else { |
| return ALV.ult(ARV); |
| } |
| |
| // If it's the same value but different type, sort by bit width so |
| // that we emit larger induction variables before smaller |
| // ones, letting the smaller be re-written in terms of larger ones. |
| return SE.getTypeSizeInBits(RHS->getType()) < |
| SE.getTypeSizeInBits(LHS->getType()); |
| } |
| return LHSC && !RHSC; |
| } |
| }; |
| |
| /// RegSortData - This class holds data which is used to order reuse |
| /// candidates. |
| class RegSortData { |
| public: |
| /// Bits - This represents the set of LSRUses (by index) which reference a |
| /// particular register. |
| SmallBitVector Bits; |
| |
| /// MaxComplexity - This represents the greatest complexity (see the comments |
| /// on Formula::getComplexity) seen with a particular register. |
| uint32_t MaxComplexity; |
| |
| /// Index - This holds an arbitrary value used as a last-resort tie breaker |
| /// to ensure deterministic behavior. |
| unsigned Index; |
| |
| RegSortData() {} |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| }; |
| |
| } |
| |
| void RegSortData::print(raw_ostream &OS) const { |
| OS << "[NumUses=" << Bits.count() |
| << ", MaxComplexity="; |
| OS.write_hex(MaxComplexity); |
| OS << ", Index=" << Index << ']'; |
| } |
| |
| void RegSortData::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| namespace { |
| |
| /// RegCount - This is a helper class to sort a given set of registers |
| /// according to associated RegSortData values. |
| class RegCount { |
| public: |
| const SCEV *Reg; |
| RegSortData Sort; |
| |
| RegCount(const SCEV *R, const RegSortData &RSD) |
| : Reg(R), Sort(RSD) {} |
| |
| // Sort by count. Returning true means the register is preferred. |
| bool operator<(const RegCount &Other) const { |
| // Sort by the number of unique uses of this register. |
| unsigned A = Sort.Bits.count(); |
| unsigned B = Other.Sort.Bits.count(); |
| if (A != B) return A > B; |
| |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { |
| const SCEVAddRecExpr *BR = dyn_cast<SCEVAddRecExpr>(Other.Reg); |
| // AddRecs have higher priority than other things. |
| if (!BR) return true; |
| |
| // Prefer affine values. |
| if (AR->isAffine() != BR->isAffine()) |
| return AR->isAffine(); |
| |
| const Loop *AL = AR->getLoop(); |
| const Loop *BL = BR->getLoop(); |
| if (AL != BL) { |
| unsigned ADepth = AL->getLoopDepth(); |
| unsigned BDepth = BL->getLoopDepth(); |
| // Prefer a less deeply nested addrec. |
| if (ADepth != BDepth) |
| return ADepth < BDepth; |
| |
| // Different loops at the same depth; do something arbitrary. |
| BasicBlock *AH = AL->getHeader(); |
| BasicBlock *BH = BL->getHeader(); |
| for (Function::iterator I = AH, E = AH->getParent()->end(); I != E; ++I) |
| if (&*I == BH) return true; |
| return false; |
| } |
| |
| // Sort addrecs by stride. |
| const SCEV *AStep = AR->getOperand(1); |
| const SCEV *BStep = BR->getOperand(1); |
| if (AStep != BStep) { |
| if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStep)) { |
| const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStep); |
| if (!BC) return true; |
| // Arbitrarily prefer wider registers. |
| if (AC->getValue()->getValue().getBitWidth() != |
| BC->getValue()->getValue().getBitWidth()) |
| return AC->getValue()->getValue().getBitWidth() > |
| BC->getValue()->getValue().getBitWidth(); |
| // Ignore the sign bit, assuming that striding by a negative value |
| // is just as easy as by a positive value. |
| // Prefer the addrec with the lesser absolute value stride, as it |
| // will allow uses to have simpler addressing modes. |
| return AC->getValue()->getValue().abs() |
| .ult(BC->getValue()->getValue().abs()); |
| } |
| } |
| |
| // Then sort by the register which will permit the simplest uses. |
| // This is a heuristic; currently we only track the most complex use as a |
| // representative. |
| if (Sort.MaxComplexity != Other.Sort.MaxComplexity) |
| return Sort.MaxComplexity < Other.Sort.MaxComplexity; |
| |
| // Then sort them by their start values. |
| const SCEV *AStart = AR->getStart(); |
| const SCEV *BStart = BR->getStart(); |
| if (AStart != BStart) { |
| if (const SCEVConstant *AC = dyn_cast<SCEVConstant>(AStart)) { |
| const SCEVConstant *BC = dyn_cast<SCEVConstant>(BStart); |
| if (!BC) return true; |
| // Arbitrarily prefer wider registers. |
| if (AC->getValue()->getValue().getBitWidth() != |
| BC->getValue()->getValue().getBitWidth()) |
| return AC->getValue()->getValue().getBitWidth() > |
| BC->getValue()->getValue().getBitWidth(); |
| // Prefer positive over negative if the absolute values are the same. |
| if (AC->getValue()->getValue().abs() == |
| BC->getValue()->getValue().abs()) |
| return AC->getValue()->getValue().isStrictlyPositive(); |
| // Prefer the addrec with the lesser absolute value start. |
| return AC->getValue()->getValue().abs() |
| .ult(BC->getValue()->getValue().abs()); |
| } |
| } |
| } else { |
| // AddRecs have higher priority than other things. |
| if (isa<SCEVAddRecExpr>(Other.Reg)) return false; |
| // Sort by the register which will permit the simplest uses. |
| // This is a heuristic; currently we only track the most complex use as a |
| // representative. |
| if (Sort.MaxComplexity != Other.Sort.MaxComplexity) |
| return Sort.MaxComplexity < Other.Sort.MaxComplexity; |
| } |
| // Tie-breaker: the arbitrary index, to ensure a reliable ordering. |
| return Sort.Index < Other.Sort.Index; |
| } |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| }; |
| |
| } |
| |
| void RegCount::print(raw_ostream &OS) const { |
| OS << *Reg << ':'; |
| Sort.print(OS); |
| } |
| |
| void RegCount::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| namespace { |
| |
| /// Formula - This class holds information that describes a formula for |
| /// satisfying a use. It may include broken-out immediates and scaled registers. |
| struct Formula { |
| /// AM - This is used to represent complex addressing, as well as other kinds |
| /// of interesting uses. |
| TargetLowering::AddrMode AM; |
| |
| /// BaseRegs - The list of "base" registers for this use. When this is |
| /// non-empty, AM.HasBaseReg should be set to true. |
| SmallVector<const SCEV *, 2> BaseRegs; |
| |
| /// ScaledReg - The 'scaled' register for this use. This should be non-null |
| /// when AM.Scale is not zero. |
| const SCEV *ScaledReg; |
| |
| Formula() : ScaledReg(0) {} |
| |
| unsigned getNumRegs() const; |
| uint32_t getComplexity() const; |
| const Type *getType() const; |
| |
| void InitialMatch(const SCEV *S, Loop *L, |
| ScalarEvolution &SE, DominatorTree &DT); |
| |
| /// referencesReg - Test if this formula references the given register. |
| bool referencesReg(const SCEV *S) const { |
| return S == ScaledReg || |
| std::find(BaseRegs.begin(), BaseRegs.end(), S) != BaseRegs.end(); |
| } |
| |
| bool operator==(const Formula &Other) const { |
| return BaseRegs == Other.BaseRegs && |
| ScaledReg == Other.ScaledReg && |
| AM.Scale == Other.AM.Scale && |
| AM.BaseOffs == Other.AM.BaseOffs && |
| AM.BaseGV == Other.AM.BaseGV; |
| } |
| |
  // This sorts at least partially based on host pointer values, which are
  // not deterministic, so it is only usable for uniquification.
| bool operator<(const Formula &Other) const { |
| if (BaseRegs != Other.BaseRegs) |
| return BaseRegs < Other.BaseRegs; |
| if (ScaledReg != Other.ScaledReg) |
| return ScaledReg < Other.ScaledReg; |
| if (AM.Scale != Other.AM.Scale) |
| return AM.Scale < Other.AM.Scale; |
| if (AM.BaseOffs != Other.AM.BaseOffs) |
| return AM.BaseOffs < Other.AM.BaseOffs; |
| if (AM.BaseGV != Other.AM.BaseGV) |
| return AM.BaseGV < Other.AM.BaseGV; |
| return false; |
| } |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| }; |
| |
| } |
| |
| /// getNumRegs - Return the total number of register operands used by this |
| /// formula. |
| unsigned Formula::getNumRegs() const { |
| return !!ScaledReg + BaseRegs.size(); |
| } |
| |
| /// getComplexity - Return an oversimplified value indicating the complexity |
| /// of this formula. This is used as a tie-breaker in choosing register |
| /// preferences. |
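/// For instance (a hand-worked example), a formula with one base register,
/// a scaled register with AM.Scale == 4, and AM.BaseOffs == -8 (and no
/// BaseGV) encodes as (2 << 17) | (3 << 9) | (4 << 1) | 1 == 0x40609.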
| uint32_t Formula::getComplexity() const { |
| // Encode the information in a uint32_t so that comparing with operator< |
| // will be interesting. |
| return |
| // Most significant, the number of registers. This saturates because we |
| // need the bits, and because beyond a few hundred it doesn't really matter. |
| (std::min(getNumRegs(), (1u<<15)-1) << 17) | |
| // Having multiple base regs is worse than having a base reg and a scale. |
| ((BaseRegs.size() > 1) << 16) | |
| // Scale absolute value. |
| ((AM.Scale != 0 ? (Log2_64(abs64(AM.Scale)) + 1) : 0u) << 9) | |
| // Scale sign, which is less significant than the absolute value. |
| ((AM.Scale < 0) << 8) | |
| // Offset absolute value. |
| ((AM.BaseOffs != 0 ? (Log2_64(abs64(AM.BaseOffs)) + 1) : 0u) << 1) | |
| // If a GV is present, treat it like a maximal offset. |
| ((AM.BaseGV ? ((1u<<7)-1) : 0) << 1) | |
| // Offset sign, which is less significant than the absolute offset. |
| ((AM.BaseOffs < 0) << 0); |
| } |
| |
| /// getType - Return the type of this formula, if it has one, or null |
| /// otherwise. This type is meaningless except for the bit size. |
| const Type *Formula::getType() const { |
| return !BaseRegs.empty() ? BaseRegs.front()->getType() : |
| ScaledReg ? ScaledReg->getType() : |
| AM.BaseGV ? AM.BaseGV->getType() : |
| 0; |
| } |
| |
| namespace { |
| |
| /// ComplexitySorter - A predicate which orders Formulae by the number of |
| /// registers they contain. |
| struct ComplexitySorter { |
| bool operator()(const Formula &LHS, const Formula &RHS) const { |
| unsigned L = LHS.getNumRegs(); |
| unsigned R = RHS.getNumRegs(); |
| if (L != R) return L < R; |
| |
| return LHS.getComplexity() < RHS.getComplexity(); |
| } |
| }; |
| |
| } |
| |
/// DoInitialMatch - Recursion helper for InitialMatch.
| static void DoInitialMatch(const SCEV *S, Loop *L, |
| SmallVectorImpl<const SCEV *> &Good, |
| SmallVectorImpl<const SCEV *> &Bad, |
| ScalarEvolution &SE, DominatorTree &DT) { |
| // Collect expressions which properly dominate the loop header. |
| if (S->properlyDominates(L->getHeader(), &DT)) { |
| Good.push_back(S); |
| return; |
| } |
| |
| // Look at add operands. |
| if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { |
| for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); |
| I != E; ++I) |
| DoInitialMatch(*I, L, Good, Bad, SE, DT); |
| return; |
| } |
| |
| // Look at addrec operands. |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { |
| if (!AR->getStart()->isZero()) { |
| DoInitialMatch(AR->getStart(), L, Good, Bad, SE, DT); |
| DoInitialMatch(SE.getAddRecExpr(SE.getIntegerSCEV(0, AR->getType()), |
| AR->getStepRecurrence(SE), |
| AR->getLoop()), |
| L, Good, Bad, SE, DT); |
| return; |
| } |
| } |
| |
| // Handle a multiplication by -1 (negation) if it didn't fold. |
| if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(S)) |
| if (Mul->getOperand(0)->isAllOnesValue()) { |
| SmallVector<const SCEV *, 4> Ops(Mul->op_begin()+1, Mul->op_end()); |
| const SCEV *NewMul = SE.getMulExpr(Ops); |
| |
| SmallVector<const SCEV *, 4> MyGood; |
| SmallVector<const SCEV *, 4> MyBad; |
| DoInitialMatch(NewMul, L, MyGood, MyBad, SE, DT); |
| const SCEV *NegOne = SE.getSCEV(ConstantInt::getAllOnesValue( |
| SE.getEffectiveSCEVType(NewMul->getType()))); |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = MyGood.begin(), |
| E = MyGood.end(); I != E; ++I) |
| Good.push_back(SE.getMulExpr(NegOne, *I)); |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = MyBad.begin(), |
| E = MyBad.end(); I != E; ++I) |
| Bad.push_back(SE.getMulExpr(NegOne, *I)); |
| return; |
| } |
| |
| // Ok, we can't do anything interesting. Just stuff the whole thing into a |
| // register and hope for the best. |
| Bad.push_back(S); |
| } |
| |
| /// InitialMatch - Incorporate loop-variant parts of S into this Formula, |
| /// attempting to keep all loop-invariant and loop-computable values in a |
| /// single base register. |
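/// For example (illustrative), given S = {(4 + %a),+,8}<%L> where %a is
/// loop-invariant, the loop-invariant start (4 + %a) and the loop-varying
/// part {0,+,8}<%L> each end up in their own base register.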
| void Formula::InitialMatch(const SCEV *S, Loop *L, |
| ScalarEvolution &SE, DominatorTree &DT) { |
| SmallVector<const SCEV *, 4> Good; |
| SmallVector<const SCEV *, 4> Bad; |
| DoInitialMatch(S, L, Good, Bad, SE, DT); |
| if (!Good.empty()) { |
| BaseRegs.push_back(SE.getAddExpr(Good)); |
| AM.HasBaseReg = true; |
| } |
| if (!Bad.empty()) { |
| BaseRegs.push_back(SE.getAddExpr(Bad)); |
| AM.HasBaseReg = true; |
| } |
| } |
| |
| void Formula::print(raw_ostream &OS) const { |
| bool First = true; |
| if (AM.BaseGV) { |
| if (!First) OS << " + "; else First = false; |
| WriteAsOperand(OS, AM.BaseGV, /*PrintType=*/false); |
| } |
| if (AM.BaseOffs != 0) { |
| if (!First) OS << " + "; else First = false; |
| OS << AM.BaseOffs; |
| } |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = BaseRegs.begin(), |
| E = BaseRegs.end(); I != E; ++I) { |
| if (!First) OS << " + "; else First = false; |
| OS << "reg("; |
| OS << **I; |
| OS << ")"; |
| } |
| if (AM.Scale != 0) { |
| if (!First) OS << " + "; else First = false; |
| OS << AM.Scale << "*reg("; |
| if (ScaledReg) |
| OS << *ScaledReg; |
| else |
| OS << "<unknown>"; |
| OS << ")"; |
| } |
| } |
| |
| void Formula::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
/// getSDiv - Return an expression for LHS /s RHS, if it can be determined,
/// or null otherwise. If IgnoreSignificantBits is true, expressions like
/// (X * Y) /s Y are simplified to X, ignoring that the multiplication may
/// overflow, which is useful when the result will be used in a context where
/// the most significant bits are ignored.
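/// For example, {8,+,4} /s 4 can be determined to be {2,+,1}, and
/// (2 * %x) /s 2 simplifies to %x when the multiplication has no signed
/// wrap (or when IgnoreSignificantBits is set).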
| static const SCEV *getSDiv(const SCEV *LHS, const SCEV *RHS, |
| ScalarEvolution &SE, |
| bool IgnoreSignificantBits = false) { |
| // Handle the trivial case, which works for any SCEV type. |
| if (LHS == RHS) |
| return SE.getIntegerSCEV(1, LHS->getType()); |
| |
| // Handle x /s -1 as x * -1, to give ScalarEvolution a chance to do some |
| // folding. |
| if (RHS->isAllOnesValue()) |
| return SE.getMulExpr(LHS, RHS); |
| |
| // Check for a division of a constant by a constant. |
| if (const SCEVConstant *C = dyn_cast<SCEVConstant>(LHS)) { |
| const SCEVConstant *RC = dyn_cast<SCEVConstant>(RHS); |
| if (!RC) |
| return 0; |
| if (C->getValue()->getValue().srem(RC->getValue()->getValue()) != 0) |
| return 0; |
| return SE.getConstant(C->getValue()->getValue() |
| .sdiv(RC->getValue()->getValue())); |
| } |
| |
| // Distribute the sdiv over addrec operands. |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(LHS)) { |
| const SCEV *Start = getSDiv(AR->getStart(), RHS, SE, |
| IgnoreSignificantBits); |
| if (!Start) return 0; |
| const SCEV *Step = getSDiv(AR->getStepRecurrence(SE), RHS, SE, |
| IgnoreSignificantBits); |
| if (!Step) return 0; |
| return SE.getAddRecExpr(Start, Step, AR->getLoop()); |
| } |
| |
| // Distribute the sdiv over add operands. |
| if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(LHS)) { |
| SmallVector<const SCEV *, 8> Ops; |
| for (SCEVAddExpr::op_iterator I = Add->op_begin(), E = Add->op_end(); |
| I != E; ++I) { |
| const SCEV *Op = getSDiv(*I, RHS, SE, |
| IgnoreSignificantBits); |
| if (!Op) return 0; |
| Ops.push_back(Op); |
| } |
| return SE.getAddExpr(Ops); |
| } |
| |
| // Check for a multiply operand that we can pull RHS out of. |
| if (const SCEVMulExpr *Mul = dyn_cast<SCEVMulExpr>(LHS)) |
| if (IgnoreSignificantBits || Mul->hasNoSignedWrap()) { |
| SmallVector<const SCEV *, 4> Ops; |
| bool Found = false; |
| for (SCEVMulExpr::op_iterator I = Mul->op_begin(), E = Mul->op_end(); |
| I != E; ++I) { |
| if (!Found) |
| if (const SCEV *Q = getSDiv(*I, RHS, SE, IgnoreSignificantBits)) { |
| Ops.push_back(Q); |
| Found = true; |
| continue; |
| } |
| Ops.push_back(*I); |
| } |
| return Found ? SE.getMulExpr(Ops) : 0; |
| } |
| |
| // Otherwise we don't know. |
| return 0; |
| } |
| |
| namespace { |
| |
| /// LSRUse - This class holds the state that LSR keeps for each use in |
| /// IVUsers, as well as uses invented by LSR itself. It includes information |
| /// about what kinds of things can be folded into the user, information |
| /// about the user itself, and information about how the use may be satisfied. |
| /// TODO: Represent multiple users of the same expression in common? |
| class LSRUse { |
| SmallSet<Formula, 8> FormulaeUniquifier; |
| |
| public: |
| /// KindType - An enum for a kind of use, indicating what types of |
| /// scaled and immediate operands it might support. |
| enum KindType { |
| Basic, ///< A normal use, with no folding. |
| Special, ///< A special case of basic, allowing -1 scales. |
| Address, ///< An address use; folding according to TargetLowering |
| ICmpZero ///< An equality icmp with both operands folded into one. |
| // TODO: Add a generic icmp too? |
| }; |
| |
| KindType Kind; |
| const Type *AccessTy; |
| Instruction *UserInst; |
| Value *OperandValToReplace; |
| |
| /// PostIncLoop - If this user is to use the post-incremented value of an |
| /// induction variable, this variable is non-null and holds the loop |
| /// associated with the induction variable. |
| const Loop *PostIncLoop; |
| |
| /// Formulae - A list of ways to build a value that can satisfy this user. |
| /// After the list is populated, one of these is selected heuristically and |
| /// used to formulate a replacement for OperandValToReplace in UserInst. |
| SmallVector<Formula, 12> Formulae; |
| |
| LSRUse() : Kind(Basic), AccessTy(0), |
| UserInst(0), OperandValToReplace(0), PostIncLoop(0) {} |
| |
| void InsertInitialFormula(const SCEV *S, Loop *L, |
| ScalarEvolution &SE, DominatorTree &DT); |
| void InsertSupplementalFormula(const SCEV *S); |
| |
| bool InsertFormula(const Formula &F); |
| |
| void Rewrite(Loop *L, SCEVExpander &Rewriter, |
| SmallVectorImpl<WeakVH> &DeadInsts, |
| ScalarEvolution &SE, DominatorTree &DT, |
| Pass *P) const; |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| |
| private: |
| Value *Expand(BasicBlock::iterator IP, |
| Loop *L, SCEVExpander &Rewriter, |
| SmallVectorImpl<WeakVH> &DeadInsts, |
| ScalarEvolution &SE, DominatorTree &DT) const; |
| }; |
| |
| } |
| |
| /// ExtractImmediate - If S involves the addition of a constant integer value, |
| /// return that integer value, and mutate S to point to a new SCEV with that |
| /// value excluded. |
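/// For example (illustrative), given S = (4 + %a), this returns 4 and
/// mutates S to point to %a.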
| static int64_t ExtractImmediate(const SCEV *&S, ScalarEvolution &SE) { |
| if (const SCEVConstant *C = dyn_cast<SCEVConstant>(S)) { |
| if (C->getValue()->getValue().getMinSignedBits() <= 64) { |
| S = SE.getIntegerSCEV(0, C->getType()); |
| return C->getValue()->getSExtValue(); |
| } |
| } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { |
| SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); |
| int64_t Result = ExtractImmediate(NewOps.front(), SE); |
| S = SE.getAddExpr(NewOps); |
| return Result; |
| } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { |
| SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); |
| int64_t Result = ExtractImmediate(NewOps.front(), SE); |
| S = SE.getAddRecExpr(NewOps, AR->getLoop()); |
| return Result; |
| } |
| return 0; |
| } |
| |
| /// ExtractSymbol - If S involves the addition of a GlobalValue address, |
| /// return that symbol, and mutate S to point to a new SCEV with that |
| /// value excluded. |
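/// For example (illustrative), given S = {@g,+,4}<%L> for a global @g, this
/// returns @g and mutates S to point to {0,+,4}<%L>.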
| static GlobalValue *ExtractSymbol(const SCEV *&S, ScalarEvolution &SE) { |
| if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S)) { |
| if (GlobalValue *GV = dyn_cast<GlobalValue>(U->getValue())) { |
| S = SE.getIntegerSCEV(0, GV->getType()); |
| return GV; |
| } |
| } else if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(S)) { |
| SmallVector<const SCEV *, 8> NewOps(Add->op_begin(), Add->op_end()); |
| GlobalValue *Result = ExtractSymbol(NewOps.back(), SE); |
| S = SE.getAddExpr(NewOps); |
| return Result; |
| } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(S)) { |
| SmallVector<const SCEV *, 8> NewOps(AR->op_begin(), AR->op_end()); |
| GlobalValue *Result = ExtractSymbol(NewOps.front(), SE); |
| S = SE.getAddRecExpr(NewOps, AR->getLoop()); |
| return Result; |
| } |
| return 0; |
| } |
| |
| /// isLegalUse - Test whether the use described by AM is "legal", meaning |
| /// it can be completely folded into the user instruction at isel time. |
| /// This includes address-mode folding and special icmp tricks. |
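/// For example, on a target like x86, an address of the form
/// reg1 + 4*reg2 + 8 can typically be folded entirely into the memory
/// operand of a load or store.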
| static bool isLegalUse(const TargetLowering::AddrMode &AM, |
| LSRUse::KindType Kind, const Type *AccessTy, |
| const TargetLowering *TLI) { |
| switch (Kind) { |
| case LSRUse::Address: |
| // If we have low-level target information, ask the target if it can |
| // completely fold this address. |
| if (TLI) return TLI->isLegalAddressingMode(AM, AccessTy); |
| |
| // Otherwise, just guess that reg+reg addressing is legal. |
| return !AM.BaseGV && AM.BaseOffs == 0 && AM.Scale <= 1; |
| |
| case LSRUse::ICmpZero: |
| // There's not even a target hook for querying whether it would be legal |
| // to fold a GV into an ICmp. |
| if (AM.BaseGV) |
| return false; |
| |
| // ICmp only has two operands; don't allow more than two non-trivial parts. |
| if (AM.Scale != 0 && AM.HasBaseReg && AM.BaseOffs != 0) |
| return false; |
| |
| // ICmp only supports no scale or a -1 scale, as we can "fold" a -1 scale |
| // by putting the scaled register in the other operand of the icmp. |
| if (AM.Scale != 0 && AM.Scale != -1) |
| return false; |
| |
| // If we have low-level target information, ask the target if it can |
| // fold an integer immediate on an icmp. |
| if (AM.BaseOffs != 0) { |
| if (TLI) return TLI->isLegalICmpImmediate(-AM.BaseOffs); |
| return false; |
| } |
| |
| return true; |
| |
| case LSRUse::Basic: |
| // Only handle single-register values. |
| return !AM.BaseGV && AM.Scale == 0 && AM.BaseOffs == 0; |
| |
| case LSRUse::Special: |
| // Only handle -1 scales, or no scale. |
| return AM.Scale == 0 || AM.Scale == -1; |
| } |
| |
| return false; |
| } |
| |
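/// isAlwaysFoldable - Test whether S can be folded into a use of the given
/// Kind: after stripping off an immediate and a symbol, nothing may remain,
/// and the resulting addressing mode (conservatively assuming a base
/// register and a scale) must be legal.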
| static bool isAlwaysFoldable(const SCEV *S, |
| bool HasBaseReg, |
| LSRUse::KindType Kind, const Type *AccessTy, |
| const TargetLowering *TLI, |
| ScalarEvolution &SE) { |
| // Fast-path: zero is always foldable. |
| if (S->isZero()) return true; |
| |
| // Conservatively, create an address with an immediate and a |
| // base and a scale. |
| TargetLowering::AddrMode AM; |
| AM.BaseOffs = ExtractImmediate(S, SE); |
| AM.BaseGV = ExtractSymbol(S, SE); |
| AM.HasBaseReg = HasBaseReg; |
| AM.Scale = Kind == LSRUse::ICmpZero ? -1 : 1; |
| |
| // If there's anything else involved, it's not foldable. |
| if (!S->isZero()) return false; |
| |
| return isLegalUse(AM, Kind, AccessTy, TLI); |
| } |
| |
| /// InsertFormula - If the given formula has not yet been inserted, add it |
| /// to the list, and return true. Return false otherwise. |
| bool LSRUse::InsertFormula(const Formula &F) { |
| Formula Copy = F; |
| |
| // Sort the base regs, to avoid adding the same solution twice with |
| // the base regs in different orders. This uses host pointer values, but |
| // it doesn't matter since it's only used for uniquifying. |
| std::sort(Copy.BaseRegs.begin(), Copy.BaseRegs.end()); |
| |
| DEBUG(for (SmallVectorImpl<const SCEV *>::const_iterator I = |
| F.BaseRegs.begin(), E = F.BaseRegs.end(); I != E; ++I) |
| assert(!(*I)->isZero() && "Zero allocated in a base register!"); |
| assert((!F.ScaledReg || !F.ScaledReg->isZero()) && |
| "Zero allocated in a scaled register!")); |
| |
| if (FormulaeUniquifier.insert(Copy)) { |
| Formulae.push_back(F); |
| return true; |
| } |
| |
| return false; |
| } |
| |
| void |
| LSRUse::InsertInitialFormula(const SCEV *S, Loop *L, |
| ScalarEvolution &SE, DominatorTree &DT) { |
| Formula F; |
| F.InitialMatch(S, L, SE, DT); |
| bool Inserted = InsertFormula(F); |
| assert(Inserted && "Initial formula already exists!"); (void)Inserted; |
| } |
| |
| void |
| LSRUse::InsertSupplementalFormula(const SCEV *S) { |
| Formula F; |
| F.BaseRegs.push_back(S); |
| F.AM.HasBaseReg = true; |
| bool Inserted = InsertFormula(F); |
| assert(Inserted && "Supplemental formula already exists!"); (void)Inserted; |
| } |
| |
| /// getImmediateDominator - A handy utility for the specific DominatorTree |
| /// query that we need here. |
| /// |
| static BasicBlock *getImmediateDominator(BasicBlock *BB, DominatorTree &DT) { |
| DomTreeNode *Node = DT.getNode(BB); |
| if (!Node) return 0; |
| Node = Node->getIDom(); |
| if (!Node) return 0; |
| return Node->getBlock(); |
| } |
| |
| Value *LSRUse::Expand(BasicBlock::iterator IP, |
| Loop *L, SCEVExpander &Rewriter, |
| SmallVectorImpl<WeakVH> &DeadInsts, |
| ScalarEvolution &SE, DominatorTree &DT) const { |
  // First, collect the instructions that must dominate any position at which
  // the replacement will be expanded; these are the positions of any operands
  // that the expansion will require.
| SmallVector<Instruction *, 4> Inputs; |
| if (Instruction *I = dyn_cast<Instruction>(OperandValToReplace)) |
| Inputs.push_back(I); |
| if (Kind == ICmpZero) |
| if (Instruction *I = |
| dyn_cast<Instruction>(cast<ICmpInst>(UserInst)->getOperand(1))) |
| Inputs.push_back(I); |
| if (PostIncLoop && !L->contains(UserInst)) |
| Inputs.push_back(L->getLoopLatch()->getTerminator()); |
| |
| // Then, climb up the immediate dominator tree as far as we can go while |
| // still being dominated by the input positions. |
| for (;;) { |
| bool AllDominate = true; |
| Instruction *BetterPos = 0; |
| BasicBlock *IDom = getImmediateDominator(IP->getParent(), DT); |
| if (!IDom) break; |
| Instruction *Tentative = IDom->getTerminator(); |
| for (SmallVectorImpl<Instruction *>::const_iterator I = Inputs.begin(), |
| E = Inputs.end(); I != E; ++I) { |
| Instruction *Inst = *I; |
| if (Inst == Tentative || !DT.dominates(Inst, Tentative)) { |
| AllDominate = false; |
| break; |
| } |
| if (IDom == Inst->getParent() && |
| (!BetterPos || DT.dominates(BetterPos, Inst))) |
| BetterPos = next(BasicBlock::iterator(Inst)); |
| } |
| if (!AllDominate) |
| break; |
| if (BetterPos) |
| IP = BetterPos; |
| else |
| IP = Tentative; |
| } |
| while (isa<PHINode>(IP)) ++IP; |
| |
| // The first formula in the list is the winner. |
| const Formula &F = Formulae.front(); |
| |
| // Inform the Rewriter if we have a post-increment use, so that it can |
| // perform an advantageous expansion. |
| Rewriter.setPostInc(PostIncLoop); |
| |
| // This is the type that the user actually needs. |
| const Type *OpTy = OperandValToReplace->getType(); |
| // This will be the type that we'll initially expand to. |
| const Type *Ty = F.getType(); |
| if (!Ty) |
| // No type known; just expand directly to the ultimate type. |
| Ty = OpTy; |
| else if (SE.getEffectiveSCEVType(Ty) == SE.getEffectiveSCEVType(OpTy)) |
| // Expand directly to the ultimate type if it's the right size. |
| Ty = OpTy; |
| // This is the type to do integer arithmetic in. |
| const Type *IntTy = SE.getEffectiveSCEVType(Ty); |
| |
| // Build up a list of operands to add together to form the full base. |
| SmallVector<const SCEV *, 8> Ops; |
| |
| // Expand the BaseRegs portion. |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), |
| E = F.BaseRegs.end(); I != E; ++I) { |
| const SCEV *Reg = *I; |
| assert(!Reg->isZero() && "Zero allocated in a base register!"); |
| |
| // If we're expanding for a post-inc user for the add-rec's loop, make the |
| // post-inc adjustment. |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) |
| if (AR->getLoop() == PostIncLoop) |
| Reg = SE.getAddExpr(Reg, AR->getStepRecurrence(SE)); |
| |
| Ops.push_back(SE.getUnknown(Rewriter.expandCodeFor(Reg, 0, IP))); |
| } |
| |
| // Expand the ScaledReg portion. |
| Value *ICmpScaledV = 0; |
| if (F.AM.Scale != 0) { |
| const SCEV *ScaledS = F.ScaledReg; |
| |
| // If we're expanding for a post-inc user for the add-rec's loop, make the |
| // post-inc adjustment. |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(ScaledS)) |
| if (AR->getLoop() == PostIncLoop) |
| ScaledS = SE.getAddExpr(ScaledS, AR->getStepRecurrence(SE)); |
| |
| if (Kind == ICmpZero) { |
| // An interesting way of "folding" with an icmp is to use a negated |
| // scale, which we'll implement by inserting it into the other operand |
| // of the icmp. |
| assert(F.AM.Scale == -1 && |
| "The only scale supported by ICmpZero uses is -1!"); |
| ICmpScaledV = Rewriter.expandCodeFor(ScaledS, 0, IP); |
| } else { |
| // Otherwise just expand the scaled register and an explicit scale, |
| // which is expected to be matched as part of the address. |
| ScaledS = SE.getUnknown(Rewriter.expandCodeFor(ScaledS, 0, IP)); |
| const Type *ScaledTy = SE.getEffectiveSCEVType(ScaledS->getType()); |
| ScaledS = SE.getMulExpr(ScaledS, |
| SE.getSCEV(ConstantInt::get(ScaledTy, |
| F.AM.Scale))); |
| Ops.push_back(ScaledS); |
| } |
| } |
| |
| // Expand the immediate portions. |
| if (F.AM.BaseGV) |
| Ops.push_back(SE.getSCEV(F.AM.BaseGV)); |
| if (F.AM.BaseOffs != 0) { |
| if (Kind == ICmpZero) { |
| // The other interesting way of "folding" with an ICmpZero is to use a |
| // negated immediate. |
| if (!ICmpScaledV) |
| ICmpScaledV = ConstantInt::get(IntTy, -F.AM.BaseOffs); |
| else { |
| Ops.push_back(SE.getUnknown(ICmpScaledV)); |
| ICmpScaledV = ConstantInt::get(IntTy, F.AM.BaseOffs); |
| } |
| } else { |
| // Just add the immediate values. These again are expected to be matched |
| // as part of the address. |
| Ops.push_back(SE.getSCEV(ConstantInt::get(IntTy, F.AM.BaseOffs))); |
| } |
| } |
| |
| // Emit instructions summing all the operands. |
| const SCEV *FullS = Ops.empty() ? |
| SE.getIntegerSCEV(0, IntTy) : |
| SE.getAddExpr(Ops); |
| Value *FullV = Rewriter.expandCodeFor(FullS, Ty, IP); |
| |
| // We're done expanding now, so reset the rewriter. |
| Rewriter.setPostInc(0); |
| |
| // An ICmpZero Formula represents an ICmp which we're handling as a |
| // comparison against zero. Now that we've expanded an expression for that |
| // form, update the ICmp's other operand. |
| if (Kind == ICmpZero) { |
| ICmpInst *CI = cast<ICmpInst>(UserInst); |
| DeadInsts.push_back(CI->getOperand(1)); |
| assert(!F.AM.BaseGV && "ICmp does not support folding a global value and " |
| "a scale at the same time!"); |
| if (F.AM.Scale == -1) { |
| if (ICmpScaledV->getType() != OpTy) { |
| Instruction *Cast = |
| CastInst::Create(CastInst::getCastOpcode(ICmpScaledV, false, |
| OpTy, false), |
| ICmpScaledV, OpTy, "tmp", CI); |
| ICmpScaledV = Cast; |
| } |
| CI->setOperand(1, ICmpScaledV); |
| } else { |
| assert(F.AM.Scale == 0 && |
| "ICmp does not support folding a global value and " |
| "a scale at the same time!"); |
| Constant *C = ConstantInt::getSigned(SE.getEffectiveSCEVType(OpTy), |
| -(uint64_t)F.AM.BaseOffs); |
| if (C->getType() != OpTy) |
| C = ConstantExpr::getCast(CastInst::getCastOpcode(C, false, |
| OpTy, false), |
| C, OpTy); |
| |
| CI->setOperand(1, C); |
| } |
| } |
| |
| return FullV; |
| } |
| |
| /// Rewrite - Emit instructions for the leading candidate expression for this |
| /// LSRUse (this is called "expanding"), and update the UserInst to reference |
| /// the newly expanded value. |
| void LSRUse::Rewrite(Loop *L, SCEVExpander &Rewriter, |
| SmallVectorImpl<WeakVH> &DeadInsts, |
| ScalarEvolution &SE, DominatorTree &DT, |
| Pass *P) const { |
| const Type *OpTy = OperandValToReplace->getType(); |
| |
| // First, find an insertion point that dominates UserInst. For PHI nodes, |
| // find the nearest block which dominates all the relevant uses. |
| if (PHINode *PN = dyn_cast<PHINode>(UserInst)) { |
| DenseMap<BasicBlock *, Value *> Inserted; |
| for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) |
| if (PN->getIncomingValue(i) == OperandValToReplace) { |
| BasicBlock *BB = PN->getIncomingBlock(i); |
| |
| // If this is a critical edge, split the edge so that we do not insert |
| // the code on all predecessor/successor paths. We do this unless this |
| // is the canonical backedge for this loop, which complicates post-inc |
| // users. |
| if (e != 1 && BB->getTerminator()->getNumSuccessors() > 1 && |
| !isa<IndirectBrInst>(BB->getTerminator()) && |
| (PN->getParent() != L->getHeader() || !L->contains(BB))) { |
| // Split the critical edge. |
| BasicBlock *NewBB = SplitCriticalEdge(BB, PN->getParent(), P); |
| |
| // If PN is outside of the loop and BB is in the loop, we want to |
| // move the block to be immediately before the PHI block, not |
| // immediately after BB. |
| if (L->contains(BB) && !L->contains(PN)) |
| NewBB->moveBefore(PN->getParent()); |
| |
| // Splitting the edge can reduce the number of PHI entries we have. |
| e = PN->getNumIncomingValues(); |
| BB = NewBB; |
| i = PN->getBasicBlockIndex(BB); |
| } |
| |
| std::pair<DenseMap<BasicBlock *, Value *>::iterator, bool> Pair = |
| Inserted.insert(std::make_pair(BB, static_cast<Value *>(0))); |
| if (!Pair.second) |
| PN->setIncomingValue(i, Pair.first->second); |
| else { |
| Value *FullV = Expand(BB->getTerminator(), |
| L, Rewriter, DeadInsts, SE, DT); |
| |
| // If this is reuse-by-noop-cast, insert the noop cast. |
| if (FullV->getType() != OpTy) |
| FullV = |
| CastInst::Create(CastInst::getCastOpcode(FullV, false, |
| OpTy, false), |
| FullV, OperandValToReplace->getType(), |
| "tmp", BB->getTerminator()); |
| |
| PN->setIncomingValue(i, FullV); |
| Pair.first->second = FullV; |
| } |
| } |
| } else { |
| Value *FullV = Expand(UserInst, L, Rewriter, DeadInsts, SE, DT); |
| |
| // If this is reuse-by-noop-cast, insert the noop cast. |
| if (FullV->getType() != OpTy) { |
| Instruction *Cast = |
| CastInst::Create(CastInst::getCastOpcode(FullV, false, OpTy, false), |
| FullV, OpTy, "tmp", UserInst); |
| FullV = Cast; |
| } |
| |
| // Update the user. |
| UserInst->replaceUsesOfWith(OperandValToReplace, FullV); |
| } |
| |
| DeadInsts.push_back(OperandValToReplace); |
| } |
| |
| void LSRUse::print(raw_ostream &OS) const { |
| OS << "LSR Use: Kind="; |
| switch (Kind) { |
| case Basic: OS << "Basic"; break; |
| case Special: OS << "Special"; break; |
| case ICmpZero: OS << "ICmpZero"; break; |
| case Address: |
| OS << "Address of "; |
| if (isa<PointerType>(AccessTy)) |
| OS << "pointer"; // the full pointer type could be really verbose |
| else |
| OS << *AccessTy; |
| } |
| |
| OS << ", UserInst="; |
| // Store is common and interesting enough to be worth special-casing. |
| if (StoreInst *Store = dyn_cast<StoreInst>(UserInst)) { |
| OS << "store "; |
| WriteAsOperand(OS, Store->getOperand(0), /*PrintType=*/false); |
| } else if (UserInst->getType()->isVoidTy()) |
| OS << UserInst->getOpcodeName(); |
| else |
| WriteAsOperand(OS, UserInst, /*PrintType=*/false); |
| |
| OS << ", OperandValToReplace="; |
| WriteAsOperand(OS, OperandValToReplace, /*PrintType=*/false); |
| |
| if (PostIncLoop) { |
| OS << ", PostIncLoop="; |
| WriteAsOperand(OS, PostIncLoop->getHeader(), /*PrintType=*/false); |
| } |
| } |
| |
| void LSRUse::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| namespace { |
| |
| /// Score - This class is used to measure and compare candidate formulae. |
| class Score { |
| unsigned NumRegs; |
| unsigned NumPhis; |
| unsigned NumIVMuls; |
| unsigned NumBaseAdds; |
| unsigned NumImms; |
| |
| public: |
| Score() |
| : NumRegs(0), NumPhis(0), NumIVMuls(0), NumBaseAdds(0), NumImms(0) {} |
| |
  void RateInitial(const SmallVector<LSRUse, 16> &Uses, const Loop *L,
| ScalarEvolution &SE); |
| |
| void Rate(const SCEV *Reg, const SmallBitVector &Bits, |
| const SmallVector<LSRUse, 16> &Uses, const Loop *L, |
| ScalarEvolution &SE); |
| |
| bool operator<(const Score &Other) const; |
| |
| void print_details(raw_ostream &OS, const SCEV *Reg, |
| const SmallPtrSet<const SCEV *, 8> &Regs) const; |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| |
| private: |
| void RateRegister(const SCEV *Reg, SmallPtrSet<const SCEV *, 8> &Regs, |
| const Loop *L); |
| void RateFormula(const Formula &F, SmallPtrSet<const SCEV *, 8> &Regs, |
| const Loop *L); |
| |
| void Loose(); |
| }; |
| |
| } |
| |
| /// RateRegister - Tally up interesting quantities from the given register. |
| void Score::RateRegister(const SCEV *Reg, |
| SmallPtrSet<const SCEV *, 8> &Regs, |
| const Loop *L) { |
| if (Regs.insert(Reg)) |
| if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(Reg)) { |
| NumPhis += AR->getLoop() == L; |
| |
| // Add the step value register, if it needs one. |
| if (!AR->isAffine() || !isa<SCEVConstant>(AR->getOperand(1))) |
| RateRegister(AR->getOperand(1), Regs, L); |
| } |
| } |
| |
| void Score::RateFormula(const Formula &F, |
| SmallPtrSet<const SCEV *, 8> &Regs, |
| const Loop *L) { |
| // Tally up the registers. |
| if (F.ScaledReg) |
| RateRegister(F.ScaledReg, Regs, L); |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), |
| E = F.BaseRegs.end(); I != E; ++I) { |
| const SCEV *BaseReg = *I; |
| RateRegister(BaseReg, Regs, L); |
| |
| NumIVMuls += isa<SCEVMulExpr>(BaseReg) && |
| BaseReg->hasComputableLoopEvolution(L); |
| } |
| |
| if (F.BaseRegs.size() > 1) |
| NumBaseAdds += F.BaseRegs.size() - 1; |
| |
| // Tally up the non-zero immediates. |
| if (F.AM.BaseGV || F.AM.BaseOffs != 0) |
| ++NumImms; |
| } |
| |
/// Loose - Set this score to a losing value.
| void Score::Loose() { |
| NumRegs = ~0u; |
| NumPhis = ~0u; |
| NumIVMuls = ~0u; |
| NumBaseAdds = ~0u; |
| NumImms = ~0u; |
| } |
| |
| /// RateInitial - Compute a score for the initial "fully reduced" solution. |
void Score::RateInitial(const SmallVector<LSRUse, 16> &Uses, const Loop *L,
| ScalarEvolution &SE) { |
| SmallPtrSet<const SCEV *, 8> Regs; |
| for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), |
| E = Uses.end(); I != E; ++I) |
| RateFormula(I->Formulae.front(), Regs, L); |
| NumRegs += Regs.size(); |
| |
| DEBUG(print_details(dbgs(), 0, Regs)); |
| } |
| |
| /// Rate - Compute a score for the solution where the reuse associated with |
| /// putting Reg in a register is selected. |
| void Score::Rate(const SCEV *Reg, const SmallBitVector &Bits, |
| const SmallVector<LSRUse, 16> &Uses, const Loop *L, |
| ScalarEvolution &SE) { |
| SmallPtrSet<const SCEV *, 8> Regs; |
| for (size_t i = 0, e = Uses.size(); i != e; ++i) { |
| const LSRUse &LU = Uses[i]; |
| |
| const Formula *BestFormula = 0; |
| if (i >= Bits.size() || !Bits.test(i)) |
| // This use doesn't use the current register. Just go with the current |
| // leading candidate formula. |
| BestFormula = &LU.Formulae.front(); |
| else |
| // Find the best formula for this use that uses the current register. |
| for (SmallVectorImpl<Formula>::const_iterator I = LU.Formulae.begin(), |
| E = LU.Formulae.end(); I != E; ++I) { |
| const Formula &F = *I; |
| if (F.referencesReg(Reg) && |
| (!BestFormula || ComplexitySorter()(F, *BestFormula))) |
| BestFormula = &F; |
| } |
| |
    // If we didn't find *any* formulae (because earlier we eliminated some
    // in greedy fashion), skip the current register's reuse opportunity.
| if (!BestFormula) { |
| DEBUG(dbgs() << "Reuse with reg " << *Reg |
| << " wouldn't help any users.\n"); |
| Loose(); |
| return; |
| } |
| |
| // For an in-loop post-inc user, don't allow multiple base registers, |
| // because that would require an awkward in-loop add after the increment. |
| if (LU.PostIncLoop && LU.PostIncLoop->contains(LU.UserInst) && |
| BestFormula->BaseRegs.size() > 1) { |
| DEBUG(dbgs() << "Reuse with reg " << *Reg |
| << " would require an in-loop post-inc add: "; |
| BestFormula->dump()); |
| Loose(); |
| return; |
| } |
| |
| RateFormula(*BestFormula, Regs, L); |
| } |
| NumRegs += Regs.size(); |
| |
| DEBUG(print_details(dbgs(), Reg, Regs)); |
| } |
| |
| /// operator< - Choose the better score. |
| bool Score::operator<(const Score &Other) const { |
| if (NumRegs != Other.NumRegs) |
| return NumRegs < Other.NumRegs; |
| if (NumPhis != Other.NumPhis) |
| return NumPhis < Other.NumPhis; |
| if (NumIVMuls != Other.NumIVMuls) |
| return NumIVMuls < Other.NumIVMuls; |
| if (NumBaseAdds != Other.NumBaseAdds) |
| return NumBaseAdds < Other.NumBaseAdds; |
| return NumImms < Other.NumImms; |
| } |
| |
| void Score::print_details(raw_ostream &OS, |
| const SCEV *Reg, |
| const SmallPtrSet<const SCEV *, 8> &Regs) const { |
| if (Reg) OS << "Reuse with reg " << *Reg << " would require "; |
| else OS << "The initial solution would require "; |
| print(OS); |
| OS << ". Regs:"; |
| for (SmallPtrSet<const SCEV *, 8>::const_iterator I = Regs.begin(), |
| E = Regs.end(); I != E; ++I) |
| OS << ' ' << **I; |
| OS << '\n'; |
| } |
| |
| void Score::print(raw_ostream &OS) const { |
| OS << NumRegs << " reg" << (NumRegs == 1 ? "" : "s"); |
| if (NumPhis != 0) |
| OS << ", including " << NumPhis << " PHI" << (NumPhis == 1 ? "" : "s"); |
| if (NumIVMuls != 0) |
| OS << ", plus " << NumIVMuls << " IV mul" << (NumIVMuls == 1 ? "" : "s"); |
| if (NumBaseAdds != 0) |
| OS << ", plus " << NumBaseAdds << " base add" |
| << (NumBaseAdds == 1 ? "" : "s"); |
| if (NumImms != 0) |
| OS << ", plus " << NumImms << " imm" << (NumImms == 1 ? "" : "s"); |
| } |
| |
| void Score::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| /// isAddressUse - Returns true if the specified instruction is using the |
| /// specified value as an address. |
| static bool isAddressUse(Instruction *Inst, Value *OperandVal) { |
| bool isAddress = isa<LoadInst>(Inst); |
| if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) { |
| if (SI->getOperand(1) == OperandVal) |
| isAddress = true; |
| } else if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { |
| // Addressing modes can also be folded into prefetches and a variety |
| // of intrinsics. |
| switch (II->getIntrinsicID()) { |
| default: break; |
| case Intrinsic::prefetch: |
| case Intrinsic::x86_sse2_loadu_dq: |
| case Intrinsic::x86_sse2_loadu_pd: |
| case Intrinsic::x86_sse_loadu_ps: |
| case Intrinsic::x86_sse_storeu_ps: |
| case Intrinsic::x86_sse2_storeu_pd: |
| case Intrinsic::x86_sse2_storeu_dq: |
| case Intrinsic::x86_sse2_storel_dq: |
| if (II->getOperand(1) == OperandVal) |
| isAddress = true; |
| break; |
| } |
| } |
| return isAddress; |
| } |
| |
| /// getAccessType - Return the type of the memory being accessed. |
| static const Type *getAccessType(const Instruction *Inst) { |
| const Type *AccessTy = Inst->getType(); |
| if (const StoreInst *SI = dyn_cast<StoreInst>(Inst)) |
| AccessTy = SI->getOperand(0)->getType(); |
| else if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(Inst)) { |
| // Addressing modes can also be folded into prefetches and a variety |
| // of intrinsics. |
| switch (II->getIntrinsicID()) { |
| default: break; |
| case Intrinsic::x86_sse_storeu_ps: |
| case Intrinsic::x86_sse2_storeu_pd: |
| case Intrinsic::x86_sse2_storeu_dq: |
| case Intrinsic::x86_sse2_storel_dq: |
| AccessTy = II->getOperand(1)->getType(); |
| break; |
| } |
| } |
| return AccessTy; |
| } |
| |
/// DeleteTriviallyDeadInstructions - If any of the instructions in the
/// specified set are trivially dead, delete them and see if this makes any of
/// their operands subsequently dead.
| static bool |
| DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakVH> &DeadInsts) { |
| bool Changed = false; |
| |
| while (!DeadInsts.empty()) { |
| Instruction *I = dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()); |
| |
| if (I == 0 || !isInstructionTriviallyDead(I)) |
| continue; |
| |
| for (User::op_iterator OI = I->op_begin(), E = I->op_end(); OI != E; ++OI) |
| if (Instruction *U = dyn_cast<Instruction>(*OI)) { |
| *OI = 0; |
| if (U->use_empty()) |
| DeadInsts.push_back(U); |
| } |
| |
| I->eraseFromParent(); |
| Changed = true; |
| } |
| |
| return Changed; |
| } |
| |
| namespace { |
| |
| /// LSRInstance - This class holds state for the main loop strength |
| /// reduction logic. |
| class LSRInstance { |
| IVUsers &IU; |
| ScalarEvolution &SE; |
| DominatorTree &DT; |
| const TargetLowering *const TLI; |
| Loop *const L; |
| bool Changed; |
| |
| /// CurrentArbitraryRegIndex - To ensure a deterministic ordering, assign an |
| /// arbitrary index value to each register as a sort tie breaker. |
| unsigned CurrentArbitraryRegIndex; |
| |
| /// Factors - Interesting factors between use strides. |
| SmallSetVector<int64_t, 4> Factors; |
| |
| /// Types - Interesting use types, to facilitate truncation reuse. |
| SmallSetVector<const Type *, 4> Types; |
| |
| /// Uses - The list of interesting uses. |
| SmallVector<LSRUse, 16> Uses; |
| |
| // TODO: Reorganize these data structures. |
| typedef DenseMap<const SCEV *, RegSortData> RegUsesTy; |
| RegUsesTy RegUses; |
| SmallVector<const SCEV *, 16> RegSequence; |
| |
| void OptimizeShadowIV(); |
| bool FindIVUserForCond(ICmpInst *Cond, IVStrideUse *&CondUse, |
| const SCEV* &CondStride); |
| ICmpInst *OptimizeMax(ICmpInst *Cond, IVStrideUse* &CondUse); |
| bool StrideMightBeShared(const SCEV* Stride); |
| bool OptimizeLoopTermCond(Instruction *&IVIncInsertPos); |
| |
| LSRUse &getNewUse() { |
| Uses.push_back(LSRUse()); |
| return Uses.back(); |
| } |
| |
| void CountRegister(const SCEV *Reg, uint32_t Complexity, size_t LUIdx); |
| void CountRegisters(const Formula &F, size_t LUIdx); |
| |
| void GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| void GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| void GenerateFormulaeFromReplacedBaseReg(LSRUse &LU, |
| unsigned LUIdx, |
| const Formula &Base, unsigned i, |
| const SmallVectorImpl<const SCEV *> |
| &AddOps); |
| void GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| void GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| void GenerateScaledReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| void GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base); |
| |
| void GenerateConstantOffsetReuse(); |
| |
| void GenerateAllReuseFormulae(); |
| |
| void GenerateLoopInvariantRegisterUses(); |
| |
| public: |
| LSRInstance(const TargetLowering *tli, Loop *l, Pass *P); |
| |
| bool getChanged() const { return Changed; } |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| }; |
| |
| } |
| |
/// OptimizeShadowIV - If IV is used in an int-to-float cast
/// inside the loop then try to eliminate the cast operation.
| void LSRInstance::OptimizeShadowIV() { |
| const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); |
| if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) |
| return; |
| |
| for (size_t StrideIdx = 0, e = IU.StrideOrder.size(); |
| StrideIdx != e; ++StrideIdx) { |
| std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI = |
| IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]); |
| assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); |
| if (!isa<SCEVConstant>(SI->first)) |
| continue; |
| |
| for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(), |
| E = SI->second->Users.end(); UI != E; /* empty */) { |
| ilist<IVStrideUse>::iterator CandidateUI = UI; |
| ++UI; |
| Instruction *ShadowUse = CandidateUI->getUser(); |
| const Type *DestTy = NULL; |
| |
      /* If shadow use is an int->float cast, then insert a second IV
| to eliminate this cast. |
| |
| for (unsigned i = 0; i < n; ++i) |
| foo((double)i); |
| |
| is transformed into |
| |
| double d = 0.0; |
| for (unsigned i = 0; i < n; ++i, ++d) |
| foo(d); |
| */ |
| if (UIToFPInst *UCast = dyn_cast<UIToFPInst>(CandidateUI->getUser())) |
| DestTy = UCast->getDestTy(); |
| else if (SIToFPInst *SCast = dyn_cast<SIToFPInst>(CandidateUI->getUser())) |
| DestTy = SCast->getDestTy(); |
| if (!DestTy) continue; |
| |
| if (TLI) { |
| // If target does not support DestTy natively then do not apply |
| // this transformation. |
| EVT DVT = TLI->getValueType(DestTy); |
| if (!TLI->isTypeLegal(DVT)) continue; |
| } |
| |
| PHINode *PH = dyn_cast<PHINode>(ShadowUse->getOperand(0)); |
| if (!PH) continue; |
| if (PH->getNumIncomingValues() != 2) continue; |
| |
| const Type *SrcTy = PH->getType(); |
| int Mantissa = DestTy->getFPMantissaWidth(); |
| if (Mantissa == -1) continue; |
| if ((int)SE.getTypeSizeInBits(SrcTy) > Mantissa) |
| continue; |
| |
| unsigned Entry, Latch; |
| if (PH->getIncomingBlock(0) == L->getLoopPreheader()) { |
| Entry = 0; |
| Latch = 1; |
| } else { |
| Entry = 1; |
| Latch = 0; |
| } |
| |
| ConstantInt *Init = dyn_cast<ConstantInt>(PH->getIncomingValue(Entry)); |
| if (!Init) continue; |
| Constant *NewInit = ConstantFP::get(DestTy, Init->getZExtValue()); |
| |
| BinaryOperator *Incr = |
| dyn_cast<BinaryOperator>(PH->getIncomingValue(Latch)); |
| if (!Incr) continue; |
| if (Incr->getOpcode() != Instruction::Add |
| && Incr->getOpcode() != Instruction::Sub) |
| continue; |
| |
| /* Initialize new IV, double d = 0.0 in above example. */ |
| ConstantInt *C = NULL; |
| if (Incr->getOperand(0) == PH) |
| C = dyn_cast<ConstantInt>(Incr->getOperand(1)); |
| else if (Incr->getOperand(1) == PH) |
| C = dyn_cast<ConstantInt>(Incr->getOperand(0)); |
| else |
| continue; |
| |
| if (!C) continue; |
| |
| // Ignore negative constants, as the code below doesn't handle them |
| // correctly. TODO: Remove this restriction. |
| if (!C->getValue().isStrictlyPositive()) continue; |
| |
| /* Add new PHINode. */ |
| PHINode *NewPH = PHINode::Create(DestTy, "IV.S.", PH); |
| |
| /* create new increment. '++d' in above example. */ |
| Constant *CFP = ConstantFP::get(DestTy, C->getZExtValue()); |
| BinaryOperator *NewIncr = |
| BinaryOperator::Create(Incr->getOpcode() == Instruction::Add ? |
| Instruction::FAdd : Instruction::FSub, |
| NewPH, CFP, "IV.S.next.", Incr); |
| |
| NewPH->addIncoming(NewInit, PH->getIncomingBlock(Entry)); |
| NewPH->addIncoming(NewIncr, PH->getIncomingBlock(Latch)); |
| |
| /* Remove cast operation */ |
| ShadowUse->replaceAllUsesWith(NewPH); |
| ShadowUse->eraseFromParent(); |
| break; |
| } |
| } |
| } |
| |
| /// FindIVUserForCond - If Cond has an operand that is an expression of an IV, |
| /// set the IV user and stride information and return true, otherwise return |
| /// false. |
| bool LSRInstance::FindIVUserForCond(ICmpInst *Cond, |
| IVStrideUse *&CondUse, |
| const SCEV* &CondStride) { |
| for (unsigned StrideIdx = 0, e = IU.StrideOrder.size(); |
| StrideIdx != e && !CondUse; ++StrideIdx) { |
| std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI = |
| IU.IVUsesByStride.find(IU.StrideOrder[StrideIdx]); |
| assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); |
| |
| for (ilist<IVStrideUse>::iterator UI = SI->second->Users.begin(), |
| E = SI->second->Users.end(); UI != E; ++UI) |
| if (UI->getUser() == Cond) { |
        // NOTE: We could handle setcc instructions with multiple uses here,
        // but InstCombine already handles the simple uses, and it's not clear
        // that the remaining cases occur often enough in real life to be
        // worth handling.
| CondUse = UI; |
| CondStride = SI->first; |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// OptimizeMax - Rewrite the loop's terminating condition if it uses |
| /// a max computation. |
| /// |
| /// This is a narrow solution to a specific, but acute, problem. For loops |
| /// like this: |
| /// |
| /// i = 0; |
| /// do { |
| /// p[i] = 0.0; |
| /// } while (++i < n); |
| /// |
| /// the trip count isn't just 'n', because 'n' might not be positive. And |
| /// unfortunately this can come up even for loops where the user didn't use |
| /// a C do-while loop. For example, seemingly well-behaved top-test loops |
| /// will commonly be lowered like this: |
| // |
| /// if (n > 0) { |
| /// i = 0; |
| /// do { |
| /// p[i] = 0.0; |
| /// } while (++i < n); |
| /// } |
| /// |
| /// and then it's possible for subsequent optimization to obscure the if |
| /// test in such a way that indvars can't find it. |
| /// |
| /// When indvars can't find the if test in loops like this, it creates a |
| /// max expression, which allows it to give the loop a canonical |
| /// induction variable: |
| /// |
| /// i = 0; |
| /// max = n < 1 ? 1 : n; |
| /// do { |
| /// p[i] = 0.0; |
| /// } while (++i != max); |
| /// |
| /// Canonical induction variables are necessary because the loop passes |
| /// are designed around them. The most obvious example of this is the |
| /// LoopInfo analysis, which doesn't remember trip count values. It |
| /// expects to be able to rediscover the trip count each time it is |
| /// needed, and it does this using a simple analysis that only succeeds if |
| /// the loop has a canonical induction variable. |
| /// |
| /// However, when it comes time to generate code, the maximum operation |
| /// can be quite costly, especially if it's inside of an outer loop. |
| /// |
| /// This function solves this problem by detecting loops of this type, |
| /// rewriting their conditions from ICMP_NE back to ICMP_SLT, and deleting |
| /// the instructions for the maximum computation. |
| /// |
| ICmpInst *LSRInstance::OptimizeMax(ICmpInst *Cond, IVStrideUse *&CondUse) { |
| // Check that the loop matches the pattern we're looking for. |
| if (Cond->getPredicate() != CmpInst::ICMP_EQ && |
| Cond->getPredicate() != CmpInst::ICMP_NE) |
| return Cond; |
| |
| SelectInst *Sel = dyn_cast<SelectInst>(Cond->getOperand(1)); |
| if (!Sel || !Sel->hasOneUse()) return Cond; |
| |
| const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L); |
| if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) |
| return Cond; |
| const SCEV *One = SE.getIntegerSCEV(1, BackedgeTakenCount->getType()); |
| |
| // Add one to the backedge-taken count to get the trip count. |
| const SCEV *IterationCount = SE.getAddExpr(BackedgeTakenCount, One); |
| |
| // Check for a max calculation that matches the pattern. |
| if (!isa<SCEVSMaxExpr>(IterationCount) && !isa<SCEVUMaxExpr>(IterationCount)) |
| return Cond; |
| const SCEVNAryExpr *Max = cast<SCEVNAryExpr>(IterationCount); |
| if (Max != SE.getSCEV(Sel)) return Cond; |
| |
| // To handle a max with more than two operands, this optimization would |
| // require additional checking and setup. |
| if (Max->getNumOperands() != 2) |
| return Cond; |
| |
| const SCEV *MaxLHS = Max->getOperand(0); |
| const SCEV *MaxRHS = Max->getOperand(1); |
| if (!MaxLHS || MaxLHS != One) return Cond; |
| // Check the relevant induction variable for conformance to |
| // the pattern. |
| const SCEV *IV = SE.getSCEV(Cond->getOperand(0)); |
| const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(IV); |
| if (!AR || !AR->isAffine() || |
| AR->getStart() != One || |
| AR->getStepRecurrence(SE) != One) |
| return Cond; |
| |
| assert(AR->getLoop() == L && |
| "Loop condition operand is an addrec in a different loop!"); |
| |
| // Check the right operand of the select, and remember it, as it will |
| // be used in the new comparison instruction. |
| Value *NewRHS = 0; |
| if (SE.getSCEV(Sel->getOperand(1)) == MaxRHS) |
| NewRHS = Sel->getOperand(1); |
| else if (SE.getSCEV(Sel->getOperand(2)) == MaxRHS) |
| NewRHS = Sel->getOperand(2); |
| if (!NewRHS) return Cond; |
| |
| // Determine the new comparison opcode. It may be signed or unsigned, |
| // and the original comparison may be either equality or inequality. |
| CmpInst::Predicate Pred = |
| isa<SCEVSMaxExpr>(Max) ? CmpInst::ICMP_SLT : CmpInst::ICMP_ULT; |
| if (Cond->getPredicate() == CmpInst::ICMP_EQ) |
| Pred = CmpInst::getInversePredicate(Pred); |
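| |
| // For example (illustrative), a latch exiting on (i != smax(1, n)) is |
| // rewritten to exit on (i slt n), while the equality form |
| // (i == smax(1, n)), whose branch senses are swapped, becomes (i sge n) |
| // via the inverse predicate. |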
| |
| // Ok, everything looks ok to change the condition into an SLT/ULT (or its |
| // inverse, SGE/UGE) and delete the max calculation. |
| ICmpInst *NewCond = |
| new ICmpInst(Cond, Pred, Cond->getOperand(0), NewRHS, "scmp"); |
| |
| // Delete the max calculation instructions. |
| Cond->replaceAllUsesWith(NewCond); |
| CondUse->setUser(NewCond); |
| Instruction *Cmp = cast<Instruction>(Sel->getOperand(0)); |
| Cond->eraseFromParent(); |
| Sel->eraseFromParent(); |
| if (Cmp->use_empty()) |
| Cmp->eraseFromParent(); |
| return NewCond; |
| } |
| |
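| /// StrideMightBeShared - Return true if the given constant stride might be |
| /// shared with the users of another stride. Equal constant strides can |
| /// trivially be shared; a stride that is an exact multiple of Stride might |
| /// also be reachable through a scaled addressing mode. For example |
| /// (illustrative), uses of stride 8 might reuse a stride-4 IV with |
| /// Scale = 2, on targets where that scale is legal. |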
| bool LSRInstance::StrideMightBeShared(const SCEV* Stride) { |
| int64_t SInt = cast<SCEVConstant>(Stride)->getValue()->getSExtValue(); |
| for (unsigned i = 0, e = IU.StrideOrder.size(); i != e; ++i) { |
| std::map<const SCEV *, IVUsersOfOneStride *>::iterator SI = |
| IU.IVUsesByStride.find(IU.StrideOrder[i]); |
| const SCEV *Share = SI->first; |
| if (!isa<SCEVConstant>(Share) || Share == Stride) |
| continue; |
| int64_t SSInt = cast<SCEVConstant>(Share)->getValue()->getSExtValue(); |
| if (SSInt == SInt) |
| return true; // This can definitely be reused. |
| if (unsigned(abs64(SSInt)) < SInt || (SSInt % SInt) != 0) |
| continue; |
| int64_t Scale = SSInt / SInt; |
| |
| // This AM will be used for conservative queries. At this point in the |
| // process we don't know which users will have a base reg, immediate, |
| // etc., so we conservatively assume they won't, which makes more strides |
| // appear legal and thus errs on the side of assuming that sharing might |
| // occur. |
| TargetLowering::AddrMode AM; |
| AM.Scale = Scale; |
| |
| // Any pre-inc iv use? |
| IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[Share]; |
| for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(), |
| E = StrideUses.Users.end(); I != E; ++I) { |
| bool isAddress = isAddressUse(I->getUser(), I->getOperandValToReplace()); |
| if (!I->isUseOfPostIncrementedValue() && |
| isLegalUse(AM, isAddress ? LSRUse::Address : LSRUse::Basic, |
| isAddress ? getAccessType(I->getUser()) : 0, |
| TLI)) |
| return true; |
| } |
| } |
| return false; |
| } |
| |
| /// OptimizeLoopTermCond - Change loop terminating condition to use the |
| /// postinc iv when possible. |
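| /// |
| /// For example (illustrative), when the latch comparison |
| /// |
| ///   %c = icmp eq i32 %i, %n |
| /// |
| /// is eligible (e.g., the branch is its only user inside the loop), it is |
| /// marked as a post-increment use, with its offset reduced by one stride |
| /// to compensate, so that it is later rewritten in terms of %i.next and |
| /// the pre- and post-increment live ranges can be coalesced. |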
| bool |
| LSRInstance::OptimizeLoopTermCond(Instruction *&IVIncInsertPos) { |
| SmallPtrSet<Instruction *, 4> PostIncs; |
| |
| BasicBlock *LatchBlock = L->getLoopLatch(); |
| SmallVector<BasicBlock*, 8> ExitingBlocks; |
| L->getExitingBlocks(ExitingBlocks); |
| |
| for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) { |
| BasicBlock *ExitingBlock = ExitingBlocks[i]; |
| |
| // Get the terminating condition for the loop if possible. If we |
| // can, we want to change it to use a post-incremented version of its |
| // induction variable, to allow coalescing the live ranges for the IV into |
| // one register value. |
| |
| BranchInst *TermBr = dyn_cast<BranchInst>(ExitingBlock->getTerminator()); |
| if (!TermBr) |
| continue; |
| // FIXME: Overly conservative; the termination condition could be an 'or', etc. |
| if (TermBr->isUnconditional() || !isa<ICmpInst>(TermBr->getCondition())) |
| continue; |
| |
| // Search IVUsesByStride to find Cond's IVUse if there is one. |
| IVStrideUse *CondUse = 0; |
| const SCEV *CondStride = 0; |
| ICmpInst *Cond = cast<ICmpInst>(TermBr->getCondition()); |
| if (!FindIVUserForCond(Cond, CondUse, CondStride)) |
| continue; |
| |
| // If the trip count is computed in terms of a max (due to ScalarEvolution |
| // being unable to find a sufficient guard, for example), change the loop |
| // comparison to use SLT or ULT instead of NE. |
| // One consequence of doing this now is that it disrupts the count-down |
| // optimization. That's not always a bad thing though, because in such |
| // cases it may still be worthwhile to avoid a max. |
| Cond = OptimizeMax(Cond, CondUse); |
| |
| // If this exiting block is the latch block, and the condition has only |
| // one use inside the loop (the branch), use the post-incremented value |
| // of the induction variable. |
| if (ExitingBlock != LatchBlock) { |
| // If this exiting block dominates the latch block, it may also use |
| // the post-inc value if it won't be shared with other uses. |
| // Check for dominance. |
| if (!DT.dominates(ExitingBlock, LatchBlock)) |
| continue; |
| // Check for sharing within the same stride. |
| bool SameStrideSharing = false; |
| IVUsersOfOneStride &StrideUses = *IU.IVUsesByStride[CondStride]; |
| for (ilist<IVStrideUse>::iterator I = StrideUses.Users.begin(), |
| E = StrideUses.Users.end(); I != E; ++I) { |
| if (I->getUser() == Cond) |
| continue; |
| if (!I->isUseOfPostIncrementedValue()) { |
| SameStrideSharing = true; |
| break; |
| } |
| } |
| if (SameStrideSharing) |
| continue; |
| // Check for sharing from a different stride. |
| if (isa<SCEVConstant>(CondStride) && StrideMightBeShared(CondStride)) |
| continue; |
| } |
| if (!Cond->hasOneUse()) { |
| bool HasOneUseInLoop = true; |
| for (Value::use_iterator UI = Cond->use_begin(), UE = Cond->use_end(); |
| UI != UE; ++UI) { |
| Instruction *U = cast<Instruction>(*UI); |
| if (U == TermBr) |
| continue; |
| if (L->contains(U)) { |
| HasOneUseInLoop = false; |
| break; |
| } |
| } |
| if (!HasOneUseInLoop) |
| continue; |
| } |
| |
| DEBUG(dbgs() << " Change loop exiting icmp to use postinc iv: " |
| << *Cond << '\n'); |
| |
| // It's possible for the setcc instruction to be anywhere in the loop, and |
| // possible for it to have multiple users. If it is not immediately before |
| // the exiting block branch, move it. |
| if (&*++BasicBlock::iterator(Cond) != TermBr) { |
| if (Cond->hasOneUse()) { // Condition has a single use, just move it. |
| Cond->moveBefore(TermBr); |
| } else { |
| // Otherwise, clone the terminating condition and insert it into the |
| // exiting block, immediately before the branch. |
| ICmpInst *OldCond = Cond; |
| Cond = cast<ICmpInst>(Cond->clone()); |
| Cond->setName(L->getHeader()->getName() + ".termcond"); |
| ExitingBlock->getInstList().insert(TermBr, Cond); |
| |
| // Clone the IVUse, as the old use still exists! |
| IU.IVUsesByStride[CondStride]->addUser(CondUse->getOffset(), Cond, |
| CondUse->getOperandValToReplace()); |
| CondUse = &IU.IVUsesByStride[CondStride]->Users.back(); |
| TermBr->replaceUsesOfWith(OldCond, Cond); |
| } |
| } |
| |
| // If we get to here, we know that we can transform the setcc instruction to |
| // use the post-incremented version of the IV, allowing us to coalesce the |
| // live ranges for the IV correctly. |
| CondUse->setOffset(SE.getMinusSCEV(CondUse->getOffset(), CondStride)); |
| CondUse->setIsUseOfPostIncrementedValue(true); |
| Changed = true; |
| |
| PostIncs.insert(Cond); |
| } |
| |
| // Determine an insertion point for the loop induction variable increment. It |
| // must dominate all the post-inc comparisons we just set up, and it must |
| // dominate the loop latch edge. |
| IVIncInsertPos = L->getLoopLatch()->getTerminator(); |
| for (SmallPtrSet<Instruction *, 4>::iterator I = PostIncs.begin(), |
| E = PostIncs.end(); I != E; ++I) { |
| BasicBlock *BB = |
| DT.findNearestCommonDominator(IVIncInsertPos->getParent(), |
| (*I)->getParent()); |
| if (BB == (*I)->getParent()) |
| IVIncInsertPos = *I; |
| else if (BB != IVIncInsertPos->getParent()) |
| IVIncInsertPos = BB->getTerminator(); |
| } |
| |
| return Changed; |
| } |
| |
| /// CountRegister - Note the given register. |
| void LSRInstance::CountRegister(const SCEV *Reg, uint32_t Complexity, |
| size_t LUIdx) { |
| std::pair<RegUsesTy::iterator, bool> Pair = |
| RegUses.insert(std::make_pair(Reg, RegSortData())); |
| RegSortData &BV = Pair.first->second; |
| if (Pair.second) { |
| BV.Index = CurrentArbitraryRegIndex++; |
| BV.MaxComplexity = Complexity; |
| RegSequence.push_back(Reg); |
| } else { |
| BV.MaxComplexity = std::max(BV.MaxComplexity, Complexity); |
| } |
| BV.Bits.resize(std::max(BV.Bits.size(), LUIdx + 1)); |
| BV.Bits.set(LUIdx); |
| } |
| |
| /// CountRegisters - Note which registers are used by the given formula, |
| /// updating RegUses. |
| void LSRInstance::CountRegisters(const Formula &F, size_t LUIdx) { |
| uint32_t Complexity = F.getComplexity(); |
| if (F.ScaledReg) |
| CountRegister(F.ScaledReg, Complexity, LUIdx); |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = F.BaseRegs.begin(), |
| E = F.BaseRegs.end(); I != E; ++I) |
| CountRegister(*I, Complexity, LUIdx); |
| } |
| |
| /// GenerateSymbolicOffsetReuse - Generate reuse formulae using symbolic |
| /// offsets. |
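| /// For example (illustrative), a base register such as (@var + %x) may be |
| /// split so that @var becomes the address mode's BaseGV while %x remains |
| /// the base register, when such an address is legal for the target. |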
| void LSRInstance::GenerateSymbolicOffsetReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| // We can't add a symbolic offset if the address already contains one. |
| if (Base.AM.BaseGV) return; |
| |
| for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { |
| const SCEV *G = Base.BaseRegs[i]; |
| GlobalValue *GV = ExtractSymbol(G, SE); |
| if (G->isZero()) |
| continue; |
| Formula F = Base; |
| F.AM.BaseGV = GV; |
| if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) |
| continue; |
| F.BaseRegs[i] = G; |
| if (LU.InsertFormula(F)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| } |
| |
| /// GenerateICmpZeroScaledReuse - For ICmpZero, check to see if we can scale up |
| /// the comparison. For example, x == y -> x*c == y*c. |
| void LSRInstance::GenerateICmpZeroScaledReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| if (LU.Kind != LSRUse::ICmpZero) return; |
| |
| // Determine the integer type for the base formula. |
| const Type *IntTy = Base.getType(); |
| if (!IntTy) return; |
| if (SE.getTypeSizeInBits(IntTy) > 64) return; |
| IntTy = SE.getEffectiveSCEVType(IntTy); |
| |
| assert(!Base.AM.BaseGV && "ICmpZero use is not legal!"); |
| |
| // Check each interesting stride. |
| for (SmallSetVector<int64_t, 4>::const_iterator |
| I = Factors.begin(), E = Factors.end(); I != E; ++I) { |
| int64_t Factor = *I; |
| Formula F = Base; |
| |
| // Check that the multiplication doesn't overflow. |
| F.AM.BaseOffs = (uint64_t)F.AM.BaseOffs * Factor; |
| if ((int64_t)F.AM.BaseOffs / Factor != F.AM.BaseOffs) |
| continue; |
| |
| // Check that this scale is legal. |
| if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) |
| continue; |
| |
| const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor)); |
| |
| // Check that multiplying with each base register doesn't overflow. |
| for (size_t i = 0, e = F.BaseRegs.size(); i != e; ++i) { |
| F.BaseRegs[i] = SE.getMulExpr(F.BaseRegs[i], FactorS); |
| if (getSDiv(F.BaseRegs[i], FactorS, SE) != Base.BaseRegs[i]) |
| goto next; |
| } |
| |
| // Check that multiplying with the scaled register doesn't overflow. |
| if (F.ScaledReg) { |
| F.ScaledReg = SE.getMulExpr(F.ScaledReg, FactorS); |
| if (getSDiv(F.ScaledReg, FactorS, SE) != Base.ScaledReg) |
| continue; |
| } |
| |
| // If we make it here and it's legal, add it. |
| if (LU.InsertFormula(F)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| next:; |
| } |
| } |
| |
| /// GenerateFormulaeFromReplacedBaseReg - If removing the base register at |
| /// index i from the BaseRegs list and adding the registers in AddOps |
| /// to the address forms an interesting formula, pursue it. |
| void |
| LSRInstance::GenerateFormulaeFromReplacedBaseReg( |
| LSRUse &LU, |
| unsigned LUIdx, |
| const Formula &Base, unsigned i, |
| const SmallVectorImpl<const SCEV *> |
| &AddOps) { |
| if (AddOps.empty()) return; |
| |
| Formula F = Base; |
| std::swap(F.BaseRegs[i], F.BaseRegs.back()); |
| F.BaseRegs.pop_back(); |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = AddOps.begin(), |
| E = AddOps.end(); I != E; ++I) |
| F.BaseRegs.push_back(*I); |
| F.AM.HasBaseReg = !F.BaseRegs.empty(); |
| if (LU.InsertFormula(F)) { |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| // Recurse. |
| GenerateReassociationReuse(LU, LUIdx, LU.Formulae.back()); |
| } |
| } |
| |
| /// GenerateReassociationReuse - Split out subexpressions from adds and |
| /// the bases of addrecs. |
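| /// For example (illustrative), a base register (%a + %b) may be split into |
| /// two base registers %a and %b, and an addrec {(%a + %b),+,4} may be |
| /// split into %a plus {%b,+,4}, or into (%a + %b) plus {0,+,4}, exposing |
| /// subexpressions that other uses may be able to share. |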
| void LSRInstance::GenerateReassociationReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| SmallVector<const SCEV *, 8> AddOps; |
| for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) { |
| const SCEV *BaseReg = Base.BaseRegs[i]; |
| if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(BaseReg)) { |
| for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end(); |
| J != JE; ++J) { |
| // Don't pull a constant into a register if the constant could be |
| // folded into an immediate field. |
| if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) continue; |
| SmallVector<const SCEV *, 8> InnerAddOps; |
| for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K) |
| if (K != J) |
| InnerAddOps.push_back(*K); |
| // Splitting a 2-operand add both ways is redundant. Pruning this |
| // now saves compile time. |
| if (InnerAddOps.size() < 2 && next(J) == JE) |
| continue; |
| AddOps.push_back(*J); |
| const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps); |
| AddOps.push_back(InnerAdd); |
| GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); |
| AddOps.clear(); |
| } |
| } else if (const SCEVAddRecExpr *AR = dyn_cast<SCEVAddRecExpr>(BaseReg)) { |
| if (const SCEVAddExpr *Add = dyn_cast<SCEVAddExpr>(AR->getStart())) { |
| for (SCEVAddExpr::op_iterator J = Add->op_begin(), JE = Add->op_end(); |
| J != JE; ++J) { |
| // Don't pull a constant into a register if the constant could be |
| // folded into an immediate field. |
| if (isAlwaysFoldable(*J, true, LU.Kind, LU.AccessTy, TLI, SE)) |
| continue; |
| SmallVector<const SCEV *, 8> InnerAddOps; |
| for (SCEVAddExpr::op_iterator K = Add->op_begin(); K != JE; ++K) |
| if (K != J) |
| InnerAddOps.push_back(*K); |
| AddOps.push_back(*J); |
| const SCEV *InnerAdd = SE.getAddExpr(InnerAddOps); |
| AddOps.push_back(SE.getAddRecExpr(InnerAdd, |
| AR->getStepRecurrence(SE), |
| AR->getLoop())); |
| GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); |
| AddOps.clear(); |
| } |
| } else if (!isAlwaysFoldable(AR->getStart(), Base.BaseRegs.size() > 1, |
| LU.Kind, LU.AccessTy, |
| TLI, SE)) { |
| AddOps.push_back(AR->getStart()); |
| AddOps.push_back(SE.getAddRecExpr(SE.getIntegerSCEV(0, |
| BaseReg->getType()), |
| AR->getStepRecurrence(SE), |
| AR->getLoop())); |
| GenerateFormulaeFromReplacedBaseReg(LU, LUIdx, Base, i, AddOps); |
| AddOps.clear(); |
| } |
| } |
| } |
| } |
| |
| /// GenerateCombinationReuse - Generate a formula consisting of all of the |
| /// loop-dominating registers added into a single register. |
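| /// For example (illustrative), a formula with base registers |
| /// {%a, %b, {0,+,4}} may be rewritten to use {(%a + %b), {0,+,4}}, since |
| /// the two loop-invariant registers can be precomputed as a single sum |
| /// outside the loop, reducing in-loop register pressure. |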
| void LSRInstance::GenerateCombinationReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| if (Base.BaseRegs.size() <= 1) return; |
| |
| Formula F = Base; |
| F.BaseRegs.clear(); |
| SmallVector<const SCEV *, 4> Ops; |
| for (SmallVectorImpl<const SCEV *>::const_iterator |
| I = Base.BaseRegs.begin(), E = Base.BaseRegs.end(); I != E; ++I) { |
| const SCEV *BaseReg = *I; |
| if (BaseReg->properlyDominates(L->getHeader(), &DT) && |
| !BaseReg->hasComputableLoopEvolution(L)) |
| Ops.push_back(BaseReg); |
| else |
| F.BaseRegs.push_back(BaseReg); |
| } |
| if (Ops.size() > 1) { |
| F.BaseRegs.push_back(SE.getAddExpr(Ops)); |
| if (LU.InsertFormula(F)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| } |
| |
| /// GenerateScaledReuse - Generate stride factor reuse formulae by making |
| /// use of scaled-offset address modes, for example. |
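| /// For example (illustrative), with a factor of 4, an addrec base register |
| /// {0,+,4} may be divided through to yield ScaledReg = {0,+,1} with |
| /// AM.Scale = 4, matching x86-style reg + index*4 addressing. |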
| void LSRInstance::GenerateScaledReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| // Determine the integer type for the base formula. |
| const Type *IntTy = Base.getType(); |
| if (!IntTy) return; |
| IntTy = SE.getEffectiveSCEVType(IntTy); |
| |
| // Check each interesting stride. |
| for (SmallSetVector<int64_t, 4>::const_iterator |
| I = Factors.begin(), E = Factors.end(); I != E; ++I) { |
| int64_t Factor = *I; |
| |
| // If this Formula already has a scaled register, we can't add another one. |
| if (Base.AM.Scale != 0) |
| continue; |
| Formula F = Base; |
| F.AM.Scale = Factor; |
| // Check whether this scale is going to be legal. |
| if (!isLegalUse(F.AM, LU.Kind, LU.AccessTy, TLI)) { |
| // As a special case, allow out-of-loop Basic users to be reclassified as |
| // Special users, for which the scale is legal. |
| // TODO: Reconsider this special case. |
| if (LU.Kind == LSRUse::Basic && |
| isLegalUse(F.AM, LSRUse::Special, LU.AccessTy, TLI) && |
| !L->contains(LU.UserInst)) |
| LU.Kind = LSRUse::Special; |
| else |
| continue; |
| } |
| // For each addrec base reg, apply the scale, if possible. |
| for (size_t i = 0, e = Base.BaseRegs.size(); i != e; ++i) |
| if (const SCEVAddRecExpr *AR = |
| dyn_cast<SCEVAddRecExpr>(Base.BaseRegs[i])) { |
| const SCEV *FactorS = SE.getSCEV(ConstantInt::get(IntTy, Factor)); |
| // Divide out the factor, ignoring high bits, since we'll be |
| // scaling the value back up in the end. |
| if (const SCEV *Quotient = getSDiv(AR, FactorS, SE, true)) { |
| // TODO: This could be optimized to avoid all the copying. |
| Formula NewF = F; |
| NewF.ScaledReg = Quotient; |
| std::swap(NewF.BaseRegs[i], NewF.BaseRegs.back()); |
| NewF.BaseRegs.pop_back(); |
| NewF.AM.HasBaseReg = !NewF.BaseRegs.empty(); |
| if (LU.InsertFormula(NewF)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| } |
| } |
| } |
| |
| /// GenerateTruncateReuse - Generate reuse formulae from different IV types. |
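| /// For example (illustrative), if the loop has both an i64 IV and an i32 |
| /// IV, and the target reports i64-to-i32 truncation as free, the i32 |
| /// use's registers may be rewritten as any-extended i64 expressions so |
| /// that the two uses can share a single i64 IV. |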
| void LSRInstance::GenerateTruncateReuse(LSRUse &LU, unsigned LUIdx, |
| Formula Base) { |
| // This requires TargetLowering to tell us which truncates are free. |
| if (!TLI) return; |
| |
| // Don't attempt to truncate symbolic values. |
| if (Base.AM.BaseGV) return; |
| |
| // Determine the integer type for the base formula. |
| const Type *DstTy = Base.getType(); |
| if (!DstTy) return; |
| DstTy = SE.getEffectiveSCEVType(DstTy); |
| |
| for (SmallSetVector<const Type *, 4>::const_iterator |
| I = Types.begin(), E = Types.end(); I != E; ++I) { |
| const Type *SrcTy = *I; |
| if (SrcTy != DstTy && TLI->isTruncateFree(SrcTy, DstTy)) { |
| Formula F = Base; |
| if (F.ScaledReg) F.ScaledReg = SE.getAnyExtendExpr(F.ScaledReg, SrcTy); |
| for (SmallVectorImpl<const SCEV *>::iterator J = F.BaseRegs.begin(), |
| JE = F.BaseRegs.end(); J != JE; ++J) |
| *J = SE.getAnyExtendExpr(*J, SrcTy); |
| if (LU.InsertFormula(F)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| } |
| } |
| |
| namespace { |
| |
| /// WorkItem - Helper class for GenerateConstantOffsetReuse. It's used to |
| /// defer modifications so that the search phase doesn't have to worry about |
| /// the data structures moving underneath it. |
| struct WorkItem { |
| LSRUse *LU; |
| size_t LUIdx; |
| int64_t Imm; |
| const SCEV *OrigReg; |
| |
| WorkItem(LSRUse *U, size_t LI, int64_t I, const SCEV *R) |
| : LU(U), LUIdx(LI), Imm(I), OrigReg(R) {} |
| |
| void print(raw_ostream &OS) const; |
| void dump() const; |
| }; |
| |
| void WorkItem::print(raw_ostream &OS) const { |
| OS << "in use "; |
| LU->print(OS); |
| OS << " (at index " << LUIdx << "), add offset " << Imm |
| << " and compensate by adjusting refences to " << *OrigReg << "\n"; |
| } |
| |
| void WorkItem::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| } |
| |
| /// GenerateConstantOffsetReuse - Look for registers which are a constant |
| /// distance apart and try to form reuse opportunities between them. |
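| /// For example (illustrative), the registers {0,+,4} and {8,+,4} differ |
| /// only by the constant 8, so a use of the latter may instead be given a |
| /// formula that uses {0,+,4} and folds an immediate offset of 8 into the |
| /// address mode. |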
| void LSRInstance::GenerateConstantOffsetReuse() { |
| // Group the registers by their value without any added constant offset. |
| typedef std::map<int64_t, const SCEV *> ImmMapTy; |
| typedef DenseMap<const SCEV *, ImmMapTy> RegMapTy; |
| RegMapTy Map; |
| SmallVector<const SCEV *, 8> Sequence; |
| for (SmallVectorImpl<const SCEV *>::iterator I = RegSequence.begin(), |
| E = RegSequence.end(); I != E; ++I) { |
| const SCEV *Reg = *I; |
| int64_t Imm = ExtractImmediate(Reg, SE); |
| std::pair<RegMapTy::iterator, bool> Pair = |
| Map.insert(std::make_pair(Reg, ImmMapTy())); |
| if (Pair.second) |
| Sequence.push_back(Reg); |
| Pair.first->second.insert(std::make_pair(Imm, *I)); |
| } |
| |
| // Insert an artificial expression at offset 0 (if there isn't one already), |
| // as this may lead to more reuse opportunities. |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), |
| E = Sequence.end(); I != E; ++I) |
| Map.find(*I)->second.insert(ImmMapTy::value_type(0, 0)); |
| |
| // Now examine each set of registers with the same base value. Build up |
| // a list of work to do and do the work in a separate step so that we're |
| // not adding formulae and register counts while we're searching. |
| SmallVector<WorkItem, 32> WorkItems; |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = Sequence.begin(), |
| E = Sequence.end(); I != E; ++I) { |
| const SCEV *Reg = *I; |
| const ImmMapTy &Imms = Map.find(Reg)->second; |
| // Examine each offset. |
| for (ImmMapTy::const_iterator J = Imms.begin(), JE = Imms.end(); |
| J != JE; ++J) { |
| const SCEV *OrigReg = J->second; |
| // Skip the artificial register at offset 0. |
| if (!OrigReg) continue; |
| |
| int64_t JImm = J->first; |
| const SmallBitVector &Bits = RegUses.find(OrigReg)->second.Bits; |
| |
| // Examine each other offset associated with the same register. This is |
| // quadratic in the number of registers with the same base, but it's |
| // uncommon for this to be a large number. |
| for (ImmMapTy::const_iterator M = Imms.begin(); M != JE; ++M) { |
| if (M == J) continue; |
| |
| // Compute the difference between the two. |
| int64_t Imm = (uint64_t)JImm - M->first; |
| for (int LUIdx = Bits.find_first(); LUIdx != -1; |
| LUIdx = Bits.find_next(LUIdx)) |
| // Make a memo of this use, offset, and register tuple. |
| WorkItems.push_back(WorkItem(&Uses[LUIdx], LUIdx, Imm, OrigReg)); |
| } |
| } |
| } |
| |
| // Now iterate through the worklist and add new formulae. |
| for (SmallVectorImpl<WorkItem>::const_iterator I = WorkItems.begin(), |
| E = WorkItems.end(); I != E; ++I) { |
| const WorkItem &WI = *I; |
| LSRUse &LU = *WI.LU; |
| size_t LUIdx = WI.LUIdx; |
| int64_t Imm = WI.Imm; |
| const SCEV *OrigReg = WI.OrigReg; |
| |
| const Type *IntTy = SE.getEffectiveSCEVType(OrigReg->getType()); |
| const SCEV *NegImmS = SE.getSCEV(ConstantInt::get(IntTy, |
| -(uint64_t)Imm)); |
| |
| for (size_t L = 0, LE = LU.Formulae.size(); L != LE; ++L) { |
| Formula F = LU.Formulae[L]; |
| // Use the immediate in the scaled register. |
| if (F.ScaledReg == OrigReg) { |
| int64_t Offs = (uint64_t)F.AM.BaseOffs + |
| Imm * (uint64_t)F.AM.Scale; |
| // Don't create 50 + reg(-50). |
| if (F.referencesReg(SE.getSCEV( |
| ConstantInt::get(IntTy, -(uint64_t)Offs)))) |
| continue; |
| Formula NewF = F; |
| NewF.AM.BaseOffs = Offs; |
| if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI)) |
| continue; |
| const SCEV *Diff = SE.getAddExpr(NegImmS, NewF.ScaledReg); |
| if (Diff->isZero()) continue; |
| NewF.ScaledReg = Diff; |
| if (LU.InsertFormula(NewF)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| // Use the immediate in a base register. |
| for (size_t N = 0, NE = F.BaseRegs.size(); N != NE; ++N) { |
| const SCEV *BaseReg = F.BaseRegs[N]; |
| if (BaseReg != OrigReg) |
| continue; |
| Formula NewF = F; |
| NewF.AM.BaseOffs = (uint64_t)NewF.AM.BaseOffs + Imm; |
| if (!isLegalUse(NewF.AM, LU.Kind, LU.AccessTy, TLI)) |
| continue; |
| const SCEV *Diff = SE.getAddExpr(NegImmS, BaseReg); |
| if (Diff->isZero()) continue; |
| // Don't create 50 + reg(-50). |
| if (Diff == |
| SE.getSCEV(ConstantInt::get(IntTy, |
| -(uint64_t)NewF.AM.BaseOffs))) |
| continue; |
| NewF.BaseRegs[N] = Diff; |
| if (LU.InsertFormula(NewF)) |
| CountRegisters(LU.Formulae.back(), LUIdx); |
| } |
| } |
| } |
| } |
| |
| /// GenerateAllReuseFormulae - Generate formulae for each use. |
| void |
| LSRInstance::GenerateAllReuseFormulae() { |
| SmallVector<Formula, 12> Save; |
| for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) { |
| LSRUse &LU = Uses[LUIdx]; |
| |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateSymbolicOffsetReuse(LU, LUIdx, LU.Formulae[i]); |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateICmpZeroScaledReuse(LU, LUIdx, LU.Formulae[i]); |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateReassociationReuse(LU, LUIdx, LU.Formulae[i]); |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateCombinationReuse(LU, LUIdx, LU.Formulae[i]); |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateScaledReuse(LU, LUIdx, LU.Formulae[i]); |
| for (size_t i = 0, f = LU.Formulae.size(); i != f; ++i) |
| GenerateTruncateReuse(LU, LUIdx, LU.Formulae[i]); |
| } |
| |
| GenerateConstantOffsetReuse(); |
| } |
| |
| /// GenerateLoopInvariantRegisterUses - Check for other uses of loop-invariant |
| /// values which we're tracking. These other uses will pin these values in |
| /// registers, making them less profitable for elimination. |
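| /// For example (illustrative), if a tracked loop-invariant value %n is |
| /// also read by an instruction that is not part of any tracked SCEV |
| /// expression, a trivial use is recorded for it, so the register voting |
| /// process knows %n stays live regardless of which formulae are chosen. |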
| /// TODO: This currently misses non-constant addrec step registers. |
| /// TODO: Should this give more weight to users inside the loop? |
| void |
| LSRInstance::GenerateLoopInvariantRegisterUses() { |
| for (size_t i = 0, e = RegSequence.size(); i != e; ++i) |
| if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(RegSequence[i])) { |
| const Value *V = U->getValue(); |
| if (const Instruction *Inst = dyn_cast<Instruction>(V)) |
| if (L->contains(Inst)) continue; |
| for (Value::use_const_iterator UI = V->use_begin(), UE = V->use_end(); |
| UI != UE; ++UI) { |
| const Instruction *UserInst = dyn_cast<Instruction>(*UI); |
| // Ignore non-instructions. |
| if (!UserInst) |
| continue; |
| // Ignore instructions in other functions (as can happen with |
| // Constants). |
| if (UserInst->getParent()->getParent() != L->getHeader()->getParent()) |
| continue; |
| // Ignore instructions not dominated by the loop. |
| const BasicBlock *UseBB = !isa<PHINode>(UserInst) ? |
| UserInst->getParent() : |
| cast<PHINode>(UserInst)->getIncomingBlock( |
| PHINode::getIncomingValueNumForOperand(UI.getOperandNo())); |
| if (!DT.dominates(L->getHeader(), UseBB)) |
| continue; |
| // Ignore uses which are part of other SCEV expressions, to avoid |
| // analyzing them multiple times. |
| if (SE.isSCEVable(UserInst->getType()) && |
| !isa<SCEVUnknown>(SE.getSCEV(const_cast<Instruction *>(UserInst)))) |
| continue; |
| // Ignore icmp instructions which are already being analyzed. |
| if (const ICmpInst *ICI = dyn_cast<ICmpInst>(UserInst)) { |
| unsigned OtherIdx = !UI.getOperandNo(); |
| Value *OtherOp = const_cast<Value *>(ICI->getOperand(OtherIdx)); |
| if (SE.getSCEV(OtherOp)->hasComputableLoopEvolution(L)) |
| continue; |
| } |
| |
| LSRUse &LU = getNewUse(); |
| LU.UserInst = const_cast<Instruction *>(UserInst); |
| LU.OperandValToReplace = UI.getUse(); |
| LU.InsertSupplementalFormula(U); |
| CountRegisters(LU.Formulae.back(), Uses.size() - 1); |
| } |
| } |
| } |
| |
| #ifndef NDEBUG |
| |
| static void debug_winner(const SmallVectorImpl<LSRUse> &Uses) { |
| dbgs() << "LSR has selected formulae for each use:\n"; |
| for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), |
| E = Uses.end(); I != E; ++I) { |
| const LSRUse &LU = *I; |
| dbgs() << " "; |
| LU.print(dbgs()); |
| dbgs() << '\n'; |
| dbgs() << " "; |
| LU.Formulae.front().print(dbgs()); |
| dbgs() << "\n"; |
| } |
| } |
| |
| #endif |
| |
| LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) |
| : IU(P->getAnalysis<IVUsers>()), |
| SE(P->getAnalysis<ScalarEvolution>()), |
| DT(P->getAnalysis<DominatorTree>()), |
| TLI(tli), L(l), Changed(false), CurrentArbitraryRegIndex(0) { |
| |
| // If LoopSimplify form is not available, stay out of trouble. |
| if (!L->isLoopSimplifyForm()) return; |
| |
| // If there's no interesting work to be done, bail early. |
| if (IU.IVUsesByStride.empty()) return; |
| |
| DEBUG(dbgs() << "\nLSR on loop "; |
| WriteAsOperand(dbgs(), L->getHeader(), /*PrintType=*/false); |
| dbgs() << ":\n"); |
| |
| // Sort the StrideOrder so we process larger strides first. |
| std::stable_sort(IU.StrideOrder.begin(), IU.StrideOrder.end(), |
| StrideCompare(SE)); |
| |
| /// OptimizeShadowIV - If the IV is used in an int-to-float cast inside |
| /// the loop, then try to eliminate the cast operation. |
| OptimizeShadowIV(); |
| |
| // Change loop terminating condition to use the postinc iv when possible. |
| Instruction *IVIncInsertPos; |
| Changed |= OptimizeLoopTermCond(IVIncInsertPos); |
| |
| for (SmallVectorImpl<const SCEV *>::const_iterator SIter = |
| IU.StrideOrder.begin(), SEnd = IU.StrideOrder.end(); |
| SIter != SEnd; ++SIter) { |
| const SCEV *Stride = *SIter; |
| |
| // Collect interesting types. |
| Types.insert(SE.getEffectiveSCEVType(Stride->getType())); |
| |
| // Collect interesting factors. |
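| // For example (illustrative), strides of 4 and 8 may yield the factor 2 |
| // (and, once an equality compare is seen, -2), which GenerateScaledReuse |
| // can later exploit as an addressing-mode scale. |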
| for (SmallVectorImpl<const SCEV *>::const_iterator NewStrideIter = |
| SIter + 1; NewStrideIter != SEnd; ++NewStrideIter) { |
| const SCEV *OldStride = Stride; |
| const SCEV *NewStride = *NewStrideIter; |
| |
| if (SE.getTypeSizeInBits(OldStride->getType()) != |
| SE.getTypeSizeInBits(NewStride->getType())) { |
| if (SE.getTypeSizeInBits(OldStride->getType()) > |
| SE.getTypeSizeInBits(NewStride->getType())) |
| NewStride = SE.getSignExtendExpr(NewStride, OldStride->getType()); |
| else |
| OldStride = SE.getSignExtendExpr(OldStride, NewStride->getType()); |
| } |
| if (const SCEVConstant *Factor = |
| dyn_cast_or_null<SCEVConstant>(getSDiv(NewStride, OldStride, |
| SE, true))) |
| if (Factor->getValue()->getValue().getMinSignedBits() <= 64) |
| Factors.insert(Factor->getValue()->getValue().getSExtValue()); |
| } |
| |
| std::map<const SCEV *, IVUsersOfOneStride *>::const_iterator SI = |
| IU.IVUsesByStride.find(Stride); |
| assert(SI != IU.IVUsesByStride.end() && "Stride doesn't exist!"); |
| for (ilist<IVStrideUse>::const_iterator UI = SI->second->Users.begin(), |
| E = SI->second->Users.end(); UI != E; ++UI) { |
| // Record the uses. |
| LSRUse &LU = getNewUse(); |
| LU.UserInst = UI->getUser(); |
| LU.OperandValToReplace = UI->getOperandValToReplace(); |
| if (isAddressUse(LU.UserInst, LU.OperandValToReplace)) { |
| LU.Kind = LSRUse::Address; |
| LU.AccessTy = getAccessType(LU.UserInst); |
| } |
| if (UI->isUseOfPostIncrementedValue()) |
| LU.PostIncLoop = L; |
| |
| const SCEV *S = IU.getCanonicalExpr(*UI); |
| |
| // Equality (== and !=) ICmps are special. We can rewrite (i == N) as |
| // (N - i == 0), and this allows (N - i) to be the expression that we |
| // work with rather than just N or i, so we can consider the register |
| // requirements for both N and i at the same time. Limiting this code |
| // to equality icmps is not a problem because all interesting loops |
| // use equality icmps, thanks to IndVarSimplify. |
| if (ICmpInst *CI = dyn_cast<ICmpInst>(LU.UserInst)) |
| if (CI->isEquality()) { |
| // Swap the operands if needed to put the OperandValToReplace on |
| // the left, for consistency. |
| Value *NV = CI->getOperand(1); |
| if (NV == LU.OperandValToReplace) { |
| CI->setOperand(1, CI->getOperand(0)); |
| CI->setOperand(0, NV); |
| } |
| |
| // x == y --> x - y == 0 |
| const SCEV *N = SE.getSCEV(NV); |
| if (N->isLoopInvariant(L)) { |
| LU.Kind = LSRUse::ICmpZero; |
| S = SE.getMinusSCEV(N, S); |
| } |
| |
| // -1 and the negations of all interesting strides (except the |
| // negation of -1) are now also interesting. |
| for (size_t i = 0, e = Factors.size(); i != e; ++i) |
| if (Factors[i] != -1) |
| Factors.insert(-(uint64_t)Factors[i]); |
| Factors.insert(-1); |
| } |
| |
| // Ok, now enumerate all the different formulae we can find to compute |
| // the value for this expression. |
| LU.InsertInitialFormula(S, L, SE, DT); |
| CountRegisters(LU.Formulae.back(), Uses.size() - 1); |
| } |
| } |
| |
| // If all uses use the same type, don't bother looking for truncation-based |
| // reuse. |
| if (Types.size() == 1) |
| Types.clear(); |
| |
| // Now use the reuse data to generate a bunch of interesting ways |
| // to formulate the values needed for the uses. |
| GenerateAllReuseFormulae(); |
| |
| // If there are any uses of registers we're tracking that have escaped |
| // IVUsers' attention, add trivial uses for them, so that the register |
| // voting process takes them into consideration. |
| GenerateLoopInvariantRegisterUses(); |
| |
| // Sort the formulae. TODO: This is redundant with the re-sorting done below. |
| for (SmallVectorImpl<LSRUse>::iterator I = Uses.begin(), E = Uses.end(); |
| I != E; ++I) { |
| LSRUse &LU = *I; |
| std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(), |
| ComplexitySorter()); |
| } |
| |
| // Ok, we've now collected all the uses and noted their register uses. The |
| // next step is to start looking at register reuse possibilities. |
| DEBUG(print(dbgs()); dbgs() << '\n'); |
| |
| // Start by assuming we'll assign each use its own register. This is |
| // sometimes called "full" strength reduction, or "superhero" mode. |
| // Sometimes this is the best solution, but if there are opportunities for |
| // reuse we may find a better solution. |
| Score CurScore; |
| CurScore.RateInitial(Uses, L, SE); |
| |
| // Create a sorted list of registers with those with the most uses appearing |
| // earlier in the list. We'll visit them first, as they're the most likely |
| // to represent profitable reuse opportunities. |
| SmallVector<RegCount, 8> RegOrder; |
| for (SmallVectorImpl<const SCEV *>::const_iterator I = |
| RegSequence.begin(), E = RegSequence.end(); I != E; ++I) |
| RegOrder.push_back(RegCount(*I, RegUses.find(*I)->second)); |
| std::stable_sort(RegOrder.begin(), RegOrder.end()); |
| |
| // Visit each register. Determine which ones represent profitable reuse |
| // opportunities and remember them. |
| // TODO: Extract this code into a function. |
| for (SmallVectorImpl<RegCount>::const_iterator I = RegOrder.begin(), |
| E = RegOrder.end(); I != E; ++I) { |
| const SCEV *Reg = I->Reg; |
| const SmallBitVector &Bits = I->Sort.Bits; |
| |
| // Registers with only one use don't represent reuse opportunities, so |
| // when we get there, we're done. |
| if (Bits.count() <= 1) break; |
| |
| DEBUG(dbgs() << "Reg " << *Reg << ": "; |
| I->Sort.print(dbgs()); |
| dbgs() << '\n'); |
| |
| // Determine the total number of registers that will be needed if we make |
| // use of the reuse opportunity represented by the current register. |
| Score NewScore; |
| NewScore.Rate(Reg, Bits, Uses, L, SE); |
| |
| // Now decide whether this register's reuse opportunity is an overall win. |
| // Currently the decision is heavily skewed for register pressure. |
| if (!(NewScore < CurScore)) { |
| continue; |
| } |
| |
| // Ok, use this opportunity. |
| DEBUG(dbgs() << "This candidate has been accepted.\n"); |
| CurScore = NewScore; |
| |
| // Now that we've selected a new reuse opportunity, delete formulae that |
| // do not participate in that opportunity. |
| for (int j = Bits.find_first(); j != -1; j = Bits.find_next(j)) { |
| LSRUse &LU = Uses[j]; |
| for (unsigned k = 0, h = LU.Formulae.size(); k != h; ++k) { |
| Formula &F = LU.Formulae[k]; |
| if (!F.referencesReg(Reg)) { |
| std::swap(LU.Formulae[k], LU.Formulae.back()); |
| LU.Formulae.pop_back(); |
| --k; --h; |
| } |
| } |
| // Also re-sort the list to put the formulae with the fewest registers |
| // at the front. |
| // TODO: Do this earlier, we don't need it each time. |
| std::stable_sort(LU.Formulae.begin(), LU.Formulae.end(), |
| ComplexitySorter()); |
| } |
| } |
| |
| // Ok, we've now made all our decisions. The first formula for each use |
| // will be used. |
| DEBUG(dbgs() << "Concluding, we need "; CurScore.print(dbgs()); |
| dbgs() << ".\n"; |
| debug_winner(Uses)); |
| |
| // Free memory no longer needed. |
| RegOrder.clear(); |
| Factors.clear(); |
| Types.clear(); |
| RegUses.clear(); |
| RegSequence.clear(); |
| |
| // Keep track of instructions we may have made dead, so that |
| // we can remove them after we are done working. |
| SmallVector<WeakVH, 16> DeadInsts; |
| |
| SCEVExpander Rewriter(SE); |
| Rewriter.disableCanonicalMode(); |
| Rewriter.setIVIncInsertPos(L, IVIncInsertPos); |
| |
| // Expand the new value definitions and update the users. |
| for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), |
| E = Uses.end(); I != E; ++I) { |
| // Formulae should be legal. |
| DEBUG(for (SmallVectorImpl<Formula>::const_iterator J = I->Formulae.begin(), |
| JE = I->Formulae.end(); J != JE; ++J) |
| assert(isLegalUse(J->AM, I->Kind, I->AccessTy, TLI) && |
| "Illegal formula generated!")); |
| |
| // Expand the new code and update the user. |
| I->Rewrite(L, Rewriter, DeadInsts, SE, DT, P); |
| Changed = true; |
| } |
| |
| // Clean up after ourselves. This must be done before deleting any |
| // instructions. |
| Rewriter.clear(); |
| |
| Changed |= DeleteTriviallyDeadInstructions(DeadInsts); |
| } |
| |
| void LSRInstance::print(raw_ostream &OS) const { |
| OS << "LSR has identified the following interesting factors and types: "; |
| bool First = true; |
| |
| for (SmallSetVector<int64_t, 4>::const_iterator |
| I = Factors.begin(), E = Factors.end(); I != E; ++I) { |
| if (!First) OS << ", "; |
| First = false; |
| OS << '*' << *I; |
| } |
| |
| for (SmallSetVector<const Type *, 4>::const_iterator |
| I = Types.begin(), E = Types.end(); I != E; ++I) { |
| if (!First) OS << ", "; |
| First = false; |
| OS << '(' << **I << ')'; |
| } |
| OS << '\n'; |
| |
| OS << "LSR is examining the following uses, and candidate formulae:\n"; |
| for (SmallVectorImpl<LSRUse>::const_iterator I = Uses.begin(), |
| E = Uses.end(); I != E; ++I) { |
| const LSRUse &LU = *I; |
| dbgs() << " "; |
| LU.print(OS); |
| OS << '\n'; |
| for (SmallVectorImpl<Formula>::const_iterator J = LU.Formulae.begin(), |
| JE = LU.Formulae.end(); J != JE; ++J) { |
| OS << " "; |
| J->print(OS); |
| OS << "\n"; |
| } |
| } |
| } |
| |
| void LSRInstance::dump() const { |
| print(errs()); errs() << '\n'; |
| } |
| |
| namespace { |
| |
| class LoopStrengthReduce : public LoopPass { |
| /// TLI - Keep a pointer to a TargetLowering object to consult for |
| /// determining transformation profitability. |
| const TargetLowering *const TLI; |
| |
| public: |
| static char ID; // Pass ID, replacement for typeid |
| explicit LoopStrengthReduce(const TargetLowering *tli = NULL); |
| |
| private: |
| bool runOnLoop(Loop *L, LPPassManager &LPM); |
| void getAnalysisUsage(AnalysisUsage &AU) const; |
| }; |
| |
| } |
| |
| char LoopStrengthReduce::ID = 0; |
| static RegisterPass<LoopStrengthReduce> |
| X("loop-reduce", "Loop Strength Reduction"); |
| |
| Pass *llvm::createLoopStrengthReducePass(const TargetLowering *TLI) { |
| return new LoopStrengthReduce(TLI); |
| } |
| |
| LoopStrengthReduce::LoopStrengthReduce(const TargetLowering *tli) |
| : LoopPass(&ID), TLI(tli) {} |
| |
| void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const { |
| // We split critical edges, so we change the CFG. However, we do update |
| // many analyses if they are around. |
| AU.addPreservedID(LoopSimplifyID); |
| AU.addPreserved<LoopInfo>(); |
| AU.addPreserved("domfrontier"); |
| |
| AU.addRequiredID(LoopSimplifyID); |
| AU.addRequired<DominatorTree>(); |
| AU.addPreserved<DominatorTree>(); |
| AU.addRequired<ScalarEvolution>(); |
| AU.addPreserved<ScalarEvolution>(); |
| AU.addRequired<IVUsers>(); |
| AU.addPreserved<IVUsers>(); |
| } |
| |
| bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) { |
| bool Changed = false; |
| |
| // Run the main LSR transformation. |
| Changed |= LSRInstance(TLI, L, this).getChanged(); |
| |
| // At this point, it is worth checking to see if any recurrence PHIs are also |
| // dead, so that we can remove them as well. |
| Changed |= DeleteDeadPHIs(L->getHeader()); |
| |
| return Changed; |
| } |