Extend getConstantTripCount to deal with a larger subset of loop bounds; make loop
unroll/unroll-and-jam more powerful; add additional affine expr builder methods

- use previously added analysis/simplification to infer multiple of unroll
  factor trip counts, making loop unroll/unroll-and-jam more general.

- for loop unroll, support bounds that are single result affine map's with the
  same set of operands. For unknown loop bounds, loop unroll will now work as
  long as trip count can be determined to be a multiple of unroll factor.

- extend getConstantTripCount to deal with single result affine map's with the
  same operands. move it to mlir/Analysis/LoopAnalysis.cpp

- add additional builder utility methods for affine expr arithmetic
  (difference, mod/floordiv/ceildiv w.r.t postitive constant). simplify code to
  use the utility methods.

- move affine analysis routines to AffineAnalysis.cpp/.h from
  AffineStructures.cpp/.h.

- Rename LoopUnrollJam to LoopUnrollAndJam to match class name.

- add an additional simplification for simplifyFloorDiv, simplifyCeilDiv

- Rename AffineMap::getNumOperands() getNumInputs: an affine map by itself does
  not have operands. Operands are passed to it through affine_apply, from loop
  bounds/if condition's, etc., operands are stored in the latter.

This should be sufficiently powerful for now as far as unroll/unroll-and-jam go for TPU
code generation, and can move to other analyses/transformations.

Loop nests like these are now unrolled without any cleanup loop being generated.

  for %i = 1 to 100 {
    // unroll factor 4: no cleanup loop will be generated.
    for %j = (d0) -> (d0) (%i) to (d0) -> (5*d0 + 3) (%i) {
      %x = "foo"(%j) : (affineint) -> i32
    }
  }

  for %i = 1 to 100 {
    // unroll factor 4: no cleanup loop will be generated.
    for %j = (d0) -> (d0) (%i) to (d0) -> (d0 - d mod 4 - 1) (%i) {
      %y = "foo"(%j) : (affineint) -> i32
    }
  }

  for %i = 1 to 100 {
    for %j = (d0) -> (d0) (%i) to (d0) -> (d0 + 128) (%i) {
      %x = "foo"() : () -> i32
    }
  }

TODO(bondhugula): extend this to LoopUnrollAndJam as well in the next CL (with minor
changes).

PiperOrigin-RevId: 212661212
diff --git a/include/mlir/Analysis/AffineAnalysis.h b/include/mlir/Analysis/AffineAnalysis.h
new file mode 100644
index 0000000..605791d
--- /dev/null
+++ b/include/mlir/Analysis/AffineAnalysis.h
@@ -0,0 +1,42 @@
+//===- AffineAnalysis.h - analyses for affine structures --------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This header file defines prototypes for methods that perform analysis
+// involving affine structures (AffineExpr, AffineMap, IntegerSet, etc.) and
+// other IR structures that in turn use these.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_AFFINE_ANALYSIS_H
+#define MLIR_ANALYSIS_AFFINE_ANALYSIS_H
+
+#include "llvm/ADT/Optional.h"
+
+namespace mlir {
+
+class AffineExpr;
+class MLIRContext;
+
+/// Simplify an affine expression through flattening and some amount of
+/// simple analysis. This has complexity linear in the number of nodes in
+/// 'expr'. Return nullptr, if the expression can't be simplified.
+AffineExpr *simplifyAffineExpr(AffineExpr *expr, unsigned numDims,
+                               unsigned numSymbols, MLIRContext *context);
+
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_AFFINE_ANALYSIS_H
diff --git a/include/mlir/Analysis/AffineStructures.h b/include/mlir/Analysis/AffineStructures.h
index aaae119..ac6e728 100644
--- a/include/mlir/Analysis/AffineStructures.h
+++ b/include/mlir/Analysis/AffineStructures.h
@@ -38,12 +38,6 @@
 class MLValue;
 class HyperRectangularSet;
 
-/// Simplify an affine expression through flattening and some amount of
-/// simple analysis. This has complexity linear in the number of nodes in
-/// 'expr'. Return nullptr, if the expression can't be simplified.
-AffineExpr *simplifyAffineExpr(AffineExpr *expr, unsigned numDims,
-                               unsigned numSymbols, MLIRContext *context);
-
 /// A mutable affine map. Its affine expressions are however unique.
 struct MutableAffineMap {
 public:
diff --git a/include/mlir/Analysis/LoopAnalysis.h b/include/mlir/Analysis/LoopAnalysis.h
new file mode 100644
index 0000000..482a74c
--- /dev/null
+++ b/include/mlir/Analysis/LoopAnalysis.h
@@ -0,0 +1,49 @@
+//===- LoopAnalysis.h - loop analysis methods -------------------*- C++ -*-===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This header file defines prototypes for methods to analyze loops.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef MLIR_ANALYSIS_LOOP_ANALYSIS_H
+#define MLIR_ANALYSIS_LOOP_ANALYSIS_H
+
+#include "llvm/ADT/Optional.h"
+
+namespace mlir {
+
+class AffineExpr;
+class ForStmt;
+
+/// Returns the trip count of the loop as an affine expression if the latter is
+/// expressible as an affine expression, and nullptr otherwise. The trip count
+/// expression is simplified before returning.
+AffineExpr *getTripCount(const ForStmt &forStmt);
+
+/// Returns the trip count of the loop if it's a constant, None otherwise. This
+/// uses affine expression analysis and is able to determine constant trip count
+/// in non-trivial cases.
+llvm::Optional<uint64_t> getConstantTripCount(const ForStmt &forStmt);
+
+/// Returns the greatest known integral divisor of the trip count. Affine
+/// expression analysis is used (indirectly through getTripCount), and
+/// this method is thus able to determine non-trivial divisors.
+uint64_t getLargestDivisorOfTripCount(const ForStmt &forStmt);
+
+} // end namespace mlir
+
+#endif // MLIR_ANALYSIS_LOOP_ANALYSIS_H
diff --git a/include/mlir/IR/AffineExpr.h b/include/mlir/IR/AffineExpr.h
index dccbe91..f2f3a9f 100644
--- a/include/mlir/IR/AffineExpr.h
+++ b/include/mlir/IR/AffineExpr.h
@@ -70,8 +70,8 @@
   /// floordiv, ceildiv, and mod is only allowed w.r.t constants.
   bool isPureAffine() const;
 
-  /// Returns the greatest known common divisor of this affine expression.
-  uint64_t getKnownGcd() const;
+  /// Returns the greatest known integral divisor of this affine expression.
+  uint64_t getLargestKnownDivisor() const;
 
   /// Return true if the affine expression is a multiple of 'factor'.
   bool isMultipleOf(int64_t factor) const;
@@ -107,6 +107,11 @@
                             MLIRContext *context) {
     return get(AffineExpr::Kind::Add, lhs, rhs, context);
   }
+  static AffineExpr *getAdd(AffineExpr *expr, int64_t rhs,
+                            MLIRContext *context);
+  static AffineExpr *getSub(AffineExpr *lhs, AffineExpr *rhs,
+                            MLIRContext *context);
+
   static AffineExpr *getMul(AffineExpr *lhs, AffineExpr *rhs,
                             MLIRContext *context) {
     return get(AffineExpr::Kind::Mul, lhs, rhs, context);
@@ -119,6 +124,8 @@
                                 MLIRContext *context) {
     return get(AffineExpr::Kind::CeilDiv, lhs, rhs, context);
   }
+  static AffineExpr *getCeilDiv(AffineExpr *lhs, uint64_t rhs,
+                                MLIRContext *context);
   static AffineExpr *getMod(AffineExpr *lhs, AffineExpr *rhs,
                             MLIRContext *context) {
     return get(AffineExpr::Kind::Mod, lhs, rhs, context);
diff --git a/include/mlir/IR/AffineMap.h b/include/mlir/IR/AffineMap.h
index 9f3ffee..1f5d497 100644
--- a/include/mlir/IR/AffineMap.h
+++ b/include/mlir/IR/AffineMap.h
@@ -69,7 +69,7 @@
   unsigned getNumDims() const { return numDims; }
   unsigned getNumSymbols() const { return numSymbols; }
   unsigned getNumResults() const { return numResults; }
-  unsigned getNumOperands() const { return numDims + numSymbols; }
+  unsigned getNumInputs() const { return numDims + numSymbols; }
 
   ArrayRef<AffineExpr *> getResults() const {
     return ArrayRef<AffineExpr *>(results, numResults);
diff --git a/include/mlir/IR/Builders.h b/include/mlir/IR/Builders.h
index f6cd245..7cd0dd5 100644
--- a/include/mlir/IR/Builders.h
+++ b/include/mlir/IR/Builders.h
@@ -106,9 +106,13 @@
   AffineExpr *getAddExpr(AffineExpr *lhs, AffineExpr *rhs);
   AffineExpr *getSubExpr(AffineExpr *lhs, AffineExpr *rhs);
   AffineExpr *getMulExpr(AffineExpr *lhs, AffineExpr *rhs);
+  AffineExpr *getMulExpr(AffineExpr *lhs, int64_t rhs);
   AffineExpr *getModExpr(AffineExpr *lhs, AffineExpr *rhs);
+  AffineExpr *getModExpr(AffineExpr *lhs, uint64_t rhs);
   AffineExpr *getFloorDivExpr(AffineExpr *lhs, AffineExpr *rhs);
+  AffineExpr *getFloorDivExpr(AffineExpr *lhs, uint64_t rhs);
   AffineExpr *getCeilDivExpr(AffineExpr *lhs, AffineExpr *rhs);
+  AffineExpr *getCeilDivExpr(AffineExpr *lhs, uint64_t rhs);
 
   // Integer set.
   IntegerSet *getIntegerSet(unsigned dimCount, unsigned symbolCount,
diff --git a/include/mlir/IR/Statement.h b/include/mlir/IR/Statement.h
index 55a0aec..3b2bbeb 100644
--- a/include/mlir/IR/Statement.h
+++ b/include/mlir/IR/Statement.h
@@ -110,6 +110,7 @@
     return operand_iterator(this, getNumOperands());
   }
 
+  /// Returns an iterator on the underlying MLValue's (MLValue *).
   llvm::iterator_range<operand_iterator> getOperands() {
     return {operand_begin(), operand_end()};
   }
@@ -126,6 +127,7 @@
     return const_operand_iterator(this, getNumOperands());
   }
 
+  /// Returns a const iterator on the underlying MLValue's (MLValue *).
   llvm::iterator_range<const_operand_iterator> getOperands() const {
     return {operand_begin(), operand_end()};
   }
diff --git a/include/mlir/IR/Statements.h b/include/mlir/IR/Statements.h
index 513bc08..c0de576 100644
--- a/include/mlir/IR/Statements.h
+++ b/include/mlir/IR/Statements.h
@@ -81,6 +81,7 @@
     return operand_iterator(this, getNumOperands());
   }
 
+  /// Returns an iterator on the underlying MLValue's (MLValue *).
   llvm::iterator_range<operand_iterator> getOperands() {
     return {operand_begin(), operand_end()};
   }
@@ -97,6 +98,7 @@
     return const_operand_iterator(this, getNumOperands());
   }
 
+  /// Returns a const iterator on the underlying MLValue's (MLValue *).
   llvm::iterator_range<const_operand_iterator> getOperands() const {
     return {operand_begin(), operand_end()};
   }
@@ -244,6 +246,13 @@
   void setLowerBound(ArrayRef<MLValue *> operands, AffineMap *map);
   /// Set upper bound.
   void setUpperBound(ArrayRef<MLValue *> operands, AffineMap *map);
+
+  /// Set the lower bound map without changing operands.
+  void setLowerBoundMap(AffineMap *map);
+
+  /// Set the upper bound map without changing operands.
+  void setUpperBoundMap(AffineMap *map);
+
   /// Set loop step.
   void setStep(int64_t step) { this->step = step; }
 
@@ -266,9 +275,6 @@
   /// Sets the upper bound to the given constant value.
   void setConstantUpperBound(int64_t value);
 
-  /// Returns the trip count if it's a constant.
-  Optional<uint64_t> getConstantTripCount() const;
-
   //===--------------------------------------------------------------------===//
   // Operands
   //===--------------------------------------------------------------------===//
@@ -361,16 +367,19 @@
     return stmt.getStmtOperand(opStart + idx);
   }
 
-  using operand_iterator = ForStmt::const_operand_iterator;
-  using operand_range = ForStmt::const_operand_range;
+  using operand_iterator = ForStmt::operand_iterator;
+  using operand_range = ForStmt::operand_range;
 
   operand_iterator operand_begin() const {
-    return operand_iterator(&stmt, opStart);
+    // These are iterators over MLValue *. Not casting away const'ness would
+    // require the caller to use const MLValue *.
+    return operand_iterator(const_cast<ForStmt *>(&stmt), opStart);
   }
   operand_iterator operand_end() const {
-    return operand_iterator(&stmt, opEnd);
+    return operand_iterator(const_cast<ForStmt *>(&stmt), opEnd);
   }
 
+  /// Returns an iterator on the underlying MLValue's (MLValue *).
   operand_range getOperands() const { return {operand_begin(), operand_end()}; }
   ArrayRef<StmtOperand> getStmtOperands() const {
     auto ops = stmt.getStmtOperands();
diff --git a/lib/Analysis/AffineAnalysis.cpp b/lib/Analysis/AffineAnalysis.cpp
new file mode 100644
index 0000000..629a5eb
--- /dev/null
+++ b/lib/Analysis/AffineAnalysis.cpp
@@ -0,0 +1,302 @@
+//===- AffineAnalysis.cpp - Affine structures analysis routines -----------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements miscellaneous analysis routines for affine structures
+// (expressions, maps, sets), and other utilities relying on such analysis.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/IR/AffineExprVisitor.h"
+#include "llvm/ADT/ArrayRef.h"
+
+using namespace mlir;
+
+/// Constructs an affine expression from a flat ArrayRef. If there are local
+/// identifiers (neither dimensional nor symbolic) that appear in the sum of
+/// products expression, 'localExprs' is expected to have the AffineExpr for it,
+/// and is substituted into. The ArrayRef 'eq' is expected to be in the format
+/// [dims, symbols, locals, constant term].
+static AffineExpr *toAffineExpr(ArrayRef<int64_t> eq, unsigned numDims,
+                                unsigned numSymbols,
+                                ArrayRef<AffineExpr *> localExprs,
+                                MLIRContext *context) {
+  // Assert expected numLocals = eq.size() - numDims - numSymbols - 1
+  assert(eq.size() - numDims - numSymbols - 1 == localExprs.size() &&
+         "unexpected number of local expressions");
+
+  AffineExpr *expr = AffineConstantExpr::get(0, context);
+  // Dimensions and symbols.
+  for (unsigned j = 0; j < numDims + numSymbols; j++) {
+    if (eq[j] != 0) {
+      AffineExpr *id =
+          j < numDims
+              ? static_cast<AffineExpr *>(AffineDimExpr::get(j, context))
+              : AffineSymbolExpr::get(j - numDims, context);
+      auto *term = AffineBinaryOpExpr::getMul(
+          AffineConstantExpr::get(eq[j], context), id, context);
+      expr = AffineBinaryOpExpr::getAdd(expr, term, context);
+    }
+  }
+
+  // Local identifiers.
+  for (unsigned j = numDims + numSymbols; j < eq.size() - 1; j++) {
+    if (eq[j] != 0) {
+      auto *term = AffineBinaryOpExpr::getMul(
+          AffineConstantExpr::get(eq[j], context),
+          localExprs[j - numDims - numSymbols], context);
+      expr = AffineBinaryOpExpr::getAdd(expr, term, context);
+    }
+  }
+
+  // Constant term.
+  unsigned constTerm = eq[eq.size() - 1];
+  if (constTerm != 0)
+    expr = AffineBinaryOpExpr::getAdd(
+        expr, AffineConstantExpr::get(constTerm, context), context);
+  return expr;
+}
+
+namespace {
+
+// This class is used to flatten a pure affine expression (AffineExpr *, which
+// is in a tree form) into a sum of products (w.r.t constants) when possible,
+// and in that process simplifying the expression. The simplification performed
+// includes the accumulation of contributions for each dimensional and symbolic
+// identifier together, the simplification of floordiv/ceildiv/mod exprssions
+// and other simplifications that in turn happen as a result. A simplification
+// that this flattening naturally performs is of simplifying the numerator and
+// denominator of floordiv/ceildiv, and folding a modulo expression to a zero,
+// if possible. Three examples are below:
+//
+// (d0 + 3 * d1) + d0) - 2 * d1) - d0 simplified to  d0 + d1
+// (d0 - d0 mod 4 + 4) mod 4  simplified to 0.
+// (3*d0 + 2*d1 + d0) floordiv 2 + d1 simplified to 2*d0 + 2*d1
+//
+// For a modulo, floordiv, or a ceildiv expression, an additional identifier
+// (called a local identifier) is introduced to rewrite it as a sum of products
+// (w.r.t constants). For example, for the second example above, d0 % 4 is
+// replaced by d0 - 4*q with q being introduced: the expression then simplifies
+// to: (d0 - (d0 - 4q) + 4) = 4q + 4, modulo of which w.r.t 4 simplifies to
+// zero. Note that an affine expression may not always be expressible in a sum
+// of products form due to the presence of modulo/floordiv/ceildiv expressions
+// that may not be eliminated after simplification; in such cases, the final
+// expression can be reconstructed by replacing the local identifier with its
+// explicit form stored in localExprs (note that the explicit form itself would
+// have been simplified and not necessarily the original form).
+//
+// This is a linear time post order walk for an affine expression that attempts
+// the above simplifications through visit methods, with partial results being
+// stored in 'operandExprStack'. When a parent expr is visited, the flattened
+// expressions corresponding to its two operands would already be on the stack -
+// the parent expr looks at the two flattened expressions and combines the two.
+// It pops off the operand expressions and pushes the combined result (although
+// this is done in-place on its LHS operand expr. When the walk is completed,
+// the flattened form of the top-level expression would be left on the stack.
+//
+class AffineExprFlattener : public AffineExprVisitor<AffineExprFlattener> {
+public:
+  // Flattend expression layout: [dims, symbols, locals, constant]
+  // Stack that holds the LHS and RHS operands while visiting a binary op expr.
+  // In future, consider adding a prepass to determine how big the SmallVector's
+  // will be, and linearize this to std::vector<int64_t> to prevent
+  // SmallVector moves on re-allocation.
+  std::vector<SmallVector<int64_t, 32>> operandExprStack;
+
+  inline unsigned getNumCols() const {
+    return numDims + numSymbols + numLocals + 1;
+  }
+
+  unsigned numDims;
+  unsigned numSymbols;
+  // Number of newly introduced identifiers to flatten mod/floordiv/ceildiv
+  // expressions that could not be simplified.
+  unsigned numLocals;
+  // AffineExpr's corresponding to the floordiv/ceildiv/mod expressions for
+  // which new identifiers were introduced; if the latter do not get canceled
+  // out, these expressions are needed to reconstruct the AffineExpr * / tree
+  // form. Note that these expressions themselves would have been simplified
+  // (recursively) by this pass. Eg. d0 + (d0 + 2*d1 + d0) ceildiv 4 will be
+  // simplified to d0 + q, where q = (d0 + d1) ceildiv 2. (d0 + d1) ceildiv 2
+  // would be the local expression stored for q.
+  SmallVector<AffineExpr *, 4> localExprs;
+  MLIRContext *context;
+
+  AffineExprFlattener(unsigned numDims, unsigned numSymbols,
+                      MLIRContext *context)
+      : numDims(numDims), numSymbols(numSymbols), numLocals(0),
+        context(context) {
+    operandExprStack.reserve(8);
+  }
+
+  void visitMulExpr(AffineBinaryOpExpr *expr) {
+    assert(operandExprStack.size() >= 2);
+    // This is a pure affine expr; the RHS will be a constant.
+    assert(isa<AffineConstantExpr>(expr->getRHS()));
+    // Get the RHS constant.
+    auto rhsConst = operandExprStack.back()[getConstantIndex()];
+    operandExprStack.pop_back();
+    // Update the LHS in place instead of pop and push.
+    auto &lhs = operandExprStack.back();
+    for (unsigned i = 0, e = lhs.size(); i < e; i++) {
+      lhs[i] *= rhsConst;
+    }
+  }
+
+  void visitAddExpr(AffineBinaryOpExpr *expr) {
+    assert(operandExprStack.size() >= 2);
+    const auto &rhs = operandExprStack.back();
+    auto &lhs = operandExprStack[operandExprStack.size() - 2];
+    assert(lhs.size() == rhs.size());
+    // Update the LHS in place.
+    for (unsigned i = 0; i < rhs.size(); i++) {
+      lhs[i] += rhs[i];
+    }
+    // Pop off the RHS.
+    operandExprStack.pop_back();
+  }
+
+  void visitModExpr(AffineBinaryOpExpr *expr) {
+    assert(operandExprStack.size() >= 2);
+    // This is a pure affine expr; the RHS will be a constant.
+    assert(isa<AffineConstantExpr>(expr->getRHS()));
+    auto rhsConst = operandExprStack.back()[getConstantIndex()];
+    operandExprStack.pop_back();
+    auto &lhs = operandExprStack.back();
+    // TODO(bondhugula): handle modulo by zero case when this issue is fixed
+    // at the other places in the IR.
+    assert(rhsConst != 0 && "RHS constant can't be zero");
+
+    // Check if the LHS expression is a multiple of modulo factor.
+    unsigned i;
+    for (i = 0; i < lhs.size(); i++)
+      if (lhs[i] % rhsConst != 0)
+        break;
+    // If yes, modulo expression here simplifies to zero.
+    if (i == lhs.size()) {
+      lhs.assign(lhs.size(), 0);
+      return;
+    }
+
+    // Add an existential quantifier. expr1 % expr2 is replaced by (expr1 -
+    // q * expr2) where q is the existential quantifier introduced.
+    addLocalId(AffineBinaryOpExpr::getFloorDiv(
+        toAffineExpr(lhs, numDims, numSymbols, localExprs, context),
+        AffineConstantExpr::get(rhsConst, context), context));
+    lhs[getLocalVarStartIndex() + numLocals - 1] = -rhsConst;
+  }
+  void visitCeilDivExpr(AffineBinaryOpExpr *expr) {
+    visitDivExpr(expr, /*isCeil=*/true);
+  }
+  void visitFloorDivExpr(AffineBinaryOpExpr *expr) {
+    visitDivExpr(expr, /*isCeil=*/false);
+  }
+  void visitDimExpr(AffineDimExpr *expr) {
+    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
+    auto &eq = operandExprStack.back();
+    eq[getDimStartIndex() + expr->getPosition()] = 1;
+  }
+  void visitSymbolExpr(AffineSymbolExpr *expr) {
+    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
+    auto &eq = operandExprStack.back();
+    eq[getSymbolStartIndex() + expr->getPosition()] = 1;
+  }
+  void visitConstantExpr(AffineConstantExpr *expr) {
+    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
+    auto &eq = operandExprStack.back();
+    eq[getConstantIndex()] = expr->getValue();
+  }
+
+private:
+  void visitDivExpr(AffineBinaryOpExpr *expr, bool isCeil) {
+    assert(operandExprStack.size() >= 2);
+    assert(isa<AffineConstantExpr>(expr->getRHS()));
+    // This is a pure affine expr; the RHS is a positive constant.
+    auto rhsConst = operandExprStack.back()[getConstantIndex()];
+    // TODO(bondhugula): handle division by zero at the same time the issue is
+    // fixed at other places.
+    assert(rhsConst != 0 && "RHS constant can't be zero");
+    operandExprStack.pop_back();
+    auto &lhs = operandExprStack.back();
+
+    // Simplify the floordiv, ceildiv if possible by canceling out the greatest
+    // common divisors of the numerator and denominator.
+    uint64_t gcd = std::abs(rhsConst);
+    for (unsigned i = 0; i < lhs.size(); i++)
+      gcd = llvm::GreatestCommonDivisor64(gcd, std::abs(lhs[i]));
+    // Simplify the numerator and the denominator.
+    if (gcd != 1) {
+      for (unsigned i = 0; i < lhs.size(); i++)
+        lhs[i] = lhs[i] / gcd;
+    }
+    int64_t denominator = rhsConst / gcd;
+    // If the denominator becomes 1, the updated LHS is the result. (The
+    // denominator can't be negative since rhsConst is positive).
+    if (denominator == 1)
+      return;
+
+    // If the denominator cannot be simplified to one, we will have to retain
+    // the ceil/floor expr (simplified up until here). Add an existential
+    // quantifier to express its result, i.e., expr1 div expr2 is replaced
+    // by a new identifier, q.
+    auto divKind =
+        isCeil ? AffineExpr::Kind::CeilDiv : AffineExpr::Kind::FloorDiv;
+    addLocalId(AffineBinaryOpExpr::get(
+        divKind, toAffineExpr(lhs, numDims, numSymbols, localExprs, context),
+        AffineConstantExpr::get(denominator, context), context));
+    lhs.assign(lhs.size(), 0);
+    lhs[getLocalVarStartIndex() + numLocals - 1] = 1;
+  }
+
+  // Add an existential quantifier (used to flatten a mod, floordiv, ceildiv
+  // expr). localExpr is the simplified tree expression (AffineExpr *)
+  // corresponding to the quantifier.
+  void addLocalId(AffineExpr *localExpr) {
+    for (auto &subExpr : operandExprStack) {
+      subExpr.insert(subExpr.begin() + getLocalVarStartIndex() + numLocals, 0);
+    }
+    localExprs.push_back(localExpr);
+    numLocals++;
+  }
+
+  inline unsigned getConstantIndex() const { return getNumCols() - 1; }
+  inline unsigned getLocalVarStartIndex() const { return numDims + numSymbols; }
+  inline unsigned getSymbolStartIndex() const { return numDims; }
+  inline unsigned getDimStartIndex() const { return 0; }
+};
+
+} // end anonymous namespace
+
+AffineExpr *mlir::simplifyAffineExpr(AffineExpr *expr, unsigned numDims,
+                                     unsigned numSymbols,
+                                     MLIRContext *context) {
+  // TODO(bondhugula): only pure affine for now. The simplification here can be
+  // extended to semi-affine maps in the future.
+  if (!expr->isPureAffine())
+    return nullptr;
+
+  AffineExprFlattener flattener(numDims, numSymbols, context);
+  flattener.walkPostOrder(expr);
+  ArrayRef<int64_t> flattenedExpr = flattener.operandExprStack.back();
+  auto *simplifiedExpr = toAffineExpr(flattenedExpr, numDims, numSymbols,
+                                      flattener.localExprs, context);
+  flattener.operandExprStack.pop_back();
+  assert(flattener.operandExprStack.empty());
+  if (simplifiedExpr == expr)
+    return nullptr;
+  return simplifiedExpr;
+}
diff --git a/lib/Analysis/AffineStructures.cpp b/lib/Analysis/AffineStructures.cpp
index bff76a8..6965137 100644
--- a/lib/Analysis/AffineStructures.cpp
+++ b/lib/Analysis/AffineStructures.cpp
@@ -20,291 +20,13 @@
 //===----------------------------------------------------------------------===//
 
 #include "mlir/Analysis/AffineStructures.h"
-
-#include "mlir/IR/AffineExprVisitor.h"
+#include "mlir/Analysis/AffineAnalysis.h"
 #include "mlir/IR/AffineMap.h"
 #include "mlir/IR/IntegerSet.h"
 #include "mlir/IR/StandardOps.h"
-#include "llvm/Support/raw_ostream.h"
 
 using namespace mlir;
 
-/// Constructs an affine expression from a flat ArrayRef. If there are local
-/// identifiers (neither dimensional nor symbolic) that appear in the sum of
-/// products expression, 'localExprs' is expected to have the AffineExpr for it,
-/// and is substituted into. The ArrayRef 'eq' is expected to be in the format
-/// [dims, symbols, locals, constant term].
-static AffineExpr *toAffineExpr(ArrayRef<int64_t> eq, unsigned numDims,
-                                unsigned numSymbols,
-                                ArrayRef<AffineExpr *> localExprs,
-                                MLIRContext *context) {
-  // Assert expected numLocals = eq.size() - numDims - numSymbols - 1
-  assert(eq.size() - numDims - numSymbols - 1 == localExprs.size() &&
-         "unexpected number of local expressions");
-
-  AffineExpr *expr = AffineConstantExpr::get(0, context);
-  // Dimensions and symbols.
-  for (unsigned j = 0; j < numDims + numSymbols; j++) {
-    if (eq[j] != 0) {
-      AffineExpr *id =
-          j < numDims
-              ? static_cast<AffineExpr *>(AffineDimExpr::get(j, context))
-              : AffineSymbolExpr::get(j - numDims, context);
-      auto *term = AffineBinaryOpExpr::getMul(
-          AffineConstantExpr::get(eq[j], context), id, context);
-      expr = AffineBinaryOpExpr::getAdd(expr, term, context);
-    }
-  }
-
-  // Local identifiers.
-  for (unsigned j = numDims + numSymbols; j < eq.size() - 1; j++) {
-    if (eq[j] != 0) {
-      auto *term = AffineBinaryOpExpr::getMul(
-          AffineConstantExpr::get(eq[j], context),
-          localExprs[j - numDims - numSymbols], context);
-      expr = AffineBinaryOpExpr::getAdd(expr, term, context);
-    }
-  }
-
-  // Constant term.
-  unsigned constTerm = eq[eq.size() - 1];
-  if (constTerm != 0)
-    expr = AffineBinaryOpExpr::getAdd(
-        expr, AffineConstantExpr::get(constTerm, context), context);
-  return expr;
-}
-
-namespace {
-
-// This class is used to flatten a pure affine expression (AffineExpr *, which
-// is in a tree form) into a sum of products (w.r.t constants) when possible,
-// and in that process simplifying the expression. The simplification performed
-// includes the accumulation of contributions for each dimensional and symbolic
-// identifier together, the simplification of floordiv/ceildiv/mod exprssions
-// and other simplifications that in turn happen as a result. A simplification
-// that this flattening naturally performs is of simplifying the numerator and
-// denominator of floordiv/ceildiv, and folding a modulo expression to a zero,
-// if possible. Three examples are below:
-//
-// (d0 + 3 * d1) + d0) - 2 * d1) - d0 simplified to  d0 + d1
-// (d0 - d0 mod 4 + 4) mod 4  simplified to 0.
-// (3*d0 + 2*d1 + d0) floordiv 2 + d1 simplified to 2*d0 + 2*d1
-//
-// For a modulo, floordiv, or a ceildiv expression, an additional identifier
-// (called a local identifier) is introduced to rewrite it as a sum of products
-// (w.r.t constants). For example, for the second example above, d0 % 4 is
-// replaced by d0 - 4*q with q being introduced: the expression then simplifies
-// to: (d0 - (d0 - 4q) + 4) = 4q + 4, modulo of which w.r.t 4 simplifies to
-// zero. Note that an affine expression may not always be expressible in a sum
-// of products form due to the presence of modulo/floordiv/ceildiv expressions
-// that may not be eliminated after simplification; in such cases, the final
-// expression can be reconstructed by replacing the local identifier with its
-// explicit form stored in localExprs (note that the explicit form itself would
-// have been simplified and not necessarily the original form).
-//
-// This is a linear time post order walk for an affine expression that attempts
-// the above simplifications through visit methods, with partial results being
-// stored in 'operandExprStack'. When a parent expr is visited, the flattened
-// expressions corresponding to its two operands would already be on the stack -
-// the parent expr looks at the two flattened expressions and combines the two.
-// It pops off the operand expressions and pushes the combined result (although
-// this is done in-place on its LHS operand expr. When the walk is completed,
-// the flattened form of the top-level expression would be left on the stack.
-//
-class AffineExprFlattener : public AffineExprVisitor<AffineExprFlattener> {
-public:
-  // Flattend expression layout: [dims, symbols, locals, constant]
-  // Stack that holds the LHS and RHS operands while visiting a binary op expr.
-  // In future, consider adding a prepass to determine how big the SmallVector's
-  // will be, and linearize this to std::vector<int64_t> to prevent
-  // SmallVector moves on re-allocation.
-  std::vector<SmallVector<int64_t, 32>> operandExprStack;
-
-  inline unsigned getNumCols() const {
-    return numDims + numSymbols + numLocals + 1;
-  }
-
-  unsigned numDims;
-  unsigned numSymbols;
-  // Number of newly introduced identifiers to flatten mod/floordiv/ceildiv
-  // expressions that could not be simplified.
-  unsigned numLocals;
-  // AffineExpr's corresponding to the floordiv/ceildiv/mod expressions for
-  // which new identifiers were introduced; if the latter do not get canceled
-  // out, these expressions are needed to reconstruct the AffineExpr * / tree
-  // form. Note that these expressions themselves would have been simplified
-  // (recursively) by this pass. Eg. d0 + (d0 + 2*d1 + d0) ceildiv 4 will be
-  // simplified to d0 + q, where q = (d0 + d1) ceildiv 2. (d0 + d1) ceildiv 2
-  // would be the local expression stored for q.
-  SmallVector<AffineExpr *, 4> localExprs;
-  MLIRContext *context;
-
-  AffineExprFlattener(unsigned numDims, unsigned numSymbols,
-                      MLIRContext *context)
-      : numDims(numDims), numSymbols(numSymbols), numLocals(0),
-        context(context) {
-    operandExprStack.reserve(8);
-  }
-
-  void visitMulExpr(AffineBinaryOpExpr *expr) {
-    assert(operandExprStack.size() >= 2);
-    // This is a pure affine expr; the RHS will be a constant.
-    assert(isa<AffineConstantExpr>(expr->getRHS()));
-    // Get the RHS constant.
-    auto rhsConst = operandExprStack.back()[getConstantIndex()];
-    operandExprStack.pop_back();
-    // Update the LHS in place instead of pop and push.
-    auto &lhs = operandExprStack.back();
-    for (unsigned i = 0, e = lhs.size(); i < e; i++) {
-      lhs[i] *= rhsConst;
-    }
-  }
-
-  void visitAddExpr(AffineBinaryOpExpr *expr) {
-    assert(operandExprStack.size() >= 2);
-    const auto &rhs = operandExprStack.back();
-    auto &lhs = operandExprStack[operandExprStack.size() - 2];
-    assert(lhs.size() == rhs.size());
-    // Update the LHS in place.
-    for (unsigned i = 0; i < rhs.size(); i++) {
-      lhs[i] += rhs[i];
-    }
-    // Pop off the RHS.
-    operandExprStack.pop_back();
-  }
-
-  void visitModExpr(AffineBinaryOpExpr *expr) {
-    assert(operandExprStack.size() >= 2);
-    // This is a pure affine expr; the RHS will be a constant.
-    assert(isa<AffineConstantExpr>(expr->getRHS()));
-    auto rhsConst = operandExprStack.back()[getConstantIndex()];
-    operandExprStack.pop_back();
-    auto &lhs = operandExprStack.back();
-    // TODO(bondhugula): handle modulo by zero case when this issue is fixed
-    // at the other places in the IR.
-    assert(rhsConst != 0 && "RHS constant can't be zero");
-
-    // Check if the LHS expression is a multiple of modulo factor.
-    unsigned i;
-    for (i = 0; i < lhs.size(); i++)
-      if (lhs[i] % rhsConst != 0)
-        break;
-    // If yes, modulo expression here simplifies to zero.
-    if (i == lhs.size()) {
-      lhs.assign(lhs.size(), 0);
-      return;
-    }
-
-    // Add an existential quantifier. expr1 % expr2 is replaced by (expr1 -
-    // q * expr2) where q is the existential quantifier introduced.
-    addLocalId(AffineBinaryOpExpr::get(
-        AffineExpr::Kind::FloorDiv,
-        toAffineExpr(lhs, numDims, numSymbols, localExprs, context),
-        AffineConstantExpr::get(rhsConst, context), context));
-    lhs[getLocalVarStartIndex() + numLocals - 1] = -rhsConst;
-  }
-  void visitCeilDivExpr(AffineBinaryOpExpr *expr) {
-    visitDivExpr(expr, /*isCeil=*/true);
-  }
-  void visitFloorDivExpr(AffineBinaryOpExpr *expr) {
-    visitDivExpr(expr, /*isCeil=*/false);
-  }
-  void visitDimExpr(AffineDimExpr *expr) {
-    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
-    auto &eq = operandExprStack.back();
-    eq[getDimStartIndex() + expr->getPosition()] = 1;
-  }
-  void visitSymbolExpr(AffineSymbolExpr *expr) {
-    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
-    auto &eq = operandExprStack.back();
-    eq[getSymbolStartIndex() + expr->getPosition()] = 1;
-  }
-  void visitConstantExpr(AffineConstantExpr *expr) {
-    operandExprStack.emplace_back(SmallVector<int64_t, 32>(getNumCols(), 0));
-    auto &eq = operandExprStack.back();
-    eq[getConstantIndex()] = expr->getValue();
-  }
-
-private:
-  void visitDivExpr(AffineBinaryOpExpr *expr, bool isCeil) {
-    assert(operandExprStack.size() >= 2);
-    assert(isa<AffineConstantExpr>(expr->getRHS()));
-    // This is a pure affine expr; the RHS is a positive constant.
-    auto rhsConst = operandExprStack.back()[getConstantIndex()];
-    // TODO(bondhugula): handle division by zero at the same time the issue is
-    // fixed at other places.
-    assert(rhsConst != 0 && "RHS constant can't be zero");
-    operandExprStack.pop_back();
-    auto &lhs = operandExprStack.back();
-
-    // Simplify the floordiv, ceildiv if possible by canceling out the greatest
-    // common divisors of the numerator and denominator.
-    uint64_t gcd = std::abs(rhsConst);
-    for (unsigned i = 0; i < lhs.size(); i++)
-      gcd = llvm::GreatestCommonDivisor64(gcd, std::abs(lhs[i]));
-    // Simplify the numerator and the denominator.
-    if (gcd != 1) {
-      for (unsigned i = 0; i < lhs.size(); i++)
-        lhs[i] = lhs[i] / gcd;
-    }
-    int64_t denominator = rhsConst / gcd;
-    // If the denominator becomes 1, the updated LHS is the result. (The
-    // denominator can't be negative since rhsConst is positive).
-    if (denominator == 1)
-      return;
-
-    // If the denominator cannot be simplified to one, we will have to retain
-    // the ceil/floor expr (simplified up until here). Add an existential
-    // quantifier to express its result, i.e., expr1 div expr2 is replaced
-    // by a new identifier, q.
-    auto divKind =
-        isCeil ? AffineExpr::Kind::CeilDiv : AffineExpr::Kind::FloorDiv;
-    addLocalId(AffineBinaryOpExpr::get(
-        divKind, toAffineExpr(lhs, numDims, numSymbols, localExprs, context),
-        AffineConstantExpr::get(denominator, context), context));
-    lhs.assign(lhs.size(), 0);
-    lhs[getLocalVarStartIndex() + numLocals - 1] = 1;
-  }
-
-  // Add an existential quantifier (used to flatten a mod, floordiv, ceildiv
-  // expr). localExpr is the simplified tree expression (AffineExpr *)
-  // corresponding to the quantifier.
-  void addLocalId(AffineExpr *localExpr) {
-    for (auto &subExpr : operandExprStack) {
-      subExpr.insert(subExpr.begin() + getLocalVarStartIndex() + numLocals, 0);
-    }
-    localExprs.push_back(localExpr);
-    numLocals++;
-  }
-
-  inline unsigned getConstantIndex() const { return getNumCols() - 1; }
-  inline unsigned getLocalVarStartIndex() const { return numDims + numSymbols; }
-  inline unsigned getSymbolStartIndex() const { return numDims; }
-  inline unsigned getDimStartIndex() const { return 0; }
-};
-
-} // end anonymous namespace
-
-AffineExpr *mlir::simplifyAffineExpr(AffineExpr *expr, unsigned numDims,
-                                     unsigned numSymbols,
-                                     MLIRContext *context) {
-  // TODO(bondhugula): only pure affine for now. The simplification here can be
-  // extended to semi-affine maps as well.
-  if (!expr->isPureAffine())
-    return nullptr;
-
-  AffineExprFlattener flattener(numDims, numSymbols, context);
-  flattener.walkPostOrder(expr);
-  ArrayRef<int64_t> flattenedExpr = flattener.operandExprStack.back();
-  auto *simplifiedExpr = toAffineExpr(flattenedExpr, numDims, numSymbols,
-                                      flattener.localExprs, context);
-  flattener.operandExprStack.pop_back();
-  assert(flattener.operandExprStack.empty());
-  if (simplifiedExpr == expr)
-    return nullptr;
-  return simplifiedExpr;
-}
-
 MutableAffineMap::MutableAffineMap(AffineMap *map, MLIRContext *context)
     : numDims(map->getNumDims()), numSymbols(map->getNumSymbols()),
       context(context) {
diff --git a/lib/Analysis/LoopAnalysis.cpp b/lib/Analysis/LoopAnalysis.cpp
new file mode 100644
index 0000000..dc56d62
--- /dev/null
+++ b/lib/Analysis/LoopAnalysis.cpp
@@ -0,0 +1,128 @@
+//===- LoopAnalysis.cpp - Misc loop analysis routines //-------------------===//
+//
+// Copyright 2019 The MLIR Authors.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+// =============================================================================
+//
+// This file implements miscellaneous loop analysis routines.
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/Analysis/LoopAnalysis.h"
+
+#include "mlir/Analysis/AffineAnalysis.h"
+#include "mlir/IR/AffineExpr.h"
+#include "mlir/IR/AffineMap.h"
+#include "mlir/IR/Statements.h"
+
+using mlir::AffineExpr;
+
+/// Returns the trip count of the loop as an affine expression if the latter is
+/// expressible as an affine expression, and nullptr otherwise. The trip count
+/// expression is simplified before returning.
+AffineExpr *mlir::getTripCount(const ForStmt &forStmt) {
+  // upper_bound - lower_bound + 1
+  int64_t loopSpan;
+
+  int64_t step = forStmt.getStep();
+  auto *context = forStmt.getContext();
+
+  if (forStmt.hasConstantBounds()) {
+    int64_t lb = forStmt.getConstantLowerBound();
+    int64_t ub = forStmt.getConstantUpperBound();
+    loopSpan = ub - lb + 1;
+  } else {
+    const AffineBound lb = forStmt.getLowerBound();
+    const AffineBound ub = forStmt.getUpperBound();
+    auto lbMap = lb.getMap();
+    auto ubMap = ub.getMap();
+    // TODO(bondhugula): handle max/min of multiple expressions.
+    if (lbMap->getNumResults() != 1 || ubMap->getNumResults() != 1 ||
+        lbMap->getNumDims() != ubMap->getNumDims() ||
+        lbMap->getNumSymbols() != ubMap->getNumSymbols()) {
+      return nullptr;
+    }
+
+    // TODO(bondhugula): handle bounds with different operands.
+    unsigned i, e = lb.getNumOperands();
+    for (i = 0; i < e; i++) {
+      if (lb.getStmtOperand(i).get() != ub.getStmtOperand(i).get())
+        break;
+    }
+    // Bounds have different operands, unhandled for now.
+    if (i != e)
+      return nullptr;
+
+    // ub_expr - lb_expr + 1
+    auto *loopSpanExpr = AffineBinaryOpExpr::getAdd(
+        AffineBinaryOpExpr::getSub(ubMap->getResult(0), lbMap->getResult(0),
+                                   context),
+        1, context);
+
+    if (auto *expr = simplifyAffineExpr(loopSpanExpr, lbMap->getNumDims(),
+                                        lbMap->getNumSymbols(), context))
+      loopSpanExpr = expr;
+
+    auto *cExpr = dyn_cast<AffineConstantExpr>(loopSpanExpr);
+    if (!cExpr)
+      return AffineBinaryOpExpr::getCeilDiv(loopSpanExpr, std::abs(step),
+                                            context);
+    loopSpan = cExpr->getValue();
+  }
+
+  // 0 iteration loops.
+  if ((loopSpan < 0 && step >= 1) || (loopSpan > 0 && step <= -1))
+    return 0;
+
+  return AffineConstantExpr::get(
+      static_cast<uint64_t>(loopSpan % step == 0 ? loopSpan / step
+                                                 : loopSpan / step + 1),
+      context);
+}
+
+/// Returns the trip count of the loop if it's a constant, None otherwise. This
+/// method uses affine expression analysis (in turn using getTripCount) and is
+/// able to determine constant trip count in non-trivial cases.
+llvm::Optional<uint64_t> mlir::getConstantTripCount(const ForStmt &forStmt) {
+  AffineExpr *tripCountExpr = getTripCount(forStmt);
+
+  if (auto *constExpr = dyn_cast_or_null<AffineConstantExpr>(tripCountExpr))
+    return constExpr->getValue();
+
+  return None;
+}
+
+/// Returns the greatest known integral divisor of the trip count. Affine
+/// expression analysis is used (indirectly through getTripCount), and
+/// this method is thus able to determine non-trivial divisors.
+uint64_t mlir::getLargestDivisorOfTripCount(const ForStmt &forStmt) {
+  AffineExpr *tripCountExpr = getTripCount(forStmt);
+
+  if (!tripCountExpr)
+    return 1;
+
+  if (auto *constExpr = dyn_cast<AffineConstantExpr>(tripCountExpr)) {
+    uint64_t tripCount = constExpr->getValue();
+
+    // 0 iteration loops (greatest divisor is 2^64 - 1).
+    if (tripCount == 0)
+      return ULONG_MAX;
+
+    // The greatest divisor is the trip count.
+    return tripCount;
+  }
+
+  // Trip count is not a known constant; return its largest known divisor.
+  return tripCountExpr->getLargestKnownDivisor();
+}
diff --git a/lib/IR/AffineExpr.cpp b/lib/IR/AffineExpr.cpp
index 6393b3e..f43f288 100644
--- a/lib/IR/AffineExpr.cpp
+++ b/lib/IR/AffineExpr.cpp
@@ -47,6 +47,25 @@
   }
 }
 
+AffineExpr *AffineBinaryOpExpr::getSub(AffineExpr *lhs, AffineExpr *rhs,
+                                       MLIRContext *context) {
+  return getAdd(lhs, getMul(rhs, AffineConstantExpr::get(-1, context), context),
+                context);
+}
+
+/// Get affine expression that is expr incremented by 'inc'.
+AffineExpr *AffineBinaryOpExpr::getAdd(AffineExpr *expr, int64_t rhs,
+                                       MLIRContext *context) {
+  return get(AffineExpr::Kind::Add, expr, AffineConstantExpr::get(rhs, context),
+             context);
+}
+
+AffineExpr *AffineBinaryOpExpr::getCeilDiv(AffineExpr *lhs, uint64_t rhs,
+                                           MLIRContext *context) {
+  return get(AffineExpr::Kind::CeilDiv, lhs,
+             AffineConstantExpr::get(rhs, context), context);
+}
+
 /// Returns true if this expression is made out of only symbols and
 /// constants (no dimensional identifiers).
 bool AffineExpr::isSymbolicOrConstant() const {
@@ -101,7 +120,8 @@
   }
 }
 
-uint64_t AffineExpr::getKnownGcd() const {
+/// Returns the greatest known integral divisor of this affine expression.
+uint64_t AffineExpr::getLargestKnownDivisor() const {
   AffineBinaryOpExpr *binExpr = nullptr;
   switch (kind) {
   case Kind::SymbolId:
@@ -112,15 +132,17 @@
     return std::abs(cast<AffineConstantExpr>(this)->getValue());
   case Kind::Mul:
     binExpr = cast<AffineBinaryOpExpr>(const_cast<AffineExpr *>(this));
-    return binExpr->getLHS()->getKnownGcd() * binExpr->getRHS()->getKnownGcd();
+    return binExpr->getLHS()->getLargestKnownDivisor() *
+           binExpr->getRHS()->getLargestKnownDivisor();
   case Kind::Add:
     LLVM_FALLTHROUGH;
   case Kind::FloorDiv:
   case Kind::CeilDiv:
   case Kind::Mod:
     binExpr = cast<AffineBinaryOpExpr>(const_cast<AffineExpr *>(this));
-    return llvm::GreatestCommonDivisor64(binExpr->getLHS()->getKnownGcd(),
-                                         binExpr->getRHS()->getKnownGcd());
+    return llvm::GreatestCommonDivisor64(
+        binExpr->getLHS()->getLargestKnownDivisor(),
+        binExpr->getRHS()->getLargestKnownDivisor());
   }
 }
 
@@ -138,17 +160,18 @@
     binExpr = cast<AffineBinaryOpExpr>(const_cast<AffineExpr *>(this));
     // It's probably not worth optimizing this further (to not traverse the
     // whole sub-tree under - it that would require a version of isMultipleOf
-    // that on a 'false' return also returns the known GCD).
-    return (l = binExpr->getLHS()->getKnownGcd()) % factor == 0 ||
-           (u = binExpr->getRHS()->getKnownGcd()) % factor == 0 ||
+    // that on a 'false' return also returns the largest known divisor).
+    return (l = binExpr->getLHS()->getLargestKnownDivisor()) % factor == 0 ||
+           (u = binExpr->getRHS()->getLargestKnownDivisor()) % factor == 0 ||
            (l * u) % factor == 0;
   case Kind::Add:
   case Kind::FloorDiv:
   case Kind::CeilDiv:
   case Kind::Mod:
     binExpr = cast<AffineBinaryOpExpr>(const_cast<AffineExpr *>(this));
-    return llvm::GreatestCommonDivisor64(binExpr->getLHS()->getKnownGcd(),
-                                         binExpr->getRHS()->getKnownGcd()) %
+    return llvm::GreatestCommonDivisor64(
+               binExpr->getLHS()->getLargestKnownDivisor(),
+               binExpr->getRHS()->getLargestKnownDivisor()) %
                factor ==
            0;
   }
diff --git a/lib/IR/AffineMap.cpp b/lib/IR/AffineMap.cpp
index 8d56f24..afa35ba 100644
--- a/lib/IR/AffineMap.cpp
+++ b/lib/IR/AffineMap.cpp
@@ -63,7 +63,7 @@
   // If only one of them is a symbolic expressions, make it the RHS.
   if (isa<AffineConstantExpr>(lhs) ||
       (lhs->isSymbolicOrConstant() && !rhs->isSymbolicOrConstant())) {
-    return AffineBinaryOpExpr::get(Kind::Add, rhs, lhs, context);
+    return AffineBinaryOpExpr::getAdd(rhs, lhs, context);
   }
 
   // At this point, if there was a constant, it would be on the right.
@@ -77,8 +77,8 @@
   auto *lBin = dyn_cast<AffineBinaryOpExpr>(lhs);
   if (lBin && rhsConst && lBin->getKind() == Kind::Add) {
     if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS()))
-      return AffineBinaryOpExpr::get(
-          Kind::Add, lBin->getLHS(),
+      return AffineBinaryOpExpr::getAdd(
+          lBin->getLHS(),
           AffineConstantExpr::get(lrhs->getValue() + rhsConst->getValue(),
                                   context),
           context);
@@ -88,10 +88,9 @@
   // + d1 into (d0 + d1) + 2.
   if (lBin && lBin->getKind() == Kind::Add) {
     if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS())) {
-      return AffineBinaryOpExpr::get(
-          Kind::Add,
-          AffineBinaryOpExpr::get(Kind::Add, lBin->getLHS(), rhs, context),
-          lrhs, context);
+      return AffineBinaryOpExpr::getAdd(
+          AffineBinaryOpExpr::getAdd(lBin->getLHS(), rhs, context), lrhs,
+          context);
     }
   }
 
@@ -115,7 +114,7 @@
   // constant. (Note that a constant is trivially symbolic).
   if (!rhs->isSymbolicOrConstant() || isa<AffineConstantExpr>(lhs)) {
     // At least one of them has to be symbolic.
-    return AffineBinaryOpExpr::get(Kind::Mul, rhs, lhs, context);
+    return AffineBinaryOpExpr::getMul(rhs, lhs, context);
   }
 
   // At this point, if there was a constant, it would be on the right.
@@ -133,8 +132,8 @@
   auto *lBin = dyn_cast<AffineBinaryOpExpr>(lhs);
   if (lBin && rhsConst && lBin->getKind() == Kind::Mul) {
     if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS()))
-      return AffineBinaryOpExpr::get(
-          Kind::Mul, lBin->getLHS(),
+      return AffineBinaryOpExpr::getMul(
+          lBin->getLHS(),
           AffineConstantExpr::get(lrhs->getValue() * rhsConst->getValue(),
                                   context),
           context);
@@ -144,10 +143,9 @@
   // * 2) * d1 into (d0 * d1) * 2.
   if (lBin && lBin->getKind() == Kind::Mul) {
     if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS())) {
-      return AffineBinaryOpExpr::get(
-          Kind::Mul,
-          AffineBinaryOpExpr::get(Kind::Mul, lBin->getLHS(), rhs, context),
-          lrhs, context);
+      return AffineBinaryOpExpr::getMul(
+          AffineBinaryOpExpr::getMul(lBin->getLHS(), rhs, context), lrhs,
+          context);
     }
   }
 
@@ -167,13 +165,16 @@
   // Fold floordiv of a multiply with a constant that is a multiple of the
   // divisor. Eg: (i * 128) floordiv 64 = i * 2.
   if (rhsConst) {
+    if (rhsConst->getValue() == 1)
+      return lhs;
+
     auto *lBin = dyn_cast<AffineBinaryOpExpr>(lhs);
     if (lBin && lBin->getKind() == Kind::Mul) {
       if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS())) {
         // rhsConst is known to be positive if a constant.
         if (lrhs->getValue() % rhsConst->getValue() == 0)
-          return AffineBinaryOpExpr::get(
-              Kind::Mul, lBin->getLHS(),
+          return AffineBinaryOpExpr::getMul(
+              lBin->getLHS(),
               AffineConstantExpr::get(lrhs->getValue() / rhsConst->getValue(),
                                       context),
               context);
@@ -199,13 +200,16 @@
   // Fold ceildiv of a multiply with a constant that is a multiple of the
   // divisor. Eg: (i * 128) ceildiv 64 = i * 2.
   if (rhsConst) {
+    if (rhsConst->getValue() == 1)
+      return lhs;
+
     auto *lBin = dyn_cast<AffineBinaryOpExpr>(lhs);
     if (lBin && lBin->getKind() == Kind::Mul) {
       if (auto *lrhs = dyn_cast<AffineConstantExpr>(lBin->getRHS())) {
         // rhsConst is known to be positive if a constant.
         if (lrhs->getValue() % rhsConst->getValue() == 0)
-          return AffineBinaryOpExpr::get(
-              Kind::Mul, lBin->getLHS(),
+          return AffineBinaryOpExpr::getMul(
+              lBin->getLHS(),
               AffineConstantExpr::get(lrhs->getValue() / rhsConst->getValue(),
                                       context),
               context);
@@ -230,7 +234,7 @@
   // mod 64 is folded to 0, and less trivially, (i*(j*4*(k*32))) mod 128 = 0.
   if (rhsConst) {
     // rhsConst is known to be positive if a constant.
-    if (lhs->getKnownGcd() % rhsConst->getValue() == 0)
+    if (lhs->getLargestKnownDivisor() % rhsConst->getValue() == 0)
       return AffineConstantExpr::get(0, context);
   }
 
diff --git a/lib/IR/AsmPrinter.cpp b/lib/IR/AsmPrinter.cpp
index 28a97ea..83b9493 100644
--- a/lib/IR/AsmPrinter.cpp
+++ b/lib/IR/AsmPrinter.cpp
@@ -108,7 +108,7 @@
 
     // Check if the affine map is single dim id or single symbol identity -
     // (i)->(i) or ()[s]->(i)
-    return boundMap->getNumOperands() == 1 && boundMap->getNumResults() == 1 &&
+    return boundMap->getNumInputs() == 1 && boundMap->getNumResults() == 1 &&
            (isa<AffineDimExpr>(boundMap->getResult(0)) ||
             isa<AffineSymbolExpr>(boundMap->getResult(0)));
   }
diff --git a/lib/IR/Builders.cpp b/lib/IR/Builders.cpp
index a03de2f..f93508d 100644
--- a/lib/IR/Builders.cpp
+++ b/lib/IR/Builders.cpp
@@ -161,18 +161,46 @@
   return AffineBinaryOpExpr::get(AffineExpr::Kind::Mul, lhs, rhs, context);
 }
 
+// Most multiply expressions are pure affine (rhs is a constant).
+AffineExpr *Builder::getMulExpr(AffineExpr *lhs, int64_t rhs) {
+  return AffineBinaryOpExpr::get(AffineExpr::Kind::Mul, lhs,
+                                 getConstantExpr(rhs), context);
+}
+
+AffineExpr *Builder::getSubExpr(AffineExpr *lhs, AffineExpr *rhs) {
+  return getAddExpr(lhs, getMulExpr(rhs, getConstantExpr(-1)));
+}
+
 AffineExpr *Builder::getModExpr(AffineExpr *lhs, AffineExpr *rhs) {
   return AffineBinaryOpExpr::get(AffineExpr::Kind::Mod, lhs, rhs, context);
 }
 
+// Most modulo expressions are pure affine.
+AffineExpr *Builder::getModExpr(AffineExpr *lhs, uint64_t rhs) {
+  return AffineBinaryOpExpr::get(AffineExpr::Kind::Mod, lhs,
+                                 getConstantExpr(rhs), context);
+}
+
 AffineExpr *Builder::getFloorDivExpr(AffineExpr *lhs, AffineExpr *rhs) {
   return AffineBinaryOpExpr::get(AffineExpr::Kind::FloorDiv, lhs, rhs, context);
 }
 
+// Most floordiv expressions are pure affine.
+AffineExpr *Builder::getFloorDivExpr(AffineExpr *lhs, uint64_t rhs) {
+  return AffineBinaryOpExpr::get(AffineExpr::Kind::FloorDiv, lhs,
+                                 getConstantExpr(rhs), context);
+}
+
 AffineExpr *Builder::getCeilDivExpr(AffineExpr *lhs, AffineExpr *rhs) {
   return AffineBinaryOpExpr::get(AffineExpr::Kind::CeilDiv, lhs, rhs, context);
 }
 
+// Most ceildiv expressions are pure affine.
+AffineExpr *Builder::getCeilDivExpr(AffineExpr *lhs, uint64_t rhs) {
+  return AffineBinaryOpExpr::get(AffineExpr::Kind::CeilDiv, lhs,
+                                 getConstantExpr(rhs), context);
+}
+
 IntegerSet *Builder::getIntegerSet(unsigned dimCount, unsigned symbolCount,
                                    ArrayRef<AffineExpr *> constraints,
                                    ArrayRef<bool> isEq) {
diff --git a/lib/IR/Statement.cpp b/lib/IR/Statement.cpp
index c8d3336..14c920c 100644
--- a/lib/IR/Statement.cpp
+++ b/lib/IR/Statement.cpp
@@ -278,9 +278,9 @@
 ForStmt *ForStmt::create(Location *location, ArrayRef<MLValue *> lbOperands,
                          AffineMap *lbMap, ArrayRef<MLValue *> ubOperands,
                          AffineMap *ubMap, int64_t step, MLIRContext *context) {
-  assert(lbOperands.size() == lbMap->getNumOperands() &&
+  assert(lbOperands.size() == lbMap->getNumInputs() &&
          "lower bound operand count does not match the affine map");
-  assert(ubOperands.size() == ubMap->getNumOperands() &&
+  assert(ubOperands.size() == ubMap->getNumInputs() &&
          "upper bound operand count does not match the affine map");
 
   unsigned numOperands = lbOperands.size() + ubOperands.size();
@@ -306,11 +306,11 @@
 }
 
 const AffineBound ForStmt::getLowerBound() const {
-  return AffineBound(*this, 0, lbMap->getNumOperands(), lbMap);
+  return AffineBound(*this, 0, lbMap->getNumInputs(), lbMap);
 }
 
 const AffineBound ForStmt::getUpperBound() const {
-  return AffineBound(*this, lbMap->getNumOperands(), getNumOperands(), ubMap);
+  return AffineBound(*this, lbMap->getNumInputs(), getNumOperands(), ubMap);
 }
 
 void ForStmt::setLowerBound(ArrayRef<MLValue *> operands, AffineMap *map) {
@@ -327,6 +327,18 @@
   this->ubMap = map;
 }
 
+void ForStmt::setLowerBoundMap(AffineMap *map) {
+  assert(lbMap->getNumDims() == map->getNumDims() &&
+         lbMap->getNumSymbols() == map->getNumSymbols());
+  this->lbMap = map;
+}
+
+void ForStmt::setUpperBoundMap(AffineMap *map) {
+  assert(ubMap->getNumDims() == map->getNumDims() &&
+         ubMap->getNumSymbols() == map->getNumSymbols());
+  this->ubMap = map;
+}
+
 bool ForStmt::hasConstantLowerBound() const {
   return lbMap->isSingleConstant();
 }
@@ -343,24 +355,6 @@
   return ubMap->getSingleConstantResult();
 }
 
-Optional<uint64_t> ForStmt::getConstantTripCount() const {
-  // TODO(bondhugula): handle arbitrary lower/upper bounds.
-  if (!hasConstantBounds())
-    return None;
-  int64_t lb = getConstantLowerBound();
-  int64_t ub = getConstantUpperBound();
-  int64_t step = getStep();
-
-  // 0 iteration loops.
-  if ((step >= 1 && lb > ub) || (step <= -1 && lb < ub))
-    return 0;
-
-  uint64_t tripCount = static_cast<uint64_t>((ub - lb + 1) % step == 0
-                                                 ? (ub - lb + 1) / step
-                                                 : (ub - lb + 1) / step + 1);
-  return tripCount;
-}
-
 void ForStmt::setConstantLowerBound(int64_t value) {
   MLIRContext *context = getContext();
   auto *expr = AffineConstantExpr::get(value, context);
@@ -463,9 +457,9 @@
 
     auto *newFor = ForStmt::create(
         getLoc(),
-        ArrayRef<MLValue *>(operands).take_front(lbMap->getNumOperands()),
-        lbMap, ArrayRef<MLValue *>(operands).take_back(ubMap->getNumOperands()),
-        ubMap, forStmt->getStep(), context);
+        ArrayRef<MLValue *>(operands).take_front(lbMap->getNumInputs()), lbMap,
+        ArrayRef<MLValue *>(operands).take_back(ubMap->getNumInputs()), ubMap,
+        forStmt->getStep(), context);
 
     // Remember the induction variable mapping.
     operandMap[forStmt] = newFor;
diff --git a/lib/Parser/Parser.cpp b/lib/Parser/Parser.cpp
index 5641421..1442434 100644
--- a/lib/Parser/Parser.cpp
+++ b/lib/Parser/Parser.cpp
@@ -2357,8 +2357,8 @@
     if (!map)
       return ParseFailure;
 
-    if (parseDimAndSymbolList(operands, map->getNumDims(),
-                              map->getNumOperands(), "affine map"))
+    if (parseDimAndSymbolList(operands, map->getNumDims(), map->getNumInputs(),
+                              "affine map"))
       return ParseFailure;
     return ParseSuccess;
   }
diff --git a/lib/Transforms/LoopUnroll.cpp b/lib/Transforms/LoopUnroll.cpp
index e663ce0..2fd8fe2 100644
--- a/lib/Transforms/LoopUnroll.cpp
+++ b/lib/Transforms/LoopUnroll.cpp
@@ -19,18 +19,15 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/IR/AffineExpr.h"
-#include "mlir/IR/Attributes.h"
+#include "mlir/IR/AffineMap.h"
 #include "mlir/IR/Builders.h"
-#include "mlir/IR/CFGFunction.h"
-#include "mlir/IR/MLFunction.h"
-#include "mlir/IR/Module.h"
-#include "mlir/IR/OperationSet.h"
 #include "mlir/IR/StandardOps.h"
-#include "mlir/IR/Statements.h"
 #include "mlir/IR/StmtVisitor.h"
 #include "mlir/Transforms/Pass.h"
-#include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 
@@ -128,7 +125,7 @@
     ShortLoopGatherer(unsigned minTripCount) : minTripCount(minTripCount) {}
 
     void visitForStmt(ForStmt *forStmt) {
-      Optional<uint64_t> tripCount = forStmt->getConstantTripCount();
+      Optional<uint64_t> tripCount = getConstantTripCount(*forStmt);
       if (tripCount.hasValue() && tripCount.getValue() <= minTripCount)
         loops.push_back(forStmt);
     }
@@ -174,12 +171,43 @@
 // Unrolls this loop completely. Fails assertion if loop bounds are
 // non-constant.
 bool LoopUnroll::loopUnrollFull(ForStmt *forStmt) {
-  Optional<uint64_t> tripCount = forStmt->getConstantTripCount();
+  Optional<uint64_t> tripCount = getConstantTripCount(*forStmt);
   if (tripCount.hasValue())
     return loopUnrollByFactor(forStmt, tripCount.getValue());
   return false;
 }
 
+/// Returns the upper bound of an unrolled loop with lower bound 'lb' and with
+/// the specified trip count, stride, and unroll factor.
+static AffineMap *getUnrolledLoopUpperBound(AffineMap *lbMap,
+                                            uint64_t tripCount,
+                                            unsigned unrollFactor, int64_t step,
+                                            MLFuncBuilder *builder) {
+  assert(lbMap->getNumResults() == 1);
+  auto *lbExpr = lbMap->getResult(0);
+  // lbExpr + (count - count % unrollFactor - 1) * step).
+  auto *expr = builder->getAddExpr(
+      lbExpr, builder->getConstantExpr(
+                  (tripCount - tripCount % unrollFactor - 1) * step));
+  return builder->getAffineMap(lbMap->getNumDims(), lbMap->getNumSymbols(),
+                               {expr}, {});
+}
+
+/// Returns the lower bound of the cleanup loop when unrolling a loop with lower
+/// bound 'lb' and with the specified trip count, stride, and unroll factor.
+static AffineMap *getCleanupLoopLowerBound(AffineMap *lbMap, uint64_t tripCount,
+                                           unsigned unrollFactor, int64_t step,
+                                           MLFuncBuilder *builder) {
+  assert(lbMap->getNumResults() == 1);
+  auto *lbExpr = lbMap->getResult(0);
+  // lbExpr + (count - count % unrollFactor) * step);
+  auto *expr = builder->getAddExpr(
+      lbExpr,
+      builder->getConstantExpr((tripCount - tripCount % unrollFactor) * step));
+  return builder->getAffineMap(lbMap->getNumDims(), lbMap->getNumSymbols(),
+                               {expr}, {});
+}
+
 /// Unrolls this loop by the specified unroll factor.
 bool LoopUnroll::loopUnrollByFactor(ForStmt *forStmt, uint64_t unrollFactor) {
   assert(unrollFactor >= 1 && "unroll factor shoud be >= 1");
@@ -187,44 +215,96 @@
   if (unrollFactor == 1 || forStmt->getStatements().empty())
     return false;
 
-  if (!forStmt->hasConstantBounds())
+  Optional<uint64_t> mayBeConstantTripCount = getConstantTripCount(*forStmt);
+
+  if (!mayBeConstantTripCount.hasValue() &&
+      getLargestDivisorOfTripCount(*forStmt) % unrollFactor != 0)
     return false;
 
-  int64_t lb = forStmt->getConstantLowerBound();
+  const AffineBound &lb = forStmt->getLowerBound();
+  const AffineBound &ub = forStmt->getLowerBound();
+  auto lbMap = lb.getMap();
+  auto ubMap = lb.getMap();
+
+  // Loops with max/min expressions won't be unrolled here (the output can't be
+  // expressed as an MLFunction in the general case). However, the right way to
+  // do such unrolling for an MLFunction would be to specialize the loop for the
+  // 'hotspot' case and unroll that hotspot case.
+  if (lbMap->getNumResults() != 1 || ubMap->getNumResults() != 1)
+    return false;
+
+  // TODO(bondhugula): handle bounds with different sets of operands.
+  // Same operand list for now.
+  if (lbMap->getNumDims() != ubMap->getNumDims() ||
+      lbMap->getNumSymbols() != ubMap->getNumSymbols())
+    return false;
+  unsigned i, e = lb.getNumOperands();
+  for (i = 0; i < e; i++) {
+    if (lb.getStmtOperand(i).get() != ub.getStmtOperand(i).get())
+      break;
+  }
+  if (i != e)
+    return false;
+
   int64_t step = forStmt->getStep();
-  uint64_t tripCount = forStmt->getConstantTripCount().getValue();
 
   // If the trip count is lower than the unroll factor, no unrolled body.
   // TODO(bondhugula): option to specify cleanup loop unrolling.
-  if (tripCount < unrollFactor)
-    return true;
+  if (mayBeConstantTripCount.hasValue() &&
+      mayBeConstantTripCount.getValue() < unrollFactor)
+    return false;
 
   // Generate the cleanup loop if trip count isn't a multiple of unrollFactor.
-  if (tripCount % unrollFactor) {
+  // If the trip count is unknown, we currently unroll only when the unknown
+  // trip count is known to be a multiple of unroll factor - hence, no cleanup
+  // loop will be necessary in those cases.
+  // TODO(bondhugula): handle generation of cleanup loop for unknown trip count
+  // when it's not known to be a multiple of unroll factor (still for single
+  // result / same operands case).
+  if (mayBeConstantTripCount.hasValue() &&
+      mayBeConstantTripCount.getValue() % unrollFactor != 0) {
+    uint64_t tripCount = mayBeConstantTripCount.getValue();
     DenseMap<const MLValue *, MLValue *> operandMap;
     MLFuncBuilder builder(forStmt->getBlock(), ++StmtBlock::iterator(forStmt));
     auto *cleanupForStmt = cast<ForStmt>(builder.clone(*forStmt, operandMap));
-    cleanupForStmt->setConstantLowerBound(
-        lb + (tripCount - tripCount % unrollFactor) * step);
+    if (forStmt->hasConstantLowerBound()) {
+      cleanupForStmt->setConstantLowerBound(
+          forStmt->getConstantLowerBound() +
+          (tripCount - tripCount % unrollFactor) * step);
+    } else {
+      cleanupForStmt->setLowerBoundMap(
+          getCleanupLoopLowerBound(forStmt->getLowerBoundMap(), tripCount,
+                                   unrollFactor, step, &builder));
+    }
     // Promote the loop body up if this has turned into a single iteration loop.
     promoteIfSingleIteration(cleanupForStmt);
+
+    // The upper bound needs to be adjusted.
+    if (forStmt->hasConstantUpperBound()) {
+      forStmt->setConstantUpperBound(
+          forStmt->getConstantLowerBound() +
+          (tripCount - tripCount % unrollFactor - 1) * step);
+    } else {
+      forStmt->setUpperBoundMap(
+          getUnrolledLoopUpperBound(forStmt->getLowerBoundMap(), tripCount,
+                                    unrollFactor, step, &builder));
+    }
   }
 
+  // Scale the step of loop being unrolled by unroll factor.
+  forStmt->setStep(step * unrollFactor);
+
   // Builder to insert unrolled bodies right after the last statement in the
   // body of 'forStmt'.
   MLFuncBuilder builder(forStmt, StmtBlock::iterator(forStmt->end()));
-  forStmt->setStep(step * unrollFactor);
-  forStmt->setConstantUpperBound(
-      lb + (tripCount - tripCount % unrollFactor - 1) * step);
 
   // Keep a pointer to the last statement in the original block so that we know
   // what to clone (since we are doing this in-place).
-  StmtBlock::iterator srcBlockEnd = --forStmt->end();
+  StmtBlock::iterator srcBlockEnd = std::prev(forStmt->end());
 
-  // Unroll the contents of 'forStmt' (unrollFactor-1 additional copies
-  // appended).
+  // Unroll the contents of 'forStmt' (append unrollFactor-1 additional copies).
   for (unsigned i = 1; i < unrollFactor; i++) {
-    DenseMap<const MLValue *, MLValue *> operandMapping;
+    DenseMap<const MLValue *, MLValue *> operandMap;
 
     // If the induction variable is used, create a remapping to the value for
     // this unrolled instance.
@@ -236,15 +316,13 @@
       auto *ivUnroll =
           builder.create<AffineApplyOp>(forStmt->getLoc(), bumpMap, forStmt)
               ->getResult(0);
-      operandMapping[forStmt] = cast<MLValue>(ivUnroll);
+      operandMap[forStmt] = cast<MLValue>(ivUnroll);
     }
 
-    // Clone the original body of the loop (this doesn't include the last stmt).
-    for (auto it = forStmt->begin(); it != srcBlockEnd; it++) {
-      builder.clone(*it, operandMapping);
+    // Clone the original body of 'forStmt'.
+    for (auto it = forStmt->begin(); it != std::next(srcBlockEnd); it++) {
+      builder.clone(*it, operandMap);
     }
-    // Clone the last statement in the original body.
-    builder.clone(*srcBlockEnd, operandMapping);
   }
 
   // Promote the loop body up if this has turned into a single iteration loop.
diff --git a/lib/Transforms/LoopUnrollJam.cpp b/lib/Transforms/LoopUnrollAndJam.cpp
similarity index 87%
rename from lib/Transforms/LoopUnrollJam.cpp
rename to lib/Transforms/LoopUnrollAndJam.cpp
index 6fb6134..7fb3309 100644
--- a/lib/Transforms/LoopUnrollJam.cpp
+++ b/lib/Transforms/LoopUnrollAndJam.cpp
@@ -1,5 +1,4 @@
-//===- LoopUnrollAndJam.cpp - Code to perform loop unroll jam
-//----------------===//
+//===- LoopUnrollAndJam.cpp - Code to perform loop unroll and jam ---------===//
 //
 // Copyright 2019 The MLIR Authors.
 //
@@ -16,10 +15,10 @@
 // limitations under the License.
 // =============================================================================
 //
-// This file implements loop unroll jam for MLFunctions. Unroll and jam is a
+// This file implements loop unroll and jam for MLFunctions. Unroll and jam is a
 // transformation that improves locality, in particular, register reuse, while
 // also improving instruction level parallelism. The example below shows what it
-// does in nearly the general case. Loop unroll jam currently works if the
+// does in nearly the general case. Loop unroll and jam currently works if the
 // bounds of the loops inner to the loop being unroll-jammed do not depend on
 // the latter.
 //
@@ -44,25 +43,27 @@
 // stmt's, bodies of those loops will not be jammed.
 //
 //===----------------------------------------------------------------------===//
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/IR/AffineExpr.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/StandardOps.h"
 #include "mlir/IR/StmtVisitor.h"
 #include "mlir/Transforms/Pass.h"
-#include "mlir/Transforms/Passes.h"
 #include "llvm/ADT/DenseMap.h"
 #include "llvm/Support/CommandLine.h"
 
 using namespace mlir;
 
-// Loop unroll jam factor.
+// Loop unroll and jam factor.
 static llvm::cl::opt<unsigned>
     clUnrollJamFactor("unroll-jam-factor", llvm::cl::Hidden,
                       llvm::cl::desc("Use this unroll jam factor for all loops"
                                      " (default 4)"));
 
 namespace {
-/// Loop unroll jam pass. For test purposes, this just unroll jams the first
+/// Loop unroll jam pass. Currently, this just unroll jams the first
 /// outer loop in an MLFunction.
 struct LoopUnrollAndJam : public MLFunctionPass {
   Optional<unsigned> unrollJamFactor;
@@ -127,10 +128,8 @@
         while (it != End && !isa<ForStmt>(it))
           ++it;
         if (it != subBlockStart)
-          // Record the last statement (one behind the iterator) while not
-          // changing the iterator position.
-          subBlocks.push_back({subBlockStart, (--it)++});
-        // Process all for Stmts that appear next.
+          subBlocks.push_back({subBlockStart, std::prev(it)});
+        // Process all for stmts that appear next.
         while (it != End && isa<ForStmt>(it))
           walkForStmt(cast<ForStmt>(it++));
       }
@@ -142,12 +141,14 @@
   if (unrollJamFactor == 1 || forStmt->getStatements().empty())
     return false;
 
-  if (!forStmt->hasConstantBounds())
+  Optional<uint64_t> mayTripCount = getConstantTripCount(*forStmt).getValue();
+
+  if (!mayTripCount.hasValue())
     return false;
 
+  uint64_t tripCount = mayTripCount.getValue();
   int64_t lb = forStmt->getConstantLowerBound();
   int64_t step = forStmt->getStep();
-  uint64_t tripCount = forStmt->getConstantTripCount().getValue();
 
   // If the trip count is lower than the unroll jam factor, no unrolled body.
   // TODO(bondhugula): option to specify cleanup loop unrolling.
@@ -164,7 +165,8 @@
   if (tripCount % unrollJamFactor) {
     DenseMap<const MLValue *, MLValue *> operandMap;
     // Insert the cleanup loop right after 'forStmt'.
-    MLFuncBuilder builder(forStmt->getBlock(), ++StmtBlock::iterator(forStmt));
+    MLFuncBuilder builder(forStmt->getBlock(),
+                          std::next(StmtBlock::iterator(forStmt)));
     auto *cleanupForStmt = cast<ForStmt>(builder.clone(*forStmt, operandMap));
     cleanupForStmt->setConstantLowerBound(
         lb + (tripCount - tripCount % unrollJamFactor) * step);
@@ -200,13 +202,10 @@
                 ->getResult(0);
         operandMapping[forStmt] = cast<MLValue>(ivUnroll);
       }
-      // Clone the sub-block being unroll-jammed (this doesn't include the last
-      // stmt because subBlock.second is inclusive).
-      for (auto it = subBlock.first; it != subBlock.second; ++it) {
+      // Clone the sub-block being unroll-jammed.
+      for (auto it = subBlock.first; it != std::next(subBlock.second); ++it) {
         builder.clone(*it, operandMapping);
       }
-      // Clone the last statement of the sub-block.
-      builder.clone(*subBlock.second, operandMapping);
     }
   }
 
diff --git a/lib/Transforms/LoopUtils.cpp b/lib/Transforms/LoopUtils.cpp
index cc2dc40..2eecb6f 100644
--- a/lib/Transforms/LoopUtils.cpp
+++ b/lib/Transforms/LoopUtils.cpp
@@ -19,16 +19,19 @@
 //
 //===----------------------------------------------------------------------===//
 
+#include "mlir/Transforms/Passes.h"
+
+#include "mlir/Analysis/LoopAnalysis.h"
 #include "mlir/IR/Builders.h"
 #include "mlir/IR/StandardOps.h"
 #include "mlir/IR/Statements.h"
 #include "mlir/IR/StmtVisitor.h"
-#include "mlir/Transforms/Passes.h"
 
 /// Promotes the loop body of a forStmt to its containing block if the forStmt
 /// was known to have a single iteration. Returns false otherwise.
+// TODO(bondhugula): extend this for arbitrary affine bounds.
 bool mlir::promoteIfSingleIteration(ForStmt *forStmt) {
-  Optional<uint64_t> tripCount = forStmt->getConstantTripCount();
+  Optional<uint64_t> tripCount = getConstantTripCount(*forStmt);
   if (!tripCount.hasValue() || !forStmt->hasConstantLowerBound())
     return false;
 
diff --git a/test/IR/affine-map.mlir b/test/IR/affine-map.mlir
index 5d16f13..7d610fb 100644
--- a/test/IR/affine-map.mlir
+++ b/test/IR/affine-map.mlir
@@ -167,6 +167,10 @@
 // CHECK: #map{{[0-9]+}} = (d0, d1)[s0] -> (0, 0, 0, (d0 * 4 + 3) mod 2)
 #map49 = (i, j)[s0] -> ( (i * 4 + 8) mod 4, 32 * j * s0 * 8 mod 256, (4*i + (j * (s0 * 2))) mod 2, (4*i + 3) mod 2)
 
+// Floordiv, ceildiv divide by one.
+// CHECK: #map{{[0-9]+}} = (d0, d1)[s0] -> (d0 * 2 + 1, d1 + s0)
+#map50 = (i, j)[s0] -> ( (i * 2 + 1) ceildiv 1, (j + s0) floordiv 1)
+
 // CHECK: extfunc @f0(memref<2x4xi8, #map{{[0-9]+}}, 1>)
 extfunc @f0(memref<2x4xi8, #map0, 1>)
 
@@ -331,3 +335,6 @@
 
 // CHECK: extfunc @f49(memref<100x100xi8, #map{{[0-9]+}}>)
 extfunc @f49(memref<100x100xi8, #map49>)
+
+// CHECK: extfunc @f50(memref<100x100xi8, #map{{[0-9]+}}>)
+extfunc @f50(memref<100x100xi8, #map50>)
diff --git a/test/Transforms/unroll.mlir b/test/Transforms/unroll.mlir
index 10ee717..dbda50b 100644
--- a/test/Transforms/unroll.mlir
+++ b/test/Transforms/unroll.mlir
@@ -426,3 +426,79 @@
   } // UNROLL-BY-4:  }
   return
 }
+
+// Test cases with loop bound operands.
+
+// No cleanup will be generated here.
+// UNROLL-BY-4-LABEL: mlfunc @loop_nest_operand1() {
+mlfunc @loop_nest_operand1() {
+  // UNROLL-BY-4: for %i0 = 1 to 100 step 2 {
+  for %i = 1 to 100 step 2 {
+    // UNROLL-BY-4: %0 = "foo"() : () -> i32
+    // UNROLL-BY-4: %1 = "foo"() : () -> i32
+    // UNROLL-BY-4: %2 = "foo"() : () -> i32
+    // UNROLL-BY-4: %3 = "foo"() : () -> i32
+    for %j = (d0) -> (0) (%i) to (d0) -> (d0 - d0 mod 4 - 1) (%i) {
+      %x = "foo"() : () -> i32
+    }
+  }
+  return
+}
+
+// No cleanup will be generated here.
+// UNROLL-BY-4-LABEL: mlfunc @loop_nest_operand2() {
+mlfunc @loop_nest_operand2() {
+  // UNROLL-BY-4: for %i0 = 1 to 100 step 2 {
+  for %i = 1 to 100 step 2 {
+    // UNROLL-BY-4: %0 = "foo"() : () -> i32
+    // UNROLL-BY-4: %1 = "foo"() : () -> i32
+    // UNROLL-BY-4: %2 = "foo"() : () -> i32
+    // UNROLL-BY-4: %3 = "foo"() : () -> i32
+    for %j = (d0) -> (d0) (%i) to (d0) -> (5*d0 + 3) (%i) {
+      %x = "foo"() : () -> i32
+    }
+  }
+  return
+}
+
+// Difference between loop bounds is constant, but not a multiple of unroll
+// factor. A cleanup loop is generated.
+// UNROLL-BY-4-LABEL: mlfunc @loop_nest_operand3() {
+mlfunc @loop_nest_operand3() {
+  // UNROLL-BY-4: for %i0 = 1 to 100 step 2 {
+  for %i = 1 to 100 step 2 {
+    // UNROLL-BY-4: for %i1 = %i0 to #map{{[0-9]+}}(%i0) step 4 {
+    // UNROLL-BY-4-NEXT: %0 = "foo"() : () -> i32
+    // UNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
+    // UNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
+    // UNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
+    // UNROLL-BY-4-NEXT: }
+    // UNROLL-BY-4-NEXT: for %i2 = #map{{[0-9]+}}(%i0) to #map{{[0-9]+}}(%i0) {
+    // UNROLL-BY-4-NEXT: %4 = "foo"() : () -> i32
+    // UNROLL-BY-4-NEXT: }
+    for %j = (d0) -> (d0) (%i) to (d0) -> (d0 + 4) (%i) {
+      %x = "foo"() : () -> i32
+    }
+  } // UNROLL-BY-4: }
+  return
+}
+
+// Will not be unrolled for now. TODO(bondhugula): handle this.
+// xUNROLL-BY-4-LABEL: mlfunc @loop_nest_operand4(%arg0 : affineint) {
+mlfunc @loop_nest_operand4(%N : affineint) {
+  // UNROLL-BY-4: for %i0 = 1 to 100 step 2 {
+  for %i = 1 to 100 step 2 {
+    // UNROLL-BY-4: for %i1 = 0 to %arg0 {
+    // xUNROLL-BY-4: for %i1 = 0 to #map{{[0-9]+}}(%N) step 4 {
+    // xUNROLL-BY-4: %0 = "foo"() : () -> i32
+    // xUNROLL-BY-4-NEXT: %1 = "foo"() : () -> i32
+    // xUNROLL-BY-4-NEXT: %2 = "foo"() : () -> i32
+    // xUNROLL-BY-4-NEXT: %3 = "foo"() : () -> i32
+    // xUNROLL-BY-4-NEXT: }
+    // a cleanup loop should be generated here.
+    for %j = (d0) -> (0) (%N) to %N {
+      %x = "foo"() : () -> i32
+    }
+  }
+  return
+}