| //===- Tiling.cpp - Implementation of linalg Tiling -----------------------===// |
| // |
| // Part of the MLIR Project, under the Apache License v2.0 with LLVM Exceptions. |
| // See https://llvm.org/LICENSE.txt for license information. |
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
| // |
| //===----------------------------------------------------------------------===// |
| // |
| // This file implements the linalg dialect Tiling pass. |
| // |
| //===----------------------------------------------------------------------===// |
| |
| #include "mlir/Dialect/Linalg/IR/LinalgOps.h" |
| #include "mlir/Dialect/Linalg/IR/LinalgTypes.h" |
| #include "mlir/Dialect/Linalg/Passes.h" |
| #include "mlir/Dialect/Linalg/Utils/Intrinsics.h" |
| #include "mlir/Dialect/Linalg/Utils/Utils.h" |
| #include "mlir/Dialect/LoopOps/LoopOps.h" |
| #include "mlir/EDSC/Helpers.h" |
| #include "mlir/IR/AffineExpr.h" |
| #include "mlir/IR/AffineExprVisitor.h" |
| #include "mlir/IR/AffineMap.h" |
| #include "mlir/IR/OpImplementation.h" |
| #include "mlir/Pass/Pass.h" |
| #include "mlir/Support/LLVM.h" |
| #include "mlir/Support/STLExtras.h" |
| #include "mlir/Transforms/FoldUtils.h" |
| |
| #include "llvm/Support/CommandLine.h" |
| |
| using namespace mlir; |
| using namespace mlir::edsc; |
| using namespace mlir::edsc::intrinsics; |
| using namespace mlir::linalg; |
| using namespace mlir::linalg::intrinsics; |
| using namespace mlir::loop; |
| |
| #define DEBUG_TYPE "linalg-tiling" |
| |
| static llvm::cl::OptionCategory clOptionsCategory(DEBUG_TYPE " options"); |
| static llvm::cl::list<unsigned> |
| clTileSizes("linalg-tile-sizes", |
| llvm::cl::desc("Tile sizes by which to tile linalg operations"), |
| llvm::cl::ZeroOrMore, llvm::cl::MiscFlags::CommaSeparated, |
| llvm::cl::cat(clOptionsCategory)); |
| |
| static bool isZero(Value v) { |
| return isa_and_nonnull<ConstantIndexOp>(v.getDefiningOp()) && |
| cast<ConstantIndexOp>(v.getDefiningOp()).getValue() == 0; |
| } |
| |
| using LoopIndexToRangeIndexMap = DenseMap<int, int>; |
| |
// Creates one range per non-zero entry in `tileSizes`, i.e. one for each loop
// of the LinalgOp that is tiled. The `tileSizes` argument has one entry per
// surrounding loop; by convention, a zero entry means that the corresponding
// loop is not tiled. This convention simplifies implementations by avoiding
// affine map manipulations.
// The returned ranges correspond to the loop ranges, in the proper order, that
// are tiled and for which new loops will be created. The function also returns
// a map from loop indices of the LinalgOp to the corresponding non-empty range
// indices of the newly created loops.
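//
// For example (illustrative values): with three loops of sizes (%s0, %s1, %s2)
// and tileSizes = (%c2, %c0, %c3), loops 0 and 2 are tiled. The result
// contains the ranges {0, %s0, %c2} and {0, %s2, %c3}, together with the map
// {0 -> 0, 2 -> 1}.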
| static std::tuple<SmallVector<SubViewOp::Range, 4>, LoopIndexToRangeIndexMap> |
| makeTiledLoopRanges(OpBuilder &b, Location loc, AffineMap map, |
| ArrayRef<Value> allViewSizes, ArrayRef<Value> allTileSizes, |
| OperationFolder *folder) { |
| assert(allTileSizes.size() == map.getNumResults()); |
| // Apply `map` to get view sizes in loop order. |
| auto viewSizes = applyMapToValues(b, loc, map, allViewSizes, folder); |
| SmallVector<Value, 4> tileSizes(allTileSizes.begin(), allTileSizes.end()); |
| |
  // Traverse the tile sizes, which are in loop order, and erase the zero
  // entries.
| LoopIndexToRangeIndexMap loopIndexToRangeIndex; |
| for (int idx = 0, e = tileSizes.size(), zerosCount = 0; idx < e; ++idx) { |
| if (isZero(tileSizes[idx - zerosCount])) { |
| viewSizes.erase(viewSizes.begin() + idx - zerosCount); |
| tileSizes.erase(tileSizes.begin() + idx - zerosCount); |
| ++zerosCount; |
| continue; |
| } |
| loopIndexToRangeIndex[idx] = idx - zerosCount; |
| } |
| |
  // Create a new range for each of the remaining (non-zero) tile sizes.
| SmallVector<SubViewOp::Range, 4> res; |
| for (unsigned idx = 0, e = tileSizes.size(); idx < e; ++idx) { |
| res.push_back(SubViewOp::Range{constant_index(folder, 0), viewSizes[idx], |
| tileSizes[idx]}); |
| } |
| return std::make_tuple(res, loopIndexToRangeIndex); |
| } |
| |
| namespace { |
| |
| // Helper visitor to determine whether an AffineExpr is tiled. |
| // This is achieved by traversing every AffineDimExpr with position `pos` and |
| // checking whether the corresponding `tileSizes[pos]` is non-zero. |
// This also enforces that only positive coefficients occur in multiplications.
| // |
| // Example: |
| // `d0 + 2 * d1 + d3` is tiled by [0, 0, 0, 2] but not by [0, 0, 2, 0] |
| // |
| struct TileCheck : public AffineExprVisitor<TileCheck> { |
| TileCheck(ArrayRef<Value> tileSizes) : isTiled(false), tileSizes(tileSizes) {} |
| |
| void visitDimExpr(AffineDimExpr expr) { |
| isTiled |= !isZero(tileSizes[expr.getPosition()]); |
| } |
| void visitAffineBinaryOpExpr(AffineBinaryOpExpr expr) { |
| visit(expr.getLHS()); |
| visit(expr.getRHS()); |
| if (expr.getKind() == mlir::AffineExprKind::Mul) |
| assert(expr.getRHS().cast<AffineConstantExpr>().getValue() > 0 && |
| "nonpositive multiplying coefficient"); |
| } |
| bool isTiled; |
| ArrayRef<Value> tileSizes; |
| }; |
| |
| } // namespace |
| |
// IndexedGenericOp explicitly uses induction variables in the loop body. The
// values of the indices that are used in the loop body for any given access of
// an input/output memref (i.e. before the `subview` op was applied) should be
// invariant with respect to tiling.
| // |
| // Therefore, if the operation is tiled, we have to transform the indices |
| // accordingly, i.e. offset them by the values of the corresponding induction |
| // variables that are captured implicitly in the body of the op. |
| // |
| // Example. `linalg.indexed_generic` before tiling: |
| // |
| // #id_2d = (i, j) -> (i, j) |
| // #pointwise_2d_trait = { |
| // indexing_maps = [#id_2d, #id_2d], |
| // iterator_types = ["parallel", "parallel"], |
| // n_views = [1, 1] |
| // } |
| // linalg.indexed_generic #pointwise_2d_trait %operand, %result { |
| // ^bb0(%i: index, %j: index, %operand_in: f32, %result_in: f32): |
| // <some operations that use %i, %j> |
| // }: memref<50x100xf32>, memref<50x100xf32> |
| // |
// After the tiling pass with tile sizes 10 and 25:
| // |
| // #strided = (i, j)[s0, s1, s2] -> (i * s1 + s0 + j * s2) |
| // |
| // %c1 = constant 1 : index |
| // %c0 = constant 0 : index |
| // %c25 = constant 25 : index |
| // %c10 = constant 10 : index |
| // operand_dim_0 = dim %operand, 0 : memref<50x100xf32> |
| // operand_dim_1 = dim %operand, 1 : memref<50x100xf32> |
| // loop.for %k = %c0 to operand_dim_0 step %c10 { |
| // loop.for %l = %c0 to operand_dim_1 step %c25 { |
| // %4 = std.subview %operand[%k, %l][%c10, %c25][%c1, %c1] |
| // : memref<50x100xf32> to memref<?x?xf32, #strided> |
| // %5 = std.subview %result[%k, %l][%c10, %c25][%c1, %c1] |
| // : memref<50x100xf32> to memref<?x?xf32, #strided> |
| // linalg.indexed_generic pointwise_2d_trait %4, %5 { |
| // ^bb0(%i: index, %j: index, %operand_in: f32, %result_in: f32): |
| // // Indices `k` and `l` are implicitly captured in the body. |
| // %transformed_i = addi %i, %k : index // index `i` is offset by %k |
| // %transformed_j = addi %j, %l : index // index `j` is offset by %l |
| // // Every use of %i, %j is replaced with %transformed_i, %transformed_j |
| // <some operations that use %transformed_i, %transformed_j> |
| // }: memref<?x?xf32, #strided>, memref<?x?xf32, #strided> |
| // } |
| // } |
| // |
// TODO(pifon, ntv): Investigate whether mixing implicit and explicit indices
// can lead to losing information.
| static void transformIndexedGenericOpIndices( |
| OpBuilder &b, LinalgOp op, ArrayRef<ValueHandle *> pivs, |
| const LoopIndexToRangeIndexMap &loopIndexToRangeIndex) { |
| auto indexedGenericOp = dyn_cast<IndexedGenericOp>(op.getOperation()); |
| if (!indexedGenericOp) |
| return; |
| |
// `linalg.indexed_generic` comes in two flavors. One has a region with a
// single block that defines the loop body. The other has a `fun` attribute
// that refers to an existing function symbol; in that case, a call to `fun`
// is inserted in the loop body.
| // |
| // TODO(pifon): Add support for `linalg.indexed_generic` with `fun` attribute. |
| auto ®ion = indexedGenericOp.region(); |
| if (region.empty()) { |
| indexedGenericOp.emitOpError("expected a region"); |
| return; |
| } |
| auto &block = region.getBlocks().front(); |
| |
| OpBuilder::InsertionGuard g(b); |
| b.setInsertionPointToStart(&block); |
| for (unsigned i = 0; i < indexedGenericOp.getNumLoops(); ++i) { |
| auto rangeIndex = loopIndexToRangeIndex.find(i); |
| if (rangeIndex == loopIndexToRangeIndex.end()) |
| continue; |
| Value oldIndex = block.getArgument(i); |
| // Offset the index argument `i` by the value of the corresponding induction |
| // variable and replace all uses of the previous value. |
| Value newIndex = b.create<AddIOp>(indexedGenericOp.getLoc(), oldIndex, |
| pivs[rangeIndex->second]->getValue()); |
    // Replace the remaining uses of the old index. Note that `use.set` unlinks
    // the use from `oldIndex`'s use list, which would invalidate a plain
    // use-list iterator, so iterate with an early-increment range.
    for (auto &use : llvm::make_early_inc_range(oldIndex.getUses())) {
| if (use.getOwner() == newIndex.getDefiningOp()) |
| continue; |
| use.set(newIndex); |
| } |
| } |
| } |
| |
| static bool isTiled(AffineExpr expr, ArrayRef<Value> tileSizes) { |
| if (!expr) |
| return false; |
| TileCheck t(tileSizes); |
| t.visit(expr); |
| return t.isTiled; |
| } |
| |
// Checks whether `map` has at least one result expression that varies with
// respect to a non-zero entry of `tileSizes`.
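// For example (illustrative): the map (d0, d1) -> (d0 + d1) is considered
// tiled by tileSizes (%c0, %c2) because its result involves d1, whose tile
// size is non-zero; it is not tiled by (%c0, %c0).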
| static bool isTiled(AffineMap map, ArrayRef<Value> tileSizes) { |
| if (!map) |
| return false; |
| for (unsigned r = 0; r < map.getNumResults(); ++r) |
| if (isTiled(map.getResult(r), tileSizes)) |
| return true; |
| return false; |
| } |
| |
| static SmallVector<Value, 4> |
| makeTiledViews(OpBuilder &b, Location loc, LinalgOp linalgOp, |
| ArrayRef<Value> ivs, ArrayRef<Value> tileSizes, |
| ArrayRef<Value> viewSizes, OperationFolder *folder) { |
  assert(ivs.size() ==
             static_cast<size_t>(llvm::count_if(
                 tileSizes, [](Value v) { return !isZero(v); })) &&
         "expected as many ivs as non-zero sizes");
| |
| using edsc::intrinsics::select; |
| using edsc::op::operator+; |
| using edsc::op::operator<; |
| |
| // Construct (potentially temporary) mins and maxes on which to apply maps |
| // that define tile subviews. |
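  // For example (illustrative values): with tileSizes = (%c2, %c0),
  // viewSizes = (%s0, %s1) and a single induction variable %iv0, the loop
  // below produces lbs = (%iv0, %c0) and subViewSizes = (%c2, %s1).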
| SmallVector<Value, 8> lbs, subViewSizes; |
| for (unsigned idx = 0, idxIvs = 0, e = tileSizes.size(); idx < e; ++idx) { |
| bool isTiled = !isZero(tileSizes[idx]); |
| lbs.push_back(isTiled ? ivs[idxIvs++] : (Value)constant_index(folder, 0)); |
| subViewSizes.push_back(isTiled ? tileSizes[idx] : viewSizes[idx]); |
| } |
| |
| auto *op = linalgOp.getOperation(); |
| |
| SmallVector<Value, 4> res; |
| res.reserve(op->getNumOperands()); |
| auto viewIteratorBegin = linalgOp.getInputsAndOutputs().begin(); |
| for (unsigned viewIndex = 0; viewIndex < linalgOp.getNumInputsAndOutputs(); |
| ++viewIndex) { |
| Value view = *(viewIteratorBegin + viewIndex); |
| unsigned rank = view.getType().cast<MemRefType>().getRank(); |
| auto map = loopToOperandRangesMaps(linalgOp)[viewIndex]; |
| // If the view is not tiled, we can use it as is. |
| if (!isTiled(map, tileSizes)) { |
| res.push_back(view); |
| continue; |
| } |
| |
| // Construct a new subview for the tile. |
| SmallVector<Value, 4> offsets, sizes, strides; |
| offsets.reserve(rank); |
| sizes.reserve(rank); |
| strides.reserve(rank); |
| for (unsigned r = 0; r < rank; ++r) { |
| if (!isTiled(map.getSubMap({r}), tileSizes)) { |
| offsets.push_back(constant_index(folder, 0)); |
| sizes.push_back(dim(view, r)); |
| strides.push_back(constant_index(folder, 1)); |
| continue; |
| } |
| |
| // Tiling creates a new slice at the proper index, the slice step is 1 |
| // (i.e. the slice view does not subsample, stepping occurs in the loop). |
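      // For example (illustrative): if dimension `r` is accessed through the
      // single-result map (d0, d1) -> (d0), the offset evaluates to lbs[0]
      // (the tile's lower bound) and the size to subViewSizes[0] (the tile
      // size along loop 0).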
| auto m = map.getSubMap({r}); |
| auto offset = applyMapToValues(b, loc, m, lbs, folder).front(); |
| offsets.push_back(offset); |
| auto size = applyMapToValues(b, loc, m, subViewSizes, folder).front(); |
| sizes.push_back(size); |
| strides.push_back(constant_index(folder, 1)); |
| } |
    // TODO(b/144419024): At the moment, std.subview is not guaranteed
    // in-bounds. Depending on the semantics we attach to it, we may need to
    // use min(size, dim) here and canonicalize later.
| res.push_back(b.create<SubViewOp>(loc, view, offsets, sizes, strides)); |
| } |
| |
  // Traverse the temporary lower bounds and sub-view sizes and erase those
  // that no longer have uses. This is a special type of folding that we only
  // apply when `folder` is defined.
| if (folder) |
| for (auto v : llvm::concat<Value>(lbs, subViewSizes)) |
| if (v.use_empty()) |
| v.getDefiningOp()->erase(); |
| |
| return res; |
| } |
| |
| Optional<TiledLinalgOp> |
| mlir::linalg::tileLinalgOp(OpBuilder &b, LinalgOp op, ArrayRef<Value> tileSizes, |
| ArrayRef<unsigned> permutation, |
| OperationFolder *folder) { |
  // 1. Enforce the convention that "tiling by zero" skips tiling a particular
  // dimension. This convention is significantly simpler to handle than
  // adjusting affine maps to account for missing dimensions.
| assert(op.getNumParallelLoops() + op.getNumReductionLoops() + |
| op.getNumWindowLoops() == |
| tileSizes.size() && |
| "expected matching number of tile sizes and loops"); |
| |
  // If permutation is empty, use the identity map. Otherwise, build the
  // inverse of the permutation map.
| auto invPermutationMap = AffineMap::getMultiDimIdentityMap( |
| tileSizes.size(), ScopedContext::getContext()); |
| if (!permutation.empty()) |
| invPermutationMap = inversePermutation( |
| AffineMap::getPermutationMap(permutation, ScopedContext::getContext())); |
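  // For example (illustrative): permutation = [1, 2, 0] encodes the
  // permutation map (d0, d1, d2) -> (d1, d2, d0); inversePermutation then
  // yields the map that undoes it: (d0, d1, d2) -> (d2, d0, d1).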
| |
| OpBuilder::InsertionGuard g(b); |
| b.setInsertionPoint(op); |
| ScopedContext scope(b, op.getLoc()); |
| // 2. Build the tiled loop ranges. |
| auto viewSizes = getViewSizes(op); |
| // The flattened loopToOperandRangesMaps is expected to be an invertible |
| // permutation map (asserted in the inverse calculation). |
| auto viewSizesToLoopsMap = |
| inversePermutation(concatAffineMaps(loopToOperandRangesMaps(op))); |
| assert(viewSizesToLoopsMap && "expected invertible map"); |
| |
| SmallVector<SubViewOp::Range, 4> loopRanges; |
| LoopIndexToRangeIndexMap loopIndexToRangeIndex; |
| std::tie(loopRanges, loopIndexToRangeIndex) = |
| makeTiledLoopRanges(b, scope.getLocation(), viewSizesToLoopsMap, |
| viewSizes, tileSizes, folder); |
| if (!permutation.empty()) |
| applyPermutationToVector(loopRanges, permutation); |
| |
| // 3. Create the tiled loops. |
| LinalgOp res = op; |
| SmallVector<IndexHandle, 4> ivs(loopRanges.size()); |
| auto pivs = makeHandlePointers(MutableArrayRef<IndexHandle>(ivs)); |
| LoopNestRangeBuilder(pivs, loopRanges)([&] { |
| auto b = ScopedContext::getBuilder(); |
| auto loc = ScopedContext::getLocation(); |
| SmallVector<Value, 4> ivValues(ivs.begin(), ivs.end()); |
| |
    // If we have to apply a permutation to the tiled loop nest, we have to
    // reorder the induction variables. This permutation is the right one
    // assuming that loopRanges have previously been permuted by
    // (i,j,k)->(k,i,j). So this permutation should be the inversePermutation
    // of that one: (d0,d1,d2)->(d2,d0,d1).
| if (!permutation.empty()) |
| ivValues = applyMapToValues(b, loc, invPermutationMap, ivValues, folder); |
| |
| auto views = |
| makeTiledViews(b, loc, op, ivValues, tileSizes, viewSizes, folder); |
| auto operands = getAssumedNonViewOperands(op); |
| views.append(operands.begin(), operands.end()); |
| res = op.clone(b, loc, views); |
| }); |
| |
  // 4. Transform the index arguments of `linalg.indexed_generic` w.r.t. the
  // tiling.
| transformIndexedGenericOpIndices(b, res, pivs, loopIndexToRangeIndex); |
| |
| // 5. Gather the newly created loops and return them with the new op. |
| SmallVector<ForOp, 8> loops; |
| loops.reserve(ivs.size()); |
| for (auto iv : ivs) |
| loops.push_back(loop::getForInductionVarOwner(iv)); |
| |
| return TiledLinalgOp{res, loops}; |
| } |
| |
| Optional<TiledLinalgOp> mlir::linalg::tileLinalgOp( |
| OpBuilder &b, LinalgOp op, ArrayRef<int64_t> tileSizes, |
| ArrayRef<unsigned> permutation, OperationFolder *folder) { |
| if (tileSizes.empty()) |
| return llvm::None; |
| |
  // The following uses the convention that "tiling by zero" skips tiling a
  // particular dimension. This convention is significantly simpler to handle
  // than adjusting affine maps to account for missing dimensions.
| auto nLoops = op.getNumParallelLoops() + op.getNumReductionLoops() + |
| op.getNumWindowLoops(); |
| tileSizes = tileSizes.take_front(nLoops); |
  // If only zero tile sizes remain, there is nothing to tile; return.
| if (llvm::all_of(tileSizes, [](int64_t v) { return v == 0; })) |
| return llvm::None; |
| |
| // Create a builder for tile size constants. |
| OpBuilder::InsertionGuard g(b); |
| b.setInsertionPoint(op); |
| ScopedContext scope(b, op.getLoc()); |
| |
  // Materialize concrete tile size values to pass to the generic tiling
  // function.
| SmallVector<Value, 8> tileSizeValues; |
| tileSizeValues.reserve(tileSizes.size()); |
| for (auto ts : tileSizes) |
| tileSizeValues.push_back(constant_index(folder, ts)); |
| // Pad tile sizes with zero values to enforce our convention. |
| if (tileSizeValues.size() < nLoops) { |
| for (unsigned i = tileSizeValues.size(); i < nLoops; ++i) |
| tileSizeValues.push_back(constant_index(folder, 0)); |
| } |
| |
| return tileLinalgOp(b, op, tileSizeValues, permutation, folder); |
| } |
| |
| static void tileLinalgOps(FuncOp f, ArrayRef<int64_t> tileSizes) { |
| OpBuilder b(f); |
| OperationFolder folder(f.getContext()); |
| f.walk([tileSizes, &b, &folder](LinalgOp op) { |
| auto opLoopsPair = |
| tileLinalgOp(b, op, tileSizes, /*permutation=*/{}, &folder); |
    // If tiling occurred successfully, erase the old op.
| if (opLoopsPair) |
| op.erase(); |
| }); |
| f.walk([](LinalgOp op) { |
| if (!op.getOperation()->hasNoSideEffect()) |
| return; |
| if (op.getOperation()->use_empty()) |
| op.erase(); |
| }); |
| } |
| |
| namespace { |
| struct LinalgTilingPass : public FunctionPass<LinalgTilingPass> { |
| LinalgTilingPass() = default; |
| LinalgTilingPass(ArrayRef<int64_t> sizes); |
| |
| void runOnFunction() override { tileLinalgOps(getFunction(), tileSizes); } |
| |
| SmallVector<int64_t, 8> tileSizes; |
| }; |
| } // namespace |
| |
| LinalgTilingPass::LinalgTilingPass(ArrayRef<int64_t> sizes) { |
| this->tileSizes.assign(sizes.begin(), sizes.end()); |
| } |
| |
| std::unique_ptr<OpPassBase<FuncOp>> |
| mlir::linalg::createLinalgTilingPass(ArrayRef<int64_t> tileSizes) { |
| return std::make_unique<LinalgTilingPass>(tileSizes); |
| } |
| |
| static PassRegistration<LinalgTilingPass> |
| pass("linalg-tile", "Tile operations in the linalg dialect", [] { |
| auto pass = std::make_unique<LinalgTilingPass>(); |
| pass->tileSizes.assign(clTileSizes.begin(), clTileSizes.end()); |
| return pass; |
| }); |