Rewrite getPrevectorMap using schedule trees operations

Schedule trees are a lot easier to work with, for both humans and machines. For
humans the more structured schedule representation is easier to reason about.
Together with the more abstract isl programming interface this can result in a
lot cleaner code (see this changeset). For machines, the structured schedule and
the fact that we now use explicit piecewise affine expressions instead of
integer maps makes it easier to generate code from this schedule tree. As a
result, we can already see a slight compile-time improvement -- for 3mm from
0m0.593s to 0m0.551s seconds (-7 %). More importantly, future optimizations such
as full-partial tile separation will most likely result in more streamlined code
to be generated.

Contributed-by: Roman Gareev <gareevroman@gmail.com>
llvm-svn: 243458
diff --git a/polly/lib/Transform/ScheduleOptimizer.cpp b/polly/lib/Transform/ScheduleOptimizer.cpp
index 8c35898..26b9668 100644
--- a/polly/lib/Transform/ScheduleOptimizer.cpp
+++ b/polly/lib/Transform/ScheduleOptimizer.cpp
@@ -118,16 +118,14 @@
   /// @return True, if we believe @p NewSchedule is an improvement for @p S.
   bool isProfitableSchedule(Scop &S, __isl_keep isl_union_map *NewSchedule);
 
-  /// @brief Create a map that pre-vectorizes one scheduling dimension.
+  /// @brief Pre-vectorizes one scheduling dimension of a schedule band.
   ///
-  /// getPrevectorMap creates a map that maps each input dimension to the same
-  /// output dimension, except for the dimension DimToVectorize.
-  /// DimToVectorize is strip mined by 'VectorWidth' and the newly created
-  /// point loop of DimToVectorize is moved to the innermost level.
+  /// prevectSchedBand splits out the dimension DimToVectorize, tiles it and
+  /// sinks the resulting point loop.
   ///
-  /// Example (DimToVectorize=0, ScheduleDimensions=2, VectorWidth=4):
+  /// Example (DimToVectorize=0, VectorWidth=4):
   ///
-  /// | Before transformation
+  /// | Before transformation:
   /// |
   /// | A[i,j] -> [i,j]
   /// |
@@ -135,17 +133,12 @@
   /// |    for (j = 0; j < 128; j++)
   /// |      A(i,j);
   ///
-  ///   Prevector map:
-  ///   [i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  ///
   /// | After transformation:
   /// |
-  /// | A[i,j] -> [it,j,ip] : it % 4 = 0 and it <= ip <= it + 3 and i = ip
-  /// |
-  /// | for (it = 0; it < 128; it+=4)
+  /// | for (it = 0; it < 32; it+=1)
   /// |    for (j = 0; j < 128; j++)
-  /// |      for (ip = max(0,it); ip < min(128, it + 3); ip++)
-  /// |        A(ip,j);
+  /// |      for (ip = 0; ip <= 3; ip++)
+  /// |        A(4 * it + ip,j);
   ///
   /// The goal of this transformation is to create a trivially vectorizable
   /// loop.  This means a parallel loop at the innermost level that has a
@@ -156,9 +149,9 @@
   /// DimToVectorize can be divided by VectorWidth. The default VectorWidth is
   /// currently constant and not yet target specific. This function does not
   /// reason about parallelism.
-  static __isl_give isl_map *getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                             int ScheduleDimensions,
-                                             int VectorWidth = 4);
+  static __isl_give isl_schedule_node *
+  prevectSchedBand(__isl_take isl_schedule_node *Node, unsigned DimToVectorize,
+                   int VectorWidth = 4);
 
   /// @brief Apply additional optimizations on the bands in the schedule tree.
   ///
@@ -170,7 +163,7 @@
   ///      - if the band has more than one loop dimension
   ///
   ///  - Prevectorize the schedule of the band (or the point loop in case of
-  ///    tiling)
+  ///    tiling).
   ///      - if vectorization is enabled
   ///
   /// @param Node The schedule node to (possibly) optimize.
@@ -204,58 +197,33 @@
 
 char IslScheduleOptimizer::ID = 0;
 
-__isl_give isl_map *
-IslScheduleOptimizer::getPrevectorMap(isl_ctx *ctx, int DimToVectorize,
-                                      int ScheduleDimensions, int VectorWidth) {
-  isl_space *Space;
-  isl_local_space *LocalSpace, *LocalSpaceRange;
-  isl_set *Modulo;
-  isl_map *TilingMap;
-  isl_constraint *c;
-  isl_aff *Aff;
-  int PointDimension; /* ip */
-  int TileDimension;  /* it */
-  isl_val *VectorWidthMP;
+__isl_give isl_schedule_node *
+IslScheduleOptimizer::prevectSchedBand(__isl_take isl_schedule_node *Node,
+                                       unsigned DimToVectorize,
+                                       int VectorWidth) {
+  assert(isl_schedule_node_get_type(Node) == isl_schedule_node_band);
 
-  assert(0 <= DimToVectorize && DimToVectorize < ScheduleDimensions);
+  auto Space = isl_schedule_node_band_get_space(Node);
+  auto ScheduleDimensions = isl_space_dim(Space, isl_dim_set);
+  isl_space_free(Space);
+  assert(DimToVectorize < ScheduleDimensions);
 
-  Space = isl_space_alloc(ctx, 0, ScheduleDimensions, ScheduleDimensions + 1);
-  TilingMap = isl_map_universe(isl_space_copy(Space));
-  LocalSpace = isl_local_space_from_space(Space);
-  PointDimension = ScheduleDimensions;
-  TileDimension = DimToVectorize;
-
-  // Create an identity map for everything except DimToVectorize and map
-  // DimToVectorize to the point loop at the innermost dimension.
-  for (int i = 0; i < ScheduleDimensions; i++)
-    if (i == DimToVectorize)
-      TilingMap =
-          isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, PointDimension);
-    else
-      TilingMap = isl_map_equate(TilingMap, isl_dim_in, i, isl_dim_out, i);
-
-  // it % 'VectorWidth' = 0
-  LocalSpaceRange = isl_local_space_range(isl_local_space_copy(LocalSpace));
-  Aff = isl_aff_zero_on_domain(LocalSpaceRange);
-  Aff = isl_aff_set_constant_si(Aff, VectorWidth);
-  Aff = isl_aff_set_coefficient_si(Aff, isl_dim_in, TileDimension, 1);
-  VectorWidthMP = isl_val_int_from_si(ctx, VectorWidth);
-  Aff = isl_aff_mod_val(Aff, VectorWidthMP);
-  Modulo = isl_pw_aff_zero_set(isl_pw_aff_from_aff(Aff));
-  TilingMap = isl_map_intersect_range(TilingMap, Modulo);
-
-  // it <= ip
-  TilingMap = isl_map_order_le(TilingMap, isl_dim_out, TileDimension,
-                               isl_dim_out, PointDimension);
-
-  // ip <= it + ('VectorWidth' - 1)
-  c = isl_inequality_alloc(LocalSpace);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, TileDimension, 1);
-  isl_constraint_set_coefficient_si(c, isl_dim_out, PointDimension, -1);
-  isl_constraint_set_constant_si(c, VectorWidth - 1);
-  TilingMap = isl_map_add_constraint(TilingMap, c);
-
-  return TilingMap;
+  if (DimToVectorize > 0) {
+    Node = isl_schedule_node_band_split(Node, DimToVectorize);
+    Node = isl_schedule_node_child(Node, 0);
+  }
+  if (DimToVectorize < ScheduleDimensions - 1)
+    Node = isl_schedule_node_band_split(Node, 1);
+  Space = isl_schedule_node_band_get_space(Node);
+  auto Sizes = isl_multi_val_zero(Space);
+  auto Ctx = isl_schedule_node_get_ctx(Node);
+  Sizes =
+      isl_multi_val_set_val(Sizes, 0, isl_val_int_from_si(Ctx, VectorWidth));
+  Node = isl_schedule_node_band_tile(Node, Sizes);
+  Node = isl_schedule_node_child(Node, 0);
+  Node = isl_schedule_node_band_sink(Node);
+  Node = isl_schedule_node_child(Node, 0);
+  return Node;
 }
 
 isl_schedule_node *IslScheduleOptimizer::optimizeBand(isl_schedule_node *Node,
@@ -307,21 +275,12 @@
   if (PollyVectorizerChoice == VECTORIZER_NONE)
     return Res;
 
-  auto Schedule = isl_schedule_node_band_get_partial_schedule(Res);
-
-  for (int i = Dims - 1; i >= 0; i--) {
+  for (int i = Dims - 1; i >= 0; i--)
     if (isl_schedule_node_band_member_get_coincident(Res, i)) {
-      auto TileMap = IslScheduleOptimizer::getPrevectorMap(Ctx, i, Dims);
-      auto TileUMap = isl_union_map_from_map(TileMap);
-      auto Schedule2 = isl_union_map_apply_range(
-          isl_union_map_from_multi_union_pw_aff(Schedule), TileUMap);
-      Schedule = isl_multi_union_pw_aff_from_union_map(Schedule2);
+      Res = IslScheduleOptimizer::prevectSchedBand(Res, i);
       break;
     }
-  }
 
-  Res = isl_schedule_node_delete(Res);
-  Res = isl_schedule_node_insert_partial_schedule(Res, Schedule);
   return Res;
 }