Add recursive task scheduling strategy to taskloop implementation

Summary:
The taskloop implementation is extended with a recursive task scheduling
strategy. The environment variable KMP_TASKLOOP_MIN_TASKS is added as a manual
threshold that lets the user switch from recursive to linear task scheduling
(see the usage sketch below).
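
A hedged usage sketch follows; the loop body, the array, and the value 64 are
made-up examples, and only the KMP_TASKLOOP_MIN_TASKS name comes from this
patch. The runtime splits the iteration range recursively while a sub-range
still yields more than the threshold number of tasks, then switches to linear
scheduling; the default value 0 selects an internal heuristic instead.

    // taskloop_min_tasks_example.c -- build with -fopenmp, run e.g.:
    //   KMP_TASKLOOP_MIN_TASKS=64 ./a.out
    #include <stdio.h>
    #define N 1000000

    static double a[N];

    int main(void) {
      #pragma omp parallel
      #pragma omp single
      #pragma omp taskloop grainsize(100)   // ~10000 tasks for this range
      for (int i = 0; i < N; ++i)
        a[i] = 2.0 * i;
      printf("a[N-1] = %f\n", a[N - 1]);
      return 0;
    }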

Details:
* The calculation of the loop parameters is moved out of __kmp_taskloop_linear
  to the upper level.
* The initial calculation is done in __kmpc_taskloop; further range splitting
  is done in __kmp_taskloop_recur.
* Added a threshold to switch from recursive to linear task scheduling.
* One half of the split range is scheduled as an internal task that just passes
  the sub-range parameters to the stealing thread, which continues recursive
  scheduling (if the number of tasks is still large enough); the other half is
  processed recursively (see the sketch after this list).
* The internal task duplication routine now also assigns the parent task; this
  was not needed while all tasks were scheduled by the same thread, but is
  needed now.
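
A minimal standalone sketch of this split-and-steal scheme (all names here,
split_range, run_linear, min_tasks, are illustrative and do not correspond
one-to-one to the runtime's __kmp_taskloop_recur/__kmp_taskloop_linear; the
real code also carries grainsize, extras and lastprivate bookkeeping):

    // recursive_split_sketch.c -- simplified model of the scheduling strategy
    #include <stdio.h>

    typedef struct { long lb, ub; } range_t;

    static void run_linear(range_t r) {
      // In the runtime this step creates one task per grainsize chunk.
      printf("linear over [%ld, %ld]\n", r.lb, r.ub);
    }

    static void split_range(range_t r, long num_tasks, long min_tasks) {
      if (num_tasks <= min_tasks) { // threshold reached: stop splitting
        run_linear(r);
        return;
      }
      long n_first = num_tasks / 2;        // kept by the current thread
      long n_second = num_tasks - n_first; // handed off as an internal task
      long mid = r.lb + (r.ub - r.lb) / 2;
      range_t first = { r.lb, mid };
      range_t second = { mid + 1, r.ub };
      // In the runtime, `second` is packed into an auxiliary task so that a
      // stealing thread can continue splitting it; here we simply recurse.
      split_range(second, n_second, min_tasks);
      split_range(first, n_first, min_tasks);
    }

    int main(void) {
      range_t whole = { 0, 1023 };
      split_range(whole, 16, 4); // 16 tasks total, go linear at <= 4 tasks
      return 0;
    }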

Patch by Andrey Churbanov

Differential Revision: https://reviews.llvm.org/D35273

llvm-svn: 308338
diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp
index 8f1eb02..27207b3 100644
--- a/openmp/runtime/src/kmp_tasking.cpp
+++ b/openmp/runtime/src/kmp_tasking.cpp
@@ -3237,6 +3237,7 @@
                      0);
   }
   taskdata->td_alloc_thread = thread;
+  taskdata->td_parent = parent_task;
   taskdata->td_taskgroup =
       parent_task
           ->td_taskgroup; // task inherits the taskgroup from the parent task
@@ -3263,32 +3264,37 @@
   return task;
 }
 
-// Routine optionally generated by th ecompiler for setting the lastprivate flag
+// Routine optionally generated by the compiler for setting the lastprivate flag
 // and calling needed constructors for private/firstprivate objects
 // (used to form taskloop tasks from pattern task)
+// Parameters: dest task, src task, lastprivate flag.
 typedef void (*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
 
 // __kmp_taskloop_linear: Start tasks of the taskloop linearly
 //
 // loc       Source location information
 // gtid      Global thread ID
-// task      Task with whole loop iteration range
-// lb        Pointer to loop lower bound
-// ub        Pointer to loop upper bound
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
 // st        Loop stride
-// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-// grainsize Schedule value if specified
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
 // task_dup  Tasks duplication routine
 void __kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
                            kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
-                           int sched, kmp_uint64 grainsize, void *task_dup) {
+                           kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                           kmp_uint64 grainsize, kmp_uint64 extras,
+                           kmp_uint64 tc, void *task_dup) {
   KMP_COUNT_BLOCK(OMP_TASKLOOP);
   KMP_TIME_PARTITIONED_BLOCK(OMP_taskloop_scheduling);
   p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
-  kmp_uint64 tc;
   kmp_uint64 lower = *lb; // compiler provides global bounds here
   kmp_uint64 upper = *ub;
-  kmp_uint64 i, num_tasks = 0, extras = 0;
+  kmp_uint64 i;
   kmp_info_t *thread = __kmp_threads[gtid];
   kmp_taskdata_t *current_task = thread->th.th_current_task;
   kmp_task_t *next_task;
@@ -3298,6 +3304,254 @@
   size_t upper_offset =
       (char *)ub - (char *)task; // remember offset of ub in the task structure
 
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+  KA_TRACE(20, ("__kmp_taskloop_linear: T#%d: %lld tasks, grainsize %lld, "
+                "extras %lld, i=%lld,%lld(%d)%lld, dup %p\n", gtid, num_tasks,
+                grainsize, extras, lower, upper, ub_glob, st, task_dup));
+
+  // Launch num_tasks tasks, assign grainsize iterations each task
+  for (i = 0; i < num_tasks; ++i) {
+    kmp_uint64 chunk_minus_1;
+    if (extras == 0) {
+      chunk_minus_1 = grainsize - 1;
+    } else {
+      chunk_minus_1 = grainsize;
+      --extras; // first extras iterations get bigger chunk (grainsize+1)
+    }
+    upper = lower + st * chunk_minus_1;
+    if (i == num_tasks - 1) {
+      // schedule the last task, set lastprivate flag if needed
+      if (st == 1) { // most common case
+        KMP_DEBUG_ASSERT(upper == *ub);
+        if (upper == ub_glob)
+          lastpriv = 1;
+      } else if (st > 0) { // positive loop stride
+        KMP_DEBUG_ASSERT((kmp_uint64)st > *ub - upper);
+        if ((kmp_uint64)st > ub_glob - upper)
+          lastpriv = 1;
+      } else {  // negative loop stride
+        KMP_DEBUG_ASSERT(upper + st < *ub);
+        if (upper - ub_glob < (kmp_uint64)(-st))
+          lastpriv = 1;
+      }
+    }
+    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+    // adjust task-specific bounds
+    *(kmp_uint64 *)((char *)next_task + lower_offset) = lower;
+    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
+    if (ptask_dup != NULL) // set lastprivate flag, construct firstprivates, etc.
+      ptask_dup(next_task, task, lastpriv);
+    KA_TRACE(40, ("__kmp_taskloop_linear: T#%d; task %p: lower %lld, "
+                  "upper %lld (offsets %p %p)\n",
+                  gtid, next_task, lower, upper, lower_offset, upper_offset));
+    __kmp_omp_task(gtid, next_task, true); // schedule new task
+    lower = upper + st; // adjust lower bound for the next iteration
+  }
+  // free the pattern task and exit
+  __kmp_task_start(gtid, task, current_task); // make internal bookkeeping
+  // do not execute the pattern task, just do internal bookkeeping
+  __kmp_task_finish(gtid, task, current_task);
+}
+
+// Structure to keep taskloop parameters for auxiliary task
+// kept in the shareds of the task structure.
+typedef struct __taskloop_params {
+  kmp_task_t *task;
+  kmp_uint64 *lb;
+  kmp_uint64 *ub;
+  void *task_dup;
+  kmp_int64 st;
+  kmp_uint64 ub_glob;
+  kmp_uint64 num_tasks;
+  kmp_uint64 grainsize;
+  kmp_uint64 extras;
+  kmp_uint64 tc;
+  kmp_uint64 num_t_min;
+} __taskloop_params_t;
+
+void __kmp_taskloop_recur(ident_t *, int, kmp_task_t *, kmp_uint64 *,
+                          kmp_uint64 *, kmp_int64, kmp_uint64, kmp_uint64,
+                          kmp_uint64, kmp_uint64, kmp_uint64, kmp_uint64,
+                          void *);
+
+// Execute part of the taskloop submitted as a task.
+int __kmp_taskloop_task(int gtid, void *ptask) {
+  __taskloop_params_t *p = (__taskloop_params_t*)((kmp_task_t*)ptask)->shareds;
+  kmp_task_t *task = p->task;
+  kmp_uint64 *lb = p->lb;
+  kmp_uint64 *ub = p->ub;
+  void *task_dup = p->task_dup;
+//  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_int64 st = p->st;
+  kmp_uint64 ub_glob = p->ub_glob;
+  kmp_uint64 num_tasks = p->num_tasks;
+  kmp_uint64 grainsize = p->grainsize;
+  kmp_uint64 extras = p->extras;
+  kmp_uint64 tc = p->tc;
+  kmp_uint64 num_t_min = p->num_t_min;
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KA_TRACE(20, ("__kmp_taskloop_task: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  KMP_DEBUG_ASSERT(num_tasks*2+1 > num_t_min);
+  if (num_tasks > num_t_min)
+    __kmp_taskloop_recur(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(NULL, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+
+  KA_TRACE(40, ("__kmp_taskloop_task(exit): T#%d\n", gtid));
+  return 0;
+}
+
+// Schedule part of the taskloop as a task,
+// execute the rest of the taskloop.
+//
+// loc       Source location information
+// gtid      Global thread ID
+// task      Pattern task, exposes the loop iteration range
+// lb        Pointer to loop lower bound in task structure
+// ub        Pointer to loop upper bound in task structure
+// st        Loop stride
+// ub_glob   Global upper bound (used for lastprivate check)
+// num_tasks Number of tasks to execute
+// grainsize Number of loop iterations per task
+// extras    Number of chunks with grainsize+1 iterations
+// tc        Iterations count
+// num_t_min Threshold to launch tasks recursively
+// task_dup  Tasks duplication routine
+void __kmp_taskloop_recur(ident_t *loc, int gtid, kmp_task_t *task,
+                          kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                          kmp_uint64 ub_glob, kmp_uint64 num_tasks,
+                          kmp_uint64 grainsize, kmp_uint64 extras,
+                          kmp_uint64 tc, kmp_uint64 num_t_min, void *task_dup) {
+#if KMP_DEBUG
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+  KMP_DEBUG_ASSERT(num_tasks > num_t_min);
+  KA_TRACE(20, ("__kmp_taskloop_recur: T#%d, task %p: %lld tasks, grainsize"
+                " %lld, extras %lld, i=%lld,%lld(%d), dup %p\n", gtid, taskdata,
+                num_tasks, grainsize, extras, *lb, *ub, st, task_dup));
+#endif
+  p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+  kmp_uint64 lower = *lb;
+  kmp_uint64 upper = *ub;
+  kmp_info_t *thread = __kmp_threads[gtid];
+//  kmp_taskdata_t *current_task = thread->th.th_current_task;
+  kmp_task_t *next_task;
+  kmp_int32 lastpriv = 0;
+  size_t lower_offset =
+      (char *)lb - (char *)task; // remember offset of lb in the task structure
+  size_t upper_offset =
+      (char *)ub - (char *)task; // remember offset of ub in the task structure
+
+  KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
+  KMP_DEBUG_ASSERT(num_tasks > extras);
+  KMP_DEBUG_ASSERT(num_tasks > 0);
+
+  // split the loop in two halves
+  kmp_uint64 lb1, ub0, tc0, tc1, ext0, ext1;
+  kmp_uint64 gr_size0 = grainsize;
+  kmp_uint64 n_tsk0 = num_tasks >> 1; // num_tasks/2 to execute
+  kmp_uint64 n_tsk1 = num_tasks - n_tsk0; // to schedule as a task
+  if (n_tsk0 <= extras) {
+    gr_size0++; // integrate extras into grainsize
+    ext0 = 0; // no extra iters in 1st half
+    ext1 = extras - n_tsk0; // remaining extras
+    tc0 = gr_size0 * n_tsk0;
+    tc1 = tc - tc0;
+  } else { // n_tsk0 > extras
+    ext1 = 0; // no extra iters in 2nd half
+    ext0 = extras;
+    tc1 = grainsize * n_tsk1;
+    tc0 = tc - tc1;
+  }
+  ub0 = lower + st * (tc0 - 1);
+  lb1 = ub0 + st;
+
+  // create pattern task for 2nd half of the loop
+  next_task = __kmp_task_dup_alloc(thread, task); // duplicate the task
+  // adjust lower bound (upper bound is not changed) for the 2nd half
+  *(kmp_uint64 *)((char *)next_task + lower_offset) = lb1;
+  if (ptask_dup != NULL) // construct firstprivates, etc.
+    ptask_dup(next_task, task, 0);
+  *ub = ub0; // adjust upper bound for the 1st half
+
+  // create auxiliary task for 2nd half of the loop
+  kmp_task_t *new_task =
+      __kmpc_omp_task_alloc(loc, gtid, 1, 3 * sizeof(void*),
+                            sizeof(__taskloop_params_t), &__kmp_taskloop_task);
+  __taskloop_params_t * p = (__taskloop_params_t *)new_task->shareds;
+  p->task = next_task;
+  p->lb = (kmp_uint64 *)((char *)next_task + lower_offset);
+  p->ub = (kmp_uint64 *)((char *)next_task + upper_offset);
+  p->task_dup = task_dup;
+  p->st = st;
+  p->ub_glob = ub_glob;
+  p->num_tasks = n_tsk1;
+  p->grainsize = grainsize;
+  p->extras = ext1;
+  p->tc = tc1;
+  p->num_t_min = num_t_min;
+  __kmp_omp_task(gtid, new_task, true); // schedule new task
+
+  // execute the 1st half of current subrange
+  if (n_tsk0 > num_t_min)
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                         gr_size0, ext0, tc0, num_t_min, task_dup);
+  else
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, n_tsk0,
+                          gr_size0, ext0, tc0, task_dup);
+
+  KA_TRACE(40, ("__kmpc_taskloop_recur(exit): T#%d\n", gtid));
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound in task structure
+@param ub        Pointer to loop upper bound in task structure
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Tasks duplication routine
+
+Execute the taskloop construct.
+*/
+void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
+                     int sched, kmp_uint64 grainsize, void *task_dup) {
+  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
+  KMP_DEBUG_ASSERT(task != NULL);
+
+  KA_TRACE(20, ("__kmpc_taskloop: T#%d, task %p, lb %lld, ub %lld, st %lld, "
+                "grain %llu(%d), dup %p\n", gtid, taskdata, *lb, *ub, st,
+                grainsize, sched, task_dup));
+
+  if (nogroup == 0)
+    __kmpc_taskgroup(loc, gtid);
+
+  // =========================================================================
+  // calculate loop parameters
+  kmp_uint64 tc;
+  kmp_uint64 lower = *lb; // compiler provides global bounds here
+  kmp_uint64 upper = *ub;
+  kmp_uint64 ub_glob = upper; // global upper used to calc lastprivate flag
+  kmp_uint64 num_tasks = 0, extras = 0;
+  kmp_uint64 num_tasks_min = __kmp_taskloop_min_tasks;
+  kmp_info_t *thread = __kmp_threads[gtid];
+  kmp_taskdata_t *current_task = thread->th.th_current_task;
+
   // compute trip count
   if (st == 1) { // most common case
     tc = upper - lower + 1;
@@ -3314,6 +3568,10 @@
     __kmp_task_finish(gtid, task, current_task);
     return;
   }
+  if (num_tasks_min == 0)
+    // TODO: can we choose better default heuristic?
+    num_tasks_min = KMP_MIN(thread->th.th_team_nproc * 10,
+                            INITIAL_TASK_DEQUE_SIZE);
 
   // compute num_tasks/grainsize based on the input provided
   switch (sched) {
@@ -3338,9 +3596,8 @@
       extras = 0;
     } else {
       num_tasks = tc / grainsize;
-      grainsize =
-          tc /
-          num_tasks; // adjust grainsize for balanced distribution of iterations
+      // adjust grainsize for balanced distribution of iterations
+      grainsize = tc / num_tasks;
       extras = tc % num_tasks;
     }
     break;
@@ -3350,95 +3607,32 @@
   KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras);
   KMP_DEBUG_ASSERT(num_tasks > extras);
   KMP_DEBUG_ASSERT(num_tasks > 0);
-  KA_TRACE(20, ("__kmpc_taskloop: T#%d will launch: num_tasks %lld, grainsize "
-                "%lld, extras %lld\n",
-                gtid, num_tasks, grainsize, extras));
-
-  // Main loop, launch num_tasks tasks, assign grainsize iterations each task
-  for (i = 0; i < num_tasks; ++i) {
-    kmp_uint64 chunk_minus_1;
-    if (extras == 0) {
-      chunk_minus_1 = grainsize - 1;
-    } else {
-      chunk_minus_1 = grainsize;
-      --extras; // first extras iterations get bigger chunk (grainsize+1)
-    }
-    upper = lower + st * chunk_minus_1;
-    if (i == num_tasks - 1) {
-      // schedule the last task, set lastprivate flag
-      lastpriv = 1;
-#if KMP_DEBUG
-      if (st == 1)
-        KMP_DEBUG_ASSERT(upper == *ub);
-      else if (st > 0)
-        KMP_DEBUG_ASSERT(upper + st > *ub);
-      else
-        KMP_DEBUG_ASSERT(upper + st < *ub);
-#endif
-    }
-    next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
-    *(kmp_uint64 *)((char *)next_task + lower_offset) =
-        lower; // adjust task-specific bounds
-    *(kmp_uint64 *)((char *)next_task + upper_offset) = upper;
-    if (ptask_dup != NULL)
-      ptask_dup(next_task, task,
-                lastpriv); // set lastprivate flag, construct fistprivates, etc.
-    KA_TRACE(20, ("__kmpc_taskloop: T#%d schedule task %p: lower %lld, upper "
-                  "%lld (offsets %p %p)\n",
-                  gtid, next_task, lower, upper, lower_offset, upper_offset));
-    __kmp_omp_task(gtid, next_task, true); // schedule new task
-    lower = upper + st; // adjust lower bound for the next iteration
-  }
-  // free the pattern task and exit
-  __kmp_task_start(gtid, task, current_task);
-  // do not execute the pattern task, just do bookkeeping
-  __kmp_task_finish(gtid, task, current_task);
-}
-
-/*!
-@ingroup TASKING
-@param loc       Source location information
-@param gtid      Global thread ID
-@param task      Task structure
-@param if_val    Value of the if clause
-@param lb        Pointer to loop lower bound
-@param ub        Pointer to loop upper bound
-@param st        Loop stride
-@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
-@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
-@param grainsize Schedule value if specified
-@param task_dup  Tasks duplication routine
-
-Execute the taskloop construct.
-*/
-void __kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
-                     kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st, int nogroup,
-                     int sched, kmp_uint64 grainsize, void *task_dup) {
-  kmp_taskdata_t *taskdata = KMP_TASK_TO_TASKDATA(task);
-  KMP_DEBUG_ASSERT(task != NULL);
-
-  KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub "
-                "%lld st %lld, grain %llu(%d)\n",
-                gtid, taskdata, *lb, *ub, st, grainsize, sched));
+  // =========================================================================
 
   // check if clause value first
   if (if_val == 0) { // if(0) specified, mark task as serial
     taskdata->td_flags.task_serial = 1;
     taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
-  }
-  if (nogroup == 0) {
-    __kmpc_taskgroup(loc, gtid);
+    // always start serial tasks linearly
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
+  } else if (num_tasks > num_tasks_min) {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go recursive: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_recur(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                         grainsize, extras, tc, num_tasks_min, task_dup);
+  } else {
+    KA_TRACE(20, ("__kmpc_taskloop: T#%d, go linear: tc %llu, #tasks %llu"
+                  "(%lld), grain %llu, extras %llu\n", gtid, tc, num_tasks,
+                  num_tasks_min, grainsize, extras));
+    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, ub_glob, num_tasks,
+                          grainsize, extras, tc, task_dup);
   }
 
-  if (1 /* AC: use some heuristic here to choose task scheduling method */) {
-    __kmp_taskloop_linear(loc, gtid, task, lb, ub, st, sched, grainsize,
-                          task_dup);
-  }
-
-  if (nogroup == 0) {
+  if (nogroup == 0)
     __kmpc_end_taskgroup(loc, gtid);
-  }
-  KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+  KA_TRACE(20, ("__kmpc_taskloop(exit): T#%d\n", gtid));
 }
 
 #endif