Add new OpenMP 4.5 taskloop construct feature

From the standard: The taskloop construct specifies that the iterations of one
or more associated loops will be executed in parallel using OpenMP tasks. The
iterations are distributed across tasks created by the construct and scheduled
to be executed.

This initial implementation uses a simple linear task-distribution algorithm.
Later we can add other algorithms to speed up generation of a huge number of
tasks (i.e., tree-like task generation should be faster).

This needs to be put into the OpenMP runtime library in order for the
compiler team to develop the compiler side of the implementation.

Differential Revision: http://reviews.llvm.org/D17404

llvm-svn: 262535
diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c
index 3ed9a9d..faf1193 100644
--- a/openmp/runtime/src/kmp_tasking.c
+++ b/openmp/runtime/src/kmp_tasking.c
@@ -1000,6 +1000,7 @@
 #if OMP_41_ENABLED
     taskdata->td_flags.proxy           = flags->proxy;
     taskdata->td_task_team         = thread->th.th_task_team;
+    taskdata->td_size_alloc        = shareds_offset + sizeof_shareds;
 #endif
     taskdata->td_flags.tasktype    = TASK_EXPLICIT;
 
@@ -2877,4 +2878,231 @@
     KA_TRACE(10, ("__kmp_proxy_task_completed_ooo(exit): proxy task completing ooo %p\n", taskdata ) );
 }
 
+//---------------------------------------------------------------------------------
+// __kmp_task_dup_alloc: Allocate the taskdata and make a copy of source task for taskloop
+//
+// thread:   allocating thread
+// task_src: pointer to source task to be duplicated
+// returns:  a pointer to the allocated kmp_task_t structure (task).
+kmp_task_t *
+__kmp_task_dup_alloc( kmp_info_t *thread, kmp_task_t *task_src )
+{
+    kmp_task_t     *task;
+    kmp_taskdata_t *taskdata;
+    kmp_taskdata_t *taskdata_src;
+    kmp_taskdata_t *parent_task = thread->th.th_current_task;
+    size_t shareds_offset;
+    size_t task_size;
+
+    KA_TRACE(10, ("__kmp_task_dup_alloc(enter): Th %p, source task %p\n", thread, task_src) );
+    taskdata_src = KMP_TASK_TO_TASKDATA( task_src );
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.proxy == TASK_FULL ); // it should not be a proxy task
+    KMP_DEBUG_ASSERT( taskdata_src->td_flags.tasktype == TASK_EXPLICIT );
+    task_size = taskdata_src->td_size_alloc; // total size recorded when the source was allocated
+
+    // Allocate a kmp_taskdata_t block and a kmp_task_t block.
+    KA_TRACE(30, ("__kmp_task_dup_alloc: Th %p, malloc size %ld\n", thread, task_size) );
+    #if USE_FAST_MEMORY
+    taskdata = (kmp_taskdata_t *)__kmp_fast_allocate( thread, task_size );
+    #else
+    taskdata = (kmp_taskdata_t *)__kmp_thread_malloc( thread, task_size );
+    #endif /* USE_FAST_MEMORY */
+    KMP_MEMCPY(taskdata, taskdata_src, task_size); // bulk-copy taskdata, task and trailing shareds
+
+    task = KMP_TASKDATA_TO_TASK(taskdata);
+
+    // Initialize new task (only specific fields not affected by memcpy)
+    taskdata->td_task_id = KMP_GEN_TASK_ID(); // the copy gets its own unique task id
+    if( task->shareds != NULL ) { // need to set up the shareds pointer to point into the copy
+        shareds_offset = (char*)task_src->shareds - (char*)taskdata_src; // offset within source allocation
+        task->shareds = &((char*)taskdata)[shareds_offset]; // same offset, new base address
+        KMP_DEBUG_ASSERT( (((kmp_uintptr_t)task->shareds) & (sizeof(void*)-1)) == 0 ); // must stay pointer-aligned
+    }
+    taskdata->td_alloc_thread = thread; // record allocating thread for later deallocation
+    taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
+
+    // Only need to keep track of child task counts if team parallel and tasking not serialized
+    if ( !( taskdata->td_flags.team_serial || taskdata->td_flags.tasking_ser ) ) {
+        KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
+        if ( parent_task->td_taskgroup )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_taskgroup->count) );
+        // Only need to keep track of allocated child tasks for explicit tasks since implicit not deallocated
+        if ( taskdata->td_parent->td_flags.tasktype == TASK_EXPLICIT )
+            KMP_TEST_THEN_INC32( (kmp_int32 *)(& taskdata->td_parent->td_allocated_child_tasks) );
+    }
+
+    KA_TRACE(20, ("__kmp_task_dup_alloc(exit): Th %p, created task %p, parent=%p\n",
+                  thread, taskdata, taskdata->td_parent) );
+#if OMPT_SUPPORT
+    __kmp_task_init_ompt(taskdata, thread->th.th_info.ds.ds_gtid, (void*)task->routine);
+#endif
+    return task;
+}
+
+// Routine optionally generated by the compiler for setting the lastprivate flag
+// and calling needed constructors for private/firstprivate objects
+// (used to form taskloop tasks from the pattern task)
+typedef void(*p_task_dup_t)(kmp_task_t *, kmp_task_t *, kmp_int32);
+
+//---------------------------------------------------------------------------------
+// __kmp_taskloop_linear: Start tasks of the taskloop linearly
+//
+// loc       Source location information
+// gtid      Global thread ID
+// task      Pattern task with whole loop iteration range
+// lb        Pointer to loop lower bound
+// ub        Pointer to loop upper bound
+// st        Loop stride
+// sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+// grainsize Schedule value if specified
+// task_dup  Task duplication routine (may be NULL)
+void
+__kmp_taskloop_linear(ident_t *loc, int gtid, kmp_task_t *task,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    p_task_dup_t ptask_dup = (p_task_dup_t)task_dup;
+    kmp_uint64 tc; // trip count of the whole loop
+    kmp_uint64 lower = *lb; // compiler provides global bounds here
+    kmp_uint64 upper = *ub;
+    kmp_uint64 i, num_tasks, extras;
+    kmp_info_t *thread = __kmp_threads[gtid];
+    kmp_taskdata_t *current_task = thread->th.th_current_task;
+    kmp_task_t *next_task;
+    kmp_int32 lastpriv = 0; // set to 1 only for the last generated task
+    size_t lower_offset = (char*)lb - (char*)task; // remember offset of lb in the task structure
+    size_t upper_offset = (char*)ub - (char*)task; // remember offset of ub in the task structure
+
+    // compute trip count
+    if ( st == 1 ) {   // most common case
+        tc = upper - lower + 1;
+    } else if ( st < 0 ) {
+        tc = (lower - upper) / (-st) + 1;
+    } else {       // st > 0
+        tc = (upper - lower) / st + 1;
+    }
+    if(tc == 0) {
+        // free the pattern task and exit
+        __kmp_task_start( gtid, task, current_task );
+        // do not execute anything for zero-trip loop
+        __kmp_task_finish( gtid, task, current_task );
+        return;
+    }
+
+    // compute num_tasks/grainsize based on the input provided
+    switch( sched ) {
+    case 0: // no schedule clause specified, we can choose the default
+            // let's try to schedule (team_size*10) tasks
+        grainsize = thread->th.th_team_nproc * 10; // no break: fall through, treating this as num_tasks (case 2)
+    case 2: // num_tasks provided
+        if( grainsize > tc ) {
+            num_tasks = tc;   // too big num_tasks requested, adjust values
+            grainsize = 1;
+            extras = 0;
+        } else {
+            num_tasks = grainsize;
+            grainsize = tc / num_tasks;
+            extras = tc % num_tasks;
+        }
+        break;
+    case 1: // grainsize provided
+        if( grainsize > tc ) {
+            num_tasks = 1;    // too big grainsize requested, adjust values
+            grainsize = tc;
+            extras = 0;
+        } else {
+            num_tasks = tc / grainsize;
+            grainsize = tc / num_tasks; // adjust grainsize for balanced distribution of iterations
+            extras = tc % num_tasks;
+        }
+        break;
+    default:
+        KMP_ASSERT2(0, "unknown scheduling of taskloop");
+    }
+    KMP_DEBUG_ASSERT(tc == num_tasks * grainsize + extras); // first 'extras' tasks get one extra iteration
+    KMP_DEBUG_ASSERT(num_tasks > extras);
+    KMP_DEBUG_ASSERT(num_tasks > 0);
+
+    // Main loop, launch num_tasks tasks, assign grainsize iterations each task
+    for( i = 0; i < num_tasks; ++i ) {
+        kmp_uint64 chunk_minus_1;
+        if( extras == 0 ) {
+            chunk_minus_1 = grainsize - 1;
+        } else {
+            chunk_minus_1 = grainsize;
+            --extras; // first extras iterations get bigger chunk (grainsize+1)
+        }
+        upper = lower + st * chunk_minus_1; // inclusive upper bound of this task's chunk
+        if( i == num_tasks - 1 ) {
+            // schedule the last task, set lastprivate flag
+            lastpriv = 1;
+#if KMP_DEBUG
+            if( st == 1 )
+                KMP_DEBUG_ASSERT(upper == *ub); // last chunk must end exactly at the global upper bound
+            else if( st > 0 )
+                KMP_DEBUG_ASSERT(upper+st > *ub);
+            else
+                KMP_DEBUG_ASSERT(upper+st < *ub);
+#endif
+        }
+        next_task = __kmp_task_dup_alloc(thread, task); // allocate new task
+        *(kmp_uint64*)((char*)next_task + lower_offset) = lower; // adjust task-specific bounds
+        *(kmp_uint64*)((char*)next_task + upper_offset) = upper;
+        if( ptask_dup != NULL )
+            ptask_dup(next_task, task, lastpriv); // set lastprivate flag, construct firstprivates, etc.
+        __kmp_omp_task(gtid, next_task, true); // schedule new task
+        lower = upper + st; // adjust lower bound for the next iteration
+    }
+    // free the pattern task and exit
+    __kmp_task_start( gtid, task, current_task );
+    // do not execute the pattern task, just do bookkeeping
+    __kmp_task_finish( gtid, task, current_task );
+}
+
+/*!
+@ingroup TASKING
+@param loc       Source location information
+@param gtid      Global thread ID
+@param task      Task structure (the taskloop pattern task)
+@param if_val    Value of the if clause
+@param lb        Pointer to loop lower bound
+@param ub        Pointer to loop upper bound
+@param st        Loop stride
+@param nogroup   Flag, 1 if nogroup clause specified, 0 otherwise
+@param sched     Schedule specified 0/1/2 for none/grainsize/num_tasks
+@param grainsize Schedule value if specified
+@param task_dup  Task duplication routine (may be NULL)
+
+Execute the taskloop construct.
+*/
+void
+__kmpc_taskloop(ident_t *loc, int gtid, kmp_task_t *task, int if_val,
+                kmp_uint64 *lb, kmp_uint64 *ub, kmp_int64 st,
+                int nogroup, int sched, kmp_uint64 grainsize, void *task_dup )
+{
+    kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+    KMP_DEBUG_ASSERT( task != NULL );
+
+    KA_TRACE(10, ("__kmpc_taskloop(enter): T#%d, pattern task %p, lb %lld ub %lld st %lld, grain %llu(%d)\n",
+        gtid, taskdata, *lb, *ub, st, grainsize, sched));
+
+    // check if clause value first
+    if( if_val == 0 ) { // if(0) specified, mark task as serial
+        taskdata->td_flags.task_serial = 1;
+        taskdata->td_flags.tiedness = TASK_TIED; // AC: serial task cannot be untied
+    }
+    if( nogroup == 0 ) { // enclose the taskloop in an implicit taskgroup
+        __kmpc_taskgroup( loc, gtid );
+    }
+
+    if( 1 /* AC: use some heuristic here to choose task scheduling method */ ) {
+        __kmp_taskloop_linear( loc, gtid, task, lb, ub, st, sched, grainsize, task_dup );
+    }
+
+    if( nogroup == 0 ) { // wait for all generated tasks before returning
+        __kmpc_end_taskgroup( loc, gtid );
+    }
+    KA_TRACE(10, ("__kmpc_taskloop(exit): T#%d\n", gtid));
+}
+
 #endif