For your Christmas hacking pleasure.
This release aligns with Intel(r) Composer XE 2013 SP1 Product Update 2

New features
* The library can now be built with clang (though with some
  limitations since clang does not support 128-bit floats)
* Support for VTune analysis of load imbalance
* Code contribution from Steven Noonan to build the runtime for ARM*
  architecture processors 
* First implementation of runtime API for OpenMP cancellation

Bug Fixes
* Fixed hang on Windows (only) when using KMP_BLOCKTIME=0

llvm-svn: 197914
diff --git a/openmp/runtime/src/kmp_tasking.c b/openmp/runtime/src/kmp_tasking.c
index ea5cdc0..8cac009 100644
--- a/openmp/runtime/src/kmp_tasking.c
+++ b/openmp/runtime/src/kmp_tasking.c
@@ -1,7 +1,7 @@
 /*
  * kmp_tasking.c -- OpenMP 3.0 tasking support.
- * $Revision: 42522 $
- * $Date: 2013-07-16 05:28:49 -0500 (Tue, 16 Jul 2013) $
+ * $Revision: 42852 $
+ * $Date: 2013-12-04 10:50:49 -0600 (Wed, 04 Dec 2013) $
  */
 
 
@@ -620,13 +620,28 @@
 #if OMP_40_ENABLED
         if ( taskdata->td_taskgroup )
             KMP_TEST_THEN_DEC32( (kmp_int32 *)(& taskdata->td_taskgroup->count) );
-        __kmp_release_deps(gtid,taskdata);    
+        __kmp_release_deps(gtid,taskdata);
 #endif
     }
 
     KA_TRACE(20, ("__kmp_task_finish: T#%d finished task %p, %d incomplete children\n",
                   gtid, taskdata, children) );
 
+#if OMP_40_ENABLED
+    /* If the tasks' destructor thunk flag has been set, we need to invoke the
+       destructor thunk that has been generated by the compiler.
+       The code is placed here, since at this point other tasks might have been released
+       hence overlapping the destructor invokations with some other work in the
+       released tasks.  The OpenMP spec is not specific on when the destructors are
+       invoked, so we should be free to choose.
+     */
+    if (taskdata->td_flags.destructors_thunk) {
+        kmp_routine_entry_t destr_thunk = task->destructors;
+        KMP_ASSERT(destr_thunk);
+        destr_thunk(gtid, task);
+    }
+#endif // OMP_40_ENABLED
+
     // bookkeeping for resuming task:
     // GEH - note tasking_ser => task_serial
     KMP_DEBUG_ASSERT( (taskdata->td_flags.tasking_ser || taskdata->td_flags.task_serial) ==
@@ -739,10 +754,10 @@
     task->td_flags.complete    = 0;
     task->td_flags.freed       = 0;
 
-#if OMP_40_ENABLED    
+#if OMP_40_ENABLED
     task->td_dephash = NULL;
     task->td_depnode = NULL;
-#endif    
+#endif
 
     if (set_curr_task) {  // only do this initialization the first time a thread is created
         task->td_incomplete_child_tasks = 0;
@@ -850,7 +865,7 @@
 
     taskdata->td_task_id      = KMP_GEN_TASK_ID();
     taskdata->td_team         = team;
-    taskdata->td_alloc_thread = thread; 
+    taskdata->td_alloc_thread = thread;
     taskdata->td_parent       = parent_task;
     taskdata->td_level        = parent_task->td_level + 1; // increment nesting level
     taskdata->td_ident        = loc_ref;
@@ -863,6 +878,9 @@
     taskdata->td_flags.tiedness    = flags->tiedness;
     taskdata->td_flags.final       = flags->final;
     taskdata->td_flags.merged_if0  = flags->merged_if0;
+#if OMP_40_ENABLED
+    taskdata->td_flags.destructors_thunk = flags->destructors_thunk;
+#endif // OMP_40_ENABLED
     taskdata->td_flags.tasktype    = TASK_EXPLICIT;
 
     // GEH - TODO: fix this to copy parent task's value of tasking_ser flag
@@ -890,7 +908,7 @@
     taskdata->td_taskgroup = parent_task->td_taskgroup; // task inherits the taskgroup from the parent task
     taskdata->td_dephash = NULL;
     taskdata->td_depnode = NULL;
-#endif
+#endif 
     // Only need to keep track of child task counts if team parallel and tasking not serialized
     if ( !( taskdata -> td_flags.team_serial || taskdata -> td_flags.tasking_ser ) ) {
         KMP_TEST_THEN_INC32( (kmp_int32 *)(& parent_task->td_incomplete_child_tasks) );
@@ -946,24 +964,46 @@
 __kmp_invoke_task( kmp_int32 gtid, kmp_task_t *task, kmp_taskdata_t * current_task )
 {
     kmp_taskdata_t * taskdata = KMP_TASK_TO_TASKDATA(task);
+#if OMP_40_ENABLED
+    int discard = 0 /* false */;
+#endif
     KA_TRACE(30, ("__kmp_invoke_task(enter): T#%d invoking task %p, current_task=%p\n",
                   gtid, taskdata, current_task) );
 
     __kmp_task_start( gtid, task, current_task );
 
+#if OMP_40_ENABLED
+    // TODO: cancel tasks if the parallel region has also been cancelled
+    // TODO: check if this sequence can be hoisted above __kmp_task_start
+    // if cancellation has been enabled for this run ...
+    if (__kmp_omp_cancellation) {
+        kmp_info_t *this_thr = __kmp_threads [ gtid ];
+        kmp_team_t * this_team = this_thr->th.th_team;
+        kmp_taskgroup_t * taskgroup = taskdata->td_taskgroup;
+        if ((taskgroup && taskgroup->cancel_request) || (this_team->t.t_cancel_request == cancel_parallel)) {
+            // this task belongs to a task group and we need to cancel it
+            discard = 1 /* true */;
+        }
+    }
+
     //
     // Invoke the task routine and pass in relevant data.
     // Thunks generated by gcc take a different argument list.
     //
+    if (!discard) {
+#endif // OMP_40_ENABLED
 #ifdef KMP_GOMP_COMPAT
-    if (taskdata->td_flags.native) {
-        ((void (*)(void *))(*(task->routine)))(task->shareds);
-    }
-    else
+        if (taskdata->td_flags.native) {
+            ((void (*)(void *))(*(task->routine)))(task->shareds);
+        }
+        else
 #endif /* KMP_GOMP_COMPAT */
-    {
-        (*(task->routine))(gtid, task);
+        {
+            (*(task->routine))(gtid, task);
+        }
+#if OMP_40_ENABLED
     }
+#endif // OMP_40_ENABLED
 
     __kmp_task_finish( gtid, task, current_task );
 
@@ -1079,10 +1119,8 @@
             // GEH: if team serialized, avoid reading the volatile variable below.
             while ( TCR_4(taskdata -> td_incomplete_child_tasks) != 0 ) {
                 __kmp_execute_tasks( thread, gtid, &(taskdata->td_incomplete_child_tasks),
-                                     0, FALSE, &thread_finished, 
-#if USE_ITT_BUILD
-                                     itt_sync_obj, 
-#endif /* USE_ITT_BUILD */
+                                     0, FALSE, &thread_finished
+                                     USE_ITT_BUILD_ARG(itt_sync_obj),
                                      __kmp_task_stealing_constraint );
             }
         }
@@ -1134,10 +1172,8 @@
             __kmp_itt_taskwait_starting( gtid, itt_sync_obj );
 #endif /* USE_ITT_BUILD */
         if ( ! taskdata->td_flags.team_serial ) {
-            __kmp_execute_tasks( thread, gtid, NULL, 0, FALSE, &thread_finished,
-#if USE_ITT_BUILD
-                                 itt_sync_obj, 
-#endif /* USE_ITT_BUILD */
+            __kmp_execute_tasks( thread, gtid, NULL, 0, FALSE, &thread_finished
+                                 USE_ITT_BUILD_ARG(itt_sync_obj),
                                  __kmp_task_stealing_constraint );
         }
 
@@ -1162,7 +1198,7 @@
 // __kmpc_taskgroup: Start a new taskgroup
 
 void
-__kmpc_taskgroup( ident* loc, int gtid )
+__kmpc_taskgroup( ident_t* loc, int gtid )
 {
     kmp_info_t      * thread = __kmp_threads[ gtid ];
     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
@@ -1170,6 +1206,7 @@
         (kmp_taskgroup_t *)__kmp_thread_malloc( thread, sizeof( kmp_taskgroup_t ) );
     KA_TRACE(10, ("__kmpc_taskgroup: T#%d loc=%p group=%p\n", gtid, loc, tg_new) );
     tg_new->count = 0;
+    tg_new->cancel_request = cancel_noreq;
     tg_new->parent = taskdata->td_taskgroup;
     taskdata->td_taskgroup = tg_new;
 }
@@ -1180,7 +1217,7 @@
 //                       and its descendants are complete
 
 void
-__kmpc_end_taskgroup( ident* loc, int gtid )
+__kmpc_end_taskgroup( ident_t* loc, int gtid )
 {
     kmp_info_t      * thread = __kmp_threads[ gtid ];
     kmp_taskdata_t  * taskdata = thread->th.th_current_task;
@@ -1201,10 +1238,8 @@
         if ( ! taskdata->td_flags.team_serial ) {
             while ( TCR_4(taskgroup->count) != 0 ) {
                 __kmp_execute_tasks( thread, gtid, &(taskgroup->count),
-                                     0, FALSE, &thread_finished, 
-#if USE_ITT_BUILD
-                                     itt_sync_obj,
-#endif /* USE_ITT_BUILD */
+                                     0, FALSE, &thread_finished
+                                     USE_ITT_BUILD_ARG(itt_sync_obj),
                                      __kmp_task_stealing_constraint );
             }
         }
@@ -1420,15 +1455,13 @@
 // checker is the value to check to terminate the spin.
 
 int
-__kmp_execute_tasks( kmp_info_t *thread, 
-                     kmp_int32 gtid, 
+__kmp_execute_tasks( kmp_info_t *thread,
+                     kmp_int32 gtid,
                      volatile kmp_uint *spinner,
                      kmp_uint checker,
-                     int final_spin, 
-                     int *thread_finished, 
-#if USE_ITT_BUILD
-                     void * itt_sync_obj,
-#endif /* USE_ITT_BUILD */
+                     int final_spin,
+                     int *thread_finished
+                     USE_ITT_BUILD_ARG(void * itt_sync_obj),
                      kmp_int32 is_constrained )
 {
     kmp_task_team_t *     task_team;
@@ -2297,11 +2330,9 @@
 // in team > 1 !
 
 void
-__kmp_task_team_wait( kmp_info_t *this_thr, 
+__kmp_task_team_wait( kmp_info_t *this_thr,
                       kmp_team_t *team
-#if USE_ITT_BUILD
-                      , void * itt_sync_obj
-#endif /* USE_ITT_BUILD */
+                      USE_ITT_BUILD_ARG(void * itt_sync_obj)
                       )
 {
     kmp_task_team_t *task_team = team->t.t_task_team;
@@ -2320,9 +2351,7 @@
         // termination condition.
         //
         __kmp_wait_sleep( this_thr, &task_team->tt.tt_unfinished_threads, 0, TRUE
-#if USE_ITT_BUILD
-                          , itt_sync_obj
-#endif /* USE_ITT_BUILD */
+                          USE_ITT_BUILD_ARG(itt_sync_obj)
                           );
 
         //
@@ -2361,7 +2390,8 @@
 #if USE_ITT_BUILD
     KMP_FSYNC_SPIN_INIT( spin, (kmp_uint32*) NULL );
 #endif /* USE_ITT_BUILD */
-    while (! __kmp_execute_tasks( thread, gtid, spin, 0, TRUE, &flag, NULL ) ) {
+    while (! __kmp_execute_tasks( thread, gtid, spin, 0, TRUE, &flag 
+                                  USE_ITT_BUILD_ARG(NULL), 0 ) ) {
 #if USE_ITT_BUILD
         // TODO: What about itt_sync_obj??
         KMP_FSYNC_SPIN_PREPARE( spin );