Update stats-gathering code

Have developer timers use the partitioning scheme, which also required that
some redundant developer timers be removed in favor of the already existing
normal timers. Move per-thread stats initialization to just after global
thread id assignment, which is as early as possible. Also put all global stats
initialization code in __kmp_stats_init() and all global stats destruction code
in __kmp_stats_fini().

Differential Revision: https://reviews.llvm.org/D26361

llvm-svn: 286892
diff --git a/openmp/runtime/src/kmp_barrier.cpp b/openmp/runtime/src/kmp_barrier.cpp
index 7843b68..5e77614 100644
--- a/openmp/runtime/src/kmp_barrier.cpp
+++ b/openmp/runtime/src/kmp_barrier.cpp
@@ -50,7 +50,7 @@
                             void (*reduce)(void *, void *)
                             USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_gather);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -130,7 +130,7 @@
                              int propagate_icvs
                              USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_linear_release);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_linear_release);
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_team_t *team;
 
@@ -149,7 +149,7 @@
         if (nproc > 1) {
 #if KMP_BARRIER_ICV_PUSH
             {
-                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                 if (propagate_icvs) {
                     ngo_load(&team->t.t_implicit_task_taskdata[0].td_icvs);
                     for (i=1; i<nproc; ++i) {
@@ -225,7 +225,7 @@
                           void (*reduce)(void *, void *)
                           USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_gather);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -323,7 +323,7 @@
                            int propagate_icvs
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_tree_release);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_tree_release);
     register kmp_team_t *team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc;
@@ -393,7 +393,7 @@
 
 #if KMP_BARRIER_ICV_PUSH
             {
-                KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+                KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
                 if (propagate_icvs) {
                     __kmp_init_implicit_task(team->t.t_ident, team->t.t_threads[child_tid],
                                              team, child_tid, FALSE);
@@ -426,7 +426,7 @@
                            void (*reduce)(void *, void *)
                            USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_gather);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_info_t **other_threads = team->t.t_threads;
@@ -535,7 +535,7 @@
                             int propagate_icvs
                             USE_ITT_BUILD_ARG(void *itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_hyper_release);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hyper_release);
     register kmp_team_t    *team;
     register kmp_bstate_t  *thr_bar       = & this_thr -> th.th_bar[ bt ].bb;
     register kmp_info_t   **other_threads;
@@ -742,7 +742,7 @@
                                   int gtid, int tid, void (*reduce) (void *, void *)
                                   USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_gather);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_gather);
     register kmp_team_t *team = this_thr->th.th_team;
     register kmp_bstate_t *thr_bar = & this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc = this_thr->th.th_team_nproc;
@@ -883,7 +883,7 @@
                                    int propagate_icvs
                                    USE_ITT_BUILD_ARG(void * itt_sync_obj) )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_hier_release);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_hier_release);
     register kmp_team_t *team;
     register kmp_bstate_t *thr_bar = &this_thr->th.th_bar[bt].bb;
     register kmp_uint32 nproc;
@@ -1067,9 +1067,8 @@
 __kmp_barrier(enum barrier_type bt, int gtid, int is_split, size_t reduce_size,
               void *reduce_data, void (*reduce)(void *, void *))
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_barrier);
-    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
     KMP_TIME_PARTITIONED_BLOCK(OMP_plain_barrier);
+    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
     register int tid = __kmp_tid_from_gtid(gtid);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team = this_thr->th.th_team;
@@ -1333,7 +1332,8 @@
 void
 __kmp_end_split_barrier(enum barrier_type bt, int gtid)
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_end_split_barrier);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_end_split_barrier);
+    KMP_SET_THREAD_STATE_BLOCK(PLAIN_BARRIER);
     int tid = __kmp_tid_from_gtid(gtid);
     kmp_info_t *this_thr = __kmp_threads[gtid];
     kmp_team_t *team = this_thr->th.th_team;
@@ -1376,9 +1376,8 @@
 void
 __kmp_join_barrier(int gtid)
 {
-    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
+    KMP_TIME_PARTITIONED_BLOCK(OMP_join_barrier);
     KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
-    KMP_TIME_DEVELOPER_BLOCK(KMP_join_barrier);
     register kmp_info_t *this_thr = __kmp_threads[gtid];
     register kmp_team_t *team;
     register kmp_uint nproc;
@@ -1592,9 +1591,8 @@
 void
 __kmp_fork_barrier(int gtid, int tid)
 {
-    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_join_barrier);
+    KMP_TIME_PARTITIONED_BLOCK(OMP_fork_barrier);
     KMP_SET_THREAD_STATE_BLOCK(FORK_JOIN_BARRIER);
-    KMP_TIME_DEVELOPER_BLOCK(KMP_fork_barrier);
     kmp_info_t *this_thr = __kmp_threads[gtid];
     kmp_team_t *team = (tid == 0) ? this_thr->th.th_team : NULL;
 #if USE_ITT_BUILD
@@ -1707,7 +1705,7 @@
        the fixed ICVs in the master's thread struct, because it is not always the case that the
        threads arrays have been allocated when __kmp_fork_call() is executed. */
     {
-        KMP_TIME_DEVELOPER_BLOCK(USER_icv_copy);
+        KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(USER_icv_copy);
         if (!KMP_MASTER_TID(tid)) {  // master thread already has ICVs
             // Copy the initial ICVs from the master's thread struct to the implicit task for this tid.
             KA_TRACE(10, ("__kmp_fork_barrier: T#%d(%d) is PULLing ICVs\n", gtid, tid));
@@ -1762,7 +1760,7 @@
 void
 __kmp_setup_icv_copy(kmp_team_t *team, int new_nproc, kmp_internal_control_t *new_icvs, ident_t *loc )
 {
-    KMP_TIME_DEVELOPER_BLOCK(KMP_setup_icv_copy);
+    KMP_TIME_DEVELOPER_PARTITIONED_BLOCK(KMP_setup_icv_copy);
 
     KMP_DEBUG_ASSERT(team && new_nproc && new_icvs);
     KMP_DEBUG_ASSERT((!TCR_4(__kmp_init_parallel)) || new_icvs->nproc);