[OpenMP] Update target codegen for NVPTX device.

This patch includes updates for codegen of the target region for the NVPTX
device. It moves initializers from the compiler to the runtime and updates
the worker loop to assume parallel work is retrieved from the runtime. A
subsequent patch will update the codegen to retrieve the parallel work using
calls to the runtime. It includes the removal of the inline attribute
for the worker loop and disabling debug info in it.

This allows codegen for a target directive and serial execution on the
NVPTX device.

Reviewers: ABataev
Differential Revision: https://reviews.llvm.org/D28125

llvm-svn: 291121
diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp
index 287089d..59c4d5b 100644
--- a/clang/test/OpenMP/nvptx_target_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_codegen.cpp
@@ -8,9 +8,6 @@
 #ifndef HEADER
 #define HEADER
 
-// CHECK-DAG: [[OMP_NT:@.+]] = common addrspace(3) global i32 0
-// CHECK-DAG: [[OMP_WID:@.+]] = common addrspace(3) global i64 0
-
 template<typename tx, typename ty>
 struct TT{
   tx X;
@@ -26,19 +23,22 @@
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l87}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -54,31 +54,34 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l87]]()
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
-  //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]()
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
   // CHECK: {{call|invoke}} void [[T1]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
-  // CHECK: br label {{%?}}[[TERM:.+]]
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //
@@ -93,19 +96,22 @@
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l158}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -121,35 +127,38 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l158]](i[[SZ:32|64]] [[ARG1:%[^)]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
-  //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T3]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: {{call|invoke}} void [[T2]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
   // CHECK: load i16, i16* [[AA_CADDR]],
-  // CHECK: br label {{%?}}[[TERM:.+]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //
@@ -160,19 +169,22 @@
     aa += 1;
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l261}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -188,7 +200,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+foo.+l261]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:    [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:    [[LOCAL_B:%.+]] = alloca [10 x float]*
@@ -219,26 +231,29 @@
   // CHECK-DAG:    [[REF_CN:%.+]] = load double*, double** [[LOCAL_CN]],
   // CHECK-DAG:    [[REF_D:%.+]] = load [[TT]]*, [[TT]]** [[LOCAL_D]],
   //
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
-  //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T4]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: {{call|invoke}} void [[T3]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
   //
   // Use captures.
   // CHECK-64-DAG:  load i32, i32* [[REF_A]]
@@ -249,10 +264,10 @@
   // CHECK-DAG:  getelementptr inbounds double, double* [[REF_CN]], i[[SZ]] %{{.+}}
   // CHECK-DAG:     getelementptr inbounds [[TT]], [[TT]]* [[REF_D]], i32 0, i32 0
   //
-  // CHECK: br label {{%?}}[[TERM:.+]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //
@@ -338,19 +353,22 @@
   return a;
 }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+l298}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -366,7 +384,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+static.+l298]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -382,36 +400,37 @@
   // CHECK-DAG:      [[REF_AAA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AAA]] to i8*
   // CHECK-DAG:      [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
   //
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
-  //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T5]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: {{call|invoke}} void [[T4]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
-  //
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
   // CHECK-64-DAG: load i32, i32* [[REF_A]]
   // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
   // CHECK-DAG:    load i16, i16* [[REF_AA]]
   // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: br label {{%?}}[[TERM:.+]]
-  //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //
@@ -420,19 +439,22 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l316}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -448,7 +470,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+S1.+l316]](
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]](
   // Create local storage for each capture.
   // CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
   // CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
@@ -466,35 +488,39 @@
   // CHECK-DAG:   [[VAL_VLA1:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA1]],
   // CHECK-DAG:   [[VAL_VLA2:%.+]] = load i[[SZ]], i[[SZ]]* [[LOCAL_VLA2]],
   // CHECK-DAG:   [[REF_C:%.+]] = load i16*, i16** [[LOCAL_C]],
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
   //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T6]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: {{call|invoke}} void [[T5]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
   // Use captures.
   // CHECK-DAG:   getelementptr inbounds [[S1]], [[S1]]* [[REF_THIS]], i32 0, i32 0
   // CHECK-64-DAG:load i32, i32* [[REF_B]]
   // CHECK-32-DAG:load i32, i32* [[LOCAL_B]]
   // CHECK-DAG:   getelementptr inbounds i16, i16* [[REF_C]], i[[SZ]] %{{.+}}
-  // CHECK: br label {{%?}}[[TERM:.+]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //
@@ -503,19 +529,22 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l281}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker()
+  // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
+  // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
+  // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
+  // CHECK: store i8 0, i8* [[OMP_EXEC_STATUS]],
   // CHECK: br label {{%?}}[[AWAIT_WORK:.+]]
   //
   // CHECK: [[AWAIT_WORK]]
   // CHECK: call void @llvm.nvvm.barrier0()
-  // CHECK: [[WORK:%.+]] = load i64, i64 addrspace(3)* [[OMP_WID]],
-  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i64 [[WORK]], 0
+  // CHECK: [[WORK:%.+]] = load i8*, i8** [[OMP_WORK_FN]],
+  // CHECK: [[SHOULD_EXIT:%.+]] = icmp eq i8* [[WORK]], null
   // CHECK: br i1 [[SHOULD_EXIT]], label {{%?}}[[EXIT:.+]], label {{%?}}[[SEL_WORKERS:.+]]
   //
   // CHECK: [[SEL_WORKERS]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[NT:%.+]] = load i32, i32 addrspace(3)* [[OMP_NT]]
-  // CHECK: [[IS_ACTIVE:%.+]] = icmp slt i32 [[TID]], [[NT]]
+  // CHECK: [[ST:%.+]] = load i8, i8* [[OMP_EXEC_STATUS]],
+  // CHECK: [[IS_ACTIVE:%.+]] = icmp ne i8 [[ST]], 0
   // CHECK: br i1 [[IS_ACTIVE]], label {{%?}}[[EXEC_PARALLEL:.+]], label {{%?}}[[BAR_PARALLEL:.+]]
   //
   // CHECK: [[EXEC_PARALLEL]]
@@ -531,7 +560,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T7:@__omp_offloading_.+template.+l281]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -544,36 +573,39 @@
   // CHECK-DAG:   [[REF_AA:%.+]] = bitcast i[[SZ]]* [[LOCAL_AA]] to i16*
   // CHECK-DAG:   [[REF_B:%.+]] = load [10 x i32]*, [10 x i32]** [[LOCAL_B]],
   //
-  // CHECK: [[NTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
-  // CHECK: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
-  // CHECK: [[A:%.+]] = sub i32 [[WS]], 1
-  // CHECK: [[B:%.+]] = sub i32 [[NTID]], 1
-  // CHECK: [[MID:%.+]] = and i32 [[B]],
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: [[EXCESS:%.+]] = icmp ugt i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[EXCESS]], label {{%?}}[[EXIT:.+]], label {{%?}}[[CHECK_WORKER:.+]]
-  //
-  // CHECK: [[CHECK_WORKER]]
-  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[MID]]
-  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[MASTER:.+]]
+  // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK-DAG: [[TH_LIMIT:%.+]] = sub i32 [[NTH]], [[WS]]
+  // CHECK: [[IS_WORKER:%.+]] = icmp ult i32 [[TID]], [[TH_LIMIT]]
+  // CHECK: br i1 [[IS_WORKER]], label {{%?}}[[WORKER:.+]], label {{%?}}[[CHECK_MASTER:.+]]
   //
   // CHECK: [[WORKER]]
-  // CHECK: {{call|invoke}} void [[T7]]_worker()
-  // CHECK: br label {{%?}}[[EXIT]]
+  // CHECK: {{call|invoke}} void [[T6]]_worker()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[CHECK_MASTER]]
+  // CHECK-DAG: [[CMTID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
+  // CHECK-DAG: [[CMNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[CMWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[IS_MASTER:%.+]] = icmp eq i32 [[CMTID]],
+  // CHECK: br i1 [[IS_MASTER]], label {{%?}}[[MASTER:.+]], label {{%?}}[[EXIT]]
   //
   // CHECK: [[MASTER]]
-  // CHECK: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
-  // CHECK: call void @__kmpc_kernel_init(i32 0, i32 [[TID]])
+  // CHECK-DAG: [[MNTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK-DAG: [[MWS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
+  // CHECK: [[MTMP1:%.+]] = sub i32 [[MNTH]], [[MWS]]
+  // CHECK: call void @__kmpc_kernel_init(i32 [[MTMP1]]
   //
   // CHECK-64-DAG: load i32, i32* [[REF_A]]
   // CHECK-32-DAG: load i32, i32* [[LOCAL_A]]
   // CHECK-DAG:    load i16, i16* [[REF_AA]]
   // CHECK-DAG:    getelementptr inbounds [10 x i32], [10 x i32]* [[REF_B]], i[[SZ]] 0, i[[SZ]] 2
   //
-  // CHECK: br label {{%?}}[[TERM:.+]]
+  // CHECK: br label {{%?}}[[TERMINATE:.+]]
   //
-  // CHECK: [[TERM]]
-  // CHECK: store i64 0, i64 addrspace(3)* [[OMP_WID]],
+  // CHECK: [[TERMINATE]]
+  // CHECK: call void @__kmpc_kernel_deinit()
   // CHECK: call void @llvm.nvvm.barrier0()
   // CHECK: br label {{%?}}[[EXIT]]
   //