[OpenMP] Codegen for the 'target parallel' directive on the NVPTX device.

This patch adds codegen for the 'target parallel' directive on the NVPTX
device.  We refer to offloading OpenMP directives such as 'target parallel'
and 'target teams distribute parallel for' as SPMD constructs.  SPMD
constructs, in contrast to Generic ones like the plain 'target', can never
contain a serial region.
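
For illustration, here is a minimal sketch contrasting the two kinds of
constructs (the captured variable 'a' is hypothetical, as in the new test):

  // SPMD construct: the parallel region starts immediately, so every GPU
  // thread in the team executes the region body.
  #pragma omp target parallel map(tofrom: a)
  {
    a += 1;
  }

  // Generic construct: a plain 'target' region may contain serial code, so
  // only the master thread executes it while the worker threads wait for
  // outlined parallel work.
  #pragma omp target map(tofrom: a)
  {
    a += 1;
  }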

SPMD constructs can be handled more efficiently on the GPU and do not
require the worker loop of the Generic codegen scheme.  This patch adds
SPMD codegen support for 'target parallel' on the NVPTX device; the same
scheme can be reused for other SPMD constructs.
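
Roughly, the SPMD scheme emits a kernel with the shape sketched below.  This
is only a hand-written C++ approximation of the structure the new test
checks: the entry-point name is a hypothetical stand-in, and the real
__kmpc_spmd_kernel_init entry point of the NVPTX device runtime takes
additional arguments that are elided here.

  // Simplified, hypothetical declarations of the device runtime entry
  // points that appear in the CHECK lines of the new test.
  extern "C" void __kmpc_spmd_kernel_init(int thread_limit);
  extern "C" void __kmpc_spmd_kernel_deinit();

  // Outlined body of the second 'target parallel' region in the test.
  static void outlined_parallel_fn(int *global_tid, int *bound_tid,
                                   short *aa) {
    *aa += 1;
  }

  // Shape of the SPMD kernel entry: initialize, run the outlined region in
  // every thread, deinitialize.  No master/worker loop is emitted.
  extern "C" void omp_offloading_spmd_kernel(short *aa) {
    int thread_limit = 0; // the real kernel reads llvm.nvvm.read.ptx.sreg.ntid.x()
    __kmpc_spmd_kernel_init(thread_limit);
    outlined_parallel_fn(/*global_tid=*/nullptr, /*bound_tid=*/nullptr, aa);
    __kmpc_spmd_kernel_deinit();
  }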

Reviewers: ABataev
Differential Revision: https://reviews.llvm.org/D28755

llvm-svn: 292428
diff --git a/clang/test/OpenMP/nvptx_target_codegen.cpp b/clang/test/OpenMP/nvptx_target_codegen.cpp
index 59c4d5b..dab0c5a 100644
--- a/clang/test/OpenMP/nvptx_target_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_codegen.cpp
@@ -8,6 +8,14 @@
 #ifndef HEADER
 #define HEADER
 
+// Check that the execution mode of all 6 target regions is set to Generic Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l98}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l175}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l284}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l321}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l339}}_exec_mode = weak constant i8 1
+// CHECK-DAG: {{@__omp_offloading_.+l304}}_exec_mode = weak constant i8 1
+
 template<typename tx, typename ty>
 struct TT{
   tx X;
@@ -23,7 +31,7 @@
   double cn[5][n];
   TT<long long, char> d;
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l90}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l98}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -54,7 +62,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l90]]()
+  // CHECK: define {{.*}}void [[T1:@__omp_offloading_.+foo.+l98]]()
   // CHECK-DAG: [[TID:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.tid.x()
   // CHECK-DAG: [[NTH:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
   // CHECK-DAG: [[WS:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.warpsize()
@@ -96,7 +104,7 @@
   {
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l167}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l175}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -127,7 +135,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l167]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
+  // CHECK: define {{.*}}void [[T2:@__omp_offloading_.+foo.+l175]](i[[SZ:32|64]] [[ARG1:%[a-zA-Z_]+]])
   // CHECK: [[AA_ADDR:%.+]] = alloca i[[SZ]],
   // CHECK: store i[[SZ]] [[ARG1]], i[[SZ]]* [[AA_ADDR]],
   // CHECK: [[AA_CADDR:%.+]] = bitcast i[[SZ]]* [[AA_ADDR]] to i16*
@@ -169,7 +177,7 @@
     aa += 1;
   }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l276}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+foo.+l284}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -200,7 +208,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l276]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T3:@__omp_offloading_.+foo.+l284]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:    [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:    [[LOCAL_B:%.+]] = alloca [10 x float]*
@@ -353,7 +361,7 @@
   return a;
 }
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+313}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+static.+321}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -384,7 +392,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l313]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T4:@__omp_offloading_.+static.+l321]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
@@ -439,7 +447,7 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l331}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+S1.+l339}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -470,7 +478,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l331]](
+  // CHECK: define {{.*}}void [[T5:@__omp_offloading_.+S1.+l339]](
   // Create local storage for each capture.
   // CHECK:       [[LOCAL_THIS:%.+]] = alloca [[S1:%struct.*]]*
   // CHECK:       [[LOCAL_B:%.+]] = alloca i[[SZ]]
@@ -529,7 +537,7 @@
 
 
 
-  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l296}}_worker()
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l304}}_worker()
   // CHECK-DAG: [[OMP_EXEC_STATUS:%.+]] = alloca i8,
   // CHECK-DAG: [[OMP_WORK_FN:%.+]] = alloca i8*,
   // CHECK: store i8* null, i8** [[OMP_WORK_FN]],
@@ -560,7 +568,7 @@
   // CHECK: [[EXIT]]
   // CHECK: ret void
 
-  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l296]](i[[SZ]]
+  // CHECK: define {{.*}}void [[T6:@__omp_offloading_.+template.+l304]](i[[SZ]]
   // Create local storage for each capture.
   // CHECK:  [[LOCAL_A:%.+]] = alloca i[[SZ]]
   // CHECK:  [[LOCAL_AA:%.+]] = alloca i[[SZ]]
diff --git a/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
new file mode 100644
index 0000000..7d16624
--- /dev/null
+++ b/clang/test/OpenMP/nvptx_target_parallel_codegen.cpp
@@ -0,0 +1,136 @@
+// Test target codegen - host bc file has to be created first.
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple powerpc64le-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm-bc %s -o %t-ppc-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx64-unknown-unknown -fopenmp-targets=nvptx64-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-ppc-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-64
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple i386-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm-bc %s -o %t-x86-host.bc
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// RUN: %clang_cc1 -verify -fopenmp -fopenmp-version=45 -fexceptions -fcxx-exceptions -x c++ -triple nvptx-unknown-unknown -fopenmp-targets=nvptx-nvidia-cuda -emit-llvm %s -fopenmp-is-device -fopenmp-host-ir-file-path %t-x86-host.bc -o - | FileCheck %s --check-prefix CHECK --check-prefix CHECK-32
+// expected-no-diagnostics
+#ifndef HEADER
+#define HEADER
+
+// Check that the execution mode of both target regions on the GPU is set to SPMD Mode.
+// CHECK-DAG: {{@__omp_offloading_.+l26}}_exec_mode = weak constant i8 0
+// CHECK-DAG: {{@__omp_offloading_.+l31}}_exec_mode = weak constant i8 0
+
+template<typename tx>
+tx ftemplate(int n) {
+  tx a = 0;
+  short aa = 0;
+  tx b[10];
+
+  #pragma omp target parallel if(target: 0)
+  {
+    a += 1;
+  }
+
+  #pragma omp target parallel map(tofrom: aa)
+  {
+    aa += 1;
+  }
+
+  #pragma omp target parallel map(tofrom:a, aa, b) if(target: n>40)
+  {
+    a += 1;
+    aa += 1;
+    b[2] += 1;
+  }
+
+  return a;
+}
+
+int bar(int n){
+  int a = 0;
+
+  a += ftemplate<int>(n);
+
+  return a;
+}
+
+  // CHECK-NOT: define {{.*}}void {{@__omp_offloading_.+template.+l17}}
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l26}}(
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP1:@.+]](i32* null, i32* null, i16* [[AA]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP1]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i16* {{[^%]*}}[[ARG:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: store i16* [[ARG]], i16** [[AA_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[VAL:%.+]] = load i16, i16* [[AA]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: ret void
+  // CHECK: }
+
+
+
+
+
+
+  // CHECK-LABEL: define {{.*}}void {{@__omp_offloading_.+template.+l31}}(
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* {{%.+}}, i32** [[A_ADDR]], align
+  // CHECK: store i16* {{%.+}}, i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* {{%.+}}, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[THREAD_LIMIT:%.+]] = call i32 @llvm.nvvm.read.ptx.sreg.ntid.x()
+  // CHECK: call void @__kmpc_spmd_kernel_init(i32 [[THREAD_LIMIT]],
+  // CHECK: br label {{%?}}[[EXEC:.+]]
+  //
+  // CHECK: [[EXEC]]
+  // CHECK: {{call|invoke}} void [[OP2:@.+]](i32* null, i32* null, i32* [[A]], i16* [[AA]], [10 x i32]* [[B]])
+  // CHECK: br label {{%?}}[[DONE:.+]]
+  //
+  // CHECK: [[DONE]]
+  // CHECK: call void @__kmpc_spmd_kernel_deinit()
+  // CHECK: br label {{%?}}[[EXIT:.+]]
+  //
+  // CHECK: [[EXIT]]
+  // CHECK: ret void
+  // CHECK: }
+
+  // CHECK: define internal void [[OP2]](i32* noalias %.global_tid., i32* noalias %.bound_tid., i32* {{[^%]*}}[[ARG1:%.+]], i16* {{[^%]*}}[[ARG2:%.+]], [10 x i32]* {{[^%]*}}[[ARG3:%.+]])
+  // CHECK: = alloca i32*, align
+  // CHECK: = alloca i32*, align
+  // CHECK: [[A_ADDR:%.+]] = alloca i32*, align
+  // CHECK: [[AA_ADDR:%.+]] = alloca i16*, align
+  // CHECK: [[B_ADDR:%.+]] = alloca [10 x i32]*, align
+  // CHECK: store i32* [[ARG1]], i32** [[A_ADDR]], align
+  // CHECK: store i16* [[ARG2]], i16** [[AA_ADDR]], align
+  // CHECK: store [10 x i32]* [[ARG3]], [10 x i32]** [[B_ADDR]], align
+  // CHECK: [[A:%.+]] = load i32*, i32** [[A_ADDR]], align
+  // CHECK: [[AA:%.+]] = load i16*, i16** [[AA_ADDR]], align
+  // CHECK: [[B:%.+]] = load [10 x i32]*, [10 x i32]** [[B_ADDR]], align
+  // CHECK: store i32 {{%.+}}, i32* [[A]], align
+  // CHECK: store i16 {{%.+}}, i16* [[AA]], align
+  // CHECK: [[ELT:%.+]] = getelementptr inbounds [10 x i32], [10 x i32]* [[B]],
+  // CHECK: store i32 {{%.+}}, i32* [[ELT]], align
+  // CHECK: ret void
+  // CHECK: }
+#endif