[OPENMP][NVPTX]Use __kmpc_barrier_simple_spmd(nullptr, 0) instead of
nvvm_barrier0.

Use runtime functions instead of the direct call to the nvvm intrinsics.
It allows to prevent some dangerous LLVM optimizations, that breaks the
code for the NVPTX target.

llvm-svn: 350328
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index 5e2676b..97b8f79 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -96,8 +96,11 @@
   OMPRTL_NVPTX__kmpc_get_team_static_memory,
   /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
   OMPRTL_NVPTX__kmpc_restore_team_static_memory,
-  // Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
+  /// Call to void __kmpc_barrier(ident_t *loc, kmp_int32 global_tid);
   OMPRTL__kmpc_barrier,
+  /// Call to void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
+  /// global_tid);
+  OMPRTL__kmpc_barrier_simple_spmd,
 };
 
 /// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
@@ -640,17 +643,6 @@
       "nvptx_num_threads");
 }
 
-/// Get barrier to synchronize all threads in a block.
-static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
-  llvm::Function *F = llvm::Intrinsic::getDeclaration(
-      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0);
-  F->addFnAttr(llvm::Attribute::Convergent);
-  CGF.EmitRuntimeCall(F);
-}
-
-/// Synchronize all GPU threads in a block.
-static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
-
 /// Get the value of the thread_limit clause in the teams directive.
 /// For the 'generic' execution mode, the runtime encodes thread_limit in
 /// the launch parameters, always starting thread_limit+warpSize threads per
@@ -1813,6 +1805,17 @@
     cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
     break;
   }
+  case OMPRTL__kmpc_barrier_simple_spmd: {
+    // Build void __kmpc_barrier_simple_spmd(ident_t *loc, kmp_int32
+    // global_tid);
+    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
+    auto *FnTy =
+        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
+    RTLFn =
+        CGM.CreateRuntimeFunction(FnTy, /*Name*/ "__kmpc_barrier_simple_spmd");
+    cast<llvm::Function>(RTLFn)->addFnAttr(llvm::Attribute::Convergent);
+    break;
+  }
   }
   return RTLFn;
 }
@@ -2665,6 +2668,20 @@
   }
 }
 
+void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) {
+  // Always emit simple barriers!
+  if (!CGF.HaveInsertPoint())
+    return;
+  // Build call __kmpc_barrier_simple_spmd(nullptr, 0);
+  // This function does not use parameters, so we can emit just default values.
+  llvm::Value *Args[] = {
+      llvm::ConstantPointerNull::get(
+          cast<llvm::PointerType>(getIdentTyPointerTy())),
+      llvm::ConstantInt::get(CGF.Int32Ty, /*V=*/0, /*isSigned=*/true)};
+  CGF.EmitRuntimeCall(
+      createNVPTXRuntimeFunction(OMPRTL__kmpc_barrier_simple_spmd), Args);
+}
+
 void CGOpenMPRuntimeNVPTX::emitBarrierCall(CodeGenFunction &CGF,
                                            SourceLocation Loc,
                                            OpenMPDirectiveKind Kind, bool,