[OPENMP][NVPTX] Added extra sync point to the inter-warp copy function.
The parallel reduction operation requires an extra synchronization point
in the inter-warp copy function to avoid divergence. The barrier is emitted
before the warp-master check, ahead of each copy into shared memory.
llvm-svn: 349525
diff --git a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
index b055132..59086d2 100644
--- a/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
+++ b/clang/lib/CodeGen/CGOpenMPRuntimeNVPTX.cpp
@@ -3089,6 +3089,7 @@
/// void inter_warp_copy_func(void* reduce_data, num_warps)
/// shared smem[warp_size];
/// For all data entries D in reduce_data:
+/// sync
/// If (I am the first lane in each warp)
/// Copy my local D to smem[warp_id]
/// sync
@@ -3203,6 +3204,10 @@
Bld.CreateCondBr(Cmp, BodyBB, ExitBB);
CGF.EmitBlock(BodyBB);
}
+ // kmpc_barrier.
+ CGM.getOpenMPRuntime().emitBarrierCall(CGF, Loc, OMPD_unknown,
+ /*EmitChecks=*/false,
+ /*ForceSimpleCall=*/true);
llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
diff --git a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
index dd93b0c..34ad93b 100644
--- a/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
+++ b/clang/test/OpenMP/nvptx_target_parallel_reduction_codegen.cpp
@@ -190,6 +190,7 @@
// CHECK: [[CNT:%.+]] = load i32, i32* [[CNT_ADDR]],
// CHECK: [[DONE_COPY:%.+]] = icmp ult i32 [[CNT]], 2
// CHECK: br i1 [[DONE_COPY]], label
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -427,6 +428,7 @@
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -466,6 +468,7 @@
//
// CHECK: [[READ_CONT]]
// CHECK: call void @__kmpc_barrier(%struct.ident_t* @
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//
@@ -740,6 +743,7 @@
// CHECK-DAG: [[LANEID:%.+]] = and i32 {{.+}}, 31
// CHECK-DAG: [[WARPID:%.+]] = ashr i32 {{.+}}, 5
// CHECK-DAG: [[RED_LIST:%.+]] = bitcast i8* {{.+}} to [[RLT]]*
+ // CHECK: call void @__kmpc_barrier(%struct.ident_t* @
// CHECK: [[IS_WARP_MASTER:%.+]] = icmp eq i32 [[LANEID]], 0
// CHECK: br i1 [[IS_WARP_MASTER]], label {{%?}}[[DO_COPY:.+]], label {{%?}}[[COPY_ELSE:.+]]
//