//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// NVPTX targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// Call to void __kmpc_kernel_prepare_parallel(void *outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
};
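
// Note: each enumerator above is mapped to a concrete runtime function callee
// by createNVPTXRuntimeFunction() later in this file; call sites pair an
// enumerator with the argument list documented next to it, e.g.
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);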

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}
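
// Sorting by decreasing alignment (the comparator above) lays out the most
// strictly aligned globalized variables first in the record built below,
// which helps keep interior padding to a minimum: e.g. fields with alignments
// {1, 8, 4} are emitted in the order {8, 4, 1}.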

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields) {
  if (EscapedDecls.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //   /* globalized vars */
  // };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    auto *Field =
        FieldDecl::Create(C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
                          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
                          /*BW=*/nullptr, /*Mutable=*/false,
                          /*InitStyle=*/ICIS_NoInit);
    Field->setAccess(AS_public);
    GlobalizedRD->addDecl(Field);
    if (VD->hasAttrs()) {
      for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
           E(VD->getAttrs().end());
           I != E; ++I)
        Field->addAttr(*I);
    }
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (!isOpenMPPrivate(
                  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
              Attr->getCaptureKind() == OMPC_map)
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDecls.getArrayRef(), MappedDeclsFields);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
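
// Worked example for the two helpers above, assuming WarpSize == 32:
// thread id 45 yields warp id 45 >> 5 == 1 and lane id 45 & 31 == 13.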

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}
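
// Illustration with assumed launch numbers: a CTA of 128 threads in generic
// mode has thread_limit 128 - 32 == 96, so threads 0..95 act as workers and
// the last warp (threads 96..127) is reserved for master execution.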

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
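
// Spelling out the bit arithmetic for NumThreads == 1024: Mask == 31
// (0b11111), so (1024 - 1) & ~31 == 992, the first lane of the last warp,
// matching the examples in the comment above.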

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}

/// Checks if \p Body is a CompoundStmt and returns its single child statement,
/// if there is exactly one; otherwise returns \p Body itself.
static const Stmt *getSingleCompoundChild(const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body))
    if (C->size() == 1)
      return C->body_front();
  return Body;
}
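
// For instance, for a captured body of the form `{ #pragma omp parallel ... }`
// this helper returns the nested OMPExecutableDirective itself, which is what
// the nested-directive checks below inspect; for any other shape it returns
// Body unchanged.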

/// Check if the parallel directive has an 'if' clause with a non-constant or
/// false condition, or a num_threads clause that strictly specifies the
/// number of threads; such directives are run in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
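
// As a sketch, this predicate holds (forcing non-SPMD codegen) for directives
// such as
//   #pragma omp target parallel num_threads(N)
//   #pragma omp target parallel if(parallel: Cond) // Cond not provably true
// and does not hold for a plain `#pragma omp target parallel` or one whose
// `if` condition folds to true at compile time.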

/// Check for inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
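
// A minimal sketch of a pattern the check above accepts for SPMD mode:
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
// Here the parallel directive is nested directly under target/teams with no
// intervening statements and no if/num_threads clause.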

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

/// Check if the directive is a loop-based directive that either has no
/// schedule clause at all or has static scheduling, and no ordered clause.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
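
// Under this definition a loop with no schedule clause or with
// schedule(static) has static scheduling, while schedule(dynamic),
// schedule(guided), or an ordered clause disqualifies it.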

/// Check for inner (nested) lightweight runtime construct, if any.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
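
// Putting the checks together: a directive such as
//   #pragma omp target teams distribute parallel for schedule(static)
// can run in SPMD mode with the lightweight runtime, whereas the same
// directive with schedule(dynamic) stays in SPMD mode but requires the full
// runtime.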

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitNonSPMDEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars);

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  bool RequiresFullRuntime = CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
                             !supportsLightweightRuntime(CGF.getContext(), D);
  // Check if we have inner distribute + lastprivate|reduction clauses.
  bool RequiresDatasharing = RequiresFullRuntime;
  if (!RequiresDatasharing) {
    const OMPExecutableDirective *TD = &D;
    if (!isOpenMPTeamsDirective(TD->getDirectiveKind()) &&
        !isOpenMPParallelDirective(TD->getDirectiveKind())) {
      const Stmt *S = getSingleCompoundChild(
          TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1237 /*IgnoreCaptured=*/true));
1238 TD = cast<OMPExecutableDirective>(S);
1239 }
1240 if (!isOpenMPDistributeDirective(TD->getDirectiveKind()) &&
1241 !isOpenMPParallelDirective(TD->getDirectiveKind())) {
1242 const Stmt *S = getSingleCompoundChild(
1243 TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1244 /*IgnoreCaptured=*/true));
1245 TD = cast<OMPExecutableDirective>(S);
1246 }
1247 if (isOpenMPDistributeDirective(TD->getDirectiveKind()))
1248 RequiresDatasharing = TD->hasClausesOfKind<OMPLastprivateClause>() ||
1249 TD->hasClausesOfKind<OMPReductionClause>();
1250 }
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001251 llvm::Value *Args[] = {
1252 getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
1253 /*RequiresOMPRuntime=*/
1254 Bld.getInt16(RequiresFullRuntime ? 1 : 0),
Alexey Bataeve79451a2018-10-01 16:20:57 +00001255 /*RequiresDataSharing=*/Bld.getInt16(RequiresDatasharing ? 1 : 0)};
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001256 CGF.EmitRuntimeCall(
1257 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
Gheorghe-Teodor Berceaad4e5792018-07-13 16:18:24 +00001258
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001259 if (RequiresFullRuntime) {
1260 // For data sharing, we need to initialize the stack.
1261 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1262 OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
1263 }
Gheorghe-Teodor Berceaad4e5792018-07-13 16:18:24 +00001264
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001265 CGF.EmitBranch(ExecuteBB);
1266
1267 CGF.EmitBlock(ExecuteBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001268
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001269 IsInTargetMasterThreadRegion = true;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001270}
1271
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001272void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001273 EntryFunctionState &EST) {
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001274 IsInTargetMasterThreadRegion = false;
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001275 if (!CGF.HaveInsertPoint())
1276 return;
1277
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001278 if (!EST.ExitBB)
1279 EST.ExitBB = CGF.createBasicBlock(".exit");
1280
1281 llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
1282 CGF.EmitBranch(OMPDeInitBB);
1283
1284 CGF.EmitBlock(OMPDeInitBB);
1285 // Deinitialize the OMP state in the runtime; called by all active threads.
1286 CGF.EmitRuntimeCall(
1287 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
1288 CGF.EmitBranch(EST.ExitBB);
1289
1290 CGF.EmitBlock(EST.ExitBB);
1291 EST.ExitBB = nullptr;
1292}
1293
1294// Create a unique global variable to indicate the execution mode of this target
1295// region. The execution mode is either 'generic' or 'spmd', depending on the
1296// target directive. This variable is picked up by the offload library to set up
1297// the device appropriately before kernel launch. If the execution mode is
1298// 'generic', the runtime reserves one warp for the master; otherwise, all
1299// warps participate in parallel work.
1300static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001301 bool Mode) {
1302 auto *GVMode =
1303 new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1304 llvm::GlobalValue::WeakAnyLinkage,
1305 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1306 Twine(Name, "_exec_mode"));
Alexey Bataev9ff80832018-04-16 20:16:21 +00001307 CGM.addCompilerUsedGlobal(GVMode);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001308}
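// For illustration (the kernel name is hypothetical): a generic-mode kernel
// "__omp_offloading_foo_l1" would get roughly
//
//   @__omp_offloading_foo_l1_exec_mode = weak constant i8 1
//
// in the module, while an SPMD-mode kernel would be initialized with 0,
// matching the 'Mode ? 0 : 1' initializer above.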
1309
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001310void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
Gheorghe-Teodor Berceaeb89b1d2017-11-21 15:54:54 +00001311 ASTContext &Ctx = CGM.getContext();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001312
1313 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
Alexey Bataev9ff80832018-04-16 20:16:21 +00001314 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001315 WST.Loc, WST.Loc);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001316 emitWorkerLoop(CGF, WST);
1317 CGF.FinishFunction();
1318}
1319
1320void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
1321 WorkerFunctionState &WST) {
1322 //
1323 // The workers enter this loop and wait for parallel work from the master.
1324 // When the master encounters a parallel region, it sets up the work and
1325 // variable arguments and wakes up the workers. The workers first check
1326 // whether they are required for the parallel region, i.e., whether they
1327 // fall within the number of requested parallel threads. The activated
1328 // workers load the variable arguments and execute the parallel work.
1329 //
1330
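 // A sketch of this state machine in pseudo-C (an illustration of the basic
 // blocks emitted below, not code that appears anywhere verbatim):
 //
 //   for (;;) {
 //     sync_cta();                                      // .await.work
 //     bool active = __kmpc_kernel_parallel(&work_fn, /*RuntimeInit=*/1);
 //     if (work_fn == NULL) break;                      // to .exit
 //     if (active) {                                    // .select.workers
 //       wrapper(/*ParallelLevel=*/0, tid);             // .execute.fn
 //       __kmpc_kernel_end_parallel();                  // .terminate.parallel
 //     }
 //     sync_cta();                                      // .barrier.parallel
 //   }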
1331 CGBuilderTy &Bld = CGF.Builder;
1332
1333 llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
1334 llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
1335 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
1336 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
1337 llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
1338 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1339
1340 CGF.EmitBranch(AwaitBB);
1341
1342 // Workers wait for work from master.
1343 CGF.EmitBlock(AwaitBB);
1344 // Wait for parallel work
1345 syncCTAThreads(CGF);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001346
1347 Address WorkFn =
1348 CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
1349 Address ExecStatus =
1350 CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
1351 CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
1352 CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));
1353
Jonas Hahnfeldfa059ba2017-12-27 10:39:56 +00001354 // TODO: Optimize runtime initialization and pass in correct value.
Gheorghe-Teodor Bercea7d80da12018-03-07 21:59:50 +00001355 llvm::Value *Args[] = {WorkFn.getPointer(),
Jonas Hahnfeldfa059ba2017-12-27 10:39:56 +00001356 /*RequiresOMPRuntime=*/Bld.getInt16(1)};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001357 llvm::Value *Ret = CGF.EmitRuntimeCall(
1358 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
1359 Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001360
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001361 // On termination condition (work_fn == NULL), exit the loop.
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001362 llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
1363 llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001364 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
1365
1366 // Activate requested workers.
1367 CGF.EmitBlock(SelectWorkersBB);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001368 llvm::Value *IsActive =
1369 Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
1370 Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001371
1372 // Signal start of parallel region.
1373 CGF.EmitBlock(ExecuteBB);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001374
1375 // Process work items: outlined parallel functions.
Alexey Bataev9ff80832018-04-16 20:16:21 +00001376 for (llvm::Function *W : Work) {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001377 // Try to match this outlined function.
Alexey Bataev9ff80832018-04-16 20:16:21 +00001378 llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001379
1380 llvm::Value *WorkFnMatch =
1381 Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");
1382
1383 llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
1384 llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
1385 Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);
1386
1387 // Execute this outlined function.
1388 CGF.EmitBlock(ExecuteFNBB);
1389
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001390 // Insert call to work function via shared wrapper. The shared
1391 // wrapper takes two arguments:
1392 // - the parallelism level;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001393 // - the thread ID;
1394 emitCall(CGF, WST.Loc, W,
1395 {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001396
1397 // Go to end of parallel region.
1398 CGF.EmitBranch(TerminateBB);
1399
1400 CGF.EmitBlock(CheckNextBB);
1401 }
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001402 // Default case: call the outlined function through a pointer if the target
1403 // region calls a declare target function that may contain an orphaned
1404 // parallel directive.
1405 auto *ParallelFnTy =
1406 llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
1407 /*isVarArg=*/false)
1408 ->getPointerTo();
1409 llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
1410 // Insert call to work function via shared wrapper. The shared
1411 // wrapper takes two arguments:
1412 // - the parallelism level;
1413 // - the thread ID;
1414 emitCall(CGF, WST.Loc, WorkFnCast,
1415 {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
1416 // Go to end of parallel region.
1417 CGF.EmitBranch(TerminateBB);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001418
1419 // Signal end of parallel region.
1420 CGF.EmitBlock(TerminateBB);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001421 CGF.EmitRuntimeCall(
1422 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
1423 llvm::None);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001424 CGF.EmitBranch(BarrierBB);
1425
1426 // All active and inactive workers wait at a barrier after parallel region.
1427 CGF.EmitBlock(BarrierBB);
1428 // Barrier after parallel region.
1429 syncCTAThreads(CGF);
1430 CGF.EmitBranch(AwaitBB);
1431
1432 // Exit target region.
1433 CGF.EmitBlock(ExitBB);
1434}
1435
Adrian Prantl9fc8faf2018-05-09 01:00:01 +00001436/// Returns specified OpenMP runtime function for the current OpenMP
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001437/// implementation. Specialized for the NVPTX device.
1438/// \param Function OpenMP runtime function.
1439/// \return Specified function.
1440llvm::Constant *
1441CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
1442 llvm::Constant *RTLFn = nullptr;
1443 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
1444 case OMPRTL_NVPTX__kmpc_kernel_init: {
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +00001445 // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
1446 // RequiresOMPRuntime);
1447 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001448 auto *FnTy =
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001449 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1450 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
1451 break;
1452 }
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001453 case OMPRTL_NVPTX__kmpc_kernel_deinit: {
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +00001454 // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
1455 llvm::Type *TypeParams[] = {CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001456 auto *FnTy =
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +00001457 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001458 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
1459 break;
1460 }
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001461 case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
1462 // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
Jonas Hahnfeld891c7fb2017-11-22 14:46:49 +00001463 // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001464 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001465 auto *FnTy =
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001466 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1467 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
1468 break;
1469 }
1470 case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
1471 // Build void __kmpc_spmd_kernel_deinit();
Alexey Bataev9ff80832018-04-16 20:16:21 +00001472 auto *FnTy =
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001473 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1474 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
1475 break;
1476 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001477 case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
1478 /// Build void __kmpc_kernel_prepare_parallel(
Gheorghe-Teodor Bercea7d80da12018-03-07 21:59:50 +00001479 /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001480 llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001481 auto *FnTy =
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001482 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1483 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
1484 break;
1485 }
1486 case OMPRTL_NVPTX__kmpc_kernel_parallel: {
Gheorghe-Teodor Bercea7d80da12018-03-07 21:59:50 +00001487 /// Build bool __kmpc_kernel_parallel(void **outlined_function,
1488 /// int16_t IsOMPRuntimeInitialized);
1489 llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001490 llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
Alexey Bataev9ff80832018-04-16 20:16:21 +00001491 auto *FnTy =
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001492 llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
1493 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
1494 break;
1495 }
1496 case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
1497 /// Build void __kmpc_kernel_end_parallel();
Alexey Bataev9ff80832018-04-16 20:16:21 +00001498 auto *FnTy =
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001499 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1500 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
1501 break;
1502 }
1503 case OMPRTL_NVPTX__kmpc_serialized_parallel: {
1504 // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
1505 // global_tid);
1506 llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001507 auto *FnTy =
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001508 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1509 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
1510 break;
1511 }
1512 case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
1513 // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
1514 // global_tid);
1515 llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001516 auto *FnTy =
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001517 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1518 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
1519 break;
1520 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001521 case OMPRTL_NVPTX__kmpc_shuffle_int32: {
1522 // Build int32_t __kmpc_shuffle_int32(int32_t element,
1523 // int16_t lane_offset, int16_t warp_size);
1524 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001525 auto *FnTy =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001526 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
1527 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
1528 break;
1529 }
1530 case OMPRTL_NVPTX__kmpc_shuffle_int64: {
1531 // Build int64_t __kmpc_shuffle_int64(int64_t element,
1532 // int16_t lane_offset, int16_t warp_size);
1533 llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001534 auto *FnTy =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001535 llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
1536 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
1537 break;
1538 }
1539 case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
1540 // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
1541 // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1542 // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1543 // lane_offset, int16_t AlgorithmVersion),
1544 // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1545 llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1546 CGM.Int16Ty, CGM.Int16Ty};
1547 auto *ShuffleReduceFnTy =
1548 llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1549 /*isVarArg=*/false);
1550 llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1551 auto *InterWarpCopyFnTy =
1552 llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1553 /*isVarArg=*/false);
1554 llvm::Type *TypeParams[] = {CGM.Int32Ty,
1555 CGM.Int32Ty,
1556 CGM.SizeTy,
1557 CGM.VoidPtrTy,
1558 ShuffleReduceFnTy->getPointerTo(),
1559 InterWarpCopyFnTy->getPointerTo()};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001560 auto *FnTy =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001561 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1562 RTLFn = CGM.CreateRuntimeFunction(
1563 FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
1564 break;
1565 }
Alexey Bataevfac26cf2018-05-02 20:03:27 +00001566 case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
1567 // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
1568 // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
1569 // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1570 // lane_offset, int16_t AlgorithmVersion),
1571 // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
1572 llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1573 CGM.Int16Ty, CGM.Int16Ty};
1574 auto *ShuffleReduceFnTy =
1575 llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1576 /*isVarArg=*/false);
1577 llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1578 auto *InterWarpCopyFnTy =
1579 llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1580 /*isVarArg=*/false);
1581 llvm::Type *TypeParams[] = {CGM.Int32Ty,
1582 CGM.Int32Ty,
1583 CGM.SizeTy,
1584 CGM.VoidPtrTy,
1585 ShuffleReduceFnTy->getPointerTo(),
1586 InterWarpCopyFnTy->getPointerTo()};
1587 auto *FnTy =
1588 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1589 RTLFn = CGM.CreateRuntimeFunction(
1590 FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
1591 break;
1592 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001593 case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
1594 // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
1595 // int32_t num_vars, size_t reduce_size, void *reduce_data,
1596 // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
1597 // lane_offset, int16_t shortCircuit),
1598 // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
1599 // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
1600 // int32_t index, int32_t width),
1601 // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
1602 // int32_t index, int32_t width, int32_t reduce))
1603 llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
1604 CGM.Int16Ty, CGM.Int16Ty};
1605 auto *ShuffleReduceFnTy =
1606 llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
1607 /*isVarArg=*/false);
1608 llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
1609 auto *InterWarpCopyFnTy =
1610 llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
1611 /*isVarArg=*/false);
1612 llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
1613 CGM.Int32Ty, CGM.Int32Ty};
1614 auto *CopyToScratchpadFnTy =
1615 llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
1616 /*isVarArg=*/false);
1617 llvm::Type *LoadReduceTypeParams[] = {
1618 CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
1619 auto *LoadReduceFnTy =
1620 llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
1621 /*isVarArg=*/false);
1622 llvm::Type *TypeParams[] = {CGM.Int32Ty,
1623 CGM.Int32Ty,
1624 CGM.SizeTy,
1625 CGM.VoidPtrTy,
1626 ShuffleReduceFnTy->getPointerTo(),
1627 InterWarpCopyFnTy->getPointerTo(),
1628 CopyToScratchpadFnTy->getPointerTo(),
1629 LoadReduceFnTy->getPointerTo()};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001630 auto *FnTy =
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001631 llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
1632 RTLFn = CGM.CreateRuntimeFunction(
1633 FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
1634 break;
1635 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001636 case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
1637 // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
1638 llvm::Type *TypeParams[] = {CGM.Int32Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001639 auto *FnTy =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001640 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1641 RTLFn = CGM.CreateRuntimeFunction(
1642 FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
1643 break;
1644 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001645 case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
1646 /// Build void __kmpc_data_sharing_init_stack();
Alexey Bataev9ff80832018-04-16 20:16:21 +00001647 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001648 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1649 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
1650 break;
1651 }
Gheorghe-Teodor Berceaad4e5792018-07-13 16:18:24 +00001652 case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
1653 /// Build void __kmpc_data_sharing_init_stack_spmd();
1654 auto *FnTy =
1655 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001656 RTLFn =
1657 CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
Gheorghe-Teodor Berceaad4e5792018-07-13 16:18:24 +00001658 break;
1659 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001660 case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
1661 // Build void *__kmpc_data_sharing_push_stack(size_t size,
1662 // int16_t UseSharedMemory);
1663 llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001664 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001665 llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
1666 RTLFn = CGM.CreateRuntimeFunction(
1667 FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
1668 break;
1669 }
1670 case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
1671 // Build void __kmpc_data_sharing_pop_stack(void *a);
1672 llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001673 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001674 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
1675 RTLFn = CGM.CreateRuntimeFunction(FnTy,
1676 /*Name=*/"__kmpc_data_sharing_pop_stack");
1677 break;
1678 }
1679 case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
1680 /// Build void __kmpc_begin_sharing_variables(void ***args,
1681 /// size_t n_args);
1682 llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001683 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001684 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1685 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
1686 break;
1687 }
1688 case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
1689 /// Build void __kmpc_end_sharing_variables();
Alexey Bataev9ff80832018-04-16 20:16:21 +00001690 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001691 llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
1692 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
1693 break;
1694 }
1695 case OMPRTL_NVPTX__kmpc_get_shared_variables: {
1696 /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
1697 llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
Alexey Bataev9ff80832018-04-16 20:16:21 +00001698 auto *FnTy =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001699 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
1700 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
1701 break;
1702 }
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001703 case OMPRTL_NVPTX__kmpc_parallel_level: {
1704 // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
1705 llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
1706 auto *FnTy =
1707 llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
1708 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
1709 break;
1710 }
Alexey Bataev673110d2018-05-16 13:36:30 +00001711 case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
1712 // Build int8_t __kmpc_is_spmd_exec_mode();
1713 auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
1714 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
1715 break;
1716 }
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001717 }
1718 return RTLFn;
1719}
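// The returned constant is always fed straight into a runtime call; a
// representative use from this file:
//
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
//       llvm::None);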
1720
1721void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1722 llvm::Constant *Addr,
Alexey Bataev03f270c2018-03-30 18:31:07 +00001723 uint64_t Size, int32_t,
1724 llvm::GlobalValue::LinkageTypes) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001725 // TODO: Add support for global variables on the device after declare target
1726 // support.
Alexey Bataev9ff80832018-04-16 20:16:21 +00001727 if (!isa<llvm::Function>(Addr))
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001728 return;
Alexey Bataev9ff80832018-04-16 20:16:21 +00001729 llvm::Module &M = CGM.getModule();
1730 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001731
1732 // Get "nvvm.annotations" metadata node
Alexey Bataev9ff80832018-04-16 20:16:21 +00001733 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001734
1735 llvm::Metadata *MDVals[] = {
Alexey Bataev9ff80832018-04-16 20:16:21 +00001736 llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001737 llvm::ConstantAsMetadata::get(
1738 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1739 // Append metadata to nvvm.annotations
1740 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1741}
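// The appended metadata marks the entry point as a kernel for the NVPTX
// backend; for a hypothetical entry @target_fn it reads roughly
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void (...)* @target_fn, !"kernel", i32 1}
//
// where the function type follows the actual kernel signature.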
1742
1743void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1744 const OMPExecutableDirective &D, StringRef ParentName,
1745 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Alexey Bataev14fa1c62016-03-29 05:34:15 +00001746 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001747 if (!IsOffloadEntry) // Nothing to do.
1748 return;
1749
1750 assert(!ParentName.empty() && "Invalid target region parent name!");
1751
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001752 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001753 if (Mode)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001754 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001755 CodeGen);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001756 else
1757 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1758 CodeGen);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001759
1760 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001761}
1762
Samuel Antao45bfe4c2016-02-08 15:59:20 +00001763CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001764 : CGOpenMPRuntime(CGM, "_", "$") {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001765 if (!CGM.getLangOpts().OpenMPIsDevice)
1766 llvm_unreachable("OpenMP NVPTX can only handle device code.");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001767}
Carlo Bertollic6872252016-04-04 15:55:02 +00001768
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001769void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1770 OpenMPProcBindClauseKind ProcBind,
1771 SourceLocation Loc) {
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001772 // Do nothing in case of SPMD mode and an L0 (outermost) parallel region.
Alexey Bataev2a3320a2018-05-15 18:01:01 +00001773 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001774 return;
1775
1776 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1777}
1778
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001779void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1780 llvm::Value *NumThreads,
1781 SourceLocation Loc) {
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001782 // Do nothing in case of SPMD mode and an L0 (outermost) parallel region.
Alexey Bataev2a3320a2018-05-15 18:01:01 +00001783 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001784 return;
1785
1786 CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1787}
1788
Carlo Bertollic6872252016-04-04 15:55:02 +00001789void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1790 const Expr *NumTeams,
1791 const Expr *ThreadLimit,
1792 SourceLocation Loc) {}
1793
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001794llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1795 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1796 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001797 // Emit target region as a standalone region.
1798 class NVPTXPrePostActionTy : public PrePostActionTy {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001799 bool &IsInParallelRegion;
1800 bool PrevIsInParallelRegion;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001801
1802 public:
Alexey Bataevb99dcb52018-07-09 17:43:58 +00001803 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1804 : IsInParallelRegion(IsInParallelRegion) {}
Alexey Bataevc99042b2018-03-15 18:10:54 +00001805 void Enter(CodeGenFunction &CGF) override {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001806 PrevIsInParallelRegion = IsInParallelRegion;
1807 IsInParallelRegion = true;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001808 }
1809 void Exit(CodeGenFunction &CGF) override {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001810 IsInParallelRegion = PrevIsInParallelRegion;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001811 }
Alexey Bataevb99dcb52018-07-09 17:43:58 +00001812 } Action(IsInParallelRegion);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001813 CodeGen.setAction(Action);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001814 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1815 IsInTargetMasterThreadRegion = false;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001816 auto *OutlinedFun =
1817 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1818 D, ThreadIDVar, InnermostKind, CodeGen));
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001819 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001820 if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1821 !IsInParallelRegion) {
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001822 llvm::Function *WrapperFun =
1823 createParallelDataSharingWrapper(OutlinedFun, D);
1824 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1825 }
1826
1827 return OutlinedFun;
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001828}
1829
Alexey Bataev2adecff2018-09-21 14:22:53 +00001830/// Get list of lastprivate variables from the teams distribute ... or
1831/// teams {distribute ...} directives.
1832static void
1833getDistributeLastprivateVars(const OMPExecutableDirective &D,
1834 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1835 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1836 "expected teams directive.");
1837 const OMPExecutableDirective *Dir = &D;
1838 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1839 if (const Stmt *S = getSingleCompoundChild(
1840 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1841 /*IgnoreCaptured=*/true))) {
1842 Dir = dyn_cast<OMPExecutableDirective>(S);
1843 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1844 Dir = nullptr;
1845 }
1846 }
1847 if (!Dir)
1848 return;
1849 for (const OMPLastprivateClause *C :
1850 Dir->getClausesOfKind<OMPLastprivateClause>()) {
1851 for (const Expr *E : C->getVarRefs()) {
1852 const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
1853 Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
1854 }
1855 }
1856}
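// Illustrative input (hypothetical user code):
//
//   #pragma omp target teams
//   #pragma omp distribute lastprivate(x)
//   for (int i = 0; i < N; ++i)
//     x = i;
//
// Starting from the teams directive, the helper above locates the nested
// distribute directive and appends the canonical declaration of 'x' to Vars.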
1857
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001858llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
Carlo Bertollic6872252016-04-04 15:55:02 +00001859 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1860 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Stephen Kellyf2ceec42018-08-09 21:08:08 +00001861 SourceLocation Loc = D.getBeginLoc();
Carlo Bertollic6872252016-04-04 15:55:02 +00001862
Alexey Bataev2adecff2018-09-21 14:22:53 +00001863 const RecordDecl *GlobalizedRD = nullptr;
1864 llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
1865 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
1866 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
1867 getDistributeLastprivateVars(D, LastPrivates);
1868 if (!LastPrivates.empty())
1869 GlobalizedRD = buildRecordForGlobalizedVars(
1870 CGM.getContext(), LastPrivates, MappedDeclsFields);
1871 }
1872
Alexey Bataevc99042b2018-03-15 18:10:54 +00001873 // Emit target region as a standalone region.
1874 class NVPTXPrePostActionTy : public PrePostActionTy {
1875 SourceLocation &Loc;
Alexey Bataev2adecff2018-09-21 14:22:53 +00001876 const RecordDecl *GlobalizedRD;
1877 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1878 &MappedDeclsFields;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001879
1880 public:
Alexey Bataev2adecff2018-09-21 14:22:53 +00001881 NVPTXPrePostActionTy(
1882 SourceLocation &Loc, const RecordDecl *GlobalizedRD,
1883 llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
1884 &MappedDeclsFields)
1885 : Loc(Loc), GlobalizedRD(GlobalizedRD),
1886 MappedDeclsFields(MappedDeclsFields) {}
Alexey Bataevc99042b2018-03-15 18:10:54 +00001887 void Enter(CodeGenFunction &CGF) override {
Alexey Bataev2adecff2018-09-21 14:22:53 +00001888 auto &Rt =
1889 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
1890 if (GlobalizedRD) {
1891 auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
1892 I->getSecond().GlobalRecord = GlobalizedRD;
1893 I->getSecond().MappedParams =
1894 llvm::make_unique<CodeGenFunction::OMPMapVars>();
1895 DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
1896 for (const auto &Pair : MappedDeclsFields) {
1897 assert(Pair.getFirst()->isCanonicalDecl() &&
1898 "Expected canonical declaration");
1899 Data.insert(std::make_pair(
1900 Pair.getFirst(),
1901 std::make_pair(Pair.getSecond(), Address::invalid())));
1902 }
1903 }
1904 Rt.emitGenericVarsProlog(CGF, Loc);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001905 }
1906 void Exit(CodeGenFunction &CGF) override {
1907 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1908 .emitGenericVarsEpilog(CGF);
1909 }
Alexey Bataev2adecff2018-09-21 14:22:53 +00001910 } Action(Loc, GlobalizedRD, MappedDeclsFields);
1911 CodeGen.setAction(Action);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001912 llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1913 D, ThreadIDVar, InnermostKind, CodeGen);
1914 llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1915 OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
Mehdi Amini6aa9e9b2017-05-29 05:38:20 +00001916 OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001917 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
Carlo Bertollic6872252016-04-04 15:55:02 +00001918
1919 return OutlinedFun;
1920}
1921
Alexey Bataevc99042b2018-03-15 18:10:54 +00001922void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00001923 SourceLocation Loc,
1924 bool WithSPMDCheck) {
Alexey Bataev2adecff2018-09-21 14:22:53 +00001925 if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
1926 getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001927 return;
1928
Alexey Bataevc99042b2018-03-15 18:10:54 +00001929 CGBuilderTy &Bld = CGF.Builder;
1930
1931 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1932 if (I == FunctionGlobalizedDecls.end())
1933 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001934 if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1935 QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001936
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001937 // Recover pointer to this function's global record. The runtime will
1938 // handle the specifics of the allocation of the memory.
1939 // Use actual memory size of the record including the padding
1940 // for alignment purposes.
1941 unsigned Alignment =
1942 CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1943 unsigned GlobalRecordSize =
1944 CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1945 GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001946
1947 llvm::Value *GlobalRecCastAddr;
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00001948 if (WithSPMDCheck ||
1949 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001950 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
1951 llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
1952 llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
1953 llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
1954 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
1955 Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
1956 // There is no need to emit line number for unconditional branch.
1957 (void)ApplyDebugLocation::CreateEmpty(CGF);
1958 CGF.EmitBlock(SPMDBB);
1959 Address RecPtr = CGF.CreateMemTemp(RecTy, "_local_stack");
1960 CGF.EmitBranch(ExitBB);
1961 // There is no need to emit line number for unconditional branch.
1962 (void)ApplyDebugLocation::CreateEmpty(CGF);
1963 CGF.EmitBlock(NonSPMDBB);
1964 // TODO: allow the usage of shared memory to be controlled by
1965 // the user, for now, default to global.
1966 llvm::Value *GlobalRecordSizeArg[] = {
1967 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1968 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1969 llvm::Value *GlobalRecValue =
1970 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1971 OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1972 GlobalRecordSizeArg);
1973 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1974 GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1975 CGF.EmitBlock(ExitBB);
1976 auto *Phi = Bld.CreatePHI(GlobalRecCastAddr->getType(),
1977 /*NumReservedValues=*/2, "_select_stack");
1978 Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
1979 Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
1980 GlobalRecCastAddr = Phi;
1981 I->getSecond().GlobalRecordAddr = Phi;
1982 I->getSecond().IsInSPMDModeFlag = IsSPMD;
1983 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00001984 // TODO: allow the usage of shared memory to be controlled by
1985 // the user, for now, default to global.
1986 llvm::Value *GlobalRecordSizeArg[] = {
1987 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1988 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1989 llvm::Value *GlobalRecValue =
1990 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1991 OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1992 GlobalRecordSizeArg);
1993 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1994 GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1995 I->getSecond().GlobalRecordAddr = GlobalRecValue;
1996 I->getSecond().IsInSPMDModeFlag = nullptr;
1997 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001998 LValue Base =
1999 CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002000
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002001 // Emit the "global alloca" which is a GEP from the global declaration
2002 // record using the pointer returned by the runtime.
2003 for (auto &Rec : I->getSecond().LocalVarData) {
2004 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2005 llvm::Value *ParValue;
2006 if (EscapedParam) {
2007 const auto *VD = cast<VarDecl>(Rec.first);
2008 LValue ParLVal =
2009 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2010 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2011 }
2012 const FieldDecl *FD = Rec.second.first;
2013 LValue VarAddr = CGF.EmitLValueForField(Base, FD);
2014 Rec.second.second = VarAddr.getAddress();
2015 if (EscapedParam) {
2016 const auto *VD = cast<VarDecl>(Rec.first);
2017 CGF.EmitStoreOfScalar(ParValue, VarAddr);
2018 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2019 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00002020 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002021 }
2022 for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2023 // Recover pointer to this function's global record. The runtime will
2024 // handle the specifics of the allocation of the memory.
2025 // Use actual memory size of the record including the padding
2026 // for alignment purposes.
Alexey Bataev9ff80832018-04-16 20:16:21 +00002027 CGBuilderTy &Bld = CGF.Builder;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002028 llvm::Value *Size = CGF.getTypeSize(VD->getType());
2029 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2030 Size = Bld.CreateNUWAdd(
2031 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2032 llvm::Value *AlignVal =
2033 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2034 Size = Bld.CreateUDiv(Size, AlignVal);
2035 Size = Bld.CreateNUWMul(Size, AlignVal);
2036 // TODO: allow the usage of shared memory to be controlled by
2037 // the user, for now, default to global.
2038 llvm::Value *GlobalRecordSizeArg[] = {
2039 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2040 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2041 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
2042 GlobalRecordSizeArg);
2043 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2044 GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2045 LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2046 CGM.getContext().getDeclAlign(VD),
2047 AlignmentSource::Decl);
2048 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2049 Base.getAddress());
2050 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002051 }
2052 I->getSecond().MappedParams->apply(CGF);
2053}
2054
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00002055void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2056 bool WithSPMDCheck) {
Alexey Bataev2adecff2018-09-21 14:22:53 +00002057 if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2058 getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002059 return;
2060
Alexey Bataevc99042b2018-03-15 18:10:54 +00002061 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002062 if (I != FunctionGlobalizedDecls.end()) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00002063 I->getSecond().MappedParams->restore(CGF);
2064 if (!CGF.HaveInsertPoint())
2065 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002066 for (llvm::Value *Addr :
2067 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2068 CGF.EmitRuntimeCall(
2069 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2070 Addr);
2071 }
2072 if (I->getSecond().GlobalRecordAddr) {
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00002073 if (WithSPMDCheck ||
2074 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown) {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002075 CGBuilderTy &Bld = CGF.Builder;
2076 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2077 llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2078 Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2079 // There is no need to emit line number for unconditional branch.
2080 (void)ApplyDebugLocation::CreateEmpty(CGF);
2081 CGF.EmitBlock(NonSPMDBB);
2082 CGF.EmitRuntimeCall(
2083 createNVPTXRuntimeFunction(
2084 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2085 CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2086 CGF.EmitBlock(ExitBB);
2087 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002088 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2089 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2090 I->getSecond().GlobalRecordAddr);
2091 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002092 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00002093 }
2094}
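// Taken together, the prolog and epilog bracket the region roughly like this
// (a conceptual sketch; sizes, casts, and the SPMD check are emitted as shown
// above):
//
//   void *rec = __kmpc_data_sharing_push_stack(aligned_sizeof(GlobalRec),
//                                              /*UseSharedMemory=*/0);
//   ... escaped locals are accessed through the fields of rec ...
//   __kmpc_data_sharing_pop_stack(rec);
//
// Variable-length escapees get their own push in the prolog and are popped in
// reverse order in the epilog; on the SPMD path of the runtime check, a plain
// alloca ("_local_stack") is used instead and the pop is skipped.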
2095
Carlo Bertollic6872252016-04-04 15:55:02 +00002096void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2097 const OMPExecutableDirective &D,
2098 SourceLocation Loc,
2099 llvm::Value *OutlinedFn,
2100 ArrayRef<llvm::Value *> CapturedVars) {
2101 if (!CGF.HaveInsertPoint())
2102 return;
2103
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002104 Address ZeroAddr = CGF.CreateMemTemp(
2105 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2106 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00002107 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2108 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002109 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00002110 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2111 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00002112 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00002113}
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002114
2115void CGOpenMPRuntimeNVPTX::emitParallelCall(
2116 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2117 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2118 if (!CGF.HaveInsertPoint())
2119 return;
2120
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002121 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002122 emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002123 else
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002124 emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002125}
2126
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002127void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002128 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2129 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2130 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002131
2132 // Force inline this outlined function at its call site.
2133 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2134
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002135 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2136 /*DestWidth=*/32, /*Signed=*/1),
2137 ".zero.addr");
2138 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002139 // ThreadId for serialized parallels is 0.
2140 Address ThreadIDAddr = ZeroAddr;
2141 auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002142 CodeGenFunction &CGF, PrePostActionTy &Action) {
2143 Action.Enter(CGF);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002144
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002145 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2146 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2147 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2148 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2149 emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2150 };
2151 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2152 PrePostActionTy &) {
2153
2154 RegionCodeGenTy RCG(CodeGen);
2155 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2156 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2157 llvm::Value *Args[] = {RTLoc, ThreadID};
2158
2159 NVPTXActionTy Action(
2160 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2161 Args,
2162 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2163 Args);
2164 RCG.setAction(Action);
2165 RCG(CGF);
2166 };
2167
2168 auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2169 PrePostActionTy &Action) {
2170 CGBuilderTy &Bld = CGF.Builder;
2171 llvm::Function *WFn = WrapperFunctionsMap[Fn];
2172 assert(WFn && "Wrapper function does not exist!");
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002173 llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2174
2175 // Prepare for parallel region. Indicate the outlined function.
2176 llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002177 CGF.EmitRuntimeCall(
2178 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2179 Args);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002180
2181 // Create a private scope that will globalize the arguments
2182 // passed from the outside of the target region.
2183 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2184
2185 // There's something to share.
2186 if (!CapturedVars.empty()) {
2187 // Set up the buffer of references to the captured variables to share.
2188 Address SharedArgs =
2189 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2190 llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2191
2192 llvm::Value *DataSharingArgs[] = {
2193 SharedArgsPtr,
2194 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2195 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2196 OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2197 DataSharingArgs);
2198
2199 // Store variable address in a list of references to pass to workers.
2200 unsigned Idx = 0;
2201 ASTContext &Ctx = CGF.getContext();
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002202 Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2203 SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2204 .castAs<PointerType>());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002205 for (llvm::Value *V : CapturedVars) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002206 Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2207 CGF.getPointerSize());
2208 llvm::Value *PtrV;
Alexey Bataev17314212018-03-20 15:41:05 +00002209 if (V->getType()->isIntegerTy())
2210 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2211 else
2212 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002213 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2214 Ctx.getPointerType(Ctx.VoidPtrTy));
Alexey Bataevc99042b2018-03-15 18:10:54 +00002215 ++Idx;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002216 }
2217 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002218
2219 // Activate workers. This barrier is used by the master to signal
2220 // work for the workers.
2221 syncCTAThreads(CGF);
2222
2223 // OpenMP [2.5, Parallel Construct, p.49]
2224 // There is an implied barrier at the end of a parallel region. After the
2225 // end of a parallel region, only the master thread of the team resumes
2226 // execution of the enclosing task region.
2227 //
2228 // The master waits at this barrier until all workers are done.
2229 syncCTAThreads(CGF);
2230
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002231 if (!CapturedVars.empty())
2232 CGF.EmitRuntimeCall(
2233 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2234
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002235 // Remember for post-processing in worker loop.
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002236 Work.emplace_back(WFn);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002237 };

  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
                             CodeGenFunction &CGF, PrePostActionTy &Action) {
    if (IsInParallelRegion) {
      SeqGen(CGF, Action);
    } else if (IsInTargetMasterThreadRegion) {
      L0ParallelGen(CGF, Action);
    } else {
      // Check for master and then parallelism:
      // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
      //   Serialized execution.
      // } else {
      //   Worker call.
      // }
      CGBuilderTy &Bld = CGF.Builder;
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
      llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
      llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(ParallelCheckBB);
      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
      llvm::Value *ThreadID = getThreadID(CGF, Loc);
      llvm::Value *PL = CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
          {RTLoc, ThreadID});
      llvm::Value *Res = Bld.CreateIsNotNull(PL);
      Bld.CreateCondBr(Res, SeqBB, MasterBB);
      CGF.EmitBlock(SeqBB);
      SeqGen(CGF, Action);
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(MasterBB);
      L0ParallelGen(CGF, Action);
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      // Emit the continuation block for code after the if.
      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
    }
  };

  if (IfCond) {
    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
  } else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(LNParallelGen);
    ThenRCG(CGF);
  }
}
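
// Illustrative sketch (not emitted verbatim) of what the level-0 worker path
// above lowers to; 'wrapper_fn' and 'cta_barrier' are stand-in names for the
// worker wrapper of the outlined function and for syncCTAThreads:
//
//   __kmpc_kernel_prepare_parallel(wrapper_fn, /*RequiresOMPRuntime=*/1);
//   if (<there are captured variables>) {
//     __kmpc_begin_sharing_variables(&shared_args, <number of captures>);
//     <store the address of each capture into shared_args>
//   }
//   cta_barrier();   // Master signals the workers to start.
//   cta_barrier();   // Master waits until all workers are done.
//   if (<there are captured variables>)
//     __kmpc_end_sharing_variables();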

void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;

  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
                                           /*DestWidth=*/32, /*Signed=*/1),
                                       ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // ThreadId for serialized parallels is 0.
  Address ThreadIDAddr = ZeroAddr;
  auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
                    &ThreadIDAddr](CodeGenFunction &CGF,
                                   PrePostActionTy &Action) {
    Action.Enter(CGF);

    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
    emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
  };
  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
                                        PrePostActionTy &) {
    RegionCodeGenTy RCG(CodeGen);
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *ThreadID = getThreadID(CGF, Loc);
    llvm::Value *Args[] = {RTLoc, ThreadID};

    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IsInTargetMasterThreadRegion) {
    // In the worker we need to use the real thread id.
    ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
    RegionCodeGenTy RCG(CodeGen);
    RCG(CGF);
  } else {
    // If we are not in the target region, it is definitely L2 parallelism or
    // more, because in SPMD mode we always have an L1 parallel level, so we
    // don't need to check for orphaned directives.
    RegionCodeGenTy RCG(SeqGen);
    RCG(CGF);
  }
}
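
// In short (an illustrative summary, not emitted verbatim): an L1 parallel
// region in SPMD mode becomes a direct call
//   outlined_fn(&gtid, &zero, <captures>...);
// whereas any deeper level is executed serially, bracketed by
// __kmpc_serialized_parallel and __kmpc_end_serialized_parallel.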

void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
    CodeGenFunction &CGF, StringRef CriticalName,
    const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
    const Expr *Hint) {
  llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
  llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
  llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
  llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");

  // Fetch team-local id of the thread.
  llvm::Value *ThreadID = getNVPTXThreadID(CGF);

  // Get the width of the team.
  llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);

  // Initialize the counter variable for the loop.
  QualType Int32Ty =
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
  Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
  LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
  CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
                        /*isInit=*/true);

  // Block checks whether the loop counter is still below the upper bound
  // (the team width).
  CGF.EmitBlock(LoopBB);
  llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
  CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);

  // Block tests which single thread should execute the region, and which
  // threads should go straight to the synchronisation point.
  CGF.EmitBlock(TestBB);
  CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
  llvm::Value *CmpThreadToCounter =
      CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
  CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);

  // Block emits the body of the critical region.
  CGF.EmitBlock(BodyBB);

  // Output the critical statement.
  CriticalOpGen(CGF);

  // After the body surrounded by the critical region, the single executing
  // thread will jump to the synchronisation point.
  // Block waits for all threads in the current team to finish, then increments
  // the counter variable and returns to the loop.
  CGF.EmitBlock(SyncBB);
  getNVPTXCTABarrier(CGF);

  llvm::Value *IncCounterVal =
      CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
  CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
  CGF.EmitBranch(LoopBB);

  // Block that is reached when all threads in the team complete the region.
  CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
}
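
// The emitted control flow corresponds roughly to this C-like sketch
// (illustrative only; 'tid' is the team-local thread id):
//
//   for (counter = 0; counter < team_width; ++counter) {
//     if (tid == counter)
//       <critical region body>   // Exactly one thread executes per turn.
//     cta_barrier();             // Everyone waits before the next turn.
//   }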

/// Cast value to the specified type.
static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
                                    QualType ValTy, QualType CastTy,
                                    SourceLocation Loc) {
  assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
         "Cast type must be sized.");
  assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
         "Val type must be sized.");
  llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
  if (ValTy == CastTy)
    return Val;
  if (CGF.getContext().getTypeSizeInChars(ValTy) ==
      CGF.getContext().getTypeSizeInChars(CastTy))
    return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
  if (CastTy->isIntegerType() && ValTy->isIntegerType())
    return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
                                     CastTy->hasSignedIntegerRepresentation());
  Address CastItem = CGF.CreateMemTemp(CastTy);
  Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
  CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
  return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
}
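
// For example (hypothetical types, not taken from the source): a 'float'
// becomes a 32-bit integer via bitcast (equal sizes), an 'int16_t' via an
// integer cast, and any other combination is funnelled through a stack
// temporary that is stored with the source type and reloaded with the
// destination type.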

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  CodeGenModule &CGM = CGF.CGM;
  CGBuilderTy &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  llvm::Value *WarpSize =
      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
      RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});

  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
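
// For a 'double' this produces, roughly (illustrative pseudo-IR):
//   %cast = bitcast double %elem to i64
//   %res  = call i64 @__kmpc_shuffle_int64(i64 %cast, i16 %offset, i16 %ws)
//   %val  = bitcast i64 %res to double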
2466
Alexey Bataev12c62902018-06-22 19:10:38 +00002467static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2468 Address DestAddr, QualType ElemType,
2469 llvm::Value *Offset, SourceLocation Loc) {
2470 CGBuilderTy &Bld = CGF.Builder;
2471
2472 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2473 // Create the loop over the big sized data.
2474 // ptr = (void*)Elem;
2475 // ptrEnd = (void*) Elem + 1;
2476 // Step = 8;
2477 // while (ptr + Step < ptrEnd)
2478 // shuffle((int64_t)*ptr);
2479 // Step = 4;
2480 // while (ptr + Step < ptrEnd)
2481 // shuffle((int32_t)*ptr);
2482 // ...
2483 Address ElemPtr = DestAddr;
2484 Address Ptr = SrcAddr;
2485 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2486 Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2487 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2488 if (Size < CharUnits::fromQuantity(IntSize))
2489 continue;
2490 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2491 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2492 /*Signed=*/1);
2493 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2494 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2495 ElemPtr =
2496 Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2497 if (Size.getQuantity() / IntSize > 1) {
2498 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2499 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2500 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2501 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2502 CGF.EmitBlock(PreCondBB);
2503 llvm::PHINode *PhiSrc =
2504 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2505 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2506 llvm::PHINode *PhiDest =
2507 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2508 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2509 Ptr = Address(PhiSrc, Ptr.getAlignment());
2510 ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2511 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2512 PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2513 Ptr.getPointer(), CGF.VoidPtrTy));
2514 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2515 ThenBB, ExitBB);
2516 CGF.EmitBlock(ThenBB);
2517 llvm::Value *Res = createRuntimeShuffleFunction(
2518 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2519 IntType, Offset, Loc);
2520 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2521 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2522 ElemPtr =
2523 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2524 PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2525 PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2526 CGF.EmitBranch(PreCondBB);
2527 CGF.EmitBlock(ExitBB);
2528 } else {
2529 llvm::Value *Res = createRuntimeShuffleFunction(
2530 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2531 IntType, Offset, Loc);
2532 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2533 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2534 ElemPtr =
2535 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2536 }
2537 Size = Size % IntSize;
2538 }
2539}
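
// Worked example (hypothetical): for a 12-byte element the loop emits a
// single 64-bit shuffle for bytes 0-7 (12 / 8 == 1, so no inner loop is
// needed), then a single 32-bit shuffle for the remaining 4 bytes, advancing
// both pointers as it goes; chunk sizes larger than what remains are skipped.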

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterates, element by element, through the source Reduce list and
  // makes a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      if (Private->getType()->isScalarType()) {
        llvm::Value *Elem =
            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                                 Private->getType(), Private->getExprLoc());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                              Private->getType());
      } else {
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance.
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
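
// Example (illustrative): for RemoteLaneToThread with RemoteLaneOffset == 4,
// every element is shuffled in from lane 'lane_id + 4' into a fresh stack
// temporary, and the corresponding destination Reduce list slot is updated
// to point at that temporary.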

/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
/// load_and_reduce(local, scratchpad, index, width, should_reduce)
///   reduce_data remote;
///   for elem in remote:
///     remote.elem = Scratchpad[elem_id][index]
///   if (should_reduce)
///     local = local @ remote
///   else
///     local = remote
static llvm::Value *emitReduceScratchpadFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce;
  // if should_reduce == 0 (or otherwise), then it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                    Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}
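
// The generated helper corresponds roughly to this C-like sketch (names
// illustrative):
//
//   void _omp_reduction_load_and_reduce(void *local, void *scratchpad,
//                                       int32_t index, int32_t width,
//                                       int32_t should_reduce) {
//     reduce_data remote = <gather scratchpad[...][index] element-wise>;
//     if (should_reduce)
//       reduce_function(local, &remote);
//     else
//       <copy remote into local>;
//   }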

/// This function emits a helper that stores reduced data from the team
/// master to a scratchpad array in global memory.
///
///  for elem in Reduce List:
///    scratchpad[elem_id][index] = elem
///
static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                         ArrayRef<const Expr *> Privates,
                                         QualType ReductionArrayTy,
                                         SourceLocation Loc) {

  ASTContext &C = CGM.getContext();
  QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);

  // Source of the copy.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // A destination index into the scratchpad array, typically the team
  // identifier.
  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address SrcDataAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
      CGF.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, DestDataAddr,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  CGF.FinishFunction();
  return Fn;
}
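
// Sketch of the emitted helper (illustrative):
//
//   void _omp_reduction_copy_to_scratchpad(void *reduce_list, void *scratchpad,
//                                          int32_t index, int32_t width) {
//     for each element E in reduce_list:
//       scratchpad[E's row][index] = E;   // one column per team
//   }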

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // elem = *elemptr
    // *MediumPtr = elem
    if (Private->getType()->isScalarType()) {
      llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
                                               Private->getType(), Loc);
      // Store the source element value to the dest element address.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
                            Private->getType());
    } else {
      // The destination lvalue (the transfer medium slot) comes first.
      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(MediumPtr, Private->getType()),
                            CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            Private->getType(), AggValueSlot::DoesNotOverlap);
    }

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    llvm::Value *IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    if (Private->getType()->isScalarType()) {
      llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
          SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            Private->getType());
    } else {
      // The destination lvalue (the local Reduce list element) comes first.
      CGF.EmitAggregateCopy(
          CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
          CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
          Private->getType(), AggValueSlot::DoesNotOverlap);
    }
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
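
// Putting the pieces together, the generated helper behaves roughly like
// (illustrative sketch):
//
//   void _omp_reduction_inter_warp_copy_func(void *reduce_list, int num_warps) {
//     for each element E in reduce_list:
//       if (lane_id == 0)                  // Warp masters publish their value.
//         transfer_medium[warp_id] = E;
//       sync(num_warps * warp_size);
//       if (thread_id < num_warps)         // Threads of warp 0 gather them.
//         E = transfer_medium[thread_id];
//       sync(num_warps * warp_size);
//   }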
3178
3179/// Emit a helper that reduces data across two OpenMP threads (lanes)
3180/// in the same warp. It uses shuffle instructions to copy over data from
3181/// a remote lane's stack. The reduction algorithm performed is specified
3182/// by the fourth parameter.
3183///
3184/// Algorithm Versions.
3185/// Full Warp Reduce (argument value 0):
3186/// This algorithm assumes that all 32 lanes are active and gathers
3187/// data from these 32 lanes, producing a single resultant value.
3188/// Contiguous Partial Warp Reduce (argument value 1):
3189/// This algorithm assumes that only a *contiguous* subset of lanes
3190/// are active. This happens for the last warp in a parallel region
3191/// when the user specified num_threads is not an integer multiple of
3192/// 32. This contiguous subset always starts with the zeroth lane.
3193/// Partial Warp Reduce (argument value 2):
3194/// This algorithm gathers data from any number of lanes at any position.
3195/// All reduced values are stored in the lowest possible lane. The set
3196/// of problems every algorithm addresses is a super set of those
3197/// addressable by algorithms with a lower version number. Overhead
3198/// increases as algorithm version increases.
3199///
3200/// Terminology
3201/// Reduce element:
3202/// Reduce element refers to the individual data field with primitive
3203/// data types to be combined and reduced across threads.
3204/// Reduce list:
3205/// Reduce list refers to a collection of local, thread-private
3206/// reduce elements.
3207/// Remote Reduce list:
3208/// Remote Reduce list refers to a collection of remote (relative to
3209/// the current thread) reduce elements.
3210///
3211/// We distinguish between three states of threads that are important to
3212/// the implementation of this function.
3213/// Alive threads:
3214/// Threads in a warp executing the SIMT instruction, as distinguished from
3215/// threads that are inactive due to divergent control flow.
3216/// Active threads:
3217/// The minimal set of threads that has to be alive upon entry to this
3218/// function. The computation is correct iff active threads are alive.
3219/// Some threads are alive but they are not active because they do not
3220/// contribute to the computation in any useful manner. Turning them off
3221/// may introduce control flow overheads without any tangible benefits.
3222/// Effective threads:
3223/// In order to comply with the argument requirements of the shuffle
3224/// function, we must keep all lanes holding data alive. But at most
3225/// half of them perform value aggregation; we refer to this half of
3226/// threads as effective. The other half is simply handing off their
3227/// data.
3228///
3229/// Procedure
3230/// Value shuffle:
3231/// In this step active threads transfer data from higher lane positions
3232/// in the warp to lower lane positions, creating Remote Reduce list.
3233/// Value aggregation:
3234/// In this step, effective threads combine their thread local Reduce list
3235/// with Remote Reduce list and store the result in the thread local
3236/// Reduce list.
3237/// Value copy:
3238/// In this step, we deal with the assumption made by algorithm 2
3239/// (i.e. contiguity assumption). When we have an odd number of lanes
3240/// active, say 2k+1, only k threads will be effective and therefore k
3241/// new values will be produced. However, the Reduce list owned by the
3242/// (2k+1)th thread is ignored in the value aggregation. Therefore
3243/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3244/// that the contiguity assumption still holds.
static llvm::Value *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});
  // The actions to be performed on the Remote Reduce list are dependent
  // on the algorithm version.
  //
  //  if ((AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0)):
  //    do the reduction value aggregation
  //
  //  The thread local variable Reduce list is mutated in place to host the
  //  reduced data, which is the aggregated value produced from local and
  //  remote lanes.
  //
  //  Note that AlgoVer is expected to be a constant integer known at compile
  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //  the entire predicate true at compile time.
  //  When AlgoVer==1, only (LaneId < Offset) has to be evaluated at run
  //  time; the other conjunctions fold to false at compile time.
  //  When AlgoVer==2, only (LaneId % 2 == 0 && Offset > 0) has to be
  //  evaluated at run time; the other conjunctions fold to false at
  //  compile time.
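  // For example (illustrative only): given a constant AlgoVer==1, CondAlgo0
  // and CondAlgo2 below fold away and the predicate reduces to
  // (LaneId < RemoteLaneOffset); given AlgoVer==2 it reduces to
  // (LaneId % 2 == 0 && RemoteLaneOffset > 0).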
  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);

  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  llvm::Value *CondAlgo2 = Bld.CreateAnd(
      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData;
///       reduceData.foo = &foo_private;
///       reduceData.bar = &bar_private;
///
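///     A sketch of the compiler-generated combiner for this example (the
///     name and exact signature are illustrative, not the emitted symbols):
///
///       void reduce_func(void *lhs, void *rhs) {
///         ReduceData *L = (ReduceData *)lhs;
///         ReduceData *R = (ReduceData *)rhs;
///         *L->foo += *R->foo; // combiner for reduction(+:foo)
///         *L->bar *= *R->bar; // combiner for reduction(*:bar)
///       }
///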
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They help the runtime perform
///     algorithmic steps in a data-agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
///     foo += *reduceData.foo; bar *= *reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log2(WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
/// not used by this algorithm; we therefore save instructions by not
/// retrieving lane_id from the corresponding special registers. The 4th
/// parameter, which represents the version of the algorithm being used, is
/// set to 0 to signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
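///
/// For example (an illustrative trace with WARPSIZE == 32): the loop runs
/// with offsets 16, 8, 4, 2, 1. After the first step lane i holds the
/// partial result of lanes i and i+16; after log2(32) == 5 steps lane 0
/// holds the reduction over all 32 lanes.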
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset>0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
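///
/// For example (an illustrative trace with n == 5): in the first round
/// offset is 2, so lanes 0 and 1 aggregate the values of lanes 2 and 3
/// while lane 2 receives a copy of lane 4's data; curr_size becomes 3 and
/// offset 1, so lane 0 aggregates lane 1 while lane 1 receives lane 2's
/// copy; one more round with offset 1 leaves the full result in lane 0.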
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes is active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
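///
/// For example (hypothetical active lanes {5, 9, 17}): their initial
/// logical_lane_ids are 0, 2 and 4 respectively, i.e. twice each lane's
/// rank among the active lanes, before the halving at each iteration.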
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each warp master (the zeroth lane of each warp, where the
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
         "Invalid reduction selection in emitReduction.");

  if (Options.SimpleReduction) {
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  ASTContext &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (const Expr *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  llvm::Value *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Args[] = {ThreadId,
                         CGF.Builder.getInt32(RHSExprs.size()),
                         ReductionArrayTySize,
                         RL,
                         ShuffleAndReduceFn,
                         InterWarpCopyFn};

  llvm::Value *Res = nullptr;
  if (ParallelReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  else if (SimdReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
        Args);

  if (TeamsReduction) {
    llvm::Value *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res)
  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  llvm::SwitchInst *SwInst =
      CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}

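/// Translate a by-reference native parameter into the form used by NVPTX
/// device functions: a restrict-qualified pointer to the pointee. For
/// example (an illustrative reading of the code below, not a normative
/// spec): 'int &x' captured via 'map' becomes a restrict pointer whose
/// pointee is placed in the global address space.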
const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        NativeArg->getType()->getPointerElementType()->getPointerTo());
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments that are passed to it.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
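///
/// A conceptual sketch of the emitted wrapper (names are illustrative and
/// the unpacking of shared arguments is elided):
///
///   void outlined_wrapper(int16_t parallel_level, int32_t source_thread) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     // Load loop bounds and captured variables out of shared_args...
///     outlined(&source_thread, &zero, <loop bounds and captures>...);
///   }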
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateMemTemp(
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
      /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                             CGF.getPointerSize());
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                     CGF.getPointerSize());
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
                                               CGF.getPointerSize());
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}

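/// Prolog for functions emitted in Generic data-sharing mode: find the
/// variables of the body that escape their declaration context, record
/// them for globalization and, unless globalization has to be delayed
/// until the parallel region is entered, emit the globalized-variable
/// prolog together with a cleanup that emits the matching epilog.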
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}

Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.second;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.second;
    }
  }
  return Address::invalid();
}

void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

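/// In SPMD mode the default 'dist_schedule' is 'static' with a chunk size
/// equal to the number of NVPTX threads per team (a heuristic: each
/// 'distribute' chunk then matches the width of the team that executes it).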
void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        getNVPTXNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
  }
}

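/// In SPMD mode the default 'schedule' is 'static' with a chunk size of 1,
/// i.e. a cyclic distribution of iterations, which on a GPU tends to keep
/// the memory accesses of adjacent lanes adjacent as well.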
void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    ScheduleKind = OMPC_SCHEDULE_static;
    Chunk = CGF.Builder.getIntN(
        CGF.getContext().getTypeSize(S.getIterationVariable()->getType()), 1);
  }
}