//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void *outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// \brief Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// \brief Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs);
  OMPRTL_NVPTX__kmpc_get_shared_variables,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Close the conditional region opened in Enter() by branching to and
    // emitting the continuation block.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when generating code for directives
// within a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by contained directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
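  /// Mask with the low LaneIDBits bits set; ANDing a thread id with it
  /// yields the lane id within a warp (WarpSize - 1 == 0x1f).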
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  bool AllEscaped = false;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  void markAsEscaped(const ValueDecl *VD) {
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
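        // Only captures corresponding to privatized variables are recorded
        // below; mapped captures already have storage on the device and are
        // skipped.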
        if (!FD->hasAttrs())
          return;
        const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
        if (!Attr)
          return;
        if (!isOpenMPPrivate(
                static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
            Attr->getCaptureKind() == OMPC_map)
          return;
        if (FD->getType()->isReferenceType())
          return;
        assert(!VD->getType()->isVariablyModifiedType() &&
               "Parameter captured by value with variably modified type");
        EscapedParameters.insert(VD);
      }
    }
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }

  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
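  // Comparator ordering fields of the globalized record by decreasing
  // alignment, so the most strictly aligned variables come first and the
  // record needs minimal internal padding.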
  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
    return P1.first > P2.first;
  }

  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    if (EscapedDecls.empty())
      return;
    ASTContext &C = CGF.getContext();
    SmallVector<VarsDataTy, 4> GlobalizedVars;
    for (const auto *D : EscapedDecls)
      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                     stable_sort_comparator);
    // Build struct _globalized_locals_ty {
    //   /* globalized vars */
    // };
    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
    GlobalizedRD->startDefinition();
    for (const auto &Pair : GlobalizedVars) {
      const ValueDecl *VD = Pair.second;
      QualType Type = VD->getType();
      if (Type->isLValueReferenceType())
        Type = C.getPointerType(Type.getNonReferenceType());
      else
        Type = Type.getNonReferenceType();
      SourceLocation Loc = VD->getLocation();
      auto *Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      GlobalizedRD->addDecl(Field);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
      MappedDeclsFields.try_emplace(VD, Field);
    }
    GlobalizedRD->completeDefinition();
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const auto *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (D->hasAssociatedStmt()) {
      if (const auto *S =
              dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
        VisitOpenMPCapturedStmt(S);
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
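      // An lvalue argument may be bound to a reference parameter, so anything
      // it refers to can escape through the callee.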
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else
        Visit(Arg);
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const auto *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const auto *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped local variables that are actually parameters
  /// passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
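  // This lowers to, roughly, PTX 'bar.sync ID, NumThreads': a named barrier
  // over NumThreads threads, which must be a multiple of the warp size.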
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

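  // master_tid = (NumThreads - 1) & ~(WarpSize - 1), i.e. the id of the
  // first lane in the last warp.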
  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode
             ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
             : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

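  // Dispatch the threads in the block: threads with an id below thread_limit
  // become workers, the thread whose id equals master_tid runs the sequential
  // (master) region, and all remaining threads branch straight to the exit.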
  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  emitGenericVarsEpilog(CGF);
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On the termination condition (WorkFn == null), exit the loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID.
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
    /// int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

1128 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
1129 llvm::ConstantAsMetadata::get(
1130 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1131 // Append metadata to nvvm.annotations
1132 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1133}
1134
1135void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1136 const OMPExecutableDirective &D, StringRef ParentName,
1137 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Alexey Bataev14fa1c62016-03-29 05:34:15 +00001138 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001139 if (!IsOffloadEntry) // Nothing to do.
1140 return;
1141
1142 assert(!ParentName.empty() && "Invalid target region parent name!");
1143
Carlo Bertolli79712092018-02-28 20:48:35 +00001144 CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001145 switch (Mode) {
1146 case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
1147 emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1148 CodeGen);
1149 break;
1150 case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
1151 emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1152 CodeGen);
1153 break;
1154 case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
1155 llvm_unreachable(
1156 "Unknown programming model for OpenMP directive on NVPTX target.");
1157 }
1158
1159 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001160}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getLocStart();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;

  public:
    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc);
  CodeGen.setAction(Action);
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
1219 if (!isInSpmdExecutionMode()) {
1220 llvm::Function *WrapperFun =
1221 createParallelDataSharingWrapper(OutlinedFun, D);
1222 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1223 }
1224
1225 return OutlinedFun;
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001226}
1227
1228llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
Carlo Bertollic6872252016-04-04 15:55:02 +00001229 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1230 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001231 SourceLocation Loc = D.getLocStart();
Carlo Bertollic6872252016-04-04 15:55:02 +00001232
Alexey Bataevc99042b2018-03-15 18:10:54 +00001233 // Emit target region as a standalone region.
1234 class NVPTXPrePostActionTy : public PrePostActionTy {
1235 SourceLocation &Loc;
1236
1237 public:
1238 NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1239 void Enter(CodeGenFunction &CGF) override {
1240 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1241 .emitGenericVarsProlog(CGF, Loc);
1242 }
1243 void Exit(CodeGenFunction &CGF) override {
1244 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1245 .emitGenericVarsEpilog(CGF);
1246 }
1247 } Action(Loc);
1248 CodeGen.setAction(Action);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001249 llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1250 D, ThreadIDVar, InnermostKind, CodeGen);
1251 llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1252 OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
Mehdi Amini6aa9e9b2017-05-29 05:38:20 +00001253 OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001254 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
Carlo Bertollic6872252016-04-04 15:55:02 +00001255
1256 return OutlinedFun;
1257}
1258
Alexey Bataevc99042b2018-03-15 18:10:54 +00001259void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1260 SourceLocation Loc) {
1261 CGBuilderTy &Bld = CGF.Builder;
1262
1263 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1264 if (I == FunctionGlobalizedDecls.end())
1265 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001266 if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1267 QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001268
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001269 // Recover pointer to this function's global record. The runtime will
1270 // handle the specifics of the allocation of the memory.
1271 // Use actual memory size of the record including the padding
1272 // for alignment purposes.
1273 unsigned Alignment =
1274 CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1275 unsigned GlobalRecordSize =
1276 CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1277 GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1278 // TODO: allow the usage of shared memory to be controlled by
1279 // the user, for now, default to global.
1280 llvm::Value *GlobalRecordSizeArg[] = {
1281 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1282 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1283 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1284 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1285 GlobalRecordSizeArg);
1286 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1287 GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1288 LValue Base =
1289 CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1290 I->getSecond().GlobalRecordAddr = GlobalRecValue;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001291
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001292 // Emit the "global alloca" which is a GEP from the global declaration
1293 // record using the pointer returned by the runtime.
1294 for (auto &Rec : I->getSecond().LocalVarData) {
1295 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1296 llvm::Value *ParValue;
1297 if (EscapedParam) {
1298 const auto *VD = cast<VarDecl>(Rec.first);
1299 LValue ParLVal =
1300 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1301 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1302 }
1303 const FieldDecl *FD = Rec.second.first;
1304 LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1305 Rec.second.second = VarAddr.getAddress();
1306 if (EscapedParam) {
1307 const auto *VD = cast<VarDecl>(Rec.first);
1308 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1309 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1310 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00001311 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001312 }
1313 for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1314 // Allocate storage for this escaped variable-length declaration. The
1315 // runtime will handle the specifics of the allocation of the memory.
1316 // Use the actual memory size of the data, including the padding
1317 // for alignment purposes.
1318 auto &Bld = CGF.Builder;
1319 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1320 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1321 Size = Bld.CreateNUWAdd(
1322 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1323 llvm::Value *AlignVal =
1324 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1325 Size = Bld.CreateUDiv(Size, AlignVal);
1326 Size = Bld.CreateNUWMul(Size, AlignVal);
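    // For example: with Size == 10 and an alignment of 8, the sequence
    // above computes (10 + 7) / 8 * 8 == 16, i.e. the size rounded up to
    // the next multiple of the alignment.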
1327 // TODO: allow the usage of shared memory to be controlled by
1328 // the user, for now, default to global.
1329 llvm::Value *GlobalRecordSizeArg[] = {
1330 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1331 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1332 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1333 GlobalRecordSizeArg);
1334 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1335 GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1336 LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1337 CGM.getContext().getDeclAlign(VD),
1338 AlignmentSource::Decl);
1339 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1340 Base.getAddress());
1341 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001342 }
1343 I->getSecond().MappedParams->apply(CGF);
1344}
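// For illustration (a sketch, not taken from a test): in a generic-mode
// target region such as
//   int x = 0;
//   #pragma omp parallel
//   x += 1;
// the variable x escapes into the parallel region, so the prolog above
// relocates it into a record obtained from the data-sharing push-stack
// runtime call; the matching epilog below pops those allocations again in
// reverse order.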
1345
1346void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1347 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001348 if (I != FunctionGlobalizedDecls.end()) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001349 I->getSecond().MappedParams->restore(CGF);
1350 if (!CGF.HaveInsertPoint())
1351 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001352 for (llvm::Value *Addr :
1353 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
1354 CGF.EmitRuntimeCall(
1355 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1356 Addr);
1357 }
1358 if (I->getSecond().GlobalRecordAddr) {
1359 CGF.EmitRuntimeCall(
1360 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1361 I->getSecond().GlobalRecordAddr);
1362 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00001363 }
1364}
1365
Carlo Bertollic6872252016-04-04 15:55:02 +00001366void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
1367 const OMPExecutableDirective &D,
1368 SourceLocation Loc,
1369 llvm::Value *OutlinedFn,
1370 ArrayRef<llvm::Value *> CapturedVars) {
1371 if (!CGF.HaveInsertPoint())
1372 return;
1373
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001374 Address ZeroAddr = CGF.CreateMemTemp(
1375 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1376 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00001377 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1378 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001379 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00001380 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1381 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001382 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00001383}
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001384
1385void CGOpenMPRuntimeNVPTX::emitParallelCall(
1386 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1387 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1388 if (!CGF.HaveInsertPoint())
1389 return;
1390
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001391 if (isInSpmdExecutionMode())
1392 emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1393 else
1394 emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001395}
1396
1397void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
1398 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1399 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1400 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001401 llvm::Function *WFn = WrapperFunctionsMap[Fn];
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001402
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001403 assert(WFn && "Wrapper function does not exist!");
1404
1405 // Force inline this outlined function at its call site.
1406 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1407
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001408 auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001409 PrePostActionTy &) {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001410 CGBuilderTy &Bld = CGF.Builder;
1411
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001412 llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1413
1414 // Prepare for parallel region. Indicate the outlined function.
1415 llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1416 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1417 OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
1418 Args);
1419
1420 // Create a private scope that will globalize the arguments
1421 // passed from the outside of the target region.
1422 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1423
1424 // There's something to share.
1425 if (!CapturedVars.empty()) {
1426 // Allocate the buffer of references to the variables being shared with the workers.
1427 Address SharedArgs =
1428 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
1429 llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
1430
1431 llvm::Value *DataSharingArgs[] = {
1432 SharedArgsPtr,
1433 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1434 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1435 OMPRTL_NVPTX__kmpc_begin_sharing_variables),
1436 DataSharingArgs);
1437
1438 // Store variable address in a list of references to pass to workers.
1439 unsigned Idx = 0;
1440 ASTContext &Ctx = CGF.getContext();
1441 Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
1442 Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
1443 .castAs<PointerType>());
1444 for (llvm::Value *V : CapturedVars) {
1445 Address Dst = Bld.CreateConstInBoundsGEP(
1446 SharedArgListAddress, Idx, CGF.getPointerSize());
Alexey Bataev17314212018-03-20 15:41:05 +00001447 llvm::Value *PtrV;
1448 if (V->getType()->isIntegerTy())
1449 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
1450 else
1451 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001452 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1453 Ctx.getPointerType(Ctx.VoidPtrTy));
Alexey Bataevc99042b2018-03-15 18:10:54 +00001454 ++Idx;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001455 }
1456 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001457
1458 // Activate workers. This barrier is used by the master to signal
1459 // work for the workers.
1460 syncCTAThreads(CGF);
1461
1462 // OpenMP [2.5, Parallel Construct, p.49]
1463 // There is an implied barrier at the end of a parallel region. After the
1464 // end of a parallel region, only the master thread of the team resumes
1465 // execution of the enclosing task region.
1466 //
1467 // The master waits at this barrier until all workers are done.
1468 syncCTAThreads(CGF);
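    // Illustrative timeline of the handshake above (a sketch of the usual
    // master/worker protocol, not a normative spec):
    //   master : __kmpc_kernel_prepare_parallel(WFn) -- publish the work
    //   barrier: workers wake up and fetch WFn
    //   workers: execute WFn, then report the end of the parallel region
    //   barrier: the master resumes past this point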
1469
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001470 if (!CapturedVars.empty())
1471 CGF.EmitRuntimeCall(
1472 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
1473
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001474 // Remember for post-processing in worker loop.
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001475 Work.emplace_back(WFn);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001476 };
1477
1478 auto *RTLoc = emitUpdateLocation(CGF, Loc);
1479 auto *ThreadID = getThreadID(CGF, Loc);
1480 llvm::Value *Args[] = {RTLoc, ThreadID};
1481
Alexey Bataev634b5ba2018-03-19 17:18:13 +00001482 auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
1483 PrePostActionTy &) {
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001484 auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
1485 PrePostActionTy &Action) {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001486 Action.Enter(CGF);
1487
1488 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001489 Address ZeroAddr =
1490 CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
1491 /*DestWidth=*/32, /*Signed=*/1),
1492 ".zero.addr");
1493 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1494 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
1495 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001496 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001497 emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001498 };
1499
1500 RegionCodeGenTy RCG(CodeGen);
1501 NVPTXActionTy Action(
1502 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1503 Args,
1504 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1505 Args);
1506 RCG.setAction(Action);
1507 RCG(CGF);
1508 };
1509
1510 if (IfCond)
1511 emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
1512 else {
1513 CodeGenFunction::RunCleanupsScope Scope(CGF);
1514 RegionCodeGenTy ThenRCG(L0ParallelGen);
1515 ThenRCG(CGF);
1516 }
1517}
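// For example (illustrative): for '#pragma omp parallel if(N > 64)' the
// emitOMPIfClause call above produces a runtime branch that selects
// L0ParallelGen when the condition holds (workers execute the region) and
// SeqGen otherwise (the region runs serialized between the
// __kmpc_serialized_parallel / __kmpc_end_serialized_parallel calls).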
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001518
1519void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
1520 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1521 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1522 // Just call the outlined function to execute the parallel region.
1523 // OutlinedFn(&GTid, &zero, CapturedStruct);
1524 //
1525 // TODO: Do something with IfCond when support for the 'if' clause
1526 // is added on Spmd target directives.
1527 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Carlo Bertolli79712092018-02-28 20:48:35 +00001528
1529 Address ZeroAddr = CGF.CreateMemTemp(
1530 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1531 ".zero.addr");
1532 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001533 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertolli79712092018-02-28 20:48:35 +00001534 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001535 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001536 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001537}
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001538
Alexey Bataevb2575932018-01-04 20:18:55 +00001539/// Cast value to the specified type.
Alexey Bataeva453f362018-03-19 17:53:56 +00001540static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1541 QualType ValTy, QualType CastTy,
1542 SourceLocation Loc) {
1543 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1544 "Cast type must sized.");
1545 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1546 "Val type must sized.");
1547 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1548 if (ValTy == CastTy)
Alexey Bataevb2575932018-01-04 20:18:55 +00001549 return Val;
Alexey Bataeva453f362018-03-19 17:53:56 +00001550 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1551 CGF.getContext().getTypeSizeInChars(CastTy))
1552 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1553 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1554 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1555 CastTy->hasSignedIntegerRepresentation());
1556 Address CastItem = CGF.CreateMemTemp(CastTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00001557 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1558 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
Alexey Bataeva453f362018-03-19 17:53:56 +00001559 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
1560 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00001561}
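// For illustration: a 'float' element headed for the 32-bit shuffle path
// is bitcast to i32 (equal sizes); a 'short' is widened to i32 with an
// integer cast; any other mismatch falls back to the store/load round
// trip through the CastItem temporary above.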
1562
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001563/// This function creates calls to one of two shuffle functions to copy
1564/// variables between lanes in a warp.
1565static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001566 llvm::Value *Elem,
Alexey Bataeva453f362018-03-19 17:53:56 +00001567 QualType ElemType,
1568 llvm::Value *Offset,
1569 SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001570 auto &CGM = CGF.CGM;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001571 auto &Bld = CGF.Builder;
1572 CGOpenMPRuntimeNVPTX &RT =
1573 *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
1574
Alexey Bataeva453f362018-03-19 17:53:56 +00001575 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
1576 assert(Size.getQuantity() <= 8 &&
1577 "Unsupported bitwidth in shuffle instruction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001578
Alexey Bataeva453f362018-03-19 17:53:56 +00001579 OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001580 ? OMPRTL_NVPTX__kmpc_shuffle_int32
1581 : OMPRTL_NVPTX__kmpc_shuffle_int64;
1582
1583 // Cast all types to 32- or 64-bit values before calling shuffle routines.
Alexey Bataeva453f362018-03-19 17:53:56 +00001584 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
1585 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
1586 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00001587 auto *WarpSize =
1588 Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001589
1590 auto *ShuffledVal =
1591 CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
1592 {ElemCast, Offset, WarpSize});
1593
Alexey Bataeva453f362018-03-19 17:53:56 +00001594 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001595}
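// For example (illustrative): shuffling a 'double' element lowers to
//   __kmpc_shuffle_int64(bitcast double-to-i64, lane_offset, warpsize)
// followed by a bitcast of the result back to 'double'; elements of four
// bytes or fewer go through __kmpc_shuffle_int32 instead.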
1596
1597namespace {
1598enum CopyAction : unsigned {
1599 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1600 // the warp using shuffle instructions.
1601 RemoteLaneToThread,
1602 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1603 ThreadCopy,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001604 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1605 ThreadToScratchpad,
1606 // ScratchpadToThread: Copy from a scratchpad array in global memory
1607 // containing team-reduced data to a thread's stack.
1608 ScratchpadToThread,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001609};
1610} // namespace
1611
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001612struct CopyOptionsTy {
1613 llvm::Value *RemoteLaneOffset;
1614 llvm::Value *ScratchpadIndex;
1615 llvm::Value *ScratchpadWidth;
1616};
1617
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001618/// Emit instructions to copy a Reduce list, which contains partially
1619/// aggregated values, in the specified direction.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001620static void emitReductionListCopy(
1621 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1622 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1623 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001624
1625 auto &CGM = CGF.CGM;
1626 auto &C = CGM.getContext();
1627 auto &Bld = CGF.Builder;
1628
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001629 auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1630 auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1631 auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1632
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001633 // Iterates, element-by-element, through the source Reduce list and
1634 // make a copy.
1635 unsigned Idx = 0;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001636 unsigned Size = Privates.size();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001637 for (auto &Private : Privates) {
1638 Address SrcElementAddr = Address::invalid();
1639 Address DestElementAddr = Address::invalid();
1640 Address DestElementPtrAddr = Address::invalid();
1641 // Should we shuffle in an element from a remote lane?
1642 bool ShuffleInElement = false;
1643 // Set to true to update the pointer in the dest Reduce list to a
1644 // newly created element.
1645 bool UpdateDestListPtr = false;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001646 // Increment the src or dest pointer to the scratchpad, for each
1647 // new element.
1648 bool IncrScratchpadSrc = false;
1649 bool IncrScratchpadDest = false;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001650
1651 switch (Action) {
1652 case RemoteLaneToThread: {
1653 // Step 1.1: Get the address for the src element in the Reduce list.
1654 Address SrcElementPtrAddr =
1655 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001656 SrcElementAddr = CGF.EmitLoadOfPointer(
1657 SrcElementPtrAddr,
1658 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001659
1660 // Step 1.2: Create a temporary to store the element in the destination
1661 // Reduce list.
1662 DestElementPtrAddr =
1663 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1664 DestElementAddr =
1665 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1666 ShuffleInElement = true;
1667 UpdateDestListPtr = true;
1668 break;
1669 }
1670 case ThreadCopy: {
1671 // Step 1.1: Get the address for the src element in the Reduce list.
1672 Address SrcElementPtrAddr =
1673 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001674 SrcElementAddr = CGF.EmitLoadOfPointer(
1675 SrcElementPtrAddr,
1676 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001677
1678 // Step 1.2: Get the address for dest element. The destination
1679 // element has already been created on the thread's stack.
1680 DestElementPtrAddr =
1681 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001682 DestElementAddr = CGF.EmitLoadOfPointer(
1683 DestElementPtrAddr,
1684 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001685 break;
1686 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001687 case ThreadToScratchpad: {
1688 // Step 1.1: Get the address for the src element in the Reduce list.
1689 Address SrcElementPtrAddr =
1690 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001691 SrcElementAddr = CGF.EmitLoadOfPointer(
1692 SrcElementPtrAddr,
1693 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001694
1695 // Step 1.2: Get the address for dest element:
1696 // address = base + index * ElementSizeInChars.
1697 unsigned ElementSizeInChars =
1698 C.getTypeSizeInChars(Private->getType()).getQuantity();
1699 auto *CurrentOffset =
1700 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1701 ScratchpadIndex);
1702 auto *ScratchPadElemAbsolutePtrVal =
1703 Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
1704 ScratchPadElemAbsolutePtrVal =
1705 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00001706 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1707 C.getTypeAlignInChars(Private->getType()));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001708 IncrScratchpadDest = true;
1709 break;
1710 }
1711 case ScratchpadToThread: {
1712 // Step 1.1: Get the address for the src element in the scratchpad.
1713 // address = base + index * ElementSizeInChars.
1714 unsigned ElementSizeInChars =
1715 C.getTypeSizeInChars(Private->getType()).getQuantity();
1716 auto *CurrentOffset =
1717 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1718 ScratchpadIndex);
1719 auto *ScratchPadElemAbsolutePtrVal =
1720 Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
1721 ScratchPadElemAbsolutePtrVal =
1722 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1723 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1724 C.getTypeAlignInChars(Private->getType()));
1725 IncrScratchpadSrc = true;
1726
1727 // Step 1.2: Create a temporary to store the element in the destination
1728 // Reduce list.
1729 DestElementPtrAddr =
1730 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1731 DestElementAddr =
1732 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1733 UpdateDestListPtr = true;
1734 break;
1735 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001736 }
1737
1738 // Regardless of the source and destination of the copy, we emit the
1739 // load of the source element, as this is required in all copy directions.
1740 SrcElementAddr = Bld.CreateElementBitCast(
1741 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1742 llvm::Value *Elem =
1743 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001744 Private->getType(), Private->getExprLoc());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001745
1746 // Now that all active lanes have read the element in the
1747 // Reduce list, shuffle over the value from the remote lane.
Alexey Bataeva453f362018-03-19 17:53:56 +00001748 if (ShuffleInElement) {
1749 Elem =
1750 createRuntimeShuffleFunction(CGF, Elem, Private->getType(),
1751 RemoteLaneOffset, Private->getExprLoc());
1752 }
Alexey Bataevb2575932018-01-04 20:18:55 +00001753
1754 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1755 SrcElementAddr.getElementType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001756
1757 // Store the source element value to the dest element address.
1758 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
1759 Private->getType());
1760
1761 // Step 3.1: Modify reference in dest Reduce list as needed.
1762 // Modifying the reference in Reduce list to point to the newly
1763 // created element. The element is live in the current function
1764 // scope and that of functions it invokes (i.e., reduce_function).
1765 // RemoteReduceData[i] = (void*)&RemoteElem
1766 if (UpdateDestListPtr) {
1767 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1768 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1769 DestElementPtrAddr, /*Volatile=*/false,
1770 C.VoidPtrTy);
1771 }
1772
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001773 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1774 // address of the next element in scratchpad memory, unless we're currently
1775 // processing the last one. Memory alignment is also taken care of here.
1776 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1777 llvm::Value *ScratchpadBasePtr =
1778 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1779 unsigned ElementSizeInChars =
1780 C.getTypeSizeInChars(Private->getType()).getQuantity();
1781 ScratchpadBasePtr = Bld.CreateAdd(
1782 ScratchpadBasePtr,
1783 Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
1784 CGM.SizeTy, ElementSizeInChars)));
1785
1786 // Take care of global memory alignment for performance
1787 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1788 llvm::ConstantInt::get(CGM.SizeTy, 1));
1789 ScratchpadBasePtr = Bld.CreateSDiv(
1790 ScratchpadBasePtr,
1791 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1792 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1793 llvm::ConstantInt::get(CGM.SizeTy, 1));
1794 ScratchpadBasePtr = Bld.CreateMul(
1795 ScratchpadBasePtr,
1796 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
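      // For example (assuming GlobalMemoryAlignment is 256): a base
      // pointer of 1000 becomes ((1000 - 1) / 256 + 1) * 256 == 1024, the
      // next 256-byte-aligned address; already-aligned pointers are left
      // unchanged.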
1797
1798 if (IncrScratchpadDest)
1799 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1800 else /* IncrScratchpadSrc = true */
1801 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1802 }
1803
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001804 Idx++;
1805 }
1806}
1807
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001808/// This function emits a helper that loads data from the scratchpad array
1809/// and (optionally) reduces it with the input operand.
1810///
1811/// load_and_reduce(local, scratchpad, index, width, should_reduce)
1812/// reduce_data remote;
1813/// for elem in remote:
1814/// remote.elem = Scratchpad[elem_id][index]
1815/// if (should_reduce)
1816/// local = local @ remote
1817/// else
1818/// local = remote
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001819static llvm::Value *emitReduceScratchpadFunction(
1820 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
1821 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001822 auto &C = CGM.getContext();
1823 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1824
1825 // Destination of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001826 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1827 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001828 // Base address of the scratchpad array, with each element storing a
1829 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001830 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1831 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001832 // A source index into the scratchpad array.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001833 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1834 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001835 // Row width of an element in the scratchpad array, typically
1836 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001837 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1838 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001839 // If should_reduce == 1, then it's load AND reduce,
1840 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
1841 // The latter case is used for initialization.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001842 ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1843 Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001844
1845 FunctionArgList Args;
1846 Args.push_back(&ReduceListArg);
1847 Args.push_back(&ScratchPadArg);
1848 Args.push_back(&IndexArg);
1849 Args.push_back(&WidthArg);
1850 Args.push_back(&ShouldReduceArg);
1851
1852 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1853 auto *Fn = llvm::Function::Create(
1854 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1855 "_omp_reduction_load_and_reduce", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001856 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001857 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001858 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001859
1860 auto &Bld = CGF.Builder;
1861
1862 // Get local Reduce list pointer.
1863 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1864 Address ReduceListAddr(
1865 Bld.CreatePointerBitCastOrAddrSpaceCast(
1866 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001867 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001868 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1869 CGF.getPointerAlign());
1870
1871 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1872 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001873 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001874
1875 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001876 llvm::Value *IndexVal = Bld.CreateIntCast(
1877 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
1878 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001879
1880 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001881 llvm::Value *WidthVal = Bld.CreateIntCast(
1882 CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
1883 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001884
1885 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
1886 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001887 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001888
1889 // The absolute address of the base of the next element to copy.
1890 llvm::Value *CumulativeElemBasePtr =
1891 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1892 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1893
1894 // Create a Remote Reduce list to store the elements read from the
1895 // scratchpad array.
1896 Address RemoteReduceList =
1897 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
1898
1899 // Assemble remote Reduce list from scratchpad array.
1900 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
1901 SrcDataAddr, RemoteReduceList,
1902 {/*RemoteLaneOffset=*/nullptr,
1903 /*ScratchpadIndex=*/IndexVal,
1904 /*ScratchpadWidth=*/WidthVal});
1905
1906 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1907 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1908 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1909
1910 auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
1911 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1912
1913 CGF.EmitBlock(ThenBB);
1914 // We should reduce with the local Reduce list.
1915 // reduce_function(LocalReduceList, RemoteReduceList)
1916 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1917 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
1918 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1919 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001920 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
1921 CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001922 Bld.CreateBr(MergeBB);
1923
1924 CGF.EmitBlock(ElseBB);
1925 // No reduction; just copy:
1926 // Local Reduce list = Remote Reduce list.
1927 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1928 RemoteReduceList, ReduceListAddr);
1929 Bld.CreateBr(MergeBB);
1930
1931 CGF.EmitBlock(MergeBB);
1932
1933 CGF.FinishFunction();
1934 return Fn;
1935}
1936
1937/// This function emits a helper that stores reduced data from the team
1938/// master to a scratchpad array in global memory.
1939///
1940/// for elem in Reduce List:
1941/// scratchpad[elem_id][index] = elem
1942///
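/// For illustration, with two reduce elements and 'width' teams, the
/// scratchpad is laid out roughly as one row per element (each row padded
/// up to the global memory alignment):
///   elem0: [team 0][team 1]...[team width-1] | padding
///   elem1: [team 0][team 1]...[team width-1] | padding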
Benjamin Kramer674d5792017-05-26 20:08:24 +00001943static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
1944 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001945 QualType ReductionArrayTy,
1946 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001947
1948 auto &C = CGM.getContext();
1949 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1950
1951 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001952 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1953 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001954 // Base address of the scratchpad array, with each element storing a
1955 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001956 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1957 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001958 // A destination index into the scratchpad array, typically the team
1959 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001960 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1961 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001962 // Row width of an element in the scratchpad array, typically
1963 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001964 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1965 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001966
1967 FunctionArgList Args;
1968 Args.push_back(&ReduceListArg);
1969 Args.push_back(&ScratchPadArg);
1970 Args.push_back(&IndexArg);
1971 Args.push_back(&WidthArg);
1972
1973 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1974 auto *Fn = llvm::Function::Create(
1975 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1976 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001977 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001978 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001979 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001980
1981 auto &Bld = CGF.Builder;
1982
1983 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1984 Address SrcDataAddr(
1985 Bld.CreatePointerBitCastOrAddrSpaceCast(
1986 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001987 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001988 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1989 CGF.getPointerAlign());
1990
1991 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1992 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001993 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001994
1995 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001996 llvm::Value *IndexVal = Bld.CreateIntCast(
1997 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
1998 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001999
2000 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2001 llvm::Value *WidthVal =
2002 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
2003 Int32Ty, Loc),
2004 CGF.SizeTy, /*isSigned=*/true);
2005
2006 // The absolute address of the base of the next element to copy.
2007 llvm::Value *CumulativeElemBasePtr =
2008 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2009 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2010
2011 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
2012 SrcDataAddr, DestDataAddr,
2013 {/*RemoteLaneOffset=*/nullptr,
2014 /*ScratchpadIndex=*/IndexVal,
2015 /*ScratchpadWidth=*/WidthVal});
2016
2017 CGF.FinishFunction();
2018 return Fn;
2019}
2020
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002021/// This function emits a helper that gathers Reduce lists from the first
2022/// lane of every active warp to lanes in the first warp.
2023///
2024/// void inter_warp_copy_func(void* reduce_data, num_warps)
2025/// shared smem[warp_size];
2026/// For all data entries D in reduce_data:
2027/// If (I am the first lane in each warp)
2028/// Copy my local D to smem[warp_id]
2029/// sync
2030/// if (I am the first warp)
2031/// Copy smem[thread_id] to my local D
2032/// sync
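///
/// For example (illustrative): with 128 OpenMP threads (4 warps), lane 0
/// of each warp writes its partial value to smem[0..3]; after the sync,
/// threads 0..3 of warp 0 read smem[thread_id], leaving warp 0 holding
/// all four partial values for the remaining reduction steps.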
2033static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2034 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002035 QualType ReductionArrayTy,
2036 SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002037 auto &C = CGM.getContext();
2038 auto &M = CGM.getModule();
2039
2040 // ReduceList: thread local Reduce list.
2041 // At the stage of the computation when this function is called, partially
2042 // aggregated values reside in the first lane of every active warp.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002043 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2044 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002045 // NumWarps: number of warps active in the parallel region. This could
2046 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002047 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
Alexey Bataev56223232017-06-09 13:40:18 +00002048 C.getIntTypeForBitwidth(32, /* Signed */ true),
2049 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002050 FunctionArgList Args;
2051 Args.push_back(&ReduceListArg);
2052 Args.push_back(&NumWarpsArg);
2053
2054 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2055 auto *Fn = llvm::Function::Create(
2056 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2057 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00002058 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002059 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002060 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002061
2062 auto &Bld = CGF.Builder;
2063
2064 // This array is used as a medium to transfer, one reduce element at a time,
2065 // the data from the first lane of every warp to lanes in the first warp
2066 // in order to perform the final step of a reduction in a parallel region
2067 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2068 // for reduced latency, as well as to have a distinct copy for concurrently
2069 // executing target regions. The array is declared with common linkage so
2070 // as to be shared across compilation units.
2071 const char *TransferMediumName =
2072 "__openmp_nvptx_data_transfer_temporary_storage";
2073 llvm::GlobalVariable *TransferMedium =
2074 M.getGlobalVariable(TransferMediumName);
2075 if (!TransferMedium) {
2076 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2077 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2078 TransferMedium = new llvm::GlobalVariable(
2079 M, Ty,
2080 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2081 llvm::Constant::getNullValue(Ty), TransferMediumName,
2082 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2083 SharedAddressSpace);
2084 }
2085
2086 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2087 auto *ThreadID = getNVPTXThreadID(CGF);
2088 // nvptx_lane_id = nvptx_id % warpsize
2089 auto *LaneID = getNVPTXLaneID(CGF);
2090 // nvptx_warp_id = nvptx_id / warpsize
2091 auto *WarpID = getNVPTXWarpID(CGF);
2092
2093 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2094 Address LocalReduceList(
2095 Bld.CreatePointerBitCastOrAddrSpaceCast(
2096 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2097 C.VoidPtrTy, SourceLocation()),
2098 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2099 CGF.getPointerAlign());
2100
2101 unsigned Idx = 0;
2102 for (auto &Private : Privates) {
2103 //
2104 // Warp master copies reduce element to transfer medium in __shared__
2105 // memory.
2106 //
2107 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2108 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2109 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2110
2111 // if (lane_id == 0)
2112 auto IsWarpMaster =
2113 Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
2114 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2115 CGF.EmitBlock(ThenBB);
2116
2117 // Reduce element = LocalReduceList[i]
2118 Address ElemPtrPtrAddr =
2119 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2120 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2121 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2122 // elemptr = (type[i]*)(elemptrptr)
2123 Address ElemPtr =
2124 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2125 ElemPtr = Bld.CreateElementBitCast(
2126 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2127 // elem = *elemptr
2128 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2129 ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2130
2131 // Get pointer to location in transfer medium.
2132 // MediumPtr = &medium[warp_id]
2133 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2134 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2135 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
2136 // Casting to actual data type.
2137 // MediumPtr = (type[i]*)MediumPtrAddr;
2138 MediumPtr = Bld.CreateElementBitCast(
2139 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2140
2141 //*MediumPtr = elem
2142 Bld.CreateStore(Elem, MediumPtr);
2143
2144 Bld.CreateBr(MergeBB);
2145
2146 CGF.EmitBlock(ElseBB);
2147 Bld.CreateBr(MergeBB);
2148
2149 CGF.EmitBlock(MergeBB);
2150
2151 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2152 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2153 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
2154
2155 auto *NumActiveThreads = Bld.CreateNSWMul(
2156 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
2157 // named_barrier_sync(ParallelBarrierID, num_active_threads)
2158 syncParallelThreads(CGF, NumActiveThreads);
2159
2160 //
2161 // Warp 0 copies reduce element from transfer medium.
2162 //
2163 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2164 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2165 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2166
2167 // Up to 32 threads in warp 0 are active.
2168 auto IsActiveThread =
2169 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2170 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2171
2172 CGF.EmitBlock(W0ThenBB);
2173
2174 // SrcMediumPtr = &medium[tid]
2175 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2176 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2177 Address SrcMediumPtr(SrcMediumPtrVal,
2178 C.getTypeAlignInChars(Private->getType()));
2179 // SrcMediumVal = *SrcMediumPtr;
2180 SrcMediumPtr = Bld.CreateElementBitCast(
2181 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2182 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
2183 SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2184
2185 // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
2186 Address TargetElemPtrPtr =
2187 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2188 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2189 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2190 Address TargetElemPtr =
2191 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
2192 TargetElemPtr = Bld.CreateElementBitCast(
2193 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2194
2195 // *TargetElemPtr = SrcMediumVal;
2196 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2197 Private->getType());
2198 Bld.CreateBr(W0MergeBB);
2199
2200 CGF.EmitBlock(W0ElseBB);
2201 Bld.CreateBr(W0MergeBB);
2202
2203 CGF.EmitBlock(W0MergeBB);
2204
2205 // While warp 0 copies values from transfer medium, all other warps must
2206 // wait.
2207 syncParallelThreads(CGF, NumActiveThreads);
2208 Idx++;
2209 }
2210
2211 CGF.FinishFunction();
2212 return Fn;
2213}
2214
2215/// Emit a helper that reduces data across two OpenMP threads (lanes)
2216/// in the same warp. It uses shuffle instructions to copy over data from
2217/// a remote lane's stack. The reduction algorithm performed is specified
2218/// by the fourth parameter.
2219///
2220/// Algorithm Versions.
2221/// Full Warp Reduce (argument value 0):
2222/// This algorithm assumes that all 32 lanes are active and gathers
2223/// data from these 32 lanes, producing a single resultant value.
2224/// Contiguous Partial Warp Reduce (argument value 1):
2225/// This algorithm assumes that only a *contiguous* subset of lanes
2226/// are active. This happens for the last warp in a parallel region
2227/// when the user specified num_threads is not an integer multiple of
2228/// 32. This contiguous subset always starts with the zeroth lane.
2229/// Partial Warp Reduce (argument value 2):
2230/// This algorithm gathers data from any number of lanes at any position.
2231/// All reduced values are stored in the lowest possible lane. The set
2232/// of problems every algorithm addresses is a super set of those
2233/// addressable by algorithms with a lower version number. Overhead
2234/// increases as algorithm version increases.
2235///
2236/// Terminology
2237/// Reduce element:
2238/// Reduce element refers to the individual data field with primitive
2239/// data types to be combined and reduced across threads.
2240/// Reduce list:
2241/// Reduce list refers to a collection of local, thread-private
2242/// reduce elements.
2243/// Remote Reduce list:
2244/// Remote Reduce list refers to a collection of remote (relative to
2245/// the current thread) reduce elements.
2246///
2247/// We distinguish between three states of threads that are important to
2248/// the implementation of this function.
2249/// Alive threads:
2250/// Threads in a warp executing the SIMT instruction, as distinguished from
2251/// threads that are inactive due to divergent control flow.
2252/// Active threads:
2253/// The minimal set of threads that has to be alive upon entry to this
2254/// function. The computation is correct iff active threads are alive.
2255/// Some threads are alive but they are not active because they do not
2256/// contribute to the computation in any useful manner. Turning them off
2257/// may introduce control flow overheads without any tangible benefits.
2258/// Effective threads:
2259/// In order to comply with the argument requirements of the shuffle
2260/// function, we must keep all lanes holding data alive. But at most
2261/// half of them perform value aggregation; we refer to this half of
2262/// threads as effective. The other half is simply handing off their
2263/// data.
2264///
2265/// Procedure
2266/// Value shuffle:
2267/// In this step active threads transfer data from higher lane positions
2268/// in the warp to lower lane positions, creating Remote Reduce list.
2269/// Value aggregation:
2270/// In this step, effective threads combine their thread local Reduce list
2271/// with Remote Reduce list and store the result in the thread local
2272/// Reduce list.
2273/// Value copy:
2274/// In this step, we deal with the assumption made by algorithm 2
2275/// (i.e. contiguity assumption). When we have an odd number of lanes
2276/// active, say 2k+1, only k threads will be effective and therefore k
2277/// new values will be produced. However, the Reduce list owned by the
2278/// (2k+1)th thread is ignored in the value aggregation. Therefore
2279/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2280/// that the contiguity assumption still holds.
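///
/// For example (illustrative): with 2k+1 == 5 active lanes, only k == 2
/// lanes are effective and two new values are produced; the fifth lane's
/// Reduce list takes no part in the aggregation, so it is copied over to
/// the third lane, leaving a contiguous set of three partial values for
/// the next round.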
static llvm::Value *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  auto &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The action to be performed on the Remote Reduce list depends on the
  // algorithm version.
  //
  //  if ((AlgoVer == 0) || (AlgoVer == 1 && LaneId < Offset) ||
  //      (AlgoVer == 2 && LaneId % 2 == 0 && Offset > 0)):
  //    do the reduction value aggregation
  //
  // The thread local variable Reduce list is mutated in place to host the
  // reduced data, which is the aggregated value produced from local and
  // remote lanes.
  //
  // Note that AlgoVer is expected to be a constant integer known at compile
  // time.
  // When AlgoVer==0, the first conjunction evaluates to true, making
  // the entire predicate true at compile time.
  // When AlgoVer==1, only the second part of the second conjunction needs
  // to be evaluated at runtime; the other conjunctions fold to false at
  // compile time.
  // When AlgoVer==2, only the second part of the third conjunction needs
  // to be evaluated at runtime; the other conjunctions fold to false at
  // compile time.
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));

  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Algo2,
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
                       Bld.getInt16(0)));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         double *foo;
///         double *bar;
///       } reduceData
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They aid the runtime in
///     performing algorithmic steps in a data agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
///     foo += reduceData.foo; bar *= reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
/// represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
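///
/// As a worked example (an illustrative trace, not runtime code), take
/// size = 5 with lanes 0-4 active. The first pass uses offset = 2:
/// lanes 0 and 1 aggregate the values of lanes 2 and 3, while lane 2
/// copies over lane 4's value, keeping the live data contiguous in
/// lanes 0-2. The second pass uses offset = 1: lane 0 aggregates
/// lane 1's value and lane 1 copies lane 2's value. A final pass with
/// offset = 1 aggregates the remaining pair and leaves the fully
/// reduced value in lane 0.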
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp masters" (zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
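///
/// A minimal sketch of that sequence, assuming the runtime structures it
/// roughly as follows (helper names illustrative):
///
///   warp_reduce(reduce_data, ShuffleReduceFn);  // one of the three
///                                               // warp algorithms above
///   InterWarpCpyFn(reduce_data, num_warps);     // warp masters -> warp 0
///   if (warp_id == 0)
///     warp_reduce(reduce_data, ShuffleReduceFn); // reduce gathered values
///   // The team master now holds the team-wide result.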
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
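/// A minimal sketch of that sequence, under the caveat that the actual
/// runtime bookkeeping is more involved (names illustrative):
///
///   if (thread is team master)
///     scratchpad[team_id] = reduce_data;        // via scratchpadCopyFn
///   if (this team is the last one to finish writing) {
///     // Worker k loads and reduces every k'th scratchpad slot.
///     loadAndReduceFn(reduce_data, scratchpad, k, num_teams);
///     __kmpc_nvptx_parallel_reduce_nowait(...); // combine across workers
///   }
///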
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  // FIXME: Add support for simd reduction.
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  auto &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  auto *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  //    RedList, shuffle_reduce_func, interwarp_copy_func);
  auto *ThreadId = getThreadID(CGF, Loc);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  auto *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Res = nullptr;
  if (ParallelReduction) {
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  }

  if (TeamsReduction) {
    auto *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res)
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}

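/// Translate a parameter of an outlined function for the NVPTX device.
/// Reference-typed parameters are rewritten as restrict-qualified pointers
/// in the NVPTX local address space; for map-captured variables the
/// pointee is additionally placed in the global address space. All other
/// parameters are returned unchanged.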
const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam)) {
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  }
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getLocStart(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

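/// Emit a call to an outlined function, casting each pointer argument to
/// the exact pointer type (and address space) expected by the callee.
/// Arguments beyond the fixed parameters of a variadic callee are
/// forwarded unchanged.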
void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        NativeArg->getType()->getPointerElementType()->getPointerTo());
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
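///
/// Conceptually, the emitted wrapper behaves like the following sketch
/// (names illustrative; the loop bounds are forwarded only for
/// loop-bound-sharing directives, and by-copy captures are converted to
/// uintptr form):
///
///   void outlined_fn_wrapper(int16_t parallel_level, int32_t tid) {
///     int32_t zero = 0;
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_fn(&tid, &zero, /*lb, ub,*/ shared_args[0], ...);
///   }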
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getLocStart(), D.getLocStart());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateMemTemp(
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
      /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                             CGF.getPointerSize());
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                     CGF.getPointerSize());
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
                                               CGF.getPointerSize());
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}

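/// Scan the body of the given function, block, or captured region for
/// variables that escape into a parallel region and therefore need to be
/// globalized. Record them and, unless globalization must be delayed
/// because the declaration is an OpenMP captured region, emit the
/// globalization prolog now and schedule the matching epilog as a cleanup.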
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getLocStart());
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}

Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.second;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.second;
    }
  }
  return Address::invalid();
}

void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}