blob: 62131cd0243785fedb967f59f2748285e5f3c5d7 [file] [log] [blame]
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//
14
15#include "CGOpenMPRuntimeNVPTX.h"
Carlo Bertollic6872252016-04-04 15:55:02 +000016#include "CodeGenFunction.h"
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +000017#include "clang/AST/DeclOpenMP.h"
Carlo Bertollic6872252016-04-04 15:55:02 +000018#include "clang/AST/StmtOpenMP.h"
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +000019#include "clang/AST/StmtVisitor.h"
20#include "llvm/ADT/SmallPtrSet.h"
Samuel Antao45bfe4c2016-02-08 15:59:20 +000021
22using namespace clang;
23using namespace CodeGen;
24
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000025namespace {
/// Identifiers for the NVPTX-specific OpenMP runtime entry points that this
/// codegen emits calls to; each enumerator documents the C signature of the
/// corresponding device runtime function.
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// \brief Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// \brief Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +000091
92/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
93class NVPTXActionTy final : public PrePostActionTy {
94 llvm::Value *EnterCallee;
95 ArrayRef<llvm::Value *> EnterArgs;
96 llvm::Value *ExitCallee;
97 ArrayRef<llvm::Value *> ExitArgs;
98 bool Conditional;
99 llvm::BasicBlock *ContBlock = nullptr;
100
101public:
102 NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
103 llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
104 bool Conditional = false)
105 : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
106 ExitArgs(ExitArgs), Conditional(Conditional) {}
107 void Enter(CodeGenFunction &CGF) override {
108 llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
109 if (Conditional) {
110 llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
111 auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
112 ContBlock = CGF.createBasicBlock("omp_if.end");
113 // Generate the branch (If-stmt)
114 CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
115 CGF.EmitBlock(ThenBlock);
116 }
117 }
118 void Done(CodeGenFunction &CGF) {
119 // Emit the rest of blocks/branches
120 CGF.EmitBranch(ContBlock);
121 CGF.EmitBlock(ContBlock, true);
122 }
123 void Exit(CodeGenFunction &CGF) override {
124 CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
125 }
126};
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000127
128// A class to track the execution mode when codegening directives within
129// a target region. The appropriate mode (generic/spmd) is set on entry
130// to the target region and used by containing directives such as 'parallel'
131// to emit optimized code.
132class ExecutionModeRAII {
133private:
134 CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
135 CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
136
137public:
138 ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
139 CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
140 : Mode(Mode) {
141 SavedMode = Mode;
142 Mode = NewMode;
143 }
144 ~ExecutionModeRAII() { Mode = SavedMode; }
145};
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000146
/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  /// Mask that extracts the lane id from a thread id (WarpSize is a power
  /// of two).
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};
161
/// IDs for the PTX named-barrier primitive used by this codegen.
enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +0000168
/// Get the list of variables that can escape their declaration context.
/// Walks statements/expressions of a target region looking for local
/// variables whose address may outlive their stack slot (captured by
/// reference, address-taken, array-decayed, ...); such variables must be
/// "globalized", i.e. moved into runtime-managed storage.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  // Escaped variables with statically-known size.
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  // Escaped variables with variably modified (VLA-like) types; they need
  // dynamically-sized globalized storage.
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  // Escaped declarations that are really parameters captured by value.
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  // While true, every variable referenced below the current node escapes.
  bool AllEscaped = false;
  // Lazily-built record aggregating all globalized variables.
  RecordDecl *GlobalizedRD = nullptr;
  // Maps each globalized variable to its field in GlobalizedRD.
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  // Records \p VD in the appropriate escaped set; bails out early for
  // by-value captures that do not require globalization.
  void markAsEscaped(const ValueDecl *VD) {
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        if (!FD->hasAttrs())
          return;
        const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
        if (!Attr)
          return;
        if (!isOpenMPPrivate(
                static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
            Attr->getCaptureKind() == OMPC_map)
          return;
        if (FD->getType()->isReferenceType())
          return;
        assert(!VD->getType()->isVariablyModifiedType() &&
               "Parameter captured by value with variably modified type");
        EscapedParameters.insert(VD);
      }
    }
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  // Handles a declaration: reference-typed declarations escape immediately,
  // and initializers are scanned for escaping references.
  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  // Marks by-reference captures of an OpenMP captured statement as escaped.
  void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }

  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
  // Orders variables by decreasing alignment so the globalized record needs
  // no padding between fields.
  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
    return P1.first > P2.first;
  }

  // Builds the implicit record type that holds all escaped fixed-size
  // variables, one field per variable (references become pointers).
  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    if (EscapedDecls.empty())
      return;
    ASTContext &C = CGF.getContext();
    SmallVector<VarsDataTy, 4> GlobalizedVars;
    for (const auto *D : EscapedDecls)
      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                     stable_sort_comparator);
    // Build struct _globalized_locals_ty {
    //   /* globalized vars */
    // };
    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
    GlobalizedRD->startDefinition();
    for (const auto &Pair : GlobalizedVars) {
      const ValueDecl *VD = Pair.second;
      QualType Type = VD->getType();
      if (Type->isLValueReferenceType())
        Type = C.getPointerType(Type.getNonReferenceType());
      else
        Type = Type.getNonReferenceType();
      SourceLocation Loc = VD->getLocation();
      auto *Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      GlobalizedRD->addDecl(Field);
      // Propagate alignment attributes to the new field.
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
      MappedDeclsFields.try_emplace(VD, Field);
    }
    GlobalizedRD->completeDefinition();
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const auto *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (D->hasAssociatedStmt()) {
      if (const auto *S =
              dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
        VisitOpenMPCapturedStmt(S);
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    // An lvalue argument may be bound to a reference parameter; treat every
    // variable referenced by it as escaping.
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else
        Visit(Arg);
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    // Taking the address of a variable makes it escape.
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    // Array-to-pointer decay exposes the array's address.
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    // Rvalue expressions cannot leak addresses of their operands.
    if (!E->isLValue())
      AllEscaped = false;
    for (const auto *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const auto *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +0000440} // anonymous namespace
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000441
442/// Get the GPU warp size.
443static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000444 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000445 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000446 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000447 "nvptx_warp_size");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000448}
449
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000450/// Get the id of the current thread on the GPU.
451static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000452 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000453 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000454 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000455 "nvptx_tid");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000456}
457
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000458/// Get the id of the warp in the block.
459/// We assume that the warp size is 32, which is always the case
460/// on the NVPTX device, to generate more efficient code.
461static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
462 CGBuilderTy &Bld = CGF.Builder;
463 return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
464}
465
466/// Get the id of the current lane in the Warp.
467/// We assume that the warp size is 32, which is always the case
468/// on the NVPTX device, to generate more efficient code.
469static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
470 CGBuilderTy &Bld = CGF.Builder;
471 return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
472 "nvptx_lane_id");
473}
474
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000475/// Get the maximum number of threads in a block of the GPU.
476static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000477 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000478 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000479 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000480 "nvptx_num_threads");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000481}
482
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000483/// Get barrier to synchronize all threads in a block.
484static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000485 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000486 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000487}
488
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000489/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
490/// a CTA.
491static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
492 llvm::Value *NumThreads) {
493 CGBuilderTy &Bld = CGF.Builder;
494 llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
Alexey Bataev3c595a62017-08-14 15:01:03 +0000495 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
496 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
497 Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000498}
499
/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000502
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000503/// Synchronize worker threads in a parallel region.
504static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
505 return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
506}
507
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000508/// Get the value of the thread_limit clause in the teams directive.
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000509/// For the 'generic' execution mode, the runtime encodes thread_limit in
510/// the launch parameters, always starting thread_limit+warpSize threads per
511/// CTA. The threads in the last warp are reserved for master execution.
512/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
513static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
514 bool IsInSpmdExecutionMode = false) {
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000515 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000516 return IsInSpmdExecutionMode
517 ? getNVPTXNumThreads(CGF)
518 : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
519 "thread_limit");
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000520}
521
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000522/// Get the thread id of the OMP master thread.
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000523/// The master thread id is the first thread (lane) of the last warp in the
524/// GPU block. Warp size is assumed to be some power of 2.
525/// Thread id is 0 indexed.
526/// E.g: If NumThreads is 33, master id is 32.
527/// If NumThreads is 64, master id is 32.
528/// If NumThreads is 1024, master id is 992.
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000529static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000530 CGBuilderTy &Bld = CGF.Builder;
531 llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
532
533 // We assume that the warp size is a power of 2.
534 llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
535
536 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
537 Bld.CreateNot(Mask), "master_tid");
538}
539
// Creates the worker function eagerly; it is renamed later, once the target
// region's entry function name is known.
CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
  createWorkerFunction(CGM);
}
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000545
void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  // The "_worker" name is a placeholder; the caller renames the function to
  // "<entry>_worker" once the entry function exists.
  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, *CGFI);
}
556
// Returns true while codegen is inside a target region using the SPMD
// (all-threads-execute) scheme rather than the generic master/worker scheme.
bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}
560
561static CGOpenMPRuntimeNVPTX::ExecutionMode
Carlo Bertolli79712092018-02-28 20:48:35 +0000562getExecutionMode(CodeGenModule &CGM) {
563 return CGM.getLangOpts().OpenMPCUDAMode
564 ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
565 : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000566}
567
/// Emits a target region kernel using the generic (master/worker) scheme:
/// the entry function gets a header/footer that dispatches worker threads to
/// the state machine and runs the region body on the master thread only.
void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  // Reset per-kernel bookkeeping of parallel work and wrapper functions.
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");

  // Create the worker function; emitted after the entry function so the
  // list of parallel work items is complete.
  emitWorkerFunction(WST);
}
610
// Setup NVPTX threads for master-worker OpenMP scheme: worker threads jump
// into the worker state machine, the master thread initializes the runtime
// and falls through to the region body, and all other threads exit.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Threads below the thread limit are workers.
  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  // Non-worker threads other than the master exit immediately.
  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
652
// Emits the exit sequence of a generic-mode kernel: tear down globalized
// variables, deinitialize the runtime, and release the workers via a CTA
// barrier before the master jumps to the exit block.
void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  emitGenericVarsEpilog(CGF);
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
679
/// Emit a target region kernel in SPMD mode, where all threads execute the
/// region cooperatively (no master/worker split). Wraps the region codegen
/// with Enter/Exit actions that emit the SPMD entry header and footer.
///
/// \param D              The target directive being emitted.
/// \param ParentName     Name of the enclosing host function (used for the
///                       kernel's unique name).
/// \param OutlinedFn     [out] The generated kernel function.
/// \param OutlinedFnID   [out] The kernel's offload-entry ID constant.
/// \param IsOffloadEntry True if this region must be registered as an
///                       offload entry point.
/// \param CodeGen        Callback that emits the region body.
void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  // Record SPMD mode for the duration of this emission (RAII restores the
  // previous mode on scope exit).
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}
712
/// Emit the prologue of an SPMD-mode kernel: initialize the OpenMP runtime
/// state (all threads participate) and branch into the execution block.
/// The matching exit block is created here and consumed by
/// emitSpmdEntryFooter.
void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}
734
735void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
736 EntryFunctionState &EST) {
737 if (!EST.ExitBB)
738 EST.ExitBB = CGF.createBasicBlock(".exit");
739
740 llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
741 CGF.EmitBranch(OMPDeInitBB);
742
743 CGF.EmitBlock(OMPDeInitBB);
744 // DeInitialize the OMP state in the runtime; called by all active threads.
745 CGF.EmitRuntimeCall(
746 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
747 CGF.EmitBranch(EST.ExitBB);
748
749 CGF.EmitBlock(EST.ExitBB);
750 EST.ExitBB = nullptr;
751}
752
753// Create a unique global variable to indicate the execution mode of this target
754// region. The execution mode is either 'generic', or 'spmd' depending on the
755// target directive. This variable is picked up by the offload library to setup
756// the device appropriately before kernel launch. If the execution mode is
757// 'generic', the runtime reserves one warp for the master, otherwise, all
758// warps participate in parallel work.
759static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
760 CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
761 (void)new llvm::GlobalVariable(
762 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
763 llvm::GlobalValue::WeakAnyLinkage,
764 llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
765}
766
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000767void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
Gheorghe-Teodor Berceaeb89b1d2017-11-21 15:54:54 +0000768 ASTContext &Ctx = CGM.getContext();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000769
770 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
Alexey Bataev7cae94e2018-01-04 19:45:16 +0000771 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {},
772 WST.Loc, WST.Loc);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000773 emitWorkerLoop(CGF, WST);
774 CGF.FinishFunction();
775}
776
/// Emit the worker state machine for a generic-mode kernel.
/// Workers park at a CTA-wide barrier until the master publishes an outlined
/// parallel function; each awakened worker matches the published pointer
/// against the known work items, runs the matching wrapper, and returns to
/// the barrier. A null work function signals kernel termination.
void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers.  The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads.  The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  // For data sharing, we need to initialize the stack for workers.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // Ask the runtime for the current work item; ExecStatus tells this worker
  // whether it participates in the parallel region.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // NOTE(review): if WorkFn matches none of the known items, control falls
  // through CheckNextBB into TerminateBB without executing anything —
  // presumably unreachable for well-formed kernels; confirm.

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
881
/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// Each case lazily declares the runtime entry point with its exact
/// signature; CreateRuntimeFunction returns the existing declaration when it
/// was already created for this module.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
    /// int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  }
  return RTLFn;
}
1117
1118void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1119 llvm::Constant *Addr,
Samuel Antaof83efdb2017-01-05 16:02:49 +00001120 uint64_t Size, int32_t) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001121 auto *F = dyn_cast<llvm::Function>(Addr);
1122 // TODO: Add support for global variables on the device after declare target
1123 // support.
1124 if (!F)
1125 return;
1126 llvm::Module *M = F->getParent();
1127 llvm::LLVMContext &Ctx = M->getContext();
1128
1129 // Get "nvvm.annotations" metadata node
1130 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
1131
1132 llvm::Metadata *MDVals[] = {
1133 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
1134 llvm::ConstantAsMetadata::get(
1135 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1136 // Append metadata to nvvm.annotations
1137 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1138}
1139
1140void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1141 const OMPExecutableDirective &D, StringRef ParentName,
1142 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Alexey Bataev14fa1c62016-03-29 05:34:15 +00001143 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001144 if (!IsOffloadEntry) // Nothing to do.
1145 return;
1146
1147 assert(!ParentName.empty() && "Invalid target region parent name!");
1148
Carlo Bertolli79712092018-02-28 20:48:35 +00001149 CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001150 switch (Mode) {
1151 case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
1152 emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1153 CodeGen);
1154 break;
1155 case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
1156 emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1157 CodeGen);
1158 break;
1159 case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
1160 llvm_unreachable(
1161 "Unknown programming model for OpenMP directive on NVPTX target.");
1162 }
1163
1164 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001165}
1166
/// Constructor. This runtime specialization only generates device-side code;
/// instantiating it for a host compilation is a programming error.
CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}
Carlo Bertollic6872252016-04-04 15:55:02 +00001172
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001173void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1174 OpenMPProcBindClauseKind ProcBind,
1175 SourceLocation Loc) {
1176 // Do nothing in case of Spmd mode and L0 parallel.
1177 // TODO: If in Spmd mode and L1 parallel emit the clause.
1178 if (isInSpmdExecutionMode())
1179 return;
1180
1181 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1182}
1183
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001184void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1185 llvm::Value *NumThreads,
1186 SourceLocation Loc) {
1187 // Do nothing in case of Spmd mode and L0 parallel.
1188 // TODO: If in Spmd mode and L1 parallel emit the clause.
1189 if (isInSpmdExecutionMode())
1190 return;
1191
1192 CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1193}
1194
// Intentionally a no-op: no device-side code is emitted for the
// num_teams/thread_limit clauses. NOTE(review): presumably these values are
// consumed by the host-side kernel launch configuration — confirm.
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
1199
/// Outline a parallel region for the NVPTX device. Wraps the common outlining
/// with Enter/Exit actions that globalize escaping locals (prolog) and
/// release them (epilog). In generic mode, additionally creates a
/// data-sharing wrapper for the outlined function so workers can invoke it.
///
/// \returns The outlined parallel function (not the wrapper).
llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getLocStart();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;

  public:
    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc);
  CodeGen.setAction(Action);
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  // Workers call outlined parallel functions through a wrapper that loads
  // the shared arguments; only needed in generic (master/worker) mode.
  if (!isInSpmdExecutionMode()) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
1232
/// Outline a teams region for the NVPTX device. Wraps the common outlining
/// with Enter/Exit actions that globalize escaping locals (prolog) and
/// release them (epilog), and forces the outlined function to be inlined
/// into the kernel.
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getLocStart();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;

  public:
    NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc);
  CodeGen.setAction(Action);
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  // Strip attributes that would block inlining and force AlwaysInline so the
  // teams body is merged into the kernel entry.
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}
1263
Alexey Bataevc99042b2018-03-15 18:10:54 +00001264void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1265 SourceLocation Loc) {
1266 CGBuilderTy &Bld = CGF.Builder;
1267
1268 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1269 if (I == FunctionGlobalizedDecls.end())
1270 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001271 if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
1272 QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001273
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001274 // Recover pointer to this function's global record. The runtime will
1275 // handle the specifics of the allocation of the memory.
1276 // Use actual memory size of the record including the padding
1277 // for alignment purposes.
1278 unsigned Alignment =
1279 CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1280 unsigned GlobalRecordSize =
1281 CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1282 GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
1283 // TODO: allow the usage of shared memory to be controlled by
1284 // the user, for now, default to global.
1285 llvm::Value *GlobalRecordSizeArg[] = {
1286 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1287 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1288 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1289 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1290 GlobalRecordSizeArg);
1291 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1292 GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1293 LValue Base =
1294 CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1295 I->getSecond().GlobalRecordAddr = GlobalRecValue;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001296
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001297 // Emit the "global alloca" which is a GEP from the global declaration
1298 // record using the pointer returned by the runtime.
1299 for (auto &Rec : I->getSecond().LocalVarData) {
1300 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1301 llvm::Value *ParValue;
1302 if (EscapedParam) {
1303 const auto *VD = cast<VarDecl>(Rec.first);
1304 LValue ParLVal =
1305 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1306 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1307 }
1308 const FieldDecl *FD = Rec.second.first;
1309 LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1310 Rec.second.second = VarAddr.getAddress();
1311 if (EscapedParam) {
1312 const auto *VD = cast<VarDecl>(Rec.first);
1313 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1314 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1315 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00001316 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00001317 }
1318 for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
1319 // Recover pointer to this function's global record. The runtime will
1320 // handle the specifics of the allocation of the memory.
1321 // Use actual memory size of the record including the padding
1322 // for alignment purposes.
1323 auto &Bld = CGF.Builder;
1324 llvm::Value *Size = CGF.getTypeSize(VD->getType());
1325 CharUnits Align = CGM.getContext().getDeclAlign(VD);
1326 Size = Bld.CreateNUWAdd(
1327 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
1328 llvm::Value *AlignVal =
1329 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
1330 Size = Bld.CreateUDiv(Size, AlignVal);
1331 Size = Bld.CreateNUWMul(Size, AlignVal);
1332 // TODO: allow the usage of shared memory to be controlled by
1333 // the user, for now, default to global.
1334 llvm::Value *GlobalRecordSizeArg[] = {
1335 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1336 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1337 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1338 GlobalRecordSizeArg);
1339 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1340 GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
1341 LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
1342 CGM.getContext().getDeclAlign(VD),
1343 AlignmentSource::Decl);
1344 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
1345 Base.getAddress());
1346 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001347 }
1348 I->getSecond().MappedParams->apply(CGF);
1349}
1350
// Tears down the globalized-variable state set up by the matching prolog:
// restores the original parameter addresses and pops the data-sharing stack
// frames that were pushed for this function.
void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
  // Look up the globalization info recorded for the current function; if
  // nothing was globalized there is nothing to tear down.
  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // NOTE(review): restore() runs even when there is no insert point — it
    // appears to only undo the parameter-address remapping; confirm before
    // moving it below the HaveInsertPoint() check.
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop the per-VLA records in reverse order of their allocation.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
          Addr);
    }
    // Finally pop the record holding the fixed-size globalized variables,
    // if one was pushed.
    if (I->getSecond().GlobalRecordAddr) {
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
          I->getSecond().GlobalRecordAddr);
    }
  }
}
1370
Carlo Bertollic6872252016-04-04 15:55:02 +00001371void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
1372 const OMPExecutableDirective &D,
1373 SourceLocation Loc,
1374 llvm::Value *OutlinedFn,
1375 ArrayRef<llvm::Value *> CapturedVars) {
1376 if (!CGF.HaveInsertPoint())
1377 return;
1378
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001379 Address ZeroAddr = CGF.CreateMemTemp(
1380 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1381 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00001382 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1383 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001384 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00001385 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1386 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001387 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00001388}
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001389
1390void CGOpenMPRuntimeNVPTX::emitParallelCall(
1391 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1392 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1393 if (!CGF.HaveInsertPoint())
1394 return;
1395
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001396 if (isInSpmdExecutionMode())
1397 emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1398 else
1399 emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001400}
1401
// Emits a parallel region under the "generic" (non-SPMD) scheme. The master
// thread registers the wrapper function with the runtime, shares the captured
// variables, releases the workers with a CTA barrier, and waits for them at a
// second barrier. Serialized execution (if-clause false) instead calls the
// outlined function inline, bracketed by the serialized-parallel runtime
// entries.
void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  // Workers invoke the outlined region through this wrapper, created when the
  // outlined function itself was emitted.
  llvm::Function *WFn = WrapperFunctionsMap[Fn];

  assert(WFn && "Wrapper function does not exist!");

  // Give the outlined function internal linkage; it is only reached through
  // its wrapper. (Original comment said "force inline at its call site" —
  // NOTE(review): linkage alone does not force inlining; verify intent.)
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  // Code emitted on the master when the region runs in parallel.
  auto &&L0ParallelGen = [this, WFn, CapturedVars](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // The wrapper's address is the opaque id workers use to select the
    // parallel region.
    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                            OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
                        Args);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // There's something to share.
    if (!CapturedVars.empty()) {
      // Ask the runtime for a buffer of void* slots, one per capture.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();

      llvm::Value *DataSharingArgs[] = {
          SharedArgsPtr,
          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
                          DataSharingArgs);

      // Store variable address in a list of references to pass to workers.
      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
          Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
              .castAs<PointerType>());
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            SharedArgListAddress, Idx, CGF.getPointerSize());
        // Integers are smuggled through the void* slot via inttoptr; real
        // pointers get an address-space-correct cast.
        llvm::Value *PtrV;
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Release the variable-sharing buffer acquired above.
    if (!CapturedVars.empty())
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));

    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  // Code emitted when the region is serialized: call the outlined function
  // directly between the (end_)serialized_parallel runtime calls.
  auto &&SeqGen = [this, Fn, CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                       PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, CapturedVars, Loc](CodeGenFunction &CGF,
                                                   PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      // Zero-initialized temporary used as the bound thread id argument.
      Address ZeroAddr =
          CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
                                /*DestWidth=*/32, /*Signed=*/1),
                            ".zero.addr");
      CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
      OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
      OutlinedFnArgs.push_back(ZeroAddr.getPointer());
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001523
1524void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
1525 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1526 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1527 // Just call the outlined function to execute the parallel region.
1528 // OutlinedFn(&GTid, &zero, CapturedStruct);
1529 //
1530 // TODO: Do something with IfCond when support for the 'if' clause
1531 // is added on Spmd target directives.
1532 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Carlo Bertolli79712092018-02-28 20:48:35 +00001533
1534 Address ZeroAddr = CGF.CreateMemTemp(
1535 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1536 ".zero.addr");
1537 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00001538 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertolli79712092018-02-28 20:48:35 +00001539 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001540 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001541 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001542}
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001543
Alexey Bataevb2575932018-01-04 20:18:55 +00001544/// Cast value to the specified type.
Alexey Bataeva453f362018-03-19 17:53:56 +00001545static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
1546 QualType ValTy, QualType CastTy,
1547 SourceLocation Loc) {
1548 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
1549 "Cast type must sized.");
1550 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
1551 "Val type must sized.");
1552 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
1553 if (ValTy == CastTy)
Alexey Bataevb2575932018-01-04 20:18:55 +00001554 return Val;
Alexey Bataeva453f362018-03-19 17:53:56 +00001555 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
1556 CGF.getContext().getTypeSizeInChars(CastTy))
1557 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
1558 if (CastTy->isIntegerType() && ValTy->isIntegerType())
1559 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
1560 CastTy->hasSignedIntegerRepresentation());
1561 Address CastItem = CGF.CreateMemTemp(CastTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00001562 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1563 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
Alexey Bataeva453f362018-03-19 17:53:56 +00001564 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
1565 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00001566}
1567
/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
///
/// The element is first widened to a 32- or 64-bit integer (depending on its
/// size), passed through the matching __kmpc_shuffle_int32/int64 runtime
/// entry together with the lane \p Offset and the warp size, and the result
/// is cast back to \p ElemType.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 llvm::Value *Elem,
                                                 QualType ElemType,
                                                 llvm::Value *Offset,
                                                 SourceLocation Loc) {
  auto &CGM = CGF.CGM;
  auto &Bld = CGF.Builder;
  // The NVPTX-specific runtime class owns the shuffle entry points.
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  // Only elements of up to 8 bytes can be shuffled.
  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  assert(Size.getQuantity() <= 8 &&
         "Unsupported bitwidth in shuffle instruction.");

  // Pick the 32-bit entry for elements of <= 4 bytes, the 64-bit one
  // otherwise.
  OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
      Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
  llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
  // The runtime entry takes the warp size as a 16-bit integer.
  auto *WarpSize =
      Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  // Convert the shuffled integer back to the original element type.
  return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
}
1601
namespace {
/// Direction of a Reduce-list copy, consumed by emitReductionListCopy below.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
1616
/// Optional parameters for emitReductionListCopy; fields not needed by the
/// chosen CopyAction are left null.
struct CopyOptionsTy {
  // Lane delta handed to the shuffle routines for RemoteLaneToThread copies.
  llvm::Value *RemoteLaneOffset;
  // Element index into a scratchpad row for scratchpad copies.
  llvm::Value *ScratchpadIndex;
  // Width of a scratchpad row — per the helpers below, typically the number
  // of teams.
  llvm::Value *ScratchpadWidth;
};
1622
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
///
/// \param Action           Direction of the copy (see CopyAction).
/// \param ReductionArrayTy Type of the Reduce list (array of void*).
/// \param Privates         One expression per reduced variable; supplies the
///                         element types.
/// \param SrcBase          Base address of the source list (or, for
///                         scratchpad sources, the scratchpad base as an
///                         integer-typed Address).
/// \param DestBase         Base address of the destination list/scratchpad.
/// \param CopyOptions      Shuffle offset and scratchpad index/width as
///                         required by \p Action.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element by element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), Private->getExprLoc());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem =
          createRuntimeShuffleFunction(CGF, Elem, Private->getType(),
                                       RemoteLaneOffset, Private->getExprLoc());
    }

    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      ScratchpadBasePtr = Bld.CreateAdd(
          ScratchpadBasePtr,
          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
                                             CGM.SizeTy, ElementSizeInChars)));

      // Take care of global memory alignment for performance: round the base
      // pointer up to the next multiple of GlobalMemoryAlignment via
      // ((p - 1) / align + 1) * align.
      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateSDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    Idx++;
  }
}
1812
/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
///  load_and_reduce(local, scratchpad, index, width, should_reduce)
///  reduce_data remote;
///  for elem in remote:
///    remote.elem = Scratchpad[elem_id][index]
///  if (should_reduce)
///    local = local @ remote
///  else
///    local = remote
///
/// \param Privates         One expression per reduced variable.
/// \param ReductionArrayTy Type of the Reduce list (array of void*).
/// \param ReduceFn         The generated reduce_function combining two lists.
/// \return The newly created "_omp_reduction_load_and_reduce" function.
static llvm::Value *emitReduceScratchpadFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
                             ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce,
  // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                    Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  // Create the internal helper function and start emitting its body.
  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  auto &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, Loc),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);

  // Widen index and width to size_t for the address arithmetic below.
  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal = Bld.CreateIntCast(
      CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
      CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  // Branch on should_reduce: reduce into the local list, or just copy.
  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}
1941
1942/// This function emits a helper that stores reduced data from the team
1943/// master to a scratchpad array in global memory.
1944///
1945/// for elem in Reduce List:
1946/// scratchpad[elem_id][index] = elem
1947///
Benjamin Kramer674d5792017-05-26 20:08:24 +00001948static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
1949 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001950 QualType ReductionArrayTy,
1951 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001952
1953 auto &C = CGM.getContext();
1954 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1955
1956 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001957 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1958 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001959 // Base address of the scratchpad array, with each element storing a
1960 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001961 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1962 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001963 // A destination index into the scratchpad array, typically the team
1964 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001965 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1966 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001967 // Row width of an element in the scratchpad array, typically
1968 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001969 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1970 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001971
1972 FunctionArgList Args;
1973 Args.push_back(&ReduceListArg);
1974 Args.push_back(&ScratchPadArg);
1975 Args.push_back(&IndexArg);
1976 Args.push_back(&WidthArg);
1977
1978 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1979 auto *Fn = llvm::Function::Create(
1980 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1981 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001982 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001983 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001984 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001985
1986 auto &Bld = CGF.Builder;
1987
1988 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1989 Address SrcDataAddr(
1990 Bld.CreatePointerBitCastOrAddrSpaceCast(
1991 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001992 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001993 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1994 CGF.getPointerAlign());
1995
1996 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1997 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001998 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001999
2000 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00002001 llvm::Value *IndexVal = Bld.CreateIntCast(
2002 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
2003 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002004
2005 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
2006 llvm::Value *WidthVal =
2007 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
2008 Int32Ty, SourceLocation()),
2009 CGF.SizeTy, /*isSigned=*/true);
2010
2011 // The absolute ptr address to the base addr of the next element to copy.
2012 llvm::Value *CumulativeElemBasePtr =
2013 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
2014 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
2015
2016 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
2017 SrcDataAddr, DestDataAddr,
2018 {/*RemoteLaneOffset=*/nullptr,
2019 /*ScratchpadIndex=*/IndexVal,
2020 /*ScratchpadWidth=*/WidthVal});
2021
2022 CGF.FinishFunction();
2023 return Fn;
2024}
2025
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002026/// This function emits a helper that gathers Reduce lists from the first
2027/// lane of every active warp to lanes in the first warp.
2028///
2029/// void inter_warp_copy_func(void* reduce_data, num_warps)
2030/// shared smem[warp_size];
2031/// For all data entries D in reduce_data:
2032/// If (I am the first lane in each warp)
2033/// Copy my local D to smem[warp_id]
2034/// sync
2035/// if (I am the first warp)
2036/// Copy smem[thread_id] to my local D
2037/// sync
2038static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
2039 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002040 QualType ReductionArrayTy,
2041 SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002042 auto &C = CGM.getContext();
2043 auto &M = CGM.getModule();
2044
2045 // ReduceList: thread local Reduce list.
2046 // At the stage of the computation when this function is called, partially
2047 // aggregated values reside in the first lane of every active warp.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002048 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2049 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002050 // NumWarps: number of warps active in the parallel region. This could
2051 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002052 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
Alexey Bataev56223232017-06-09 13:40:18 +00002053 C.getIntTypeForBitwidth(32, /* Signed */ true),
2054 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002055 FunctionArgList Args;
2056 Args.push_back(&ReduceListArg);
2057 Args.push_back(&NumWarpsArg);
2058
2059 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2060 auto *Fn = llvm::Function::Create(
2061 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2062 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00002063 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002064 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002065 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002066
2067 auto &Bld = CGF.Builder;
2068
2069 // This array is used as a medium to transfer, one reduce element at a time,
2070 // the data from the first lane of every warp to lanes in the first warp
2071 // in order to perform the final step of a reduction in a parallel region
2072 // (reduction across warps). The array is placed in NVPTX __shared__ memory
2073 // for reduced latency, as well as to have a distinct copy for concurrently
2074 // executing target regions. The array is declared with common linkage so
2075 // as to be shared across compilation units.
2076 const char *TransferMediumName =
2077 "__openmp_nvptx_data_transfer_temporary_storage";
2078 llvm::GlobalVariable *TransferMedium =
2079 M.getGlobalVariable(TransferMediumName);
2080 if (!TransferMedium) {
2081 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2082 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2083 TransferMedium = new llvm::GlobalVariable(
2084 M, Ty,
2085 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2086 llvm::Constant::getNullValue(Ty), TransferMediumName,
2087 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2088 SharedAddressSpace);
2089 }
2090
2091 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2092 auto *ThreadID = getNVPTXThreadID(CGF);
2093 // nvptx_lane_id = nvptx_id % warpsize
2094 auto *LaneID = getNVPTXLaneID(CGF);
2095 // nvptx_warp_id = nvptx_id / warpsize
2096 auto *WarpID = getNVPTXWarpID(CGF);
2097
2098 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2099 Address LocalReduceList(
2100 Bld.CreatePointerBitCastOrAddrSpaceCast(
2101 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2102 C.VoidPtrTy, SourceLocation()),
2103 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2104 CGF.getPointerAlign());
2105
2106 unsigned Idx = 0;
2107 for (auto &Private : Privates) {
2108 //
2109 // Warp master copies reduce element to transfer medium in __shared__
2110 // memory.
2111 //
2112 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2113 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2114 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2115
2116 // if (lane_id == 0)
2117 auto IsWarpMaster =
2118 Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
2119 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2120 CGF.EmitBlock(ThenBB);
2121
2122 // Reduce element = LocalReduceList[i]
2123 Address ElemPtrPtrAddr =
2124 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2125 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2126 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2127 // elemptr = (type[i]*)(elemptrptr)
2128 Address ElemPtr =
2129 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2130 ElemPtr = Bld.CreateElementBitCast(
2131 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2132 // elem = *elemptr
2133 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2134 ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2135
2136 // Get pointer to location in transfer medium.
2137 // MediumPtr = &medium[warp_id]
2138 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2139 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2140 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
2141 // Casting to actual data type.
2142 // MediumPtr = (type[i]*)MediumPtrAddr;
2143 MediumPtr = Bld.CreateElementBitCast(
2144 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2145
2146 //*MediumPtr = elem
2147 Bld.CreateStore(Elem, MediumPtr);
2148
2149 Bld.CreateBr(MergeBB);
2150
2151 CGF.EmitBlock(ElseBB);
2152 Bld.CreateBr(MergeBB);
2153
2154 CGF.EmitBlock(MergeBB);
2155
2156 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2157 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2158 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
2159
2160 auto *NumActiveThreads = Bld.CreateNSWMul(
2161 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
2162 // named_barrier_sync(ParallelBarrierID, num_active_threads)
2163 syncParallelThreads(CGF, NumActiveThreads);
2164
2165 //
2166 // Warp 0 copies reduce element from transfer medium.
2167 //
2168 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2169 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2170 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2171
2172 // Up to 32 threads in warp 0 are active.
2173 auto IsActiveThread =
2174 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2175 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2176
2177 CGF.EmitBlock(W0ThenBB);
2178
2179 // SrcMediumPtr = &medium[tid]
2180 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2181 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2182 Address SrcMediumPtr(SrcMediumPtrVal,
2183 C.getTypeAlignInChars(Private->getType()));
2184 // SrcMediumVal = *SrcMediumPtr;
2185 SrcMediumPtr = Bld.CreateElementBitCast(
2186 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2187 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
2188 SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2189
2190 // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
2191 Address TargetElemPtrPtr =
2192 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2193 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2194 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2195 Address TargetElemPtr =
2196 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
2197 TargetElemPtr = Bld.CreateElementBitCast(
2198 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2199
2200 // *TargetElemPtr = SrcMediumVal;
2201 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2202 Private->getType());
2203 Bld.CreateBr(W0MergeBB);
2204
2205 CGF.EmitBlock(W0ElseBB);
2206 Bld.CreateBr(W0MergeBB);
2207
2208 CGF.EmitBlock(W0MergeBB);
2209
2210 // While warp 0 copies values from transfer medium, all other warps must
2211 // wait.
2212 syncParallelThreads(CGF, NumActiveThreads);
2213 Idx++;
2214 }
2215
2216 CGF.FinishFunction();
2217 return Fn;
2218}
2219
2220/// Emit a helper that reduces data across two OpenMP threads (lanes)
2221/// in the same warp. It uses shuffle instructions to copy over data from
2222/// a remote lane's stack. The reduction algorithm performed is specified
2223/// by the fourth parameter.
2224///
2225/// Algorithm Versions.
2226/// Full Warp Reduce (argument value 0):
2227/// This algorithm assumes that all 32 lanes are active and gathers
2228/// data from these 32 lanes, producing a single resultant value.
2229/// Contiguous Partial Warp Reduce (argument value 1):
2230/// This algorithm assumes that only a *contiguous* subset of lanes
2231/// are active. This happens for the last warp in a parallel region
2232/// when the user specified num_threads is not an integer multiple of
2233/// 32. This contiguous subset always starts with the zeroth lane.
2234/// Partial Warp Reduce (argument value 2):
2235/// This algorithm gathers data from any number of lanes at any position.
2236/// All reduced values are stored in the lowest possible lane. The set
2237/// of problems every algorithm addresses is a super set of those
2238/// addressable by algorithms with a lower version number. Overhead
2239/// increases as algorithm version increases.
2240///
2241/// Terminology
2242/// Reduce element:
2243/// Reduce element refers to the individual data field with primitive
2244/// data types to be combined and reduced across threads.
2245/// Reduce list:
2246/// Reduce list refers to a collection of local, thread-private
2247/// reduce elements.
2248/// Remote Reduce list:
2249/// Remote Reduce list refers to a collection of remote (relative to
2250/// the current thread) reduce elements.
2251///
2252/// We distinguish between three states of threads that are important to
2253/// the implementation of this function.
2254/// Alive threads:
2255/// Threads in a warp executing the SIMT instruction, as distinguished from
2256/// threads that are inactive due to divergent control flow.
2257/// Active threads:
2258/// The minimal set of threads that has to be alive upon entry to this
2259/// function. The computation is correct iff active threads are alive.
2260/// Some threads are alive but they are not active because they do not
2261/// contribute to the computation in any useful manner. Turning them off
2262/// may introduce control flow overheads without any tangible benefits.
2263/// Effective threads:
2264/// In order to comply with the argument requirements of the shuffle
2265/// function, we must keep all lanes holding data alive. But at most
2266/// half of them perform value aggregation; we refer to this half of
2267/// threads as effective. The other half is simply handing off their
2268/// data.
2269///
2270/// Procedure
2271/// Value shuffle:
2272/// In this step active threads transfer data from higher lane positions
2273/// in the warp to lower lane positions, creating Remote Reduce list.
2274/// Value aggregation:
2275/// In this step, effective threads combine their thread local Reduce list
2276/// with Remote Reduce list and store the result in the thread local
2277/// Reduce list.
2278/// Value copy:
2279/// In this step, we deal with the assumption made by algorithm 2
2280/// (i.e. contiguity assumption). When we have an odd number of lanes
2281/// active, say 2k+1, only k threads will be effective and therefore k
2282/// new values will be produced. However, the Reduce list owned by the
2283/// (2k+1)th thread is ignored in the value aggregation. Therefore
2284/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
2285/// that the contiguity assumption still holds.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002286static llvm::Value *emitShuffleAndReduceFunction(
2287 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2288 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002289 auto &C = CGM.getContext();
2290
2291 // Thread local Reduce list used to host the values of data to be reduced.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002292 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2293 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002294 // Current lane id; could be logical.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002295 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2296 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002297 // Offset of the remote source lane relative to the current lane.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002298 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2299 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002300 // Algorithm version. This is expected to be known at compile time.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002301 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2302 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002303 FunctionArgList Args;
2304 Args.push_back(&ReduceListArg);
2305 Args.push_back(&LaneIDArg);
2306 Args.push_back(&RemoteLaneOffsetArg);
2307 Args.push_back(&AlgoVerArg);
2308
2309 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2310 auto *Fn = llvm::Function::Create(
2311 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2312 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00002313 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002314 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002315 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002316
2317 auto &Bld = CGF.Builder;
2318
2319 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2320 Address LocalReduceList(
2321 Bld.CreatePointerBitCastOrAddrSpaceCast(
2322 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2323 C.VoidPtrTy, SourceLocation()),
2324 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2325 CGF.getPointerAlign());
2326
2327 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2328 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2329 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2330
2331 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2332 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2333 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2334
2335 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2336 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2337 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2338
2339 // Create a local thread-private variable to host the Reduce list
2340 // from a remote lane.
2341 Address RemoteReduceList =
2342 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2343
2344 // This loop iterates through the list of reduce elements and copies,
2345 // element by element, from a remote lane in the warp to RemoteReduceList,
2346 // hosted on the thread's stack.
2347 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2348 LocalReduceList, RemoteReduceList,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002349 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2350 /*ScratchpadIndex=*/nullptr,
2351 /*ScratchpadWidth=*/nullptr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002352
2353 // The actions to be performed on the Remote Reduce list is dependent
2354 // on the algorithm version.
2355 //
2356 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2357 // LaneId % 2 == 0 && Offset > 0):
2358 // do the reduction value aggregation
2359 //
2360 // The thread local variable Reduce list is mutated in place to host the
2361 // reduced data, which is the aggregated value produced from local and
2362 // remote lanes.
2363 //
2364 // Note that AlgoVer is expected to be a constant integer known at compile
2365 // time.
2366 // When AlgoVer==0, the first conjunction evaluates to true, making
2367 // the entire predicate true during compile time.
2368 // When AlgoVer==1, the second conjunction has only the second part to be
2369 // evaluated during runtime. Other conjunctions evaluates to false
2370 // during compile time.
2371 // When AlgoVer==2, the third conjunction has only the second part to be
2372 // evaluated during runtime. Other conjunctions evaluates to false
2373 // during compile time.
2374 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
2375
2376 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2377 auto CondAlgo1 = Bld.CreateAnd(
2378 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
2379
2380 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
2381 auto CondAlgo2 = Bld.CreateAnd(
2382 Algo2,
2383 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
2384 Bld.getInt16(0)));
2385 CondAlgo2 = Bld.CreateAnd(
2386 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
2387
2388 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
2389 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
2390
2391 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2392 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2393 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2394 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
2395
2396 CGF.EmitBlock(ThenBB);
2397 // reduce_function(LocalReduceList, RemoteReduceList)
2398 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2399 LocalReduceList.getPointer(), CGF.VoidPtrTy);
2400 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2401 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002402 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
2403 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002404 Bld.CreateBr(MergeBB);
2405
2406 CGF.EmitBlock(ElseBB);
2407 Bld.CreateBr(MergeBB);
2408
2409 CGF.EmitBlock(MergeBB);
2410
2411 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
2412 // Reduce list.
2413 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
2414 auto CondCopy = Bld.CreateAnd(
2415 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
2416
2417 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
2418 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
2419 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
2420 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
2421
2422 CGF.EmitBlock(CpyThenBB);
2423 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
2424 RemoteReduceList, LocalReduceList);
2425 Bld.CreateBr(CpyMergeBB);
2426
2427 CGF.EmitBlock(CpyElseBB);
2428 Bld.CreateBr(CpyMergeBB);
2429
2430 CGF.EmitBlock(CpyMergeBB);
2431
2432 CGF.FinishFunction();
2433 return Fn;
2434}
2435
2436///
2437/// Design of OpenMP reductions on the GPU
2438///
2439/// Consider a typical OpenMP program with one or more reduction
2440/// clauses:
2441///
2442/// float foo;
2443/// double bar;
2444/// #pragma omp target teams distribute parallel for \
2445/// reduction(+:foo) reduction(*:bar)
2446/// for (int i = 0; i < N; i++) {
2447/// foo += A[i]; bar *= B[i];
2448/// }
2449///
2450/// where 'foo' and 'bar' are reduced across all OpenMP threads in
2451/// all teams. In our OpenMP implementation on the NVPTX device an
2452/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
2453/// within a team are mapped to CUDA threads within a threadblock.
2454/// Our goal is to efficiently aggregate values across all OpenMP
2455/// threads such that:
2456///
2457/// - the compiler and runtime are logically concise, and
2458/// - the reduction is performed efficiently in a hierarchical
2459/// manner as follows: within OpenMP threads in the same warp,
2460/// across warps in a threadblock, and finally across teams on
2461/// the NVPTX device.
2462///
2463/// Introduction to Decoupling
2464///
2465/// We would like to decouple the compiler and the runtime so that the
2466/// latter is ignorant of the reduction variables (number, data types)
2467/// and the reduction operators. This allows a simpler interface
2468/// and implementation while still attaining good performance.
2469///
2470/// Pseudocode for the aforementioned OpenMP program generated by the
2471/// compiler is as follows:
2472///
2473/// 1. Create private copies of reduction variables on each OpenMP
2474/// thread: 'foo_private', 'bar_private'
2475/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
2476/// to it and writes the result in 'foo_private' and 'bar_private'
2477/// respectively.
2478/// 3. Call the OpenMP runtime on the GPU to reduce within a team
2479/// and store the result on the team master:
2480///
2481/// __kmpc_nvptx_parallel_reduce_nowait(...,
2482/// reduceData, shuffleReduceFn, interWarpCpyFn)
2483///
2484/// where:
2485/// struct ReduceData {
2486/// double *foo;
2487/// double *bar;
2488/// } reduceData
2489/// reduceData.foo = &foo_private
2490/// reduceData.bar = &bar_private
2491///
2492/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
2493/// auxiliary functions generated by the compiler that operate on
2494/// variables of type 'ReduceData'. They aid the runtime perform
2495/// algorithmic steps in a data agnostic manner.
2496///
2497/// 'shuffleReduceFn' is a pointer to a function that reduces data
2498/// of type 'ReduceData' across two OpenMP threads (lanes) in the
2499/// same warp. It takes the following arguments as input:
2500///
2501/// a. variable of type 'ReduceData' on the calling lane,
2502/// b. its lane_id,
2503/// c. an offset relative to the current lane_id to generate a
2504/// remote_lane_id. The remote lane contains the second
2505/// variable of type 'ReduceData' that is to be reduced.
2506/// d. an algorithm version parameter determining which reduction
2507/// algorithm to use.
2508///
2509/// 'shuffleReduceFn' retrieves data from the remote lane using
2510/// efficient GPU shuffle intrinsics and reduces, using the
2511/// algorithm specified by the 4th parameter, the two operands
2512/// element-wise. The result is written to the first operand.
2513///
2514/// Different reduction algorithms are implemented in different
2515/// runtime functions, all calling 'shuffleReduceFn' to perform
2516/// the essential reduction step. Therefore, based on the 4th
2517/// parameter, this function behaves slightly differently to
2518/// cooperate with the runtime to ensure correctness under
2519/// different circumstances.
2520///
2521/// 'InterWarpCpyFn' is a pointer to a function that transfers
2522/// reduced variables across warps. It tunnels, through CUDA
2523/// shared memory, the thread-private data of type 'ReduceData'
2524/// from lane 0 of each warp to a lane in the first warp.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002525/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2526/// The last team writes the global reduced value to memory.
2527///
2528/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2529/// reduceData, shuffleReduceFn, interWarpCpyFn,
2530/// scratchpadCopyFn, loadAndReduceFn)
2531///
2532/// 'scratchpadCopyFn' is a helper that stores reduced
2533/// data from the team master to a scratchpad array in
2534/// global memory.
2535///
2536/// 'loadAndReduceFn' is a helper that loads data from
2537/// the scratchpad array and reduces it with the input
2538/// operand.
2539///
2540/// These compiler generated functions hide address
2541/// calculation and alignment information from the runtime.
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002542/// 5. if ret == 1:
2543/// The team master of the last team stores the reduced
2544/// result to the globals in memory.
2545/// foo += reduceData.foo; bar *= reduceData.bar
2546///
2547///
2548/// Warp Reduction Algorithms
2549///
2550/// On the warp level, we have three algorithms implemented in the
2551/// OpenMP runtime depending on the number of active lanes:
2552///
2553/// Full Warp Reduction
2554///
2555/// The reduce algorithm within a warp where all lanes are active
2556/// is implemented in the runtime as follows:
2557///
2558/// full_warp_reduce(void *reduce_data,
2559/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2560/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2561/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2562/// }
2563///
2564/// The algorithm completes in log(2, WARPSIZE) steps.
2565///
2566/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
2567/// not used therefore we save instructions by not retrieving lane_id
2568/// from the corresponding special registers. The 4th parameter, which
2569/// represents the version of the algorithm being used, is set to 0 to
2570/// signify full warp reduction.
2571///
2572/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2573///
2574/// #reduce_elem refers to an element in the local lane's data structure
2575/// #remote_elem is retrieved from a remote lane
2576/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2577/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2578///
2579/// Contiguous Partial Warp Reduction
2580///
2581/// This reduce algorithm is used within a warp where only the first
2582/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2583/// number of OpenMP threads in a parallel region is not a multiple of
2584/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2585///
2586/// void
2587/// contiguous_partial_reduce(void *reduce_data,
2588/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2589/// int size, int lane_id) {
2590/// int curr_size;
2591/// int offset;
2592/// curr_size = size;
///     offset = curr_size/2;
2594/// while (offset>0) {
2595/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2596/// curr_size = (curr_size+1)/2;
2597/// offset = curr_size/2;
2598/// }
2599/// }
2600///
2601/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2602///
2603/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2604/// if (lane_id < offset)
2605/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2606/// else
2607/// reduce_elem = remote_elem
2608///
2609/// This algorithm assumes that the data to be reduced are located in a
2610/// contiguous subset of lanes starting from the first. When there is
2611/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
2613///
2614/// Dispersed Partial Warp Reduction
2615///
2616/// This algorithm is used within a warp when any discontiguous subset of
2617/// lanes are active. It is used to implement the reduction operation
2618/// across lanes in an OpenMP simd region or in a nested parallel region.
2619///
2620/// void
2621/// dispersed_partial_reduce(void *reduce_data,
2622/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2623/// int size, remote_id;
2624/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2625/// do {
2626/// remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
2628/// # is present right after the current lane.
2629/// size = number_of_active_lanes_in_this_warp();
2630/// logical_lane_id /= 2;
2631/// ShuffleReduceFn(reduce_data, logical_lane_id,
2632/// remote_id-1-threadIdx.x, 2);
2633/// } while (logical_lane_id % 2 == 0 && size > 1);
2634/// }
2635///
2636/// There is no assumption made about the initial state of the reduction.
2637/// Any number of lanes (>=1) could be active at any position. The reduction
2638/// result is returned in the first active lane.
2639///
2640/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2641///
2642/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2643/// if (lane_id % 2 == 0 && offset > 0)
2644/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2645/// else
2646/// reduce_elem = remote_elem
2647///
2648///
2649/// Intra-Team Reduction
2650///
2651/// This function, as implemented in the runtime call
2652/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
2653/// threads in a team. It first reduces within a warp using the
2654/// aforementioned algorithms. We then proceed to gather all such
2655/// reduced values at the first warp.
2656///
2657/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2658/// data from each of the "warp master" (zeroth lane of each warp, where
2659/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2660/// a mathematical sense) the problem of reduction across warp masters in
2661/// a block to the problem of warp reduction.
2662///
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002663///
2664/// Inter-Team Reduction
2665///
2666/// Once a team has reduced its data to a single value, it is stored in
2667/// a global scratchpad array. Since each team has a distinct slot, this
2668/// can be done without locking.
2669///
2670/// The last team to write to the scratchpad array proceeds to reduce the
2671/// scratchpad array. One or more workers in the last team use the helper
2672/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
2673/// the k'th worker reduces every k'th element.
2674///
2675/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
2676/// reduce across workers and compute a globally reduced value.
2677///
// Emit the code sequence for an OpenMP reduction clause on the NVPTX target.
// Builds the RedList array of pointers to the (privatized) reduction
// variables, emits the helper functions the NVPTX runtime needs
// (shuffle-and-reduce, inter-warp copy and, for teams, the scratchpad
// copy/reduce helpers), calls the matching __kmpc_nvptx_*_reduce_nowait
// entry point, and finally emits the switch on its result that performs the
// user's combiner operations in the winning thread.
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  // FIXME: Add support for simd reduction.
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  auto &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  // Store the address of each RHS (private copy) into the RedList slot; a
  // VLA reduction item additionally stores its element count in the slot
  // right after the data pointer.
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      // The size is smuggled through the void* slot via inttoptr.
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func(), the function that applies the user's combiner to
  // two RedList-shaped argument packs.
  auto *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  auto *ThreadId = getThreadID(CGF, Loc);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  auto *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Res = nullptr;
  if (ParallelReduction) {
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  }

  if (TeamsReduction) {
    // Teams reductions additionally go through a global scratchpad array;
    // see the "Inter-Team Reduction" description above.
    auto *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res)
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  // Emit each combiner "lhs = lhs op rhs" for the winning thread.
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  // The exit action releases the reduction with __kmpc_end_reduce_nowait.
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002818
// Translate a parameter of a target-region outlined function into the form
// expected by the NVPTX device: reference parameters are rewritten as
// restrict-qualified pointers in the NVPTX local address space (and, for
// map-captured values, a global-address-space pointee). Non-reference
// parameters are returned unchanged.
const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      // Mapped data lives in device global memory; opencl_global is the
      // language-level address space that lowers to it.
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  // Reference -> pointer, marked restrict since the outlined arguments do
  // not alias each other.
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  // NVPTX target address space 5 is the per-thread local space.
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  // Recreate the declaration with the translated type, preserving its kind
  // (implicit parameter vs. regular parameter), name and location.
  if (isa<ImplicitParamDecl>(NativeParam)) {
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  }
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getLocStart(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
2851
// Materialize the address of the native (reference-typed) parameter from the
// translated target parameter: load the pointer stored in the target
// parameter's local slot, cast it back to the native pointee's address space
// (via the generic address space), and stash it in a fresh temporary whose
// address is returned.
Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  // The reference parameter itself is modeled as a temporary holding the
  // translated pointer.
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
2882
2883void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
Alexey Bataev3c595a62017-08-14 15:01:03 +00002884 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002885 ArrayRef<llvm::Value *> Args) const {
2886 SmallVector<llvm::Value *, 4> TargetArgs;
Alexey Bataev07ed94a2017-08-15 14:34:04 +00002887 TargetArgs.reserve(Args.size());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002888 auto *FnType =
2889 cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
2890 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
Alexey Bataev07ed94a2017-08-15 14:34:04 +00002891 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
2892 TargetArgs.append(std::next(Args.begin(), I), Args.end());
2893 break;
2894 }
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002895 llvm::Type *TargetType = FnType->getParamType(I);
2896 llvm::Value *NativeArg = Args[I];
2897 if (!TargetType->isPointerTy()) {
2898 TargetArgs.emplace_back(NativeArg);
2899 continue;
2900 }
2901 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataevc99042b2018-03-15 18:10:54 +00002902 NativeArg,
2903 NativeArg->getType()->getPointerElementType()->getPointerTo());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002904 TargetArgs.emplace_back(
2905 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
2906 }
Alexey Bataev3c595a62017-08-14 15:01:03 +00002907 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002908}
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002909
/// Emit function which wraps the outline parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
///
/// The wrapper takes (parallel_level : i16, source_thread_id : i32),
/// retrieves the shared-variable list from the NVPTX runtime via
/// __kmpc_get_shared_variables, unpacks loop bounds (for loop-bound-sharing
/// directives) and the captured variables, and forwards everything to
/// \p OutlinedParallelFn.
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  // The wrapper is module-internal: only the worker dispatch loop calls it.
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getLocStart(), D.getLocStart());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Zero-initialized i32 temporary passed as the bound-thread-id argument of
  // the outlined function.
  Address ZeroAddr = CGF.CreateMemTemp(
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
      /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // First two outlined-function arguments: &global_tid and &bound_tid.
  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  // Loop-bound-sharing directives pass the lower and upper bounds in the
  // first two slots of the shared-argument list.
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                             CGF.getPointerSize());
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                     CGF.getPointerSize());
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  // Remaining slots hold one pointer per captured variable; by-copy
  // captures are converted to the uintptr representation the outlined
  // function expects.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
                                               CGF.getPointerSize());
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
3033
// Per-function prolog for the NVPTX target: analyze the body of \p D for
// variables whose address escapes the construct (and thus must be
// "globalized", i.e. moved out of thread-private stack memory), and record
// them in FunctionGlobalizedDecls. Unless globalization must be delayed
// (OpenMP captured regions handle it at region-emission time), the prolog
// is emitted immediately and a cleanup is pushed to emit the matching
// epilog.
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    // OpenMP captured regions emit the globalization code themselves.
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
  }
  if (!Body)
    return;
  // Find all variables that escape their declaration context.
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  // Nothing escapes: no globalization needed for this function.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  // Map each escaped variable to its field in the globalized record; the
  // actual address is filled in later, when the record is materialized.
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getLocStart());
    // Cleanup that emits the matching epilog on all (normal and EH) exits.
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
3086
3087Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
3088 const VarDecl *VD) {
Alexey Bataev63cc8e92018-03-20 14:45:59 +00003089 VD = VD->getCanonicalDecl();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00003090 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
3091 if (I == FunctionGlobalizedDecls.end())
3092 return Address::invalid();
Alexey Bataevc99042b2018-03-15 18:10:54 +00003093 auto VDI = I->getSecond().LocalVarData.find(VD);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00003094 if (VDI != I->getSecond().LocalVarData.end())
3095 return VDI->second.second;
3096 if (VD->hasAttrs()) {
3097 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
3098 E(VD->attr_end());
3099 IT != E; ++IT) {
3100 auto VDI = I->getSecond().LocalVarData.find(
3101 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
3102 ->getCanonicalDecl());
3103 if (VDI != I->getSecond().LocalVarData.end())
3104 return VDI->second.second;
3105 }
3106 }
3107 return Address::invalid();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00003108}
3109
3110void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00003111 FunctionGlobalizedDecls.erase(CGF.CurFn);
3112 CGOpenMPRuntime::functionFinished(CGF);
3113}