//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// \brief Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// \brief Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// \brief Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// \brief Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// \brief Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// \brief Call to void __kmpc_get_shared_variables(void ***GlobalArgs);
  OMPRTL_NVPTX__kmpc_get_shared_variables,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
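
// Illustrative usage (not part of the original source): bracketing a region
// with paired runtime calls, e.g. __kmpc_serialized_parallel /
// __kmpc_end_serialized_parallel, looks roughly like
//   llvm::Value *Args[] = {RTLoc, ThreadID};
//   NVPTXActionTy Action(EnterFn, Args, ExitFn, Args);
//   CodeGen.setAction(Action); // Enter()/Exit() then wrap the emitted region.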

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  llvm::SmallPtrSet<const ValueDecl *, 4> IgnoredDecls;
  bool AllEscaped = false;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;

  void markAsEscaped(const ValueDecl *VD) {
    if (IgnoredDecls.count(VD))
      return;
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        if (FD->getType()->isReferenceType())
          return;
        EscapedParameters.insert(VD);
      }
    }
    EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType()) {
      markAsEscaped(VD);
      if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
        if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
          const bool SavedAllEscaped = AllEscaped;
          AllEscaped = true;
          Visit(VarD->getInit());
          AllEscaped = SavedAllEscaped;
        }
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }

  typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
  static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
    return P1.first > P2.first;
  }

  void buildRecordForGlobalizedVars() {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    if (EscapedDecls.empty())
      return;
    ASTContext &C = CGF.getContext();
    SmallVector<VarsDataTy, 4> GlobalizedVars;
    for (const auto *D : EscapedDecls)
      GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
    std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                     stable_sort_comparator);
    // Build struct _globalized_locals_ty {
    //   /* globalized vars */
    // };
    GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
    GlobalizedRD->startDefinition();
    for (const auto &Pair : GlobalizedVars) {
      const ValueDecl *VD = Pair.second;
      QualType Type = VD->getType();
      if (Type->isLValueReferenceType())
        Type = C.getPointerType(Type.getNonReferenceType());
      else
        Type = Type.getNonReferenceType();
      SourceLocation Loc = VD->getLocation();
      auto *Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      GlobalizedRD->addDecl(Field);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
                                                 E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
      MappedDeclsFields.try_emplace(VD, Field);
    }
    GlobalizedRD->completeDefinition();
  }
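
  // Illustrative example (not part of the original source): if an 'int i'
  // and a 'double d' escape, the record built above is laid out by
  // decreasing alignment, roughly:
  //   struct _globalized_locals_ty {
  //     double d; // align 8
  //     int i;    // align 4
  //   };
  // Sorting by alignment (stable_sort_comparator) minimizes padding between
  // the globalized fields.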

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF,
                               ArrayRef<const ValueDecl *> IgnoredDecls)
      : CGF(CGF), IgnoredDecls(IgnoredDecls.begin(), IgnoredDecls.end()) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const auto *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (D->hasAssociatedStmt()) {
      if (const auto *S =
              dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt()))
        VisitOpenMPCapturedStmt(S);
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const auto &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const auto &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else
        Visit(Arg);
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else
      Visit(E->getSubExpr());
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const auto *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const auto *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord() {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars();
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped local variables that are actually parameters
  /// passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
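
// Illustrative arithmetic (not part of the original source): with
// WarpSize = 32, LaneIDBits = 5 and LaneIDMask = 31 (0b11111), a thread with
// tid.x = 37 gets
//   warp id = 37 >> 5 = 1
//   lane id = 37 & 31 = 5
// i.e. the second warp of the block, sixth lane within that warp.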

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}
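
// Illustrative arithmetic (not part of the original source): in generic mode
// a CTA launched with 128 threads yields thread_limit = 128 - 32 = 96 worker
// threads, with the last warp reserved for the master; in spmd mode all 128
// threads are part of the team.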

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
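
// Worked example (not part of the original source) of the masking above for
// NumThreads = 1024 and WarpSize = 32:
//   Mask  = 32 - 1                   = 0b0000011111
//   ~Mask                            = 0b1111100000
//   (1024 - 1) & ~Mask = 1023 & ~31  = 992
// i.e. the master is the first lane of the last warp, matching the examples
// in the comment above.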

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(nullptr), Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode
             ? CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd
             : CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getLocStart());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitOutlinedFunctionCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
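
// Illustrative thread layout (not part of the original source) for a
// generic-mode CTA of 128 threads, given the branches emitted above:
//   tid  0..95   IsWorker (tid < thread_limit = 96): enter the worker loop.
//   tid  96      IsMaster (tid == master_tid = 96): run the sequential
//                region and orchestrate parallel work.
//   tid 97..127  neither: branch straight to .exit and idle.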

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  emitGenericVarsEpilog(CGF);
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}
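
// Illustrative IR (not part of the original source; the kernel name is
// hypothetical): for a generic-mode target region named
// __omp_offloading_<id>_foo_l1, the code above emits roughly
//   @__omp_offloading_<id>_foo_l1_exec_mode = weak constant i8 <Mode>
// which the offload library inspects before launching the kernel.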

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //
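  // Schematic handshake (illustrative, not part of the original source):
  //   master: __kmpc_kernel_prepare_parallel(fn); CTA barrier (release);
  //           ...parallel region runs...; CTA barrier (join).
  //   worker: CTA barrier -> __kmpc_kernel_parallel(&fn) -> run fn if
  //           active -> __kmpc_kernel_end_parallel() -> CTA barrier -> loop.
  //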

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the master thread ID;
    emitOutlinedFunctionCall(CGF, WST.Loc, W,
                             {Bld.getInt16(/*ParallelLevel=*/0),
                              getMasterThreadID(CGF)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
    /// int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode = getExecutionMode(CGM);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}
1146
Samuel Antao45bfe4c2016-02-08 15:59:20 +00001147CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001148 : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001149 if (!CGM.getLangOpts().OpenMPIsDevice)
1150 llvm_unreachable("OpenMP NVPTX can only handle device code.");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001151}
Carlo Bertollic6872252016-04-04 15:55:02 +00001152
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001153void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1154 OpenMPProcBindClauseKind ProcBind,
1155 SourceLocation Loc) {
1156 // Do nothing in case of Spmd mode and L0 parallel.
1157 // TODO: If in Spmd mode and L1 parallel emit the clause.
1158 if (isInSpmdExecutionMode())
1159 return;
1160
1161 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1162}
1163
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001164void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1165 llvm::Value *NumThreads,
1166 SourceLocation Loc) {
1167 // Do nothing in case of Spmd mode and L0 parallel.
1168 // TODO: If in Spmd mode and L1 parallel emit the clause.
1169 if (isInSpmdExecutionMode())
1170 return;
1171
1172 CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1173}
1174
Carlo Bertollic6872252016-04-04 15:55:02 +00001175void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
1176 const Expr *NumTeams,
1177 const Expr *ThreadLimit,
1178 SourceLocation Loc) {}
1179
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001180llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1181 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1182 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001183 SourceLocation Loc = D.getLocStart();
1184
1185 // Emit target region as a standalone region.
1186 class NVPTXPrePostActionTy : public PrePostActionTy {
1187 SourceLocation &Loc;
1188
1189 public:
1190 NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1191 void Enter(CodeGenFunction &CGF) override {
1192 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1193 .emitGenericVarsProlog(CGF, Loc);
1194 }
1195 void Exit(CodeGenFunction &CGF) override {
1196 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1197 .emitGenericVarsEpilog(CGF);
1198 }
1199 } Action(Loc);
1200 CodeGen.setAction(Action);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001201 auto *OutlinedFun =
1202 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1203 D, ThreadIDVar, InnermostKind, CodeGen));
1204 if (!isInSpmdExecutionMode()) {
1205 llvm::Function *WrapperFun =
1206 createParallelDataSharingWrapper(OutlinedFun, D);
1207 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1208 }
1209
1210 return OutlinedFun;
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001211}
1212
1213llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
Carlo Bertollic6872252016-04-04 15:55:02 +00001214 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1215 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001216 SourceLocation Loc = D.getLocStart();
Carlo Bertollic6872252016-04-04 15:55:02 +00001217
Alexey Bataevc99042b2018-03-15 18:10:54 +00001218 // Emit target region as a standalone region.
1219 class NVPTXPrePostActionTy : public PrePostActionTy {
1220 SourceLocation &Loc;
1221
1222 public:
1223 NVPTXPrePostActionTy(SourceLocation &Loc) : Loc(Loc) {}
1224 void Enter(CodeGenFunction &CGF) override {
1225 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1226 .emitGenericVarsProlog(CGF, Loc);
1227 }
1228 void Exit(CodeGenFunction &CGF) override {
1229 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
1230 .emitGenericVarsEpilog(CGF);
1231 }
1232 } Action(Loc);
1233 CodeGen.setAction(Action);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001234 llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
1235 D, ThreadIDVar, InnermostKind, CodeGen);
1236 llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
1237 OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
Mehdi Amini6aa9e9b2017-05-29 05:38:20 +00001238 OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001239 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
Carlo Bertollic6872252016-04-04 15:55:02 +00001240
1241 return OutlinedFun;
1242}
1243
Alexey Bataevc99042b2018-03-15 18:10:54 +00001244void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
1245 SourceLocation Loc) {
1246 CGBuilderTy &Bld = CGF.Builder;
1247
1248 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1249 if (I == FunctionGlobalizedDecls.end())
1250 return;
1251 const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord;
1252 QualType RecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
1253
1254 // Recover pointer to this function's global record. The runtime will
1255 // handle the specifics of the allocation of the memory.
1256 // Use actual memory size of the record including the padding
1257 // for alignment purposes.
1258 unsigned Alignment =
1259 CGM.getContext().getTypeAlignInChars(RecTy).getQuantity();
1260 unsigned GlobalRecordSize =
1261 CGM.getContext().getTypeSizeInChars(RecTy).getQuantity();
1262 GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
 1263    // TODO: allow the use of shared memory to be controlled by
 1264    // the user; for now, default to global.
1265 llvm::Value *GlobalRecordSizeArg[] = {
1266 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
1267 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
1268 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
1269 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
1270 GlobalRecordSizeArg);
1271 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1272 GlobalRecValue, CGF.ConvertTypeForMem(RecTy)->getPointerTo());
1273 LValue Base = CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, RecTy);
1274 I->getSecond().GlobalRecordAddr = GlobalRecValue;
1275
1276 // Emit the "global alloca" which is a GEP from the global declaration record
1277 // using the pointer returned by the runtime.
1278 for (auto &Rec : I->getSecond().LocalVarData) {
1279 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
1280 llvm::Value *ParValue;
1281 if (EscapedParam) {
1282 const auto *VD = cast<VarDecl>(Rec.first);
1283 LValue ParLVal =
1284 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
1285 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
1286 }
1287 const FieldDecl *FD = Rec.second.first;
1288 LValue VarAddr = CGF.EmitLValueForField(Base, FD);
1289 Rec.second.second = VarAddr.getAddress();
1290 if (EscapedParam) {
1291 const auto *VD = cast<VarDecl>(Rec.first);
1292 CGF.EmitStoreOfScalar(ParValue, VarAddr);
1293 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
1294 }
1295 }
1296 I->getSecond().MappedParams->apply(CGF);
1297}
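// Net effect of the prolog, as a rough sketch of the emitted IR (value
// names are illustrative only):
//
//   %rec  = call i8* @__kmpc_data_sharing_push_stack(i64 <padded rec size>,
//                                                    i16 0 /*global mem*/)
//   %base = bitcast i8* %rec to %struct.GlobalizedRecord*
//   ; every escaped declaration is then addressed as a field GEP off %base,
//   ; and escaped parameters are copied into their corresponding fields.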
1298
1299void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF) {
1300 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
1301 if (I != FunctionGlobalizedDecls.end() && I->getSecond().GlobalRecordAddr) {
1302 I->getSecond().MappedParams->restore(CGF);
1303 if (!CGF.HaveInsertPoint())
1304 return;
1305 CGF.EmitRuntimeCall(
1306 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
1307 I->getSecond().GlobalRecordAddr);
1308 }
1309}
1310
Carlo Bertollic6872252016-04-04 15:55:02 +00001311void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
1312 const OMPExecutableDirective &D,
1313 SourceLocation Loc,
1314 llvm::Value *OutlinedFn,
1315 ArrayRef<llvm::Value *> CapturedVars) {
1316 if (!CGF.HaveInsertPoint())
1317 return;
1318
1319 Address ZeroAddr =
1320 CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
1321 /*Name*/ ".zero.addr");
1322 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1323 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1324 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1325 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1326 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001327 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00001328}
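// That is, on the device the teams outlined function is invoked directly as
//   OutlinedFn(&zero, &zero, <captured vars>...)
// with dummy zero values standing in for the global and bound thread ids.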
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001329
1330void CGOpenMPRuntimeNVPTX::emitParallelCall(
1331 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1332 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1333 if (!CGF.HaveInsertPoint())
1334 return;
1335
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001336 if (isInSpmdExecutionMode())
1337 emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
1338 else
1339 emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001340}
1341
1342void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
1343 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1344 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1345 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001346 llvm::Function *WFn = WrapperFunctionsMap[Fn];
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001347
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001348 assert(WFn && "Wrapper function does not exist!");
1349
 1350    // Give the outlined function internal linkage so it can be inlined at
 1350    // its call site.
1351 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
1352
1353 auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
1354 PrePostActionTy &) {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001355 CGBuilderTy &Bld = CGF.Builder;
1356
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001357 llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
1358
1359 // Prepare for parallel region. Indicate the outlined function.
1360 llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
1361 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1362 OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
1363 Args);
1364
1365 // Create a private scope that will globalize the arguments
1366 // passed from the outside of the target region.
1367 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
1368
 1369    // There's something to share.
1370 if (!CapturedVars.empty()) {
 1371      // Allocate the buffer of references to the variables being shared
 1371      // with the workers.
1372 Address SharedArgs =
1373 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
1374 llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
1375
1376 llvm::Value *DataSharingArgs[] = {
1377 SharedArgsPtr,
1378 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
1379 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
1380 OMPRTL_NVPTX__kmpc_begin_sharing_variables),
1381 DataSharingArgs);
1382
1383 // Store variable address in a list of references to pass to workers.
1384 unsigned Idx = 0;
1385 ASTContext &Ctx = CGF.getContext();
1386 Address SharedArgListAddress = CGF.EmitLoadOfPointer(SharedArgs,
1387 Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
1388 .castAs<PointerType>());
1389 for (llvm::Value *V : CapturedVars) {
1390 Address Dst = Bld.CreateConstInBoundsGEP(
1391 SharedArgListAddress, Idx, CGF.getPointerSize());
1392 llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
1393 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
1394 Ctx.getPointerType(Ctx.VoidPtrTy));
Alexey Bataevc99042b2018-03-15 18:10:54 +00001395 ++Idx;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001396 }
1397 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001398
1399 // Activate workers. This barrier is used by the master to signal
1400 // work for the workers.
1401 syncCTAThreads(CGF);
1402
1403 // OpenMP [2.5, Parallel Construct, p.49]
1404 // There is an implied barrier at the end of a parallel region. After the
1405 // end of a parallel region, only the master thread of the team resumes
1406 // execution of the enclosing task region.
1407 //
1408 // The master waits at this barrier until all workers are done.
1409 syncCTAThreads(CGF);
1410
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001411 if (!CapturedVars.empty())
1412 CGF.EmitRuntimeCall(
1413 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
1414
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001415 // Remember for post-processing in worker loop.
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001416 Work.emplace_back(WFn);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001417 };
1418
1419 auto *RTLoc = emitUpdateLocation(CGF, Loc);
1420 auto *ThreadID = getThreadID(CGF, Loc);
1421 llvm::Value *Args[] = {RTLoc, ThreadID};
1422
Alexey Bataev3c595a62017-08-14 15:01:03 +00001423 auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
1424 PrePostActionTy &) {
1425 auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
1426 PrePostActionTy &Action) {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001427 Action.Enter(CGF);
1428
1429 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
1430 OutlinedFnArgs.push_back(
1431 llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
1432 OutlinedFnArgs.push_back(
1433 llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
1434 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001435 emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001436 };
1437
1438 RegionCodeGenTy RCG(CodeGen);
1439 NVPTXActionTy Action(
1440 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
1441 Args,
1442 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
1443 Args);
1444 RCG.setAction(Action);
1445 RCG(CGF);
1446 };
1447
1448 if (IfCond)
1449 emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
1450 else {
1451 CodeGenFunction::RunCleanupsScope Scope(CGF);
1452 RegionCodeGenTy ThenRCG(L0ParallelGen);
1453 ThenRCG(CGF);
1454 }
1455}
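// For reference, the master-side sequence built by L0ParallelGen above is,
// in pseudo code:
//
//   __kmpc_kernel_prepare_parallel(wrapper_id, /*RequiresOMPRuntime=*/1);
//   __kmpc_begin_sharing_variables(&arg_refs, <num captures>);  // if any
//   <store the address of each captured variable into arg_refs>
//   <CTA-wide barrier>  // release the workers
//   <CTA-wide barrier>  // wait for the workers to finish
//   __kmpc_end_sharing_variables();                             // if any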
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001456
1457void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
1458 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
1459 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
1460 // Just call the outlined function to execute the parallel region.
1461 // OutlinedFn(&GTid, &zero, CapturedStruct);
1462 //
1463 // TODO: Do something with IfCond when support for the 'if' clause
1464 // is added on Spmd target directives.
1465 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Carlo Bertolli79712092018-02-28 20:48:35 +00001466
1467 Address ZeroAddr = CGF.CreateMemTemp(
1468 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
1469 ".zero.addr");
1470 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
1471 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
1472 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001473 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00001474 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001475}
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001476
Alexey Bataevb2575932018-01-04 20:18:55 +00001477/// Cast value to the specified type.
1478static llvm::Value *
1479castValueToType(CodeGenFunction &CGF, llvm::Value *Val, llvm::Type *CastTy,
1480 llvm::Optional<bool> IsSigned = llvm::None) {
1481 if (Val->getType() == CastTy)
1482 return Val;
1483 if (Val->getType()->getPrimitiveSizeInBits() > 0 &&
1484 CastTy->getPrimitiveSizeInBits() > 0 &&
1485 Val->getType()->getPrimitiveSizeInBits() ==
1486 CastTy->getPrimitiveSizeInBits())
1487 return CGF.Builder.CreateBitCast(Val, CastTy);
1488 if (IsSigned.hasValue() && CastTy->isIntegerTy() &&
1489 Val->getType()->isIntegerTy())
1490 return CGF.Builder.CreateIntCast(Val, CastTy, *IsSigned);
1491 Address CastItem = CGF.CreateTempAlloca(
1492 CastTy,
1493 CharUnits::fromQuantity(
1494 CGF.CGM.getDataLayout().getPrefTypeAlignment(Val->getType())));
1495 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1496 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
1497 CGF.Builder.CreateStore(Val, ValCastItem);
1498 return CGF.Builder.CreateLoad(CastItem);
1499}
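// For example: i16 -> i64 with IsSigned set takes the CreateIntCast path
// (sign extension); float -> i32 has matching primitive sizes and takes the
// CreateBitCast path; any other combination (e.g. an integer widening with
// IsSigned left unset) falls through to the temporary-alloca store/load
// round trip.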
1500
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001501/// This function creates calls to one of two shuffle functions to copy
1502/// variables between lanes in a warp.
1503static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001504 llvm::Value *Elem,
1505 llvm::Value *Offset) {
1506 auto &CGM = CGF.CGM;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001507 auto &Bld = CGF.Builder;
1508 CGOpenMPRuntimeNVPTX &RT =
1509 *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
1510
Alexey Bataevb2575932018-01-04 20:18:55 +00001511 unsigned Size = CGM.getDataLayout().getTypeStoreSize(Elem->getType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001512 assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");
1513
1514 OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
1515 ? OMPRTL_NVPTX__kmpc_shuffle_int32
1516 : OMPRTL_NVPTX__kmpc_shuffle_int64;
1517
1518 // Cast all types to 32- or 64-bit values before calling shuffle routines.
Alexey Bataevb2575932018-01-04 20:18:55 +00001519 llvm::Type *CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
1520 llvm::Value *ElemCast = castValueToType(CGF, Elem, CastTy, /*isSigned=*/true);
1521 auto *WarpSize =
1522 Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001523
1524 auto *ShuffledVal =
1525 CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
1526 {ElemCast, Offset, WarpSize});
1527
Alexey Bataevb2575932018-01-04 20:18:55 +00001528 return castValueToType(CGF, ShuffledVal, Elem->getType(), /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001529}
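// For a 4-byte element the emitted call is, roughly:
//   %r = call i32 @__kmpc_shuffle_int32(i32 %elem, i16 %lane_offset,
//                                       i16 %warp_size)
// with casts from castValueToType() wrapped around it for element types
// that are not exactly 32 (or, for the int64 variant, 64) bits wide.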
1530
1531namespace {
1532enum CopyAction : unsigned {
1533 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1534 // the warp using shuffle instructions.
1535 RemoteLaneToThread,
1536 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1537 ThreadCopy,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001538 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1539 ThreadToScratchpad,
1540 // ScratchpadToThread: Copy from a scratchpad array in global memory
1541 // containing team-reduced data to a thread's stack.
1542 ScratchpadToThread,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001543};
1544} // namespace
1545
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001546struct CopyOptionsTy {
1547 llvm::Value *RemoteLaneOffset;
1548 llvm::Value *ScratchpadIndex;
1549 llvm::Value *ScratchpadWidth;
1550};
1551
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001552/// Emit instructions to copy a Reduce list, which contains partially
1553/// aggregated values, in the specified direction.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001554static void emitReductionListCopy(
1555 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1556 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1557 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001558
1559 auto &CGM = CGF.CGM;
1560 auto &C = CGM.getContext();
1561 auto &Bld = CGF.Builder;
1562
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001563 auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1564 auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1565 auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1566
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001567  // Iterates, element by element, through the source Reduce list and
 1568  // makes a copy.
1569 unsigned Idx = 0;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001570 unsigned Size = Privates.size();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001571 for (auto &Private : Privates) {
1572 Address SrcElementAddr = Address::invalid();
1573 Address DestElementAddr = Address::invalid();
1574 Address DestElementPtrAddr = Address::invalid();
1575 // Should we shuffle in an element from a remote lane?
1576 bool ShuffleInElement = false;
1577 // Set to true to update the pointer in the dest Reduce list to a
1578 // newly created element.
1579 bool UpdateDestListPtr = false;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001580 // Increment the src or dest pointer to the scratchpad, for each
1581 // new element.
1582 bool IncrScratchpadSrc = false;
1583 bool IncrScratchpadDest = false;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001584
1585 switch (Action) {
1586 case RemoteLaneToThread: {
1587 // Step 1.1: Get the address for the src element in the Reduce list.
1588 Address SrcElementPtrAddr =
1589 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001590 SrcElementAddr = CGF.EmitLoadOfPointer(
1591 SrcElementPtrAddr,
1592 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001593
1594 // Step 1.2: Create a temporary to store the element in the destination
1595 // Reduce list.
1596 DestElementPtrAddr =
1597 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1598 DestElementAddr =
1599 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1600 ShuffleInElement = true;
1601 UpdateDestListPtr = true;
1602 break;
1603 }
1604 case ThreadCopy: {
1605 // Step 1.1: Get the address for the src element in the Reduce list.
1606 Address SrcElementPtrAddr =
1607 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001608 SrcElementAddr = CGF.EmitLoadOfPointer(
1609 SrcElementPtrAddr,
1610 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001611
1612 // Step 1.2: Get the address for dest element. The destination
1613 // element has already been created on the thread's stack.
1614 DestElementPtrAddr =
1615 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001616 DestElementAddr = CGF.EmitLoadOfPointer(
1617 DestElementPtrAddr,
1618 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001619 break;
1620 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001621 case ThreadToScratchpad: {
1622 // Step 1.1: Get the address for the src element in the Reduce list.
1623 Address SrcElementPtrAddr =
1624 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00001625 SrcElementAddr = CGF.EmitLoadOfPointer(
1626 SrcElementPtrAddr,
1627 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001628
1629 // Step 1.2: Get the address for dest element:
1630 // address = base + index * ElementSizeInChars.
1631 unsigned ElementSizeInChars =
1632 C.getTypeSizeInChars(Private->getType()).getQuantity();
1633 auto *CurrentOffset =
1634 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1635 ScratchpadIndex);
1636 auto *ScratchPadElemAbsolutePtrVal =
1637 Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
1638 ScratchPadElemAbsolutePtrVal =
1639 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00001640 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1641 C.getTypeAlignInChars(Private->getType()));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001642 IncrScratchpadDest = true;
1643 break;
1644 }
1645 case ScratchpadToThread: {
1646 // Step 1.1: Get the address for the src element in the scratchpad.
1647 // address = base + index * ElementSizeInChars.
1648 unsigned ElementSizeInChars =
1649 C.getTypeSizeInChars(Private->getType()).getQuantity();
1650 auto *CurrentOffset =
1651 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1652 ScratchpadIndex);
1653 auto *ScratchPadElemAbsolutePtrVal =
1654 Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
1655 ScratchPadElemAbsolutePtrVal =
1656 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1657 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1658 C.getTypeAlignInChars(Private->getType()));
1659 IncrScratchpadSrc = true;
1660
1661 // Step 1.2: Create a temporary to store the element in the destination
1662 // Reduce list.
1663 DestElementPtrAddr =
1664 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1665 DestElementAddr =
1666 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1667 UpdateDestListPtr = true;
1668 break;
1669 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001670 }
1671
 1672    // Regardless of the source and destination of the copy, we emit the
 1673    // load of the src element first, as it is required in all copy directions.
1674 SrcElementAddr = Bld.CreateElementBitCast(
1675 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1676 llvm::Value *Elem =
1677 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001678 Private->getType(), Private->getExprLoc());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001679
1680 // Now that all active lanes have read the element in the
1681 // Reduce list, shuffle over the value from the remote lane.
Alexey Bataevb2575932018-01-04 20:18:55 +00001682 if (ShuffleInElement)
1683 Elem = createRuntimeShuffleFunction(CGF, Elem, RemoteLaneOffset);
1684
1685 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
1686 SrcElementAddr.getElementType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001687
1688 // Store the source element value to the dest element address.
1689 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
1690 Private->getType());
1691
1692 // Step 3.1: Modify reference in dest Reduce list as needed.
1693 // Modifying the reference in Reduce list to point to the newly
1694 // created element. The element is live in the current function
1695 // scope and that of functions it invokes (i.e., reduce_function).
1696 // RemoteReduceData[i] = (void*)&RemoteElem
1697 if (UpdateDestListPtr) {
1698 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1699 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1700 DestElementPtrAddr, /*Volatile=*/false,
1701 C.VoidPtrTy);
1702 }
1703
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001704 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1705 // address of the next element in scratchpad memory, unless we're currently
1706 // processing the last one. Memory alignment is also taken care of here.
1707 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1708 llvm::Value *ScratchpadBasePtr =
1709 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1710 unsigned ElementSizeInChars =
1711 C.getTypeSizeInChars(Private->getType()).getQuantity();
1712 ScratchpadBasePtr = Bld.CreateAdd(
1713 ScratchpadBasePtr,
1714 Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
1715 CGM.SizeTy, ElementSizeInChars)));
1716
 1717      // Take care of global memory alignment for performance: round the
 1717      // base up to the next multiple of GlobalMemoryAlignment.
1718 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1719 llvm::ConstantInt::get(CGM.SizeTy, 1));
1720 ScratchpadBasePtr = Bld.CreateSDiv(
1721 ScratchpadBasePtr,
1722 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1723 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1724 llvm::ConstantInt::get(CGM.SizeTy, 1));
1725 ScratchpadBasePtr = Bld.CreateMul(
1726 ScratchpadBasePtr,
1727 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1728
1729 if (IncrScratchpadDest)
1730 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1731 else /* IncrScratchpadSrc = true */
1732 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1733 }
1734
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001735 Idx++;
1736 }
1737}
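// Scratchpad addressing used above, in pseudo code (one row per team, rows
// laid out back to back):
//   elem_addr = base + scratchpad_index * sizeof(elem)
//   next_base = roundup(base + scratchpad_width * sizeof(elem),
//                       GlobalMemoryAlignment)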
1738
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001739/// This function emits a helper that loads data from the scratchpad array
1740/// and (optionally) reduces it with the input operand.
1741///
1742/// load_and_reduce(local, scratchpad, index, width, should_reduce)
1743/// reduce_data remote;
1744/// for elem in remote:
1745/// remote.elem = Scratchpad[elem_id][index]
1746/// if (should_reduce)
1747/// local = local @ remote
1748/// else
1749/// local = remote
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001750static llvm::Value *emitReduceScratchpadFunction(
1751 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
1752 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001753 auto &C = CGM.getContext();
1754 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1755
1756 // Destination of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001757 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1758 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001759 // Base address of the scratchpad array, with each element storing a
1760 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001761 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1762 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001763 // A source index into the scratchpad array.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001764 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1765 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001766 // Row width of an element in the scratchpad array, typically
1767 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001768 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1769 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001770 // If should_reduce == 1, then it's load AND reduce,
1771 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
1772 // The latter case is used for initialization.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001773 ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1774 Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001775
1776 FunctionArgList Args;
1777 Args.push_back(&ReduceListArg);
1778 Args.push_back(&ScratchPadArg);
1779 Args.push_back(&IndexArg);
1780 Args.push_back(&WidthArg);
1781 Args.push_back(&ShouldReduceArg);
1782
1783 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1784 auto *Fn = llvm::Function::Create(
1785 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1786 "_omp_reduction_load_and_reduce", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001787 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001788 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001789 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001790
1791 auto &Bld = CGF.Builder;
1792
1793 // Get local Reduce list pointer.
1794 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1795 Address ReduceListAddr(
1796 Bld.CreatePointerBitCastOrAddrSpaceCast(
1797 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001798 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001799 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1800 CGF.getPointerAlign());
1801
1802 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1803 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001804 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001805
1806 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001807 llvm::Value *IndexVal = Bld.CreateIntCast(
1808 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
1809 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001810
1811 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001812 llvm::Value *WidthVal = Bld.CreateIntCast(
1813 CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
1814 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001815
1816 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
1817 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001818 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001819
1820 // The absolute ptr address to the base addr of the next element to copy.
1821 llvm::Value *CumulativeElemBasePtr =
1822 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1823 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1824
1825 // Create a Remote Reduce list to store the elements read from the
1826 // scratchpad array.
1827 Address RemoteReduceList =
1828 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
1829
1830 // Assemble remote Reduce list from scratchpad array.
1831 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
1832 SrcDataAddr, RemoteReduceList,
1833 {/*RemoteLaneOffset=*/nullptr,
1834 /*ScratchpadIndex=*/IndexVal,
1835 /*ScratchpadWidth=*/WidthVal});
1836
1837 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1838 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1839 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1840
1841 auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
1842 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1843
1844 CGF.EmitBlock(ThenBB);
1845 // We should reduce with the local Reduce list.
1846 // reduce_function(LocalReduceList, RemoteReduceList)
1847 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1848 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
1849 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1850 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001851 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
1852 CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001853 Bld.CreateBr(MergeBB);
1854
1855 CGF.EmitBlock(ElseBB);
1856 // No reduction; just copy:
1857 // Local Reduce list = Remote Reduce list.
1858 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1859 RemoteReduceList, ReduceListAddr);
1860 Bld.CreateBr(MergeBB);
1861
1862 CGF.EmitBlock(MergeBB);
1863
1864 CGF.FinishFunction();
1865 return Fn;
1866}
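// In effect the generated helper has the C signature:
//   void _omp_reduction_load_and_reduce(void *reduce_list, void *scratchpad,
//                                       int32_t index, int32_t width,
//                                       int32_t should_reduce);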
1867
1868/// This function emits a helper that stores reduced data from the team
1869/// master to a scratchpad array in global memory.
1870///
1871/// for elem in Reduce List:
1872/// scratchpad[elem_id][index] = elem
1873///
Benjamin Kramer674d5792017-05-26 20:08:24 +00001874static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
1875 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001876 QualType ReductionArrayTy,
1877 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001878
1879 auto &C = CGM.getContext();
1880 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1881
1882 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001883 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1884 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001885 // Base address of the scratchpad array, with each element storing a
1886 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001887 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1888 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001889 // A destination index into the scratchpad array, typically the team
1890 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001891 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1892 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001893 // Row width of an element in the scratchpad array, typically
1894 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001895 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
1896 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001897
1898 FunctionArgList Args;
1899 Args.push_back(&ReduceListArg);
1900 Args.push_back(&ScratchPadArg);
1901 Args.push_back(&IndexArg);
1902 Args.push_back(&WidthArg);
1903
1904 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1905 auto *Fn = llvm::Function::Create(
1906 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1907 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001908 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001909 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001910 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001911
1912 auto &Bld = CGF.Builder;
1913
1914 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1915 Address SrcDataAddr(
1916 Bld.CreatePointerBitCastOrAddrSpaceCast(
1917 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001918 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001919 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1920 CGF.getPointerAlign());
1921
1922 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1923 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001924 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001925
1926 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00001927 llvm::Value *IndexVal = Bld.CreateIntCast(
1928 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
1929 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001930
1931 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
1932 llvm::Value *WidthVal =
1933 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
 1934                                            Int32Ty, Loc),
1935 CGF.SizeTy, /*isSigned=*/true);
1936
 1937    // The absolute address (as an integer) of the base of the next element
 1937    // to copy.
1938 llvm::Value *CumulativeElemBasePtr =
1939 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1940 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1941
1942 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
1943 SrcDataAddr, DestDataAddr,
1944 {/*RemoteLaneOffset=*/nullptr,
1945 /*ScratchpadIndex=*/IndexVal,
1946 /*ScratchpadWidth=*/WidthVal});
1947
1948 CGF.FinishFunction();
1949 return Fn;
1950}
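// The corresponding C signature of this generated helper is:
//   void _omp_reduction_copy_to_scratchpad(void *reduce_list,
//                                          void *scratchpad, int32_t index,
//                                          int32_t width);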
1951
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001952/// This function emits a helper that gathers Reduce lists from the first
1953/// lane of every active warp to lanes in the first warp.
1954///
1955/// void inter_warp_copy_func(void* reduce_data, num_warps)
1956/// shared smem[warp_size];
1957/// For all data entries D in reduce_data:
1958/// If (I am the first lane in each warp)
1959/// Copy my local D to smem[warp_id]
1960/// sync
1961/// if (I am the first warp)
1962/// Copy smem[thread_id] to my local D
1963/// sync
1964static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1965 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001966 QualType ReductionArrayTy,
1967 SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001968 auto &C = CGM.getContext();
1969 auto &M = CGM.getModule();
1970
1971 // ReduceList: thread local Reduce list.
1972 // At the stage of the computation when this function is called, partially
1973 // aggregated values reside in the first lane of every active warp.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001974 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
1975 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001976 // NumWarps: number of warps active in the parallel region. This could
1977 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001978 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
Alexey Bataev56223232017-06-09 13:40:18 +00001979 C.getIntTypeForBitwidth(32, /* Signed */ true),
1980 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001981 FunctionArgList Args;
1982 Args.push_back(&ReduceListArg);
1983 Args.push_back(&NumWarpsArg);
1984
1985 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1986 auto *Fn = llvm::Function::Create(
1987 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1988 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00001989 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001990 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001991 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001992
1993 auto &Bld = CGF.Builder;
1994
1995 // This array is used as a medium to transfer, one reduce element at a time,
1996 // the data from the first lane of every warp to lanes in the first warp
1997 // in order to perform the final step of a reduction in a parallel region
1998 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1999 // for reduced latency, as well as to have a distinct copy for concurrently
2000 // executing target regions. The array is declared with common linkage so
2001 // as to be shared across compilation units.
2002 const char *TransferMediumName =
2003 "__openmp_nvptx_data_transfer_temporary_storage";
2004 llvm::GlobalVariable *TransferMedium =
2005 M.getGlobalVariable(TransferMediumName);
2006 if (!TransferMedium) {
2007 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
2008 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
2009 TransferMedium = new llvm::GlobalVariable(
2010 M, Ty,
2011 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
2012 llvm::Constant::getNullValue(Ty), TransferMediumName,
2013 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
2014 SharedAddressSpace);
2015 }
2016
2017 // Get the CUDA thread id of the current OpenMP thread on the GPU.
2018 auto *ThreadID = getNVPTXThreadID(CGF);
2019 // nvptx_lane_id = nvptx_id % warpsize
2020 auto *LaneID = getNVPTXLaneID(CGF);
2021 // nvptx_warp_id = nvptx_id / warpsize
2022 auto *WarpID = getNVPTXWarpID(CGF);
2023
2024 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2025 Address LocalReduceList(
2026 Bld.CreatePointerBitCastOrAddrSpaceCast(
2027 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2028 C.VoidPtrTy, SourceLocation()),
2029 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2030 CGF.getPointerAlign());
2031
2032 unsigned Idx = 0;
2033 for (auto &Private : Privates) {
2034 //
2035 // Warp master copies reduce element to transfer medium in __shared__
2036 // memory.
2037 //
2038 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
2039 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
2040 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
2041
2042 // if (lane_id == 0)
2043 auto IsWarpMaster =
2044 Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
2045 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
2046 CGF.EmitBlock(ThenBB);
2047
2048 // Reduce element = LocalReduceList[i]
2049 Address ElemPtrPtrAddr =
2050 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2051 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
2052 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2053 // elemptr = (type[i]*)(elemptrptr)
2054 Address ElemPtr =
2055 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
2056 ElemPtr = Bld.CreateElementBitCast(
2057 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2058 // elem = *elemptr
2059 llvm::Value *Elem = CGF.EmitLoadOfScalar(
2060 ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2061
2062 // Get pointer to location in transfer medium.
2063 // MediumPtr = &medium[warp_id]
2064 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
2065 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
2066 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
2067 // Casting to actual data type.
2068 // MediumPtr = (type[i]*)MediumPtrAddr;
2069 MediumPtr = Bld.CreateElementBitCast(
2070 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2071
2072 //*MediumPtr = elem
2073 Bld.CreateStore(Elem, MediumPtr);
2074
2075 Bld.CreateBr(MergeBB);
2076
2077 CGF.EmitBlock(ElseBB);
2078 Bld.CreateBr(MergeBB);
2079
2080 CGF.EmitBlock(MergeBB);
2081
2082 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
2083 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
2084 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
2085
2086 auto *NumActiveThreads = Bld.CreateNSWMul(
2087 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
2088 // named_barrier_sync(ParallelBarrierID, num_active_threads)
2089 syncParallelThreads(CGF, NumActiveThreads);
2090
2091 //
2092 // Warp 0 copies reduce element from transfer medium.
2093 //
2094 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
2095 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
2096 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
2097
2098 // Up to 32 threads in warp 0 are active.
2099 auto IsActiveThread =
2100 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
2101 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
2102
2103 CGF.EmitBlock(W0ThenBB);
2104
2105 // SrcMediumPtr = &medium[tid]
2106 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
2107 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
2108 Address SrcMediumPtr(SrcMediumPtrVal,
2109 C.getTypeAlignInChars(Private->getType()));
2110 // SrcMediumVal = *SrcMediumPtr;
2111 SrcMediumPtr = Bld.CreateElementBitCast(
2112 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
2113 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
2114 SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
2115
2116 // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
2117 Address TargetElemPtrPtr =
2118 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
2119 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
2120 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
2121 Address TargetElemPtr =
2122 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
2123 TargetElemPtr = Bld.CreateElementBitCast(
2124 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
2125
2126 // *TargetElemPtr = SrcMediumVal;
2127 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
2128 Private->getType());
2129 Bld.CreateBr(W0MergeBB);
2130
2131 CGF.EmitBlock(W0ElseBB);
2132 Bld.CreateBr(W0MergeBB);
2133
2134 CGF.EmitBlock(W0MergeBB);
2135
2136 // While warp 0 copies values from transfer medium, all other warps must
2137 // wait.
2138 syncParallelThreads(CGF, NumActiveThreads);
2139 Idx++;
2140 }
2141
2142 CGF.FinishFunction();
2143 return Fn;
2144}
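// In effect the generated helper has the C signature:
//   void _omp_reduction_inter_warp_copy_func(void *reduce_list,
//                                            int32_t num_warps);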
2145
2146/// Emit a helper that reduces data across two OpenMP threads (lanes)
2147/// in the same warp. It uses shuffle instructions to copy over data from
2148/// a remote lane's stack. The reduction algorithm performed is specified
2149/// by the fourth parameter.
2150///
2151/// Algorithm Versions.
2152/// Full Warp Reduce (argument value 0):
2153/// This algorithm assumes that all 32 lanes are active and gathers
2154/// data from these 32 lanes, producing a single resultant value.
2155/// Contiguous Partial Warp Reduce (argument value 1):
2156/// This algorithm assumes that only a *contiguous* subset of lanes
2157/// are active. This happens for the last warp in a parallel region
2158/// when the user specified num_threads is not an integer multiple of
2159/// 32. This contiguous subset always starts with the zeroth lane.
2160/// Partial Warp Reduce (argument value 2):
2161/// This algorithm gathers data from any number of lanes at any position.
2162/// All reduced values are stored in the lowest possible lane. The set
 2163 /// of problems every algorithm addresses is a superset of those
2164/// addressable by algorithms with a lower version number. Overhead
2165/// increases as algorithm version increases.
2166///
2167/// Terminology
2168/// Reduce element:
2169/// Reduce element refers to the individual data field with primitive
2170/// data types to be combined and reduced across threads.
2171/// Reduce list:
2172/// Reduce list refers to a collection of local, thread-private
2173/// reduce elements.
2174/// Remote Reduce list:
2175/// Remote Reduce list refers to a collection of remote (relative to
2176/// the current thread) reduce elements.
2177///
2178/// We distinguish between three states of threads that are important to
2179/// the implementation of this function.
2180/// Alive threads:
2181/// Threads in a warp executing the SIMT instruction, as distinguished from
2182/// threads that are inactive due to divergent control flow.
2183/// Active threads:
2184/// The minimal set of threads that has to be alive upon entry to this
2185/// function. The computation is correct iff active threads are alive.
2186/// Some threads are alive but they are not active because they do not
2187/// contribute to the computation in any useful manner. Turning them off
2188/// may introduce control flow overheads without any tangible benefits.
2189/// Effective threads:
2190/// In order to comply with the argument requirements of the shuffle
2191/// function, we must keep all lanes holding data alive. But at most
2192/// half of them perform value aggregation; we refer to this half of
2193/// threads as effective. The other half is simply handing off their
2194/// data.
2195///
2196/// Procedure
2197/// Value shuffle:
2198/// In this step active threads transfer data from higher lane positions
2199/// in the warp to lower lane positions, creating Remote Reduce list.
2200/// Value aggregation:
2201/// In this step, effective threads combine their thread local Reduce list
2202/// with Remote Reduce list and store the result in the thread local
2203/// Reduce list.
2204/// Value copy:
2205/// In this step, we deal with the assumption made by algorithm 2
2206/// (i.e. contiguity assumption). When we have an odd number of lanes
2207/// active, say 2k+1, only k threads will be effective and therefore k
2208/// new values will be produced. However, the Reduce list owned by the
2209/// (2k+1)th thread is ignored in the value aggregation. Therefore
 2210 /// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
2211/// that the contiguity assumption still holds.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002212static llvm::Value *emitShuffleAndReduceFunction(
2213 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
2214 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002215 auto &C = CGM.getContext();
2216
2217 // Thread local Reduce list used to host the values of data to be reduced.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002218 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2219 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002220 // Current lane id; could be logical.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002221 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
2222 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002223 // Offset of the remote source lane relative to the current lane.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002224 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2225 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002226 // Algorithm version. This is expected to be known at compile time.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002227 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
2228 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002229 FunctionArgList Args;
2230 Args.push_back(&ReduceListArg);
2231 Args.push_back(&LaneIDArg);
2232 Args.push_back(&RemoteLaneOffsetArg);
2233 Args.push_back(&AlgoVerArg);
2234
2235 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
2236 auto *Fn = llvm::Function::Create(
2237 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
2238 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00002239 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002240 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00002241 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002242
2243 auto &Bld = CGF.Builder;
2244
2245 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
2246 Address LocalReduceList(
2247 Bld.CreatePointerBitCastOrAddrSpaceCast(
2248 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
2249 C.VoidPtrTy, SourceLocation()),
2250 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
2251 CGF.getPointerAlign());
2252
2253 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
2254 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
2255 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2256
2257 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
2258 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
2259 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2260
2261 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
2262 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
2263 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
2264
2265 // Create a local thread-private variable to host the Reduce list
2266 // from a remote lane.
2267 Address RemoteReduceList =
2268 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
2269
2270 // This loop iterates through the list of reduce elements and copies,
2271 // element by element, from a remote lane in the warp to RemoteReduceList,
2272 // hosted on the thread's stack.
2273 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
2274 LocalReduceList, RemoteReduceList,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002275 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
2276 /*ScratchpadIndex=*/nullptr,
2277 /*ScratchpadWidth=*/nullptr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002278
2279 // The actions to be performed on the Remote Reduce list is dependent
2280 // on the algorithm version.
2281 //
2282 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
2283 // LaneId % 2 == 0 && Offset > 0):
2284 // do the reduction value aggregation
2285 //
2286 // The thread local variable Reduce list is mutated in place to host the
2287 // reduced data, which is the aggregated value produced from local and
2288 // remote lanes.
2289 //
2290 // Note that AlgoVer is expected to be a constant integer known at compile
2291 // time.
 2292    // When AlgoVer==0, the first conjunction evaluates to true, making
 2293    // the entire predicate true at compile time.
 2294    // When AlgoVer==1, only the second part of the second conjunction needs
 2295    // to be evaluated at runtime; the other conjunctions fold to false at
 2296    // compile time.
 2297    // When AlgoVer==2, only the second part of the third conjunction needs
 2298    // to be evaluated at runtime; the other conjunctions fold to false at
 2299    // compile time.
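  // For instance, with AlgoVer==1 the whole predicate constant-folds to
  // (LaneId < Offset): only the lower half of the contiguous set of active
  // lanes aggregates, each combining its own Reduce list with the one
  // shuffled in from the lane at LaneId + Offset.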
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));

  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Algo2,
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
                       Bld.getInt16(0)));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams.  In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators.  This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///      __kmpc_nvptx_parallel_reduce_nowait(...,
///         reduceData, shuffleReduceFn, interWarpCpyFn)
///
///      where:
///        struct ReduceData {
///          float *foo;
///          double *bar;
///        } reduceData
///        reduceData.foo = &foo_private
///        reduceData.bar = &bar_private
///
///      'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///      auxiliary functions generated by the compiler that operate on
///      variables of type 'ReduceData'.  They help the runtime perform
///      algorithmic steps in a data-agnostic manner.
///
///      'shuffleReduceFn' is a pointer to a function that reduces data
///      of type 'ReduceData' across two OpenMP threads (lanes) in the
///      same warp.  It takes the following arguments as input:
///
///      a. variable of type 'ReduceData' on the calling lane,
///      b. its lane_id,
///      c. an offset relative to the current lane_id to generate a
///         remote_lane_id.  The remote lane contains the second
///         variable of type 'ReduceData' that is to be reduced.
///      d. an algorithm version parameter determining which reduction
///         algorithm to use.
///
///      'shuffleReduceFn' retrieves data from the remote lane using
///      efficient GPU shuffle intrinsics and reduces, using the
///      algorithm specified by the 4th parameter, the two operands
///      element-wise.  The result is written to the first operand.
///
///      Different reduction algorithms are implemented in different
///      runtime functions, all calling 'shuffleReduceFn' to perform
///      the essential reduction step.  Therefore, based on the 4th
///      parameter, this function behaves slightly differently to
///      cooperate with the runtime to ensure correctness under
///      different circumstances; a pseudocode sketch follows this list.
///
///      'interWarpCpyFn' is a pointer to a function that transfers
///      reduced variables across warps.  It tunnels, through CUDA
///      shared memory, the thread-private data of type 'ReduceData'
///      from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///      ret = __kmpc_nvptx_teams_reduce_nowait(...,
///              reduceData, shuffleReduceFn, interWarpCpyFn,
///              scratchpadCopyFn, loadAndReduceFn)
///
///      'scratchpadCopyFn' is a helper that stores reduced
///      data from the team master to a scratchpad array in
///      global memory.
///
///      'loadAndReduceFn' is a helper that loads data from
///      the scratchpad array and reduces it with the input
///      operand.
///
///      These compiler-generated functions hide address
///      calculation and alignment information from the runtime.
/// 5. if ret == 1:
///      The team master of the last team stores the reduced
///      result to the globals in memory.
///      foo += reduceData.foo; bar *= reduceData.bar
///
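/// As a hedged illustration only (not the exact code the compiler
/// emits), a 'shuffleReduceFn' generated for the example above could
/// read:
///
///   void shuffleReduceFn(ReduceData *local, int16_t lane_id,
///                        int16_t offset, int16_t algover) {
///     float remote_foo = shuffle_down(*local->foo, offset, WARPSIZE);
///     double remote_bar = shuffle_down(*local->bar, offset, WARPSIZE);
///     if (algover == 0 || (algover == 1 && lane_id < offset) ||
///         (algover == 2 && lane_id % 2 == 0 && offset > 0)) {
///       *local->foo += remote_foo;  // element-wise reduction
///       *local->bar *= remote_bar;
///     }
///     if (algover == 1 && lane_id >= offset) {  // copy over, not reduce
///       *local->foo = remote_foo;
///       *local->bar = remote_bar;
///     }
///   }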
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log2(WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is invoked here with lane_id set to 0 because the
/// lane_id is not used by this algorithm; this saves the instructions
/// needed to retrieve it from the corresponding special registers.  The
/// 4th parameter, which represents the version of the algorithm being
/// used, is set to 0 to signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active.  It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE.  The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first.  When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes is active.  It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position.  The
/// reduction result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team.  It first reduces within a warp using the
/// aforementioned algorithms.  We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp masters" (the zeroth lane of each warp,
/// where warp-reduced data is held) to the zeroth warp.  This step
/// reduces (in a mathematical sense) the problem of reduction across
/// warp masters in a block to the problem of warp reduction.
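///
/// A hedged sketch of that flow (illustrative pseudocode, not the
/// actual runtime source):
///
///   warp_reduce(reduce_data, ShuffleReduceFn);   // one of the three warp
///                                                // algorithms above
///   InterWarpCpyFn(reduce_data, num_warps);      // warp masters to warp 0
///   if (warp_id == 0)
///     warp_reduce(reduce_data, ShuffleReduceFn); // reduce the warp masters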
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array.  Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array.  One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
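///
/// A hedged sketch of that sequence (illustrative pseudocode; the
/// counter name is an assumption, not the runtime's actual symbol):
///
///   if (master) scratchpadCopyFn(reduce_data, scratchpad, team_id);
///   if (atomic_inc(&teams_done) == num_teams - 1) {  // last team
///     for (i = thread_id; i < num_teams; i += num_threads)
///       loadAndReduceFn(reduce_data, scratchpad, i);
///     __kmpc_nvptx_parallel_reduce_nowait(...);      // within last team
///   }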
///
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  // FIXME: Add support for simd reduction.
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  auto &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  auto *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_nvptx_{parallel|teams}_reduce_nowait(<gtid>, <n>,
  //    sizeof(RedList), RedList, shuffle_reduce_func, interwarp_copy_func,
  //    ...);
  auto *ThreadId = getThreadID(CGF, Loc);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  auto *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Res = nullptr;
  if (ParallelReduction) {
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  }

  if (TeamsReduction) {
    auto *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res)
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_nvptx_end_reduce_nowait(<gtid>);
  //    break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_nvptx_end_reduce_nowait(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}

const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
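  // The translated parameter is a restrict-qualified pointer to the
  // reference's pointee; for map-captured variables the pointee is placed
  // in the global address space.  A hedged sketch of the effect: 'int &p'
  // captured by map becomes roughly 'int __global * __restrict p'.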
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam)) {
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  }
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getLocStart(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
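
// Cast the arguments of an outlined-function call to the types expected by
// the callee's parameters; on NVPTX this chiefly means address-space casts
// for pointer arguments (variadic tail arguments are forwarded unchanged).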
void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        NativeArg->getType()->getPointerElementType()->getPointerTo());
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
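///
/// A hedged sketch of the generated wrapper (pseudocode, not the exact
/// IR; parameter names are illustrative):
///
///   void outlined_parallel_fn_wrapper(int16_t parallel_level,
///                                     int32_t source_thread) {
///     void **shared_args;
///     __kmpc_get_shared_variables(&shared_args);
///     outlined_parallel_fn(/*global_tid=*/null, /*bound_tid=*/null,
///                          shared_args[0], ..., shared_args[n-1]);
///   }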
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getLocStart(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getLocStart(), D.getLocStart());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // TODO: support SIMD and pass actual values
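  // The first two arguments correspond to the outlined function's
  // global_tid and bound_tid parameters; null placeholders are passed
  // here (see the TODO above about passing actual values).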
  Args.emplace_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  Args.emplace_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime.  Pass the variables to the outlined function.
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    Address SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs,
        CGFContext
            .getPointerType(CGFContext.getPointerType(CGFContext.VoidPtrTy))
            .castAs<PointerType>());
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I,
                                               CGF.getPointerSize());
      Address TypedAddress = Bld.CreateBitCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getLocStart(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}

void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  SmallVector<const ValueDecl *, 4> IgnoredDecls;
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    if (CGF.CapturedStmtInfo->getKind() == CR_OpenMP) {
      NeedToDelayGlobalization = true;
      if (const auto *CS = dyn_cast<CapturedStmt>(Body)) {
        IgnoredDecls.reserve(CS->capture_size());
        for (const auto &Capture : CS->captures())
          if (Capture.capturesVariable())
            IgnoredDecls.emplace_back(Capture.getCapturedVar());
      }
    }
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF, IgnoredDecls);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord = VarChecker.getGlobalizedRecord();
  if (!GlobalizedVarsRecord)
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, std::make_pair(FD, Address::invalid())));
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getLocStart());
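    // Register a cleanup that emits the matching epilog for the globalized
    // variables on both normal and EH paths out of the function.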
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}

Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI == I->getSecond().LocalVarData.end())
    return Address::invalid();
  return VDI->second.second;
}

void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}