//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size,
  /// int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}
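
// Sorting in decreasing alignment order means that when the record for
// globalized variables is laid out below, the most strictly aligned fields
// come first and interior padding stays minimal.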

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, WarpSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
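
// Illustrative result (hypothetical input): for an escaped 'int i' in a
// parallel region (EscapedDecls) and an escaped 'double d' at the teams level
// (EscapedDeclsForTeams), the built record is roughly
//   struct _globalized_locals_ty {
//     int i[32] __attribute__((aligned(128))); // one slot per warp lane
//     double d;                                // single teams-level copy
//   };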

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (!isOpenMPPrivate(
                  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
              Attr->getCaptureKind() == OMPC_map)
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables; it is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped local variables that are actually parameters
  /// passed by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}
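
// Worked example (illustrative): for thread id 37 with WarpSize == 32,
// getNVPTXWarpID returns 37 >> 5 == 1 and getNVPTXLaneID returns
// 37 & 31 == 5, i.e. lane 5 of warp 1.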

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}
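
// Worked example (illustrative): with 128 threads per CTA in generic mode,
// getThreadLimit returns 128 - 32 = 96; threads 96-127 form the reserved
// last warp, and getMasterThreadID (below) selects thread 96 as the master.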

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
671
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000672CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
Alexey Bataev7cae94e2018-01-04 19:45:16 +0000673 CodeGenModule &CGM, SourceLocation Loc)
Alexey Bataev9ff80832018-04-16 20:16:21 +0000674 : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
675 Loc(Loc) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000676 createWorkerFunction(CGM);
Vasileios Kalintirise5c09592016-03-22 10:41:20 +0000677}
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000678
679void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
680 CodeGenModule &CGM) {
681 // Create an worker function with no arguments.
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000682
683 WorkerFn = llvm::Function::Create(
Alexey Bataev9ff80832018-04-16 20:16:21 +0000684 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
Alexey Bataevaee93892018-01-08 20:09:47 +0000685 /*placeholder=*/"_worker", &CGM.getModule());
Alexey Bataev9ff80832018-04-16 20:16:21 +0000686 CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +0000687 WorkerFn->setDoesNotRecurse();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000688}
689
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000690CGOpenMPRuntimeNVPTX::ExecutionMode
691CGOpenMPRuntimeNVPTX::getExecutionMode() const {
692 return CurrentExecutionMode;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000693}
694
Alexey Bataevd7ff6d62018-05-07 14:50:05 +0000695static CGOpenMPRuntimeNVPTX::DataSharingMode
696getDataSharingMode(CodeGenModule &CGM) {
697 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
698 : CGOpenMPRuntimeNVPTX::Generic;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000699}
700
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000701/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
702/// iff there is only one.
703static const Stmt *getSingleCompoundChild(const Stmt *Body) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +0000704 if (const auto *C = dyn_cast<CompoundStmt>(Body))
705 if (C->size() == 1)
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000706 return C->body_front();
707 return Body;
708}
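
// E.g. (illustrative): for a body of the form '{ S; }' this returns S itself,
// while for '{ S1; S2; }' or a non-compound statement it returns the argument
// unchanged; the helpers below use it to look through the braces around a
// nested OpenMP directive.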

/// Check if the parallel directive has an 'if' clause with non-constant or
/// false condition. Also, check if the number of threads is strictly specified
/// and run those directives in non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
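
// For example (illustrative): '#pragma omp target parallel num_threads(4)'
// or '#pragma omp target parallel if(parallel: n > 10)' with a non-constant
// 'n' makes this return true, which in turn disables SPMD code generation
// for the enclosing target construct.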

/// Check for inner (nested) SPMD construct, if any.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
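
// For example (illustrative): '#pragma omp target teams distribute parallel
// for' with no if/num_threads clauses supports SPMD mode, while a bare
// '#pragma omp target' whose body holds no nested parallel directive falls
// back to the generic master/worker scheme.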

/// Check if the directive is loop-based and either has no schedule clause
/// at all or has static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
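
// For example (illustrative): 'parallel for' and
// 'parallel for schedule(static, 4)' qualify as statically scheduled here,
// while a 'schedule(dynamic)' or an 'ordered' clause disqualifies the loop.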

/// Check for inner (nested) lightweight runtime construct, if any.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
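
// For example (illustrative): '#pragma omp target teams distribute parallel
// for schedule(static)' can be emitted for the lightweight runtime; the full
// runtime is only requested from __kmpc_spmd_kernel_init when this check
// fails for an SPMD kernel.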

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function.
  emitWorkerFunction(WST);
}

// Set up NVPTX threads for the master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
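
// The entry header above lowers to a thread dispatch of roughly this shape
// (an illustrative sketch; the block names match the createBasicBlock calls):
//
//   .entry:        br (tid < thread_limit) ? .worker : .mastercheck
//   .worker:       call <kernel>_worker(); br .exit
//   .mastercheck:  br (tid == master_tid) ? .master : .exit
//   .master:       __kmpc_kernel_init(...); ...sequential master region...
//   .exit:         all remaining threads terminate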

void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars);

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Set up BBs in the entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  bool RequiresFullRuntime = CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
                             !supportsLightweightRuntime(CGF.getContext(), D);
  // Check if we have inner distribute + lastprivate|reduction clauses.
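  // For instance (an illustrative source pattern), a combined
  //   #pragma omp target teams distribute parallel for lastprivate(x)
  // still needs the data sharing machinery for the lastprivate copy-out,
  // even though the region itself runs in SPMD mode.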
  bool RequiresDatasharing = RequiresFullRuntime;
  if (!RequiresDatasharing) {
    const OMPExecutableDirective *TD = &D;
    if (!isOpenMPTeamsDirective(TD->getDirectiveKind()) &&
        !isOpenMPParallelDirective(TD->getDirectiveKind())) {
      const Stmt *S = getSingleCompoundChild(
          TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
              /*IgnoreCaptured=*/true));
      TD = cast<OMPExecutableDirective>(S);
    }
    if (!isOpenMPDistributeDirective(TD->getDirectiveKind()) &&
        !isOpenMPParallelDirective(TD->getDirectiveKind())) {
      const Stmt *S = getSingleCompoundChild(
          TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
              /*IgnoreCaptured=*/true));
      TD = cast<OMPExecutableDirective>(S);
    }
    if (isOpenMPDistributeDirective(TD->getDirectiveKind()))
      RequiresDatasharing = TD->hasClausesOfKind<OMPLastprivateClause>() ||
                            TD->hasClausesOfKind<OMPReductionClause>();
  }
  llvm::Value *Args[] = {
      getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
      /*RequiresOMPRuntime=*/
      Bld.getInt16(RequiresFullRuntime ? 1 : 0),
      /*RequiresDataSharing=*/Bld.getInt16(RequiresDatasharing ? 1 : 0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // Deinitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}
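
// For example, a kernel compiled in SPMD mode gets a marker of roughly this
// shape (illustrative IR; the actual name derives from the target region):
//   @__omp_offloading_<...>_exec_mode = weak constant i8 0
// where 0 stands for 'spmd' and 1 for 'generic'.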

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the number of
  // requested parallel threads. The activated workers load the variable
  // arguments and execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    //   - the parallelism level;
    //   - the thread ID.
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false)
          ->getPointerTo();
  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  //   - the parallelism level;
  //   - the thread ID.
  emitCall(CGF, WST.Loc, WorkFnCast,
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// Returns the specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    // Build void __kmpc_kernel_prepare_parallel(
    //     void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    // Build bool __kmpc_kernel_parallel(void **outlined_function,
    // int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    auto *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    // Build void __kmpc_kernel_end_parallel();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
    // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    // Build void __kmpc_data_sharing_init_stack();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
    // Build void __kmpc_data_sharing_init_stack_spmd();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
    // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    // Build void __kmpc_begin_sharing_variables(void ***args,
    // size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    // Build void __kmpc_end_sharing_variables();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    // Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_level: {
    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
    break;
  }
  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
    // Build int8_t __kmpc_is_spmd_exec_mode();
    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
    // Build void __kmpc_get_team_static_memory(const void *buf, size_t size,
    // int16_t is_shared, const void **res);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty,
                                CGM.VoidPtrPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
    break;
  }
  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
    // Build void __kmpc_restore_team_static_memory(int16_t is_shared);
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
    break;
  }
  }
  return RTLFn;
}
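
// Typical use, as in the emitters in this file (an illustrative sketch):
//   llvm::Value *Args[] = {/* ... */};
//   CGF.EmitRuntimeCall(
//       createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
// CGM.CreateRuntimeFunction hands back the already-created declaration on
// repeated requests, so calling this helper multiple times is cheap.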

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t,
                                              llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}
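
// The appended operand marks the outlined entry as a kernel for the NVPTX
// backend; the module ends up with metadata of roughly this shape
// (illustrative IR; the entry name derives from the target region):
//   !nvvm.annotations = !{!0}
//   !0 = !{void (/* ... */)* @__omp_offloading_<...>, !"kernel", i32 1}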

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
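
// Note: emitNumTeamsClause is intentionally a no-op here; the num_teams and
// thread_limit values are expected to be consumed on the host side when the
// kernel launch is configured, so there is nothing to emit on the device.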

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}
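
// The wrapper created above gives the outlined parallel body the common
// worker-loop signature (an int16 parallel level and an int32 thread id) and
// recovers the shared arguments via __kmpc_get_shared_variables. In SPMD mode,
// or when already inside a parallel region, the outlined function is invoked
// directly, so no wrapper needs to be recorded.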

/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = getSingleCompoundChild(
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs()) {
      const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
      Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
    }
  }
}
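
// Both of these shapes are matched above (illustrative source): a combined
//   #pragma omp teams distribute lastprivate(x)
// or a teams directive whose single child is the distribute:
//   #pragma omp teams
//   #pragma omp distribute lastprivate(x)
//   for (int i = 0; i < N; ++i) { /* ... */ }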

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    getDistributeLastprivateVars(D, LastPrivates);
    if (!LastPrivates.empty())
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            llvm::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                 SourceLocation Loc,
                                                 bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use the actual memory size of the record, including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

    llvm::PointerType *GlobalRecPtrTy =
        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
    llvm::Value *GlobalRecCastAddr;
    llvm::Value *IsTTD = nullptr;
    if (!IsInTTDRegion &&
        (WithSPMDCheck ||
         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
        llvm::Value *ThreadID = getThreadID(CGF, Loc);
        llvm::Value *PL = CGF.EmitRuntimeCall(
            createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
            {RTLoc, ThreadID});
        IsTTD = Bld.CreateIsNull(PL);
      }
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(SPMDBB);
      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
                               CharUnits::fromQuantity(Alignment));
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(NonSPMDBB);
      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
      if (const RecordDecl *SecGlobalizedVarsRecord =
              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
        SecGlobalRecTy =
            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

        // Recover pointer to this function's global record. The runtime will
        // handle the specifics of the allocation of the memory.
        // Use the actual memory size of the record, including the padding
        // for alignment purposes.
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
        unsigned GlobalRecordSize =
            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
        Size = Bld.CreateSelect(
            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
      }
      // TODO: allow the usage of shared memory to be controlled by
      // the user; for now, default to global.
      llvm::Value *GlobalRecordSizeArg[] = {
          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(
              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      CGF.EmitBlock(ExitBB);
      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
                                /*NumReservedValues=*/2, "_select_stack");
      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
      GlobalRecCastAddr = Phi;
      I->getSecond().GlobalRecordAddr = Phi;
      I->getSecond().IsInSPMDModeFlag = IsSPMD;
    } else if (IsInTTDRegion) {
      assert(GlobalizedRecords.back().Records.size() < 2 &&
             "Expected less than 2 globalized records: one for target and one "
             "for teams.");
      unsigned Offset = 0;
      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Offset =
            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
      }
      unsigned Alignment =
          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
      Offset = llvm::alignTo(Offset, Alignment);
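      // Worked example (illustrative): with one previous 24-byte record
      // aligned to 8 and a current record aligned to 16, Offset evolves as
      // alignTo(alignTo(0, 8) + 24, 8) = 24, then alignTo(24, 16) = 32, so
      // this record starts at byte 32 of the per-team static frame.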
2127 GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2128 ++GlobalizedRecords.back().RegionCounter;
2129 if (GlobalizedRecords.back().Records.size() == 1) {
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002130 assert(KernelStaticGlobalized &&
2131 "Kernel static pointer must be initialized already.");
2132 auto *UseSharedMemory = new llvm::GlobalVariable(
2133 CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
2134 llvm::GlobalValue::InternalLinkage, nullptr,
2135 "_openmp_static_kernel$is_shared");
2136 UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2137 QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2138 /*DestWidth=*/16, /*Signed=*/0);
2139 llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2140 Address(UseSharedMemory,
2141 CGM.getContext().getTypeAlignInChars(Int16Ty)),
2142 /*Volatile=*/false, Int16Ty, Loc);
2143 auto *StaticGlobalized = new llvm::GlobalVariable(
2144 CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
2145 llvm::GlobalValue::WeakAnyLinkage, nullptr);
Alexey Bataeve4090182018-11-02 14:54:07 +00002146 auto *RecSize = new llvm::GlobalVariable(
2147 CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2148 llvm::GlobalValue::InternalLinkage, nullptr,
2149 "_openmp_static_kernel$size");
2150 RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2151 llvm::Value *Ld = CGF.EmitLoadOfScalar(
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002152 Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
Alexey Bataeve4090182018-11-02 14:54:07 +00002153 CGM.getContext().getSizeType(), Loc);
2154 llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2155 KernelStaticGlobalized, CGM.VoidPtrPtrTy);
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002156 llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
2157 IsInSharedMemory, ResAddr};
Alexey Bataeve4090182018-11-02 14:54:07 +00002158 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2159 OMPRTL_NVPTX__kmpc_get_team_static_memory),
2160 GlobalRecordSizeArg);
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002161 GlobalizedRecords.back().Buffer = StaticGlobalized;
Alexey Bataeve4090182018-11-02 14:54:07 +00002162 GlobalizedRecords.back().RecSize = RecSize;
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002163 GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
2164 GlobalizedRecords.back().Loc = Loc;
Alexey Bataeve4090182018-11-02 14:54:07 +00002165 }
2166 assert(KernelStaticGlobalized && "Global address must be set already.");
2167 Address FrameAddr = CGF.EmitLoadOfPointer(
2168 Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2169 CGM.getContext()
2170 .getPointerType(CGM.getContext().VoidPtrTy)
2171 .castAs<PointerType>());
2172 llvm::Value *GlobalRecValue =
2173 Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
2174 .getPointer();
2175 I->getSecond().GlobalRecordAddr = GlobalRecValue;
2176 I->getSecond().IsInSPMDModeFlag = nullptr;
2177 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2178 GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002179 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002180 // TODO: allow the usage of shared memory to be controlled by
2181 // the user, for now, default to global.
2182 llvm::Value *GlobalRecordSizeArg[] = {
2183 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2184 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
Alexey Bataev1fc1f8e2018-11-02 16:08:31 +00002185 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2186 createNVPTXRuntimeFunction(
2187 OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
2188 GlobalRecordSizeArg);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002189 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002190 GlobalRecValue, GlobalRecPtrTy);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002191 I->getSecond().GlobalRecordAddr = GlobalRecValue;
2192 I->getSecond().IsInSPMDModeFlag = nullptr;
2193 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002194 LValue Base =
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002195 CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002196
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002197 // Emit the "global alloca" which is a GEP from the global declaration
2198 // record using the pointer returned by the runtime.
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002199 LValue SecBase;
2200 decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2201 if (IsTTD) {
2202 SecIt = I->getSecond().SecondaryLocalVarData->begin();
2203 llvm::PointerType *SecGlobalRecPtrTy =
2204 CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2205 SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2206 Bld.CreatePointerBitCastOrAddrSpaceCast(
2207 I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2208 SecGlobalRecTy);
2209 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002210 for (auto &Rec : I->getSecond().LocalVarData) {
2211 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2212 llvm::Value *ParValue;
2213 if (EscapedParam) {
2214 const auto *VD = cast<VarDecl>(Rec.first);
2215 LValue ParLVal =
2216 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2217 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2218 }
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002219 LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
 2220      // Emit VarAddr based on the lane-id if required.
2221 QualType VarTy;
2222 if (Rec.second.IsOnePerTeam) {
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002223 VarTy = Rec.second.FD->getType();
2224 } else {
2225 llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2226 VarAddr.getAddress().getPointer(),
2227 {Bld.getInt32(0), getNVPTXLaneID(CGF)});
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002228 VarTy =
2229 Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002230 VarAddr = CGF.MakeAddrLValue(
2231 Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2232 AlignmentSource::Decl);
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002233 }
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002234 Rec.second.PrivateAddr = VarAddr.getAddress();
Alexey Bataeve4090182018-11-02 14:54:07 +00002235 if (!IsInTTDRegion &&
2236 (WithSPMDCheck ||
2237 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002238 assert(I->getSecond().IsInSPMDModeFlag &&
2239 "Expected unknown execution mode or required SPMD check.");
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002240 if (IsTTD) {
2241 assert(SecIt->second.IsOnePerTeam &&
2242 "Secondary glob data must be one per team.");
2243 LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2244 VarAddr.setAddress(
2245 Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2246 VarAddr.getPointer()),
2247 VarAddr.getAlignment()));
2248 Rec.second.PrivateAddr = VarAddr.getAddress();
2249 }
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002250 Address GlobalPtr = Rec.second.PrivateAddr;
2251 Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2252 Rec.second.PrivateAddr = Address(
2253 Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2254 LocalAddr.getPointer(), GlobalPtr.getPointer()),
2255 LocalAddr.getAlignment());
2256 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002257 if (EscapedParam) {
2258 const auto *VD = cast<VarDecl>(Rec.first);
2259 CGF.EmitStoreOfScalar(ParValue, VarAddr);
2260 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2261 }
Alexey Bataev93a38d62018-10-16 00:09:06 +00002262 if (IsTTD)
2263 ++SecIt;
Alexey Bataevc99042b2018-03-15 18:10:54 +00002264 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002265 }
2266 for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2267 // Recover pointer to this function's global record. The runtime will
2268 // handle the specifics of the allocation of the memory.
2269 // Use actual memory size of the record including the padding
2270 // for alignment purposes.
Alexey Bataev9ff80832018-04-16 20:16:21 +00002271 CGBuilderTy &Bld = CGF.Builder;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002272 llvm::Value *Size = CGF.getTypeSize(VD->getType());
2273 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2274 Size = Bld.CreateNUWAdd(
2275 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2276 llvm::Value *AlignVal =
2277 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2278 Size = Bld.CreateUDiv(Size, AlignVal);
2279 Size = Bld.CreateNUWMul(Size, AlignVal);
2280 // TODO: allow the usage of shared memory to be controlled by
2281 // the user, for now, default to global.
2282 llvm::Value *GlobalRecordSizeArg[] = {
2283 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2284 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
Alexey Bataev1fc1f8e2018-11-02 16:08:31 +00002285 createNVPTXRuntimeFunction(
2286 OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002287 GlobalRecordSizeArg);
2288 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2289 GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2290 LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2291 CGM.getContext().getDeclAlign(VD),
2292 AlignmentSource::Decl);
2293 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2294 Base.getAddress());
2295 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002296 }
2297 I->getSecond().MappedParams->apply(CGF);
2298}
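
// A conceptual sketch of what the prolog above arranges on the plain
// generic-mode path (illustrative only; the struct name and layout are
// assumptions, not verbatim codegen output):
//
//   struct _globalized_locals_ty { /* one field per escaped local */ };
//   void *frame = __kmpc_data_sharing_coalesced_push_stack(
//       sizeof(_globalized_locals_ty), /*UseSharedMemory=*/0);
//   // Escaped parameters are then copied into their frame fields; fields
//   // that are not one-per-team are additionally indexed by the lane id.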
2299
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00002300void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2301 bool WithSPMDCheck) {
Alexey Bataev2adecff2018-09-21 14:22:53 +00002302 if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2303 getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002304 return;
2305
Alexey Bataevc99042b2018-03-15 18:10:54 +00002306 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002307 if (I != FunctionGlobalizedDecls.end()) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00002308 I->getSecond().MappedParams->restore(CGF);
2309 if (!CGF.HaveInsertPoint())
2310 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002311 for (llvm::Value *Addr :
2312 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2313 CGF.EmitRuntimeCall(
2314 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2315 Addr);
2316 }
2317 if (I->getSecond().GlobalRecordAddr) {
Alexey Bataeve4090182018-11-02 14:54:07 +00002318 if (!IsInTTDRegion &&
2319 (WithSPMDCheck ||
2320 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002321 CGBuilderTy &Bld = CGF.Builder;
2322 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2323 llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2324 Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2325 // There is no need to emit line number for unconditional branch.
2326 (void)ApplyDebugLocation::CreateEmpty(CGF);
2327 CGF.EmitBlock(NonSPMDBB);
2328 CGF.EmitRuntimeCall(
2329 createNVPTXRuntimeFunction(
2330 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2331 CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2332 CGF.EmitBlock(ExitBB);
Alexey Bataeve4090182018-11-02 14:54:07 +00002333 } else if (IsInTTDRegion) {
2334 assert(GlobalizedRecords.back().RegionCounter > 0 &&
2335 "region counter must be > 0.");
2336 --GlobalizedRecords.back().RegionCounter;
2337 // Emit the restore function only in the target region.
2338 if (GlobalizedRecords.back().RegionCounter == 0) {
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002339 QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
2340 /*DestWidth=*/16, /*Signed=*/0);
2341 llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
2342 Address(GlobalizedRecords.back().UseSharedMemory,
2343 CGM.getContext().getTypeAlignInChars(Int16Ty)),
2344 /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
Alexey Bataeve4090182018-11-02 14:54:07 +00002345 CGF.EmitRuntimeCall(
2346 createNVPTXRuntimeFunction(
2347 OMPRTL_NVPTX__kmpc_restore_team_static_memory),
Alexey Bataev09c9eea2018-11-09 16:18:04 +00002348 IsInSharedMemory);
Alexey Bataeve4090182018-11-02 14:54:07 +00002349 }
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002350 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002351 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2352 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2353 I->getSecond().GlobalRecordAddr);
2354 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002355 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00002356 }
2357}
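
// Sketch of the unwinding this epilog emits on the plain generic-mode path
// (illustrative, not verbatim output):
//
//   for (addr : escaped variable-length allocations, in reverse order)
//     __kmpc_data_sharing_pop_stack(addr);
//   __kmpc_data_sharing_pop_stack(GlobalRecordAddr);
//
// On the SPMD-check path the final pop is guarded by IsInSPMDModeFlag, and
// on the TTD path __kmpc_restore_team_static_memory is called instead once
// the region counter drops back to zero.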
2358
Carlo Bertollic6872252016-04-04 15:55:02 +00002359void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2360 const OMPExecutableDirective &D,
2361 SourceLocation Loc,
2362 llvm::Value *OutlinedFn,
2363 ArrayRef<llvm::Value *> CapturedVars) {
2364 if (!CGF.HaveInsertPoint())
2365 return;
2366
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002367 Address ZeroAddr = CGF.CreateMemTemp(
2368 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2369 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00002370 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2371 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002372 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00002373 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2374 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00002375 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00002376}
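
// The teams entry is thus a plain call; conceptually (sketch):
//
//   int32_t zero = 0;
//   OutlinedFn(/*GlobalTid=*/ThreadIDAddr, /*BoundTid=*/&zero, captured...);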
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002377
2378void CGOpenMPRuntimeNVPTX::emitParallelCall(
2379 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2380 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2381 if (!CGF.HaveInsertPoint())
2382 return;
2383
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002384 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002385 emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002386 else
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002387 emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002388}
2389
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002390void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002391 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2392 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2393 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002394
2395 // Force inline this outlined function at its call site.
2396 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2397
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002398 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2399 /*DestWidth=*/32, /*Signed=*/1),
2400 ".zero.addr");
2401 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002402  // ThreadId for serialized parallel regions is 0.
2403 Address ThreadIDAddr = ZeroAddr;
2404 auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002405 CodeGenFunction &CGF, PrePostActionTy &Action) {
2406 Action.Enter(CGF);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002407
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002408 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2409 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2410 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2411 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2412 emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2413 };
2414 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2415 PrePostActionTy &) {
2417 RegionCodeGenTy RCG(CodeGen);
2418 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2419 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2420 llvm::Value *Args[] = {RTLoc, ThreadID};
2421
2422 NVPTXActionTy Action(
2423 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2424 Args,
2425 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2426 Args);
2427 RCG.setAction(Action);
2428 RCG(CGF);
2429 };
2430
2431 auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2432 PrePostActionTy &Action) {
2433 CGBuilderTy &Bld = CGF.Builder;
2434 llvm::Function *WFn = WrapperFunctionsMap[Fn];
2435 assert(WFn && "Wrapper function does not exist!");
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002436 llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2437
2438 // Prepare for parallel region. Indicate the outlined function.
2439 llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002440 CGF.EmitRuntimeCall(
2441 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2442 Args);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002443
2444 // Create a private scope that will globalize the arguments
2445 // passed from the outside of the target region.
2446 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2447
2448    // There's something to share.
2449 if (!CapturedVars.empty()) {
2450      // Set up the buffer of references used to share the captured variables.
2451 Address SharedArgs =
2452 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2453 llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2454
2455 llvm::Value *DataSharingArgs[] = {
2456 SharedArgsPtr,
2457 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2458 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2459 OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2460 DataSharingArgs);
2461
2462 // Store variable address in a list of references to pass to workers.
2463 unsigned Idx = 0;
2464 ASTContext &Ctx = CGF.getContext();
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002465 Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2466 SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2467 .castAs<PointerType>());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002468 for (llvm::Value *V : CapturedVars) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002469 Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2470 CGF.getPointerSize());
2471 llvm::Value *PtrV;
Alexey Bataev17314212018-03-20 15:41:05 +00002472 if (V->getType()->isIntegerTy())
2473 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2474 else
2475 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002476 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2477 Ctx.getPointerType(Ctx.VoidPtrTy));
Alexey Bataevc99042b2018-03-15 18:10:54 +00002478 ++Idx;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002479 }
2480 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002481
2482 // Activate workers. This barrier is used by the master to signal
2483 // work for the workers.
2484 syncCTAThreads(CGF);
2485
2486 // OpenMP [2.5, Parallel Construct, p.49]
2487 // There is an implied barrier at the end of a parallel region. After the
2488 // end of a parallel region, only the master thread of the team resumes
2489 // execution of the enclosing task region.
2490 //
2491 // The master waits at this barrier until all workers are done.
2492 syncCTAThreads(CGF);
2493
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002494 if (!CapturedVars.empty())
2495 CGF.EmitRuntimeCall(
2496 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2497
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002498 // Remember for post-processing in worker loop.
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002499 Work.emplace_back(WFn);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002500 };
2501
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002502 auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2503 CodeGenFunction &CGF, PrePostActionTy &Action) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002504 if (IsInParallelRegion) {
2505 SeqGen(CGF, Action);
2506 } else if (IsInTargetMasterThreadRegion) {
2507 L0ParallelGen(CGF, Action);
2508 } else {
2509 // Check for master and then parallelism:
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002510 // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002511 // Serialized execution.
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002512 // } else {
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002513 // Worker call.
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002514 // }
2515 CGBuilderTy &Bld = CGF.Builder;
2516 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002517 llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2518 llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002519 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
Alexey Bataev673110d2018-05-16 13:36:30 +00002520 llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2521 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002522 Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002523 // There is no need to emit line number for unconditional branch.
2524 (void)ApplyDebugLocation::CreateEmpty(CGF);
2525 CGF.EmitBlock(ParallelCheckBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002526 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2527 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2528 llvm::Value *PL = CGF.EmitRuntimeCall(
2529 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2530 {RTLoc, ThreadID});
2531 llvm::Value *Res = Bld.CreateIsNotNull(PL);
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002532 Bld.CreateCondBr(Res, SeqBB, MasterBB);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002533 CGF.EmitBlock(SeqBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002534 SeqGen(CGF, Action);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002535 CGF.EmitBranch(ExitBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002536 // There is no need to emit line number for unconditional branch.
2537 (void)ApplyDebugLocation::CreateEmpty(CGF);
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002538 CGF.EmitBlock(MasterBB);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002539 L0ParallelGen(CGF, Action);
2540 CGF.EmitBranch(ExitBB);
2541 // There is no need to emit line number for unconditional branch.
2542 (void)ApplyDebugLocation::CreateEmpty(CGF);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002543 // Emit the continuation block for code after the if.
2544 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2545 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002546 };
2547
Alexey Bataev9ff80832018-04-16 20:16:21 +00002548 if (IfCond) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002549 emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002550 } else {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002551 CodeGenFunction::RunCleanupsScope Scope(CGF);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002552 RegionCodeGenTy ThenRCG(LNParallelGen);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002553 ThenRCG(CGF);
2554 }
2555}
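
// For a level-0 parallel region in generic mode, L0ParallelGen above emits,
// in essence, the following master/worker handshake (a simplified sketch,
// not verbatim codegen output):
//
//   __kmpc_kernel_prepare_parallel(WrapperFn, /*RequiresOMPRuntime=*/1);
//   if (nargs) {
//     __kmpc_begin_sharing_variables(&arg_refs, nargs);
//     /* store one void* per captured variable into arg_refs */
//   }
//   __syncthreads();                      // release the workers
//   __syncthreads();                      // wait until the workers finish
//   if (nargs) __kmpc_end_sharing_variables();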
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002556
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002557void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002558 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2559 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2560 // Just call the outlined function to execute the parallel region.
2561 // OutlinedFn(&GTid, &zero, CapturedStruct);
2562 //
Carlo Bertolli79712092018-02-28 20:48:35 +00002564
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002565 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2566 /*DestWidth=*/32, /*Signed=*/1),
2567 ".zero.addr");
Carlo Bertolli79712092018-02-28 20:48:35 +00002568 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002569  // ThreadId for serialized parallel regions is 0.
2570 Address ThreadIDAddr = ZeroAddr;
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002571 auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
Alexey Bataev8521ff62018-07-25 20:03:01 +00002572 &ThreadIDAddr](CodeGenFunction &CGF,
2573 PrePostActionTy &Action) {
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002574 Action.Enter(CGF);
2575
2576 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2577 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2578 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2579 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2580 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2581 };
2582 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2583 PrePostActionTy &) {
2585 RegionCodeGenTy RCG(CodeGen);
2586 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2587 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2588 llvm::Value *Args[] = {RTLoc, ThreadID};
2589
2590 NVPTXActionTy Action(
2591 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2592 Args,
2593 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2594 Args);
2595 RCG.setAction(Action);
2596 RCG(CGF);
2597 };
2598
2599 if (IsInTargetMasterThreadRegion) {
Alexey Bataev8521ff62018-07-25 20:03:01 +00002600    // In the worker we need to use the real thread id.
2601 ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002602 RegionCodeGenTy RCG(CodeGen);
2603 RCG(CGF);
2604 } else {
2605 // If we are not in the target region, it is definitely L2 parallelism or
2606 // more, because for SPMD mode we always has L1 parallel level, sowe don't
2607 // need to check for orphaned directives.
2608 RegionCodeGenTy RCG(SeqGen);
2609 RCG(CGF);
2610 }
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002611}
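
// The two paths above boil down to (illustrative sketch):
//
//   if (IsInTargetMasterThreadRegion) {         // L1 parallel: direct call
//     OutlinedFn(&gtid, &zero, captured...);
//   } else {                                    // nested: serialize
//     __kmpc_serialized_parallel(loc, gtid);
//     OutlinedFn(&zero, &zero, captured...);
//     __kmpc_end_serialized_parallel(loc, gtid);
//   }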
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002612
Alexey Bataev504fc2d2018-05-07 17:23:05 +00002613void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2614 CodeGenFunction &CGF, StringRef CriticalName,
2615 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2616 const Expr *Hint) {
2617 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2618 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2619 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2620 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2621 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2622
2623 // Fetch team-local id of the thread.
2624 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2625
2626 // Get the width of the team.
2627 llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2628
2629 // Initialize the counter variable for the loop.
2630 QualType Int32Ty =
2631 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2632 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2633 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2634 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2635 /*isInit=*/true);
2636
2637 // Block checks if loop counter exceeds upper bound.
2638 CGF.EmitBlock(LoopBB);
2639 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2640 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2641 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2642
2643  // This block tests which single thread should execute the region, and which
2644  // threads should go straight to the synchronisation point.
2645 CGF.EmitBlock(TestBB);
2646 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2647 llvm::Value *CmpThreadToCounter =
2648 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2649 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2650
2651 // Block emits the body of the critical region.
2652 CGF.EmitBlock(BodyBB);
2653
2654 // Output the critical statement.
2655 CriticalOpGen(CGF);
2656
2657  // After executing the body protected by the critical region, the single
2658  // executing thread will jump to the synchronisation point.
2659  // That block waits for all threads in the current team to finish, then
2660  // increments the counter variable and returns to the loop.
2661 CGF.EmitBlock(SyncBB);
2662 getNVPTXCTABarrier(CGF);
2663
2664 llvm::Value *IncCounterVal =
2665 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2666 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2667 CGF.EmitBranch(LoopBB);
2668
2669 // Block that is reached when all threads in the team complete the region.
2670 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2671}
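
// The control flow built above corresponds to this loop (a conceptual
// sketch; the barrier is the CTA-wide one from getNVPTXCTABarrier):
//
//   for (uint32_t counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical-region body>;
//     barrier();  // everyone waits before the next candidate executes
//   }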
2672
Alexey Bataevb2575932018-01-04 20:18:55 +00002673/// Cast value to the specified type.
Alexey Bataeva453f362018-03-19 17:53:56 +00002674static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2675 QualType ValTy, QualType CastTy,
2676 SourceLocation Loc) {
2677 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2678 "Cast type must sized.");
2679 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2680 "Val type must sized.");
2681 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2682 if (ValTy == CastTy)
Alexey Bataevb2575932018-01-04 20:18:55 +00002683 return Val;
Alexey Bataeva453f362018-03-19 17:53:56 +00002684 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2685 CGF.getContext().getTypeSizeInChars(CastTy))
2686 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2687 if (CastTy->isIntegerType() && ValTy->isIntegerType())
2688 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2689 CastTy->hasSignedIntegerRepresentation());
2690 Address CastItem = CGF.CreateMemTemp(CastTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00002691 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2692 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
Alexey Bataeva453f362018-03-19 17:53:56 +00002693 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2694 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00002695}
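
// For types that are neither same-sized nor both integral, the fallback
// above round-trips through a stack temporary; conceptually (sketch):
//
//   CastTy Tmp;
//   *(ValTy *)&Tmp = Val;  // store through a pointer-cast of the temporary
//   return Tmp;            // reload with the destination type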
2696
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002697/// This function creates calls to one of two shuffle functions to copy
2698/// variables between lanes in a warp.
2699static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002700 llvm::Value *Elem,
Alexey Bataeva453f362018-03-19 17:53:56 +00002701 QualType ElemType,
2702 llvm::Value *Offset,
2703 SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00002704 CodeGenModule &CGM = CGF.CGM;
2705 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002706 CGOpenMPRuntimeNVPTX &RT =
2707 *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2708
Alexey Bataeva453f362018-03-19 17:53:56 +00002709 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2710 assert(Size.getQuantity() <= 8 &&
2711 "Unsupported bitwidth in shuffle instruction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002712
Alexey Bataeva453f362018-03-19 17:53:56 +00002713 OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002714 ? OMPRTL_NVPTX__kmpc_shuffle_int32
2715 : OMPRTL_NVPTX__kmpc_shuffle_int64;
2716
2717 // Cast all types to 32- or 64-bit values before calling shuffle routines.
Alexey Bataeva453f362018-03-19 17:53:56 +00002718 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2719 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2720 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002721 llvm::Value *WarpSize =
Alexey Bataevb2575932018-01-04 20:18:55 +00002722 Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002723
Alexey Bataev9ff80832018-04-16 20:16:21 +00002724 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2725 RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002726
Alexey Bataeva453f362018-03-19 17:53:56 +00002727 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002728}
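
// Rough source-level equivalent of the emitted sequence (variable names are
// illustrative; the runtime entry point is picked by element size):
//
//   int64_t Tmp = (int64_t)Elem;                        // widen to 32/64 bits
//   Tmp = __kmpc_shuffle_int64(Tmp, Offset, WarpSize);  // exchange lanes
//   ElemType Res = (ElemType)Tmp;                       // narrow back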
2729
Alexey Bataev12c62902018-06-22 19:10:38 +00002730static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2731 Address DestAddr, QualType ElemType,
2732 llvm::Value *Offset, SourceLocation Loc) {
2733 CGBuilderTy &Bld = CGF.Builder;
2734
2735 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2736 // Create the loop over the big sized data.
2737 // ptr = (void*)Elem;
2738 // ptrEnd = (void*) Elem + 1;
2739 // Step = 8;
2740 // while (ptr + Step < ptrEnd)
2741 // shuffle((int64_t)*ptr);
2742 // Step = 4;
2743 // while (ptr + Step < ptrEnd)
2744 // shuffle((int32_t)*ptr);
2745 // ...
2746 Address ElemPtr = DestAddr;
2747 Address Ptr = SrcAddr;
2748 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2749 Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2750 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2751 if (Size < CharUnits::fromQuantity(IntSize))
2752 continue;
2753 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2754 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2755 /*Signed=*/1);
2756 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2757 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2758 ElemPtr =
2759 Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2760 if (Size.getQuantity() / IntSize > 1) {
2761 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2762 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2763 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2764 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2765 CGF.EmitBlock(PreCondBB);
2766 llvm::PHINode *PhiSrc =
2767 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2768 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2769 llvm::PHINode *PhiDest =
2770 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2771 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2772 Ptr = Address(PhiSrc, Ptr.getAlignment());
2773 ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2774 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2775 PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2776 Ptr.getPointer(), CGF.VoidPtrTy));
2777 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2778 ThenBB, ExitBB);
2779 CGF.EmitBlock(ThenBB);
2780 llvm::Value *Res = createRuntimeShuffleFunction(
2781 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2782 IntType, Offset, Loc);
2783 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2784 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2785 ElemPtr =
2786 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2787 PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2788 PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2789 CGF.EmitBranch(PreCondBB);
2790 CGF.EmitBlock(ExitBB);
2791 } else {
2792 llvm::Value *Res = createRuntimeShuffleFunction(
2793 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2794 IntType, Offset, Loc);
2795 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2796 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2797 ElemPtr =
2798 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2799 }
2800 Size = Size % IntSize;
2801 }
2802}
2803
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002804namespace {
2805enum CopyAction : unsigned {
2806 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2807 // the warp using shuffle instructions.
2808 RemoteLaneToThread,
2809 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2810 ThreadCopy,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002811 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2812 ThreadToScratchpad,
2813 // ScratchpadToThread: Copy from a scratchpad array in global memory
2814 // containing team-reduced data to a thread's stack.
2815 ScratchpadToThread,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002816};
2817} // namespace
2818
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002819struct CopyOptionsTy {
2820 llvm::Value *RemoteLaneOffset;
2821 llvm::Value *ScratchpadIndex;
2822 llvm::Value *ScratchpadWidth;
2823};
2824
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002825/// Emit instructions to copy a Reduce list, which contains partially
2826/// aggregated values, in the specified direction.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002827static void emitReductionListCopy(
2828 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2829 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2830 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002831
Alexey Bataev9ff80832018-04-16 20:16:21 +00002832 CodeGenModule &CGM = CGF.CGM;
2833 ASTContext &C = CGM.getContext();
2834 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002835
Alexey Bataev9ff80832018-04-16 20:16:21 +00002836 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2837 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2838 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002839
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002840  // Iterate, element by element, through the source Reduce list and
2841 // make a copy.
2842 unsigned Idx = 0;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002843 unsigned Size = Privates.size();
Alexey Bataev9ff80832018-04-16 20:16:21 +00002844 for (const Expr *Private : Privates) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002845 Address SrcElementAddr = Address::invalid();
2846 Address DestElementAddr = Address::invalid();
2847 Address DestElementPtrAddr = Address::invalid();
2848 // Should we shuffle in an element from a remote lane?
2849 bool ShuffleInElement = false;
2850 // Set to true to update the pointer in the dest Reduce list to a
2851 // newly created element.
2852 bool UpdateDestListPtr = false;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002853 // Increment the src or dest pointer to the scratchpad, for each
2854 // new element.
2855 bool IncrScratchpadSrc = false;
2856 bool IncrScratchpadDest = false;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002857
2858 switch (Action) {
2859 case RemoteLaneToThread: {
2860 // Step 1.1: Get the address for the src element in the Reduce list.
2861 Address SrcElementPtrAddr =
2862 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002863 SrcElementAddr = CGF.EmitLoadOfPointer(
2864 SrcElementPtrAddr,
2865 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002866
2867 // Step 1.2: Create a temporary to store the element in the destination
2868 // Reduce list.
2869 DestElementPtrAddr =
2870 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2871 DestElementAddr =
2872 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2873 ShuffleInElement = true;
2874 UpdateDestListPtr = true;
2875 break;
2876 }
2877 case ThreadCopy: {
2878 // Step 1.1: Get the address for the src element in the Reduce list.
2879 Address SrcElementPtrAddr =
2880 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002881 SrcElementAddr = CGF.EmitLoadOfPointer(
2882 SrcElementPtrAddr,
2883 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002884
2885 // Step 1.2: Get the address for dest element. The destination
2886 // element has already been created on the thread's stack.
2887 DestElementPtrAddr =
2888 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002889 DestElementAddr = CGF.EmitLoadOfPointer(
2890 DestElementPtrAddr,
2891 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002892 break;
2893 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002894 case ThreadToScratchpad: {
2895 // Step 1.1: Get the address for the src element in the Reduce list.
2896 Address SrcElementPtrAddr =
2897 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002898 SrcElementAddr = CGF.EmitLoadOfPointer(
2899 SrcElementPtrAddr,
2900 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002901
2902 // Step 1.2: Get the address for dest element:
2903 // address = base + index * ElementSizeInChars.
Alexey Bataeve290ec02018-04-06 16:03:36 +00002904 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
Alexey Bataev9ff80832018-04-16 20:16:21 +00002905 llvm::Value *CurrentOffset =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002906 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002907 llvm::Value *ScratchPadElemAbsolutePtrVal =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002908 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002909 ScratchPadElemAbsolutePtrVal =
2910 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00002911 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2912 C.getTypeAlignInChars(Private->getType()));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002913 IncrScratchpadDest = true;
2914 break;
2915 }
2916 case ScratchpadToThread: {
2917 // Step 1.1: Get the address for the src element in the scratchpad.
2918 // address = base + index * ElementSizeInChars.
Alexey Bataeve290ec02018-04-06 16:03:36 +00002919 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
Alexey Bataev9ff80832018-04-16 20:16:21 +00002920 llvm::Value *CurrentOffset =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002921 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002922 llvm::Value *ScratchPadElemAbsolutePtrVal =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002923 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002924 ScratchPadElemAbsolutePtrVal =
2925 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2926 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2927 C.getTypeAlignInChars(Private->getType()));
2928 IncrScratchpadSrc = true;
2929
2930 // Step 1.2: Create a temporary to store the element in the destination
2931 // Reduce list.
2932 DestElementPtrAddr =
2933 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2934 DestElementAddr =
2935 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2936 UpdateDestListPtr = true;
2937 break;
2938 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002939 }
2940
2941    // Regardless of the source and destination of the copy, we emit the load
2942    // of the src element, as this is required in all directions.
2943 SrcElementAddr = Bld.CreateElementBitCast(
2944 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
Alexey Bataev12c62902018-06-22 19:10:38 +00002945 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2946 SrcElementAddr.getElementType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002947
2948 // Now that all active lanes have read the element in the
2949 // Reduce list, shuffle over the value from the remote lane.
Alexey Bataeva453f362018-03-19 17:53:56 +00002950 if (ShuffleInElement) {
Alexey Bataev12c62902018-06-22 19:10:38 +00002951 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2952 RemoteLaneOffset, Private->getExprLoc());
2953 } else {
2954 if (Private->getType()->isScalarType()) {
2955 llvm::Value *Elem =
2956 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
2957 Private->getType(), Private->getExprLoc());
2958 // Store the source element value to the dest element address.
2959 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
2960 Private->getType());
2961 } else {
2962 CGF.EmitAggregateCopy(
2963 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2964 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2965 Private->getType(), AggValueSlot::DoesNotOverlap);
2966 }
Alexey Bataeva453f362018-03-19 17:53:56 +00002967 }
Alexey Bataevb2575932018-01-04 20:18:55 +00002968
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002969 // Step 3.1: Modify reference in dest Reduce list as needed.
2970 // Modifying the reference in Reduce list to point to the newly
2971 // created element. The element is live in the current function
2972 // scope and that of functions it invokes (i.e., reduce_function).
2973 // RemoteReduceData[i] = (void*)&RemoteElem
2974 if (UpdateDestListPtr) {
2975 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2976 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2977 DestElementPtrAddr, /*Volatile=*/false,
2978 C.VoidPtrTy);
2979 }
2980
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002981 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2982 // address of the next element in scratchpad memory, unless we're currently
2983 // processing the last one. Memory alignment is also taken care of here.
2984 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2985 llvm::Value *ScratchpadBasePtr =
2986 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
Alexey Bataeve290ec02018-04-06 16:03:36 +00002987 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2988 ScratchpadBasePtr = Bld.CreateNUWAdd(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002989 ScratchpadBasePtr,
Alexey Bataeve290ec02018-04-06 16:03:36 +00002990 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002991
2992 // Take care of global memory alignment for performance
Alexey Bataeve290ec02018-04-06 16:03:36 +00002993 ScratchpadBasePtr = Bld.CreateNUWSub(
2994 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2995 ScratchpadBasePtr = Bld.CreateUDiv(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002996 ScratchpadBasePtr,
2997 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
Alexey Bataeve290ec02018-04-06 16:03:36 +00002998 ScratchpadBasePtr = Bld.CreateNUWAdd(
2999 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
3000 ScratchpadBasePtr = Bld.CreateNUWMul(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003001 ScratchpadBasePtr,
3002 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
3003
3004 if (IncrScratchpadDest)
3005 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3006 else /* IncrScratchpadSrc = true */
3007 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3008 }
3009
Alexey Bataev9ff80832018-04-16 20:16:21 +00003010 ++Idx;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003011 }
3012}
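
// For the RemoteLaneToThread direction, one iteration of the loop above
// reduces to (conceptual sketch for a scalar element of type T):
//
//   T *Src = (T *)SrcList[i];
//   T Remote;                                  // fresh stack element
//   Remote = shuffle(*Src, RemoteLaneOffset);  // pull from the remote lane
//   DestList[i] = (void *)&Remote;             // repoint the dest list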
3013
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003014/// This function emits a helper that loads data from the scratchpad array
3015/// and (optionally) reduces it with the input operand.
3016///
3017/// load_and_reduce(local, scratchpad, index, width, should_reduce)
3018/// reduce_data remote;
3019/// for elem in remote:
3020/// remote.elem = Scratchpad[elem_id][index]
3021/// if (should_reduce)
3022/// local = local @ remote
3023/// else
3024/// local = remote
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003025static llvm::Value *emitReduceScratchpadFunction(
3026 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3027 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003028 ASTContext &C = CGM.getContext();
3029 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003030
3031 // Destination of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003032 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3033 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003034 // Base address of the scratchpad array, with each element storing a
3035 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003036 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3037 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003038 // A source index into the scratchpad array.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003039 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3040 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003041 // Row width of an element in the scratchpad array, typically
3042 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003043 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3044 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003045 // If should_reduce == 1, then it's load AND reduce,
3046 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
3047 // The latter case is used for initialization.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003048 ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3049 Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003050
3051 FunctionArgList Args;
3052 Args.push_back(&ReduceListArg);
3053 Args.push_back(&ScratchPadArg);
3054 Args.push_back(&IndexArg);
3055 Args.push_back(&WidthArg);
3056 Args.push_back(&ShouldReduceArg);
3057
Alexey Bataev9ff80832018-04-16 20:16:21 +00003058 const CGFunctionInfo &CGFI =
3059 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003060 auto *Fn = llvm::Function::Create(
3061 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3062 "_omp_reduction_load_and_reduce", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003063 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003064 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003065 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003066 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003067
Alexey Bataev9ff80832018-04-16 20:16:21 +00003068 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003069
3070 // Get local Reduce list pointer.
3071 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3072 Address ReduceListAddr(
3073 Bld.CreatePointerBitCastOrAddrSpaceCast(
3074 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003075 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003076 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3077 CGF.getPointerAlign());
3078
3079 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3080 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003081 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003082
3083 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003084 llvm::Value *IndexVal = Bld.CreateIntCast(
3085 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3086 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003087
3088 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003089 llvm::Value *WidthVal = Bld.CreateIntCast(
3090 CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
3091 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003092
3093 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
3094 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003095 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003096
3097  // The absolute pointer to the base address of the next element to copy.
3098 llvm::Value *CumulativeElemBasePtr =
3099 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3100 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3101
3102 // Create a Remote Reduce list to store the elements read from the
3103 // scratchpad array.
3104 Address RemoteReduceList =
3105 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
3106
3107 // Assemble remote Reduce list from scratchpad array.
3108 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
3109 SrcDataAddr, RemoteReduceList,
3110 {/*RemoteLaneOffset=*/nullptr,
3111 /*ScratchpadIndex=*/IndexVal,
3112 /*ScratchpadWidth=*/WidthVal});
3113
3114 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3115 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3116 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3117
Alexey Bataev9ff80832018-04-16 20:16:21 +00003118 llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003119 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3120
3121 CGF.EmitBlock(ThenBB);
3122 // We should reduce with the local Reduce list.
3123 // reduce_function(LocalReduceList, RemoteReduceList)
3124 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3125 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
3126 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3127 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003128 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3129 CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003130 Bld.CreateBr(MergeBB);
3131
3132 CGF.EmitBlock(ElseBB);
3133 // No reduction; just copy:
3134 // Local Reduce list = Remote Reduce list.
3135 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3136 RemoteReduceList, ReduceListAddr);
3137 Bld.CreateBr(MergeBB);
3138
3139 CGF.EmitBlock(MergeBB);
3140
3141 CGF.FinishFunction();
3142 return Fn;
3143}
3144
3145/// This function emits a helper that stores reduced data from the team
3146/// master to a scratchpad array in global memory.
3147///
3148/// for elem in Reduce List:
3149/// scratchpad[elem_id][index] = elem
3150///
Benjamin Kramer674d5792017-05-26 20:08:24 +00003151static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
3152 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003153 QualType ReductionArrayTy,
3154 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003155
Alexey Bataev9ff80832018-04-16 20:16:21 +00003156 ASTContext &C = CGM.getContext();
3157 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003158
3159 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003160 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3161 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003162 // Base address of the scratchpad array, with each element storing a
3163 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003164 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3165 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003166 // A destination index into the scratchpad array, typically the team
3167 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003168 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3169 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003170 // Row width of an element in the scratchpad array, typically
3171 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003172 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3173 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003174
3175 FunctionArgList Args;
3176 Args.push_back(&ReduceListArg);
3177 Args.push_back(&ScratchPadArg);
3178 Args.push_back(&IndexArg);
3179 Args.push_back(&WidthArg);
3180
Alexey Bataev9ff80832018-04-16 20:16:21 +00003181 const CGFunctionInfo &CGFI =
3182 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003183 auto *Fn = llvm::Function::Create(
3184 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3185 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003186 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003187 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003188 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003189 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003190
Alexey Bataev9ff80832018-04-16 20:16:21 +00003191 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003192
3193 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3194 Address SrcDataAddr(
3195 Bld.CreatePointerBitCastOrAddrSpaceCast(
3196 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003197 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003198 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3199 CGF.getPointerAlign());
3200
3201 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3202 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003203 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003204
3205 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003206 llvm::Value *IndexVal = Bld.CreateIntCast(
3207 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3208 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003209
3210 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
3211 llvm::Value *WidthVal =
3212 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
3213 Int32Ty, SourceLocation()),
3214 CGF.SizeTy, /*isSigned=*/true);
3215
3216 // The absolute address of the base of the next element to copy.
3217 llvm::Value *CumulativeElemBasePtr =
3218 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3219 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3220
3221 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
3222 SrcDataAddr, DestDataAddr,
3223 {/*RemoteLaneOffset=*/nullptr,
3224 /*ScratchpadIndex=*/IndexVal,
3225 /*ScratchpadWidth=*/WidthVal});
3226
3227 CGF.FinishFunction();
3228 return Fn;
3229}
3230
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003231/// This function emits a helper that gathers Reduce lists from the first
3232/// lane of every active warp to lanes in the first warp.
3233///
3234/// void inter_warp_copy_func(void* reduce_data, num_warps)
3235/// shared smem[warp_size];
3236/// For all data entries D in reduce_data:
3237/// If (I am the first lane in each warp)
3238/// Copy my local D to smem[warp_id]
3239/// sync
3240/// if (I am the first warp)
3241/// Copy smem[thread_id] to my local D
3242/// sync
3243static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3244 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003245 QualType ReductionArrayTy,
3246 SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003247 ASTContext &C = CGM.getContext();
3248 llvm::Module &M = CGM.getModule();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003249
3250 // ReduceList: thread local Reduce list.
3251 // At the stage of the computation when this function is called, partially
3252 // aggregated values reside in the first lane of every active warp.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003253 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3254 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003255 // NumWarps: number of warps active in the parallel region. This could
3256 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003257 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
Alexey Bataev56223232017-06-09 13:40:18 +00003258 C.getIntTypeForBitwidth(32, /* Signed */ true),
3259 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003260 FunctionArgList Args;
3261 Args.push_back(&ReduceListArg);
3262 Args.push_back(&NumWarpsArg);
3263
Alexey Bataev9ff80832018-04-16 20:16:21 +00003264 const CGFunctionInfo &CGFI =
3265 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003266 auto *Fn = llvm::Function::Create(
3267 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3268 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003269 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003270 Fn->setDoesNotRecurse();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003271 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003272 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003273
Alexey Bataev9ff80832018-04-16 20:16:21 +00003274 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003275
3276 // This array is used as a medium to transfer, one reduce element at a time,
3277 // the data from the first lane of every warp to lanes in the first warp
3278 // in order to perform the final step of a reduction in a parallel region
3279 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3280 // for reduced latency, as well as to have a distinct copy for concurrently
3281 // executing target regions. The array is declared with common linkage so
3282 // as to be shared across compilation units.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003283 StringRef TransferMediumName =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003284 "__openmp_nvptx_data_transfer_temporary_storage";
3285 llvm::GlobalVariable *TransferMedium =
3286 M.getGlobalVariable(TransferMediumName);
3287 if (!TransferMedium) {
3288 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
3289 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3290 TransferMedium = new llvm::GlobalVariable(
3291 M, Ty,
3292 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3293 llvm::Constant::getNullValue(Ty), TransferMediumName,
3294 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3295 SharedAddressSpace);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003296 CGM.addCompilerUsedGlobal(TransferMedium);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003297 }
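  // Viewed at the CUDA source level, the medium is roughly equivalent to
  // (a sketch, not the emitted IR):
  //   __shared__ int64_t
  //       __openmp_nvptx_data_transfer_temporary_storage[WARPSIZE];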
3298
3299 // Get the CUDA thread id of the current OpenMP thread on the GPU.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003300 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003301 // nvptx_lane_id = nvptx_id % warpsize
Alexey Bataev9ff80832018-04-16 20:16:21 +00003302 llvm::Value *LaneID = getNVPTXLaneID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003303 // nvptx_warp_id = nvptx_id / warpsize
Alexey Bataev9ff80832018-04-16 20:16:21 +00003304 llvm::Value *WarpID = getNVPTXWarpID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003305
3306 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3307 Address LocalReduceList(
3308 Bld.CreatePointerBitCastOrAddrSpaceCast(
3309 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3310 C.VoidPtrTy, SourceLocation()),
3311 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3312 CGF.getPointerAlign());
3313
3314 unsigned Idx = 0;
Alexey Bataev9ff80832018-04-16 20:16:21 +00003315 for (const Expr *Private : Privates) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003316 //
3317 // Warp master copies reduce element to transfer medium in __shared__
3318 // memory.
3319 //
3320 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3321 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3322 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3323
3324 // if (lane_id == 0)
Alexey Bataev9ff80832018-04-16 20:16:21 +00003325 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003326 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3327 CGF.EmitBlock(ThenBB);
3328
3329 // Reduce element = LocalReduceList[i]
3330 Address ElemPtrPtrAddr =
3331 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3332 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3333 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3334 // elemptr = (type[i]*)(elemptrptr)
3335 Address ElemPtr =
3336 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3337 ElemPtr = Bld.CreateElementBitCast(
3338 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003339
3340 // Get pointer to location in transfer medium.
3341 // MediumPtr = &medium[warp_id]
3342 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3343 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3344 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
3345 // Casting to actual data type.
3346 // MediumPtr = (type[i]*)MediumPtrAddr;
3347 MediumPtr = Bld.CreateElementBitCast(
3348 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
3349
Alexey Bataev12c62902018-06-22 19:10:38 +00003350 // elem = *elemptr
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003351 // *MediumPtr = elem
Alexey Bataev12c62902018-06-22 19:10:38 +00003352 if (Private->getType()->isScalarType()) {
3353 llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3354 Private->getType(), Loc);
3355 // Store the source element value to the dest element address.
3356 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
3357 Private->getType());
3358 } else {
3359 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3360 CGF.MakeAddrLValue(MediumPtr, Private->getType()),
3361 Private->getType(), AggValueSlot::DoesNotOverlap);
3362 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003363
3364 Bld.CreateBr(MergeBB);
3365
3366 CGF.EmitBlock(ElseBB);
3367 Bld.CreateBr(MergeBB);
3368
3369 CGF.EmitBlock(MergeBB);
3370
3371 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3372 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3373 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
3374
Alexey Bataev9ff80832018-04-16 20:16:21 +00003375 llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003376 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
3377 // named_barrier_sync(ParallelBarrierID, num_active_threads)
3378 syncParallelThreads(CGF, NumActiveThreads);
3379
3380 //
3381 // Warp 0 copies reduce element from transfer medium.
3382 //
3383 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3384 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3385 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3386
3387 // Up to 32 threads in warp 0 are active.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003388 llvm::Value *IsActiveThread =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003389 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3390 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3391
3392 CGF.EmitBlock(W0ThenBB);
3393
3394 // SrcMediumPtr = &medium[tid]
3395 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3396 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3397 Address SrcMediumPtr(SrcMediumPtrVal,
3398 C.getTypeAlignInChars(Private->getType()));
3399 // SrcMediumVal = *SrcMediumPtr;
3400 SrcMediumPtr = Bld.CreateElementBitCast(
3401 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003402
3403 // TargetElemPtr = (type[i]*)(LocalReduceList[i])
3404 Address TargetElemPtrPtr =
3405 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3406 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3407 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3408 Address TargetElemPtr =
3409 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
3410 TargetElemPtr = Bld.CreateElementBitCast(
3411 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
3412
3413 // *TargetElemPtr = SrcMediumVal;
Alexey Bataev12c62902018-06-22 19:10:38 +00003414 if (Private->getType()->isScalarType()) {
3415 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
3416 SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
3417 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3418 Private->getType());
3419 } else {
3420 CGF.EmitAggregateCopy(
3421 CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
3422 CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
3423 Private->getType(), AggValueSlot::DoesNotOverlap);
3424 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003425 Bld.CreateBr(W0MergeBB);
3426
3427 CGF.EmitBlock(W0ElseBB);
3428 Bld.CreateBr(W0MergeBB);
3429
3430 CGF.EmitBlock(W0MergeBB);
3431
3432 // While warp 0 copies values from transfer medium, all other warps must
3433 // wait.
3434 syncParallelThreads(CGF, NumActiveThreads);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003435 ++Idx;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003436 }
3437
3438 CGF.FinishFunction();
3439 return Fn;
3440}
3441
3442/// Emit a helper that reduces data across two OpenMP threads (lanes)
3443/// in the same warp. It uses shuffle instructions to copy over data from
3444/// a remote lane's stack. The reduction algorithm performed is specified
3445/// by the fourth parameter.
3446///
3447/// Algorithm Versions.
3448/// Full Warp Reduce (argument value 0):
3449/// This algorithm assumes that all 32 lanes are active and gathers
3450/// data from these 32 lanes, producing a single resultant value.
3451/// Contiguous Partial Warp Reduce (argument value 1):
3452/// This algorithm assumes that only a *contiguous* subset of lanes
3453/// are active. This happens for the last warp in a parallel region
3454/// when the user specified num_threads is not an integer multiple of
3455/// 32. This contiguous subset always starts with the zeroth lane.
3456/// Partial Warp Reduce (argument value 2):
3457/// This algorithm gathers data from any number of lanes at any position.
3458/// All reduced values are stored in the lowest possible lane. The set
3459/// of problems every algorithm addresses is a super set of those
3460/// addressable by algorithms with a lower version number. Overhead
3461/// increases as algorithm version increases.
3462///
3463/// Terminology
3464/// Reduce element:
3465/// Reduce element refers to the individual data field with primitive
3466/// data types to be combined and reduced across threads.
3467/// Reduce list:
3468/// Reduce list refers to a collection of local, thread-private
3469/// reduce elements.
3470/// Remote Reduce list:
3471/// Remote Reduce list refers to a collection of remote (relative to
3472/// the current thread) reduce elements.
3473///
3474/// We distinguish between three states of threads that are important to
3475/// the implementation of this function.
3476/// Alive threads:
3477/// Threads in a warp executing the SIMT instruction, as distinguished from
3478/// threads that are inactive due to divergent control flow.
3479/// Active threads:
3480/// The minimal set of threads that has to be alive upon entry to this
3481/// function. The computation is correct iff active threads are alive.
3482/// Some threads are alive but they are not active because they do not
3483/// contribute to the computation in any useful manner. Turning them off
3484/// may introduce control flow overheads without any tangible benefits.
3485/// Effective threads:
3486/// In order to comply with the argument requirements of the shuffle
3487/// function, we must keep all lanes holding data alive. But at most
3488/// half of them perform value aggregation; we refer to this half of
3489/// threads as effective. The other half is simply handing off their
3490/// data.
3491///
3492/// Procedure
3493/// Value shuffle:
3494/// In this step active threads transfer data from higher lane positions
3495/// in the warp to lower lane positions, creating the Remote Reduce list.
3496/// Value aggregation:
3497/// In this step, effective threads combine their thread local Reduce list
3498/// with the Remote Reduce list and store the result in the thread local
3499/// Reduce list.
3500/// Value copy:
3501/// In this step, we deal with the assumption made by algorithm 2
3502/// (i.e. contiguity assumption). When we have an odd number of lanes
3503/// active, say 2k+1, only k threads will be effective and therefore k
3504/// new values will be produced. However, the Reduce list owned by the
3505/// (2k+1)th thread is ignored in the value aggregation. Therefore
3506/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3507/// that the contiguity assumption still holds.
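///
/// For example, with five active lanes (2k+1 for k == 2) and an offset of
/// two, lanes 0 and 1 aggregate the lists shuffled down from lanes 2 and
/// 3, and lane 4's Reduce list is copied down to lane 2, so that the next
/// round operates on the three contiguous lanes 0 through 2.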
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003508static llvm::Value *emitShuffleAndReduceFunction(
3509 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3510 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003511 ASTContext &C = CGM.getContext();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003512
3513 // Thread local Reduce list used to host the values of data to be reduced.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003514 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3515 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003516 // Current lane id; could be logical.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003517 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3518 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003519 // Offset of the remote source lane relative to the current lane.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003520 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3521 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003522 // Algorithm version. This is expected to be known at compile time.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003523 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3524 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003525 FunctionArgList Args;
3526 Args.push_back(&ReduceListArg);
3527 Args.push_back(&LaneIDArg);
3528 Args.push_back(&RemoteLaneOffsetArg);
3529 Args.push_back(&AlgoVerArg);
3530
Alexey Bataev9ff80832018-04-16 20:16:21 +00003531 const CGFunctionInfo &CGFI =
3532 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003533 auto *Fn = llvm::Function::Create(
3534 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3535 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003536 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003537 Fn->setDoesNotRecurse();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003538 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003539 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003540
Alexey Bataev9ff80832018-04-16 20:16:21 +00003541 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003542
3543 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3544 Address LocalReduceList(
3545 Bld.CreatePointerBitCastOrAddrSpaceCast(
3546 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3547 C.VoidPtrTy, SourceLocation()),
3548 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3549 CGF.getPointerAlign());
3550
3551 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3552 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3553 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3554
3555 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3556 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3557 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3558
3559 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3560 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3561 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3562
3563 // Create a local thread-private variable to host the Reduce list
3564 // from a remote lane.
3565 Address RemoteReduceList =
3566 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3567
3568 // This loop iterates through the list of reduce elements and copies,
3569 // element by element, from a remote lane in the warp to RemoteReduceList,
3570 // hosted on the thread's stack.
3571 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3572 LocalReduceList, RemoteReduceList,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003573 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3574 /*ScratchpadIndex=*/nullptr,
3575 /*ScratchpadWidth=*/nullptr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003576
3577 // The actions to be performed on the Remote Reduce list depend
3578 // on the algorithm version.
3579 //
3580 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3581 // LaneId % 2 == 0 && Offset > 0):
3582 // do the reduction value aggregation
3583 //
3584 // The thread local variable Reduce list is mutated in place to host the
3585 // reduced data, which is the aggregated value produced from local and
3586 // remote lanes.
3587 //
3588 // Note that AlgoVer is expected to be a constant integer known at compile
3589 // time.
3590 // When AlgoVer==0, the first conjunction evaluates to true, making
3591 // the entire predicate true at compile time.
3592 // When AlgoVer==1, only the second part of the second conjunction needs
3593 // to be evaluated at runtime; the other conjunctions evaluate to false
3594 // at compile time.
3595 // When AlgoVer==2, only the second part of the third conjunction needs
3596 // to be evaluated at runtime; the other conjunctions evaluate to false
3597 // at compile time.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003598 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003599
Alexey Bataev9ff80832018-04-16 20:16:21 +00003600 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3601 llvm::Value *CondAlgo1 = Bld.CreateAnd(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003602 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3603
Alexey Bataev9ff80832018-04-16 20:16:21 +00003604 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3605 llvm::Value *CondAlgo2 = Bld.CreateAnd(
3606 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003607 CondAlgo2 = Bld.CreateAnd(
3608 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3609
Alexey Bataev9ff80832018-04-16 20:16:21 +00003610 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003611 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3612
3613 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3614 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3615 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3616 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3617
3618 CGF.EmitBlock(ThenBB);
3619 // reduce_function(LocalReduceList, RemoteReduceList)
3620 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3621 LocalReduceList.getPointer(), CGF.VoidPtrTy);
3622 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3623 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003624 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3625 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003626 Bld.CreateBr(MergeBB);
3627
3628 CGF.EmitBlock(ElseBB);
3629 Bld.CreateBr(MergeBB);
3630
3631 CGF.EmitBlock(MergeBB);
3632
3633 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3634 // Reduce list.
3635 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
Alexey Bataev9ff80832018-04-16 20:16:21 +00003636 llvm::Value *CondCopy = Bld.CreateAnd(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003637 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3638
3639 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3640 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3641 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3642 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3643
3644 CGF.EmitBlock(CpyThenBB);
3645 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3646 RemoteReduceList, LocalReduceList);
3647 Bld.CreateBr(CpyMergeBB);
3648
3649 CGF.EmitBlock(CpyElseBB);
3650 Bld.CreateBr(CpyMergeBB);
3651
3652 CGF.EmitBlock(CpyMergeBB);
3653
3654 CGF.FinishFunction();
3655 return Fn;
3656}
3657
3658///
3659/// Design of OpenMP reductions on the GPU
3660///
3661/// Consider a typical OpenMP program with one or more reduction
3662/// clauses:
3663///
3664/// float foo;
3665/// double bar;
3666/// #pragma omp target teams distribute parallel for \
3667/// reduction(+:foo) reduction(*:bar)
3668/// for (int i = 0; i < N; i++) {
3669/// foo += A[i]; bar *= B[i];
3670/// }
3671///
3672/// where 'foo' and 'bar' are reduced across all OpenMP threads in
3673/// all teams. In our OpenMP implementation on the NVPTX device an
3674/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3675/// within a team are mapped to CUDA threads within a threadblock.
3676/// Our goal is to efficiently aggregate values across all OpenMP
3677/// threads such that:
3678///
3679/// - the compiler and runtime are logically concise, and
3680/// - the reduction is performed efficiently in a hierarchical
3681/// manner as follows: within OpenMP threads in the same warp,
3682/// across warps in a threadblock, and finally across teams on
3683/// the NVPTX device.
3684///
3685/// Introduction to Decoupling
3686///
3687/// We would like to decouple the compiler and the runtime so that the
3688/// latter is ignorant of the reduction variables (number, data types)
3689/// and the reduction operators. This allows a simpler interface
3690/// and implementation while still attaining good performance.
3691///
3692/// Pseudocode for the aforementioned OpenMP program generated by the
3693/// compiler is as follows:
3694///
3695/// 1. Create private copies of reduction variables on each OpenMP
3696/// thread: 'foo_private', 'bar_private'
3697/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3698/// to it and writes the result in 'foo_private' and 'bar_private'
3699/// respectively.
3700/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3701/// and store the result on the team master:
3702///
3703/// __kmpc_nvptx_parallel_reduce_nowait(...,
3704/// reduceData, shuffleReduceFn, interWarpCpyFn)
3705///
3706/// where:
3707/// struct ReduceData {
3708/// float *foo;
3709/// double *bar;
3710/// } reduceData
3711/// reduceData.foo = &foo_private
3712/// reduceData.bar = &bar_private
3713///
3714/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3715/// auxiliary functions generated by the compiler that operate on
3716/// variables of type 'ReduceData'. They help the runtime perform
3717/// algorithmic steps in a data-agnostic manner.
3718///
3719/// 'shuffleReduceFn' is a pointer to a function that reduces data
3720/// of type 'ReduceData' across two OpenMP threads (lanes) in the
3721/// same warp. It takes the following arguments as input:
3722///
3723/// a. variable of type 'ReduceData' on the calling lane,
3724/// b. its lane_id,
3725/// c. an offset relative to the current lane_id to generate a
3726/// remote_lane_id. The remote lane contains the second
3727/// variable of type 'ReduceData' that is to be reduced.
3728/// d. an algorithm version parameter determining which reduction
3729/// algorithm to use.
3730///
3731/// 'shuffleReduceFn' retrieves data from the remote lane using
3732/// efficient GPU shuffle intrinsics and reduces, using the
3733/// algorithm specified by the 4th parameter, the two operands
3734/// element-wise. The result is written to the first operand.
3735///
3736/// Different reduction algorithms are implemented in different
3737/// runtime functions, all calling 'shuffleReduceFn' to perform
3738/// the essential reduction step. Therefore, based on the 4th
3739/// parameter, this function behaves slightly differently to
3740/// cooperate with the runtime to ensure correctness under
3741/// different circumstances.
3742///
3743/// 'interWarpCpyFn' is a pointer to a function that transfers
3744/// reduced variables across warps. It tunnels, through CUDA
3745/// shared memory, the thread-private data of type 'ReduceData'
3746/// from lane 0 of each warp to a lane in the first warp.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003747/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3748/// The last team writes the global reduced value to memory.
3749///
3750/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3751/// reduceData, shuffleReduceFn, interWarpCpyFn,
3752/// scratchpadCopyFn, loadAndReduceFn)
3753///
3754/// 'scratchpadCopyFn' is a helper that stores reduced
3755/// data from the team master to a scratchpad array in
3756/// global memory.
3757///
3758/// 'loadAndReduceFn' is a helper that loads data from
3759/// the scratchpad array and reduces it with the input
3760/// operand.
3761///
3762/// These compiler generated functions hide address
3763/// calculation and alignment information from the runtime.
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003764/// 5. if ret == 1:
3765/// The team master of the last team stores the reduced
3766/// result to the globals in memory.
3767/// foo += reduceData.foo; bar *= reduceData.bar
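///
/// For this example, the compiler-generated 'reduce_func' that
/// 'shuffleReduceFn' invokes to combine a pair of Reduce lists is morally
/// equivalent to the following sketch (the real function is emitted
/// directly as LLVM IR):
///
///   void reduce_func(void *lhs, void *rhs) {
///     ReduceData *l = (ReduceData *)lhs;
///     ReduceData *r = (ReduceData *)rhs;
///     *l->foo += *r->foo;
///     *l->bar *= *r->bar;
///   }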
3768///
3769///
3770/// Warp Reduction Algorithms
3771///
3772/// On the warp level, we have three algorithms implemented in the
3773/// OpenMP runtime depending on the number of active lanes:
3774///
3775/// Full Warp Reduction
3776///
3777/// The reduce algorithm within a warp where all lanes are active
3778/// is implemented in the runtime as follows:
3779///
3780/// full_warp_reduce(void *reduce_data,
3781/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3782/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3783/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3784/// }
3785///
3786/// The algorithm completes in log(2, WARPSIZE) steps.
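/// For WARPSIZE == 32, for instance, the loop runs with offsets 16, 8, 4,
/// 2 and 1, i.e. five shuffle-and-reduce rounds per reduce element.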
3787///
3788/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3789/// not used; we therefore save instructions by not retrieving lane_id
3790/// from the corresponding special registers.
3791/// represents the version of the algorithm being used, is set to 0 to
3792/// signify full warp reduction.
3793///
3794/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3795///
3796/// #reduce_elem refers to an element in the local lane's data structure
3797/// #remote_elem is retrieved from a remote lane
3798/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3799/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3800///
3801/// Contiguous Partial Warp Reduction
3802///
3803/// This reduce algorithm is used within a warp where only the first
3804/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3805/// number of OpenMP threads in a parallel region is not a multiple of
3806/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3807///
3808/// void
3809/// contiguous_partial_reduce(void *reduce_data,
3810/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3811/// int size, int lane_id) {
3812/// int curr_size;
3813/// int offset;
3814/// curr_size = size;
3815/// offset = curr_size/2;
3816/// while (offset>0) {
3817/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3818/// curr_size = (curr_size+1)/2;
3819/// offset = curr_size/2;
3820/// }
3821/// }
3822///
3823/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3824///
3825/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3826/// if (lane_id < offset)
3827/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3828/// else
3829/// reduce_elem = remote_elem
3830///
3831/// This algorithm assumes that the data to be reduced are located in a
3832/// contiguous subset of lanes starting from the first. When there is
3833/// an odd number of active lanes, the data in the last lane is not
3834/// aggregated with any other lane's data but is instead copied over.
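/// For example, with five active lanes the active set shrinks from 5 to 3
/// to 2 to 1 over three rounds, using offsets 2, 1 and 1.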
3835///
3836/// Dispersed Partial Warp Reduction
3837///
3838/// This algorithm is used within a warp when any discontiguous subset of
3839/// lanes are active. It is used to implement the reduction operation
3840/// across lanes in an OpenMP simd region or in a nested parallel region.
3841///
3842/// void
3843/// dispersed_partial_reduce(void *reduce_data,
3844/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3845/// int size, remote_id;
3846/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3847/// do {
3848/// remote_id = next_active_lane_id_right_after_me();
3849/// # the above function returns 0 if no active lane
3850/// # is present right after the current lane.
3851/// size = number_of_active_lanes_in_this_warp();
3852/// logical_lane_id /= 2;
3853/// ShuffleReduceFn(reduce_data, logical_lane_id,
3854/// remote_id-1-threadIdx.x, 2);
3855/// } while (logical_lane_id % 2 == 0 && size > 1);
3856/// }
3857///
3858/// There is no assumption made about the initial state of the reduction.
3859/// Any number of lanes (>=1) could be active at any position. The reduction
3860/// result is returned in the first active lane.
3861///
3862/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3863///
3864/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3865/// if (lane_id % 2 == 0 && offset > 0)
3866/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3867/// else
3868/// reduce_elem = remote_elem
3869///
3870///
3871/// Intra-Team Reduction
3872///
3873/// This function, as implemented in the runtime call
3874/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3875/// threads in a team. It first reduces within a warp using the
3876/// aforementioned algorithms. We then proceed to gather all such
3877/// reduced values at the first warp.
3878///
3879/// The runtime makes use of the function 'interWarpCpyFn', which copies
3880/// data from each of the "warp master" (zeroth lane of each warp, where
3881/// warp-reduced data is held) to the zeroth warp. This step reduces (in
3882/// a mathematical sense) the problem of reduction across warp masters in
3883/// a block to the problem of warp reduction.
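///
/// Schematically, each thread executes (a sketch of the runtime's control
/// flow, not its actual source):
///
///   warp_reduce(reduce_data);      // one of the three warp algorithms
///   interWarpCpyFn(reduce_data, num_warps); // masters -> warp 0 via smem
///   if (warp_id == 0)
///     warp_reduce(reduce_data);    // reduce across the warp masters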
3884///
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003885///
3886/// Inter-Team Reduction
3887///
3888/// Once a team has reduced its data to a single value, it is stored in
3889/// a global scratchpad array. Since each team has a distinct slot, this
3890/// can be done without locking.
3891///
3892/// The last team to write to the scratchpad array proceeds to reduce the
3893/// scratchpad array. One or more workers in the last team use the helper
3894/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
3895/// the k'th worker reduces every k'th element.
3896///
3897/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3898/// reduce across workers and compute a globally reduced value.
3899///
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003900void CGOpenMPRuntimeNVPTX::emitReduction(
3901 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
3902 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
3903 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
3904 if (!CGF.HaveInsertPoint())
3905 return;
3906
3907 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003908 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
Alexey Bataevfac26cf2018-05-02 20:03:27 +00003909 bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
3910 assert((TeamsReduction || ParallelReduction || SimdReduction) &&
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003911 "Invalid reduction selection in emitReduction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003912
Alexey Bataev7b55d2d2018-06-18 17:11:45 +00003913 if (Options.SimpleReduction) {
3914 CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
3915 ReductionOps, Options);
3916 return;
3917 }
3918
Alexey Bataev9ff80832018-04-16 20:16:21 +00003919 ASTContext &C = CGM.getContext();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003920
3921 // 1. Build a list of reduction variables.
3922 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
3923 auto Size = RHSExprs.size();
Alexey Bataev9ff80832018-04-16 20:16:21 +00003924 for (const Expr *E : Privates) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003925 if (E->getType()->isVariablyModifiedType())
3926 // Reserve place for array size.
3927 ++Size;
3928 }
3929 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
3930 QualType ReductionArrayTy =
3931 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
3932 /*IndexTypeQuals=*/0);
3933 Address ReductionList =
3934 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
3935 auto IPriv = Privates.begin();
3936 unsigned Idx = 0;
3937 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
3938 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3939 CGF.getPointerSize());
3940 CGF.Builder.CreateStore(
3941 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
3942 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
3943 Elem);
3944 if ((*IPriv)->getType()->isVariablyModifiedType()) {
3945 // Store array size.
3946 ++Idx;
3947 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
3948 CGF.getPointerSize());
3949 llvm::Value *Size = CGF.Builder.CreateIntCast(
3950 CGF.getVLASize(
3951 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
Sander de Smalen891af03a2018-02-03 13:55:59 +00003952 .NumElts,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003953 CGF.SizeTy, /*isSigned=*/false);
3954 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
3955 Elem);
3956 }
3957 }
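  // E.g. for the two-variable example in the design comment above, RedList
  // now holds { &foo_private, &bar_private }.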
3958
3959 // 2. Emit reduce_func().
Alexey Bataev9ff80832018-04-16 20:16:21 +00003960 llvm::Value *ReductionFn = emitReductionFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003961 CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
3962 Privates, LHSExprs, RHSExprs, ReductionOps);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003963
3964 // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
3965 // RedList, shuffle_reduce_func, interwarp_copy_func);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003966 llvm::Value *ThreadId = getThreadID(CGF, Loc);
3967 llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
3968 llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003969 ReductionList.getPointer(), CGF.VoidPtrTy);
3970
Alexey Bataev9ff80832018-04-16 20:16:21 +00003971 llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003972 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003973 llvm::Value *InterWarpCopyFn =
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003974 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003975
Alexey Bataevfac26cf2018-05-02 20:03:27 +00003976 llvm::Value *Args[] = {ThreadId,
3977 CGF.Builder.getInt32(RHSExprs.size()),
3978 ReductionArrayTySize,
3979 RL,
3980 ShuffleAndReduceFn,
3981 InterWarpCopyFn};
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003982
Alexey Bataevfac26cf2018-05-02 20:03:27 +00003983 llvm::Value *Res = nullptr;
3984 if (ParallelReduction)
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003985 Res = CGF.EmitRuntimeCall(
3986 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
3987 Args);
Alexey Bataevfac26cf2018-05-02 20:03:27 +00003988 else if (SimdReduction)
3989 Res = CGF.EmitRuntimeCall(
3990 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
3991 Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003992
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003993 if (TeamsReduction) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003994 llvm::Value *ScratchPadCopyFn =
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003995 emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003996 llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003997 CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003998
3999 llvm::Value *Args[] = {ThreadId,
4000 CGF.Builder.getInt32(RHSExprs.size()),
4001 ReductionArrayTySize,
4002 RL,
4003 ShuffleAndReduceFn,
4004 InterWarpCopyFn,
4005 ScratchPadCopyFn,
4006 LoadAndReduceFn};
4007 Res = CGF.EmitRuntimeCall(
4008 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
4009 Args);
4010 }
4011
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00004012 // 4. Build switch(res)
Alexey Bataev9ff80832018-04-16 20:16:21 +00004013 llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
4014 llvm::SwitchInst *SwInst =
4015 CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00004016
4017 // 5. Build case 1: where we have reduced values in the master
4018 // thread in each team.
4019 // __kmpc_end_reduce{_nowait}(<gtid>);
4020 // break;
Alexey Bataev9ff80832018-04-16 20:16:21 +00004021 llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00004022 SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
4023 CGF.EmitBlock(Case1BB);
4024
4025 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
4026 llvm::Value *EndArgs[] = {ThreadId};
Alexey Bataev9ff80832018-04-16 20:16:21 +00004027 auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00004028 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
4029 auto IPriv = Privates.begin();
4030 auto ILHS = LHSExprs.begin();
4031 auto IRHS = RHSExprs.begin();
Alexey Bataev9ff80832018-04-16 20:16:21 +00004032 for (const Expr *E : ReductionOps) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00004033 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
4034 cast<DeclRefExpr>(*IRHS));
4035 ++IPriv;
4036 ++ILHS;
4037 ++IRHS;
4038 }
4039 };
4040 RegionCodeGenTy RCG(CodeGen);
4041 NVPTXActionTy Action(
4042 nullptr, llvm::None,
4043 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
4044 EndArgs);
4045 RCG.setAction(Action);
4046 RCG(CGF);
4047 CGF.EmitBranch(DefaultBB);
4048 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
4049}
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004050
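// For a reference-typed parameter, the NVPTX target passes a restrict-
// qualified pointer in the local address space instead; e.g. (a sketch) a
// native 'int &x' is translated to roughly 'int *__restrict x', with the
// pointee placed in the global address space when it is captured by a map
// clause.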
4051const VarDecl *
4052CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4053 const VarDecl *NativeParam) const {
4054 if (!NativeParam->getType()->isReferenceType())
4055 return NativeParam;
4056 QualType ArgType = NativeParam->getType();
4057 QualifierCollector QC;
4058 const Type *NonQualTy = QC.strip(ArgType);
4059 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4060 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4061 if (Attr->getCaptureKind() == OMPC_map) {
4062 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4063 LangAS::opencl_global);
4064 }
4065 }
4066 ArgType = CGM.getContext().getPointerType(PointeeTy);
4067 QC.addRestrict();
4068 enum { NVPTX_local_addr = 5 };
Alexander Richardson6d989432017-10-15 18:48:14 +00004069 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004070 ArgType = QC.apply(CGM.getContext(), ArgType);
Alexey Bataev9ff80832018-04-16 20:16:21 +00004071 if (isa<ImplicitParamDecl>(NativeParam))
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004072 return ImplicitParamDecl::Create(
4073 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4074 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004075 return ParmVarDecl::Create(
4076 CGM.getContext(),
4077 const_cast<DeclContext *>(NativeParam->getDeclContext()),
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004078 NativeParam->getBeginLoc(), NativeParam->getLocation(),
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004079 NativeParam->getIdentifier(), ArgType,
4080 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004081}
4082
4083Address
4084CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4085 const VarDecl *NativeParam,
4086 const VarDecl *TargetParam) const {
4087 assert(NativeParam != TargetParam &&
4088 NativeParam->getType()->isReferenceType() &&
4089 "Native arg must not be the same as target arg.");
4090 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4091 QualType NativeParamType = NativeParam->getType();
4092 QualifierCollector QC;
4093 const Type *NonQualTy = QC.strip(NativeParamType);
4094 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4095 unsigned NativePointeeAddrSpace =
Alexander Richardson6d989432017-10-15 18:48:14 +00004096 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004097 QualType TargetTy = TargetParam->getType();
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004098 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004099 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004100 // First cast to generic.
4101 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4102 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4103 /*AddrSpace=*/0));
4104 // Cast from generic to native address space.
4105 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4106 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4107 NativePointeeAddrSpace));
4108 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4109 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004110 NativeParamType);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004111 return NativeParamAddr;
4112}
4113
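// Call the outlined function, bitcasting or address-space-casting each
// pointer argument to the exact parameter type the callee expects.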
4114void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
Alexey Bataev3c595a62017-08-14 15:01:03 +00004115 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004116 ArrayRef<llvm::Value *> Args) const {
4117 SmallVector<llvm::Value *, 4> TargetArgs;
Alexey Bataev07ed94a2017-08-15 14:34:04 +00004118 TargetArgs.reserve(Args.size());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004119 auto *FnType =
4120 cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
4121 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
Alexey Bataev07ed94a2017-08-15 14:34:04 +00004122 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4123 TargetArgs.append(std::next(Args.begin(), I), Args.end());
4124 break;
4125 }
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004126 llvm::Type *TargetType = FnType->getParamType(I);
4127 llvm::Value *NativeArg = Args[I];
4128 if (!TargetType->isPointerTy()) {
4129 TargetArgs.emplace_back(NativeArg);
4130 continue;
4131 }
4132 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataevc99042b2018-03-15 18:10:54 +00004133 NativeArg,
4134 NativeArg->getType()->getPointerElementType()->getPointerTo());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004135 TargetArgs.emplace_back(
4136 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4137 }
Alexey Bataev3c595a62017-08-14 15:01:03 +00004138 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004139}
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004140
4141/// Emit a function that wraps the outlined parallel region
4142/// and controls the arguments that are passed to this function.
4143/// The wrapper ensures that the outlined function is called
4144/// with the correct arguments when data is shared.
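///
/// A sketch of the wrapper's signature (parameter names are illustrative):
///
///   void <outlined_fn>_wrapper(uint16_t parallel_level,
///                              uint32_t source_thread_id);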
4145llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4146 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4147 ASTContext &Ctx = CGM.getContext();
4148 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4149
4150 // Create a function that takes as argument the source thread.
4151 FunctionArgList WrapperArgs;
4152 QualType Int16QTy =
4153 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4154 QualType Int32QTy =
4155 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004156 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004157 /*Id=*/nullptr, Int16QTy,
4158 ImplicitParamDecl::Other);
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004159 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004160 /*Id=*/nullptr, Int32QTy,
4161 ImplicitParamDecl::Other);
4162 WrapperArgs.emplace_back(&ParallelLevelArg);
4163 WrapperArgs.emplace_back(&WrapperArg);
4164
Alexey Bataev9ff80832018-04-16 20:16:21 +00004165 const CGFunctionInfo &CGFI =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004166 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4167
4168 auto *Fn = llvm::Function::Create(
4169 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
Alexey Bataev9ff80832018-04-16 20:16:21 +00004170 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
Alexey Bataevc99042b2018-03-15 18:10:54 +00004171 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004172 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00004173 Fn->setDoesNotRecurse();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004174
4175 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4176 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004177 D.getBeginLoc(), D.getBeginLoc());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004178
4179 const auto *RD = CS.getCapturedRecordDecl();
4180 auto CurField = RD->field_begin();
4181
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00004182 Address ZeroAddr = CGF.CreateMemTemp(
4183 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4184 /*Name*/ ".zero.addr");
4185 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004186 // Get the array of arguments.
4187 SmallVector<llvm::Value *, 8> Args;
4188
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00004189 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4190 Args.emplace_back(ZeroAddr.getPointer());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004191
4192 CGBuilderTy &Bld = CGF.Builder;
4193 auto CI = CS.capture_begin();
4194
4195 // Use global memory for data sharing.
4196 // Handle passing of global args to workers.
4197 Address GlobalArgs =
4198 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4199 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4200 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4201 CGF.EmitRuntimeCall(
4202 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4203 DataSharingArgs);
4204
4205 // Retrieve the shared variables from the list of references returned
4206 // by the runtime. Pass the variables to the outlined function.
Alexey Bataev17314212018-03-20 15:41:05 +00004207 Address SharedArgListAddress = Address::invalid();
4208 if (CS.capture_size() > 0 ||
4209 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4210 SharedArgListAddress = CGF.EmitLoadOfPointer(
4211 GlobalArgs, CGF.getContext()
4212 .getPointerType(CGF.getContext().getPointerType(
4213 CGF.getContext().VoidPtrTy))
4214 .castAs<PointerType>());
4215 }
4216 unsigned Idx = 0;
4217 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4218 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4219 CGF.getPointerSize());
4220 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4221 Src, CGF.SizeTy->getPointerTo());
4222 llvm::Value *LB = CGF.EmitLoadOfScalar(
4223 TypedAddress,
4224 /*Volatile=*/false,
4225 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4226 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4227 Args.emplace_back(LB);
4228 ++Idx;
4229 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4230 CGF.getPointerSize());
4231 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4232 Src, CGF.SizeTy->getPointerTo());
4233 llvm::Value *UB = CGF.EmitLoadOfScalar(
4234 TypedAddress,
4235 /*Volatile=*/false,
4236 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4237 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4238 Args.emplace_back(UB);
4239 ++Idx;
4240 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004241 if (CS.capture_size() > 0) {
4242 ASTContext &CGFContext = CGF.getContext();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004243 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4244 QualType ElemTy = CurField->getType();
Alexey Bataev17314212018-03-20 15:41:05 +00004245 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
4246 CGF.getPointerSize());
4247 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004248 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4249 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4250 /*Volatile=*/false,
4251 CGFContext.getPointerType(ElemTy),
4252 CI->getLocation());
Alexey Bataev2091ca62018-04-23 17:33:41 +00004253 if (CI->capturesVariableByCopy() &&
4254 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
Alexey Bataev17314212018-03-20 15:41:05 +00004255 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4256 CI->getLocation());
4257 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004258 Args.emplace_back(Arg);
4259 }
4260 }
4261
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004262 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004263 CGF.FinishFunction();
4264 return Fn;
4265}
4266
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}

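/// Return the address of a local variable that was globalized by
/// emitFunctionProlog, or Address::invalid() if the variable was not
/// globalized and the default alloca-based emission should be used.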
Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
         E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }
  return Address::invalid();
}

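/// Release the globalization bookkeeping recorded for the finished function.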
void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

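/// In SPMD mode, 'distribute' defaults to dist_schedule(static, <chunk>)
/// with the chunk set to the NVPTX thread count of the team; in generic mode
/// the base implementation chooses the default.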
void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}

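/// Worksharing loops on the device default to schedule(static, 1); with a
/// chunk of one iteration, consecutive threads touch consecutive elements,
/// which favors coalesced memory accesses.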
void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}

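/// For target-based directives, fix up lambdas that capture by reference:
/// the lambda's capture fields must point at the device copies of the
/// captured variables rather than at the host ones. E.g. (sketch):
///   int a;
///   auto L = [&a]() { ++a; };
///   #pragma omp target
///   L(); // 'a' inside L must resolve to the mapped device copy.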
void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         "Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}

/// Get number of SMs and number of blocks per SM.
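/// The pair is used in clear() below to dimension the statically allocated
/// global-memory buffer for globalized records: one slot per potentially
/// resident thread block (SMs x blocks per SM).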
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
  std::pair<unsigned, unsigned> Data;
  if (CGM.getLangOpts().OpenMPCUDANumSMs)
    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
  if (Data.first && Data.second)
    return Data;
  if (CGM.getTarget().hasFeature("ptx")) {
    llvm::StringMap<bool> Features;
    CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
                                   CGM.getTarget().getTargetOpts().CPU,
                                   CGM.getTarget().getTargetOpts().Features);
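    // Map the sm_xx target feature to per-architecture bounds of the form
    // {number of SMs, number of blocks per SM}.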
    for (const auto &Feature : Features) {
      if (Feature.getValue()) {
        switch (StringToCudaArch(Feature.getKey())) {
        case CudaArch::SM_20:
        case CudaArch::SM_21:
        case CudaArch::SM_30:
        case CudaArch::SM_32:
        case CudaArch::SM_35:
        case CudaArch::SM_37:
        case CudaArch::SM_50:
        case CudaArch::SM_52:
        case CudaArch::SM_53:
          return {16, 16};
        case CudaArch::SM_60:
        case CudaArch::SM_61:
        case CudaArch::SM_62:
          return {56, 32};
        case CudaArch::SM_70:
        case CudaArch::SM_72:
        case CudaArch::SM_75:
          return {84, 32};
        case CudaArch::GFX600:
        case CudaArch::GFX601:
        case CudaArch::GFX700:
        case CudaArch::GFX701:
        case CudaArch::GFX702:
        case CudaArch::GFX703:
        case CudaArch::GFX704:
        case CudaArch::GFX801:
        case CudaArch::GFX802:
        case CudaArch::GFX803:
        case CudaArch::GFX810:
        case CudaArch::GFX900:
        case CudaArch::GFX902:
        case CudaArch::GFX904:
        case CudaArch::GFX906:
        case CudaArch::GFX909:
        case CudaArch::UNKNOWN:
          break;
        case CudaArch::LAST:
          llvm_unreachable("Unexpected Cuda arch.");
        }
      }
    }
  }
  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}

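/// Finalize the globalized-record buffers for the module: lay out all
/// globalized records in implicit unions, back them with a buffer in CUDA
/// shared memory when the union fits into SharedMemorySize bytes and with a
/// [num SMs][blocks per SM] array in global memory otherwise, then replace
/// and erase the placeholder buffer variables.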
void CGOpenMPRuntimeNVPTX::clear() {
  if (!GlobalizedRecords.empty()) {
    ASTContext &C = CGM.getContext();
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    SharedStaticRD->startDefinition();
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      unsigned Size = 0;
      unsigned RecAlignment = 0;
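      // Records sharing this buffer are laid out one after another, each at
      // its natural alignment; the buffer must be large enough to hold all
      // of them at once.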
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = C.getRecordType(RD);
        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
      const bool UseSharedMemory = Size <= SharedMemorySize;
      auto *Field =
          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
                            SourceLocation(), SourceLocation(), nullptr, SubTy,
                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
                            /*BW=*/nullptr, /*Mutable=*/false,
                            /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (UseSharedMemory) {
        SharedStaticRD->addDecl(Field);
        SharedRecs.push_back(&Records);
      } else {
        StaticRD->addDecl(Field);
        GlobalRecs.push_back(&Records);
      }
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
      Records.UseSharedMemory->setInitializer(
          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
    }
    SharedStaticRD->completeDefinition();
    if (!SharedStaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(SharedStaticRD);
      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMStaticTy,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::Constant::getNullValue(LLVMStaticTy),
          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
          llvm::GlobalValue::NotThreadLocal,
          C.getTargetAddressSpace(LangAS::cuda_shared));
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
    StaticRD->completeDefinition();
    if (!StaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(StaticRD);
      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
      llvm::APInt Size1(32, SMsBlockPerSM.second);
      QualType Arr1Ty =
          C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::APInt Size2(32, SMsBlockPerSM.first);
      QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
                                               /*IndexTypeQuals=*/0);
      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMArr2Ty,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::Constant::getNullValue(LLVMArr2Ty),
          "_openmp_static_glob_rd_$_");
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
  }
  CGOpenMPRuntime::clear();
}