blob: 8a9845489273eff44621b5f639d83a50034b1cb8 [file] [log] [blame]
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//
14
15#include "CGOpenMPRuntimeNVPTX.h"
Carlo Bertollic6872252016-04-04 15:55:02 +000016#include "CodeGenFunction.h"
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +000017#include "clang/AST/DeclOpenMP.h"
Carlo Bertollic6872252016-04-04 15:55:02 +000018#include "clang/AST/StmtOpenMP.h"
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +000019#include "clang/AST/StmtVisitor.h"
Alexey Bataeve4090182018-11-02 14:54:07 +000020#include "clang/Basic/Cuda.h"
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +000021#include "llvm/ADT/SmallPtrSet.h"
Samuel Antao45bfe4c2016-02-08 15:59:20 +000022
23using namespace clang;
24using namespace CodeGen;
25
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000026namespace {
/// Entry points of the NVPTX OpenMP device runtime that this code generator
/// may emit calls to. The expected C signature of each runtime function is
/// documented on the corresponding enumerator.
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_coalesced_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs)
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size,
  /// int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +0000110
/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
/// Emits a runtime call on entry to a region (\p EnterCallee with
/// \p EnterArgs) and another on exit (\p ExitCallee with \p ExitArgs). When
/// \p Conditional is true, the result of the enter call guards the region:
/// the body is emitted into an "omp_if.then" block that is only reached when
/// the enter call returned a non-null/non-zero value.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  // Continuation block; only set by Enter() when Conditional is true.
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  /// Emit the enter runtime call and, in the conditional case, open the
  /// guarded "then" block.
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  /// Close the guarded region opened by a conditional Enter().
  /// NOTE(review): only meaningful after Enter() with Conditional == true --
  /// ContBlock stays null otherwise.
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  /// Emit the exit runtime call.
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000146
Alexey Bataevd7ff6d62018-05-07 14:50:05 +0000147/// A class to track the execution mode when codegening directives within
148/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
149/// to the target region and used by containing directives such as 'parallel'
150/// to emit optimized code.
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000151class ExecutionModeRAII {
152private:
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000153 CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
154 CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000155
156public:
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000157 ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
158 : Mode(Mode) {
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000159 SavedMode = Mode;
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000160 Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
161 : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000162 }
163 ~ExecutionModeRAII() { Mode = SavedMode; }
164};
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000165
/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  /// Number of threads per warp on all known NVPTX devices.
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  /// Mask extracting the lane id from a thread id (WarpSize - 1).
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,

  /// Maximal size of the shared memory buffer.
  SharedMemorySize = 128,
};
183
/// Identifiers for the named barriers used by the generated code.
enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +0000190
Alexey Bataev2adecff2018-09-21 14:22:53 +0000191typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
192static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
193 return P1.first > P2.first;
194}
195
/// Build the implicit record type "_globalized_locals_ty" that holds all
/// variables which must be moved off the native stack ("globalized").
/// \param EscapedDecls variables escaping from a parallel context; each gets
///        an array field of WarpSize elements, aligned to at least
///        GlobalMemoryAlignment.
/// \param EscapedDeclsForTeams variables escaping in a teams context; each
///        gets a single field with its natural alignment (and its original
///        AlignedAttrs copied over).
/// \param MappedDeclsFields out-map filled with the field created for each
///        variable.
/// \returns the completed record, or nullptr if there is nothing to
///          globalize.
static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  // Parallel-context variables are bumped to at least GlobalMemoryAlignment.
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  // Sort by decreasing alignment; stable so declaration order is kept within
  // each alignment class.
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    // References are globalized as pointers to the referenced type.
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      // Teams-context variable: a single field; copy the variable's existing
      // AlignedAttrs onto it.
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      // Parallel-context variable: an array of WarpSize elements, explicitly
      // aligned to max(decl alignment, GlobalMemoryAlignment).
      llvm::APInt ArraySize(32, WarpSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
269
/// Get the list of variables that can escape their declaration context.
/// Statement visitor that walks a target region and records every local
/// variable whose address may be observed outside its stack frame: captured
/// by reference, address taken via '&', decayed from an array, or passed as
/// an lvalue argument. Such variables must be globalized; the visitor can
/// also build (lazily) the record type that holds them.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  // Escaped variables with statically-known size.
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  // Escaped variables with variably modified types (handled separately since
  // their size is not a compile-time constant).
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  // Escaped formal parameters that were captured by value.
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  // Lazily-built record of all globalized variables; see
  // buildRecordForGlobalizedVars().
  RecordDecl *GlobalizedRD = nullptr;
  // Maps each globalized variable to its field in GlobalizedRD.
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  // When true, any DeclRefExpr visited is treated as escaping (set while
  // walking address-of/decay subexpressions and lvalue call arguments).
  bool AllEscaped = false;
  // True while analyzing the inner parallel part of a combined
  // distribute+parallel directive.
  bool IsForCombinedParallelRegion = false;

  /// Record \p VD as escaped, classifying it into EscapedParameters,
  /// EscapedVariableLengthDecls or EscapedDecls as appropriate.
  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if need to capture the variable that was already captured by
        // value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (!isOpenMPPrivate(
                  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
              Attr->getCaptureKind() == OMPC_map)
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  /// Analyze a declaration: reference-typed declarations escape, and the
  /// initializer (if any) is visited with AllEscaped set accordingly.
  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  /// Mark by-reference captures of an OpenMP captured statement as escaped.
  /// For combined parallel regions, also checks the directive's privatization
  /// clauses to decide whether a variable privatized in the combined
  /// construct must still be shared in the inner parallel directive.
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  /// Build GlobalizedRD from the escaped declarations collected so far.
  /// In a teams/distribute (TTD) region the variables go in as single fields;
  /// otherwise they go in as per-warp arrays (see the file-level
  /// buildRecordForGlobalizedVars).
  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  /// Visit every declaration in the statement.
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  /// Analyze a nested OpenMP directive through its associated captured
  /// statement, if it has one.
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  /// By-reference captures of a captured statement escape.
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  /// By-reference lambda captures escape.
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  /// By-reference block captures escape.
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  /// Lvalue arguments of a call may be modified by the callee, so visit them
  /// with AllEscaped set; other arguments and the callee are visited normally.
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  /// A variable reference escapes when we are inside an escaping context
  /// (AllEscaped).
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  /// Taking an address ('&') makes the operand escape.
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  /// Array-to-pointer decay exposes the array's address, so the operand
  /// escapes.
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  /// Generic expression: rvalue contexts reset AllEscaped for the children.
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  /// Generic statement: recurse into all children.
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and used
  /// instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Checks if the escaped local variable is actually a parameter passed by
  /// value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +0000572} // anonymous namespace
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000573
574/// Get the GPU warp size.
575static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000576 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000577 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000578 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000579 "nvptx_warp_size");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000580}
581
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000582/// Get the id of the current thread on the GPU.
583static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000584 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000585 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000586 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000587 "nvptx_tid");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000588}
589
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000590/// Get the id of the warp in the block.
591/// We assume that the warp size is 32, which is always the case
592/// on the NVPTX device, to generate more efficient code.
593static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
594 CGBuilderTy &Bld = CGF.Builder;
595 return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
596}
597
598/// Get the id of the current lane in the Warp.
599/// We assume that the warp size is 32, which is always the case
600/// on the NVPTX device, to generate more efficient code.
601static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
602 CGBuilderTy &Bld = CGF.Builder;
603 return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
604 "nvptx_lane_id");
605}
606
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000607/// Get the maximum number of threads in a block of the GPU.
608static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000609 return CGF.EmitRuntimeCall(
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000610 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000611 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
Alexey Bataev3c595a62017-08-14 15:01:03 +0000612 "nvptx_num_threads");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000613}
614
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000615/// Get barrier to synchronize all threads in a block.
616static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
Alexey Bataev3c595a62017-08-14 15:01:03 +0000617 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000618 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000619}
620
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000621/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
622/// a CTA.
623static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
624 llvm::Value *NumThreads) {
625 CGBuilderTy &Bld = CGF.Builder;
626 llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
Alexey Bataev3c595a62017-08-14 15:01:03 +0000627 CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
628 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
629 Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000630}
631
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000632/// Synchronize all GPU threads in a block.
633static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000634
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +0000635/// Synchronize worker threads in a parallel region.
636static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
637 return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
638}
639
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000640/// Get the value of the thread_limit clause in the teams directive.
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000641/// For the 'generic' execution mode, the runtime encodes thread_limit in
642/// the launch parameters, always starting thread_limit+warpSize threads per
643/// CTA. The threads in the last warp are reserved for master execution.
644/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
645static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
Alexey Bataev4065b9a2018-06-21 20:26:33 +0000646 bool IsInSPMDExecutionMode = false) {
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000647 CGBuilderTy &Bld = CGF.Builder;
Alexey Bataev4065b9a2018-06-21 20:26:33 +0000648 return IsInSPMDExecutionMode
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000649 ? getNVPTXNumThreads(CGF)
Alexey Bataeve290ec02018-04-06 16:03:36 +0000650 : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
651 "thread_limit");
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +0000652}
653
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000654/// Get the thread id of the OMP master thread.
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000655/// The master thread id is the first thread (lane) of the last warp in the
656/// GPU block. Warp size is assumed to be some power of 2.
657/// Thread id is 0 indexed.
658/// E.g: If NumThreads is 33, master id is 32.
659/// If NumThreads is 64, master id is 32.
660/// If NumThreads is 1024, master id is 992.
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +0000661static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000662 CGBuilderTy &Bld = CGF.Builder;
663 llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
664
665 // We assume that the warp size is a power of 2.
Alexey Bataeve290ec02018-04-06 16:03:36 +0000666 llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000667
Alexey Bataeve290ec02018-04-06 16:03:36 +0000668 return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000669 Bld.CreateNot(Mask), "master_tid");
670}
671
// Build the per-target-region worker state: the nullary function info is
// computed here and the worker llvm::Function is created eagerly (with a
// placeholder name) so it can be referenced before its body is emitted.
CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000678
679void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
680 CodeGenModule &CGM) {
681 // Create an worker function with no arguments.
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000682
683 WorkerFn = llvm::Function::Create(
Alexey Bataev9ff80832018-04-16 20:16:21 +0000684 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
Alexey Bataevaee93892018-01-08 20:09:47 +0000685 /*placeholder=*/"_worker", &CGM.getModule());
Alexey Bataev9ff80832018-04-16 20:16:21 +0000686 CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +0000687 WorkerFn->setDoesNotRecurse();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000688}
689
/// Return the execution mode of the target region currently being emitted
/// (tracked in CurrentExecutionMode, set via ExecutionModeRAII in the
/// kernel-emission entry points).
CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}
694
Alexey Bataevd7ff6d62018-05-07 14:50:05 +0000695static CGOpenMPRuntimeNVPTX::DataSharingMode
696getDataSharingMode(CodeGenModule &CGM) {
697 return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
698 : CGOpenMPRuntimeNVPTX::Generic;
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +0000699}
700
Alexey Bataev8bcc69c2018-11-09 20:03:19 +0000701// Checks if the expression is constant or does not have non-trivial function
702// calls.
703static bool isTrivial(ASTContext &Ctx, const Expr * E) {
704 // We can skip constant expressions.
705 // We can skip expressions with trivial calls or simple expressions.
706 return (E->isEvaluatable(Ctx, Expr::SE_AllowUndefinedBehavior) ||
707 !E->hasNonTrivialCall(Ctx)) &&
708 !E->HasSideEffects(Ctx, /*IncludePossibleEffects=*/true);
709}
710
/// Checks if the \p Body is the \a CompoundStmt and returns its child statement
/// iff there is only one that is not evaluatable at the compile time.
/// Otherwise (not a CompoundStmt, zero interesting children, or more than
/// one) returns \p Body itself.
static const Stmt *getSingleCompoundChild(ASTContext &Ctx, const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body)) {
    const Stmt *Child = nullptr;
    for (const Stmt *S : C->body()) {
      // Skip expressions that are trivially evaluatable at compile time.
      if (const auto *E = dyn_cast<Expr>(S)) {
        if (isTrivial(Ctx, E))
          continue;
      }
      // Some of the statements can be ignored.
      if (isa<AsmStmt>(S) || isa<NullStmt>(S) || isa<OMPFlushDirective>(S) ||
          isa<OMPBarrierDirective>(S) || isa<OMPTaskyieldDirective>(S))
        continue;
      // Analyze declarations: a DeclStmt is ignorable only if every
      // declaration in it is "uninteresting" per the predicate below.
      if (const auto *DS = dyn_cast<DeclStmt>(S)) {
        if (llvm::all_of(DS->decls(), [&Ctx](const Decl *D) {
              // Type/pragma/using/OpenMP-metadata declarations never emit
              // code, so they cannot be the significant child.
              if (isa<EmptyDecl>(D) || isa<DeclContext>(D) ||
                  isa<TypeDecl>(D) || isa<PragmaCommentDecl>(D) ||
                  isa<PragmaDetectMismatchDecl>(D) || isa<UsingDecl>(D) ||
                  isa<UsingDirectiveDecl>(D) ||
                  isa<OMPDeclareReductionDecl>(D) ||
                  isa<OMPThreadPrivateDecl>(D))
                return true;
              const auto *VD = dyn_cast<VarDecl>(D);
              if (!VD)
                return false;
              // A variable is ignorable if it is constexpr, or is of a
              // trivial/reference type with no initializer or a trivial one.
              return VD->isConstexpr() ||
                     ((VD->getType().isTrivialType(Ctx) ||
                       VD->getType()->isReferenceType()) &&
                      (!VD->hasInit() || isTrivial(Ctx, VD->getInit())));
            }))
          continue;
      }
      // Found multiple children - cannot get the one child only.
      if (Child)
        return Body;
      Child = S;
    }
    if (Child)
      return Child;
  }
  return Body;
}
755
756/// Check if the parallel directive has an 'if' clause with non-constant or
Alexey Bataev2a3320a2018-05-15 18:01:01 +0000757/// false condition. Also, check if the number of threads is strictly specified
758/// and run those directives in non-SPMD mode.
759static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
760 const OMPExecutableDirective &D) {
761 if (D.hasClausesOfKind<OMPNumThreadsClause>())
762 return true;
Alexey Bataevbf5c8482018-05-10 18:32:08 +0000763 for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
764 OpenMPDirectiveKind NameModifier = C->getNameModifier();
765 if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
766 continue;
767 const Expr *Cond = C->getCondition();
768 bool Result;
769 if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
770 return true;
771 }
772 return false;
773}
774
/// Check for inner (nested) SPMD construct, if any. Returns true iff the
/// single significant child of \p D (a target or target-teams directive)
/// is a parallel construct that is not forced into non-SPMD mode by an
/// 'if' or 'num_threads' clause.
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // target { parallel ... }
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      // target { teams { parallel ... } }: look one level deeper.
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      // target teams { parallel ... }
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    // The remaining kinds are either combined constructs (handled directly
    // by supportsSPMDExecutionMode) or cannot appear as the outer target
    // directive here.
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
865
/// Returns true if the target region described by \p D can be emitted in
/// SPMD execution mode (all CTA threads execute the region) rather than
/// the generic master/worker scheme.
static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    // Depends on whether the nested construct is an SPMD-compatible
    // parallel region.
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // Combined parallel constructs are SPMD unless an 'if' or
    // 'num_threads' clause forces the generic scheme.
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  // Anything below is not a target entry directive and must never reach
  // this query.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
932
933/// Check if the directive is loops based and has schedule clause at all or has
934/// static scheduling.
935static bool hasStaticScheduling(const OMPExecutableDirective &D) {
936 assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
937 isOpenMPLoopDirective(D.getDirectiveKind()) &&
938 "Expected loop-based directive.");
939 return !D.hasClausesOfKind<OMPOrderedClause>() &&
940 (!D.hasClausesOfKind<OMPScheduleClause>() ||
941 llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
942 [](const OMPScheduleClause *C) {
943 return C->getScheduleKind() == OMPC_SCHEDULE_static;
944 }));
945}
946
/// Check for inner (nested) lightweight runtime construct, if any: walks
/// the (single significant) children of an SPMD-mode directive looking
/// for a statically scheduled worksharing loop, possibly through
/// intermediate 'teams' and/or 'parallel' levels.
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Ctx, Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      // target { parallel-for (static) }
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      // target { parallel { for (static) } }
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        // target { teams { ... } }: repeat the same analysis one level down.
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          // teams { parallel-for (static) }
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          // teams { parallel { for (static) } }
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Ctx, Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      // target teams { parallel-for (static) }
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      // target teams { parallel { for (static) } }
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Ctx, Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      // target parallel { for (static) }
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    // The remaining kinds are either combined loop constructs (handled
    // directly by supportsLightweightRuntime) or not target directives.
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
1083
/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  // Lightweight runtime is only possible for SPMD-mode regions.
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    // The loop construct is nested; inspect the children.
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  // Anything below is not a target entry directive and must never reach
  // this query.
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
1155
1156void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001157 StringRef ParentName,
1158 llvm::Function *&OutlinedFn,
1159 llvm::Constant *&OutlinedFnID,
1160 bool IsOffloadEntry,
1161 const RegionCodeGenTy &CodeGen) {
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001162 ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001163 EntryFunctionState EST;
Stephen Kellyf2ceec42018-08-09 21:08:08 +00001164 WorkerFunctionState WST(CGM, D.getBeginLoc());
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00001165 Work.clear();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001166 WrapperFunctionsMap.clear();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001167
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001168 // Emit target region as a standalone region.
1169 class NVPTXPrePostActionTy : public PrePostActionTy {
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001170 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
1171 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001172
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001173 public:
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001174 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001175 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001176 : EST(EST), WST(WST) {}
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001177 void Enter(CodeGenFunction &CGF) override {
Alexey Bataeve4090182018-11-02 14:54:07 +00001178 auto &RT =
1179 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
Alexey Bataev6bc27322018-10-05 15:27:47 +00001180 RT.emitNonSPMDEntryHeader(CGF, EST, WST);
1181 // Skip target region initialization.
1182 RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001183 }
1184 void Exit(CodeGenFunction &CGF) override {
Alexey Bataeve4090182018-11-02 14:54:07 +00001185 auto &RT =
1186 static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
Alexey Bataev6bc27322018-10-05 15:27:47 +00001187 RT.clearLocThreadIdInsertPt(CGF);
1188 RT.emitNonSPMDEntryFooter(CGF, EST);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001189 }
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001190 } Action(EST, WST);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001191 CodeGen.setAction(Action);
Alexey Bataev4ac58d12018-10-12 20:19:59 +00001192 IsInTTDRegion = true;
Alexey Bataeve4090182018-11-02 14:54:07 +00001193 // Reserve place for the globalized memory.
1194 GlobalizedRecords.emplace_back();
Alexey Bataeve4090182018-11-02 14:54:07 +00001195 if (!KernelStaticGlobalized) {
1196 KernelStaticGlobalized = new llvm::GlobalVariable(
1197 CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
1198 llvm::GlobalValue::InternalLinkage,
1199 llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
1200 "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
1201 llvm::GlobalValue::NotThreadLocal,
1202 CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
1203 }
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001204 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
1205 IsOffloadEntry, CodeGen);
Alexey Bataev4ac58d12018-10-12 20:19:59 +00001206 IsInTTDRegion = false;
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001207
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001208 // Now change the name of the worker function to correspond to this target
1209 // region's entry function.
Alexey Bataev9ff80832018-04-16 20:16:21 +00001210 WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));
Alexey Bataevaee93892018-01-08 20:09:47 +00001211
1212 // Create the worker function
1213 emitWorkerFunction(WST);
Arpith Chacko Jacob406acdb2017-01-05 15:24:05 +00001214}
1215
// Setup NVPTX threads for master-worker OpenMP scheme: route worker
// threads into the worker loop, non-master non-worker threads to the
// exit, and fall through to the sequential (master) region on the master
// thread only.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Threads below the thread limit are workers.
  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}
1258
/// Finish the generic-mode kernel entry: tear down globalized variables,
/// signal kernel termination to the workers, and emit the common exit
/// block shared by master and worker paths.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
1287
// Emit a target region kernel in SPMD mode, where all threads execute the
// region body directly and no master/worker state machine is needed. The
// SPMD header/footer are attached around the outlined region via a
// PrePostActionTy callback.
void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  // Record SPMD as the current execution mode for the duration of this kernel.
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  // Lazily create (once) the CUDA-shared-address-space pointer used to refer
  // to the kernel's static globalized data.
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}
1335
// Emit the entry header of an SPMD-mode kernel: initialize the OpenMP runtime
// state (and, when the full runtime is needed, the data-sharing stack) in all
// active threads, then fall through to the region body in the .execute block.
void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // The full runtime is required either when forced via the
  // OpenMPCUDAForceFullRuntime language option or when the directive cannot be
  // served by the lightweight runtime.
  bool RequiresFullRuntime = CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
                             !supportsLightweightRuntime(CGF.getContext(), D);
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/
                         Bld.getInt16(RequiresFullRuntime ? 1 : 0),
                         /*RequiresDataSharing=*/Bld.getInt16(0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  // The target body emitted after this header runs in the master-thread
  // region (in SPMD mode every thread takes this path).
  IsInTargetMasterThreadRegion = true;
}
1367
// Emit the exit footer of an SPMD-mode kernel: branch all active threads into
// a .omp.deinit block that tears down the runtime state, then to the common
// exit block.
void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  // Reset the per-entry state for the next kernel emission.
  EST.ExitBB = nullptr;
}
1389
1390// Create a unique global variable to indicate the execution mode of this target
1391// region. The execution mode is either 'generic', or 'spmd' depending on the
1392// target directive. This variable is picked up by the offload library to setup
1393// the device appropriately before kernel launch. If the execution mode is
1394// 'generic', the runtime reserves one warp for the master, otherwise, all
1395// warps participate in parallel work.
1396static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001397 bool Mode) {
1398 auto *GVMode =
1399 new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
1400 llvm::GlobalValue::WeakAnyLinkage,
1401 llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
1402 Twine(Name, "_exec_mode"));
Alexey Bataev9ff80832018-04-16 20:16:21 +00001403 CGM.addCompilerUsedGlobal(GVMode);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001404}
1405
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001406void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
Gheorghe-Teodor Berceaeb89b1d2017-11-21 15:54:54 +00001407 ASTContext &Ctx = CGM.getContext();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001408
1409 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
Alexey Bataev9ff80832018-04-16 20:16:21 +00001410 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
Alexey Bataev7cae94e2018-01-04 19:45:16 +00001411 WST.Loc, WST.Loc);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001412 emitWorkerLoop(CGF, WST);
1413 CGF.FinishFunction();
1414}
1415
// Emit the worker state machine for generic-mode kernels. Workers spin in
// .await.work, rendezvous with the master at CTA barriers, and dispatch the
// outlined parallel function the master published (by pointer) for each
// parallel region, until signalled to terminate.
void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  // Temporaries that receive the published work function pointer and the
  // "am I needed for this region" flag from __kmpc_kernel_parallel.
  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  // Each known outlined function gets a direct-call fast path guarded by a
  // pointer comparison against the published work function.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    // - the parallelism level;
    // - the thread ID;
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false)
          ->getPointerTo();
  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  // - the parallelism level;
  // - the thread ID;
  emitCall(CGF, WST.Loc, WorkFnCast,
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
1531
/// Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// Each case only builds the LLVM function type and obtains a declaration of
/// the runtime entry point via CGM.CreateRuntimeFunction; no body is emitted
/// here.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
    /// int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    // The return type mirrors the C++ 'bool' as lowered for this target.
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    auto *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
    /// Build void __kmpc_data_sharing_init_stack_spmd();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack: {
    // Build void *__kmpc_data_sharing_coalesced_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_coalesced_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_level: {
    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
    break;
  }
  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
    // Build int8_t __kmpc_is_spmd_exec_mode();
    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
    // Build void __kmpc_get_team_static_memory(const void *buf, size_t size,
    // int16_t is_shared, const void **res);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty,
                                CGM.VoidPtrPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
    break;
  }
  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
    // Build void __kmpc_restore_team_static_memory(int16_t is_shared);
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
    break;
  }
  }
  return RTLFn;
}
1834
1835void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
1836 llvm::Constant *Addr,
Alexey Bataev03f270c2018-03-30 18:31:07 +00001837 uint64_t Size, int32_t,
1838 llvm::GlobalValue::LinkageTypes) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001839 // TODO: Add support for global variables on the device after declare target
1840 // support.
Alexey Bataev9ff80832018-04-16 20:16:21 +00001841 if (!isa<llvm::Function>(Addr))
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001842 return;
Alexey Bataev9ff80832018-04-16 20:16:21 +00001843 llvm::Module &M = CGM.getModule();
1844 llvm::LLVMContext &Ctx = CGM.getLLVMContext();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001845
1846 // Get "nvvm.annotations" metadata node
Alexey Bataev9ff80832018-04-16 20:16:21 +00001847 llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001848
1849 llvm::Metadata *MDVals[] = {
Alexey Bataev9ff80832018-04-16 20:16:21 +00001850 llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001851 llvm::ConstantAsMetadata::get(
1852 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
1853 // Append metadata to nvvm.annotations
1854 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
1855}
1856
1857void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
1858 const OMPExecutableDirective &D, StringRef ParentName,
1859 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Alexey Bataev14fa1c62016-03-29 05:34:15 +00001860 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001861 if (!IsOffloadEntry) // Nothing to do.
1862 return;
1863
1864 assert(!ParentName.empty() && "Invalid target region parent name!");
1865
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001866 bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001867 if (Mode)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001868 emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001869 CodeGen);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001870 else
1871 emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
1872 CodeGen);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00001873
1874 setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001875}
1876
Samuel Antao45bfe4c2016-02-08 15:59:20 +00001877CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001878 : CGOpenMPRuntime(CGM, "_", "$") {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001879 if (!CGM.getLangOpts().OpenMPIsDevice)
1880 llvm_unreachable("OpenMP NVPTX can only handle device code.");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +00001881}
Carlo Bertollic6872252016-04-04 15:55:02 +00001882
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001883void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
1884 OpenMPProcBindClauseKind ProcBind,
1885 SourceLocation Loc) {
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001886 // Do nothing in case of SPMD mode and L0 parallel.
Alexey Bataev2a3320a2018-05-15 18:01:01 +00001887 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Arpith Chacko Jacob2cd6eea2017-01-25 16:55:10 +00001888 return;
1889
1890 CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
1891}
1892
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001893void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
1894 llvm::Value *NumThreads,
1895 SourceLocation Loc) {
Alexey Bataev4065b9a2018-06-21 20:26:33 +00001896 // Do nothing in case of SPMD mode and L0 parallel.
Alexey Bataev2a3320a2018-05-15 18:01:01 +00001897 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Arpith Chacko Jacobe04da5d2017-01-25 01:18:34 +00001898 return;
1899
1900 CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
1901}
1902
// Intentionally a no-op on the NVPTX device: no device-side code is emitted
// for the num_teams/thread_limit clauses here (presumably they are honored
// elsewhere, e.g. at kernel launch on the host side — confirm against the
// host code path).
void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}
1907
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001908llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
1909 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
1910 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00001911 // Emit target region as a standalone region.
1912 class NVPTXPrePostActionTy : public PrePostActionTy {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001913 bool &IsInParallelRegion;
1914 bool PrevIsInParallelRegion;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001915
1916 public:
Alexey Bataevb99dcb52018-07-09 17:43:58 +00001917 NVPTXPrePostActionTy(bool &IsInParallelRegion)
1918 : IsInParallelRegion(IsInParallelRegion) {}
Alexey Bataevc99042b2018-03-15 18:10:54 +00001919 void Enter(CodeGenFunction &CGF) override {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001920 PrevIsInParallelRegion = IsInParallelRegion;
1921 IsInParallelRegion = true;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001922 }
1923 void Exit(CodeGenFunction &CGF) override {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001924 IsInParallelRegion = PrevIsInParallelRegion;
Alexey Bataevc99042b2018-03-15 18:10:54 +00001925 }
Alexey Bataevb99dcb52018-07-09 17:43:58 +00001926 } Action(IsInParallelRegion);
Alexey Bataevc99042b2018-03-15 18:10:54 +00001927 CodeGen.setAction(Action);
Alexey Bataev4ac58d12018-10-12 20:19:59 +00001928 bool PrevIsInTTDRegion = IsInTTDRegion;
1929 IsInTTDRegion = false;
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001930 bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
1931 IsInTargetMasterThreadRegion = false;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001932 auto *OutlinedFun =
1933 cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
1934 D, ThreadIDVar, InnermostKind, CodeGen));
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00001935 IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
Alexey Bataev4ac58d12018-10-12 20:19:59 +00001936 IsInTTDRegion = PrevIsInTTDRegion;
Alexey Bataevbf5c8482018-05-10 18:32:08 +00001937 if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
1938 !IsInParallelRegion) {
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00001939 llvm::Function *WrapperFun =
1940 createParallelDataSharingWrapper(OutlinedFun, D);
1941 WrapperFunctionsMap[OutlinedFun] = WrapperFun;
1942 }
1943
1944 return OutlinedFun;
Arpith Chacko Jacob19b911c2017-01-18 18:18:53 +00001945}
1946
Alexey Bataev2adecff2018-09-21 14:22:53 +00001947/// Get list of lastprivate variables from the teams distribute ... or
1948/// teams {distribute ...} directives.
1949static void
Alexey Bataev8bcc69c2018-11-09 20:03:19 +00001950getDistributeLastprivateVars(ASTContext &Ctx, const OMPExecutableDirective &D,
Alexey Bataev2adecff2018-09-21 14:22:53 +00001951 llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
1952 assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
1953 "expected teams directive.");
1954 const OMPExecutableDirective *Dir = &D;
1955 if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
1956 if (const Stmt *S = getSingleCompoundChild(
Alexey Bataev8bcc69c2018-11-09 20:03:19 +00001957 Ctx,
Alexey Bataev2adecff2018-09-21 14:22:53 +00001958 D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
1959 /*IgnoreCaptured=*/true))) {
1960 Dir = dyn_cast<OMPExecutableDirective>(S);
1961 if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
1962 Dir = nullptr;
1963 }
1964 }
1965 if (!Dir)
1966 return;
Alexey Bataev9ea3c382018-10-09 14:49:00 +00001967 for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
Alexey Bataev2adecff2018-09-21 14:22:53 +00001968 for (const Expr *E : C->getVarRefs()) {
1969 const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
1970 Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
1971 }
1972 }
1973}
1974
// Emits the outlined function for a teams region.  In SPMD mode, the
// lastprivate variables of an associated distribute directive are collected
// and a record with one field per variable is built so their storage can be
// globalized (one copy per team); the prolog/epilog that set up and tear
// down that storage are attached to the region via a PrePostActionTy.
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  // Collect distribute lastprivates and build the globalized record for
  // them, SPMD mode only.
  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    getDistributeLastprivateVars(CGM.getContext(), D, LastPrivates);
    if (!LastPrivates.empty())
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      // Register the globalized record and the per-variable field mapping
      // for the function being emitted, then emit the globalization prolog.
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            llvm::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          // These variables are shared per team, not per lane.
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      // Release the globalized storage set up in Enter().
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  // Force the outlined teams function to be inlined into its caller: strip
  // the attributes that would prevent inlining and require it instead.
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}
2038
// Emits the prolog that materializes "globalized" local variables (locals of
// this function that must be visible to other threads) recorded in
// FunctionGlobalizedDecls.  Storage comes from one of three places:
//  1. the runtime data-sharing stack guarded by a runtime SPMD-mode check,
//     when the execution mode is not known at compile time;
//  2. the statically allocated team buffer, when inside a
//     target/teams/distribute (TTD) region;
//  3. an unconditional runtime data-sharing stack allocation otherwise.
// Escaped variable-length declarations are always allocated on the runtime
// stack at the end.
void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                 SourceLocation Loc,
                                                 bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);

    llvm::PointerType *GlobalRecPtrTy =
        CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
    llvm::Value *GlobalRecCastAddr;
    // IsTTD is non-null only when a secondary global record exists; it is a
    // runtime flag for "parallel level is zero" (see below).
    llvm::Value *IsTTD = nullptr;
    if (!IsInTTDRegion &&
        (WithSPMDCheck ||
         getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
      // Case 1: execution mode unknown at compile time.  Branch on a runtime
      // SPMD check: in SPMD mode no allocation is needed (null record
      // pointer); in non-SPMD mode push a record on the data-sharing stack.
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
      llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
      if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
        // Query the current parallel level; IsTTD == (level == 0).
        llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
        llvm::Value *ThreadID = getThreadID(CGF, Loc);
        llvm::Value *PL = CGF.EmitRuntimeCall(
            createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
            {RTLoc, ThreadID});
        IsTTD = Bld.CreateIsNull(PL);
      }
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(SPMDBB);
      // SPMD path: no globalized storage; represented by a null pointer.
      Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
                               CharUnits::fromQuantity(Alignment));
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(NonSPMDBB);
      llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
      if (const RecordDecl *SecGlobalizedVarsRecord =
              I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
        SecGlobalRecTy =
            CGM.getContext().getRecordType(SecGlobalizedVarsRecord);

        // Recover pointer to this function's global record. The runtime will
        // handle the specifics of the allocation of the memory.
        // Use actual memory size of the record including the padding
        // for alignment purposes.
        // If the parallel level is zero (IsTTD), use the size of the
        // secondary record instead of the primary one.
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
        unsigned GlobalRecordSize =
            CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
        GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
        Size = Bld.CreateSelect(
            IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
      }
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      llvm::Value *GlobalRecordSizeArg[] = {
          Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(
              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      CGF.EmitBlock(ExitBB);
      // Merge the SPMD (null) and non-SPMD (stack allocation) pointers.
      auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
                                /*NumReservedValues=*/2, "_select_stack");
      Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
      Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
      GlobalRecCastAddr = Phi;
      I->getSecond().GlobalRecordAddr = Phi;
      I->getSecond().IsInSPMDModeFlag = IsSPMD;
    } else if (IsInTTDRegion) {
      // Case 2: inside a target/teams/distribute region.  Carve this
      // function's record out of the statically allocated team buffer at a
      // computed offset past all previously registered records.
      assert(GlobalizedRecords.back().Records.size() < 2 &&
             "Expected less than 2 globalized records: one for target and one "
             "for teams.");
      unsigned Offset = 0;
      for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Offset =
            llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
      }
      unsigned Alignment =
          CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
      Offset = llvm::alignTo(Offset, Alignment);
      GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
      ++GlobalizedRecords.back().RegionCounter;
      if (GlobalizedRecords.back().Records.size() == 1) {
        // First record in this kernel: emit the globals describing the
        // static buffer and ask the runtime for the team static memory.
        assert(KernelStaticGlobalized &&
               "Kernel static pointer must be initialized already.");
        auto *UseSharedMemory = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int16Ty, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$is_shared");
        UseSharedMemory->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
            /*DestWidth=*/16, /*Signed=*/0);
        llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
            Address(UseSharedMemory,
                    CGM.getContext().getTypeAlignInChars(Int16Ty)),
            /*Volatile=*/false, Int16Ty, Loc);
        auto *StaticGlobalized = new llvm::GlobalVariable(
            CGM.getModule(), CGM.Int8Ty, /*isConstant=*/false,
            llvm::GlobalValue::WeakAnyLinkage, nullptr);
        auto *RecSize = new llvm::GlobalVariable(
            CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
            llvm::GlobalValue::InternalLinkage, nullptr,
            "_openmp_static_kernel$size");
        RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
        llvm::Value *Ld = CGF.EmitLoadOfScalar(
            Address(RecSize, CGM.getSizeAlign()), /*Volatile=*/false,
            CGM.getContext().getSizeType(), Loc);
        llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
            KernelStaticGlobalized, CGM.VoidPtrPtrTy);
        llvm::Value *GlobalRecordSizeArg[] = {StaticGlobalized, Ld,
                                              IsInSharedMemory, ResAddr};
        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                                OMPRTL_NVPTX__kmpc_get_team_static_memory),
                            GlobalRecordSizeArg);
        GlobalizedRecords.back().Buffer = StaticGlobalized;
        GlobalizedRecords.back().RecSize = RecSize;
        GlobalizedRecords.back().UseSharedMemory = UseSharedMemory;
        GlobalizedRecords.back().Loc = Loc;
      }
      assert(KernelStaticGlobalized && "Global address must be set already.");
      // Address of this record = static frame base + computed Offset.
      Address FrameAddr = CGF.EmitLoadOfPointer(
          Address(KernelStaticGlobalized, CGM.getPointerAlign()),
          CGM.getContext()
              .getPointerType(CGM.getContext().VoidPtrTy)
              .castAs<PointerType>());
      llvm::Value *GlobalRecValue =
          Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
              .getPointer();
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
    } else {
      // Case 3: known execution mode, not in a TTD region: unconditionally
      // push the record on the runtime data-sharing stack.
      // TODO: allow the usage of shared memory to be controlled by
      // the user, for now, default to global.
      llvm::Value *GlobalRecordSizeArg[] = {
          llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
          CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
      llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(
              OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
          GlobalRecordSizeArg);
      GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
          GlobalRecValue, GlobalRecPtrTy);
      I->getSecond().GlobalRecordAddr = GlobalRecValue;
      I->getSecond().IsInSPMDModeFlag = nullptr;
    }
    LValue Base =
        CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);

    // Emit the "global alloca" which is a GEP from the global declaration
    // record using the pointer returned by the runtime.
    // SecIt walks the secondary var data in lockstep with LocalVarData; it
    // is only valid (and only advanced) when IsTTD is non-null.
    LValue SecBase;
    decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
    if (IsTTD) {
      SecIt = I->getSecond().SecondaryLocalVarData->begin();
      llvm::PointerType *SecGlobalRecPtrTy =
          CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
      SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
          Bld.CreatePointerBitCastOrAddrSpaceCast(
              I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
          SecGlobalRecTy);
    }
    for (auto &Rec : I->getSecond().LocalVarData) {
      bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
      llvm::Value *ParValue;
      if (EscapedParam) {
        // Load the incoming parameter value before its address is remapped
        // to the globalized field; it is stored back below.
        const auto *VD = cast<VarDecl>(Rec.first);
        LValue ParLVal =
            CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
        ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
      }
      LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
      // Emit VarAddr basing on lane-id if required.
      QualType VarTy;
      if (Rec.second.IsOnePerTeam) {
        VarTy = Rec.second.FD->getType();
      } else {
        // Per-lane variable: the field is an array indexed by the lane id.
        llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
            VarAddr.getAddress().getPointer(),
            {Bld.getInt32(0), getNVPTXLaneID(CGF)});
        VarTy =
            Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
        VarAddr = CGF.MakeAddrLValue(
            Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
            AlignmentSource::Decl);
      }
      Rec.second.PrivateAddr = VarAddr.getAddress();
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
        assert(I->getSecond().IsInSPMDModeFlag &&
               "Expected unknown execution mode or required SPMD check.");
        if (IsTTD) {
          // At parallel level zero, use the secondary record's field.
          assert(SecIt->second.IsOnePerTeam &&
                 "Secondary glob data must be one per team.");
          LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
          VarAddr.setAddress(
              Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
                                       VarAddr.getPointer()),
                      VarAddr.getAlignment()));
          Rec.second.PrivateAddr = VarAddr.getAddress();
        }
        // In SPMD mode fall back to a plain local temporary; the choice is
        // made at runtime via IsInSPMDModeFlag.
        Address GlobalPtr = Rec.second.PrivateAddr;
        Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
        Rec.second.PrivateAddr = Address(
            Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
                             LocalAddr.getPointer(), GlobalPtr.getPointer()),
            LocalAddr.getAlignment());
      }
      if (EscapedParam) {
        // Copy the saved parameter value into its globalized slot and remap
        // further references of the parameter to that slot.
        const auto *VD = cast<VarDecl>(Rec.first);
        CGF.EmitStoreOfScalar(ParValue, VarAddr);
        I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
      }
      if (IsTTD)
        ++SecIt;
    }
  }
  // Variable-length declarations cannot live in the fixed-size record; each
  // gets its own runtime stack allocation, rounded up to its alignment.
  for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Value *Size = CGF.getTypeSize(VD->getType());
    CharUnits Align = CGM.getContext().getDeclAlign(VD);
    Size = Bld.CreateNUWAdd(
        Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
    llvm::Value *AlignVal =
        llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
    Size = Bld.CreateUDiv(Size, AlignVal);
    Size = Bld.CreateNUWMul(Size, AlignVal);
    // TODO: allow the usage of shared memory to be controlled by
    // the user, for now, default to global.
    llvm::Value *GlobalRecordSizeArg[] = {
        Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
    llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(
            OMPRTL_NVPTX__kmpc_data_sharing_coalesced_push_stack),
        GlobalRecordSizeArg);
    llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
        GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
    LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
                                     CGM.getContext().getDeclAlign(VD),
                                     AlignmentSource::Decl);
    I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
                                            Base.getAddress());
    I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
  }
  // Activate all the address remappings recorded above.
  I->getSecond().MappedParams->apply(CGF);
}
2319
// Emits the epilog that releases the globalized-variable storage created by
// emitGenericVarsProlog.  Variable-length allocations are popped in reverse
// order of allocation; the main record is then released according to the
// same three strategies the prolog used (runtime-checked pop, static team
// buffer bookkeeping, or unconditional pop).
void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
                                                 bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I != FunctionGlobalizedDecls.end()) {
    // Restore the original parameter addresses remapped by the prolog.
    I->getSecond().MappedParams->restore(CGF);
    if (!CGF.HaveInsertPoint())
      return;
    // Pop variable-length allocations in reverse order of allocation.
    for (llvm::Value *Addr :
         llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
          Addr);
    }
    if (I->getSecond().GlobalRecordAddr) {
      if (!IsInTTDRegion &&
          (WithSPMDCheck ||
           getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
        // Runtime-checked variant: only the non-SPMD path allocated a
        // record, so only that path pops the data-sharing stack.
        CGBuilderTy &Bld = CGF.Builder;
        llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
        llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
        Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
        // There is no need to emit line number for unconditional branch.
        (void)ApplyDebugLocation::CreateEmpty(CGF);
        CGF.EmitBlock(NonSPMDBB);
        CGF.EmitRuntimeCall(
            createNVPTXRuntimeFunction(
                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
            CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
        CGF.EmitBlock(ExitBB);
      } else if (IsInTTDRegion) {
        // Static team-buffer variant: decrement the nesting counter and,
        // when the outermost region is left, release the team static memory.
        assert(GlobalizedRecords.back().RegionCounter > 0 &&
               "region counter must be > 0.");
        --GlobalizedRecords.back().RegionCounter;
        // Emit the restore function only in the target region.
        if (GlobalizedRecords.back().RegionCounter == 0) {
          QualType Int16Ty = CGM.getContext().getIntTypeForBitwidth(
              /*DestWidth=*/16, /*Signed=*/0);
          llvm::Value *IsInSharedMemory = CGF.EmitLoadOfScalar(
              Address(GlobalizedRecords.back().UseSharedMemory,
                      CGM.getContext().getTypeAlignInChars(Int16Ty)),
              /*Volatile=*/false, Int16Ty, GlobalizedRecords.back().Loc);
          CGF.EmitRuntimeCall(
              createNVPTXRuntimeFunction(
                  OMPRTL_NVPTX__kmpc_restore_team_static_memory),
              IsInSharedMemory);
        }
      } else {
        // Unconditional variant: the prolog pushed exactly one record.
        CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                                OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
                            I->getSecond().GlobalRecordAddr);
      }
    }
  }
}
2378
Carlo Bertollic6872252016-04-04 15:55:02 +00002379void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2380 const OMPExecutableDirective &D,
2381 SourceLocation Loc,
2382 llvm::Value *OutlinedFn,
2383 ArrayRef<llvm::Value *> CapturedVars) {
2384 if (!CGF.HaveInsertPoint())
2385 return;
2386
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002387 Address ZeroAddr = CGF.CreateMemTemp(
2388 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2389 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00002390 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2391 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002392 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00002393 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2394 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00002395 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00002396}
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002397
2398void CGOpenMPRuntimeNVPTX::emitParallelCall(
2399 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2400 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2401 if (!CGF.HaveInsertPoint())
2402 return;
2403
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002404 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002405 emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002406 else
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002407 emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002408}
2409
/// Emit a 'parallel' region in generic (non-SPMD) execution mode: the region
/// is either serialized (nested parallelism / worker context) or handed to
/// the worker threads via the kernel's master-worker protocol.
void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  // A zero-initialized i32 temporary passed for the bound-thread-id argument.
  Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
                                           /*DestWidth=*/32, /*Signed=*/1),
                                       ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // ThreadId for serialized parallels is 0.
  Address ThreadIDAddr = ZeroAddr;
  // Emits a direct call to the outlined function, passing the (global tid,
  // bound tid) addresses followed by the captured variables. ThreadIDAddr is
  // captured by reference so a caller-side rebinding would be observed.
  auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
                       CodeGenFunction &CGF, PrePostActionTy &Action) {
    Action.Enter(CGF);

    llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
    OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
    OutlinedFnArgs.push_back(ZeroAddr.getPointer());
    OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
    emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
  };
  // Emits the region serialized on the calling thread, bracketed by the
  // __kmpc_serialized_parallel / __kmpc_end_serialized_parallel runtime calls.
  auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
                                        PrePostActionTy &) {

    RegionCodeGenTy RCG(CodeGen);
    llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
    llvm::Value *ThreadID = getThreadID(CGF, Loc);
    llvm::Value *Args[] = {RTLoc, ThreadID};

    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  // Emits the level-0 (master-orchestrated) parallel region: the master
  // publishes the wrapper function and shared variables, then releases the
  // workers and waits for them at CTA barriers.
  auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
                                                  PrePostActionTy &Action) {
    CGBuilderTy &Bld = CGF.Builder;
    llvm::Function *WFn = WrapperFunctionsMap[Fn];
    assert(WFn && "Wrapper function does not exist!");
    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Create a private scope that will globalize the arguments
    // passed from the outside of the target region.
    CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);

    // There's something to share.
    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();

      llvm::Value *DataSharingArgs[] = {
          SharedArgsPtr,
          llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
      CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
                              OMPRTL_NVPTX__kmpc_begin_sharing_variables),
                          DataSharingArgs);

      // Store variable address in a list of references to pass to workers.
      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      Address SharedArgListAddress = CGF.EmitLoadOfPointer(
          SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
                          .castAs<PointerType>());
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                                 CGF.getPointerSize());
        llvm::Value *PtrV;
        // Integer captures are passed by value inside a pointer-sized slot.
        if (V->getType()->isIntegerTy())
          PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
        else
          PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        ++Idx;
      }
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    if (!CapturedVars.empty())
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));

    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  };

  // Chooses between the serialized and the master-orchestrated lowering,
  // emitting a runtime check when the decision cannot be made statically.
  auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
                             CodeGenFunction &CGF, PrePostActionTy &Action) {
    if (IsInParallelRegion) {
      SeqGen(CGF, Action);
    } else if (IsInTargetMasterThreadRegion) {
      L0ParallelGen(CGF, Action);
    } else {
      // Check for master and then parallelism:
      // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
      //   Serialized execution.
      // } else {
      //   Worker call.
      // }
      CGBuilderTy &Bld = CGF.Builder;
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
      llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
      llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
      llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
      llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
      Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(ParallelCheckBB);
      llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
      llvm::Value *ThreadID = getThreadID(CGF, Loc);
      // Non-zero parallel level means we are already inside a parallel
      // region, so the new one must be serialized.
      llvm::Value *PL = CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
          {RTLoc, ThreadID});
      llvm::Value *Res = Bld.CreateIsNotNull(PL);
      Bld.CreateCondBr(Res, SeqBB, MasterBB);
      CGF.EmitBlock(SeqBB);
      SeqGen(CGF, Action);
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      CGF.EmitBlock(MasterBB);
      L0ParallelGen(CGF, Action);
      CGF.EmitBranch(ExitBB);
      // There is no need to emit line number for unconditional branch.
      (void)ApplyDebugLocation::CreateEmpty(CGF);
      // Emit the continuation block for code after the if.
      CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
    }
  };

  if (IfCond) {
    emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
  } else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(LNParallelGen);
    ThenRCG(CGF);
  }
}
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002576
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002577void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002578 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2579 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2580 // Just call the outlined function to execute the parallel region.
2581 // OutlinedFn(&GTid, &zero, CapturedStruct);
2582 //
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002583 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Carlo Bertolli79712092018-02-28 20:48:35 +00002584
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002585 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2586 /*DestWidth=*/32, /*Signed=*/1),
2587 ".zero.addr");
Carlo Bertolli79712092018-02-28 20:48:35 +00002588 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002589 // ThreadId for serialized parallels is 0.
2590 Address ThreadIDAddr = ZeroAddr;
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002591 auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
Alexey Bataev8521ff62018-07-25 20:03:01 +00002592 &ThreadIDAddr](CodeGenFunction &CGF,
2593 PrePostActionTy &Action) {
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002594 Action.Enter(CGF);
2595
2596 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2597 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2598 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2599 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2600 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2601 };
2602 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2603 PrePostActionTy &) {
2604
2605 RegionCodeGenTy RCG(CodeGen);
2606 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2607 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2608 llvm::Value *Args[] = {RTLoc, ThreadID};
2609
2610 NVPTXActionTy Action(
2611 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2612 Args,
2613 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2614 Args);
2615 RCG.setAction(Action);
2616 RCG(CGF);
2617 };
2618
2619 if (IsInTargetMasterThreadRegion) {
Alexey Bataev8521ff62018-07-25 20:03:01 +00002620 // In the worker need to use the real thread id.
2621 ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002622 RegionCodeGenTy RCG(CodeGen);
2623 RCG(CGF);
2624 } else {
2625 // If we are not in the target region, it is definitely L2 parallelism or
2626 // more, because for SPMD mode we always has L1 parallel level, sowe don't
2627 // need to check for orphaned directives.
2628 RegionCodeGenTy RCG(SeqGen);
2629 RCG(CGF);
2630 }
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002631}
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002632
Alexey Bataev504fc2d2018-05-07 17:23:05 +00002633void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2634 CodeGenFunction &CGF, StringRef CriticalName,
2635 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2636 const Expr *Hint) {
2637 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2638 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2639 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2640 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2641 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2642
2643 // Fetch team-local id of the thread.
2644 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2645
2646 // Get the width of the team.
2647 llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2648
2649 // Initialize the counter variable for the loop.
2650 QualType Int32Ty =
2651 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2652 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2653 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2654 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2655 /*isInit=*/true);
2656
2657 // Block checks if loop counter exceeds upper bound.
2658 CGF.EmitBlock(LoopBB);
2659 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2660 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2661 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2662
2663 // Block tests which single thread should execute region, and which threads
2664 // should go straight to synchronisation point.
2665 CGF.EmitBlock(TestBB);
2666 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2667 llvm::Value *CmpThreadToCounter =
2668 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2669 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2670
2671 // Block emits the body of the critical region.
2672 CGF.EmitBlock(BodyBB);
2673
2674 // Output the critical statement.
2675 CriticalOpGen(CGF);
2676
2677 // After the body surrounded by the critical region, the single executing
2678 // thread will jump to the synchronisation point.
2679 // Block waits for all threads in current team to finish then increments the
2680 // counter variable and returns to the loop.
2681 CGF.EmitBlock(SyncBB);
2682 getNVPTXCTABarrier(CGF);
2683
2684 llvm::Value *IncCounterVal =
2685 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2686 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2687 CGF.EmitBranch(LoopBB);
2688
2689 // Block that is reached when all threads in the team complete the region.
2690 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2691}
2692
Alexey Bataevb2575932018-01-04 20:18:55 +00002693/// Cast value to the specified type.
Alexey Bataeva453f362018-03-19 17:53:56 +00002694static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2695 QualType ValTy, QualType CastTy,
2696 SourceLocation Loc) {
2697 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2698 "Cast type must sized.");
2699 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2700 "Val type must sized.");
2701 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2702 if (ValTy == CastTy)
Alexey Bataevb2575932018-01-04 20:18:55 +00002703 return Val;
Alexey Bataeva453f362018-03-19 17:53:56 +00002704 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2705 CGF.getContext().getTypeSizeInChars(CastTy))
2706 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2707 if (CastTy->isIntegerType() && ValTy->isIntegerType())
2708 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2709 CastTy->hasSignedIntegerRepresentation());
2710 Address CastItem = CGF.CreateMemTemp(CastTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00002711 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2712 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
Alexey Bataeva453f362018-03-19 17:53:56 +00002713 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2714 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00002715}
2716
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002717/// This function creates calls to one of two shuffle functions to copy
2718/// variables between lanes in a warp.
2719static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002720 llvm::Value *Elem,
Alexey Bataeva453f362018-03-19 17:53:56 +00002721 QualType ElemType,
2722 llvm::Value *Offset,
2723 SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00002724 CodeGenModule &CGM = CGF.CGM;
2725 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002726 CGOpenMPRuntimeNVPTX &RT =
2727 *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2728
Alexey Bataeva453f362018-03-19 17:53:56 +00002729 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2730 assert(Size.getQuantity() <= 8 &&
2731 "Unsupported bitwidth in shuffle instruction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002732
Alexey Bataeva453f362018-03-19 17:53:56 +00002733 OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002734 ? OMPRTL_NVPTX__kmpc_shuffle_int32
2735 : OMPRTL_NVPTX__kmpc_shuffle_int64;
2736
2737 // Cast all types to 32- or 64-bit values before calling shuffle routines.
Alexey Bataeva453f362018-03-19 17:53:56 +00002738 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2739 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2740 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002741 llvm::Value *WarpSize =
Alexey Bataevb2575932018-01-04 20:18:55 +00002742 Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002743
Alexey Bataev9ff80832018-04-16 20:16:21 +00002744 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2745 RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002746
Alexey Bataeva453f362018-03-19 17:53:56 +00002747 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002748}
2749
/// Shuffle an element of arbitrary (sized) type from a remote lane into
/// DestAddr by splitting it into 8/4/2/1-byte integer chunks and shuffling
/// each chunk with createRuntimeShuffleFunction. Chunks that repeat more than
/// once for a given width are processed in an emitted PHI-based loop.
static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
                            Address DestAddr, QualType ElemType,
                            llvm::Value *Offset, SourceLocation Loc) {
  CGBuilderTy &Bld = CGF.Builder;

  CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
  // Create the loop over the big sized data.
  // ptr = (void*)Elem;
  // ptrEnd = (void*) Elem + 1;
  // Step = 8;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int64_t)*ptr);
  // Step = 4;
  // while (ptr + Step < ptrEnd)
  //   shuffle((int32_t)*ptr);
  // ...
  Address ElemPtr = DestAddr;
  Address Ptr = SrcAddr;
  Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
      Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
  for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
    // Skip widths larger than what remains to be copied.
    if (Size < CharUnits::fromQuantity(IntSize))
      continue;
    QualType IntType = CGF.getContext().getIntTypeForBitwidth(
        CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
        /*Signed=*/1);
    llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
    // Reinterpret both cursors as pointers to the current chunk width.
    Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
    ElemPtr =
        Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
    if (Size.getQuantity() / IntSize > 1) {
      // More than one chunk of this width: emit a runtime loop with PHI nodes
      // carrying the source and destination cursors across iterations.
      llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
      llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
      llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
      llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
      CGF.EmitBlock(PreCondBB);
      llvm::PHINode *PhiSrc =
          Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
      PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
      llvm::PHINode *PhiDest =
          Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
      PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
      Ptr = Address(PhiSrc, Ptr.getAlignment());
      ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
      // Continue while at least IntSize bytes remain before ptrEnd.
      llvm::Value *PtrDiff = Bld.CreatePtrDiff(
          PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
                                   Ptr.getPointer(), CGF.VoidPtrTy));
      Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
                       ThenBB, ExitBB);
      CGF.EmitBlock(ThenBB);
      // Shuffle one chunk across lanes and store it at the destination.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
      Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
      ElemPtr =
          Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
      PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
      PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
      CGF.EmitBranch(PreCondBB);
      CGF.EmitBlock(ExitBB);
    } else {
      // Exactly one chunk of this width: shuffle and store it inline.
      llvm::Value *Res = createRuntimeShuffleFunction(
          CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
          IntType, Offset, Loc);
      CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
      Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
      ElemPtr =
          Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
    }
    // The remainder (bytes not covered by this width) is handled by the
    // smaller widths on subsequent iterations.
    Size = Size % IntSize;
  }
}
2823
namespace {
/// Direction of the element-wise copy performed by emitReductionListCopy.
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace
2838
/// Optional operands for emitReductionListCopy; members not needed by the
/// chosen CopyAction may be null.
struct CopyOptionsTy {
  /// Lane offset to shuffle from (used for RemoteLaneToThread copies).
  llvm::Value *RemoteLaneOffset;
  /// Element index into the scratchpad array (used for scratchpad copies).
  llvm::Value *ScratchpadIndex;
  /// Scratchpad width; multiplied by the element size when advancing the
  /// scratchpad base pointer between elements.
  llvm::Value *ScratchpadWidth;
};
2844
/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
///
/// \param Action    direction of the copy (see CopyAction).
/// \param Privates  the reduction variables; one element is copied per entry.
/// \param SrcBase   base address of the source reduce list (or, for
///        ScratchpadToThread, the scratchpad).
/// \param DestBase  base address of the destination reduce list (or, for
///        ThreadToScratchpad, the scratchpad).
/// \param CopyOptions remote-lane offset for shuffle copies and scratchpad
///        index/width for scratchpad copies; unused members may be null.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  CodeGenModule &CGM = CGF.CGM;
  ASTContext &C = CGM.getContext();
  CGBuilderTy &Bld = CGF.Builder;

  llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (const Expr *Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr = CGF.EmitLoadOfPointer(
          DestElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      SrcElementAddr = CGF.EmitLoadOfPointer(
          SrcElementPtrAddr,
          C.getPointerType(Private->getType())->castAs<PointerType>());

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                                C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      llvm::Value *CurrentOffset =
          Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
      llvm::Value *ScratchPadElemAbsolutePtrVal =
          Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
                                               SrcElementAddr.getElementType());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
                      RemoteLaneOffset, Private->getExprLoc());
    } else {
      // Plain copy: scalars via load/store, aggregates via aggregate copy.
      if (Private->getType()->isScalarType()) {
        llvm::Value *Elem =
            CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                                 Private->getType(), Private->getExprLoc());
        // Store the source element value to the dest element address.
        CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                              Private->getType());
      } else {
        CGF.EmitAggregateCopy(
            CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
            CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
            Private->getType(), AggValueSlot::DoesNotOverlap);
      }
    }

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr,
          Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));

      // Take care of global memory alignment for performance
      // (round the base pointer up to the next GlobalMemoryAlignment
      // boundary).
      ScratchpadBasePtr = Bld.CreateNUWSub(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateUDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateNUWAdd(
          ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateNUWMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    ++Idx;
  }
}
3033
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003034/// This function emits a helper that loads data from the scratchpad array
3035/// and (optionally) reduces it with the input operand.
3036///
3037/// load_and_reduce(local, scratchpad, index, width, should_reduce)
3038/// reduce_data remote;
3039/// for elem in remote:
3040/// remote.elem = Scratchpad[elem_id][index]
3041/// if (should_reduce)
3042/// local = local @ remote
3043/// else
3044/// local = remote
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003045static llvm::Value *emitReduceScratchpadFunction(
3046 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3047 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003048 ASTContext &C = CGM.getContext();
3049 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003050
3051 // Destination of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003052 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3053 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003054 // Base address of the scratchpad array, with each element storing a
3055 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003056 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3057 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003058 // A source index into the scratchpad array.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003059 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3060 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003061 // Row width of an element in the scratchpad array, typically
3062 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003063 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3064 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003065 // If should_reduce == 1, then it's load AND reduce,
3066 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
3067 // The latter case is used for initialization.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003068 ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3069 Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003070
3071 FunctionArgList Args;
3072 Args.push_back(&ReduceListArg);
3073 Args.push_back(&ScratchPadArg);
3074 Args.push_back(&IndexArg);
3075 Args.push_back(&WidthArg);
3076 Args.push_back(&ShouldReduceArg);
3077
Alexey Bataev9ff80832018-04-16 20:16:21 +00003078 const CGFunctionInfo &CGFI =
3079 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003080 auto *Fn = llvm::Function::Create(
3081 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3082 "_omp_reduction_load_and_reduce", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003083 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003084 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003085 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003086 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003087
Alexey Bataev9ff80832018-04-16 20:16:21 +00003088 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003089
3090 // Get local Reduce list pointer.
3091 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3092 Address ReduceListAddr(
3093 Bld.CreatePointerBitCastOrAddrSpaceCast(
3094 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003095 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003096 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3097 CGF.getPointerAlign());
3098
3099 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3100 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003101 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003102
3103 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003104 llvm::Value *IndexVal = Bld.CreateIntCast(
3105 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3106 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003107
3108 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003109 llvm::Value *WidthVal = Bld.CreateIntCast(
3110 CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
3111 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003112
3113 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
3114 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003115 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003116
3117 // The absolute ptr address to the base addr of the next element to copy.
3118 llvm::Value *CumulativeElemBasePtr =
3119 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3120 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3121
3122 // Create a Remote Reduce list to store the elements read from the
3123 // scratchpad array.
3124 Address RemoteReduceList =
3125 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
3126
3127 // Assemble remote Reduce list from scratchpad array.
3128 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
3129 SrcDataAddr, RemoteReduceList,
3130 {/*RemoteLaneOffset=*/nullptr,
3131 /*ScratchpadIndex=*/IndexVal,
3132 /*ScratchpadWidth=*/WidthVal});
3133
3134 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3135 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3136 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3137
Alexey Bataev9ff80832018-04-16 20:16:21 +00003138 llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003139 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3140
3141 CGF.EmitBlock(ThenBB);
3142 // We should reduce with the local Reduce list.
3143 // reduce_function(LocalReduceList, RemoteReduceList)
3144 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3145 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
3146 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3147 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003148 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3149 CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003150 Bld.CreateBr(MergeBB);
3151
3152 CGF.EmitBlock(ElseBB);
3153 // No reduction; just copy:
3154 // Local Reduce list = Remote Reduce list.
3155 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3156 RemoteReduceList, ReduceListAddr);
3157 Bld.CreateBr(MergeBB);
3158
3159 CGF.EmitBlock(MergeBB);
3160
3161 CGF.FinishFunction();
3162 return Fn;
3163}
3164
3165/// This function emits a helper that stores reduced data from the team
3166/// master to a scratchpad array in global memory.
3167///
3168/// for elem in Reduce List:
3169/// scratchpad[elem_id][index] = elem
3170///
Benjamin Kramer674d5792017-05-26 20:08:24 +00003171static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
3172 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003173 QualType ReductionArrayTy,
3174 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003175
Alexey Bataev9ff80832018-04-16 20:16:21 +00003176 ASTContext &C = CGM.getContext();
3177 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003178
3179 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003180 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3181 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003182 // Base address of the scratchpad array, with each element storing a
3183 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003184 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3185 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003186 // A destination index into the scratchpad array, typically the team
3187 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003188 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3189 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003190 // Row width of an element in the scratchpad array, typically
3191 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003192 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3193 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003194
3195 FunctionArgList Args;
3196 Args.push_back(&ReduceListArg);
3197 Args.push_back(&ScratchPadArg);
3198 Args.push_back(&IndexArg);
3199 Args.push_back(&WidthArg);
3200
Alexey Bataev9ff80832018-04-16 20:16:21 +00003201 const CGFunctionInfo &CGFI =
3202 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003203 auto *Fn = llvm::Function::Create(
3204 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3205 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003206 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003207 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003208 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003209 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003210
Alexey Bataev9ff80832018-04-16 20:16:21 +00003211 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003212
3213 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3214 Address SrcDataAddr(
3215 Bld.CreatePointerBitCastOrAddrSpaceCast(
3216 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003217 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003218 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3219 CGF.getPointerAlign());
3220
3221 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3222 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003223 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003224
3225 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003226 llvm::Value *IndexVal = Bld.CreateIntCast(
3227 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3228 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003229
3230 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
3231 llvm::Value *WidthVal =
3232 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
3233 Int32Ty, SourceLocation()),
3234 CGF.SizeTy, /*isSigned=*/true);
3235
3236 // The absolute ptr address to the base addr of the next element to copy.
3237 llvm::Value *CumulativeElemBasePtr =
3238 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3239 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3240
3241 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
3242 SrcDataAddr, DestDataAddr,
3243 {/*RemoteLaneOffset=*/nullptr,
3244 /*ScratchpadIndex=*/IndexVal,
3245 /*ScratchpadWidth=*/WidthVal});
3246
3247 CGF.FinishFunction();
3248 return Fn;
3249}
3250
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003251/// This function emits a helper that gathers Reduce lists from the first
3252/// lane of every active warp to lanes in the first warp.
3253///
3254/// void inter_warp_copy_func(void* reduce_data, num_warps)
3255/// shared smem[warp_size];
3256/// For all data entries D in reduce_data:
3257/// If (I am the first lane in each warp)
3258/// Copy my local D to smem[warp_id]
3259/// sync
3260/// if (I am the first warp)
3261/// Copy smem[thread_id] to my local D
3262/// sync
3263static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
3264 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003265 QualType ReductionArrayTy,
3266 SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003267 ASTContext &C = CGM.getContext();
3268 llvm::Module &M = CGM.getModule();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003269
3270 // ReduceList: thread local Reduce list.
3271 // At the stage of the computation when this function is called, partially
3272 // aggregated values reside in the first lane of every active warp.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003273 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3274 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003275 // NumWarps: number of warps active in the parallel region. This could
3276 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003277 ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
Alexey Bataev56223232017-06-09 13:40:18 +00003278 C.getIntTypeForBitwidth(32, /* Signed */ true),
3279 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003280 FunctionArgList Args;
3281 Args.push_back(&ReduceListArg);
3282 Args.push_back(&NumWarpsArg);
3283
Alexey Bataev9ff80832018-04-16 20:16:21 +00003284 const CGFunctionInfo &CGFI =
3285 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003286 auto *Fn = llvm::Function::Create(
3287 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3288 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003289 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003290 Fn->setDoesNotRecurse();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003291 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003292 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003293
Alexey Bataev9ff80832018-04-16 20:16:21 +00003294 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003295
3296 // This array is used as a medium to transfer, one reduce element at a time,
3297 // the data from the first lane of every warp to lanes in the first warp
3298 // in order to perform the final step of a reduction in a parallel region
3299 // (reduction across warps). The array is placed in NVPTX __shared__ memory
3300 // for reduced latency, as well as to have a distinct copy for concurrently
3301 // executing target regions. The array is declared with common linkage so
3302 // as to be shared across compilation units.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003303 StringRef TransferMediumName =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003304 "__openmp_nvptx_data_transfer_temporary_storage";
3305 llvm::GlobalVariable *TransferMedium =
3306 M.getGlobalVariable(TransferMediumName);
3307 if (!TransferMedium) {
3308 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
3309 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
3310 TransferMedium = new llvm::GlobalVariable(
3311 M, Ty,
3312 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
3313 llvm::Constant::getNullValue(Ty), TransferMediumName,
3314 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
3315 SharedAddressSpace);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003316 CGM.addCompilerUsedGlobal(TransferMedium);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003317 }
3318
3319 // Get the CUDA thread id of the current OpenMP thread on the GPU.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003320 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003321 // nvptx_lane_id = nvptx_id % warpsize
Alexey Bataev9ff80832018-04-16 20:16:21 +00003322 llvm::Value *LaneID = getNVPTXLaneID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003323 // nvptx_warp_id = nvptx_id / warpsize
Alexey Bataev9ff80832018-04-16 20:16:21 +00003324 llvm::Value *WarpID = getNVPTXWarpID(CGF);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003325
3326 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3327 Address LocalReduceList(
3328 Bld.CreatePointerBitCastOrAddrSpaceCast(
3329 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3330 C.VoidPtrTy, SourceLocation()),
3331 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3332 CGF.getPointerAlign());
3333
3334 unsigned Idx = 0;
Alexey Bataev9ff80832018-04-16 20:16:21 +00003335 for (const Expr *Private : Privates) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003336 //
3337 // Warp master copies reduce element to transfer medium in __shared__
3338 // memory.
3339 //
3340 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3341 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3342 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3343
3344 // if (lane_id == 0)
Alexey Bataev9ff80832018-04-16 20:16:21 +00003345 llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003346 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
3347 CGF.EmitBlock(ThenBB);
3348
3349 // Reduce element = LocalReduceList[i]
3350 Address ElemPtrPtrAddr =
3351 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3352 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
3353 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3354 // elemptr = (type[i]*)(elemptrptr)
3355 Address ElemPtr =
3356 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
3357 ElemPtr = Bld.CreateElementBitCast(
3358 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003359
3360 // Get pointer to location in transfer medium.
3361 // MediumPtr = &medium[warp_id]
3362 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
3363 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
3364 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
3365 // Casting to actual data type.
3366 // MediumPtr = (type[i]*)MediumPtrAddr;
3367 MediumPtr = Bld.CreateElementBitCast(
3368 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
3369
Alexey Bataev12c62902018-06-22 19:10:38 +00003370 // elem = *elemptr
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003371 //*MediumPtr = elem
Alexey Bataev12c62902018-06-22 19:10:38 +00003372 if (Private->getType()->isScalarType()) {
3373 llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
3374 Private->getType(), Loc);
3375 // Store the source element value to the dest element address.
3376 CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
3377 Private->getType());
3378 } else {
3379 CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
3380 CGF.MakeAddrLValue(MediumPtr, Private->getType()),
3381 Private->getType(), AggValueSlot::DoesNotOverlap);
3382 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003383
3384 Bld.CreateBr(MergeBB);
3385
3386 CGF.EmitBlock(ElseBB);
3387 Bld.CreateBr(MergeBB);
3388
3389 CGF.EmitBlock(MergeBB);
3390
3391 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
3392 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
3393 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
3394
Alexey Bataev9ff80832018-04-16 20:16:21 +00003395 llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003396 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
3397 // named_barrier_sync(ParallelBarrierID, num_active_threads)
3398 syncParallelThreads(CGF, NumActiveThreads);
3399
3400 //
3401 // Warp 0 copies reduce element from transfer medium.
3402 //
3403 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
3404 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
3405 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
3406
3407 // Up to 32 threads in warp 0 are active.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003408 llvm::Value *IsActiveThread =
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003409 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
3410 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
3411
3412 CGF.EmitBlock(W0ThenBB);
3413
3414 // SrcMediumPtr = &medium[tid]
3415 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
3416 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
3417 Address SrcMediumPtr(SrcMediumPtrVal,
3418 C.getTypeAlignInChars(Private->getType()));
3419 // SrcMediumVal = *SrcMediumPtr;
3420 SrcMediumPtr = Bld.CreateElementBitCast(
3421 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003422
3423 // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
3424 Address TargetElemPtrPtr =
3425 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
3426 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
3427 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
3428 Address TargetElemPtr =
3429 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
3430 TargetElemPtr = Bld.CreateElementBitCast(
3431 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
3432
3433 // *TargetElemPtr = SrcMediumVal;
Alexey Bataev12c62902018-06-22 19:10:38 +00003434 if (Private->getType()->isScalarType()) {
3435 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
3436 SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
3437 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
3438 Private->getType());
3439 } else {
3440 CGF.EmitAggregateCopy(
3441 CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
3442 CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
3443 Private->getType(), AggValueSlot::DoesNotOverlap);
3444 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003445 Bld.CreateBr(W0MergeBB);
3446
3447 CGF.EmitBlock(W0ElseBB);
3448 Bld.CreateBr(W0MergeBB);
3449
3450 CGF.EmitBlock(W0MergeBB);
3451
3452 // While warp 0 copies values from transfer medium, all other warps must
3453 // wait.
3454 syncParallelThreads(CGF, NumActiveThreads);
Alexey Bataev9ff80832018-04-16 20:16:21 +00003455 ++Idx;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003456 }
3457
3458 CGF.FinishFunction();
3459 return Fn;
3460}
3461
3462/// Emit a helper that reduces data across two OpenMP threads (lanes)
3463/// in the same warp. It uses shuffle instructions to copy over data from
3464/// a remote lane's stack. The reduction algorithm performed is specified
3465/// by the fourth parameter.
3466///
3467/// Algorithm Versions.
3468/// Full Warp Reduce (argument value 0):
3469/// This algorithm assumes that all 32 lanes are active and gathers
3470/// data from these 32 lanes, producing a single resultant value.
3471/// Contiguous Partial Warp Reduce (argument value 1):
3472/// This algorithm assumes that only a *contiguous* subset of lanes
3473/// are active. This happens for the last warp in a parallel region
3474/// when the user specified num_threads is not an integer multiple of
3475/// 32. This contiguous subset always starts with the zeroth lane.
3476/// Partial Warp Reduce (argument value 2):
3477/// This algorithm gathers data from any number of lanes at any position.
3478/// All reduced values are stored in the lowest possible lane. The set
3479/// of problems every algorithm addresses is a super set of those
3480/// addressable by algorithms with a lower version number. Overhead
3481/// increases as algorithm version increases.
3482///
3483/// Terminology
3484/// Reduce element:
3485/// Reduce element refers to the individual data field with primitive
3486/// data types to be combined and reduced across threads.
3487/// Reduce list:
3488/// Reduce list refers to a collection of local, thread-private
3489/// reduce elements.
3490/// Remote Reduce list:
3491/// Remote Reduce list refers to a collection of remote (relative to
3492/// the current thread) reduce elements.
3493///
3494/// We distinguish between three states of threads that are important to
3495/// the implementation of this function.
3496/// Alive threads:
3497/// Threads in a warp executing the SIMT instruction, as distinguished from
3498/// threads that are inactive due to divergent control flow.
3499/// Active threads:
3500/// The minimal set of threads that has to be alive upon entry to this
3501/// function. The computation is correct iff active threads are alive.
3502/// Some threads are alive but they are not active because they do not
3503/// contribute to the computation in any useful manner. Turning them off
3504/// may introduce control flow overheads without any tangible benefits.
3505/// Effective threads:
3506/// In order to comply with the argument requirements of the shuffle
3507/// function, we must keep all lanes holding data alive. But at most
3508/// half of them perform value aggregation; we refer to this half of
3509/// threads as effective. The other half is simply handing off their
3510/// data.
3511///
3512/// Procedure
3513/// Value shuffle:
3514/// In this step active threads transfer data from higher lane positions
3515/// in the warp to lower lane positions, creating Remote Reduce list.
3516/// Value aggregation:
3517/// In this step, effective threads combine their thread local Reduce list
3518/// with Remote Reduce list and store the result in the thread local
3519/// Reduce list.
3520/// Value copy:
3521/// In this step, we deal with the assumption made by algorithm 2
3522/// (i.e. contiguity assumption). When we have an odd number of lanes
3523/// active, say 2k+1, only k threads will be effective and therefore k
3524/// new values will be produced. However, the Reduce list owned by the
3525/// (2k+1)th thread is ignored in the value aggregation. Therefore
3526/// we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
3527/// that the contiguity assumption still holds.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003528static llvm::Value *emitShuffleAndReduceFunction(
3529 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3530 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003531 ASTContext &C = CGM.getContext();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003532
3533 // Thread local Reduce list used to host the values of data to be reduced.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003534 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3535 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003536 // Current lane id; could be logical.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003537 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
3538 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003539 // Offset of the remote source lane relative to the current lane.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003540 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3541 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003542 // Algorithm version. This is expected to be known at compile time.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003543 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3544 C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003545 FunctionArgList Args;
3546 Args.push_back(&ReduceListArg);
3547 Args.push_back(&LaneIDArg);
3548 Args.push_back(&RemoteLaneOffsetArg);
3549 Args.push_back(&AlgoVerArg);
3550
Alexey Bataev9ff80832018-04-16 20:16:21 +00003551 const CGFunctionInfo &CGFI =
3552 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003553 auto *Fn = llvm::Function::Create(
3554 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3555 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003556 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003557 Fn->setDoesNotRecurse();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003558 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003559 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003560
Alexey Bataev9ff80832018-04-16 20:16:21 +00003561 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003562
3563 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3564 Address LocalReduceList(
3565 Bld.CreatePointerBitCastOrAddrSpaceCast(
3566 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
3567 C.VoidPtrTy, SourceLocation()),
3568 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3569 CGF.getPointerAlign());
3570
3571 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
3572 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
3573 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3574
3575 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
3576 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
3577 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3578
3579 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
3580 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
3581 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
3582
3583 // Create a local thread-private variable to host the Reduce list
3584 // from a remote lane.
3585 Address RemoteReduceList =
3586 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
3587
3588 // This loop iterates through the list of reduce elements and copies,
3589 // element by element, from a remote lane in the warp to RemoteReduceList,
3590 // hosted on the thread's stack.
3591 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
3592 LocalReduceList, RemoteReduceList,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003593 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
3594 /*ScratchpadIndex=*/nullptr,
3595 /*ScratchpadWidth=*/nullptr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003596
3597 // The actions to be performed on the Remote Reduce list is dependent
3598 // on the algorithm version.
3599 //
3600 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
3601 // LaneId % 2 == 0 && Offset > 0):
3602 // do the reduction value aggregation
3603 //
3604 // The thread local variable Reduce list is mutated in place to host the
3605 // reduced data, which is the aggregated value produced from local and
3606 // remote lanes.
3607 //
3608 // Note that AlgoVer is expected to be a constant integer known at compile
3609 // time.
3610 // When AlgoVer==0, the first conjunction evaluates to true, making
3611 // the entire predicate true during compile time.
3612 // When AlgoVer==1, the second conjunction has only the second part to be
3613 // evaluated during runtime. Other conjunctions evaluates to false
3614 // during compile time.
3615 // When AlgoVer==2, the third conjunction has only the second part to be
3616 // evaluated during runtime. Other conjunctions evaluates to false
3617 // during compile time.
Alexey Bataev9ff80832018-04-16 20:16:21 +00003618 llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003619
Alexey Bataev9ff80832018-04-16 20:16:21 +00003620 llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
3621 llvm::Value *CondAlgo1 = Bld.CreateAnd(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003622 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
3623
Alexey Bataev9ff80832018-04-16 20:16:21 +00003624 llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
3625 llvm::Value *CondAlgo2 = Bld.CreateAnd(
3626 Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003627 CondAlgo2 = Bld.CreateAnd(
3628 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
3629
Alexey Bataev9ff80832018-04-16 20:16:21 +00003630 llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003631 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
3632
3633 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3634 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3635 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3636 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3637
3638 CGF.EmitBlock(ThenBB);
3639 // reduce_function(LocalReduceList, RemoteReduceList)
3640 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3641 LocalReduceList.getPointer(), CGF.VoidPtrTy);
3642 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3643 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003644 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3645 CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003646 Bld.CreateBr(MergeBB);
3647
3648 CGF.EmitBlock(ElseBB);
3649 Bld.CreateBr(MergeBB);
3650
3651 CGF.EmitBlock(MergeBB);
3652
3653 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
3654 // Reduce list.
3655 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
Alexey Bataev9ff80832018-04-16 20:16:21 +00003656 llvm::Value *CondCopy = Bld.CreateAnd(
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003657 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
3658
3659 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
3660 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
3661 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
3662 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
3663
3664 CGF.EmitBlock(CpyThenBB);
3665 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3666 RemoteReduceList, LocalReduceList);
3667 Bld.CreateBr(CpyMergeBB);
3668
3669 CGF.EmitBlock(CpyElseBB);
3670 Bld.CreateBr(CpyMergeBB);
3671
3672 CGF.EmitBlock(CpyMergeBB);
3673
3674 CGF.FinishFunction();
3675 return Fn;
3676}
3677
3678///
3679/// Design of OpenMP reductions on the GPU
3680///
3681/// Consider a typical OpenMP program with one or more reduction
3682/// clauses:
3683///
3684/// float foo;
3685/// double bar;
3686/// #pragma omp target teams distribute parallel for \
3687/// reduction(+:foo) reduction(*:bar)
3688/// for (int i = 0; i < N; i++) {
3689/// foo += A[i]; bar *= B[i];
3690/// }
3691///
3692/// where 'foo' and 'bar' are reduced across all OpenMP threads in
3693/// all teams. In our OpenMP implementation on the NVPTX device an
3694/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
3695/// within a team are mapped to CUDA threads within a threadblock.
3696/// Our goal is to efficiently aggregate values across all OpenMP
3697/// threads such that:
3698///
3699/// - the compiler and runtime are logically concise, and
3700/// - the reduction is performed efficiently in a hierarchical
3701/// manner as follows: within OpenMP threads in the same warp,
3702/// across warps in a threadblock, and finally across teams on
3703/// the NVPTX device.
3704///
3705/// Introduction to Decoupling
3706///
3707/// We would like to decouple the compiler and the runtime so that the
3708/// latter is ignorant of the reduction variables (number, data types)
3709/// and the reduction operators. This allows a simpler interface
3710/// and implementation while still attaining good performance.
3711///
3712/// Pseudocode for the aforementioned OpenMP program generated by the
3713/// compiler is as follows:
3714///
3715/// 1. Create private copies of reduction variables on each OpenMP
3716/// thread: 'foo_private', 'bar_private'
3717/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
3718/// to it and writes the result in 'foo_private' and 'bar_private'
3719/// respectively.
3720/// 3. Call the OpenMP runtime on the GPU to reduce within a team
3721/// and store the result on the team master:
3722///
3723/// __kmpc_nvptx_parallel_reduce_nowait(...,
3724/// reduceData, shuffleReduceFn, interWarpCpyFn)
3725///
3726/// where:
3727/// struct ReduceData {
3728/// double *foo;
3729/// double *bar;
3730/// } reduceData
3731/// reduceData.foo = &foo_private
3732/// reduceData.bar = &bar_private
3733///
3734/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
3735/// auxiliary functions generated by the compiler that operate on
/// variables of type 'ReduceData'. They aid the runtime in performing
/// algorithmic steps in a data-agnostic manner.
3738///
3739/// 'shuffleReduceFn' is a pointer to a function that reduces data
3740/// of type 'ReduceData' across two OpenMP threads (lanes) in the
3741/// same warp. It takes the following arguments as input:
3742///
3743/// a. variable of type 'ReduceData' on the calling lane,
3744/// b. its lane_id,
3745/// c. an offset relative to the current lane_id to generate a
3746/// remote_lane_id. The remote lane contains the second
3747/// variable of type 'ReduceData' that is to be reduced.
3748/// d. an algorithm version parameter determining which reduction
3749/// algorithm to use.
3750///
3751/// 'shuffleReduceFn' retrieves data from the remote lane using
3752/// efficient GPU shuffle intrinsics and reduces, using the
3753/// algorithm specified by the 4th parameter, the two operands
3754/// element-wise. The result is written to the first operand.
3755///
3756/// Different reduction algorithms are implemented in different
3757/// runtime functions, all calling 'shuffleReduceFn' to perform
3758/// the essential reduction step. Therefore, based on the 4th
3759/// parameter, this function behaves slightly differently to
3760/// cooperate with the runtime to ensure correctness under
3761/// different circumstances.
3762///
3763/// 'InterWarpCpyFn' is a pointer to a function that transfers
3764/// reduced variables across warps. It tunnels, through CUDA
3765/// shared memory, the thread-private data of type 'ReduceData'
3766/// from lane 0 of each warp to a lane in the first warp.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003767/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
3768/// The last team writes the global reduced value to memory.
3769///
3770/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
3771/// reduceData, shuffleReduceFn, interWarpCpyFn,
3772/// scratchpadCopyFn, loadAndReduceFn)
3773///
3774/// 'scratchpadCopyFn' is a helper that stores reduced
3775/// data from the team master to a scratchpad array in
3776/// global memory.
3777///
3778/// 'loadAndReduceFn' is a helper that loads data from
3779/// the scratchpad array and reduces it with the input
3780/// operand.
3781///
3782/// These compiler generated functions hide address
3783/// calculation and alignment information from the runtime.
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003784/// 5. if ret == 1:
3785/// The team master of the last team stores the reduced
3786/// result to the globals in memory.
3787/// foo += reduceData.foo; bar *= reduceData.bar
3788///
3789///
3790/// Warp Reduction Algorithms
3791///
3792/// On the warp level, we have three algorithms implemented in the
3793/// OpenMP runtime depending on the number of active lanes:
3794///
3795/// Full Warp Reduction
3796///
3797/// The reduce algorithm within a warp where all lanes are active
3798/// is implemented in the runtime as follows:
3799///
3800/// full_warp_reduce(void *reduce_data,
3801/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3802/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
3803/// ShuffleReduceFn(reduce_data, 0, offset, 0);
3804/// }
3805///
3806/// The algorithm completes in log(2, WARPSIZE) steps.
3807///
3808/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
3809/// not used therefore we save instructions by not retrieving lane_id
3810/// from the corresponding special registers. The 4th parameter, which
3811/// represents the version of the algorithm being used, is set to 0 to
3812/// signify full warp reduction.
3813///
3814/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3815///
3816/// #reduce_elem refers to an element in the local lane's data structure
3817/// #remote_elem is retrieved from a remote lane
3818/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3819/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
3820///
3821/// Contiguous Partial Warp Reduction
3822///
3823/// This reduce algorithm is used within a warp where only the first
3824/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
3825/// number of OpenMP threads in a parallel region is not a multiple of
3826/// WARPSIZE. The algorithm is implemented in the runtime as follows:
3827///
3828/// void
3829/// contiguous_partial_reduce(void *reduce_data,
3830/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
3831/// int size, int lane_id) {
3832/// int curr_size;
3833/// int offset;
3834/// curr_size = size;
///     offset = curr_size/2;
3836/// while (offset>0) {
3837/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
3838/// curr_size = (curr_size+1)/2;
3839/// offset = curr_size/2;
3840/// }
3841/// }
3842///
3843/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3844///
3845/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3846/// if (lane_id < offset)
3847/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3848/// else
3849/// reduce_elem = remote_elem
3850///
3851/// This algorithm assumes that the data to be reduced are located in a
3852/// contiguous subset of lanes starting from the first. When there is
3853/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
3855///
3856/// Dispersed Partial Warp Reduction
3857///
3858/// This algorithm is used within a warp when any discontiguous subset of
3859/// lanes are active. It is used to implement the reduction operation
3860/// across lanes in an OpenMP simd region or in a nested parallel region.
3861///
3862/// void
3863/// dispersed_partial_reduce(void *reduce_data,
3864/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
3865/// int size, remote_id;
3866/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
3867/// do {
3868/// remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
3870/// # is present right after the current lane.
3871/// size = number_of_active_lanes_in_this_warp();
3872/// logical_lane_id /= 2;
3873/// ShuffleReduceFn(reduce_data, logical_lane_id,
3874/// remote_id-1-threadIdx.x, 2);
3875/// } while (logical_lane_id % 2 == 0 && size > 1);
3876/// }
3877///
3878/// There is no assumption made about the initial state of the reduction.
3879/// Any number of lanes (>=1) could be active at any position. The reduction
3880/// result is returned in the first active lane.
3881///
3882/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
3883///
3884/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
3885/// if (lane_id % 2 == 0 && offset > 0)
3886/// reduce_elem = reduce_elem REDUCE_OP remote_elem
3887/// else
3888/// reduce_elem = remote_elem
3889///
3890///
3891/// Intra-Team Reduction
3892///
3893/// This function, as implemented in the runtime call
3894/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
3895/// threads in a team. It first reduces within a warp using the
3896/// aforementioned algorithms. We then proceed to gather all such
3897/// reduced values at the first warp.
3898///
3899/// The runtime makes use of the function 'InterWarpCpyFn', which copies
3900/// data from each of the "warp master" (zeroth lane of each warp, where
3901/// warp-reduced data is held) to the zeroth warp. This step reduces (in
3902/// a mathematical sense) the problem of reduction across warp masters in
3903/// a block to the problem of warp reduction.
3904///
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003905///
3906/// Inter-Team Reduction
3907///
3908/// Once a team has reduced its data to a single value, it is stored in
3909/// a global scratchpad array. Since each team has a distinct slot, this
3910/// can be done without locking.
3911///
3912/// The last team to write to the scratchpad array proceeds to reduce the
3913/// scratchpad array. One or more workers in the last team use the helper
3914/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
3915/// the k'th worker reduces every k'th element.
3916///
3917/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
3918/// reduce across workers and compute a globally reduced value.
3919///
// Emit the device code sequence for OpenMP 'reduction' clauses: pack
// pointers to the thread-private reduction copies into a void* array, emit
// the compiler-generated helper functions the runtime needs, invoke the
// matching __kmpc_nvptx_*_reduce_nowait entry point, and emit the combiner
// code that folds the reduced values back into the original variables when
// the runtime call returns 1. See the design comment above for the overall
// reduction scheme.
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  // Classify the directive; parallel/simd reductions call their runtime
  // entry point directly below, teams reductions emit a separate call with
  // two extra scratchpad helpers further down.
  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
         "Invalid reduction selection in emitReduction.");

  // Simple reductions need none of the GPU machinery; defer to the common
  // host-style implementation in the base class.
  if (Options.SimpleReduction) {
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  ASTContext &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  // Variably-modified (VLA) privates occupy a second slot carrying the
  // runtime array size next to the data pointer.
  for (const Expr *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  // Fill the array with pointers to the private copies (and VLA sizes,
  // stored inttoptr-cast into the void* slot).
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  llvm::Value *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  // RedList, shuffle_reduce_func, interwarp_copy_func);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  // Helper functions handed to the runtime: intra-warp shuffle-reduce and
  // warp-master-to-first-warp copy (see design comment above).
  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Args[] = {ThreadId,
                         CGF.Builder.getInt32(RHSExprs.size()),
                         ReductionArrayTySize,
                         RL,
                         ShuffleAndReduceFn,
                         InterWarpCopyFn};

  // Res receives the runtime's verdict: 1 means this thread must perform
  // the final combination into the original variables.
  llvm::Value *Res = nullptr;
  if (ParallelReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  else if (SimdReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
        Args);

  if (TeamsReduction) {
    // Teams reductions additionally need the scratchpad copy/load helpers
    // for the inter-team phase.
    llvm::Value *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    // NOTE: this Args intentionally shadows the six-element array above,
    // appending the two scratchpad helpers.
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 5. Build switch(res)
  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  llvm::SwitchInst *SwInst =
      CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 6. Build case 1: where we have reduced values in the master
  // thread in each team.
  // __kmpc_end_reduce{_nowait}(<gtid>);
  // break;
  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  // Combiner: apply each reduction op to fold the private copy into the
  // original (LHS) variable, walking the three lists in lockstep.
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  // Wrap the combiner so that __kmpc_nvptx_end_reduce_nowait is emitted on
  // exit from the region (no entry callee).
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004070
4071const VarDecl *
4072CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
4073 const VarDecl *NativeParam) const {
4074 if (!NativeParam->getType()->isReferenceType())
4075 return NativeParam;
4076 QualType ArgType = NativeParam->getType();
4077 QualifierCollector QC;
4078 const Type *NonQualTy = QC.strip(ArgType);
4079 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4080 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
4081 if (Attr->getCaptureKind() == OMPC_map) {
4082 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
4083 LangAS::opencl_global);
4084 }
4085 }
4086 ArgType = CGM.getContext().getPointerType(PointeeTy);
4087 QC.addRestrict();
4088 enum { NVPTX_local_addr = 5 };
Alexander Richardson6d989432017-10-15 18:48:14 +00004089 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004090 ArgType = QC.apply(CGM.getContext(), ArgType);
Alexey Bataev9ff80832018-04-16 20:16:21 +00004091 if (isa<ImplicitParamDecl>(NativeParam))
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004092 return ImplicitParamDecl::Create(
4093 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
4094 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004095 return ParmVarDecl::Create(
4096 CGM.getContext(),
4097 const_cast<DeclContext *>(NativeParam->getDeclContext()),
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004098 NativeParam->getBeginLoc(), NativeParam->getLocation(),
Alexey Bataevb45d43c2017-11-22 16:02:03 +00004099 NativeParam->getIdentifier(), ArgType,
4100 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004101}
4102
4103Address
4104CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
4105 const VarDecl *NativeParam,
4106 const VarDecl *TargetParam) const {
4107 assert(NativeParam != TargetParam &&
4108 NativeParam->getType()->isReferenceType() &&
4109 "Native arg must not be the same as target arg.");
4110 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
4111 QualType NativeParamType = NativeParam->getType();
4112 QualifierCollector QC;
4113 const Type *NonQualTy = QC.strip(NativeParamType);
4114 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
4115 unsigned NativePointeeAddrSpace =
Alexander Richardson6d989432017-10-15 18:48:14 +00004116 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004117 QualType TargetTy = TargetParam->getType();
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004118 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004119 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004120 // First cast to generic.
4121 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4122 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4123 /*AddrSpace=*/0));
4124 // Cast from generic to native address space.
4125 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
4126 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
4127 NativePointeeAddrSpace));
4128 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
4129 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00004130 NativeParamType);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004131 return NativeParamAddr;
4132}
4133
4134void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
Alexey Bataev3c595a62017-08-14 15:01:03 +00004135 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004136 ArrayRef<llvm::Value *> Args) const {
4137 SmallVector<llvm::Value *, 4> TargetArgs;
Alexey Bataev07ed94a2017-08-15 14:34:04 +00004138 TargetArgs.reserve(Args.size());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004139 auto *FnType =
4140 cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
4141 for (unsigned I = 0, E = Args.size(); I < E; ++I) {
Alexey Bataev07ed94a2017-08-15 14:34:04 +00004142 if (FnType->isVarArg() && FnType->getNumParams() <= I) {
4143 TargetArgs.append(std::next(Args.begin(), I), Args.end());
4144 break;
4145 }
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004146 llvm::Type *TargetType = FnType->getParamType(I);
4147 llvm::Value *NativeArg = Args[I];
4148 if (!TargetType->isPointerTy()) {
4149 TargetArgs.emplace_back(NativeArg);
4150 continue;
4151 }
4152 llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataevc99042b2018-03-15 18:10:54 +00004153 NativeArg,
4154 NativeArg->getType()->getPointerElementType()->getPointerTo());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004155 TargetArgs.emplace_back(
4156 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
4157 }
Alexey Bataev3c595a62017-08-14 15:01:03 +00004158 CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00004159}
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004160
4161/// Emit function which wraps the outline parallel region
4162/// and controls the arguments which are passed to this function.
4163/// The wrapper ensures that the outlined function is called
4164/// with the correct arguments when data is shared.
4165llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
4166 llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
4167 ASTContext &Ctx = CGM.getContext();
4168 const auto &CS = *D.getCapturedStmt(OMPD_parallel);
4169
4170 // Create a function that takes as argument the source thread.
4171 FunctionArgList WrapperArgs;
4172 QualType Int16QTy =
4173 Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
4174 QualType Int32QTy =
4175 Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004176 ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004177 /*Id=*/nullptr, Int16QTy,
4178 ImplicitParamDecl::Other);
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004179 ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004180 /*Id=*/nullptr, Int32QTy,
4181 ImplicitParamDecl::Other);
4182 WrapperArgs.emplace_back(&ParallelLevelArg);
4183 WrapperArgs.emplace_back(&WrapperArg);
4184
Alexey Bataev9ff80832018-04-16 20:16:21 +00004185 const CGFunctionInfo &CGFI =
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004186 CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);
4187
4188 auto *Fn = llvm::Function::Create(
4189 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
Alexey Bataev9ff80832018-04-16 20:16:21 +00004190 Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
Alexey Bataevc99042b2018-03-15 18:10:54 +00004191 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004192 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00004193 Fn->setDoesNotRecurse();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004194
4195 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
4196 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004197 D.getBeginLoc(), D.getBeginLoc());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004198
4199 const auto *RD = CS.getCapturedRecordDecl();
4200 auto CurField = RD->field_begin();
4201
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00004202 Address ZeroAddr = CGF.CreateMemTemp(
4203 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
4204 /*Name*/ ".zero.addr");
4205 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004206 // Get the array of arguments.
4207 SmallVector<llvm::Value *, 8> Args;
4208
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00004209 Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
4210 Args.emplace_back(ZeroAddr.getPointer());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004211
4212 CGBuilderTy &Bld = CGF.Builder;
4213 auto CI = CS.capture_begin();
4214
4215 // Use global memory for data sharing.
4216 // Handle passing of global args to workers.
4217 Address GlobalArgs =
4218 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
4219 llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
4220 llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
4221 CGF.EmitRuntimeCall(
4222 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
4223 DataSharingArgs);
4224
4225 // Retrieve the shared variables from the list of references returned
4226 // by the runtime. Pass the variables to the outlined function.
Alexey Bataev17314212018-03-20 15:41:05 +00004227 Address SharedArgListAddress = Address::invalid();
4228 if (CS.capture_size() > 0 ||
4229 isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4230 SharedArgListAddress = CGF.EmitLoadOfPointer(
4231 GlobalArgs, CGF.getContext()
4232 .getPointerType(CGF.getContext().getPointerType(
4233 CGF.getContext().VoidPtrTy))
4234 .castAs<PointerType>());
4235 }
4236 unsigned Idx = 0;
4237 if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
4238 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4239 CGF.getPointerSize());
4240 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4241 Src, CGF.SizeTy->getPointerTo());
4242 llvm::Value *LB = CGF.EmitLoadOfScalar(
4243 TypedAddress,
4244 /*Volatile=*/false,
4245 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4246 cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
4247 Args.emplace_back(LB);
4248 ++Idx;
4249 Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
4250 CGF.getPointerSize());
4251 TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
4252 Src, CGF.SizeTy->getPointerTo());
4253 llvm::Value *UB = CGF.EmitLoadOfScalar(
4254 TypedAddress,
4255 /*Volatile=*/false,
4256 CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
4257 cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
4258 Args.emplace_back(UB);
4259 ++Idx;
4260 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004261 if (CS.capture_size() > 0) {
4262 ASTContext &CGFContext = CGF.getContext();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004263 for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
4264 QualType ElemTy = CurField->getType();
Alexey Bataev17314212018-03-20 15:41:05 +00004265 Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
4266 CGF.getPointerSize());
4267 Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004268 Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
4269 llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
4270 /*Volatile=*/false,
4271 CGFContext.getPointerType(ElemTy),
4272 CI->getLocation());
Alexey Bataev2091ca62018-04-23 17:33:41 +00004273 if (CI->capturesVariableByCopy() &&
4274 !CI->getCapturedVar()->getType()->isAnyPointerType()) {
Alexey Bataev17314212018-03-20 15:41:05 +00004275 Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
4276 CI->getLocation());
4277 }
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004278 Args.emplace_back(Arg);
4279 }
4280 }
4281
Stephen Kellyf2ceec42018-08-09 21:08:08 +00004282 emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004283 CGF.FinishFunction();
4284 return Fn;
4285}
4286
/// Registers the globalized variables of a function before its body is
/// emitted. In Generic data-sharing mode, variables that escape the current
/// context (e.g. are referenced from a parallel region) must be moved from
/// the stack to globalized storage; this prolog records which declarations
/// need that treatment and, when possible, emits the globalization code and
/// schedules its matching epilog as a cleanup.
void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  // Globalization is only performed in Generic data-sharing mode.
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  // Find the statement body to scan for escaping variables. For captured
  // OpenMP regions the actual globalization is delayed until the region's
  // codegen runs.
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    // In SPMD mode delayed regions need no globalization record at all.
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  // Collect the variables that escape this declaration context and the
  // record type that will hold their globalized copies.
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  // Nothing escapes - no globalization bookkeeping is needed.
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  // Create (or fetch) the per-function globalization entry and populate it.
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  // Map each escaped declaration to its field in the globalized record.
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  // Outside a target-teams-distribute region (and outside parallel), also
  // build a secondary record laid out as if in a TTD region; the runtime
  // selects between the two layouts.
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  // When globalization is not delayed, emit the prolog now and push a
  // cleanup so the matching epilog runs on every (normal or EH) exit path.
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
4360
4361Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
4362 const VarDecl *VD) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00004363 if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
4364 return Address::invalid();
4365
Alexey Bataev63cc8e92018-03-20 14:45:59 +00004366 VD = VD->getCanonicalDecl();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004367 auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
4368 if (I == FunctionGlobalizedDecls.end())
4369 return Address::invalid();
Alexey Bataevc99042b2018-03-15 18:10:54 +00004370 auto VDI = I->getSecond().LocalVarData.find(VD);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00004371 if (VDI != I->getSecond().LocalVarData.end())
Alexey Bataev9ea3c382018-10-09 14:49:00 +00004372 return VDI->second.PrivateAddr;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00004373 if (VD->hasAttrs()) {
4374 for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
4375 E(VD->attr_end());
4376 IT != E; ++IT) {
4377 auto VDI = I->getSecond().LocalVarData.find(
4378 cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
4379 ->getCanonicalDecl());
4380 if (VDI != I->getSecond().LocalVarData.end())
Alexey Bataev9ea3c382018-10-09 14:49:00 +00004381 return VDI->second.PrivateAddr;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00004382 }
4383 }
4384 return Address::invalid();
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00004385}
4386
/// Per-function teardown: drops the globalization bookkeeping recorded for
/// the finished function and then delegates to the base-class hook.
void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  // The entry was created in emitFunctionProlog (if anything escaped).
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}
Gheorghe-Teodor Bercea02650d42018-09-27 19:22:56 +00004391
4392void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
4393 CodeGenFunction &CGF, const OMPLoopDirective &S,
4394 OpenMPDistScheduleClauseKind &ScheduleKind,
4395 llvm::Value *&Chunk) const {
4396 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
4397 ScheduleKind = OMPC_DIST_SCHEDULE_static;
4398 Chunk = CGF.EmitScalarConversion(getNVPTXNumThreads(CGF),
4399 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4400 S.getIterationVariable()->getType(), S.getBeginLoc());
Gheorghe-Teodor Bercea669dbde2018-10-29 15:23:23 +00004401 return;
Gheorghe-Teodor Bercea02650d42018-09-27 19:22:56 +00004402 }
Gheorghe-Teodor Bercea669dbde2018-10-29 15:23:23 +00004403 CGOpenMPRuntime::getDefaultDistScheduleAndChunk(
4404 CGF, S, ScheduleKind, Chunk);
Gheorghe-Teodor Bercea02650d42018-09-27 19:22:56 +00004405}
Gheorghe-Teodor Bercea8233af92018-09-27 20:29:00 +00004406
4407void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
4408 CodeGenFunction &CGF, const OMPLoopDirective &S,
4409 OpenMPScheduleClauseKind &ScheduleKind,
Gheorghe-Teodor Berceae9256762018-10-29 15:45:47 +00004410 const Expr *&ChunkExpr) const {
Gheorghe-Teodor Bercea669dbde2018-10-29 15:23:23 +00004411 ScheduleKind = OMPC_SCHEDULE_static;
Gheorghe-Teodor Berceae9256762018-10-29 15:45:47 +00004412 // Chunk size is 1 in this case.
4413 llvm::APInt ChunkSize(32, 1);
4414 ChunkExpr = IntegerLiteral::Create(CGF.getContext(), ChunkSize,
4415 CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
4416 SourceLocation());
Gheorghe-Teodor Bercea8233af92018-09-27 20:29:00 +00004417}
Alexey Bataev60705422018-10-30 15:50:12 +00004418
/// For a target-based directive, rewires lambdas that were captured by the
/// directive so their by-reference captures (including 'this') point at the
/// device-side copies of the captured variables. Each qualifying lambda's
/// capture fields are overwritten with the addresses of the corresponding
/// local variables of the target region.
void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         " Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    // Only lambda objects need adjustment; skip everything else.
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    // Form an lvalue for the lambda object itself, loading through the
    // reference first when the capture is a reference.
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    // Map each captured variable (and 'this') to its field in the lambda.
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    // Redirect the lambda's 'this' capture to the region's CXXThis value.
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    // Redirect every by-reference capture that the target region also
    // captures to the region-local address of that variable.
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar(); // NOTE: shadows outer VD.
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      // If the local is itself a reference, store the referenced address.
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}
4470
/// Get number of SMs and number of blocks per SM.
/// Returns {number of SMs, max resident blocks per SM} for the NVPTX target,
/// preferring values given explicitly via -fopenmp-cuda-number-of-sm /
/// -fopenmp-cuda-blocks-per-sm and otherwise deriving both from the target's
/// CUDA architecture feature.
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
  // Value-initialized to {0, 0}; a zero means "not specified by the user".
  std::pair<unsigned, unsigned> Data;
  if (CGM.getLangOpts().OpenMPCUDANumSMs)
    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
  // Only a fully-specified pair short-circuits the architecture lookup.
  // NOTE(review): if just one of the two flags is set, its value is
  // discarded in favor of the per-arch defaults below - confirm intended.
  if (Data.first && Data.second)
    return Data;
  // Derive the pair from the sm_* feature of the NVPTX target.
  if (CGM.getTarget().hasFeature("ptx")) {
    llvm::StringMap<bool> Features;
    CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
                                   CGM.getTarget().getTargetOpts().CPU,
                                   CGM.getTarget().getTargetOpts().Features);
    for (const auto &Feature : Features) {
      if (Feature.getValue()) {
        switch (StringToCudaArch(Feature.getKey())) {
        case CudaArch::SM_20:
        case CudaArch::SM_21:
        case CudaArch::SM_30:
        case CudaArch::SM_32:
        case CudaArch::SM_35:
        case CudaArch::SM_37:
        case CudaArch::SM_50:
        case CudaArch::SM_52:
        case CudaArch::SM_53:
          return {16, 16};
        case CudaArch::SM_60:
        case CudaArch::SM_61:
        case CudaArch::SM_62:
          return {56, 32};
        case CudaArch::SM_70:
        case CudaArch::SM_72:
        case CudaArch::SM_75:
          return {84, 32};
        // AMD GFX archs and unknown archs do not map to an NVPTX SM count;
        // keep scanning the remaining features.
        case CudaArch::GFX600:
        case CudaArch::GFX601:
        case CudaArch::GFX700:
        case CudaArch::GFX701:
        case CudaArch::GFX702:
        case CudaArch::GFX703:
        case CudaArch::GFX704:
        case CudaArch::GFX801:
        case CudaArch::GFX802:
        case CudaArch::GFX803:
        case CudaArch::GFX810:
        case CudaArch::GFX900:
        case CudaArch::GFX902:
        case CudaArch::GFX904:
        case CudaArch::GFX906:
        case CudaArch::GFX909:
        case CudaArch::UNKNOWN:
          break;
        case CudaArch::LAST:
          llvm_unreachable("Unexpected Cuda arch.");
        }
      }
    }
  }
  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}
4532
/// Module-level finalization of the NVPTX OpenMP runtime. Lays out all
/// globalized records collected during codegen into two union-typed backing
/// buffers - one in CUDA shared memory (for records small enough to fit) and
/// one in global memory, replicated per SM and per resident block - then
/// replaces each record's placeholder buffer with the real storage and
/// delegates to the base-class clear().
void CGOpenMPRuntimeNVPTX::clear() {
  if (!GlobalizedRecords.empty()) {
    ASTContext &C = CGM.getContext();
    // Records routed to global vs. shared memory, matched one-to-one with
    // the fields added to StaticRD / SharedStaticRD below.
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> GlobalRecs;
    llvm::SmallVector<const GlobalPtrSizeRecsTy *, 4> SharedRecs;
    // Unions so all kernels overlay the same storage.
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    RecordDecl *SharedStaticRD = C.buildImplicitRecord(
        "_shared_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    SharedStaticRD->startDefinition();
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      // Compute the total size of this group's records, laying them out
      // back-to-back with each record aligned to its own alignment.
      unsigned Size = 0;
      unsigned RecAlignment = 0;
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = C.getRecordType(RD);
        unsigned Alignment = C.getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize = C.getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      // Model the group's storage as a char array field of the union.
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
      // Small groups go to (fast, limited) CUDA shared memory.
      const bool UseSharedMemory = Size <= SharedMemorySize;
      auto *Field =
          FieldDecl::Create(C, UseSharedMemory ? SharedStaticRD : StaticRD,
                            SourceLocation(), SourceLocation(), nullptr, SubTy,
                            C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
                            /*BW=*/nullptr, /*Mutable=*/false,
                            /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (UseSharedMemory) {
        SharedStaticRD->addDecl(Field);
        SharedRecs.push_back(&Records);
      } else {
        StaticRD->addDecl(Field);
        GlobalRecs.push_back(&Records);
      }
      // Patch the size/mode globals the device runtime reads at kernel init.
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
      Records.UseSharedMemory->setInitializer(
          llvm::ConstantInt::get(CGM.Int16Ty, UseSharedMemory ? 1 : 0));
    }
    // Emit the shared-memory buffer and redirect its records to it.
    SharedStaticRD->completeDefinition();
    if (!SharedStaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(SharedStaticRD);
      llvm::Type *LLVMStaticTy = CGM.getTypes().ConvertTypeForMem(StaticTy);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMStaticTy,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::Constant::getNullValue(LLVMStaticTy),
          "_openmp_shared_static_glob_rd_$_", /*InsertBefore=*/nullptr,
          llvm::GlobalValue::NotThreadLocal,
          C.getTargetAddressSpace(LangAS::cuda_shared));
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : SharedRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
    // Emit the global-memory buffer, replicated as
    // [num SMs][blocks per SM] so concurrent blocks get distinct storage.
    StaticRD->completeDefinition();
    if (!StaticRD->field_empty()) {
      QualType StaticTy = C.getRecordType(StaticRD);
      std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
      llvm::APInt Size1(32, SMsBlockPerSM.second);
      QualType Arr1Ty =
          C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
                                 /*IndexTypeQuals=*/0);
      llvm::APInt Size2(32, SMsBlockPerSM.first);
      QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
                                               /*IndexTypeQuals=*/0);
      llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
      auto *GV = new llvm::GlobalVariable(
          CGM.getModule(), LLVMArr2Ty,
          /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
          llvm::Constant::getNullValue(LLVMArr2Ty),
          "_openmp_static_glob_rd_$_");
      auto *Replacement = llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(
          GV, CGM.VoidPtrTy);
      for (const GlobalPtrSizeRecsTy *Rec : GlobalRecs) {
        Rec->Buffer->replaceAllUsesWith(Replacement);
        Rec->Buffer->eraseFromParent();
      }
    }
  }
  CGOpenMPRuntime::clear();
}