//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"
#include "clang/AST/StmtVisitor.h"
#include "clang/Basic/Cuda.h"
#include "llvm/ADT/SmallPtrSet.h"

using namespace clang;
using namespace CodeGen;
namespace {
enum OpenMPRTLFunctionNVPTX {
  /// Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// Call to void __kmpc_kernel_prepare_parallel(void *outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// Call to bool __kmpc_kernel_parallel(void **outlined_function,
  /// int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// Call to __kmpc_nvptx_simd_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_simd_reduce_nowait,
  /// Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait,
  /// Call to void __kmpc_data_sharing_init_stack();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack,
  /// Call to void __kmpc_data_sharing_init_stack_spmd();
  OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd,
  /// Call to void* __kmpc_data_sharing_push_stack(size_t size,
  /// int16_t UseSharedMemory);
  OMPRTL_NVPTX__kmpc_data_sharing_push_stack,
  /// Call to void __kmpc_data_sharing_pop_stack(void *a);
  OMPRTL_NVPTX__kmpc_data_sharing_pop_stack,
  /// Call to void __kmpc_begin_sharing_variables(void ***args,
  /// size_t n_args);
  OMPRTL_NVPTX__kmpc_begin_sharing_variables,
  /// Call to void __kmpc_end_sharing_variables();
  OMPRTL_NVPTX__kmpc_end_sharing_variables,
  /// Call to void __kmpc_get_shared_variables(void ***GlobalArgs);
  OMPRTL_NVPTX__kmpc_get_shared_variables,
  /// Call to uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_parallel_level,
  /// Call to int8_t __kmpc_is_spmd_exec_mode();
  OMPRTL_NVPTX__kmpc_is_spmd_exec_mode,
  /// Call to void __kmpc_get_team_static_memory(const void *buf, size_t size,
  /// int16_t is_shared, const void **res);
  OMPRTL_NVPTX__kmpc_get_team_static_memory,
  /// Call to void __kmpc_restore_team_static_memory(int16_t is_shared);
  OMPRTL_NVPTX__kmpc_restore_team_static_memory,
};
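
// Illustrative sketch (added for exposition, not part of the original
// source): in generic (non-SPMD) mode the emitted kernel drives the entry
// points above roughly as follows, matching emitNonSPMDEntryHeader/Footer
// later in this file:
//
//   if (tid < thread_limit)              // .worker
//     worker();                          // loops on __kmpc_kernel_parallel
//   else if (tid == master_tid) {        // .master
//     __kmpc_kernel_init(thread_limit, /*RequiresOMPRuntime=*/1);
//     __kmpc_data_sharing_init_stack();
//     ... sequential target region body ...
//     __kmpc_kernel_deinit(/*IsOMPRuntimeInitialized=*/1);
//   }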

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee = nullptr;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee = nullptr;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional = false;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

/// A class to track the execution mode when codegening directives within
/// a target region. The appropriate mode (SPMD|NON-SPMD) is set on entry
/// to the target region and used by containing directives such as 'parallel'
/// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, bool IsSPMD)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = IsSPMD ? CGOpenMPRuntimeNVPTX::EM_SPMD
                  : CGOpenMPRuntimeNVPTX::EM_NonSPMD;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 128,
};
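
// Worked example (added): with WarpSize = 32 and LaneIDBits = 5, a thread id
// decomposes as tid = warp_id * WarpSize + lane_id, i.e.
//   warp_id = tid >> LaneIDBits   and   lane_id = tid & LaneIDMask.
// For tid = 70: warp_id = 70 >> 5 = 2 and lane_id = 70 & 31 = 6.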

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};

typedef std::pair<CharUnits /*Align*/, const ValueDecl *> VarsDataTy;
static bool stable_sort_comparator(const VarsDataTy P1, const VarsDataTy P2) {
  return P1.first > P2.first;
}

static RecordDecl *buildRecordForGlobalizedVars(
    ASTContext &C, ArrayRef<const ValueDecl *> EscapedDecls,
    ArrayRef<const ValueDecl *> EscapedDeclsForTeams,
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields) {
  if (EscapedDecls.empty() && EscapedDeclsForTeams.empty())
    return nullptr;
  SmallVector<VarsDataTy, 4> GlobalizedVars;
  for (const ValueDecl *D : EscapedDecls)
    GlobalizedVars.emplace_back(
        CharUnits::fromQuantity(std::max(
            C.getDeclAlign(D).getQuantity(),
            static_cast<CharUnits::QuantityType>(GlobalMemoryAlignment))),
        D);
  for (const ValueDecl *D : EscapedDeclsForTeams)
    GlobalizedVars.emplace_back(C.getDeclAlign(D), D);
  std::stable_sort(GlobalizedVars.begin(), GlobalizedVars.end(),
                   stable_sort_comparator);
  // Build struct _globalized_locals_ty {
  //         /* globalized vars */[WarpSize] align (max(decl_align,
  //         GlobalMemoryAlignment))
  //         /* globalized vars */ for EscapedDeclsForTeams
  //       };
  RecordDecl *GlobalizedRD = C.buildImplicitRecord("_globalized_locals_ty");
  GlobalizedRD->startDefinition();
  llvm::SmallPtrSet<const ValueDecl *, 16> SingleEscaped(
      EscapedDeclsForTeams.begin(), EscapedDeclsForTeams.end());
  for (const auto &Pair : GlobalizedVars) {
    const ValueDecl *VD = Pair.second;
    QualType Type = VD->getType();
    if (Type->isLValueReferenceType())
      Type = C.getPointerType(Type.getNonReferenceType());
    else
      Type = Type.getNonReferenceType();
    SourceLocation Loc = VD->getLocation();
    FieldDecl *Field;
    if (SingleEscaped.count(VD)) {
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      if (VD->hasAttrs()) {
        for (specific_attr_iterator<AlignedAttr> I(VD->getAttrs().begin()),
             E(VD->getAttrs().end());
             I != E; ++I)
          Field->addAttr(*I);
      }
    } else {
      llvm::APInt ArraySize(32, WarpSize);
      Type = C.getConstantArrayType(Type, ArraySize, ArrayType::Normal, 0);
      Field = FieldDecl::Create(
          C, GlobalizedRD, Loc, Loc, VD->getIdentifier(), Type,
          C.getTrivialTypeSourceInfo(Type, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      llvm::APInt Align(32, std::max(C.getDeclAlign(VD).getQuantity(),
                                     static_cast<CharUnits::QuantityType>(
                                         GlobalMemoryAlignment)));
      Field->addAttr(AlignedAttr::CreateImplicit(
          C, AlignedAttr::GNU_aligned, /*IsAlignmentExpr=*/true,
          IntegerLiteral::Create(C, Align,
                                 C.getIntTypeForBitwidth(32, /*Signed=*/0),
                                 SourceLocation())));
    }
    GlobalizedRD->addDecl(Field);
    MappedDeclsFields.try_emplace(VD, Field);
  }
  GlobalizedRD->completeDefinition();
  return GlobalizedRD;
}
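
// Illustrative sketch (added): for two escaped parallel-region locals
//   int a; double b;
// the record built above is roughly
//   struct _globalized_locals_ty {
//     int    a[32] __attribute__((aligned(128)));  // WarpSize copies
//     double b[32] __attribute__((aligned(128)));
//   };
// whereas decls in EscapedDeclsForTeams get single, non-arrayed fields
// aligned only as their declarations require.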

/// Get the list of variables that can escape their declaration context.
class CheckVarsEscapingDeclContext final
    : public ConstStmtVisitor<CheckVarsEscapingDeclContext> {
  CodeGenFunction &CGF;
  llvm::SetVector<const ValueDecl *> EscapedDecls;
  llvm::SetVector<const ValueDecl *> EscapedVariableLengthDecls;
  llvm::SmallPtrSet<const Decl *, 4> EscapedParameters;
  RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  bool AllEscaped = false;
  bool IsForCombinedParallelRegion = false;

  void markAsEscaped(const ValueDecl *VD) {
    // Do not globalize declare target variables.
    if (!isa<VarDecl>(VD) ||
        OMPDeclareTargetDeclAttr::isDeclareTargetDeclaration(VD))
      return;
    VD = cast<ValueDecl>(VD->getCanonicalDecl());
    // Variables captured by value must be globalized.
    if (auto *CSI = CGF.CapturedStmtInfo) {
      if (const FieldDecl *FD = CSI->lookup(cast<VarDecl>(VD))) {
        // Check if we need to capture the variable that was already captured
        // by value in the outer region.
        if (!IsForCombinedParallelRegion) {
          if (!FD->hasAttrs())
            return;
          const auto *Attr = FD->getAttr<OMPCaptureKindAttr>();
          if (!Attr)
            return;
          if (!isOpenMPPrivate(
                  static_cast<OpenMPClauseKind>(Attr->getCaptureKind())) ||
              Attr->getCaptureKind() == OMPC_map)
            return;
        }
        if (!FD->getType()->isReferenceType()) {
          assert(!VD->getType()->isVariablyModifiedType() &&
                 "Parameter captured by value with variably modified type");
          EscapedParameters.insert(VD);
        } else if (!IsForCombinedParallelRegion) {
          return;
        }
      }
    }
    if ((!CGF.CapturedStmtInfo ||
         (IsForCombinedParallelRegion && CGF.CapturedStmtInfo)) &&
        VD->getType()->isReferenceType())
      // Do not globalize variables with reference type.
      return;
    if (VD->getType()->isVariablyModifiedType())
      EscapedVariableLengthDecls.insert(VD);
    else
      EscapedDecls.insert(VD);
  }

  void VisitValueDecl(const ValueDecl *VD) {
    if (VD->getType()->isLValueReferenceType())
      markAsEscaped(VD);
    if (const auto *VarD = dyn_cast<VarDecl>(VD)) {
      if (!isa<ParmVarDecl>(VarD) && VarD->hasInit()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = VD->getType()->isLValueReferenceType();
        Visit(VarD->getInit());
        AllEscaped = SavedAllEscaped;
      }
    }
  }
  void VisitOpenMPCapturedStmt(const CapturedStmt *S,
                               ArrayRef<OMPClause *> Clauses,
                               bool IsCombinedParallelRegion) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        bool SavedIsForCombinedParallelRegion = IsForCombinedParallelRegion;
        if (IsCombinedParallelRegion) {
          // Check if the variable is privatized in the combined construct and
          // those private copies must be shared in the inner parallel
          // directive.
          IsForCombinedParallelRegion = false;
          for (const OMPClause *C : Clauses) {
            if (!isOpenMPPrivate(C->getClauseKind()) ||
                C->getClauseKind() == OMPC_reduction ||
                C->getClauseKind() == OMPC_linear ||
                C->getClauseKind() == OMPC_private)
              continue;
            ArrayRef<const Expr *> Vars;
            if (const auto *PC = dyn_cast<OMPFirstprivateClause>(C))
              Vars = PC->getVarRefs();
            else if (const auto *PC = dyn_cast<OMPLastprivateClause>(C))
              Vars = PC->getVarRefs();
            else
              llvm_unreachable("Unexpected clause.");
            for (const auto *E : Vars) {
              const Decl *D =
                  cast<DeclRefExpr>(E)->getDecl()->getCanonicalDecl();
              if (D == VD->getCanonicalDecl()) {
                IsForCombinedParallelRegion = true;
                break;
              }
            }
            if (IsForCombinedParallelRegion)
              break;
          }
        }
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
        IsForCombinedParallelRegion = SavedIsForCombinedParallelRegion;
      }
    }
  }

  void buildRecordForGlobalizedVars(bool IsInTTDRegion) {
    assert(!GlobalizedRD &&
           "Record for globalized variables is built already.");
    ArrayRef<const ValueDecl *> EscapedDeclsForParallel, EscapedDeclsForTeams;
    if (IsInTTDRegion)
      EscapedDeclsForTeams = EscapedDecls.getArrayRef();
    else
      EscapedDeclsForParallel = EscapedDecls.getArrayRef();
    GlobalizedRD = ::buildRecordForGlobalizedVars(
        CGF.getContext(), EscapedDeclsForParallel, EscapedDeclsForTeams,
        MappedDeclsFields);
  }

public:
  CheckVarsEscapingDeclContext(CodeGenFunction &CGF) : CGF(CGF) {}
  virtual ~CheckVarsEscapingDeclContext() = default;
  void VisitDeclStmt(const DeclStmt *S) {
    if (!S)
      return;
    for (const Decl *D : S->decls())
      if (const auto *VD = dyn_cast_or_null<ValueDecl>(D))
        VisitValueDecl(VD);
  }
  void VisitOMPExecutableDirective(const OMPExecutableDirective *D) {
    if (!D)
      return;
    if (!D->hasAssociatedStmt())
      return;
    if (const auto *S =
            dyn_cast_or_null<CapturedStmt>(D->getAssociatedStmt())) {
      // Do not analyze directives that do not actually require capturing,
      // like `omp for` or `omp simd` directives.
      llvm::SmallVector<OpenMPDirectiveKind, 4> CaptureRegions;
      getOpenMPCaptureRegions(CaptureRegions, D->getDirectiveKind());
      if (CaptureRegions.size() == 1 && CaptureRegions.back() == OMPD_unknown) {
        VisitStmt(S->getCapturedStmt());
        return;
      }
      VisitOpenMPCapturedStmt(
          S, D->clauses(),
          CaptureRegions.back() == OMPD_parallel &&
              isOpenMPDistributeDirective(D->getDirectiveKind()));
    }
  }
  void VisitCapturedStmt(const CapturedStmt *S) {
    if (!S)
      return;
    for (const CapturedStmt::Capture &C : S->captures()) {
      if (C.capturesVariable() && !C.capturesVariableByCopy()) {
        const ValueDecl *VD = C.getCapturedVar();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD))
          VisitValueDecl(VD);
      }
    }
  }
  void VisitLambdaExpr(const LambdaExpr *E) {
    if (!E)
      return;
    for (const LambdaCapture &C : E->captures()) {
      if (C.capturesVariable()) {
        if (C.getCaptureKind() == LCK_ByRef) {
          const ValueDecl *VD = C.getCapturedVar();
          markAsEscaped(VD);
          if (E->isInitCapture(&C) || isa<OMPCapturedExprDecl>(VD))
            VisitValueDecl(VD);
        }
      }
    }
  }
  void VisitBlockExpr(const BlockExpr *E) {
    if (!E)
      return;
    for (const BlockDecl::Capture &C : E->getBlockDecl()->captures()) {
      if (C.isByRef()) {
        const VarDecl *VD = C.getVariable();
        markAsEscaped(VD);
        if (isa<OMPCapturedExprDecl>(VD) || VD->isInitCapture())
          VisitValueDecl(VD);
      }
    }
  }
  void VisitCallExpr(const CallExpr *E) {
    if (!E)
      return;
    for (const Expr *Arg : E->arguments()) {
      if (!Arg)
        continue;
      if (Arg->isLValue()) {
        const bool SavedAllEscaped = AllEscaped;
        AllEscaped = true;
        Visit(Arg);
        AllEscaped = SavedAllEscaped;
      } else {
        Visit(Arg);
      }
    }
    Visit(E->getCallee());
  }
  void VisitDeclRefExpr(const DeclRefExpr *E) {
    if (!E)
      return;
    const ValueDecl *VD = E->getDecl();
    if (AllEscaped)
      markAsEscaped(VD);
    if (isa<OMPCapturedExprDecl>(VD))
      VisitValueDecl(VD);
    else if (const auto *VarD = dyn_cast<VarDecl>(VD))
      if (VarD->isInitCapture())
        VisitValueDecl(VD);
  }
  void VisitUnaryOperator(const UnaryOperator *E) {
    if (!E)
      return;
    if (E->getOpcode() == UO_AddrOf) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitImplicitCastExpr(const ImplicitCastExpr *E) {
    if (!E)
      return;
    if (E->getCastKind() == CK_ArrayToPointerDecay) {
      const bool SavedAllEscaped = AllEscaped;
      AllEscaped = true;
      Visit(E->getSubExpr());
      AllEscaped = SavedAllEscaped;
    } else {
      Visit(E->getSubExpr());
    }
  }
  void VisitExpr(const Expr *E) {
    if (!E)
      return;
    bool SavedAllEscaped = AllEscaped;
    if (!E->isLValue())
      AllEscaped = false;
    for (const Stmt *Child : E->children())
      if (Child)
        Visit(Child);
    AllEscaped = SavedAllEscaped;
  }
  void VisitStmt(const Stmt *S) {
    if (!S)
      return;
    for (const Stmt *Child : S->children())
      if (Child)
        Visit(Child);
  }

  /// Returns the record that handles all the escaped local variables and is
  /// used instead of their original storage.
  const RecordDecl *getGlobalizedRecord(bool IsInTTDRegion) {
    if (!GlobalizedRD)
      buildRecordForGlobalizedVars(IsInTTDRegion);
    return GlobalizedRD;
  }

  /// Returns the field in the globalized record for the escaped variable.
  const FieldDecl *getFieldForGlobalizedVar(const ValueDecl *VD) const {
    assert(GlobalizedRD &&
           "Record for globalized variables must be generated already.");
    auto I = MappedDeclsFields.find(VD);
    if (I == MappedDeclsFields.end())
      return nullptr;
    return I->getSecond();
  }

  /// Returns the list of the escaped local variables/parameters.
  ArrayRef<const ValueDecl *> getEscapedDecls() const {
    return EscapedDecls.getArrayRef();
  }

  /// Returns the set of escaped locals that are actually parameters passed
  /// by value.
  const llvm::SmallPtrSetImpl<const Decl *> &getEscapedParameters() const {
    return EscapedParameters;
  }

  /// Returns the list of the escaped variables with the variably modified
  /// types.
  ArrayRef<const ValueDecl *> getEscapedVariableLengthDecls() const {
    return EscapedVariableLengthDecls.getArrayRef();
  }
};
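
// Example (added): in
//   #pragma omp target
//   {
//     int x = 0;
//   #pragma omp parallel
//     x = 1; // 'x' is captured by reference
//   }
// the visitor above records 'x' in EscapedDecls, so its storage is globalized
// rather than kept in the master thread's local stack.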
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSPMDExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSPMDExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateNUWSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                                "thread_limit");
}
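
// Worked example (added): for a 128-thread CTA in generic mode the thread
// limit is 128 - 32 = 96; threads 0..95 act as workers while the last warp
// (threads 96..127) is reserved for the master. In SPMD mode all 128 threads
// count toward the team.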

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateNUWSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateNUWSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM, SourceLocation Loc)
    : WorkerFn(nullptr), CGFI(CGM.getTypes().arrangeNullaryFunction()),
      Loc(Loc) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      /*placeholder=*/"_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), WorkerFn, CGFI);
  WorkerFn->setDoesNotRecurse();
}

CGOpenMPRuntimeNVPTX::ExecutionMode
CGOpenMPRuntimeNVPTX::getExecutionMode() const {
  return CurrentExecutionMode;
}

static CGOpenMPRuntimeNVPTX::DataSharingMode
getDataSharingMode(CodeGenModule &CGM) {
  return CGM.getLangOpts().OpenMPCUDAMode ? CGOpenMPRuntimeNVPTX::CUDA
                                          : CGOpenMPRuntimeNVPTX::Generic;
}
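
// Note (added, assuming the standard clang driver option): OpenMPCUDAMode
// corresponds to -fopenmp-cuda-mode; in CUDA mode variables are shared only
// as CUDA itself allows, while Generic mode maintains the data-sharing stack
// via the __kmpc_data_sharing_* entry points above.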

/// Checks if \p Body is a CompoundStmt and returns its single child statement
/// iff there is exactly one; otherwise returns \p Body itself.
static const Stmt *getSingleCompoundChild(const Stmt *Body) {
  if (const auto *C = dyn_cast<CompoundStmt>(Body))
    if (C->size() == 1)
      return C->body_front();
  return Body;
}
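
// Example (added): for a body of the form '{ #pragma omp parallel ... }' this
// returns the nested parallel directive itself, letting the SPMD detection
// below look through a single wrapping CompoundStmt.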

/// Check if the parallel directive has an 'if' clause with a non-constant or
/// false condition, or a 'num_threads' clause; such directives must run in
/// non-SPMD mode.
static bool hasParallelIfNumThreadsClause(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  if (D.hasClausesOfKind<OMPNumThreadsClause>())
    return true;
  for (const auto *C : D.getClausesOfKind<OMPIfClause>()) {
    OpenMPDirectiveKind NameModifier = C->getNameModifier();
    if (NameModifier != OMPD_parallel && NameModifier != OMPD_unknown)
      continue;
    const Expr *Cond = C->getCondition();
    bool Result;
    if (!Cond->EvaluateAsBooleanCondition(Result, Ctx) || !Result)
      return true;
  }
  return false;
}
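
// Examples (added):
//   #pragma omp target parallel if(parallel: cond)   // non-constant cond
//   #pragma omp target parallel num_threads(n)
// both return true here and therefore force non-SPMD code generation, while
// 'if(parallel: 1)' evaluates to a constant true and does not.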

/// Check for inner (nested) SPMD construct, if any
static bool hasNestedSPMDDirective(ASTContext &Ctx,
                                   const OMPExecutableDirective &D) {
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          !hasParallelIfNumThreadsClause(Ctx, *NestedDir))
        return true;
      if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              !hasParallelIfNumThreadsClause(Ctx, *NND))
            return true;
        }
      }
      return false;
    case OMPD_target_teams:
      return isOpenMPParallelDirective(DKind) &&
             !hasParallelIfNumThreadsClause(Ctx, *NestedDir);
    case OMPD_target_simd:
    case OMPD_target_parallel:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
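
// Example (added): a region such as
//   #pragma omp target
//   #pragma omp teams
//   #pragma omp parallel for
// is detected here as a nested SPMD pattern, provided the inner parallel
// directive carries no blocking 'if' or 'num_threads' clause.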

static bool supportsSPMDExecutionMode(ASTContext &Ctx,
                                      const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return hasNestedSPMDDirective(Ctx, D);
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    return !hasParallelIfNumThreadsClause(Ctx, D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}
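
// Example (added): '#pragma omp target teams distribute parallel for' without
// 'if'/'num_threads' clauses compiles in SPMD mode, while a bare
// '#pragma omp target' becomes SPMD only when hasNestedSPMDDirective finds a
// suitable nested parallel directive.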

/// Check if the directive is a loop-based directive with no 'ordered' clause
/// that either has no schedule clause at all or uses static scheduling.
static bool hasStaticScheduling(const OMPExecutableDirective &D) {
  assert(isOpenMPWorksharingDirective(D.getDirectiveKind()) &&
         isOpenMPLoopDirective(D.getDirectiveKind()) &&
         "Expected loop-based directive.");
  return !D.hasClausesOfKind<OMPOrderedClause>() &&
         (!D.hasClausesOfKind<OMPScheduleClause>() ||
          llvm::any_of(D.getClausesOfKind<OMPScheduleClause>(),
                       [](const OMPScheduleClause *C) {
                         return C->getScheduleKind() == OMPC_SCHEDULE_static;
                       }));
}
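
// Examples (added): 'schedule(static)' and loops with no schedule clause
// qualify as static; 'schedule(dynamic)', 'schedule(guided)', or an 'ordered'
// clause disqualify the loop from the lightweight-runtime checks below.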

/// Check for inner (nested) lightweight runtime construct, if any
static bool hasNestedLightweightDirective(ASTContext &Ctx,
                                          const OMPExecutableDirective &D) {
  assert(supportsSPMDExecutionMode(Ctx, D) && "Expected SPMD mode directive.");
  const auto *CS = D.getInnermostCapturedStmt();
  const auto *Body =
      CS->getCapturedStmt()->IgnoreContainers(/*IgnoreCaptured=*/true);
  const Stmt *ChildStmt = getSingleCompoundChild(Body);

  if (const auto *NestedDir = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
    OpenMPDirectiveKind DKind = NestedDir->getDirectiveKind();
    switch (D.getDirectiveKind()) {
    case OMPD_target:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      } else if (DKind == OMPD_teams) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPParallelDirective(DKind) &&
              isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
          if (DKind == OMPD_parallel) {
            Body = NND->getInnermostCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true);
            if (!Body)
              return false;
            ChildStmt = getSingleCompoundChild(Body);
            if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
              DKind = NND->getDirectiveKind();
              if (isOpenMPWorksharingDirective(DKind) &&
                  isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
                return true;
            }
          }
        }
      }
      return false;
    case OMPD_target_teams:
      if (isOpenMPParallelDirective(DKind) &&
          isOpenMPWorksharingDirective(DKind) && isOpenMPLoopDirective(DKind) &&
          hasStaticScheduling(*NestedDir))
        return true;
      if (DKind == OMPD_parallel) {
        Body = NestedDir->getInnermostCapturedStmt()->IgnoreContainers(
            /*IgnoreCaptured=*/true);
        if (!Body)
          return false;
        ChildStmt = getSingleCompoundChild(Body);
        if (const auto *NND = dyn_cast<OMPExecutableDirective>(ChildStmt)) {
          DKind = NND->getDirectiveKind();
          if (isOpenMPWorksharingDirective(DKind) &&
              isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NND))
            return true;
        }
      }
      return false;
    case OMPD_target_parallel:
      return isOpenMPWorksharingDirective(DKind) &&
             isOpenMPLoopDirective(DKind) && hasStaticScheduling(*NestedDir);
    case OMPD_target_teams_distribute:
    case OMPD_target_simd:
    case OMPD_target_parallel_for:
    case OMPD_target_parallel_for_simd:
    case OMPD_target_teams_distribute_simd:
    case OMPD_target_teams_distribute_parallel_for:
    case OMPD_target_teams_distribute_parallel_for_simd:
    case OMPD_parallel:
    case OMPD_for:
    case OMPD_parallel_for:
    case OMPD_parallel_sections:
    case OMPD_for_simd:
    case OMPD_parallel_for_simd:
    case OMPD_cancel:
    case OMPD_cancellation_point:
    case OMPD_ordered:
    case OMPD_threadprivate:
    case OMPD_task:
    case OMPD_simd:
    case OMPD_sections:
    case OMPD_section:
    case OMPD_single:
    case OMPD_master:
    case OMPD_critical:
    case OMPD_taskyield:
    case OMPD_barrier:
    case OMPD_taskwait:
    case OMPD_taskgroup:
    case OMPD_atomic:
    case OMPD_flush:
    case OMPD_teams:
    case OMPD_target_data:
    case OMPD_target_exit_data:
    case OMPD_target_enter_data:
    case OMPD_distribute:
    case OMPD_distribute_simd:
    case OMPD_distribute_parallel_for:
    case OMPD_distribute_parallel_for_simd:
    case OMPD_teams_distribute:
    case OMPD_teams_distribute_simd:
    case OMPD_teams_distribute_parallel_for:
    case OMPD_teams_distribute_parallel_for_simd:
    case OMPD_target_update:
    case OMPD_declare_simd:
    case OMPD_declare_target:
    case OMPD_end_declare_target:
    case OMPD_declare_reduction:
    case OMPD_taskloop:
    case OMPD_taskloop_simd:
    case OMPD_requires:
    case OMPD_unknown:
      llvm_unreachable("Unexpected directive.");
    }
  }

  return false;
}
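
// Example (added): '#pragma omp target parallel' enclosing
// '#pragma omp for schedule(static)' qualifies for the lightweight runtime;
// the same pattern with 'schedule(dynamic)' requires the full runtime.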

/// Checks if the construct supports lightweight runtime. It must be SPMD
/// construct + inner loop-based construct with static scheduling.
static bool supportsLightweightRuntime(ASTContext &Ctx,
                                       const OMPExecutableDirective &D) {
  if (!supportsSPMDExecutionMode(Ctx, D))
    return false;
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
  case OMPD_target_parallel:
    return hasNestedLightweightDirective(Ctx, D);
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
  case OMPD_target_teams_distribute_parallel_for:
  case OMPD_target_teams_distribute_parallel_for_simd:
    // (Last|First)-privates must be shared in parallel region.
    return hasStaticScheduling(D);
  case OMPD_target_simd:
  case OMPD_target_teams_distribute:
  case OMPD_target_teams_distribute_simd:
    return false;
  case OMPD_parallel:
  case OMPD_for:
  case OMPD_parallel_for:
  case OMPD_parallel_sections:
  case OMPD_for_simd:
  case OMPD_parallel_for_simd:
  case OMPD_cancel:
  case OMPD_cancellation_point:
  case OMPD_ordered:
  case OMPD_threadprivate:
  case OMPD_task:
  case OMPD_simd:
  case OMPD_sections:
  case OMPD_section:
  case OMPD_single:
  case OMPD_master:
  case OMPD_critical:
  case OMPD_taskyield:
  case OMPD_barrier:
  case OMPD_taskwait:
  case OMPD_taskgroup:
  case OMPD_atomic:
  case OMPD_flush:
  case OMPD_teams:
  case OMPD_target_data:
  case OMPD_target_exit_data:
  case OMPD_target_enter_data:
  case OMPD_distribute:
  case OMPD_distribute_simd:
  case OMPD_distribute_parallel_for:
  case OMPD_distribute_parallel_for_simd:
  case OMPD_teams_distribute:
  case OMPD_teams_distribute_simd:
  case OMPD_teams_distribute_parallel_for:
  case OMPD_teams_distribute_parallel_for_simd:
  case OMPD_target_update:
  case OMPD_declare_simd:
  case OMPD_declare_target:
  case OMPD_end_declare_target:
  case OMPD_declare_reduction:
  case OMPD_taskloop:
  case OMPD_taskloop_simd:
  case OMPD_requires:
  case OMPD_unknown:
    break;
  }
  llvm_unreachable(
      "Unknown programming model for OpenMP directive on NVPTX target.");
}

void CGOpenMPRuntimeNVPTX::emitNonSPMDKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/false);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM, D.getBeginLoc());
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.emitNonSPMDEntryHeader(CGF, EST, WST);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      auto &RT =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitNonSPMDEntryFooter(CGF, EST);
    }
  } Action(EST, WST);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!StaticGlobalized) {
    StaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
        llvm::GlobalValue::WeakAnyLinkage, nullptr,
        "_openmp_static_glob_rd$ptr");
    StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  }
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(Twine(OutlinedFn->getName(), "_worker"));

  // Create the worker function
  emitWorkerFunction(WST);
}
1173
// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::Value *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.Loc, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  llvm::Value *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  IsInTargetMasterThreadRegion = true;
  // SEQUENTIAL (MASTER) REGION START
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);

  // For data sharing, we need to initialize the stack.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(
          OMPRTL_NVPTX__kmpc_data_sharing_init_stack));

  emitGenericVarsProlog(CGF, WST.Loc);
}

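// For orientation, a sketch of the entry sequence emitted above for a
// generic-mode kernel (illustrative pseudo-IR only, not the exact output):
//
//     %tid       = <NVPTX thread id>
//     %is.worker = icmp ult i32 %tid, %thread.limit
//     br i1 %is.worker, label %.worker, label %.mastercheck
//   .worker:
//     call void @<kernel>_worker()
//     br label %.exit
//   .mastercheck:
//     %is.master = icmp eq i32 %tid, <master thread id>
//     br i1 %is.master, label %.master, label %.exit
//   .master:
//     call void @__kmpc_kernel_init(i32 %thread.limit, i16 1)
//     call void @__kmpc_data_sharing_init_stack()
//     ; ... sequential (master) region ...
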
void CGOpenMPRuntimeNVPTX::emitNonSPMDEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  emitGenericVarsEpilog(CGF);

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

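// A sketch (illustrative only) of the termination protocol emitted by the
// footer above: the master signals shutdown and then joins the workers at a
// CTA-wide barrier so no worker is left spinning in the work loop.
//
//   .termination.notifier:
//     call void @__kmpc_kernel_deinit(i16 1)
//     ; CTA-wide barrier (syncCTAThreads)
//     br label %.exit
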
void CGOpenMPRuntimeNVPTX::emitSPMDKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode, /*IsSPMD=*/true);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSPMDEntryHeader(CGF, EST, D);
      // Skip target region initialization.
      RT.setLocThreadIdInsertPt(CGF, /*AtCurrentPoint=*/true);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.clearLocThreadIdInsertPt(CGF);
      RT.emitSPMDEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  IsInTTDRegion = true;
  // Reserve a place for the globalized memory.
  GlobalizedRecords.emplace_back();
  if (!StaticGlobalized) {
    StaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/true,
        llvm::GlobalValue::WeakAnyLinkage, nullptr,
        "_openmp_static_glob_rd$ptr");
    StaticGlobalized->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
  }
  if (!KernelStaticGlobalized) {
    KernelStaticGlobalized = new llvm::GlobalVariable(
        CGM.getModule(), CGM.VoidPtrTy, /*isConstant=*/false,
        llvm::GlobalValue::InternalLinkage,
        llvm::ConstantPointerNull::get(CGM.VoidPtrTy),
        "_openmp_kernel_static_glob_rd$ptr", /*InsertBefore=*/nullptr,
        llvm::GlobalValue::NotThreadLocal,
        CGM.getContext().getTargetAddressSpace(LangAS::cuda_shared));
  }
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  IsInTTDRegion = false;
}

static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars);

void CGOpenMPRuntimeNVPTX::emitSPMDEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  CGBuilderTy &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  bool RequiresFullRuntime = CGM.getLangOpts().OpenMPCUDAForceFullRuntime ||
                             !supportsLightweightRuntime(CGF.getContext(), D);
  // Check if we have inner distribute + lastprivate|reduction clauses.
  bool RequiresDatasharing = RequiresFullRuntime;
  if (!RequiresDatasharing) {
    const OMPExecutableDirective *TD = &D;
    if (!isOpenMPTeamsDirective(TD->getDirectiveKind()) &&
        !isOpenMPParallelDirective(TD->getDirectiveKind())) {
      const Stmt *S = getSingleCompoundChild(
          TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
              /*IgnoreCaptured=*/true));
      TD = cast<OMPExecutableDirective>(S);
    }
    if (!isOpenMPDistributeDirective(TD->getDirectiveKind()) &&
        !isOpenMPParallelDirective(TD->getDirectiveKind())) {
      const Stmt *S = getSingleCompoundChild(
          TD->getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
              /*IgnoreCaptured=*/true));
      TD = cast<OMPExecutableDirective>(S);
    }
    if (isOpenMPDistributeDirective(TD->getDirectiveKind()))
      RequiresDatasharing = TD->hasClausesOfKind<OMPLastprivateClause>() ||
                            TD->hasClausesOfKind<OMPReductionClause>();
  }
  llvm::Value *Args[] = {
      getThreadLimit(CGF, /*IsInSPMDExecutionMode=*/true),
      /*RequiresOMPRuntime=*/
      Bld.getInt16(RequiresFullRuntime ? 1 : 0),
      /*RequiresDataSharing=*/Bld.getInt16(RequiresDatasharing ? 1 : 0)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);

  if (RequiresFullRuntime) {
    // For data sharing, we need to initialize the stack.
    CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
        OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd));
  }

  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);

  IsInTargetMasterThreadRegion = true;
}

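// Examples (illustrative user code, not from this file) of how the flags
// above typically come out. A directive such as
//
//   #pragma omp target teams distribute parallel for
//   for (int i = 0; i < N; ++i) a[i] += b[i];
//
// can usually run with the lightweight runtime (RequiresOMPRuntime == 0,
// RequiresDataSharing == 0), while attaching a lastprivate or reduction
// clause to the inner distribute makes RequiresDatasharing == 1 so that
// __kmpc_spmd_kernel_init sets up the data sharing machinery.
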
void CGOpenMPRuntimeNVPTX::emitSPMDEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  IsInTargetMasterThreadRegion = false;
  if (!CGF.HaveInsertPoint())
    return;

  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     bool Mode) {
  auto *GVMode =
      new llvm::GlobalVariable(CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
                               llvm::GlobalValue::WeakAnyLinkage,
                               llvm::ConstantInt::get(CGM.Int8Ty, Mode ? 0 : 1),
                               Twine(Name, "_exec_mode"));
  CGM.addCompilerUsedGlobal(GVMode);
}

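// For a kernel entry named, say, '__omp_offloading_42_foo_l7' (hypothetical
// name), the global emitted above looks roughly like:
//
//   @__omp_offloading_42_foo_l7_exec_mode = weak constant i8 1
//
// where 0 denotes SPMD mode and 1 denotes generic mode; the offload plugin
// reads this value to pick the launch geometry before starting the kernel.
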
void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, WST.CGFI, {},
                    WST.Loc, WST.Loc);
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *WorkID = Bld.CreateLoad(WorkFn);
  llvm::Value *ShouldTerminate = Bld.CreateIsNull(WorkID, "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (llvm::Function *W : Work) {
    // Try to match this outlined function.
    llvm::Value *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes two arguments:
    // - the parallelism level;
    // - the thread ID.
    emitCall(CGF, WST.Loc, W,
             {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }
  // Default case: call to outlined function through pointer if the target
  // region makes a declare target call that may contain an orphaned parallel
  // directive.
  auto *ParallelFnTy =
      llvm::FunctionType::get(CGM.VoidTy, {CGM.Int16Ty, CGM.Int32Ty},
                              /*isVarArg=*/false)
          ->getPointerTo();
  llvm::Value *WorkFnCast = Bld.CreateBitCast(WorkID, ParallelFnTy);
  // Insert call to work function via shared wrapper. The shared
  // wrapper takes two arguments:
  // - the parallelism level;
  // - the thread ID.
  emitCall(CGF, WST.Loc, WorkFnCast,
           {Bld.getInt16(/*ParallelLevel=*/0), getThreadID(CGF, WST.Loc)});
  // Go to end of parallel region.
  CGF.EmitBranch(TerminateBB);

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

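// As an illustration (hypothetical user code), a target region such as
//
//   #pragma omp target teams
//   {
//     /* sequential code run by the master thread */
//     #pragma omp parallel
//     { /* work executed by the workers via the loop above */ }
//   }
//
// compiles in generic mode: the master publishes the wrapper of the outlined
// parallel function through __kmpc_kernel_prepare_parallel, wakes the workers
// at the CTA barrier, and each activated worker matches the published pointer
// against the candidates in 'Work' before calling it.
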
/// Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function,
    /// int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy, CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    auto *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_simd_reduce_nowait: {
    // Build int32_t kmpc_nvptx_simd_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_simd_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack: {
    /// Build void __kmpc_data_sharing_init_stack();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_init_stack_spmd: {
    /// Build void __kmpc_data_sharing_init_stack_spmd();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_data_sharing_init_stack_spmd");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_push_stack: {
    // Build void *__kmpc_data_sharing_push_stack(size_t size,
    // int16_t UseSharedMemory);
    llvm::Type *TypeParams[] = {CGM.SizeTy, CGM.Int16Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidPtrTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_data_sharing_push_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_data_sharing_pop_stack: {
    // Build void __kmpc_data_sharing_pop_stack(void *a);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy,
                                      /*Name=*/"__kmpc_data_sharing_pop_stack");
    break;
  }
  case OMPRTL_NVPTX__kmpc_begin_sharing_variables: {
    /// Build void __kmpc_begin_sharing_variables(void ***args,
    /// size_t n_args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo(), CGM.SizeTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_begin_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_sharing_variables: {
    /// Build void __kmpc_end_sharing_variables();
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_sharing_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_shared_variables: {
    /// Build void __kmpc_get_shared_variables(void ***GlobalArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy->getPointerTo()};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_shared_variables");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_level: {
    // Build uint16_t __kmpc_parallel_level(ident_t *loc, kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    auto *FnTy =
        llvm::FunctionType::get(CGM.Int16Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_parallel_level");
    break;
  }
  case OMPRTL_NVPTX__kmpc_is_spmd_exec_mode: {
    // Build int8_t __kmpc_is_spmd_exec_mode();
    auto *FnTy = llvm::FunctionType::get(CGM.Int8Ty, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_is_spmd_exec_mode");
    break;
  }
  case OMPRTL_NVPTX__kmpc_get_team_static_memory: {
    // Build void __kmpc_get_team_static_memory(const void *buf, size_t size,
    // int16_t is_shared, const void **res);
    llvm::Type *TypeParams[] = {CGM.VoidPtrTy, CGM.SizeTy, CGM.Int16Ty,
                                CGM.VoidPtrPtrTy};
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_get_team_static_memory");
    break;
  }
  case OMPRTL_NVPTX__kmpc_restore_team_static_memory: {
    // Build void __kmpc_restore_team_static_memory(int16_t is_shared);
    auto *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, CGM.Int16Ty, /*isVarArg=*/false);
    RTLFn =
        CGM.CreateRuntimeFunction(FnTy, "__kmpc_restore_team_static_memory");
    break;
  }
  }
  return RTLFn;
}

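// For reference (illustrative IR, on the assumption of a 64-bit size_t on
// the device), the declarations produced for a few of the cases above look
// like:
//
//   declare void @__kmpc_kernel_init(i32, i16)
//   declare i32 @__kmpc_shuffle_int32(i32, i16, i16)
//   declare i8* @__kmpc_data_sharing_push_stack(i64, i16)
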
void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t,
                                              llvm::GlobalValue::LinkageTypes) {
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!isa<llvm::Function>(Addr))
    return;
  llvm::Module &M = CGM.getModule();
  llvm::LLVMContext &Ctx = CGM.getLLVMContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(Addr), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

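// The resulting module-level metadata (illustrative; the function type
// matches the actual entry signature) marks the entry point as a kernel for
// the NVPTX backend:
//
//   !nvvm.annotations = !{!0}
//   !0 = !{void ()* @<kernel entry>, !"kernel", i32 1}
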
void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  bool Mode = supportsSPMDExecutionMode(CGM.getContext(), D);
  if (Mode)
    emitSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
  else
    emitNonSPMDKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM, "_", "$") {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of SPMD mode and L0 parallel.
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    bool &IsInParallelRegion;
    bool PrevIsInParallelRegion;

  public:
    NVPTXPrePostActionTy(bool &IsInParallelRegion)
        : IsInParallelRegion(IsInParallelRegion) {}
    void Enter(CodeGenFunction &CGF) override {
      PrevIsInParallelRegion = IsInParallelRegion;
      IsInParallelRegion = true;
    }
    void Exit(CodeGenFunction &CGF) override {
      IsInParallelRegion = PrevIsInParallelRegion;
    }
  } Action(IsInParallelRegion);
  CodeGen.setAction(Action);
  bool PrevIsInTTDRegion = IsInTTDRegion;
  IsInTTDRegion = false;
  bool PrevIsInTargetMasterThreadRegion = IsInTargetMasterThreadRegion;
  IsInTargetMasterThreadRegion = false;
  auto *OutlinedFun =
      cast<llvm::Function>(CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  IsInTargetMasterThreadRegion = PrevIsInTargetMasterThreadRegion;
  IsInTTDRegion = PrevIsInTTDRegion;
  if (getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD &&
      !IsInParallelRegion) {
    llvm::Function *WrapperFun =
        createParallelDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

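// A sketch of the assumed shape of the wrapper registered above (see
// createParallelDataSharingWrapper for the actual emission): it receives the
// parallelism level and thread ID from the worker loop, pulls the shared
// argument list back from the runtime, and forwards it to the real outlined
// function. Illustrative pseudo-IR:
//
//   define internal void @<outlined>_wrapper(i16 %level, i32 %tid) {
//     call void @__kmpc_get_shared_variables(i8*** %global_args)
//     ; unpack captured variables from %global_args ...
//     call void @<outlined>(i32* %tid.addr, i32* %zero.addr, ...)
//     ret void
//   }
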
/// Get list of lastprivate variables from the teams distribute ... or
/// teams {distribute ...} directives.
static void
getDistributeLastprivateVars(const OMPExecutableDirective &D,
                             llvm::SmallVectorImpl<const ValueDecl *> &Vars) {
  assert(isOpenMPTeamsDirective(D.getDirectiveKind()) &&
         "expected teams directive.");
  const OMPExecutableDirective *Dir = &D;
  if (!isOpenMPDistributeDirective(D.getDirectiveKind())) {
    if (const Stmt *S = getSingleCompoundChild(
            D.getInnermostCapturedStmt()->getCapturedStmt()->IgnoreContainers(
                /*IgnoreCaptured=*/true))) {
      Dir = dyn_cast<OMPExecutableDirective>(S);
      if (Dir && !isOpenMPDistributeDirective(Dir->getDirectiveKind()))
        Dir = nullptr;
    }
  }
  if (!Dir)
    return;
  for (const auto *C : Dir->getClausesOfKind<OMPLastprivateClause>()) {
    for (const Expr *E : C->getVarRefs()) {
      const auto *DE = cast<DeclRefExpr>(E->IgnoreParens());
      Vars.push_back(cast<ValueDecl>(DE->getDecl()->getCanonicalDecl()));
    }
  }
}

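// Example (hypothetical source) of what this helper collects: for
//
//   #pragma omp target teams distribute lastprivate(x)
//   for (int i = 0; i < N; ++i) x = f(i);
//
// 'Vars' receives the canonical declaration of 'x', which then becomes a
// field of the globalized record built in emitTeamsOutlinedFunction below.
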
llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  SourceLocation Loc = D.getBeginLoc();

  const RecordDecl *GlobalizedRD = nullptr;
  llvm::SmallVector<const ValueDecl *, 4> LastPrivates;
  llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *> MappedDeclsFields;
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    getDistributeLastprivateVars(D, LastPrivates);
    if (!LastPrivates.empty())
      GlobalizedRD = ::buildRecordForGlobalizedVars(
          CGM.getContext(), llvm::None, LastPrivates, MappedDeclsFields);
  }

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    SourceLocation &Loc;
    const RecordDecl *GlobalizedRD;
    llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
        &MappedDeclsFields;

  public:
    NVPTXPrePostActionTy(
        SourceLocation &Loc, const RecordDecl *GlobalizedRD,
        llvm::SmallDenseMap<const ValueDecl *, const FieldDecl *>
            &MappedDeclsFields)
        : Loc(Loc), GlobalizedRD(GlobalizedRD),
          MappedDeclsFields(MappedDeclsFields) {}
    void Enter(CodeGenFunction &CGF) override {
      auto &Rt =
          static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime());
      if (GlobalizedRD) {
        auto I = Rt.FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
        I->getSecond().GlobalRecord = GlobalizedRD;
        I->getSecond().MappedParams =
            llvm::make_unique<CodeGenFunction::OMPMapVars>();
        DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
        for (const auto &Pair : MappedDeclsFields) {
          assert(Pair.getFirst()->isCanonicalDecl() &&
                 "Expected canonical declaration");
          Data.insert(std::make_pair(Pair.getFirst(),
                                     MappedVarData(Pair.getSecond(),
                                                   /*IsOnePerTeam=*/true)));
        }
      }
      Rt.emitGenericVarsProlog(CGF, Loc);
    }
    void Exit(CodeGenFunction &CGF) override {
      static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
          .emitGenericVarsEpilog(CGF);
    }
  } Action(Loc, GlobalizedRD, MappedDeclsFields);
  CodeGen.setAction(Action);
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitGenericVarsProlog(CodeGenFunction &CGF,
                                                 SourceLocation Loc,
                                                 bool WithSPMDCheck) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
      getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
    return;

  CGBuilderTy &Bld = CGF.Builder;

  const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return;
  if (const RecordDecl *GlobalizedVarsRecord = I->getSecond().GlobalRecord) {
    QualType GlobalRecTy = CGM.getContext().getRecordType(GlobalizedVarsRecord);
    QualType SecGlobalRecTy;

    // Recover pointer to this function's global record. The runtime will
    // handle the specifics of the allocation of the memory.
    // Use actual memory size of the record including the padding
    // for alignment purposes.
    unsigned Alignment =
        CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
    unsigned GlobalRecordSize =
        CGM.getContext().getTypeSizeInChars(GlobalRecTy).getQuantity();
    GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
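    // Illustrative arithmetic (not from the source): for a globalized record
    // of size 12 bytes and alignment 8, GlobalRecordSize is rounded up to 16,
    // so the end of the allocation stays aligned for the next push onto the
    // data-sharing stack.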
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002055
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002056 llvm::PointerType *GlobalRecPtrTy =
2057 CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo();
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002058 llvm::Value *GlobalRecCastAddr;
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002059 llvm::Value *IsTTD = nullptr;
Alexey Bataeve4090182018-11-02 14:54:07 +00002060 if (!IsInTTDRegion &&
2061 (WithSPMDCheck ||
2062 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002063 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2064 llvm::BasicBlock *SPMDBB = CGF.createBasicBlock(".spmd");
2065 llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002066 if (I->getSecond().SecondaryGlobalRecord.hasValue()) {
2067 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2068 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2069 llvm::Value *PL = CGF.EmitRuntimeCall(
2070 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2071 {RTLoc, ThreadID});
2072 IsTTD = Bld.CreateIsNull(PL);
2073 }
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002074 llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2075 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
2076 Bld.CreateCondBr(IsSPMD, SPMDBB, NonSPMDBB);
2077 // There is no need to emit line number for unconditional branch.
2078 (void)ApplyDebugLocation::CreateEmpty(CGF);
2079 CGF.EmitBlock(SPMDBB);
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002080 Address RecPtr = Address(llvm::ConstantPointerNull::get(GlobalRecPtrTy),
2081 CharUnits::fromQuantity(Alignment));
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002082 CGF.EmitBranch(ExitBB);
2083 // There is no need to emit line number for unconditional branch.
2084 (void)ApplyDebugLocation::CreateEmpty(CGF);
2085 CGF.EmitBlock(NonSPMDBB);
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002086 llvm::Value *Size = llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize);
2087 if (const RecordDecl *SecGlobalizedVarsRecord =
2088 I->getSecond().SecondaryGlobalRecord.getValueOr(nullptr)) {
2089 SecGlobalRecTy =
2090 CGM.getContext().getRecordType(SecGlobalizedVarsRecord);
2091
2092 // Recover pointer to this function's global record. The runtime will
2093 // handle the specifics of the allocation of the memory.
2094 // Use actual memory size of the record including the padding
2095 // for alignment purposes.
2096 unsigned Alignment =
2097 CGM.getContext().getTypeAlignInChars(SecGlobalRecTy).getQuantity();
2098 unsigned GlobalRecordSize =
2099 CGM.getContext().getTypeSizeInChars(SecGlobalRecTy).getQuantity();
2100 GlobalRecordSize = llvm::alignTo(GlobalRecordSize, Alignment);
2101 Size = Bld.CreateSelect(
2102 IsTTD, llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize), Size);
2103 }
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002104 // TODO: allow the usage of shared memory to be controlled by
2105 // the user, for now, default to global.
2106 llvm::Value *GlobalRecordSizeArg[] = {
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002107 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002108 llvm::Value *GlobalRecValue =
2109 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2110 OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
2111 GlobalRecordSizeArg);
2112 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002113 GlobalRecValue, GlobalRecPtrTy);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002114 CGF.EmitBlock(ExitBB);
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002115 auto *Phi = Bld.CreatePHI(GlobalRecPtrTy,
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002116 /*NumReservedValues=*/2, "_select_stack");
2117 Phi->addIncoming(RecPtr.getPointer(), SPMDBB);
2118 Phi->addIncoming(GlobalRecCastAddr, NonSPMDBB);
2119 GlobalRecCastAddr = Phi;
2120 I->getSecond().GlobalRecordAddr = Phi;
2121 I->getSecond().IsInSPMDModeFlag = IsSPMD;
Alexey Bataeve4090182018-11-02 14:54:07 +00002122 } else if (IsInTTDRegion) {
2123 assert(GlobalizedRecords.back().Records.size() < 2 &&
2124 "Expected less than 2 globalized records: one for target and one "
2125 "for teams.");
2126 unsigned Offset = 0;
2127 for (const RecordDecl *RD : GlobalizedRecords.back().Records) {
2128 QualType RDTy = CGM.getContext().getRecordType(RD);
2129 unsigned Alignment =
2130 CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
2131 unsigned Size = CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
2132 Offset =
2133 llvm::alignTo(llvm::alignTo(Offset, Alignment) + Size, Alignment);
2134 }
2135 unsigned Alignment =
2136 CGM.getContext().getTypeAlignInChars(GlobalRecTy).getQuantity();
2137 Offset = llvm::alignTo(Offset, Alignment);
2138 GlobalizedRecords.back().Records.push_back(GlobalizedVarsRecord);
2139 ++GlobalizedRecords.back().RegionCounter;
2140 if (GlobalizedRecords.back().Records.size() == 1) {
2141 assert(StaticGlobalized &&
2142 "Static pointer must be initialized already.");
2143 Address Buffer = CGF.EmitLoadOfPointer(
2144 Address(StaticGlobalized, CGM.getPointerAlign()),
2145 CGM.getContext()
2146 .getPointerType(CGM.getContext().VoidPtrTy)
2147 .castAs<PointerType>());
2148 auto *RecSize = new llvm::GlobalVariable(
2149 CGM.getModule(), CGM.SizeTy, /*isConstant=*/true,
2150 llvm::GlobalValue::InternalLinkage, nullptr,
2151 "_openmp_static_kernel$size");
2152 RecSize->setUnnamedAddr(llvm::GlobalValue::UnnamedAddr::Global);
2153 llvm::Value *Ld = CGF.EmitLoadOfScalar(
2154 Address(RecSize, CGM.getPointerAlign()), /*Volatile=*/false,
2155 CGM.getContext().getSizeType(), Loc);
2156 llvm::Value *ResAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2157 KernelStaticGlobalized, CGM.VoidPtrPtrTy);
2158 llvm::Value *GlobalRecordSizeArg[] = {
2159 Buffer.getPointer(), Ld,
2160 llvm::ConstantInt::getNullValue(CGM.Int16Ty), ResAddr};
2161 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2162 OMPRTL_NVPTX__kmpc_get_team_static_memory),
2163 GlobalRecordSizeArg);
2164 GlobalizedRecords.back().RecSize = RecSize;
2165 }
2166 assert(KernelStaticGlobalized && "Global address must be set already.");
2167 Address FrameAddr = CGF.EmitLoadOfPointer(
2168 Address(KernelStaticGlobalized, CGM.getPointerAlign()),
2169 CGM.getContext()
2170 .getPointerType(CGM.getContext().VoidPtrTy)
2171 .castAs<PointerType>());
2172 llvm::Value *GlobalRecValue =
2173 Bld.CreateConstInBoundsGEP(FrameAddr, Offset, CharUnits::One())
2174 .getPointer();
2175 I->getSecond().GlobalRecordAddr = GlobalRecValue;
2176 I->getSecond().IsInSPMDModeFlag = nullptr;
2177 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2178 GlobalRecValue, CGF.ConvertTypeForMem(GlobalRecTy)->getPointerTo());
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002179 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002180 // TODO: allow the usage of shared memory to be controlled by
2181 // the user, for now, default to global.
2182 llvm::Value *GlobalRecordSizeArg[] = {
2183 llvm::ConstantInt::get(CGM.SizeTy, GlobalRecordSize),
2184 CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2185 llvm::Value *GlobalRecValue =
2186 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2187 OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
2188 GlobalRecordSizeArg);
2189 GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002190 GlobalRecValue, GlobalRecPtrTy);
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002191 I->getSecond().GlobalRecordAddr = GlobalRecValue;
2192 I->getSecond().IsInSPMDModeFlag = nullptr;
2193 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002194 LValue Base =
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002195 CGF.MakeNaturalAlignPointeeAddrLValue(GlobalRecCastAddr, GlobalRecTy);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002196
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002197 // Emit the "global alloca" which is a GEP from the global declaration
2198 // record using the pointer returned by the runtime.
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002199 LValue SecBase;
2200 decltype(I->getSecond().LocalVarData)::const_iterator SecIt;
2201 if (IsTTD) {
2202 SecIt = I->getSecond().SecondaryLocalVarData->begin();
2203 llvm::PointerType *SecGlobalRecPtrTy =
2204 CGF.ConvertTypeForMem(SecGlobalRecTy)->getPointerTo();
2205 SecBase = CGF.MakeNaturalAlignPointeeAddrLValue(
2206 Bld.CreatePointerBitCastOrAddrSpaceCast(
2207 I->getSecond().GlobalRecordAddr, SecGlobalRecPtrTy),
2208 SecGlobalRecTy);
2209 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002210 for (auto &Rec : I->getSecond().LocalVarData) {
2211 bool EscapedParam = I->getSecond().EscapedParameters.count(Rec.first);
2212 llvm::Value *ParValue;
2213 if (EscapedParam) {
2214 const auto *VD = cast<VarDecl>(Rec.first);
2215 LValue ParLVal =
2216 CGF.MakeAddrLValue(CGF.GetAddrOfLocalVar(VD), VD->getType());
2217 ParValue = CGF.EmitLoadOfScalar(ParLVal, Loc);
2218 }
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002219 LValue VarAddr = CGF.EmitLValueForField(Base, Rec.second.FD);
2220 // Emit VarAddr based on the lane-id if required.
2221 QualType VarTy;
2222 if (Rec.second.IsOnePerTeam) {
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002223 VarTy = Rec.second.FD->getType();
2224 } else {
2225 llvm::Value *Ptr = CGF.Builder.CreateInBoundsGEP(
2226 VarAddr.getAddress().getPointer(),
2227 {Bld.getInt32(0), getNVPTXLaneID(CGF)});
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002228 VarTy =
2229 Rec.second.FD->getType()->castAsArrayTypeUnsafe()->getElementType();
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002230 VarAddr = CGF.MakeAddrLValue(
2231 Address(Ptr, CGM.getContext().getDeclAlign(Rec.first)), VarTy,
2232 AlignmentSource::Decl);
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002233 }
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002234 Rec.second.PrivateAddr = VarAddr.getAddress();
Alexey Bataeve4090182018-11-02 14:54:07 +00002235 if (!IsInTTDRegion &&
2236 (WithSPMDCheck ||
2237 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002238 assert(I->getSecond().IsInSPMDModeFlag &&
2239 "Expected unknown execution mode or required SPMD check.");
Alexey Bataev9bfe91d2018-10-12 16:04:20 +00002240 if (IsTTD) {
2241 assert(SecIt->second.IsOnePerTeam &&
2242 "Secondary glob data must be one per team.");
2243 LValue SecVarAddr = CGF.EmitLValueForField(SecBase, SecIt->second.FD);
2244 VarAddr.setAddress(
2245 Address(Bld.CreateSelect(IsTTD, SecVarAddr.getPointer(),
2246 VarAddr.getPointer()),
2247 VarAddr.getAlignment()));
2248 Rec.second.PrivateAddr = VarAddr.getAddress();
2249 }
Alexey Bataev9ea3c382018-10-09 14:49:00 +00002250 Address GlobalPtr = Rec.second.PrivateAddr;
2251 Address LocalAddr = CGF.CreateMemTemp(VarTy, Rec.second.FD->getName());
2252 Rec.second.PrivateAddr = Address(
2253 Bld.CreateSelect(I->getSecond().IsInSPMDModeFlag,
2254 LocalAddr.getPointer(), GlobalPtr.getPointer()),
2255 LocalAddr.getAlignment());
2256 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002257 if (EscapedParam) {
2258 const auto *VD = cast<VarDecl>(Rec.first);
2259 CGF.EmitStoreOfScalar(ParValue, VarAddr);
2260 I->getSecond().MappedParams->setVarAddr(CGF, VD, VarAddr.getAddress());
2261 }
Alexey Bataev93a38d62018-10-16 00:09:06 +00002262 if (IsTTD)
2263 ++SecIt;
Alexey Bataevc99042b2018-03-15 18:10:54 +00002264 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002265 }
2266 for (const ValueDecl *VD : I->getSecond().EscapedVariableLengthDecls) {
2267 // Recover pointer to this function's global record. The runtime will
2268 // handle the specifics of the allocation of the memory.
2269 // Use actual memory size of the record including the padding
2270 // for alignment purposes.
Alexey Bataev9ff80832018-04-16 20:16:21 +00002271 CGBuilderTy &Bld = CGF.Builder;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002272 llvm::Value *Size = CGF.getTypeSize(VD->getType());
2273 CharUnits Align = CGM.getContext().getDeclAlign(VD);
2274 Size = Bld.CreateNUWAdd(
2275 Size, llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity() - 1));
2276 llvm::Value *AlignVal =
2277 llvm::ConstantInt::get(CGF.SizeTy, Align.getQuantity());
2278 Size = Bld.CreateUDiv(Size, AlignVal);
2279 Size = Bld.CreateNUWMul(Size, AlignVal);
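    // The add/udiv/mul sequence above rounds the dynamic size up to the next
    // multiple of the declaration's alignment:
    //   Size = ((Size + Align - 1) / Align) * Align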
2280 // TODO: allow the usage of shared memory to be controlled by
2281 // the user; for now, default to global.
2282 llvm::Value *GlobalRecordSizeArg[] = {
2283 Size, CGF.Builder.getInt16(/*UseSharedMemory=*/0)};
2284 llvm::Value *GlobalRecValue = CGF.EmitRuntimeCall(
2285 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_push_stack),
2286 GlobalRecordSizeArg);
2287 llvm::Value *GlobalRecCastAddr = Bld.CreatePointerBitCastOrAddrSpaceCast(
2288 GlobalRecValue, CGF.ConvertTypeForMem(VD->getType())->getPointerTo());
2289 LValue Base = CGF.MakeAddrLValue(GlobalRecCastAddr, VD->getType(),
2290 CGM.getContext().getDeclAlign(VD),
2291 AlignmentSource::Decl);
2292 I->getSecond().MappedParams->setVarAddr(CGF, cast<VarDecl>(VD),
2293 Base.getAddress());
2294 I->getSecond().EscapedVariableLengthDeclsAddrs.emplace_back(GlobalRecValue);
Alexey Bataevc99042b2018-03-15 18:10:54 +00002295 }
2296 I->getSecond().MappedParams->apply(CGF);
2297}
2298
Alexey Bataevbd8ff9b2018-08-30 18:56:11 +00002299void CGOpenMPRuntimeNVPTX::emitGenericVarsEpilog(CodeGenFunction &CGF,
2300 bool WithSPMDCheck) {
Alexey Bataev2adecff2018-09-21 14:22:53 +00002301 if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic &&
2302 getExecutionMode() != CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002303 return;
2304
Alexey Bataevc99042b2018-03-15 18:10:54 +00002305 const auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002306 if (I != FunctionGlobalizedDecls.end()) {
Alexey Bataevc99042b2018-03-15 18:10:54 +00002307 I->getSecond().MappedParams->restore(CGF);
2308 if (!CGF.HaveInsertPoint())
2309 return;
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002310 for (llvm::Value *Addr :
2311 llvm::reverse(I->getSecond().EscapedVariableLengthDeclsAddrs)) {
2312 CGF.EmitRuntimeCall(
2313 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2314 Addr);
2315 }
2316 if (I->getSecond().GlobalRecordAddr) {
Alexey Bataeve4090182018-11-02 14:54:07 +00002317 if (!IsInTTDRegion &&
2318 (WithSPMDCheck ||
2319 getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_Unknown)) {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002320 CGBuilderTy &Bld = CGF.Builder;
2321 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
2322 llvm::BasicBlock *NonSPMDBB = CGF.createBasicBlock(".non-spmd");
2323 Bld.CreateCondBr(I->getSecond().IsInSPMDModeFlag, ExitBB, NonSPMDBB);
2324 // There is no need to emit line number for unconditional branch.
2325 (void)ApplyDebugLocation::CreateEmpty(CGF);
2326 CGF.EmitBlock(NonSPMDBB);
2327 CGF.EmitRuntimeCall(
2328 createNVPTXRuntimeFunction(
2329 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2330 CGF.EmitCastToVoidPtr(I->getSecond().GlobalRecordAddr));
2331 CGF.EmitBlock(ExitBB);
Alexey Bataeve4090182018-11-02 14:54:07 +00002332 } else if (IsInTTDRegion) {
2333 assert(GlobalizedRecords.back().RegionCounter > 0 &&
2334 "region counter must be > 0.");
2335 --GlobalizedRecords.back().RegionCounter;
2336 // Emit the restore function only in the target region.
2337 if (GlobalizedRecords.back().RegionCounter == 0) {
2338 CGF.EmitRuntimeCall(
2339 createNVPTXRuntimeFunction(
2340 OMPRTL_NVPTX__kmpc_restore_team_static_memory),
2341 llvm::ConstantInt::getNullValue(CGM.Int16Ty));
2342 }
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002343 } else {
Alexey Bataev8d8e1232018-08-29 18:32:21 +00002344 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2345 OMPRTL_NVPTX__kmpc_data_sharing_pop_stack),
2346 I->getSecond().GlobalRecordAddr);
2347 }
Alexey Bataev63cc8e92018-03-20 14:45:59 +00002348 }
Alexey Bataevc99042b2018-03-15 18:10:54 +00002349 }
2350}
2351
Carlo Bertollic6872252016-04-04 15:55:02 +00002352void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
2353 const OMPExecutableDirective &D,
2354 SourceLocation Loc,
2355 llvm::Value *OutlinedFn,
2356 ArrayRef<llvm::Value *> CapturedVars) {
2357 if (!CGF.HaveInsertPoint())
2358 return;
2359
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002360 Address ZeroAddr = CGF.CreateMemTemp(
2361 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
2362 /*Name*/ ".zero.addr");
Carlo Bertollic6872252016-04-04 15:55:02 +00002363 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
2364 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Alexey Bataevb7f3cba2018-03-19 17:04:07 +00002365 OutlinedFnArgs.push_back(emitThreadIDAddress(CGF, Loc).getPointer());
Carlo Bertollic6872252016-04-04 15:55:02 +00002366 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2367 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
Alexey Bataev3c595a62017-08-14 15:01:03 +00002368 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
Carlo Bertollic6872252016-04-04 15:55:02 +00002369}
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002370
2371void CGOpenMPRuntimeNVPTX::emitParallelCall(
2372 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2373 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2374 if (!CGF.HaveInsertPoint())
2375 return;
2376
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002377 if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002378 emitSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002379 else
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002380 emitNonSPMDParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002381}
2382
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002383void CGOpenMPRuntimeNVPTX::emitNonSPMDParallelCall(
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002384 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2385 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2386 llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002387
2388 // Force inline this outlined function at its call site.
2389 Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
2390
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002391 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2392 /*DestWidth=*/32, /*Signed=*/1),
2393 ".zero.addr");
2394 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002395 // ThreadId for serialized parallels is 0.
2396 Address ThreadIDAddr = ZeroAddr;
2397 auto &&CodeGen = [this, Fn, CapturedVars, Loc, ZeroAddr, &ThreadIDAddr](
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002398 CodeGenFunction &CGF, PrePostActionTy &Action) {
2399 Action.Enter(CGF);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002400
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002401 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2402 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2403 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2404 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2405 emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
2406 };
2407 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2408 PrePostActionTy &) {
2409
2410 RegionCodeGenTy RCG(CodeGen);
2411 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2412 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2413 llvm::Value *Args[] = {RTLoc, ThreadID};
2414
2415 NVPTXActionTy Action(
2416 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2417 Args,
2418 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2419 Args);
2420 RCG.setAction(Action);
2421 RCG(CGF);
2422 };
2423
2424 auto &&L0ParallelGen = [this, CapturedVars, Fn](CodeGenFunction &CGF,
2425 PrePostActionTy &Action) {
2426 CGBuilderTy &Bld = CGF.Builder;
2427 llvm::Function *WFn = WrapperFunctionsMap[Fn];
2428 assert(WFn && "Wrapper function does not exist!");
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002429 llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);
2430
2431 // Prepare for parallel region. Indicate the outlined function.
2432 llvm::Value *Args[] = {ID, /*RequiresOMPRuntime=*/Bld.getInt16(1)};
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002433 CGF.EmitRuntimeCall(
2434 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
2435 Args);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002436
2437 // Create a private scope that will globalize the arguments
2438 // passed from the outside of the target region.
2439 CodeGenFunction::OMPPrivateScope PrivateArgScope(CGF);
2440
2441 // There's something to share.
2442 if (!CapturedVars.empty()) {
2443 // Allocate the list of references used to share the captured variables.
2444 Address SharedArgs =
2445 CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "shared_arg_refs");
2446 llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
2447
2448 llvm::Value *DataSharingArgs[] = {
2449 SharedArgsPtr,
2450 llvm::ConstantInt::get(CGM.SizeTy, CapturedVars.size())};
2451 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(
2452 OMPRTL_NVPTX__kmpc_begin_sharing_variables),
2453 DataSharingArgs);
2454
2455 // Store variable address in a list of references to pass to workers.
2456 unsigned Idx = 0;
2457 ASTContext &Ctx = CGF.getContext();
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002458 Address SharedArgListAddress = CGF.EmitLoadOfPointer(
2459 SharedArgs, Ctx.getPointerType(Ctx.getPointerType(Ctx.VoidPtrTy))
2460 .castAs<PointerType>());
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002461 for (llvm::Value *V : CapturedVars) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002462 Address Dst = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
2463 CGF.getPointerSize());
2464 llvm::Value *PtrV;
Alexey Bataev17314212018-03-20 15:41:05 +00002465 if (V->getType()->isIntegerTy())
2466 PtrV = Bld.CreateIntToPtr(V, CGF.VoidPtrTy);
2467 else
2468 PtrV = Bld.CreatePointerBitCastOrAddrSpaceCast(V, CGF.VoidPtrTy);
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002469 CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
2470 Ctx.getPointerType(Ctx.VoidPtrTy));
Alexey Bataevc99042b2018-03-15 18:10:54 +00002471 ++Idx;
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002472 }
2473 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002474
2475 // Activate workers. This barrier is used by the master to signal
2476 // work for the workers.
2477 syncCTAThreads(CGF);
2478
2479 // OpenMP [2.5, Parallel Construct, p.49]
2480 // There is an implied barrier at the end of a parallel region. After the
2481 // end of a parallel region, only the master thread of the team resumes
2482 // execution of the enclosing task region.
2483 //
2484 // The master waits at this barrier until all workers are done.
2485 syncCTAThreads(CGF);
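    // Taken together, the two CTA barriers form the master/worker handshake
    // (a sketch, assuming the usual NVPTX worker state machine):
    //   master: prepare_parallel(WFn); barrier; ...workers execute...; barrier
    //   worker: barrier; load WFn; run parallel region; barrier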
2486
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002487 if (!CapturedVars.empty())
2488 CGF.EmitRuntimeCall(
2489 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_sharing_variables));
2490
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002491 // Remember for post-processing in worker loop.
Gheorghe-Teodor Bercead3dcf2f2018-03-14 14:17:45 +00002492 Work.emplace_back(WFn);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002493 };
2494
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002495 auto &&LNParallelGen = [this, Loc, &SeqGen, &L0ParallelGen](
2496 CodeGenFunction &CGF, PrePostActionTy &Action) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002497 if (IsInParallelRegion) {
2498 SeqGen(CGF, Action);
2499 } else if (IsInTargetMasterThreadRegion) {
2500 L0ParallelGen(CGF, Action);
2501 } else {
2502 // Check for master and then parallelism:
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002503 // if (__kmpc_is_spmd_exec_mode() || __kmpc_parallel_level(loc, gtid)) {
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002504 // Serialized execution.
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002505 // } else {
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002506 // Worker call.
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002507 // }
2508 CGBuilderTy &Bld = CGF.Builder;
2509 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002510 llvm::BasicBlock *SeqBB = CGF.createBasicBlock(".sequential");
2511 llvm::BasicBlock *ParallelCheckBB = CGF.createBasicBlock(".parcheck");
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002512 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
Alexey Bataev673110d2018-05-16 13:36:30 +00002513 llvm::Value *IsSPMD = Bld.CreateIsNotNull(CGF.EmitNounwindRuntimeCall(
2514 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_is_spmd_exec_mode)));
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002515 Bld.CreateCondBr(IsSPMD, SeqBB, ParallelCheckBB);
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002516 // There is no need to emit line number for unconditional branch.
2517 (void)ApplyDebugLocation::CreateEmpty(CGF);
2518 CGF.EmitBlock(ParallelCheckBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002519 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2520 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2521 llvm::Value *PL = CGF.EmitRuntimeCall(
2522 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_level),
2523 {RTLoc, ThreadID});
2524 llvm::Value *Res = Bld.CreateIsNotNull(PL);
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002525 Bld.CreateCondBr(Res, SeqBB, MasterBB);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002526 CGF.EmitBlock(SeqBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002527 SeqGen(CGF, Action);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002528 CGF.EmitBranch(ExitBB);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002529 // There is no need to emit line number for unconditional branch.
2530 (void)ApplyDebugLocation::CreateEmpty(CGF);
Jonas Hahnfeld3ca47012018-10-02 19:12:54 +00002531 CGF.EmitBlock(MasterBB);
Alexey Bataev0baba9e2018-05-25 20:16:03 +00002532 L0ParallelGen(CGF, Action);
2533 CGF.EmitBranch(ExitBB);
2534 // There is no need to emit line number for unconditional branch.
2535 (void)ApplyDebugLocation::CreateEmpty(CGF);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002536 // Emit the continuation block for code after the if.
2537 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2538 }
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002539 };
2540
Alexey Bataev9ff80832018-04-16 20:16:21 +00002541 if (IfCond) {
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002542 emitOMPIfClause(CGF, IfCond, LNParallelGen, SeqGen);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002543 } else {
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002544 CodeGenFunction::RunCleanupsScope Scope(CGF);
Alexey Bataevd7ff6d62018-05-07 14:50:05 +00002545 RegionCodeGenTy ThenRCG(LNParallelGen);
Arpith Chacko Jacobbb36fe82017-01-10 15:42:51 +00002546 ThenRCG(CGF);
2547 }
2548}
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002549
Alexey Bataev4065b9a2018-06-21 20:26:33 +00002550void CGOpenMPRuntimeNVPTX::emitSPMDParallelCall(
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002551 CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
2552 ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
2553 // Just call the outlined function to execute the parallel region.
2554 // OutlinedFn(&GTid, &zero, CapturedStruct);
2555 //
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002556 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
Carlo Bertolli79712092018-02-28 20:48:35 +00002557
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002558 Address ZeroAddr = CGF.CreateMemTemp(CGF.getContext().getIntTypeForBitwidth(
2559 /*DestWidth=*/32, /*Signed=*/1),
2560 ".zero.addr");
Carlo Bertolli79712092018-02-28 20:48:35 +00002561 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
Alexey Bataev8521ff62018-07-25 20:03:01 +00002562 // ThreadId for serialized parallels is 0.
2563 Address ThreadIDAddr = ZeroAddr;
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002564 auto &&CodeGen = [this, OutlinedFn, CapturedVars, Loc, ZeroAddr,
Alexey Bataev8521ff62018-07-25 20:03:01 +00002565 &ThreadIDAddr](CodeGenFunction &CGF,
2566 PrePostActionTy &Action) {
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002567 Action.Enter(CGF);
2568
2569 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
2570 OutlinedFnArgs.push_back(ThreadIDAddr.getPointer());
2571 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
2572 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
2573 emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
2574 };
2575 auto &&SeqGen = [this, &CodeGen, Loc](CodeGenFunction &CGF,
2576 PrePostActionTy &) {
2577
2578 RegionCodeGenTy RCG(CodeGen);
2579 llvm::Value *RTLoc = emitUpdateLocation(CGF, Loc);
2580 llvm::Value *ThreadID = getThreadID(CGF, Loc);
2581 llvm::Value *Args[] = {RTLoc, ThreadID};
2582
2583 NVPTXActionTy Action(
2584 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
2585 Args,
2586 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
2587 Args);
2588 RCG.setAction(Action);
2589 RCG(CGF);
2590 };
2591
2592 if (IsInTargetMasterThreadRegion) {
Alexey Bataev8521ff62018-07-25 20:03:01 +00002593 // In the target master thread region we need to use the real thread id.
2594 ThreadIDAddr = emitThreadIDAddress(CGF, Loc);
Alexey Bataevbf5c8482018-05-10 18:32:08 +00002595 RegionCodeGenTy RCG(CodeGen);
2596 RCG(CGF);
2597 } else {
2598 // If we are not in the target region, it is definitely L2 parallelism or
2599 // more, because in SPMD mode we always have an L1 parallel level, so we don't
2600 // need to check for orphaned directives.
2601 RegionCodeGenTy RCG(SeqGen);
2602 RCG(CGF);
2603 }
Arpith Chacko Jacob44a87c92017-01-18 19:35:00 +00002604}
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002605
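// Emit a critical region for NVPTX by serializing the team: each thread
// waits for its turn, runs the body, and joins a CTA barrier. Roughly (an
// illustrative sketch, not the emitted IR):
//
//   for (counter = 0; counter < team_width; ++counter) {
//     if (thread_id == counter)
//       <critical body>;
//     barrier(); // getNVPTXCTABarrier
//   }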
Alexey Bataev504fc2d2018-05-07 17:23:05 +00002606void CGOpenMPRuntimeNVPTX::emitCriticalRegion(
2607 CodeGenFunction &CGF, StringRef CriticalName,
2608 const RegionCodeGenTy &CriticalOpGen, SourceLocation Loc,
2609 const Expr *Hint) {
2610 llvm::BasicBlock *LoopBB = CGF.createBasicBlock("omp.critical.loop");
2611 llvm::BasicBlock *TestBB = CGF.createBasicBlock("omp.critical.test");
2612 llvm::BasicBlock *SyncBB = CGF.createBasicBlock("omp.critical.sync");
2613 llvm::BasicBlock *BodyBB = CGF.createBasicBlock("omp.critical.body");
2614 llvm::BasicBlock *ExitBB = CGF.createBasicBlock("omp.critical.exit");
2615
2616 // Fetch team-local id of the thread.
2617 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
2618
2619 // Get the width of the team.
2620 llvm::Value *TeamWidth = getNVPTXNumThreads(CGF);
2621
2622 // Initialize the counter variable for the loop.
2623 QualType Int32Ty =
2624 CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/0);
2625 Address Counter = CGF.CreateMemTemp(Int32Ty, "critical_counter");
2626 LValue CounterLVal = CGF.MakeAddrLValue(Counter, Int32Ty);
2627 CGF.EmitStoreOfScalar(llvm::Constant::getNullValue(CGM.Int32Ty), CounterLVal,
2628 /*isInit=*/true);
2629
2630 // Block checks if loop counter exceeds upper bound.
2631 CGF.EmitBlock(LoopBB);
2632 llvm::Value *CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2633 llvm::Value *CmpLoopBound = CGF.Builder.CreateICmpSLT(CounterVal, TeamWidth);
2634 CGF.Builder.CreateCondBr(CmpLoopBound, TestBB, ExitBB);
2635
2636 // Block tests which single thread should execute region, and which threads
2637 // should go straight to synchronisation point.
2638 CGF.EmitBlock(TestBB);
2639 CounterVal = CGF.EmitLoadOfScalar(CounterLVal, Loc);
2640 llvm::Value *CmpThreadToCounter =
2641 CGF.Builder.CreateICmpEQ(ThreadID, CounterVal);
2642 CGF.Builder.CreateCondBr(CmpThreadToCounter, BodyBB, SyncBB);
2643
2644 // Block emits the body of the critical region.
2645 CGF.EmitBlock(BodyBB);
2646
2647 // Output the critical statement.
2648 CriticalOpGen(CGF);
2649
2650 // After executing the body of the critical region, the single executing
2651 // thread jumps to the synchronisation point.
2652 // The block waits for all threads in the current team to finish, then
2653 // increments the counter variable and returns to the loop.
2654 CGF.EmitBlock(SyncBB);
2655 getNVPTXCTABarrier(CGF);
2656
2657 llvm::Value *IncCounterVal =
2658 CGF.Builder.CreateNSWAdd(CounterVal, CGF.Builder.getInt32(1));
2659 CGF.EmitStoreOfScalar(IncCounterVal, CounterLVal);
2660 CGF.EmitBranch(LoopBB);
2661
2662 // Block that is reached when all threads in the team complete the region.
2663 CGF.EmitBlock(ExitBB, /*IsFinished=*/true);
2664}
2665
Alexey Bataevb2575932018-01-04 20:18:55 +00002666/// Cast value to the specified type.
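/// Equal-sized values are bitcast, integers of different widths use an
/// integer cast, and any remaining combination is funneled through a memory
/// temporary: the value is stored as ValTy and reloaded as CastTy.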
Alexey Bataeva453f362018-03-19 17:53:56 +00002667static llvm::Value *castValueToType(CodeGenFunction &CGF, llvm::Value *Val,
2668 QualType ValTy, QualType CastTy,
2669 SourceLocation Loc) {
2670 assert(!CGF.getContext().getTypeSizeInChars(CastTy).isZero() &&
2671 "Cast type must be sized.");
2672 assert(!CGF.getContext().getTypeSizeInChars(ValTy).isZero() &&
2673 "Val type must be sized.");
2674 llvm::Type *LLVMCastTy = CGF.ConvertTypeForMem(CastTy);
2675 if (ValTy == CastTy)
Alexey Bataevb2575932018-01-04 20:18:55 +00002676 return Val;
Alexey Bataeva453f362018-03-19 17:53:56 +00002677 if (CGF.getContext().getTypeSizeInChars(ValTy) ==
2678 CGF.getContext().getTypeSizeInChars(CastTy))
2679 return CGF.Builder.CreateBitCast(Val, LLVMCastTy);
2680 if (CastTy->isIntegerType() && ValTy->isIntegerType())
2681 return CGF.Builder.CreateIntCast(Val, LLVMCastTy,
2682 CastTy->hasSignedIntegerRepresentation());
2683 Address CastItem = CGF.CreateMemTemp(CastTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00002684 Address ValCastItem = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2685 CastItem, Val->getType()->getPointerTo(CastItem.getAddressSpace()));
Alexey Bataeva453f362018-03-19 17:53:56 +00002686 CGF.EmitStoreOfScalar(Val, ValCastItem, /*Volatile=*/false, ValTy);
2687 return CGF.EmitLoadOfScalar(CastItem, /*Volatile=*/false, CastTy, Loc);
Alexey Bataevb2575932018-01-04 20:18:55 +00002688}
2689
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002690/// This function creates calls to one of two shuffle functions to copy
2691/// variables between lanes in a warp.
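/// For example (illustrative), an offset of 16 makes every lane read the
/// value held by the lane 16 positions above it; applied with offsets 16, 8,
/// 4, 2, 1 this yields a butterfly reduction across the warp.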
2692static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002693 llvm::Value *Elem,
Alexey Bataeva453f362018-03-19 17:53:56 +00002694 QualType ElemType,
2695 llvm::Value *Offset,
2696 SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00002697 CodeGenModule &CGM = CGF.CGM;
2698 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002699 CGOpenMPRuntimeNVPTX &RT =
2700 *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));
2701
Alexey Bataeva453f362018-03-19 17:53:56 +00002702 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2703 assert(Size.getQuantity() <= 8 &&
2704 "Unsupported bitwidth in shuffle instruction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002705
Alexey Bataeva453f362018-03-19 17:53:56 +00002706 OpenMPRTLFunctionNVPTX ShuffleFn = Size.getQuantity() <= 4
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002707 ? OMPRTL_NVPTX__kmpc_shuffle_int32
2708 : OMPRTL_NVPTX__kmpc_shuffle_int64;
2709
2710 // Cast all types to 32- or 64-bit values before calling shuffle routines.
Alexey Bataeva453f362018-03-19 17:53:56 +00002711 QualType CastTy = CGF.getContext().getIntTypeForBitwidth(
2712 Size.getQuantity() <= 4 ? 32 : 64, /*Signed=*/1);
2713 llvm::Value *ElemCast = castValueToType(CGF, Elem, ElemType, CastTy, Loc);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002714 llvm::Value *WarpSize =
Alexey Bataevb2575932018-01-04 20:18:55 +00002715 Bld.CreateIntCast(getNVPTXWarpSize(CGF), CGM.Int16Ty, /*isSigned=*/true);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002716
Alexey Bataev9ff80832018-04-16 20:16:21 +00002717 llvm::Value *ShuffledVal = CGF.EmitRuntimeCall(
2718 RT.createNVPTXRuntimeFunction(ShuffleFn), {ElemCast, Offset, WarpSize});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002719
Alexey Bataeva453f362018-03-19 17:53:56 +00002720 return castValueToType(CGF, ShuffledVal, CastTy, ElemType, Loc);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002721}
2722
Alexey Bataev12c62902018-06-22 19:10:38 +00002723static void shuffleAndStore(CodeGenFunction &CGF, Address SrcAddr,
2724 Address DestAddr, QualType ElemType,
2725 llvm::Value *Offset, SourceLocation Loc) {
2726 CGBuilderTy &Bld = CGF.Builder;
2727
2728 CharUnits Size = CGF.getContext().getTypeSizeInChars(ElemType);
2729 // Create the loop over the big sized data.
2730 // ptr = (void*)Elem;
2731 // ptrEnd = (void*) Elem + 1;
2732 // Step = 8;
2733 // while (ptr + Step < ptrEnd)
2734 // shuffle((int64_t)*ptr);
2735 // Step = 4;
2736 // while (ptr + Step < ptrEnd)
2737 // shuffle((int32_t)*ptr);
2738 // ...
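  // Worked example (illustrative): a 13-byte element is moved as one 8-byte,
  // one 4-byte and one 1-byte shuffle; the loop form below is only emitted
  // when a chunk size repeats, e.g. 16 bytes takes two 8-byte iterations.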
2739 Address ElemPtr = DestAddr;
2740 Address Ptr = SrcAddr;
2741 Address PtrEnd = Bld.CreatePointerBitCastOrAddrSpaceCast(
2742 Bld.CreateConstGEP(SrcAddr, 1, Size), CGF.VoidPtrTy);
2743 for (int IntSize = 8; IntSize >= 1; IntSize /= 2) {
2744 if (Size < CharUnits::fromQuantity(IntSize))
2745 continue;
2746 QualType IntType = CGF.getContext().getIntTypeForBitwidth(
2747 CGF.getContext().toBits(CharUnits::fromQuantity(IntSize)),
2748 /*Signed=*/1);
2749 llvm::Type *IntTy = CGF.ConvertTypeForMem(IntType);
2750 Ptr = Bld.CreatePointerBitCastOrAddrSpaceCast(Ptr, IntTy->getPointerTo());
2751 ElemPtr =
2752 Bld.CreatePointerBitCastOrAddrSpaceCast(ElemPtr, IntTy->getPointerTo());
2753 if (Size.getQuantity() / IntSize > 1) {
2754 llvm::BasicBlock *PreCondBB = CGF.createBasicBlock(".shuffle.pre_cond");
2755 llvm::BasicBlock *ThenBB = CGF.createBasicBlock(".shuffle.then");
2756 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".shuffle.exit");
2757 llvm::BasicBlock *CurrentBB = Bld.GetInsertBlock();
2758 CGF.EmitBlock(PreCondBB);
2759 llvm::PHINode *PhiSrc =
2760 Bld.CreatePHI(Ptr.getType(), /*NumReservedValues=*/2);
2761 PhiSrc->addIncoming(Ptr.getPointer(), CurrentBB);
2762 llvm::PHINode *PhiDest =
2763 Bld.CreatePHI(ElemPtr.getType(), /*NumReservedValues=*/2);
2764 PhiDest->addIncoming(ElemPtr.getPointer(), CurrentBB);
2765 Ptr = Address(PhiSrc, Ptr.getAlignment());
2766 ElemPtr = Address(PhiDest, ElemPtr.getAlignment());
2767 llvm::Value *PtrDiff = Bld.CreatePtrDiff(
2768 PtrEnd.getPointer(), Bld.CreatePointerBitCastOrAddrSpaceCast(
2769 Ptr.getPointer(), CGF.VoidPtrTy));
2770 Bld.CreateCondBr(Bld.CreateICmpSGT(PtrDiff, Bld.getInt64(IntSize - 1)),
2771 ThenBB, ExitBB);
2772 CGF.EmitBlock(ThenBB);
2773 llvm::Value *Res = createRuntimeShuffleFunction(
2774 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2775 IntType, Offset, Loc);
2776 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2777 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2778 ElemPtr =
2779 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2780 PhiSrc->addIncoming(Ptr.getPointer(), ThenBB);
2781 PhiDest->addIncoming(ElemPtr.getPointer(), ThenBB);
2782 CGF.EmitBranch(PreCondBB);
2783 CGF.EmitBlock(ExitBB);
2784 } else {
2785 llvm::Value *Res = createRuntimeShuffleFunction(
2786 CGF, CGF.EmitLoadOfScalar(Ptr, /*Volatile=*/false, IntType, Loc),
2787 IntType, Offset, Loc);
2788 CGF.EmitStoreOfScalar(Res, ElemPtr, /*Volatile=*/false, IntType);
2789 Ptr = Bld.CreateConstGEP(Ptr, 1, CharUnits::fromQuantity(IntSize));
2790 ElemPtr =
2791 Bld.CreateConstGEP(ElemPtr, 1, CharUnits::fromQuantity(IntSize));
2792 }
2793 Size = Size % IntSize;
2794 }
2795}
2796
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002797namespace {
2798enum CopyAction : unsigned {
2799 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
2800 // the warp using shuffle instructions.
2801 RemoteLaneToThread,
2802 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
2803 ThreadCopy,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002804 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
2805 ThreadToScratchpad,
2806 // ScratchpadToThread: Copy from a scratchpad array in global memory
2807 // containing team-reduced data to a thread's stack.
2808 ScratchpadToThread,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002809};
2810} // namespace
2811
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002812struct CopyOptionsTy {
2813 llvm::Value *RemoteLaneOffset;
2814 llvm::Value *ScratchpadIndex;
2815 llvm::Value *ScratchpadWidth;
2816};
2817
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002818/// Emit instructions to copy a Reduce list, which contains partially
2819/// aggregated values, in the specified direction.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002820static void emitReductionListCopy(
2821 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
2822 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
2823 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002824
Alexey Bataev9ff80832018-04-16 20:16:21 +00002825 CodeGenModule &CGM = CGF.CGM;
2826 ASTContext &C = CGM.getContext();
2827 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002828
Alexey Bataev9ff80832018-04-16 20:16:21 +00002829 llvm::Value *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
2830 llvm::Value *ScratchpadIndex = CopyOptions.ScratchpadIndex;
2831 llvm::Value *ScratchpadWidth = CopyOptions.ScratchpadWidth;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002832
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002833 // Iterates, element-by-element, through the source Reduce list and
2834 // makes a copy.
2835 unsigned Idx = 0;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002836 unsigned Size = Privates.size();
Alexey Bataev9ff80832018-04-16 20:16:21 +00002837 for (const Expr *Private : Privates) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002838 Address SrcElementAddr = Address::invalid();
2839 Address DestElementAddr = Address::invalid();
2840 Address DestElementPtrAddr = Address::invalid();
2841 // Should we shuffle in an element from a remote lane?
2842 bool ShuffleInElement = false;
2843 // Set to true to update the pointer in the dest Reduce list to a
2844 // newly created element.
2845 bool UpdateDestListPtr = false;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002846 // Increment the src or dest pointer to the scratchpad, for each
2847 // new element.
2848 bool IncrScratchpadSrc = false;
2849 bool IncrScratchpadDest = false;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002850
2851 switch (Action) {
2852 case RemoteLaneToThread: {
2853 // Step 1.1: Get the address for the src element in the Reduce list.
2854 Address SrcElementPtrAddr =
2855 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002856 SrcElementAddr = CGF.EmitLoadOfPointer(
2857 SrcElementPtrAddr,
2858 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002859
2860 // Step 1.2: Create a temporary to store the element in the destination
2861 // Reduce list.
2862 DestElementPtrAddr =
2863 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2864 DestElementAddr =
2865 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2866 ShuffleInElement = true;
2867 UpdateDestListPtr = true;
2868 break;
2869 }
2870 case ThreadCopy: {
2871 // Step 1.1: Get the address for the src element in the Reduce list.
2872 Address SrcElementPtrAddr =
2873 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002874 SrcElementAddr = CGF.EmitLoadOfPointer(
2875 SrcElementPtrAddr,
2876 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002877
2878 // Step 1.2: Get the address for dest element. The destination
2879 // element has already been created on the thread's stack.
2880 DestElementPtrAddr =
2881 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002882 DestElementAddr = CGF.EmitLoadOfPointer(
2883 DestElementPtrAddr,
2884 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002885 break;
2886 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002887 case ThreadToScratchpad: {
2888 // Step 1.1: Get the address for the src element in the Reduce list.
2889 Address SrcElementPtrAddr =
2890 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
Alexey Bataevb2575932018-01-04 20:18:55 +00002891 SrcElementAddr = CGF.EmitLoadOfPointer(
2892 SrcElementPtrAddr,
2893 C.getPointerType(Private->getType())->castAs<PointerType>());
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002894
2895 // Step 1.2: Get the address for dest element:
2896 // address = base + index * ElementSizeInChars.
Alexey Bataeve290ec02018-04-06 16:03:36 +00002897 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
Alexey Bataev9ff80832018-04-16 20:16:21 +00002898 llvm::Value *CurrentOffset =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002899 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002900 llvm::Value *ScratchPadElemAbsolutePtrVal =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002901 Bld.CreateNUWAdd(DestBase.getPointer(), CurrentOffset);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002902 ScratchPadElemAbsolutePtrVal =
2903 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
Alexey Bataevb2575932018-01-04 20:18:55 +00002904 DestElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2905 C.getTypeAlignInChars(Private->getType()));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002906 IncrScratchpadDest = true;
2907 break;
2908 }
2909 case ScratchpadToThread: {
2910 // Step 1.1: Get the address for the src element in the scratchpad.
2911 // address = base + index * ElementSizeInChars.
Alexey Bataeve290ec02018-04-06 16:03:36 +00002912 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
Alexey Bataev9ff80832018-04-16 20:16:21 +00002913 llvm::Value *CurrentOffset =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002914 Bld.CreateNUWMul(ElementSizeInChars, ScratchpadIndex);
Alexey Bataev9ff80832018-04-16 20:16:21 +00002915 llvm::Value *ScratchPadElemAbsolutePtrVal =
Alexey Bataeve290ec02018-04-06 16:03:36 +00002916 Bld.CreateNUWAdd(SrcBase.getPointer(), CurrentOffset);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002917 ScratchPadElemAbsolutePtrVal =
2918 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
2919 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
2920 C.getTypeAlignInChars(Private->getType()));
2921 IncrScratchpadSrc = true;
2922
2923 // Step 1.2: Create a temporary to store the element in the destination
2924 // Reduce list.
2925 DestElementPtrAddr =
2926 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
2927 DestElementAddr =
2928 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
2929 UpdateDestListPtr = true;
2930 break;
2931 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002932 }
2933
2934 // Regardless of src and dest of copy, we emit the load of src
2935 // element as this is required in all directions
2936 SrcElementAddr = Bld.CreateElementBitCast(
2937 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
Alexey Bataev12c62902018-06-22 19:10:38 +00002938 DestElementAddr = Bld.CreateElementBitCast(DestElementAddr,
2939 SrcElementAddr.getElementType());
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002940
2941 // Now that all active lanes have read the element in the
2942 // Reduce list, shuffle over the value from the remote lane.
Alexey Bataeva453f362018-03-19 17:53:56 +00002943 if (ShuffleInElement) {
Alexey Bataev12c62902018-06-22 19:10:38 +00002944 shuffleAndStore(CGF, SrcElementAddr, DestElementAddr, Private->getType(),
2945 RemoteLaneOffset, Private->getExprLoc());
2946 } else {
2947 if (Private->getType()->isScalarType()) {
2948 llvm::Value *Elem =
2949 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
2950 Private->getType(), Private->getExprLoc());
2951 // Store the source element value to the dest element address.
2952 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
2953 Private->getType());
2954 } else {
2955 CGF.EmitAggregateCopy(
2956 CGF.MakeAddrLValue(DestElementAddr, Private->getType()),
2957 CGF.MakeAddrLValue(SrcElementAddr, Private->getType()),
2958 Private->getType(), AggValueSlot::DoesNotOverlap);
2959 }
Alexey Bataeva453f362018-03-19 17:53:56 +00002960 }
Alexey Bataevb2575932018-01-04 20:18:55 +00002961
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002962 // Step 3.1: Modify reference in dest Reduce list as needed.
2963 // Modifying the reference in Reduce list to point to the newly
2964 // created element. The element is live in the current function
2965 // scope and that of functions it invokes (i.e., reduce_function).
2966 // RemoteReduceData[i] = (void*)&RemoteElem
2967 if (UpdateDestListPtr) {
2968 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
2969 DestElementAddr.getPointer(), CGF.VoidPtrTy),
2970 DestElementPtrAddr, /*Volatile=*/false,
2971 C.VoidPtrTy);
2972 }
2973
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002974 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
2975 // address of the next element in scratchpad memory, unless we're currently
2976 // processing the last one. Memory alignment is also taken care of here.
2977 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
2978 llvm::Value *ScratchpadBasePtr =
2979 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
Alexey Bataeve290ec02018-04-06 16:03:36 +00002980 llvm::Value *ElementSizeInChars = CGF.getTypeSize(Private->getType());
2981 ScratchpadBasePtr = Bld.CreateNUWAdd(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002982 ScratchpadBasePtr,
Alexey Bataeve290ec02018-04-06 16:03:36 +00002983 Bld.CreateNUWMul(ScratchpadWidth, ElementSizeInChars));
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002984
2985 // Take care of global memory alignment for performance
Alexey Bataeve290ec02018-04-06 16:03:36 +00002986 ScratchpadBasePtr = Bld.CreateNUWSub(
2987 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2988 ScratchpadBasePtr = Bld.CreateUDiv(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002989 ScratchpadBasePtr,
2990 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
Alexey Bataeve290ec02018-04-06 16:03:36 +00002991 ScratchpadBasePtr = Bld.CreateNUWAdd(
2992 ScratchpadBasePtr, llvm::ConstantInt::get(CGM.SizeTy, 1));
2993 ScratchpadBasePtr = Bld.CreateNUWMul(
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002994 ScratchpadBasePtr,
2995 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
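      // The sub/udiv/add/mul sequence above rounds the base pointer up to
      // the next multiple of GlobalMemoryAlignment:
      //   base = ((base - 1) / Align + 1) * Align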
2996
2997 if (IncrScratchpadDest)
2998 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
2999 else /* IncrScratchpadSrc = true */
3000 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
3001 }
3002
Alexey Bataev9ff80832018-04-16 20:16:21 +00003003 ++Idx;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00003004 }
3005}
3006
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003007/// This function emits a helper that loads data from the scratchpad array
3008/// and (optionally) reduces it with the input operand.
3009///
3010/// load_and_reduce(local, scratchpad, index, width, should_reduce)
3011/// reduce_data remote;
3012/// for elem in remote:
3013/// remote.elem = Scratchpad[elem_id][index]
3014/// if (should_reduce)
3015/// local = local @ remote
3016/// else
3017/// local = remote
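/// As laid out by emitReductionListCopy, the scratchpad stores all teams'
/// copies of element 0 (one column per team, 'width' columns) before the
/// copies of element 1, each row padded to the global memory alignment.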
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003018static llvm::Value *emitReduceScratchpadFunction(
3019 CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
3020 QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
Alexey Bataev9ff80832018-04-16 20:16:21 +00003021 ASTContext &C = CGM.getContext();
3022 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003023
3024 // Destination of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003025 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3026 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003027 // Base address of the scratchpad array, with each element storing a
3028 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003029 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3030 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003031 // A source index into the scratchpad array.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003032 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3033 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003034 // Row width of an element in the scratchpad array, typically
3035 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003036 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3037 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003038 // If should_reduce == 1, then it's load AND reduce,
3039 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
3040 // The latter case is used for initialization.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003041 ImplicitParamDecl ShouldReduceArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3042 Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003043
3044 FunctionArgList Args;
3045 Args.push_back(&ReduceListArg);
3046 Args.push_back(&ScratchPadArg);
3047 Args.push_back(&IndexArg);
3048 Args.push_back(&WidthArg);
3049 Args.push_back(&ShouldReduceArg);
3050
Alexey Bataev9ff80832018-04-16 20:16:21 +00003051 const CGFunctionInfo &CGFI =
3052 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003053 auto *Fn = llvm::Function::Create(
3054 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3055 "_omp_reduction_load_and_reduce", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003056 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003057 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003058 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003059 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003060
Alexey Bataev9ff80832018-04-16 20:16:21 +00003061 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003062
3063 // Get local Reduce list pointer.
3064 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3065 Address ReduceListAddr(
3066 Bld.CreatePointerBitCastOrAddrSpaceCast(
3067 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003068 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003069 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3070 CGF.getPointerAlign());
3071
3072 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3073 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003074 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003075
3076 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003077 llvm::Value *IndexVal = Bld.CreateIntCast(
3078 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3079 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003080
3081 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003082 llvm::Value *WidthVal = Bld.CreateIntCast(
3083 CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false, Int32Ty, Loc),
3084 CGM.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003085
3086 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
3087 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003088 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003089
3090 // The absolute ptr address to the base addr of the next element to copy.
3091 llvm::Value *CumulativeElemBasePtr =
3092 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3093 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3094
3095 // Create a Remote Reduce list to store the elements read from the
3096 // scratchpad array.
3097 Address RemoteReduceList =
3098 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
3099
3100 // Assemble remote Reduce list from scratchpad array.
3101 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
3102 SrcDataAddr, RemoteReduceList,
3103 {/*RemoteLaneOffset=*/nullptr,
3104 /*ScratchpadIndex=*/IndexVal,
3105 /*ScratchpadWidth=*/WidthVal});
3106
3107 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
3108 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
3109 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
3110
Alexey Bataev9ff80832018-04-16 20:16:21 +00003111 llvm::Value *CondReduce = Bld.CreateIsNotNull(ShouldReduceVal);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003112 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
3113
3114 CGF.EmitBlock(ThenBB);
3115 // We should reduce with the local Reduce list.
3116 // reduce_function(LocalReduceList, RemoteReduceList)
3117 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3118 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
3119 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
3120 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003121 CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
3122 CGF, Loc, ReduceFn, {LocalDataPtr, RemoteDataPtr});
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003123 Bld.CreateBr(MergeBB);
3124
3125 CGF.EmitBlock(ElseBB);
3126 // No reduction; just copy:
3127 // Local Reduce list = Remote Reduce list.
3128 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
3129 RemoteReduceList, ReduceListAddr);
3130 Bld.CreateBr(MergeBB);
3131
3132 CGF.EmitBlock(MergeBB);
3133
3134 CGF.FinishFunction();
3135 return Fn;
3136}
3137
3138/// This function emits a helper that stores reduced data from the team
3139/// master to a scratchpad array in global memory.
3140///
3141/// for elem in Reduce List:
3142/// scratchpad[elem_id][index] = elem
3143///
Benjamin Kramer674d5792017-05-26 20:08:24 +00003144static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
3145 ArrayRef<const Expr *> Privates,
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003146 QualType ReductionArrayTy,
3147 SourceLocation Loc) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003148
Alexey Bataev9ff80832018-04-16 20:16:21 +00003149 ASTContext &C = CGM.getContext();
3150 QualType Int32Ty = C.getIntTypeForBitwidth(32, /*Signed=*/1);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003151
3152 // Source of the copy.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003153 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3154 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003155 // Base address of the scratchpad array, with each element storing a
3156 // Reduce list per team.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003157 ImplicitParamDecl ScratchPadArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
3158 C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003159 // A destination index into the scratchpad array, typically the team
3160 // identifier.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003161 ImplicitParamDecl IndexArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3162 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003163 // Row width of an element in the scratchpad array, typically
3164 // the number of teams.
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003165 ImplicitParamDecl WidthArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, Int32Ty,
3166 ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003167
3168 FunctionArgList Args;
3169 Args.push_back(&ReduceListArg);
3170 Args.push_back(&ScratchPadArg);
3171 Args.push_back(&IndexArg);
3172 Args.push_back(&WidthArg);
3173
Alexey Bataev9ff80832018-04-16 20:16:21 +00003174 const CGFunctionInfo &CGFI =
3175 CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003176 auto *Fn = llvm::Function::Create(
3177 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
3178 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
Rafael Espindola51ec5a92018-02-28 23:46:35 +00003179 CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
Alexey Bataevc0f879b2018-04-10 20:10:53 +00003180 Fn->setDoesNotRecurse();
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003181 CodeGenFunction CGF(CGM);
Alexey Bataev7cae94e2018-01-04 19:45:16 +00003182 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003183
Alexey Bataev9ff80832018-04-16 20:16:21 +00003184 CGBuilderTy &Bld = CGF.Builder;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003185
3186 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
3187 Address SrcDataAddr(
3188 Bld.CreatePointerBitCastOrAddrSpaceCast(
3189 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003190 C.VoidPtrTy, Loc),
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003191 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
3192 CGF.getPointerAlign());
3193
3194 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
3195 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003196 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, Loc);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003197
3198 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
Alexey Bataeva9b9cc02018-01-23 18:12:38 +00003199 llvm::Value *IndexVal = Bld.CreateIntCast(
3200 CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false, Int32Ty, Loc),
3201 CGF.SizeTy, /*isSigned=*/true);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00003202
3203 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
3204 llvm::Value *WidthVal =
3205 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
3206 Int32Ty, SourceLocation()),
3207 CGF.SizeTy, /*isSigned=*/true);
3208
3209 // The absolute ptr address to the base addr of the next element to copy.
3210 llvm::Value *CumulativeElemBasePtr =
3211 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
3212 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
3213
3214 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
3215 SrcDataAddr, DestDataAddr,
3216 {/*RemoteLaneOffset=*/nullptr,
3217 /*ScratchpadIndex=*/IndexVal,
3218 /*ScratchpadWidth=*/WidthVal});
3219
3220 CGF.FinishFunction();
3221 return Fn;
3222}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void *reduce_data, int num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in my warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy,
                                              SourceLocation Loc) {
  ASTContext &C = CGM.getContext();
  llvm::Module &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  StringRef TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
    CGM.addCompilerUsedGlobal(TransferMedium);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  llvm::Value *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  llvm::Value *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  llvm::Value *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (const Expr *Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    llvm::Value *IsWarpMaster = Bld.CreateIsNull(LaneID, "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // elem = *elemptr
    // *MediumPtr = elem
    if (Private->getType()->isScalarType()) {
      llvm::Value *Elem = CGF.EmitLoadOfScalar(ElemPtr, /*Volatile=*/false,
                                               Private->getType(), Loc);
      // Store the source element value to the dest element address.
      CGF.EmitStoreOfScalar(Elem, MediumPtr, /*Volatile=*/false,
                            Private->getType());
    } else {
      CGF.EmitAggregateCopy(CGF.MakeAddrLValue(ElemPtr, Private->getType()),
                            CGF.MakeAddrLValue(MediumPtr, Private->getType()),
                            Private->getType(), AggValueSlot::DoesNotOverlap);
    }

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    llvm::Value *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    llvm::Value *IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    if (Private->getType()->isScalarType()) {
      llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
          SrcMediumPtr, /*Volatile=*/false, Private->getType(), Loc);
      CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                            Private->getType());
    } else {
      CGF.EmitAggregateCopy(
          CGF.MakeAddrLValue(SrcMediumPtr, Private->getType()),
          CGF.MakeAddrLValue(TargetElemPtr, Private->getType()),
          Private->getType(), AggValueSlot::DoesNotOverlap);
    }
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    ++Idx;
  }

  CGF.FinishFunction();
  return Fn;
}
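
// As a concrete illustration of the loop above: with 8 active warps
// (256 threads), for each reduce element, lane 0 of warps 0..7 stores its
// partial value to smem[0..7]; after the first barrier, threads 0..7 of
// warp 0 (those with ThreadID < NumWarpsVal) load smem[ThreadID] back into
// their own Reduce lists, and the second barrier releases the other warps.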

/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a superset of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off its
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating the Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with the Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. the contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Value *emitShuffleAndReduceFunction(
    CodeGenModule &CGM, ArrayRef<const Expr *> Privates,
    QualType ReductionArrayTy, llvm::Value *ReduceFn, SourceLocation Loc) {
  ASTContext &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                  C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr, C.ShortTy,
                              ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                                        C.ShortTy, ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, Loc, /*Id=*/nullptr,
                               C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setDoesNotRecurse();
  CodeGenFunction CGF(CGM);
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args, Loc, Loc);

  CGBuilderTy &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The actions to be performed on the Remote Reduce list are dependent
  // on the algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  //  The thread local variable Reduce list is mutated in place to host the
  //  reduced data, which is the aggregated value produced from local and
  //  remote lanes.
  //
  //  Note that AlgoVer is expected to be a constant integer known at compile
  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //  the entire predicate true during compile time.
  //  When AlgoVer==1, the second conjunction has only the second part to be
  //  evaluated during runtime. Other conjunctions evaluate to false
  //  during compile time.
  //  When AlgoVer==2, the third conjunction has only the second part to be
  //  evaluated during runtime. Other conjunctions evaluate to false
  //  during compile time.
  llvm::Value *CondAlgo0 = Bld.CreateIsNull(AlgoVerArgVal);

  llvm::Value *Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::Value *Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  llvm::Value *CondAlgo2 = Bld.CreateAnd(
      Algo2, Bld.CreateIsNull(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1))));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  llvm::Value *CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGM.getOpenMPRuntime().emitOutlinedFunctionCall(
      CGF, Loc, ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  llvm::Value *CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}
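
// For reference, the condition emitted above reads, in C-like form:
//
//   bool do_reduce = (AlgoVer == 0) ||
//                    (AlgoVer == 1 && LaneId < RemoteLaneOffset) ||
//                    (AlgoVer == 2 && (LaneId & 1) == 0 &&
//                     RemoteLaneOffset > 0);
//
// and the trailing copy runs when AlgoVer == 1 && LaneId >= RemoteLaneOffset.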

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///     __kmpc_nvptx_parallel_reduce_nowait(...,
///        reduceData, shuffleReduceFn, interWarpCpyFn)
///
///     where:
///       struct ReduceData {
///         float *foo;
///         double *bar;
///       } reduceData
///       reduceData.foo = &foo_private
///       reduceData.bar = &bar_private
///
///     'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///     auxiliary functions generated by the compiler that operate on
///     variables of type 'ReduceData'. They aid the runtime in performing
///     algorithmic steps in a data-agnostic manner.
///
///     'shuffleReduceFn' is a pointer to a function that reduces data
///     of type 'ReduceData' across two OpenMP threads (lanes) in the
///     same warp. It takes the following arguments as input:
///
///     a. variable of type 'ReduceData' on the calling lane,
///     b. its lane_id,
///     c. an offset relative to the current lane_id to generate a
///        remote_lane_id. The remote lane contains the second
///        variable of type 'ReduceData' that is to be reduced.
///     d. an algorithm version parameter determining which reduction
///        algorithm to use.
///
///     'shuffleReduceFn' retrieves data from the remote lane using
///     efficient GPU shuffle intrinsics and reduces, using the
///     algorithm specified by the 4th parameter, the two operands
///     element-wise. The result is written to the first operand.
///
///     Different reduction algorithms are implemented in different
///     runtime functions, all calling 'shuffleReduceFn' to perform
///     the essential reduction step. Therefore, based on the 4th
///     parameter, this function behaves slightly differently to
///     cooperate with the runtime to ensure correctness under
///     different circumstances.
///
///     'InterWarpCpyFn' is a pointer to a function that transfers
///     reduced variables across warps. It tunnels, through CUDA
///     shared memory, the thread-private data of type 'ReduceData'
///     from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///     ret = __kmpc_nvptx_teams_reduce_nowait(...,
///             reduceData, shuffleReduceFn, interWarpCpyFn,
///             scratchpadCopyFn, loadAndReduceFn)
///
///     'scratchpadCopyFn' is a helper that stores reduced
///     data from the team master to a scratchpad array in
///     global memory.
///
///     'loadAndReduceFn' is a helper that loads data from
///     the scratchpad array and reduces it with the input
///     operand.
///
///     These compiler generated functions hide address
///     calculation and alignment information from the runtime.
/// 5. if ret == 1:
///     The team master of the last team stores the reduced
///     result to the globals in memory.
///     foo += *reduceData.foo; bar *= *reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
/// represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
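/// As a worked example with WARPSIZE == 32, the loop runs with offset =
/// 16, 8, 4, 2, 1 (five steps, i.e. log(2, 32)); after the last step
/// lane 0 holds the value reduced across all 32 lanes.
///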
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset>0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
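/// For example (illustrative), with size == 7 the offsets are 3, 2, 1:
/// in the first step lanes 0-2 fold in lanes 3-5 while lane 6's data is
/// copied down to lane 3, leaving 4 contiguous partial values; the two
/// remaining steps reduce those into lane 0.
///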
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///       remote_id = next_active_lane_id_right_after_me();
///       # the above function returns 0 if no active lane
///       # is present right after the current lane.
///       size = number_of_active_lanes_in_this_warp();
///       logical_lane_id /= 2;
///       ShuffleReduceFn(reduce_data, logical_lane_id,
///                       remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp masters" (zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  bool SimdReduction = isOpenMPSimdDirective(Options.ReductionKind);
  assert((TeamsReduction || ParallelReduction || SimdReduction) &&
         "Invalid reduction selection in emitReduction.");

  if (Options.SimpleReduction) {
    CGOpenMPRuntime::emitReduction(CGF, Loc, Privates, LHSExprs, RHSExprs,
                                   ReductionOps, Options);
    return;
  }

  ASTContext &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (const Expr *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .NumElts,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  llvm::Value *ReductionFn = emitReductionFunction(
      CGM, Loc, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(),
      Privates, LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  //    RedList, shuffle_reduce_func, interwarp_copy_func);
  llvm::Value *ThreadId = getThreadID(CGF, Loc);
  llvm::Value *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  llvm::Value *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  llvm::Value *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn, Loc);
  llvm::Value *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy, Loc);

  llvm::Value *Args[] = {ThreadId,
                         CGF.Builder.getInt32(RHSExprs.size()),
                         ReductionArrayTySize,
                         RL,
                         ShuffleAndReduceFn,
                         InterWarpCopyFn};

  llvm::Value *Res = nullptr;
  if (ParallelReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  else if (SimdReduction)
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_simd_reduce_nowait),
        Args);

  if (TeamsReduction) {
    llvm::Value *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy, Loc);
    llvm::Value *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn, Loc);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res)
  llvm::BasicBlock *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  llvm::SwitchInst *SwInst =
      CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  llvm::BasicBlock *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [Privates, LHSExprs, RHSExprs, ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (const Expr *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
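
// Tying this back to the design comment above, a parallel reduction on the
// earlier float/double example lowers to roughly the following (a sketch,
// not the literal emitted IR):
//
//   void *RedList[2] = {&foo_private, &bar_private};
//   int32_t res = __kmpc_nvptx_parallel_reduce_nowait(
//       gtid, 2, sizeof(RedList), RedList,
//       _omp_reduction_shuffle_and_reduce_func,
//       _omp_reduction_inter_warp_copy_func);
//   switch (res) {
//   case 1:  // this thread holds the team-reduced values
//     foo += *(float *)RedList[0]; bar *= *(double *)RedList[1];
//     // ... then the 'end_reduce_nowait' runtime entry is invoked.
//     break;
//   default: break;
//   }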

const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam))
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getBeginLoc(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}
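
// Illustrative effect (a sketch; the exact qualifiers depend on the target
// mapping of address spaces): a native reference parameter 'int &x' whose
// field carries an OMPC_map capture kind is translated to roughly
//
//   __global int *__restrict x;   // address-space-annotated pointer
//
// i.e. the reference becomes an explicit restrict-qualified pointer so that
// NVPTX codegen sees the intended address spaces.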

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}
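
// The two-step cast above routes through the generic address space, the
// safe intermediate on NVPTX, rather than casting between two named address
// spaces directly. In pseudocode (illustrative):
//
//   T *generic = (T *)target_arg;            // source AS -> generic (AS 0)
//   T AS(n) *native = (T AS(n) *)generic;    // generic -> pointee's AS
//   *native_param_tmp = native;              // temp handed to the callee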

void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg,
        NativeArg->getType()->getPointerElementType()->getPointerTo());
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
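
// Sketch of the argument normalization performed above (illustrative):
//
//   callee(i32 *p, float f, ...) called with (i32 AS(3) *q, float f, i64 v)
//   => emits: callee((i32 *)(generic)q, f, v)
//
// Pointer arguments are first normalized to generic pointers of the same
// element type and then cast to the exact parameter type; non-pointer
// arguments, and arguments past the fixed parameter list of a vararg
// callee, are forwarded unchanged.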

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
llvm::Function *CGOpenMPRuntimeNVPTX::createParallelDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *D.getCapturedStmt(OMPD_parallel);

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  ImplicitParamDecl ParallelLevelArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                                     /*Id=*/nullptr, Int16QTy,
                                     ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, /*DC=*/nullptr, D.getBeginLoc(),
                               /*Id=*/nullptr, Int32QTy,
                               ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);

  const CGFunctionInfo &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      Twine(OutlinedParallelFn->getName(), "_wrapper"), &CGM.getModule());
  CGM.SetInternalFunctionAttributes(GlobalDecl(), Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);
  Fn->setDoesNotRecurse();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs,
                    D.getBeginLoc(), D.getBeginLoc());

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  Address ZeroAddr = CGF.CreateMemTemp(
      CGF.getContext().getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/1),
      /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  Args.emplace_back(CGF.GetAddrOfLocalVar(&WrapperArg).getPointer());
  Args.emplace_back(ZeroAddr.getPointer());

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Use global memory for data sharing.
  // Handle passing of global args to workers.
  Address GlobalArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy, "global_args");
  llvm::Value *GlobalArgsPtr = GlobalArgs.getPointer();
  llvm::Value *DataSharingArgs[] = {GlobalArgsPtr};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_get_shared_variables),
      DataSharingArgs);

  // Retrieve the shared variables from the list of references returned
  // by the runtime. Pass the variables to the outlined function.
  Address SharedArgListAddress = Address::invalid();
  if (CS.capture_size() > 0 ||
      isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    SharedArgListAddress = CGF.EmitLoadOfPointer(
        GlobalArgs, CGF.getContext()
                        .getPointerType(CGF.getContext().getPointerType(
                            CGF.getContext().VoidPtrTy))
                        .castAs<PointerType>());
  }
  unsigned Idx = 0;
  if (isOpenMPLoopBoundSharingDirective(D.getDirectiveKind())) {
    Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                             CGF.getPointerSize());
    Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *LB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getLowerBoundVariable()->getExprLoc());
    Args.emplace_back(LB);
    ++Idx;
    Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, Idx,
                                     CGF.getPointerSize());
    TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
        Src, CGF.SizeTy->getPointerTo());
    llvm::Value *UB = CGF.EmitLoadOfScalar(
        TypedAddress,
        /*Volatile=*/false,
        CGF.getContext().getPointerType(CGF.getContext().getSizeType()),
        cast<OMPLoopDirective>(D).getUpperBoundVariable()->getExprLoc());
    Args.emplace_back(UB);
    ++Idx;
  }
  if (CS.capture_size() > 0) {
    ASTContext &CGFContext = CGF.getContext();
    for (unsigned I = 0, E = CS.capture_size(); I < E; ++I, ++CI, ++CurField) {
      QualType ElemTy = CurField->getType();
      Address Src = Bld.CreateConstInBoundsGEP(SharedArgListAddress, I + Idx,
                                               CGF.getPointerSize());
      Address TypedAddress = Bld.CreatePointerBitCastOrAddrSpaceCast(
          Src, CGF.ConvertTypeForMem(CGFContext.getPointerType(ElemTy)));
      llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedAddress,
                                              /*Volatile=*/false,
                                              CGFContext.getPointerType(ElemTy),
                                              CI->getLocation());
      if (CI->capturesVariableByCopy() &&
          !CI->getCapturedVar()->getType()->isAnyPointerType()) {
        Arg = castValueToType(CGF, Arg, ElemTy, CGFContext.getUIntPtrType(),
                              CI->getLocation());
      }
      Args.emplace_back(Arg);
    }
  }

  emitOutlinedFunctionCall(CGF, D.getBeginLoc(), OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}
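
// The generated wrapper behaves roughly like this (names illustrative):
//
//   void <outlined>_wrapper(uint16_t parallel_level, uint32_t thread_id) {
//     int32_t zero = 0;
//     void **shared_args;
//     __kmpc_get_shared_variables(&shared_args);
//     // Unpack loop bounds (if the directive shares them) and captured
//     // variables from shared_args, then forward them all:
//     <outlined>(&thread_id, &zero, lb, ub, captures...);
//   }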

void CGOpenMPRuntimeNVPTX::emitFunctionProlog(CodeGenFunction &CGF,
                                              const Decl *D) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return;

  assert(D && "Expected function or captured|block decl.");
  assert(FunctionGlobalizedDecls.count(CGF.CurFn) == 0 &&
         "Function is registered already.");
  const Stmt *Body = nullptr;
  bool NeedToDelayGlobalization = false;
  if (const auto *FD = dyn_cast<FunctionDecl>(D)) {
    Body = FD->getBody();
  } else if (const auto *BD = dyn_cast<BlockDecl>(D)) {
    Body = BD->getBody();
  } else if (const auto *CD = dyn_cast<CapturedDecl>(D)) {
    Body = CD->getBody();
    NeedToDelayGlobalization = CGF.CapturedStmtInfo->getKind() == CR_OpenMP;
    if (NeedToDelayGlobalization &&
        getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD)
      return;
  }
  if (!Body)
    return;
  CheckVarsEscapingDeclContext VarChecker(CGF);
  VarChecker.Visit(Body);
  const RecordDecl *GlobalizedVarsRecord =
      VarChecker.getGlobalizedRecord(IsInTTDRegion);
  ArrayRef<const ValueDecl *> EscapedVariableLengthDecls =
      VarChecker.getEscapedVariableLengthDecls();
  if (!GlobalizedVarsRecord && EscapedVariableLengthDecls.empty())
    return;
  auto I = FunctionGlobalizedDecls.try_emplace(CGF.CurFn).first;
  I->getSecond().MappedParams =
      llvm::make_unique<CodeGenFunction::OMPMapVars>();
  I->getSecond().GlobalRecord = GlobalizedVarsRecord;
  I->getSecond().EscapedParameters.insert(
      VarChecker.getEscapedParameters().begin(),
      VarChecker.getEscapedParameters().end());
  I->getSecond().EscapedVariableLengthDecls.append(
      EscapedVariableLengthDecls.begin(), EscapedVariableLengthDecls.end());
  DeclToAddrMapTy &Data = I->getSecond().LocalVarData;
  for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
    assert(VD->isCanonicalDecl() && "Expected canonical declaration");
    const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
    Data.insert(std::make_pair(VD, MappedVarData(FD, IsInTTDRegion)));
  }
  if (!IsInTTDRegion && !NeedToDelayGlobalization && !IsInParallelRegion) {
    CheckVarsEscapingDeclContext VarChecker(CGF);
    VarChecker.Visit(Body);
    I->getSecond().SecondaryGlobalRecord =
        VarChecker.getGlobalizedRecord(/*IsInTTDRegion=*/true);
    I->getSecond().SecondaryLocalVarData.emplace();
    DeclToAddrMapTy &Data = I->getSecond().SecondaryLocalVarData.getValue();
    for (const ValueDecl *VD : VarChecker.getEscapedDecls()) {
      assert(VD->isCanonicalDecl() && "Expected canonical declaration");
      const FieldDecl *FD = VarChecker.getFieldForGlobalizedVar(VD);
      Data.insert(
          std::make_pair(VD, MappedVarData(FD, /*IsInTTDRegion=*/true)));
    }
  }
  if (!NeedToDelayGlobalization) {
    emitGenericVarsProlog(CGF, D->getBeginLoc(), /*WithSPMDCheck=*/true);
    struct GlobalizationScope final : EHScopeStack::Cleanup {
      GlobalizationScope() = default;

      void Emit(CodeGenFunction &CGF, Flags flags) override {
        static_cast<CGOpenMPRuntimeNVPTX &>(CGF.CGM.getOpenMPRuntime())
            .emitGenericVarsEpilog(CGF, /*WithSPMDCheck=*/true);
      }
    };
    CGF.EHStack.pushCleanup<GlobalizationScope>(NormalAndEHCleanup);
  }
}
4333
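// Returns the replacement address for a local variable that was globalized in
// Generic data-sharing mode, either directly or through an
// OMPReferencedVarAttr link; Address::invalid() means the variable was not
// globalized and the default local allocation stands.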
Address CGOpenMPRuntimeNVPTX::getAddressOfLocalVariable(CodeGenFunction &CGF,
                                                        const VarDecl *VD) {
  if (getDataSharingMode(CGM) != CGOpenMPRuntimeNVPTX::Generic)
    return Address::invalid();

  VD = VD->getCanonicalDecl();
  auto I = FunctionGlobalizedDecls.find(CGF.CurFn);
  if (I == FunctionGlobalizedDecls.end())
    return Address::invalid();
  auto VDI = I->getSecond().LocalVarData.find(VD);
  if (VDI != I->getSecond().LocalVarData.end())
    return VDI->second.PrivateAddr;
  if (VD->hasAttrs()) {
    for (specific_attr_iterator<OMPReferencedVarAttr> IT(VD->attr_begin()),
                                                      E(VD->attr_end());
         IT != E; ++IT) {
      auto VDI = I->getSecond().LocalVarData.find(
          cast<VarDecl>(cast<DeclRefExpr>(IT->getRef())->getDecl())
              ->getCanonicalDecl());
      if (VDI != I->getSecond().LocalVarData.end())
        return VDI->second.PrivateAddr;
    }
  }
  return Address::invalid();
}

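// Once codegen for a function is done, its globalization bookkeeping can be
// dropped before delegating to the base runtime's cleanup.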
void CGOpenMPRuntimeNVPTX::functionFinished(CodeGenFunction &CGF) {
  FunctionGlobalizedDecls.erase(CGF.CurFn);
  CGOpenMPRuntime::functionFinished(CGF);
}

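// In SPMD mode every thread of the block executes the distributed loop, so
// the default dist_schedule is static with a chunk of one block-worth of
// threads; in non-SPMD mode the host default applies.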
void CGOpenMPRuntimeNVPTX::getDefaultDistScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPDistScheduleClauseKind &ScheduleKind,
    llvm::Value *&Chunk) const {
  if (getExecutionMode() == CGOpenMPRuntimeNVPTX::EM_SPMD) {
    ScheduleKind = OMPC_DIST_SCHEDULE_static;
    Chunk = CGF.EmitScalarConversion(
        getNVPTXNumThreads(CGF),
        CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
        S.getIterationVariable()->getType(), S.getBeginLoc());
    return;
  }
  CGOpenMPRuntime::getDefaultDistScheduleAndChunk(CGF, S, ScheduleKind, Chunk);
}

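// The device-side default worksharing schedule is schedule(static, 1): giving
// adjacent iterations to adjacent threads tends to produce coalesced memory
// accesses on GPUs. The chunk is materialized as an unsigned 32-bit
// IntegerLiteral so later clause processing can treat it like an explicit
// chunk expression.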
void CGOpenMPRuntimeNVPTX::getDefaultScheduleAndChunk(
    CodeGenFunction &CGF, const OMPLoopDirective &S,
    OpenMPScheduleClauseKind &ScheduleKind,
    const Expr *&ChunkExpr) const {
  ScheduleKind = OMPC_SCHEDULE_static;
  // Chunk size is 1 in this case.
  llvm::APInt ChunkSize(32, 1);
  ChunkExpr = IntegerLiteral::Create(
      CGF.getContext(), ChunkSize,
      CGF.getContext().getIntTypeForBitwidth(32, /*Signed=*/0),
      SourceLocation());
}

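// Lambdas captured by a target-based directive still hold host addresses in
// their by-reference capture fields once mapped to the device. For an
// illustrative case such as:
//   int x = 0;
//   auto l = [&x]() { return x; };
//   #pragma omp target map(to : l)
//   l();
// the routine below rewrites each by-reference capture field (and the 'this'
// capture, if any) in the device copy of the lambda to point at the
// corresponding mapped device address.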
void CGOpenMPRuntimeNVPTX::adjustTargetSpecificDataForLambdas(
    CodeGenFunction &CGF, const OMPExecutableDirective &D) const {
  assert(isOpenMPTargetExecutionDirective(D.getDirectiveKind()) &&
         "Expected target-based directive.");
  const CapturedStmt *CS = D.getCapturedStmt(OMPD_target);
  for (const CapturedStmt::Capture &C : CS->captures()) {
    // Capture variables captured by reference in lambdas for target-based
    // directives.
    if (!C.capturesVariable())
      continue;
    const VarDecl *VD = C.getCapturedVar();
    const auto *RD = VD->getType()
                         .getCanonicalType()
                         .getNonReferenceType()
                         ->getAsCXXRecordDecl();
    if (!RD || !RD->isLambda())
      continue;
    Address VDAddr = CGF.GetAddrOfLocalVar(VD);
    LValue VDLVal;
    if (VD->getType().getCanonicalType()->isReferenceType())
      VDLVal = CGF.EmitLoadOfReferenceLValue(VDAddr, VD->getType());
    else
      VDLVal = CGF.MakeAddrLValue(
          VDAddr, VD->getType().getCanonicalType().getNonReferenceType());
    llvm::DenseMap<const VarDecl *, FieldDecl *> Captures;
    FieldDecl *ThisCapture = nullptr;
    RD->getCaptureFields(Captures, ThisCapture);
    if (ThisCapture && CGF.CapturedStmtInfo->isCXXThisExprCaptured()) {
      LValue ThisLVal =
          CGF.EmitLValueForFieldInitialization(VDLVal, ThisCapture);
      llvm::Value *CXXThis = CGF.LoadCXXThis();
      CGF.EmitStoreOfScalar(CXXThis, ThisLVal);
    }
    for (const LambdaCapture &LC : RD->captures()) {
      if (LC.getCaptureKind() != LCK_ByRef)
        continue;
      const VarDecl *VD = LC.getCapturedVar();
      if (!CS->capturesVariable(VD))
        continue;
      auto It = Captures.find(VD);
      assert(It != Captures.end() && "Found lambda capture without field.");
      LValue VarLVal = CGF.EmitLValueForFieldInitialization(VDLVal, It->second);
      Address VDAddr = CGF.GetAddrOfLocalVar(VD);
      if (VD->getType().getCanonicalType()->isReferenceType())
        VDAddr = CGF.EmitLoadOfReferenceLValue(VDAddr,
                                               VD->getType().getCanonicalType())
                     .getAddress();
      CGF.EmitStoreOfScalar(VDAddr.getPointer(), VarLVal);
    }
  }
}

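// The per-architecture pairs returned below are (number of SMs, blocks per
// SM) upper bounds used to size the statically allocated globalization
// buffer; explicit values supplied through the OpenMPCUDANumSMs /
// OpenMPCUDABlocksPerSM language options take precedence over the
// arch-derived defaults.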
/// Get number of SMs and number of blocks per SM.
static std::pair<unsigned, unsigned> getSMsBlocksPerSM(CodeGenModule &CGM) {
  std::pair<unsigned, unsigned> Data;
  if (CGM.getLangOpts().OpenMPCUDANumSMs)
    Data.first = CGM.getLangOpts().OpenMPCUDANumSMs;
  if (CGM.getLangOpts().OpenMPCUDABlocksPerSM)
    Data.second = CGM.getLangOpts().OpenMPCUDABlocksPerSM;
  if (Data.first && Data.second)
    return Data;
  if (CGM.getTarget().hasFeature("ptx")) {
    llvm::StringMap<bool> Features;
    CGM.getTarget().initFeatureMap(Features, CGM.getDiags(),
                                   CGM.getTarget().getTargetOpts().CPU,
                                   CGM.getTarget().getTargetOpts().Features);
    for (const auto &Feature : Features) {
      if (Feature.getValue()) {
        switch (StringToCudaArch(Feature.getKey())) {
        case CudaArch::SM_20:
        case CudaArch::SM_21:
        case CudaArch::SM_30:
        case CudaArch::SM_32:
        case CudaArch::SM_35:
        case CudaArch::SM_37:
        case CudaArch::SM_50:
        case CudaArch::SM_52:
        case CudaArch::SM_53:
          return {16, 16};
        case CudaArch::SM_60:
        case CudaArch::SM_61:
        case CudaArch::SM_62:
          return {56, 32};
        case CudaArch::SM_70:
        case CudaArch::SM_72:
        case CudaArch::SM_75:
          return {84, 32};
        case CudaArch::GFX600:
        case CudaArch::GFX601:
        case CudaArch::GFX700:
        case CudaArch::GFX701:
        case CudaArch::GFX702:
        case CudaArch::GFX703:
        case CudaArch::GFX704:
        case CudaArch::GFX801:
        case CudaArch::GFX802:
        case CudaArch::GFX803:
        case CudaArch::GFX810:
        case CudaArch::GFX900:
        case CudaArch::GFX902:
        case CudaArch::GFX904:
        case CudaArch::GFX906:
        case CudaArch::GFX909:
        case CudaArch::UNKNOWN:
          break;
        case CudaArch::LAST:
          llvm_unreachable("Unexpected Cuda arch.");
        }
      }
    }
  }
  llvm_unreachable("Unexpected NVPTX target without ptx feature.");
}

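// Module-level finalization: build one union wide enough for any globalized
// record seen during compilation, tile it as a [NumSMs][BlocksPerSM] array,
// and emit that array as the weak global backing statically allocated
// team-private storage.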
void CGOpenMPRuntimeNVPTX::clear() {
  if (!GlobalizedRecords.empty()) {
    ASTContext &C = CGM.getContext();
    RecordDecl *StaticRD = C.buildImplicitRecord(
        "_openmp_static_memory_type_$_", RecordDecl::TagKind::TTK_Union);
    StaticRD->startDefinition();
    for (const GlobalPtrSizeRecsTy &Records : GlobalizedRecords) {
      if (Records.Records.empty())
        continue;
      unsigned Size = 0;
      unsigned RecAlignment = 0;
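      // Lay the records out consecutively, rounding each offset up to the
      // record's alignment: e.g. record sizes 24 and 8 with 8-byte alignment
      // give Size = 32.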
      for (const RecordDecl *RD : Records.Records) {
        QualType RDTy = CGM.getContext().getRecordType(RD);
        unsigned Alignment =
            CGM.getContext().getTypeAlignInChars(RDTy).getQuantity();
        RecAlignment = std::max(RecAlignment, Alignment);
        unsigned RecSize =
            CGM.getContext().getTypeSizeInChars(RDTy).getQuantity();
        Size =
            llvm::alignTo(llvm::alignTo(Size, Alignment) + RecSize, Alignment);
      }
      Size = llvm::alignTo(Size, RecAlignment);
      llvm::APInt ArySize(/*numBits=*/64, Size);
      QualType SubTy = C.getConstantArrayType(
          C.CharTy, ArySize, ArrayType::Normal, /*IndexTypeQuals=*/0);
      auto *Field = FieldDecl::Create(
          C, StaticRD, SourceLocation(), SourceLocation(), nullptr, SubTy,
          C.getTrivialTypeSourceInfo(SubTy, SourceLocation()),
          /*BW=*/nullptr, /*Mutable=*/false,
          /*InitStyle=*/ICIS_NoInit);
      Field->setAccess(AS_public);
      StaticRD->addDecl(Field);
      Records.RecSize->setInitializer(llvm::ConstantInt::get(CGM.SizeTy, Size));
    }
    StaticRD->completeDefinition();
    QualType StaticTy = C.getRecordType(StaticRD);
    std::pair<unsigned, unsigned> SMsBlockPerSM = getSMsBlocksPerSM(CGM);
    llvm::APInt Size1(32, SMsBlockPerSM.second);
    QualType Arr1Ty = C.getConstantArrayType(StaticTy, Size1, ArrayType::Normal,
                                             /*IndexTypeQuals=*/0);
    llvm::APInt Size2(32, SMsBlockPerSM.first);
    QualType Arr2Ty = C.getConstantArrayType(Arr1Ty, Size2, ArrayType::Normal,
                                             /*IndexTypeQuals=*/0);
    llvm::Type *LLVMArr2Ty = CGM.getTypes().ConvertTypeForMem(Arr2Ty);
    auto *GV = new llvm::GlobalVariable(
        CGM.getModule(), LLVMArr2Ty,
        /*isConstant=*/false, llvm::GlobalValue::WeakAnyLinkage,
        llvm::Constant::getNullValue(LLVMArr2Ty), "_openmp_static_glob_rd_$_");
    StaticGlobalized->setInitializer(
        llvm::ConstantExpr::getPointerBitCastOrAddrSpaceCast(GV,
                                                             CGM.VoidPtrTy));
  }
  CGOpenMPRuntime::clear();
}