//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// short RequiresOMPRuntime, short RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// \brief Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// \brief Call to void __kmpc_end_serialized_parallel(ident_t *loc,
  /// kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
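  // warp_id = nvptx_tid >> LaneIDBits, i.e. tid / 32.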
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
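  // lane_id = nvptx_tid & LaneIDMask, i.e. tid % 32.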
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g.: If NumThreads is 33, master id is 32.
///       If NumThreads is 64, master id is 32.
///       If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

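  // master_tid = (NumThreads - 1) & ~Mask, i.e. the first lane of the
  // last warp in the block.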
  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
  llvm_unreachable("Unsupported directive on NVPTX device.");
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function.
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Set up NVPTX threads for the master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

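  // The entry point selects a role for each thread in the CTA:
  //   tid < thread_limit  -> worker: spin in the worker loop awaiting work,
  //   tid == master_tid   -> master: execute the sequential target region,
  //   otherwise           -> exit (surplus lanes of the last warp).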
  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Set up BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // De-initialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic' or 'spmd', depending
// on the target directive. This variable is picked up by the offload library
// to set up the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master; otherwise,
// all warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  auto &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

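  // Retrieve the work function pointer published by the master, plus a flag
  // indicating whether this worker participates in the parallel region.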
  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (WorkFn == 0), exit the loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
    emitCall(CGF, Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // short RequiresOMPRuntime, short RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    // Build void __kmpc_kernel_prepare_parallel(
    // void *outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    // Build bool __kmpc_kernel_parallel(void **outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    // Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
                                                       InnermostKind, CodeGen);
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
                                                    PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001056 for (auto &Private : Privates) {
1057 Address SrcElementAddr = Address::invalid();
1058 Address DestElementAddr = Address::invalid();
1059 Address DestElementPtrAddr = Address::invalid();
1060 // Should we shuffle in an element from a remote lane?
1061 bool ShuffleInElement = false;
1062 // Set to true to update the pointer in the dest Reduce list to a
1063 // newly created element.
1064 bool UpdateDestListPtr = false;
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001065 // Increment the src or dest pointer to the scratchpad, for each
1066 // new element.
1067 bool IncrScratchpadSrc = false;
1068 bool IncrScratchpadDest = false;
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001069
1070 switch (Action) {
1071 case RemoteLaneToThread: {
1072 // Step 1.1: Get the address for the src element in the Reduce list.
1073 Address SrcElementPtrAddr =
1074 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
1075 llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
1076 SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1077 SrcElementAddr =
1078 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1079
1080 // Step 1.2: Create a temporary to store the element in the destination
1081 // Reduce list.
1082 DestElementPtrAddr =
1083 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1084 DestElementAddr =
1085 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1086 ShuffleInElement = true;
1087 UpdateDestListPtr = true;
1088 break;
1089 }
1090 case ThreadCopy: {
1091 // Step 1.1: Get the address for the src element in the Reduce list.
1092 Address SrcElementPtrAddr =
1093 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
1094 llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
1095 SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1096 SrcElementAddr =
1097 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1098
1099 // Step 1.2: Get the address for dest element. The destination
1100 // element has already been created on the thread's stack.
1101 DestElementPtrAddr =
1102 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1103 llvm::Value *DestElementPtr =
1104 CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
1105 C.VoidPtrTy, SourceLocation());
1106 Address DestElemAddr =
1107 Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
1108 DestElementAddr = Bld.CreateElementBitCast(
1109 DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
1110 break;
1111 }
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001112 case ThreadToScratchpad: {
1113 // Step 1.1: Get the address for the src element in the Reduce list.
1114 Address SrcElementPtrAddr =
1115 Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
1116 llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
1117 SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1118 SrcElementAddr =
1119 Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));
1120
1121 // Step 1.2: Get the address for dest element:
1122 // address = base + index * ElementSizeInChars.
1123 unsigned ElementSizeInChars =
1124 C.getTypeSizeInChars(Private->getType()).getQuantity();
1125 auto *CurrentOffset =
1126 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1127 ScratchpadIndex);
1128 auto *ScratchPadElemAbsolutePtrVal =
1129 Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
1130 ScratchPadElemAbsolutePtrVal =
1131 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1132 Address ScratchpadPtr =
1133 Address(ScratchPadElemAbsolutePtrVal,
1134 C.getTypeAlignInChars(Private->getType()));
1135 DestElementAddr = Bld.CreateElementBitCast(
1136 ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
1137 IncrScratchpadDest = true;
1138 break;
1139 }
1140 case ScratchpadToThread: {
1141 // Step 1.1: Get the address for the src element in the scratchpad.
1142 // address = base + index * ElementSizeInChars.
1143 unsigned ElementSizeInChars =
1144 C.getTypeSizeInChars(Private->getType()).getQuantity();
1145 auto *CurrentOffset =
1146 Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
1147 ScratchpadIndex);
1148 auto *ScratchPadElemAbsolutePtrVal =
1149 Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
1150 ScratchPadElemAbsolutePtrVal =
1151 Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
1152 SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
1153 C.getTypeAlignInChars(Private->getType()));
1154 IncrScratchpadSrc = true;
1155
1156 // Step 1.2: Create a temporary to store the element in the destination
1157 // Reduce list.
1158 DestElementPtrAddr =
1159 Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
1160 DestElementAddr =
1161 CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1162 UpdateDestListPtr = true;
1163 break;
1164 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001165 }
1166
    // Regardless of the direction of the copy, we emit the load of the src
    // element here, as it is required for all copy directions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), SourceLocation());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
                                          RemoteLaneOffset);
    }

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      ScratchpadBasePtr = Bld.CreateAdd(
          ScratchpadBasePtr,
          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
                                             CGM.SizeTy, ElementSizeInChars)));

      // Take care of global memory alignment for performance.
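      // Round the base pointer up to the next multiple of
      // GlobalMemoryAlignment: base = ((base - 1) / Align + 1) * Align.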
1212 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1213 llvm::ConstantInt::get(CGM.SizeTy, 1));
1214 ScratchpadBasePtr = Bld.CreateSDiv(
1215 ScratchpadBasePtr,
1216 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1217 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1218 llvm::ConstantInt::get(CGM.SizeTy, 1));
1219 ScratchpadBasePtr = Bld.CreateMul(
1220 ScratchpadBasePtr,
1221 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
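      // The three statements above round the advanced base address up to the
      // next multiple of GlobalMemoryAlignment:
      //   base = ((base - 1) / Align + 1) * Align
      // e.g., with base == 1000 and Align == 256 this yields
      // ((999 / 256) + 1) * 256 == 1024.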

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    Idx++;
  }
}

/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
///  load_and_reduce(local, scratchpad, index, width, should_reduce)
///    reduce_data remote;
///    for elem in remote:
///      remote.elem = Scratchpad[elem_id][index]
///    if (should_reduce)
///      local = local @ remote
///    else
///      local = remote
static llvm::Value *
emitReduceScratchpadFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce;
  // if should_reduce == 0 (or anything else), it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}

/// This function emits a helper that stores reduced data from the team
/// master to a scratchpad array in global memory.
///
///  for elem in Reduce List:
///    scratchpad[elem_id][index] = elem
///
static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                         ArrayRef<const Expr *> Privates,
                                         QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Source of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A destination index into the scratchpad array, typically the team
  // identifier.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address SrcDataAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  // The absolute ptr address to the base addr of the next element to copy.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, DestDataAddr,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  CGF.FinishFunction();
  return Fn;
}
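
// The scratchpad written by the helper above is laid out as one row per
// reduce element, each row holding one entry per team ('width' entries in
// total); every row starts at a multiple of GlobalMemoryAlignment (see the
// row-advance logic in emitReductionListCopy):
//
//   elem 0: [team 0][team 1]...[team W-1] <pad to alignment>
//   elem 1: [team 0][team 1]...[team W-1] <pad to alignment>
//   ...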

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, int num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (auto &Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    auto IsWarpMaster =
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // elem = *elemptr
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *MediumPtr = elem
    Bld.CreateStore(Elem, MediumPtr);

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    Idx++;
  }

  CGF.FinishFunction();
  return Fn;
}
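
// As a concrete illustration of the function above: with 64 OpenMP threads
// (two warps), lane 0 of each warp writes its partial value into slots 0 and
// 1 of __openmp_nvptx_data_transfer_temporary_storage; after the barrier,
// threads 0 and 1 of warp 0 (those with ThreadID < NumWarps) read the two
// slots back into their own Reduce lists, reducing the cross-warp problem to
// a single warp-local reduction.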

/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a superset of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating the Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with the Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Value *
emitShuffleAndReduceFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
                                        ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The actions to be performed on the Remote Reduce list depend on the
  // algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  //  The thread local variable Reduce list is mutated in place to host the
  //  reduced data, which is the aggregated value produced from local and
  //  remote lanes.
  //
  //  Note that AlgoVer is expected to be a constant integer known at compile
  //  time.
  //  When AlgoVer==0, the first conjunction evaluates to true, making
  //  the entire predicate true at compile time.
  //  When AlgoVer==1, only the second part of the second conjunction needs
  //  to be evaluated at runtime; the other conjunctions evaluate to false
  //  at compile time.
  //  When AlgoVer==2, only the second part of the third conjunction needs
  //  to be evaluated at runtime; the other conjunctions evaluate to false
  //  at compile time.
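  //  For example, with AlgoVer==1, Offset==2, and five contiguous active
  //  lanes, lanes 0 and 1 aggregate the values shuffled in from lanes 2 and
  //  3, and the copy step below then moves lane 4's Reduce list into lane 2,
  //  so that the three surviving partial results again occupy contiguous
  //  lanes.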
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));

  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Algo2,
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
                       Bld.getInt16(0)));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}
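
// As an illustration of how the runtime drives the function above: for a
// full-warp reduce (AlgoVer == 0) it calls the helper log2(WARPSIZE) times
// with offsets 16, 8, 4, 2, and 1; every lane aggregates in every round, and
// lane 0 ends up holding the warp-reduced Reduce list (see full_warp_reduce
// in the design notes below).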

///
/// Design of OpenMP reductions on the GPU
///
/// Consider a typical OpenMP program with one or more reduction
/// clauses:
///
/// float foo;
/// double bar;
/// #pragma omp target teams distribute parallel for \
///             reduction(+:foo) reduction(*:bar)
/// for (int i = 0; i < N; i++) {
///   foo += A[i]; bar *= B[i];
/// }
///
/// where 'foo' and 'bar' are reduced across all OpenMP threads in
/// all teams. In our OpenMP implementation on the NVPTX device an
/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
/// within a team are mapped to CUDA threads within a threadblock.
/// Our goal is to efficiently aggregate values across all OpenMP
/// threads such that:
///
///   - the compiler and runtime are logically concise, and
///   - the reduction is performed efficiently in a hierarchical
///     manner as follows: within OpenMP threads in the same warp,
///     across warps in a threadblock, and finally across teams on
///     the NVPTX device.
///
/// Introduction to Decoupling
///
/// We would like to decouple the compiler and the runtime so that the
/// latter is ignorant of the reduction variables (number, data types)
/// and the reduction operators. This allows a simpler interface
/// and implementation while still attaining good performance.
///
/// Pseudocode for the aforementioned OpenMP program generated by the
/// compiler is as follows:
///
/// 1. Create private copies of reduction variables on each OpenMP
///    thread: 'foo_private', 'bar_private'
/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
///    to it and writes the result in 'foo_private' and 'bar_private'
///    respectively.
/// 3. Call the OpenMP runtime on the GPU to reduce within a team
///    and store the result on the team master:
///
///      __kmpc_nvptx_parallel_reduce_nowait(...,
///         reduceData, shuffleReduceFn, interWarpCpyFn)
///
///    where:
///      struct ReduceData {
///        float *foo;
///        double *bar;
///      } reduceData
///      reduceData.foo = &foo_private
///      reduceData.bar = &bar_private
///
///    'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
///    auxiliary functions generated by the compiler that operate on
///    variables of type 'ReduceData'. They aid the runtime in performing
///    algorithmic steps in a data agnostic manner.
///
///    'shuffleReduceFn' is a pointer to a function that reduces data
///    of type 'ReduceData' across two OpenMP threads (lanes) in the
///    same warp. It takes the following arguments as input:
///
///    a. variable of type 'ReduceData' on the calling lane,
///    b. its lane_id,
///    c. an offset relative to the current lane_id to generate a
///       remote_lane_id. The remote lane contains the second
///       variable of type 'ReduceData' that is to be reduced.
///    d. an algorithm version parameter determining which reduction
///       algorithm to use.
///
///    'shuffleReduceFn' retrieves data from the remote lane using
///    efficient GPU shuffle intrinsics and reduces, using the
///    algorithm specified by the 4th parameter, the two operands
///    element-wise. The result is written to the first operand.
///
///    Different reduction algorithms are implemented in different
///    runtime functions, all calling 'shuffleReduceFn' to perform
///    the essential reduction step. Therefore, based on the 4th
///    parameter, this function behaves slightly differently to
///    cooperate with the runtime to ensure correctness under
///    different circumstances.
///
///    'InterWarpCpyFn' is a pointer to a function that transfers
///    reduced variables across warps. It tunnels, through CUDA
///    shared memory, the thread-private data of type 'ReduceData'
///    from lane 0 of each warp to a lane in the first warp.
/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
///    The last team writes the global reduced value to memory.
///
///      ret = __kmpc_nvptx_teams_reduce_nowait(...,
///              reduceData, shuffleReduceFn, interWarpCpyFn,
///              scratchpadCopyFn, loadAndReduceFn)
///
///    'scratchpadCopyFn' is a helper that stores reduced
///    data from the team master to a scratchpad array in
///    global memory.
///
///    'loadAndReduceFn' is a helper that loads data from
///    the scratchpad array and reduces it with the input
///    operand.
///
///    These compiler generated functions hide address
///    calculation and alignment information from the runtime.
/// 5. if ret == 1:
///    The team master of the last team stores the reduced
///    result to the globals in memory.
///      foo += *reduceData.foo; bar *= *reduceData.bar
///
///
/// Warp Reduction Algorithms
///
/// On the warp level, we have three algorithms implemented in the
/// OpenMP runtime depending on the number of active lanes:
///
/// Full Warp Reduction
///
/// The reduce algorithm within a warp where all lanes are active
/// is implemented in the runtime as follows:
///
/// full_warp_reduce(void *reduce_data,
///                  kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
///     ShuffleReduceFn(reduce_data, 0, offset, 0);
/// }
///
/// The algorithm completes in log(2, WARPSIZE) steps.
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
/// not used; we therefore save instructions by not retrieving lane_id
/// from the corresponding special registers. The 4th parameter, which
/// represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This step, implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp masters" (the zeroth lane of each warp,
/// where warp-reduced data is held) to the zeroth warp. This step reduces
/// (in a mathematical sense) the problem of reduction across warp masters
/// in a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
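/// A rough sketch of the inter-team flow as seen from one team (illustrative
/// pseudocode only; the exact protocol lives in the runtime, and the names
/// below stand for the function pointers passed to
/// '__kmpc_nvptx_teams_reduce_nowait'):
///
///   scratchpadCopyFn(reduceData, scratchpad, teamId, numTeams);
///   if (this team is the last to finish) {
///     for (i = threadId; i < numTeams; i += numThreads)
///       loadAndReduceFn(reduceData, scratchpad, i, numTeams,
///                       /*should_reduce=*/i >= numThreads);
///     // Conclude with an intra-team reduction across these workers.
///     __kmpc_nvptx_parallel_reduce_nowait(...);
///   }
///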
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  // FIXME: Add support for simd reduction.
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  auto &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .first,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }
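
  // At this point, for the example with reduction(+:foo) reduction(*:bar)
  // described above, the list holds RedList[0] = &foo_private and
  // RedList[1] = &bar_private; a variably modified type additionally stores
  // its dynamic size in the slot that follows its pointer.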

  // 2. Emit reduce_func().
  auto *ReductionFn = emitReductionFunction(
      CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
      LHSExprs, RHSExprs, ReductionOps);

  // 4. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  //    RedList, shuffle_reduce_func, interwarp_copy_func);
  auto *ThreadId = getThreadID(CGF, Loc);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn);
  auto *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);

  llvm::Value *Res = nullptr;
  if (ParallelReduction) {
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  }

  if (TeamsReduction) {
    auto *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 5. Build switch(res)
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 6. Build case 1: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}
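
// Illustrative shape of the code emitted above for a parallel reduction with
// two variables (names and runtime entry points are simplified):
//
//   res = __kmpc_nvptx_parallel_reduce_nowait(
//       gtid, 2, sizeof(RedList), RedList, shuffleReduceFn, interWarpCpyFn);
//   switch (res) {
//   case 1:
//     foo += foo_private; bar *= bar_private; // combiners from ReductionOps
//     __kmpc_nvptx_end_reduce_nowait(gtid);
//     break;
//   default:
//     break;
//   }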

const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  return ImplicitParamDecl::Create(
      CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
}
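
// For example (assuming the usual NVPTX address-space numbering), a native
// parameter 'double &x' captured by a map clause is translated above into a
// restrict-qualified pointer to 'double' in the global address space, with
// the pointer itself qualified to live in the local (NVPTX_local_addr == 5)
// address space.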

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
                       /*AddrSpace=*/0));
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}
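
// Pointer arguments are thus routed through the generic address space before
// being cast to the parameter's expected type. For example, an i32 pointer in
// the local address space passed to a parameter typed as a global pointer is
// emitted roughly as (illustrative IR only):
//
//   %1 = addrspacecast i32 addrspace(5)* %arg to i32*
//   %2 = addrspacecast i32* %1 to i32 addrspace(1)*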