//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// NVPTX targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// short RequiresOMPRuntime, short RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      llvm::None, "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      llvm::None, "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
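/// E.g., with LaneIDBits == 5, the thread with id 37 belongs to warp 1
/// (37 >> 5 == 1).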
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      llvm::None, "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(),
                                                 llvm::Intrinsic::nvvm_barrier),
                 Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
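/// For example, a CTA launched with 128 threads yields a thread_limit of 96
/// in generic mode (the last warp is reserved for the master) and 128 in
/// spmd mode.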
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

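  // master_tid = (NumThreads - 1) & ~(WarpSize - 1), i.e. the first lane of
  // the last warp in the block.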
  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
  llvm_unreachable("Unsupported directive on NVPTX device.");
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Setup NVPTX threads for master-worker OpenMP scheme.
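// The emitted entry code splits the CTA: worker threads branch to the worker
// loop and wait for parallel work, the single master thread initializes the
// OpenMP runtime state and runs the sequential part of the target region,
// and the remaining threads of the last warp jump straight to the exit block.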
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  return;
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic', or 'spmd' depending
// on the target directive. This variable is picked up by the offload library
// to setup the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master, otherwise,
// all warps participate in parallel work.
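// The global is named '<outlined function name>_exec_mode' and holds the
// ExecutionMode enumerator value for the region.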
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  auto &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
    CGF.EmitCallOrInvoke(Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // short RequiresOMPRuntime, short RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
                                                       InnermostKind, CodeGen);
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
                                               PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
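/// Values of at most 4 bytes are moved with __kmpc_shuffle_int32; larger
/// values (up to 8 bytes) use __kmpc_shuffle_int64.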
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
};
} // namespace

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
                                  QualType ReductionArrayTy,
                                  ArrayRef<const Expr *> Privates,
                                  Address SrcBase, Address DestBase,
                                  llvm::Value *RemoteLaneOffset = nullptr) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), SourceLocation());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
                                          RemoteLaneOffset);
    }

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    Idx++;
  }
}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
                                  /*Id=*/nullptr, C.VoidPtrTy);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(),
                                /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true));
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (auto &Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    auto IsWarpMaster =
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // elem = *elemptr
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *MediumPtr = elem
    Bld.CreateStore(Elem, MediumPtr);

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    Idx++;
  }

  CGF.FinishFunction();
  return Fn;
}
1271
1272/// Emit a helper that reduces data across two OpenMP threads (lanes)
1273/// in the same warp. It uses shuffle instructions to copy over data from
1274/// a remote lane's stack. The reduction algorithm performed is specified
1275/// by the fourth parameter.
1276///
1277/// Algorithm Versions.
1278/// Full Warp Reduce (argument value 0):
1279/// This algorithm assumes that all 32 lanes are active and gathers
1280/// data from these 32 lanes, producing a single resultant value.
1281/// Contiguous Partial Warp Reduce (argument value 1):
1282/// This algorithm assumes that only a *contiguous* subset of lanes
1283/// are active. This happens for the last warp in a parallel region
1284/// when the user-specified num_threads is not an integer multiple of
1285/// 32. This contiguous subset always starts with the zeroth lane.
1286/// Partial Warp Reduce (argument value 2):
1287/// This algorithm gathers data from any number of lanes at any position.
1288/// All reduced values are stored in the lowest possible lane. The set
1289/// of problems every algorithm addresses is a superset of those
1290/// addressable by algorithms with a lower version number. Overhead
1291/// increases as algorithm version increases.
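///
///   For instance, a parallel region with num_threads(40) leaves the second
///   warp with only its first eight lanes active: that warp uses version 1
///   while the fully populated first warp uses version 0.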
1292///
1293/// Terminology
1294/// Reduce element:
1295/// Reduce element refers to the individual data field with primitive
1296/// data types to be combined and reduced across threads.
1297/// Reduce list:
1298/// Reduce list refers to a collection of local, thread-private
1299/// reduce elements.
1300/// Remote Reduce list:
1301/// Remote Reduce list refers to a collection of remote (relative to
1302/// the current thread) reduce elements.
1303///
1304/// We distinguish between three states of threads that are important to
1305/// the implementation of this function.
1306/// Alive threads:
1307/// Threads in a warp executing the SIMT instruction, as distinguished from
1308/// threads that are inactive due to divergent control flow.
1309/// Active threads:
1310/// The minimal set of threads that has to be alive upon entry to this
1311/// function. The computation is correct iff active threads are alive.
1312/// Some threads are alive but they are not active because they do not
1313/// contribute to the computation in any useful manner. Turning them off
1314/// may introduce control flow overheads without any tangible benefits.
1315/// Effective threads:
1316/// In order to comply with the argument requirements of the shuffle
1317/// function, we must keep all lanes holding data alive. But at most
1318/// half of them perform value aggregation; we refer to this half of
1319/// threads as effective. The other half simply hands off its
1320/// data.
1321///
1322/// Procedure
1323/// Value shuffle:
1324/// In this step active threads transfer data from higher lane positions
1325/// in the warp to lower lane positions, creating Remote Reduce list.
1326/// Value aggregation:
1327/// In this step, effective threads combine their thread local Reduce list
1328/// with Remote Reduce list and store the result in the thread local
1329/// Reduce list.
1330/// Value copy:
1331/// In this step, we deal with the assumption made by algorithm 2
1332/// (i.e. contiguity assumption). When we have an odd number of lanes
1333/// active, say 2k+1, only k threads will be effective and therefore k
1334/// new values will be produced. However, the Reduce list owned by the
1335/// (2k+1)th thread is ignored in the value aggregation. Therefore
1336/// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
1337/// that the contiguity assumption still holds.
1338static llvm::Value *
1339emitShuffleAndReduceFunction(CodeGenModule &CGM,
1340 ArrayRef<const Expr *> Privates,
1341 QualType ReductionArrayTy, llvm::Value *ReduceFn) {
1342 auto &C = CGM.getContext();
1343
1344 // Thread local Reduce list used to host the values of data to be reduced.
1345 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
1346 /*Id=*/nullptr, C.VoidPtrTy);
1347 // Current lane id; could be logical.
1348 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(),
1349 /*Id=*/nullptr, C.ShortTy);
1350 // Offset of the remote source lane relative to the current lane.
1351 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(),
1352 /*Id=*/nullptr, C.ShortTy);
1353 // Algorithm version. This is expected to be known at compile time.
1354 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(),
1355 /*Id=*/nullptr, C.ShortTy);
1356 FunctionArgList Args;
1357 Args.push_back(&ReduceListArg);
1358 Args.push_back(&LaneIDArg);
1359 Args.push_back(&RemoteLaneOffsetArg);
1360 Args.push_back(&AlgoVerArg);
1361
1362 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1363 auto *Fn = llvm::Function::Create(
1364 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1365 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
1366 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
1367 CodeGenFunction CGF(CGM);
1368 // We don't need debug information in this function as nothing here refers to
1369 // user code.
1370 CGF.disableDebugInfo();
1371 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1372
1373 auto &Bld = CGF.Builder;
1374
1375 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1376 Address LocalReduceList(
1377 Bld.CreatePointerBitCastOrAddrSpaceCast(
1378 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1379 C.VoidPtrTy, SourceLocation()),
1380 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1381 CGF.getPointerAlign());
1382
1383 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
1384 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
1385 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1386
1387 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
1388 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
1389 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1390
1391 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
1392 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
1393 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1394
1395 // Create a local thread-private variable to host the Reduce list
1396 // from a remote lane.
1397 Address RemoteReduceList =
1398 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
1399
1400 // This loop iterates through the list of reduce elements and copies,
1401 // element by element, from a remote lane in the warp to RemoteReduceList,
1402 // hosted on the thread's stack.
1403 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
1404 LocalReduceList, RemoteReduceList,
1405 RemoteLaneOffsetArgVal);
1406
1407 // The action to be performed on the Remote Reduce list depends on the
1408 // algorithm version.
1409 //
1410 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
1411 // LaneId % 2 == 0 && Offset > 0):
1412 // do the reduction value aggregation
1413 //
1414 // The thread local variable Reduce list is mutated in place to host the
1415 // reduced data, which is the aggregated value produced from local and
1416 // remote lanes.
1417 //
1418 // Note that AlgoVer is expected to be a constant integer known at compile
1419 // time.
1420 // When AlgoVer==0, the first conjunction evaluates to true, making
1421 // the entire predicate true at compile time.
1422 // When AlgoVer==1, only the second part of the second conjunction
1423 // requires a run-time check; the other conjunctions evaluate to false
1424 // at compile time.
1425 // When AlgoVer==2, only the second part of the third conjunction
1426 // requires a run-time check; the other conjunctions evaluate to false
1427 // at compile time.
1428 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1429
1430 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1431 auto CondAlgo1 = Bld.CreateAnd(
1432 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1433
1434 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
1435 auto CondAlgo2 = Bld.CreateAnd(
1436 Algo2,
1437 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1438 Bld.getInt16(0)));
1439 CondAlgo2 = Bld.CreateAnd(
1440 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1441
1442 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1443 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1444
1445 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1446 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1447 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1448 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1449
1450 CGF.EmitBlock(ThenBB);
1451 // reduce_function(LocalReduceList, RemoteReduceList)
1452 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1453 LocalReduceList.getPointer(), CGF.VoidPtrTy);
1454 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1455 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1456 CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
1457 Bld.CreateBr(MergeBB);
1458
1459 CGF.EmitBlock(ElseBB);
1460 Bld.CreateBr(MergeBB);
1461
1462 CGF.EmitBlock(MergeBB);
1463
1464 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
1465 // Reduce list.
1466 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1467 auto CondCopy = Bld.CreateAnd(
1468 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1469
1470 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
1471 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
1472 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
1473 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1474
1475 CGF.EmitBlock(CpyThenBB);
1476 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1477 RemoteReduceList, LocalReduceList);
1478 Bld.CreateBr(CpyMergeBB);
1479
1480 CGF.EmitBlock(CpyElseBB);
1481 Bld.CreateBr(CpyMergeBB);
1482
1483 CGF.EmitBlock(CpyMergeBB);
1484
1485 CGF.FinishFunction();
1486 return Fn;
1487}
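
// For reference, the helper emitted above corresponds roughly to the
// following pseudocode (a sketch; 'reduce_function' stands for the ReduceFn
// argument, other names are illustrative):
//
//   void shuffle_and_reduce(void *local_list, short lane_id,
//                           short remote_lane_offset, short algo_ver) {
//     // Gather each element from lane (lane_id + remote_lane_offset) via
//     // shuffle into a stack-local remote_list.
//     remote_list = shuffle_from_remote_lane(local_list, remote_lane_offset);
//     if (algo_ver == 0 ||
//         (algo_ver == 1 && lane_id < remote_lane_offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && remote_lane_offset > 0))
//       reduce_function(local_list, remote_list);
//     if (algo_ver == 1 && lane_id >= remote_lane_offset)
//       local_list = remote_list;  // preserve the contiguity assumption
//   }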
1488
1489///
1490/// Design of OpenMP reductions on the GPU
1491///
1492/// Consider a typical OpenMP program with one or more reduction
1493/// clauses:
1494///
1495/// float foo;
1496/// double bar;
1497/// #pragma omp target teams distribute parallel for \
1498/// reduction(+:foo) reduction(*:bar)
1499/// for (int i = 0; i < N; i++) {
1500/// foo += A[i]; bar *= B[i];
1501/// }
1502///
1503/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1504/// all teams. In our OpenMP implementation on the NVPTX device an
1505/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1506/// within a team are mapped to CUDA threads within a threadblock.
1507/// Our goal is to efficiently aggregate values across all OpenMP
1508/// threads such that:
1509///
1510/// - the compiler and runtime are logically concise, and
1511/// - the reduction is performed efficiently in a hierarchical
1512/// manner as follows: within OpenMP threads in the same warp,
1513/// across warps in a threadblock, and finally across teams on
1514/// the NVPTX device.
1515///
1516/// Introduction to Decoupling
1517///
1518/// We would like to decouple the compiler and the runtime so that the
1519/// latter is ignorant of the reduction variables (number, data types)
1520/// and the reduction operators. This allows a simpler interface
1521/// and implementation while still attaining good performance.
1522///
1523/// Pseudocode for the aforementioned OpenMP program generated by the
1524/// compiler is as follows:
1525///
1526/// 1. Create private copies of reduction variables on each OpenMP
1527/// thread: 'foo_private', 'bar_private'
1528/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1529/// to it and writes the result in 'foo_private' and 'bar_private'
1530/// respectively.
1531/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1532/// and store the result on the team master:
1533///
1534/// __kmpc_nvptx_parallel_reduce_nowait(...,
1535/// reduceData, shuffleReduceFn, interWarpCpyFn)
1536///
1537/// where:
1538/// struct ReduceData {
1539///     float *foo;
1540/// double *bar;
1541/// } reduceData
1542/// reduceData.foo = &foo_private
1543/// reduceData.bar = &bar_private
1544///
1545/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1546/// auxiliary functions generated by the compiler that operate on
1547/// variables of type 'ReduceData'. They help the runtime perform
1548/// algorithmic steps in a data-agnostic manner.
1549///
1550/// 'shuffleReduceFn' is a pointer to a function that reduces data
1551/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1552/// same warp. It takes the following arguments as input:
1553///
1554/// a. variable of type 'ReduceData' on the calling lane,
1555/// b. its lane_id,
1556/// c. an offset relative to the current lane_id to generate a
1557/// remote_lane_id. The remote lane contains the second
1558/// variable of type 'ReduceData' that is to be reduced.
1559/// d. an algorithm version parameter determining which reduction
1560/// algorithm to use.
1561///
1562/// 'shuffleReduceFn' retrieves data from the remote lane using
1563/// efficient GPU shuffle intrinsics and reduces, using the
1564/// algorithm specified by the 4th parameter, the two operands
1565/// element-wise. The result is written to the first operand.
1566///
1567/// Different reduction algorithms are implemented in different
1568/// runtime functions, all calling 'shuffleReduceFn' to perform
1569/// the essential reduction step. Therefore, based on the 4th
1570/// parameter, this function behaves slightly differently to
1571/// cooperate with the runtime to ensure correctness under
1572/// different circumstances.
1573///
1574/// 'InterWarpCpyFn' is a pointer to a function that transfers
1575/// reduced variables across warps. It tunnels, through CUDA
1576/// shared memory, the thread-private data of type 'ReduceData'
1577/// from lane 0 of each warp to a lane in the first warp.
1578/// 4. if ret == 1:
1579/// The team master of the last team stores the reduced
1580/// result to the globals in memory.
1581/// foo += reduceData.foo; bar *= reduceData.bar
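///
///    A rough sketch of the code the compiler emits for steps 3 and 4 above
///    (illustrative only; the two declarations merely show the shapes of the
///    compiler-generated helpers):
///
///      void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                           int16_t lane_offset, int16_t algo_version);
///      void interWarpCpyFn(void *reduce_data, int32_t num_warps);
///
///      int32_t ret = __kmpc_nvptx_parallel_reduce_nowait(
///          gtid, 2, sizeof(reduceData), &reduceData,
///          shuffleReduceFn, interWarpCpyFn);
///      if (ret == 1) {
///        foo += reduceData.foo; bar *= reduceData.bar;
///        __kmpc_nvptx_end_reduce_nowait(gtid);
///      }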
1582///
1583///
1584/// Warp Reduction Algorithms
1585///
1586/// On the warp level, we have three algorithms implemented in the
1587/// OpenMP runtime depending on the number of active lanes:
1588///
1589/// Full Warp Reduction
1590///
1591/// The reduce algorithm within a warp where all lanes are active
1592/// is implemented in the runtime as follows:
1593///
1594/// full_warp_reduce(void *reduce_data,
1595/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1596/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1597/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1598/// }
1599///
1600/// The algorithm completes in log(2, WARPSIZE) steps.
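/// For example, with WARPSIZE == 32 the loop runs with offsets 16, 8, 4, 2
/// and 1, i.e. five shuffle-and-reduce steps.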
1601///
1602/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1603/// not used; we therefore save instructions by not retrieving lane_id
1604/// from the corresponding special registers. The 4th parameter, which
1605/// represents the version of the algorithm being used, is set to 0 to
1606/// signify full warp reduction.
1607///
1608/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1609///
1610/// #reduce_elem refers to an element in the local lane's data structure
1611/// #remote_elem is retrieved from a remote lane
1612/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1613/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1614///
1615/// Contiguous Partial Warp Reduction
1616///
1617/// This reduce algorithm is used within a warp where only the first
1618/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1619/// number of OpenMP threads in a parallel region is not a multiple of
1620/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1621///
1622/// void
1623/// contiguous_partial_reduce(void *reduce_data,
1624/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1625/// int size, int lane_id) {
1626/// int curr_size;
1627/// int offset;
1628/// curr_size = size;
1629/// offset = curr_size/2;
1630/// while (offset>0) {
1631/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1632/// curr_size = (curr_size+1)/2;
1633/// offset = curr_size/2;
1634/// }
1635/// }
1636///
1637/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1638///
1639/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1640/// if (lane_id < offset)
1641/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1642/// else
1643/// reduce_elem = remote_elem
1644///
1645/// This algorithm assumes that the data to be reduced are located in a
1646/// contiguous subset of lanes starting from the first. When there is
1647/// an odd number of active lanes, the data in the last lane is not
1648/// aggregated with any other lane's data but is instead copied over.
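///
/// For example, with 7 active lanes the loop runs with offsets 3, 2 and 1:
/// after the first step lanes 0-2 hold aggregated values and lane 3 holds a
/// copy of lane 6's data, after the second step the live data sits in lanes
/// 0-1, and after the final step the fully reduced value resides in lane 0.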
1649///
1650/// Dispersed Partial Warp Reduction
1651///
1652/// This algorithm is used within a warp when any discontiguous subset of
1653/// lanes is active. It is used to implement the reduction operation
1654/// across lanes in an OpenMP simd region or in a nested parallel region.
1655///
1656/// void
1657/// dispersed_partial_reduce(void *reduce_data,
1658/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1659/// int size, remote_id;
1660/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1661/// do {
1662/// remote_id = next_active_lane_id_right_after_me();
1663/// # the above function returns 0 if no active lane
1664/// # is present right after the current lane.
1665/// size = number_of_active_lanes_in_this_warp();
1666/// logical_lane_id /= 2;
1667/// ShuffleReduceFn(reduce_data, logical_lane_id,
1668/// remote_id-1-threadIdx.x, 2);
1669/// } while (logical_lane_id % 2 == 0 && size > 1);
1670/// }
1671///
1672/// There is no assumption made about the initial state of the reduction.
1673/// Any number of lanes (>=1) could be active at any position. The reduction
1674/// result is returned in the first active lane.
1675///
1676/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1677///
1678/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1679/// if (lane_id % 2 == 0 && offset > 0)
1680/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1681/// else
1682/// reduce_elem = remote_elem
1683///
1684///
1685/// Intra-Team Reduction
1686///
1687/// This function, as implemented in the runtime call
1688/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
1689/// threads in a team. It first reduces within a warp using the
1690/// aforementioned algorithms. We then proceed to gather all such
1691/// reduced values at the first warp.
1692///
1693/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1694/// data from each "warp master" (the zeroth lane of each warp, where
1695/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1696/// a mathematical sense) the problem of reduction across warp masters in
1697/// a block to the problem of warp reduction.
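///
/// A plausible shape for this routine, as implied by the description above
/// (a sketch only, not the actual runtime source):
///
///    int32_t __kmpc_nvptx_parallel_reduce_nowait(...) {
///      reduce within my warp (one of the three warp algorithms above);
///      InterWarpCpyFn(reduce_data, num_warps);  // warp masters -> warp 0
///      if (thread_id < num_warps)
///        reduce within warp 0 (Contiguous Partial Warp Reduction);
///      return thread_id == 0;  // 1 only on the team master
///    }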
1698///
1699void CGOpenMPRuntimeNVPTX::emitReduction(
1700 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1701 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1702 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1703 if (!CGF.HaveInsertPoint())
1704 return;
1705
1706 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1707 assert(ParallelReduction && "Invalid reduction selection in emitReduction.");
1708
1709 auto &C = CGM.getContext();
1710
1711 // 1. Build a list of reduction variables.
1712 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
1713 auto Size = RHSExprs.size();
1714 for (auto *E : Privates) {
1715 if (E->getType()->isVariablyModifiedType())
1716 // Reserve place for array size.
1717 ++Size;
1718 }
1719 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
1720 QualType ReductionArrayTy =
1721 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
1722 /*IndexTypeQuals=*/0);
1723 Address ReductionList =
1724 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
1725 auto IPriv = Privates.begin();
1726 unsigned Idx = 0;
1727 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
1728 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
1729 CGF.getPointerSize());
1730 CGF.Builder.CreateStore(
1731 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1732 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
1733 Elem);
1734 if ((*IPriv)->getType()->isVariablyModifiedType()) {
1735 // Store array size.
1736 ++Idx;
1737 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
1738 CGF.getPointerSize());
1739 llvm::Value *Size = CGF.Builder.CreateIntCast(
1740 CGF.getVLASize(
1741 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
1742 .first,
1743 CGF.SizeTy, /*isSigned=*/false);
1744 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
1745 Elem);
1746 }
1747 }
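  // For instance, with one scalar and one variable-length array to reduce,
  // the emitted list has the shape (illustrative names):
  //   void *RedList[3] = {&scalar_priv, &vla_priv[0], (void *)vla_length};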
1748
1749 // 2. Emit reduce_func().
1750 auto *ReductionFn = emitReductionFunction(
1751 CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
1752 LHSExprs, RHSExprs, ReductionOps);
1753
1754 // 3. Build res = __kmpc_nvptx_parallel_reduce_nowait(<gtid>, <n>,
1755 // sizeof(RedList), RedList, shuffle_and_reduce_func, interwarp_copy_func);
1756 auto *ThreadId = getThreadID(CGF, Loc);
1757 auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
1758 auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1759 ReductionList.getPointer(), CGF.VoidPtrTy);
1760
1761 auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
1762 CGM, Privates, ReductionArrayTy, ReductionFn);
1763 auto *InterWarpCopyFn =
1764 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
1765
1766 llvm::Value *Res = nullptr;
1767 if (ParallelReduction) {
1768 llvm::Value *Args[] = {ThreadId,
1769 CGF.Builder.getInt32(RHSExprs.size()),
1770 ReductionArrayTySize,
1771 RL,
1772 ShuffleAndReduceFn,
1773 InterWarpCopyFn};
1774
1775 Res = CGF.EmitRuntimeCall(
1776 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
1777 Args);
1778 }
1779
1780 // 5. Build switch(res)
1781 // 4. Build switch(res)
1782 auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
1783
1784 // 5. Build case 1: where we have reduced values in the master
1785 // thread in each team.
1786 // __kmpc_end_reduce{_nowait}(<gtid>);
1787 // break;
1788 auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
1789 SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
1790 CGF.EmitBlock(Case1BB);
1791
1792 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
1793 llvm::Value *EndArgs[] = {ThreadId};
1794 auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
1795 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
1796 auto IPriv = Privates.begin();
1797 auto ILHS = LHSExprs.begin();
1798 auto IRHS = RHSExprs.begin();
1799 for (auto *E : ReductionOps) {
1800 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
1801 cast<DeclRefExpr>(*IRHS));
1802 ++IPriv;
1803 ++ILHS;
1804 ++IRHS;
1805 }
1806 };
1807 RegionCodeGenTy RCG(CodeGen);
1808 NVPTXActionTy Action(
1809 nullptr, llvm::None,
1810 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
1811 EndArgs);
1812 RCG.setAction(Action);
1813 RCG(CGF);
1814 CGF.EmitBranch(DefaultBB);
1815 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
1816}