//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
// The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, void ***args, kmp_int32 nArgs);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
  /// ***args);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};
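
// Illustrative note (not part of the upstream source): with WarpSize = 32,
// a linear thread id decomposes into a warp id and a lane id as
//   warpId = tid >> LaneIDBits;  // e.g. tid 70 -> warpId 2
//   laneId = tid & LaneIDMask;   // e.g. tid 70 -> laneId 6
// which is exactly what getNVPTXWarpID and getNVPTXLaneID below emit.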

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}
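
// Worked example (illustrative, assuming a 128-thread CTA): in generic mode
// getThreadLimit yields 128 - 32 = 96, so threads 0-95 act as workers and
// the last warp (threads 96-127) is reserved for the master; in spmd mode
// it yields the full 128.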

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
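
// Arithmetic sketch (illustrative): for NumThreads = 128 and WarpSize = 32,
// Mask = 31, so master_tid = (128 - 1) & ~31 = 96, the first lane of the
// last warp, matching the examples in the comment above.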

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
  llvm_unreachable("Unsupported directive on NVPTX device.");
}
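
// For instance, '#pragma omp target' and '#pragma omp target teams' select
// the generic (master-worker) scheme, while '#pragma omp target parallel',
// 'target parallel for', and 'target parallel for simd' select spmd codegen.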

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}
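
// Sketch of the entry control flow emitted above (illustrative):
//   if (tid < thread_limit)     -> .worker : run the worker state machine
//   else if (tid == master_tid) -> .master : init runtime, run target region
//   else                        -> .exit   : idle lanes of the master warp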

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}
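
// For example (illustrative): an entry function named 'foo' gets a weak
// constant i8 global 'foo_exec_mode' holding the ExecutionMode value; the
// '_exec_mode' suffix comes from the Twine above, and the offload library
// reads the global before launching the kernel.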

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // Set up shared arguments
  Address SharedArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
  llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Current context
  ASTContext &Ctx = CGF.getContext();

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes exactly three arguments:
    //   - the parallelism level;
    //   - the master thread ID;
    //   - the list of references to shared arguments.
    //
    // TODO: Assert that the function is a wrapper function.
    Address Capture = CGF.EmitLoadOfPointer(SharedArgs,
        Ctx.getPointerType(
            Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>());
    emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0),
             getMasterThreadID(CGF), Capture.getPointer()});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}
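
// Worker state machine emitted above (illustrative summary):
//   .await.work: barrier, then __kmpc_kernel_parallel fills work_fn/exec_status
//   work_fn == null  -> .exit (kernel is terminating)
//   exec_status == 0 -> .barrier.parallel (worker not needed this region)
//   otherwise        -> match work_fn against each known wrapper and call it,
//   then .terminate.parallel -> __kmpc_kernel_end_parallel -> barrier -> loop.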

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, void ***args, kmp_int32 nArgs);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function, void ***args);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0)};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  auto *OutlinedFun = cast<llvm::Function>(
      CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  if (!isInSpmdExecutionMode()) {
    llvm::Function *WrapperFun =
        createDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  llvm::Function *WFn = WrapperFunctionsMap[Fn];
  assert(WFn && "Wrapper function does not exist!");

  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
                                                    PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    if (!CapturedVars.empty()) {
      // Prepare for parallel region. Indicate the outlined function.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
                                           "shared_args");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
      llvm::Value *Args[] = {ID, SharedArgsPtr,
                             Bld.getInt32(CapturedVars.size())};

      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);

      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            CGF.EmitLoadOfPointer(SharedArgs,
                Ctx.getPointerType(
                    Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()),
            Idx, CGF.getPointerSize());
        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        Idx++;
      }
    } else {
      llvm::Value *Args[] = {ID,
          llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
          /*nArgs=*/Bld.getInt32(0)};
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
                                                    PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}
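
// Usage sketch (illustrative): a 4-byte element is sign-extended/bitcast to
// i32 and moved with __kmpc_shuffle_int32, while an 8-byte element (e.g. a
// double) goes through i64 and __kmpc_shuffle_int64, then is truncated or
// bitcast back to its original memory type.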
1082
1083namespace {
1084enum CopyAction : unsigned {
1085 // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
1086 // the warp using shuffle instructions.
1087 RemoteLaneToThread,
1088 // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
1089 ThreadCopy,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001090 // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
1091 ThreadToScratchpad,
1092 // ScratchpadToThread: Copy from a scratchpad array in global memory
1093 // containing team-reduced data to a thread's stack.
1094 ScratchpadToThread,
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001095};
1096} // namespace
1097
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001098struct CopyOptionsTy {
1099 llvm::Value *RemoteLaneOffset;
1100 llvm::Value *ScratchpadIndex;
1101 llvm::Value *ScratchpadWidth;
1102};
1103
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001104/// Emit instructions to copy a Reduce list, which contains partially
1105/// aggregated values, in the specified direction.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001106static void emitReductionListCopy(
1107 CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
1108 ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
1109 CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001110
1111 auto &CGM = CGF.CGM;
1112 auto &C = CGM.getContext();
1113 auto &Bld = CGF.Builder;
1114
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001115 auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
1116 auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
1117 auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;
1118
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001119 // Iterates, element-by-element, through the source Reduce list and
1120 // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      Address ScratchpadPtr =
          Address(ScratchPadElemAbsolutePtrVal,
                  C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      UpdateDestListPtr = true;
      break;
    }
    }

    // Regardless of the direction of the copy, we emit the load of the
    // src element here, since it is required for all copy actions.
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), SourceLocation());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
                                          RemoteLaneOffset);
    }

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
    // address of the next element in scratchpad memory, unless we're currently
    // processing the last one. Memory alignment is also taken care of here.
    if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
      llvm::Value *ScratchpadBasePtr =
          IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      ScratchpadBasePtr = Bld.CreateAdd(
          ScratchpadBasePtr,
          Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
                                             CGM.SizeTy, ElementSizeInChars)));

      // Take care of global memory alignment for performance.
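      // The next four operations round ScratchpadBasePtr up to the next
      // multiple of GlobalMemoryAlignment, i.e.
      //   base = ((base - 1) / Align + 1) * Align
      // so that each element row of the scratchpad starts on an aligned
      // address.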
      ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateSDiv(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
      ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
                                        llvm::ConstantInt::get(CGM.SizeTy, 1));
      ScratchpadBasePtr = Bld.CreateMul(
          ScratchpadBasePtr,
          llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));

      if (IncrScratchpadDest)
        DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
      else /* IncrScratchpadSrc = true */
        SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
    }

    Idx++;
  }
}

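// A sketch of the scratchpad layout implied by the copy logic above; the
// runtime owns the actual allocation, so the names here are illustrative
// only. Each reduce element occupies one row of Width slots, one slot per
// team, and each row is rounded up to GlobalMemoryAlignment:
//
//   row 0: elem0[team 0] elem0[team 1] ... elem0[team Width-1] <pad>
//   row 1: elem1[team 0] elem1[team 1] ... elem1[team Width-1] <pad>
//   ...
//
// With Index = team id, the slot read or written for element i is
//   row_base_i + Index * sizeof(elem_i)
// which matches the address arithmetic in the ThreadToScratchpad and
// ScratchpadToThread cases.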
/// This function emits a helper that loads data from the scratchpad array
/// and (optionally) reduces it with the input operand.
///
/// load_and_reduce(local, scratchpad, index, width, should_reduce)
///   reduce_data remote;
///   for elem in remote:
///     remote.elem = Scratchpad[elem_id][index]
///   if (should_reduce)
///     local = local @ remote
///   else
///     local = remote
static llvm::Value *
emitReduceScratchpadFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Destination of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A source index into the scratchpad array.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
  // If should_reduce == 1, then it's load AND reduce;
  // if should_reduce == 0 (or any other value), it only loads (+ copy).
  // The latter case is used for initialization.
  ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);
  Args.push_back(&ShouldReduceArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_load_and_reduce", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // Get local Reduce list pointer.
  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address ReduceListAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGM.SizeTy, /*isSigned=*/true);

  Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
  llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
      AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());

  // The absolute address of the base of the next element to copy, kept as
  // an integer.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  // Create a Remote Reduce list to store the elements read from the
  // scratchpad array.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");

  // Assemble remote Reduce list from scratchpad array.
  emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, RemoteReduceList,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

  auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // We should reduce with the local Reduce list.
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      ReduceListAddr.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  // No reduction; just copy:
  // Local Reduce list = Remote Reduce list.
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, ReduceListAddr);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  CGF.FinishFunction();
  return Fn;
}

/// This function emits a helper that stores reduced data from the team
/// master to a scratchpad array in global memory.
///
///   for elem in Reduce List:
///     scratchpad[elem_id][index] = elem
///
static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
                                         ArrayRef<const Expr *> Privates,
                                         QualType ReductionArrayTy) {

  auto &C = CGM.getContext();
  auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);

  // Source of the copy.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Base address of the scratchpad array, with each element storing a
  // Reduce list per team.
  ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // A destination index into the scratchpad array, typically the team
  // identifier.
  ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
  // Row width of an element in the scratchpad array, typically
  // the number of teams.
  ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);

  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&ScratchPadArg);
  Args.push_back(&IndexArg);
  Args.push_back(&WidthArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address SrcDataAddr(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
  llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
      AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());

  Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
  llvm::Value *IndexVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
  llvm::Value *WidthVal =
      Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
                                             Int32Ty, SourceLocation()),
                        CGF.SizeTy, /*isSigned=*/true);

  // The absolute address of the base of the next element to copy, kept as
  // an integer.
  llvm::Value *CumulativeElemBasePtr =
      Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
  Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());

  emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
                        SrcDataAddr, DestDataAddr,
                        {/*RemoteLaneOffset=*/nullptr,
                         /*ScratchpadIndex=*/IndexVal,
                         /*ScratchpadWidth=*/WidthVal});

  CGF.FinishFunction();
  return Fn;
}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C,
                                C.getIntTypeForBitwidth(32, /* Signed */ true),
                                ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (auto &Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    auto IsWarpMaster =
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // elem = *elemptr
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *MediumPtr = elem
    Bld.CreateStore(Elem, MediumPtr);

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    Idx++;
  }

  CGF.FinishFunction();
  return Fn;
}

/// Emit a helper that reduces data across two OpenMP threads (lanes)
/// in the same warp. It uses shuffle instructions to copy over data from
/// a remote lane's stack. The reduction algorithm performed is specified
/// by the fourth parameter.
///
/// Algorithm Versions.
/// Full Warp Reduce (argument value 0):
///   This algorithm assumes that all 32 lanes are active and gathers
///   data from these 32 lanes, producing a single resultant value.
/// Contiguous Partial Warp Reduce (argument value 1):
///   This algorithm assumes that only a *contiguous* subset of lanes
///   are active. This happens for the last warp in a parallel region
///   when the user-specified num_threads is not an integer multiple of
///   32. This contiguous subset always starts with the zeroth lane.
/// Partial Warp Reduce (argument value 2):
///   This algorithm gathers data from any number of lanes at any position.
/// All reduced values are stored in the lowest possible lane. The set
/// of problems every algorithm addresses is a superset of those
/// addressable by algorithms with a lower version number. Overhead
/// increases as algorithm version increases.
///
/// Terminology
/// Reduce element:
///   Reduce element refers to the individual data field with primitive
///   data types to be combined and reduced across threads.
/// Reduce list:
///   Reduce list refers to a collection of local, thread-private
///   reduce elements.
/// Remote Reduce list:
///   Remote Reduce list refers to a collection of remote (relative to
///   the current thread) reduce elements.
///
/// We distinguish between three states of threads that are important to
/// the implementation of this function.
/// Alive threads:
///   Threads in a warp executing the SIMT instruction, as distinguished from
///   threads that are inactive due to divergent control flow.
/// Active threads:
///   The minimal set of threads that has to be alive upon entry to this
///   function. The computation is correct iff active threads are alive.
///   Some threads are alive but they are not active because they do not
///   contribute to the computation in any useful manner. Turning them off
///   may introduce control flow overheads without any tangible benefits.
/// Effective threads:
///   In order to comply with the argument requirements of the shuffle
///   function, we must keep all lanes holding data alive. But at most
///   half of them perform value aggregation; we refer to this half of
///   threads as effective. The other half is simply handing off their
///   data.
///
/// Procedure
/// Value shuffle:
///   In this step active threads transfer data from higher lane positions
///   in the warp to lower lane positions, creating Remote Reduce list.
/// Value aggregation:
///   In this step, effective threads combine their thread local Reduce list
///   with Remote Reduce list and store the result in the thread local
///   Reduce list.
/// Value copy:
///   In this step, we deal with the assumption made by algorithm 2
///   (i.e. contiguity assumption). When we have an odd number of lanes
///   active, say 2k+1, only k threads will be effective and therefore k
///   new values will be produced. However, the Reduce list owned by the
///   (2k+1)th thread is ignored in the value aggregation. Therefore
///   we copy the Reduce list from the (2k+1)th lane to (k+1)th lane so
///   that the contiguity assumption still holds.
static llvm::Value *
emitShuffleAndReduceFunction(CodeGenModule &CGM,
                             ArrayRef<const Expr *> Privates,
                             QualType ReductionArrayTy, llvm::Value *ReduceFn) {
  auto &C = CGM.getContext();

  // Thread local Reduce list used to host the values of data to be reduced.
  ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
  // Current lane id; could be logical.
  ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
  // Offset of the remote source lane relative to the current lane.
  ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
                                        ImplicitParamDecl::Other);
  // Algorithm version. This is expected to be known at compile time.
  ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&LaneIDArg);
  Args.push_back(&RemoteLaneOffsetArg);
  Args.push_back(&AlgoVerArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
  llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
      AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
  llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
      AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
  llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
      AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());

  // Create a local thread-private variable to host the Reduce list
  // from a remote lane.
  Address RemoteReduceList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");

  // This loop iterates through the list of reduce elements and copies,
  // element by element, from a remote lane in the warp to RemoteReduceList,
  // hosted on the thread's stack.
  emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
                        LocalReduceList, RemoteReduceList,
                        {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
                         /*ScratchpadIndex=*/nullptr,
                         /*ScratchpadWidth=*/nullptr});

  // The actions to be performed on the Remote Reduce list depend
  // on the algorithm version.
  //
  //  if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
  //  LaneId % 2 == 0 && Offset > 0):
  //    do the reduction value aggregation
  //
  // The thread local variable Reduce list is mutated in place to host the
  // reduced data, which is the aggregated value produced from local and
  // remote lanes.
  //
  // Note that AlgoVer is expected to be a constant integer known at compile
  // time.
  // When AlgoVer==0, the first conjunction evaluates to true, making
  // the entire predicate true at compile time.
  // When AlgoVer==1, only the second part of the second conjunction has
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
  // When AlgoVer==2, only the second part of the third conjunction has
  // to be evaluated at runtime; the other conjunctions evaluate to false
  // at compile time.
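  //
  // Specialized per version, the predicate the generated code has to test
  // at runtime is therefore:
  //   AlgoVer==0:  true  (full warp reduce; every lane aggregates)
  //   AlgoVer==1:  LaneId < Offset
  //   AlgoVer==2:  LaneId % 2 == 0 && Offset > 0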
  auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));

  auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondAlgo1 = Bld.CreateAnd(
      Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));

  auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
  auto CondAlgo2 = Bld.CreateAnd(
      Algo2,
      Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
                       Bld.getInt16(0)));
  CondAlgo2 = Bld.CreateAnd(
      CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));

  auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
  CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);

  llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);

  CGF.EmitBlock(ThenBB);
  // reduce_function(LocalReduceList, RemoteReduceList)
  llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      LocalReduceList.getPointer(), CGF.VoidPtrTy);
  llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
      RemoteReduceList.getPointer(), CGF.VoidPtrTy);
  CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(ElseBB);
  Bld.CreateBr(MergeBB);

  CGF.EmitBlock(MergeBB);

  // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
  // Reduce list.
  Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
  auto CondCopy = Bld.CreateAnd(
      Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));

  llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
  llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
  llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
  Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);

  CGF.EmitBlock(CpyThenBB);
  emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
                        RemoteReduceList, LocalReduceList);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyElseBB);
  Bld.CreateBr(CpyMergeBB);

  CGF.EmitBlock(CpyMergeBB);

  CGF.FinishFunction();
  return Fn;
}
1922
1923///
1924/// Design of OpenMP reductions on the GPU
1925///
1926/// Consider a typical OpenMP program with one or more reduction
1927/// clauses:
1928///
1929/// float foo;
1930/// double bar;
1931/// #pragma omp target teams distribute parallel for \
1932/// reduction(+:foo) reduction(*:bar)
1933/// for (int i = 0; i < N; i++) {
1934/// foo += A[i]; bar *= B[i];
1935/// }
1936///
1937/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1938/// all teams. In our OpenMP implementation on the NVPTX device an
1939/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1940/// within a team are mapped to CUDA threads within a threadblock.
1941/// Our goal is to efficiently aggregate values across all OpenMP
1942/// threads such that:
1943///
1944/// - the compiler and runtime are logically concise, and
1945/// - the reduction is performed efficiently in a hierarchical
1946/// manner as follows: within OpenMP threads in the same warp,
1947/// across warps in a threadblock, and finally across teams on
1948/// the NVPTX device.
1949///
1950/// Introduction to Decoupling
1951///
1952/// We would like to decouple the compiler and the runtime so that the
1953/// latter is ignorant of the reduction variables (number, data types)
1954/// and the reduction operators. This allows a simpler interface
1955/// and implementation while still attaining good performance.
1956///
1957/// Pseudocode for the aforementioned OpenMP program generated by the
1958/// compiler is as follows:
1959///
1960/// 1. Create private copies of reduction variables on each OpenMP
1961/// thread: 'foo_private', 'bar_private'
1962/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1963/// to it and writes the result in 'foo_private' and 'bar_private'
1964/// respectively.
1965/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1966/// and store the result on the team master:
1967///
1968/// __kmpc_nvptx_parallel_reduce_nowait(...,
1969/// reduceData, shuffleReduceFn, interWarpCpyFn)
1970///
1971/// where:
1972/// struct ReduceData {
1973/// double *foo;
1974/// double *bar;
1975/// } reduceData
1976/// reduceData.foo = &foo_private
1977/// reduceData.bar = &bar_private
1978///
1979/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1980/// auxiliary functions generated by the compiler that operate on
1981/// variables of type 'ReduceData'. They aid the runtime perform
1982/// algorithmic steps in a data agnostic manner.
1983///
1984/// 'shuffleReduceFn' is a pointer to a function that reduces data
1985/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1986/// same warp. It takes the following arguments as input:
1987///
1988/// a. variable of type 'ReduceData' on the calling lane,
1989/// b. its lane_id,
1990/// c. an offset relative to the current lane_id to generate a
1991/// remote_lane_id. The remote lane contains the second
1992/// variable of type 'ReduceData' that is to be reduced.
1993/// d. an algorithm version parameter determining which reduction
1994/// algorithm to use.
1995///
1996/// 'shuffleReduceFn' retrieves data from the remote lane using
1997/// efficient GPU shuffle intrinsics and reduces, using the
1998/// algorithm specified by the 4th parameter, the two operands
1999/// element-wise. The result is written to the first operand.
2000///
2001/// Different reduction algorithms are implemented in different
2002/// runtime functions, all calling 'shuffleReduceFn' to perform
2003/// the essential reduction step. Therefore, based on the 4th
2004/// parameter, this function behaves slightly differently to
2005/// cooperate with the runtime to ensure correctness under
2006/// different circumstances.
2007///
2008/// 'InterWarpCpyFn' is a pointer to a function that transfers
2009/// reduced variables across warps. It tunnels, through CUDA
2010/// shared memory, the thread-private data of type 'ReduceData'
2011/// from lane 0 of each warp to a lane in the first warp.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002012/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2013/// The last team writes the global reduced value to memory.
2014///
2015/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2016/// reduceData, shuffleReduceFn, interWarpCpyFn,
2017/// scratchpadCopyFn, loadAndReduceFn)
2018///
2019/// 'scratchpadCopyFn' is a helper that stores reduced
2020/// data from the team master to a scratchpad array in
2021/// global memory.
2022///
2023/// 'loadAndReduceFn' is a helper that loads data from
2024/// the scratchpad array and reduces it with the input
2025/// operand.
2026///
2027/// These compiler generated functions hide address
2028/// calculation and alignment information from the runtime.
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002029/// 5. if ret == 1:
2030/// The team master of the last team stores the reduced
2031/// result to the globals in memory.
2032/// foo += reduceData.foo; bar *= reduceData.bar
2033///
2034///
2035/// Warp Reduction Algorithms
2036///
2037/// On the warp level, we have three algorithms implemented in the
2038/// OpenMP runtime depending on the number of active lanes:
2039///
2040/// Full Warp Reduction
2041///
2042/// The reduce algorithm within a warp where all lanes are active
2043/// is implemented in the runtime as follows:
2044///
2045/// full_warp_reduce(void *reduce_data,
2046/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2047/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2048/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2049/// }
2050///
2051/// The algorithm completes in log(2, WARPSIZE) steps.
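///
/// For WARPSIZE == 32, for instance, the loop visits the offsets
/// 16, 8, 4, 2, 1, so the warp is fully reduced in five shuffle steps.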
///
/// 'ShuffleReduceFn' is used here with lane_id set to 0 because lane_id is
/// not used in this version; we thereby save instructions by not retrieving
/// lane_id from the corresponding special registers. The 4th parameter,
/// which represents the version of the algorithm being used, is set to 0 to
/// signify full warp reduction.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// #reduce_elem refers to an element in the local lane's data structure
/// #remote_elem is retrieved from a remote lane
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
///
/// Contiguous Partial Warp Reduction
///
/// This reduce algorithm is used within a warp where only the first
/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
/// number of OpenMP threads in a parallel region is not a multiple of
/// WARPSIZE. The algorithm is implemented in the runtime as follows:
///
/// void
/// contiguous_partial_reduce(void *reduce_data,
///                           kmp_ShuffleReductFctPtr ShuffleReduceFn,
///                           int size, int lane_id) {
///   int curr_size;
///   int offset;
///   curr_size = size;
///   offset = curr_size/2;
///   while (offset > 0) {
///     ShuffleReduceFn(reduce_data, lane_id, offset, 1);
///     curr_size = (curr_size+1)/2;
///     offset = curr_size/2;
///   }
/// }
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id < offset)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
/// This algorithm assumes that the data to be reduced are located in a
/// contiguous subset of lanes starting from the first. When there is
/// an odd number of active lanes, the data in the last lane is not
/// aggregated with any other lane's data but is instead copied over.
///
/// Dispersed Partial Warp Reduction
///
/// This algorithm is used within a warp when any discontiguous subset of
/// lanes are active. It is used to implement the reduction operation
/// across lanes in an OpenMP simd region or in a nested parallel region.
///
/// void
/// dispersed_partial_reduce(void *reduce_data,
///                          kmp_ShuffleReductFctPtr ShuffleReduceFn) {
///   int size, remote_id;
///   int logical_lane_id = number_of_active_lanes_before_me() * 2;
///   do {
///     remote_id = next_active_lane_id_right_after_me();
///     # the above function returns 0 if no active lane
///     # is present right after the current lane.
///     size = number_of_active_lanes_in_this_warp();
///     logical_lane_id /= 2;
///     ShuffleReduceFn(reduce_data, logical_lane_id,
///                     remote_id-1-threadIdx.x, 2);
///   } while (logical_lane_id % 2 == 0 && size > 1);
/// }
///
/// There is no assumption made about the initial state of the reduction.
/// Any number of lanes (>=1) could be active at any position. The reduction
/// result is returned in the first active lane.
///
/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
///
/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
/// if (lane_id % 2 == 0 && offset > 0)
///   reduce_elem = reduce_elem REDUCE_OP remote_elem
/// else
///   reduce_elem = remote_elem
///
///
/// Intra-Team Reduction
///
/// This function, as implemented in the runtime call
/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
/// threads in a team. It first reduces within a warp using the
/// aforementioned algorithms. We then proceed to gather all such
/// reduced values at the first warp.
///
/// The runtime makes use of the function 'InterWarpCpyFn', which copies
/// data from each of the "warp master" (zeroth lane of each warp, where
/// warp-reduced data is held) to the zeroth warp. This step reduces (in
/// a mathematical sense) the problem of reduction across warp masters in
/// a block to the problem of warp reduction.
///
///
/// Inter-Team Reduction
///
/// Once a team has reduced its data to a single value, it is stored in
/// a global scratchpad array. Since each team has a distinct slot, this
/// can be done without locking.
///
/// The last team to write to the scratchpad array proceeds to reduce the
/// scratchpad array. One or more workers in the last team use the helper
/// 'loadAndReduceDataFn' to load and reduce values from the array, i.e.,
/// the k'th worker reduces every k'th element.
///
/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
/// reduce across workers and compute a globally reduced value.
///
void CGOpenMPRuntimeNVPTX::emitReduction(
    CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
    ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
    ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
  if (!CGF.HaveInsertPoint())
    return;

  bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
  bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
  // FIXME: Add support for simd reduction.
  assert((TeamsReduction || ParallelReduction) &&
         "Invalid reduction selection in emitReduction.");

  auto &C = CGM.getContext();

  // 1. Build a list of reduction variables.
  // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
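  // For the foo/bar example in the design note above, this yields
  // (sketch only): RedList[0] = &foo_private; RedList[1] = &bar_private;
  // with an extra slot holding the size after any variably-sized element.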
  auto Size = RHSExprs.size();
  for (auto *E : Privates) {
    if (E->getType()->isVariablyModifiedType())
      // Reserve place for array size.
      ++Size;
  }
  llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
  QualType ReductionArrayTy =
      C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
                             /*IndexTypeQuals=*/0);
  Address ReductionList =
      CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
  auto IPriv = Privates.begin();
  unsigned Idx = 0;
  for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
    Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                                   CGF.getPointerSize());
    CGF.Builder.CreateStore(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
            CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
        Elem);
    if ((*IPriv)->getType()->isVariablyModifiedType()) {
      // Store array size.
      ++Idx;
      Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
                                             CGF.getPointerSize());
      llvm::Value *Size = CGF.Builder.CreateIntCast(
          CGF.getVLASize(
                 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
              .first,
          CGF.SizeTy, /*isSigned=*/false);
      CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
                              Elem);
    }
  }

  // 2. Emit reduce_func().
  auto *ReductionFn = emitReductionFunction(
      CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
      LHSExprs, RHSExprs, ReductionOps);

  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
  //    RedList, shuffle_reduce_func, interwarp_copy_func);
  auto *ThreadId = getThreadID(CGF, Loc);
  auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
  auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      ReductionList.getPointer(), CGF.VoidPtrTy);

  auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
      CGM, Privates, ReductionArrayTy, ReductionFn);
  auto *InterWarpCopyFn =
      emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);

  llvm::Value *Res = nullptr;
  if (ParallelReduction) {
    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn};

    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
        Args);
  }

  if (TeamsReduction) {
    auto *ScratchPadCopyFn =
        emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
    auto *LoadAndReduceFn = emitReduceScratchpadFunction(
        CGM, Privates, ReductionArrayTy, ReductionFn);

    llvm::Value *Args[] = {ThreadId,
                           CGF.Builder.getInt32(RHSExprs.size()),
                           ReductionArrayTySize,
                           RL,
                           ShuffleAndReduceFn,
                           InterWarpCopyFn,
                           ScratchPadCopyFn,
                           LoadAndReduceFn};
    Res = CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
        Args);
  }

  // 4. Build switch(res).
  auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
  auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);

  // 5. Build case 1: where we have reduced values in the master
  //    thread in each team.
  //    __kmpc_end_reduce{_nowait}(<gtid>);
  //    break;
  auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
  SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
  CGF.EmitBlock(Case1BB);

  // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
  llvm::Value *EndArgs[] = {ThreadId};
  auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
                    this](CodeGenFunction &CGF, PrePostActionTy &Action) {
    auto IPriv = Privates.begin();
    auto ILHS = LHSExprs.begin();
    auto IRHS = RHSExprs.begin();
    for (auto *E : ReductionOps) {
      emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
                                  cast<DeclRefExpr>(*IRHS));
      ++IPriv;
      ++ILHS;
      ++IRHS;
    }
  };
  RegionCodeGenTy RCG(CodeGen);
  NVPTXActionTy Action(
      nullptr, llvm::None,
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
      EndArgs);
  RCG.setAction(Action);
  RCG(CGF);
  CGF.EmitBranch(DefaultBB);
  CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
}

const VarDecl *
CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
                                         const VarDecl *NativeParam) const {
  if (!NativeParam->getType()->isReferenceType())
    return NativeParam;
  QualType ArgType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(ArgType);
  QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
    if (Attr->getCaptureKind() == OMPC_map) {
      PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
                                                        LangAS::opencl_global);
    }
  }
  ArgType = CGM.getContext().getPointerType(PointeeTy);
  QC.addRestrict();
  enum { NVPTX_local_addr = 5 };
  QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
  ArgType = QC.apply(CGM.getContext(), ArgType);
  if (isa<ImplicitParamDecl>(NativeParam)) {
    return ImplicitParamDecl::Create(
        CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
        NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
  }
  return ParmVarDecl::Create(
      CGM.getContext(),
      const_cast<DeclContext *>(NativeParam->getDeclContext()),
      NativeParam->getLocStart(), NativeParam->getLocation(),
      NativeParam->getIdentifier(), ArgType,
      /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
}

Address
CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
                                          const VarDecl *NativeParam,
                                          const VarDecl *TargetParam) const {
  assert(NativeParam != TargetParam &&
         NativeParam->getType()->isReferenceType() &&
         "Native arg must not be the same as target arg.");
  Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
  QualType NativeParamType = NativeParam->getType();
  QualifierCollector QC;
  const Type *NonQualTy = QC.strip(NativeParamType);
  QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
  unsigned NativePointeeAddrSpace =
      CGF.getContext().getTargetAddressSpace(NativePointeeTy);
  QualType TargetTy = TargetParam->getType();
  llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
      LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
  // First cast to generic.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      /*AddrSpace=*/0));
  // Cast from generic to native address space.
  TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
      TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
                      NativePointeeAddrSpace));
  Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
  CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
                        NativeParamType);
  return NativeParamAddr;
}

void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
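    // Once we run past the declared parameters of a variadic outlined
    // function, the remaining arguments are forwarded unchanged.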
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
                       /*AddrSpace=*/0));
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a function that wraps the outlined parallel region
/// and controls the arguments which are passed to this function.
/// The wrapper ensures that the outlined function is called
/// with the correct arguments when data is shared.
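///
/// A sketch of the wrapper's shape, as arranged by the code below
/// (parameter names are illustrative only):
///
///   void <outlined>_wrapper(uint16_t parallel_level, uint32_t tid,
///                           void **shared_args);
///
/// Each captured variable is loaded from shared_args and forwarded to the
/// outlined parallel function.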
llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *cast<CapturedStmt>(D.getAssociatedStmt());

  // Create a function that takes as argument the source thread.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
  QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
  ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other);
  ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy,
                                   ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);
  WrapperArgs.emplace_back(&SharedArgsList);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs);

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Get the array of arguments.
  SmallVector<llvm::Value *, 8> Args;

  // TODO: support SIMD and pass actual values.
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Load the start of the array.
  auto SharedArgs =
      CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList),
                            VoidPtrPtrQTy->castAs<PointerType>());

  // For each captured variable:
  for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
    // Name of the captured variable.
    StringRef Name;
    if (CI->capturesThis())
      Name = "this";
    else
      Name = CI->getCapturedVar()->getName();

    // We retrieve the CLANG type of the argument. We use it to create
    // an alloca which will give us the LLVM type.
    QualType ElemTy = CurField->getType();
    // If this is a capture by copy the element type has to be the pointer to
    // the data.
    if (CI->capturesVariableByCopy())
      ElemTy = Ctx.getPointerType(ElemTy);

    // Get the shared address of the captured variable.
    Address ArgAddress = Bld.CreateConstInBoundsGEP(
        SharedArgs, I, CGF.getPointerSize());
    Address TypedArgAddress = Bld.CreateBitCast(
        ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy)));
    llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress,
        /*Volatile=*/false, Int32PtrQTy, SourceLocation());
    Args.emplace_back(Arg);
  }

  emitCall(CGF, OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}