//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "CodeGenFunction.h"
#include "clang/AST/DeclOpenMP.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function, void ***args, kmp_int32 nArgs, int16_t
  /// IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function, void
  /// ***args, int16_t IsOMPRuntimeInitialized);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
  /// int32_t num_vars, size_t reduce_size, void *reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhs, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
  /// void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
  /// int32_t index, int32_t width),
  /// void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad, int32_t
  /// index, int32_t width, int32_t reduce))
  OMPRTL_NVPTX__kmpc_teams_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt).
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches.
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from CUDA registers;
/// however, providing compile-time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,

  /// Global memory alignment for performance.
  GlobalMemoryAlignment = 256,
};
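
// Worked example (illustrative): for thread id 77,
//   warp id = 77 >> LaneIDBits = 2,  lane id = 77 & LaneIDMask = 13;
// the getNVPTXWarpID/getNVPTXLaneID helpers below emit exactly this shift
// and mask.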

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  return CGF.EmitRuntimeCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  CGF.EmitRuntimeCall(llvm::Intrinsic::getDeclaration(
                          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier),
                      Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}
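
// For illustration: a CTA launched with 128 threads in generic mode yields
// thread_limit = 128 - 32 = 96, i.e. threads 0..95 act as workers and the
// last warp (threads 96..127) is reserved for the master.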

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}
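
// In effect this computes master_tid = (NumThreads - 1) & ~(WarpSize - 1),
// i.e. the highest thread id rounded down to a warp boundary; for
// NumThreads = 128 that is 127 & ~31 = 96, matching the examples above.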

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
  case OMPD_target_parallel_for:
  case OMPD_target_parallel_for_simd:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
  llvm_unreachable("Unsupported directive on NVPTX device.");
}
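
// In source terms the mapping above is, for example:
//   #pragma omp target teams          --> Generic (master/worker scheme)
//   #pragma omp target parallel for   --> Spmd (all threads run the region)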

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();
  WrapperFunctionsMap.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function.
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Setup NVPTX threads for master-worker OpenMP scheme.
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  emitCall(CGF, WST.WorkerFn);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {getThreadLimit(CGF),
                         Bld.getInt16(/*RequiresOMPRuntime=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}
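
// The entry block structure emitted above is, schematically:
//   if (tid < thread_limit)      goto .worker;  // run the worker loop
//   else if (tid == master_tid)  goto .master;  // run the target region
//   else                         goto .exit;    // surplus threads idle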

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {CGF.Builder.getInt16(/*IsOMPRuntimeInitialized=*/1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), Args);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}
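
// Net effect for an Spmd kernel, schematically: every thread executes
//   __kmpc_spmd_kernel_init(thread_limit, ...);
//   <target region body>;
//   __kmpc_spmd_kernel_deinit();
// with no master/worker split.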

// Create a unique global variable to indicate the execution mode of this target
// region. The execution mode is either 'generic', or 'spmd' depending on the
// target directive. This variable is picked up by the offload library to setup
// the device appropriately before kernel launch. If the execution mode is
// 'generic', the runtime reserves one warp for the master, otherwise, all
// warps participate in parallel work.
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}
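
// For a kernel entry named "foo" this emits, roughly:
//   @foo_exec_mode = weak constant i8 <Mode>
// where <Mode> is the numeric value of the ExecutionMode enumerator.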

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  ASTContext &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //
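  // In pseudocode, each worker runs (a sketch of the loop emitted below):
  //
  //   for (;;) {
  //     barrier();                                  // wait for the master
  //     active = __kmpc_kernel_parallel(&work_fn, &args, ...);
  //     if (!work_fn) break;                        // termination signal
  //     if (active) work_fn(args);                  // matched outlined region
  //     barrier();                                  // rejoin the master
  //   }
  //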

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work.
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  // Set up shared arguments.
  Address SharedArgs =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrPtrTy, "shared_args");
  // TODO: Optimize runtime initialization and pass in correct value.
  llvm::Value *Args[] = {WorkFn.getPointer(), SharedArgs.getPointer(),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1)};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (WorkFn == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Current context.
  ASTContext &Ctx = CGF.getContext();

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function via shared wrapper. The shared
    // wrapper takes exactly three arguments:
    //   - the parallelism level;
    //   - the master thread ID;
    //   - the list of references to shared arguments.
    //
    // TODO: Assert that the function is a wrapper function.
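    // In effect the wrapper is invoked as (a sketch; types match the call
    // emitted below):
    //   wrapper(int16_t parallel_level, int32_t master_tid,
    //           void **shared_args);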
    Address Capture = CGF.EmitLoadOfPointer(SharedArgs,
       Ctx.getPointerType(
          Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>());
    emitCall(CGF, W, {Bld.getInt16(/*ParallelLevel=*/0),
      getMasterThreadID(CGF), Capture.getPointer()});

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit, int16_t
    // RequiresOMPRuntime);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit(int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // int16_t RequiresOMPRuntime, int16_t RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function, void ***args, kmp_int32 nArgs, int16_t
    /// IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int32Ty,
                                CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function, void
    /// ***args, int16_t IsOMPRuntimeInitialized);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy,
                                CGM.Int8PtrPtrTy->getPointerTo(0), CGM.Int16Ty};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_teams_reduce_nowait: {
    // Build int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid,
    // int32_t num_vars, size_t reduce_size, void *reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t shortCircuit),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num),
    // void (*kmp_CopyToScratchpadFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width),
    // void (*kmp_LoadReduceFctPtr)(void *reduce_data, void * scratchpad,
    // int32_t index, int32_t width, int32_t reduce))
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *CopyToScratchpadTypeParams[] = {CGM.VoidPtrTy, CGM.VoidPtrTy,
                                                CGM.Int32Ty, CGM.Int32Ty};
    auto *CopyToScratchpadFnTy =
        llvm::FunctionType::get(CGM.VoidTy, CopyToScratchpadTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *LoadReduceTypeParams[] = {
        CGM.VoidPtrTy, CGM.VoidPtrTy, CGM.Int32Ty, CGM.Int32Ty, CGM.Int32Ty};
    auto *LoadReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, LoadReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo(),
                                CopyToScratchpadFnTy->getPointerTo(),
                                LoadReduceFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_teams_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build void __kmpc_nvptx_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node.
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations.
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  auto *OutlinedFun = cast<llvm::Function>(
      CGOpenMPRuntime::emitParallelOutlinedFunction(
          D, ThreadIDVar, InnermostKind, CodeGen));
  if (!isInSpmdExecutionMode()) {
    llvm::Function *WrapperFun =
        createDataSharingWrapper(OutlinedFun, D);
    WrapperFunctionsMap[OutlinedFun] = WrapperFun;
  }

  return OutlinedFun;
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->removeFnAttr(llvm::Attribute::OptimizeNone);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);
  llvm::Function *WFn = WrapperFunctionsMap[Fn];
  assert(WFn && "Wrapper function does not exist!");

  // Force inline this outlined function at its call site.
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  auto &&L0ParallelGen = [this, WFn, &CapturedVars](CodeGenFunction &CGF,
                                                    PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    llvm::Value *ID = Bld.CreateBitOrPointerCast(WFn, CGM.Int8PtrTy);

    if (!CapturedVars.empty()) {
      // There's something to share; add the attribute.
      CGF.CurFn->addFnAttr("has-nvptx-shared-depot");
      // Prepare for parallel region. Indicate the outlined function.
      Address SharedArgs =
          CGF.CreateDefaultAlignTempAlloca(CGF.VoidPtrPtrTy,
                                           "shared_args");
      llvm::Value *SharedArgsPtr = SharedArgs.getPointer();
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {ID, SharedArgsPtr,
                             Bld.getInt32(CapturedVars.size()),
                             /*RequiresOMPRuntime=*/Bld.getInt16(1)};

      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);

      unsigned Idx = 0;
      ASTContext &Ctx = CGF.getContext();
      for (llvm::Value *V : CapturedVars) {
        Address Dst = Bld.CreateConstInBoundsGEP(
            CGF.EmitLoadOfPointer(SharedArgs,
                Ctx.getPointerType(
                    Ctx.getPointerType(Ctx.VoidPtrTy)).castAs<PointerType>()),
            Idx, CGF.getPointerSize());
        llvm::Value *PtrV = Bld.CreateBitCast(V, CGF.VoidPtrTy);
        CGF.EmitStoreOfScalar(PtrV, Dst, /*Volatile=*/false,
                              Ctx.getPointerType(Ctx.VoidPtrTy));
        Idx++;
      }
    } else {
      // TODO: Optimize runtime initialization and pass in correct value.
      llvm::Value *Args[] = {
          ID, llvm::ConstantPointerNull::get(CGF.VoidPtrPtrTy->getPointerTo(0)),
          /*nArgs=*/Bld.getInt32(0), /*RequiresOMPRuntime=*/Bld.getInt16(1)};
      CGF.EmitRuntimeCall(
          createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
          Args);
    }

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.emplace_back(WFn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args, Loc](CodeGenFunction &CGF,
                                                        PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars, Loc](CodeGenFunction &CGF,
                                                    PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      emitOutlinedFunctionCall(CGF, Loc, Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}
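
// For illustration: a 4-byte element (e.g. float) is extended/bitcast to i32
// and moved with __kmpc_shuffle_int32, while an 8-byte element takes the
// __kmpc_shuffle_int64 path; the result is then truncated or bitcast back to
// the element's memory type.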

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
  // ThreadToScratchpad: Copy a team-reduced array to the scratchpad.
  ThreadToScratchpad,
  // ScratchpadToThread: Copy from a scratchpad array in global memory
  // containing team-reduced data to a thread's stack.
  ScratchpadToThread,
};
} // namespace

struct CopyOptionsTy {
  llvm::Value *RemoteLaneOffset;
  llvm::Value *ScratchpadIndex;
  llvm::Value *ScratchpadWidth;
};

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(
    CopyAction Action, CodeGenFunction &CGF, QualType ReductionArrayTy,
    ArrayRef<const Expr *> Privates, Address SrcBase, Address DestBase,
    CopyOptionsTy CopyOptions = {nullptr, nullptr, nullptr}) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  auto *RemoteLaneOffset = CopyOptions.RemoteLaneOffset;
  auto *ScratchpadIndex = CopyOptions.ScratchpadIndex;
  auto *ScratchpadWidth = CopyOptions.ScratchpadWidth;

  // Iterate, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  unsigned Size = Privates.size();
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;
    // Increment the src or dest pointer to the scratchpad, for each
    // new element.
    bool IncrScratchpadSrc = false;
    bool IncrScratchpadDest = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
      break;
    }
    case ThreadToScratchpad: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element:
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(DestBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      Address ScratchpadPtr =
          Address(ScratchPadElemAbsolutePtrVal,
                  C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          ScratchpadPtr, CGF.ConvertTypeForMem(Private->getType()));
      IncrScratchpadDest = true;
      break;
    }
    case ScratchpadToThread: {
      // Step 1.1: Get the address for the src element in the scratchpad.
      // address = base + index * ElementSizeInChars.
      unsigned ElementSizeInChars =
          C.getTypeSizeInChars(Private->getType()).getQuantity();
      auto *CurrentOffset =
          Bld.CreateMul(llvm::ConstantInt::get(CGM.SizeTy, ElementSizeInChars),
                        ScratchpadIndex);
      auto *ScratchPadElemAbsolutePtrVal =
          Bld.CreateAdd(SrcBase.getPointer(), CurrentOffset);
      ScratchPadElemAbsolutePtrVal =
          Bld.CreateIntToPtr(ScratchPadElemAbsolutePtrVal, CGF.VoidPtrTy);
      SrcElementAddr = Address(ScratchPadElemAbsolutePtrVal,
                               C.getTypeAlignInChars(Private->getType()));
      IncrScratchpadSrc = true;

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
1240 UpdateDestListPtr = true;
1241 break;
1242 }
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001243 }
1244
1245    // Regardless of the source and destination of the copy, emit the load
1246    // of the src element, as it is required in all copy directions.
1247 SrcElementAddr = Bld.CreateElementBitCast(
1248 SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
1249 llvm::Value *Elem =
1250 CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
1251 Private->getType(), SourceLocation());
1252
1253 // Now that all active lanes have read the element in the
1254 // Reduce list, shuffle over the value from the remote lane.
1255 if (ShuffleInElement) {
1256 Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
1257 RemoteLaneOffset);
1258 }
1259
1260 // Store the source element value to the dest element address.
1261 CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
1262 Private->getType());
1263
1264 // Step 3.1: Modify reference in dest Reduce list as needed.
1265 // Modifying the reference in Reduce list to point to the newly
1266 // created element. The element is live in the current function
1267 // scope and that of functions it invokes (i.e., reduce_function).
1268 // RemoteReduceData[i] = (void*)&RemoteElem
1269 if (UpdateDestListPtr) {
1270 CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
1271 DestElementAddr.getPointer(), CGF.VoidPtrTy),
1272 DestElementPtrAddr, /*Volatile=*/false,
1273 C.VoidPtrTy);
1274 }
1275
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001276 // Step 4.1: Increment SrcBase/DestBase so that it points to the starting
1277 // address of the next element in scratchpad memory, unless we're currently
1278 // processing the last one. Memory alignment is also taken care of here.
1279 if ((IncrScratchpadDest || IncrScratchpadSrc) && (Idx + 1 < Size)) {
1280 llvm::Value *ScratchpadBasePtr =
1281 IncrScratchpadDest ? DestBase.getPointer() : SrcBase.getPointer();
1282 unsigned ElementSizeInChars =
1283 C.getTypeSizeInChars(Private->getType()).getQuantity();
1284 ScratchpadBasePtr = Bld.CreateAdd(
1285 ScratchpadBasePtr,
1286 Bld.CreateMul(ScratchpadWidth, llvm::ConstantInt::get(
1287 CGM.SizeTy, ElementSizeInChars)));
1288
1289 // Take care of global memory alignment for performance
1290 ScratchpadBasePtr = Bld.CreateSub(ScratchpadBasePtr,
1291 llvm::ConstantInt::get(CGM.SizeTy, 1));
1292 ScratchpadBasePtr = Bld.CreateSDiv(
1293 ScratchpadBasePtr,
1294 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
1295 ScratchpadBasePtr = Bld.CreateAdd(ScratchpadBasePtr,
1296 llvm::ConstantInt::get(CGM.SizeTy, 1));
1297 ScratchpadBasePtr = Bld.CreateMul(
1298 ScratchpadBasePtr,
1299 llvm::ConstantInt::get(CGM.SizeTy, GlobalMemoryAlignment));
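      // The sub/div/add/mul sequence above rounds ScratchpadBasePtr up to
      // the next multiple of GlobalMemoryAlignment, i.e.
      //   Ptr = ((Ptr - 1) / GlobalMemoryAlignment + 1) * GlobalMemoryAlignment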
1300
1301 if (IncrScratchpadDest)
1302 DestBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1303 else /* IncrScratchpadSrc = true */
1304 SrcBase = Address(ScratchpadBasePtr, CGF.getPointerAlign());
1305 }
1306
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001307 Idx++;
1308 }
1309}
1310
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001311/// This function emits a helper that loads data from the scratchpad array
1312/// and (optionally) reduces it with the input operand.
1313///
1314/// load_and_reduce(local, scratchpad, index, width, should_reduce)
1315/// reduce_data remote;
1316/// for elem in remote:
1317/// remote.elem = Scratchpad[elem_id][index]
1318/// if (should_reduce)
1319/// local = local @ remote
1320/// else
1321/// local = remote
Benjamin Kramer674d5792017-05-26 20:08:24 +00001322static llvm::Value *
1323emitReduceScratchpadFunction(CodeGenModule &CGM,
1324 ArrayRef<const Expr *> Privates,
1325 QualType ReductionArrayTy, llvm::Value *ReduceFn) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001326 auto &C = CGM.getContext();
1327 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1328
1329 // Destination of the copy.
Alexey Bataev56223232017-06-09 13:40:18 +00001330 ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001331 // Base address of the scratchpad array, with each element storing a
1332 // Reduce list per team.
Alexey Bataev56223232017-06-09 13:40:18 +00001333 ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001334 // A source index into the scratchpad array.
Alexey Bataev56223232017-06-09 13:40:18 +00001335 ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001336 // Row width of an element in the scratchpad array, typically
1337 // the number of teams.
Alexey Bataev56223232017-06-09 13:40:18 +00001338 ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001339  // If should_reduce == 1, then it's load AND reduce;
1340 // If should_reduce == 0 (or otherwise), then it only loads (+ copy).
1341 // The latter case is used for initialization.
Alexey Bataev56223232017-06-09 13:40:18 +00001342 ImplicitParamDecl ShouldReduceArg(C, Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001343
1344 FunctionArgList Args;
1345 Args.push_back(&ReduceListArg);
1346 Args.push_back(&ScratchPadArg);
1347 Args.push_back(&IndexArg);
1348 Args.push_back(&WidthArg);
1349 Args.push_back(&ShouldReduceArg);
1350
1351 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1352 auto *Fn = llvm::Function::Create(
1353 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1354 "_omp_reduction_load_and_reduce", &CGM.getModule());
1355 CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
1356 CodeGenFunction CGF(CGM);
1357 // We don't need debug information in this function as nothing here refers to
1358 // user code.
1359 CGF.disableDebugInfo();
1360 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1361
1362 auto &Bld = CGF.Builder;
1363
1364 // Get local Reduce list pointer.
1365 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1366 Address ReduceListAddr(
1367 Bld.CreatePointerBitCastOrAddrSpaceCast(
1368 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1369 C.VoidPtrTy, SourceLocation()),
1370 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1371 CGF.getPointerAlign());
1372
1373 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1374 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
1375 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1376
1377 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
1378 llvm::Value *IndexVal =
1379 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
1380 Int32Ty, SourceLocation()),
1381 CGM.SizeTy, /*isSigned=*/true);
1382
1383 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
1384 llvm::Value *WidthVal =
1385 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
1386 Int32Ty, SourceLocation()),
1387 CGM.SizeTy, /*isSigned=*/true);
1388
1389 Address AddrShouldReduceArg = CGF.GetAddrOfLocalVar(&ShouldReduceArg);
1390 llvm::Value *ShouldReduceVal = CGF.EmitLoadOfScalar(
1391 AddrShouldReduceArg, /*Volatile=*/false, Int32Ty, SourceLocation());
1392
1393  // The absolute pointer to the base address of the next element to copy.
1394 llvm::Value *CumulativeElemBasePtr =
1395 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1396 Address SrcDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1397
1398 // Create a Remote Reduce list to store the elements read from the
1399 // scratchpad array.
1400 Address RemoteReduceList =
1401 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_red_list");
1402
1403 // Assemble remote Reduce list from scratchpad array.
1404 emitReductionListCopy(ScratchpadToThread, CGF, ReductionArrayTy, Privates,
1405 SrcDataAddr, RemoteReduceList,
1406 {/*RemoteLaneOffset=*/nullptr,
1407 /*ScratchpadIndex=*/IndexVal,
1408 /*ScratchpadWidth=*/WidthVal});
1409
1410 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1411 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1412 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1413
1414 auto CondReduce = Bld.CreateICmpEQ(ShouldReduceVal, Bld.getInt32(1));
1415 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1416
1417 CGF.EmitBlock(ThenBB);
1418 // We should reduce with the local Reduce list.
1419 // reduce_function(LocalReduceList, RemoteReduceList)
1420 llvm::Value *LocalDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1421 ReduceListAddr.getPointer(), CGF.VoidPtrTy);
1422 llvm::Value *RemoteDataPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1423 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1424 CGF.EmitCallOrInvoke(ReduceFn, {LocalDataPtr, RemoteDataPtr});
1425 Bld.CreateBr(MergeBB);
1426
1427 CGF.EmitBlock(ElseBB);
1428 // No reduction; just copy:
1429 // Local Reduce list = Remote Reduce list.
1430 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1431 RemoteReduceList, ReduceListAddr);
1432 Bld.CreateBr(MergeBB);
1433
1434 CGF.EmitBlock(MergeBB);
1435
1436 CGF.FinishFunction();
1437 return Fn;
1438}
1439
1440/// This function emits a helper that stores reduced data from the team
1441/// master to a scratchpad array in global memory.
1442///
1443/// for elem in Reduce List:
1444/// scratchpad[elem_id][index] = elem
1445///
Benjamin Kramer674d5792017-05-26 20:08:24 +00001446static llvm::Value *emitCopyToScratchpad(CodeGenModule &CGM,
1447 ArrayRef<const Expr *> Privates,
1448 QualType ReductionArrayTy) {
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001449
1450 auto &C = CGM.getContext();
1451 auto Int32Ty = C.getIntTypeForBitwidth(32, /* Signed */ true);
1452
1453 // Source of the copy.
Alexey Bataev56223232017-06-09 13:40:18 +00001454 ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001455 // Base address of the scratchpad array, with each element storing a
1456 // Reduce list per team.
Alexey Bataev56223232017-06-09 13:40:18 +00001457 ImplicitParamDecl ScratchPadArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001458 // A destination index into the scratchpad array, typically the team
1459 // identifier.
Alexey Bataev56223232017-06-09 13:40:18 +00001460 ImplicitParamDecl IndexArg(C, Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001461 // Row width of an element in the scratchpad array, typically
1462 // the number of teams.
Alexey Bataev56223232017-06-09 13:40:18 +00001463 ImplicitParamDecl WidthArg(C, Int32Ty, ImplicitParamDecl::Other);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001464
1465 FunctionArgList Args;
1466 Args.push_back(&ReduceListArg);
1467 Args.push_back(&ScratchPadArg);
1468 Args.push_back(&IndexArg);
1469 Args.push_back(&WidthArg);
1470
1471 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1472 auto *Fn = llvm::Function::Create(
1473 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1474 "_omp_reduction_copy_to_scratchpad", &CGM.getModule());
1475 CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
1476 CodeGenFunction CGF(CGM);
1477 // We don't need debug information in this function as nothing here refers to
1478 // user code.
1479 CGF.disableDebugInfo();
1480 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1481
1482 auto &Bld = CGF.Builder;
1483
1484 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1485 Address SrcDataAddr(
1486 Bld.CreatePointerBitCastOrAddrSpaceCast(
1487 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1488 C.VoidPtrTy, SourceLocation()),
1489 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1490 CGF.getPointerAlign());
1491
1492 Address AddrScratchPadArg = CGF.GetAddrOfLocalVar(&ScratchPadArg);
1493 llvm::Value *ScratchPadBase = CGF.EmitLoadOfScalar(
1494 AddrScratchPadArg, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1495
1496 Address AddrIndexArg = CGF.GetAddrOfLocalVar(&IndexArg);
1497 llvm::Value *IndexVal =
1498 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrIndexArg, /*Volatile=*/false,
1499 Int32Ty, SourceLocation()),
1500 CGF.SizeTy, /*isSigned=*/true);
1501
1502 Address AddrWidthArg = CGF.GetAddrOfLocalVar(&WidthArg);
1503 llvm::Value *WidthVal =
1504 Bld.CreateIntCast(CGF.EmitLoadOfScalar(AddrWidthArg, /*Volatile=*/false,
1505 Int32Ty, SourceLocation()),
1506 CGF.SizeTy, /*isSigned=*/true);
1507
1508  // The absolute pointer to the base address of the next element to copy.
1509 llvm::Value *CumulativeElemBasePtr =
1510 Bld.CreatePtrToInt(ScratchPadBase, CGM.SizeTy);
1511 Address DestDataAddr(CumulativeElemBasePtr, CGF.getPointerAlign());
1512
1513 emitReductionListCopy(ThreadToScratchpad, CGF, ReductionArrayTy, Privates,
1514 SrcDataAddr, DestDataAddr,
1515 {/*RemoteLaneOffset=*/nullptr,
1516 /*ScratchpadIndex=*/IndexVal,
1517 /*ScratchpadWidth=*/WidthVal});
1518
1519 CGF.FinishFunction();
1520 return Fn;
1521}
1522
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001523/// This function emits a helper that gathers Reduce lists from the first
1524/// lane of every active warp to lanes in the first warp.
1525///
1526/// void inter_warp_copy_func(void* reduce_data, num_warps)
1527/// shared smem[warp_size];
1528/// For all data entries D in reduce_data:
1529/// If (I am the first lane in each warp)
1530/// Copy my local D to smem[warp_id]
1531/// sync
1532/// if (I am the first warp)
1533/// Copy smem[thread_id] to my local D
1534/// sync
1535static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
1536 ArrayRef<const Expr *> Privates,
1537 QualType ReductionArrayTy) {
1538 auto &C = CGM.getContext();
1539 auto &M = CGM.getModule();
1540
1541 // ReduceList: thread local Reduce list.
1542 // At the stage of the computation when this function is called, partially
1543 // aggregated values reside in the first lane of every active warp.
Alexey Bataev56223232017-06-09 13:40:18 +00001544 ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001545 // NumWarps: number of warps active in the parallel region. This could
1546 // be smaller than 32 (max warps in a CTA) for partial block reduction.
Alexey Bataev56223232017-06-09 13:40:18 +00001547 ImplicitParamDecl NumWarpsArg(C,
1548 C.getIntTypeForBitwidth(32, /* Signed */ true),
1549 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001550 FunctionArgList Args;
1551 Args.push_back(&ReduceListArg);
1552 Args.push_back(&NumWarpsArg);
1553
1554 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1555 auto *Fn = llvm::Function::Create(
1556 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1557 "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
1558 CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
1559 CodeGenFunction CGF(CGM);
1560 // We don't need debug information in this function as nothing here refers to
1561 // user code.
1562 CGF.disableDebugInfo();
1563 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1564
1565 auto &Bld = CGF.Builder;
1566
1567 // This array is used as a medium to transfer, one reduce element at a time,
1568 // the data from the first lane of every warp to lanes in the first warp
1569 // in order to perform the final step of a reduction in a parallel region
1570 // (reduction across warps). The array is placed in NVPTX __shared__ memory
1571 // for reduced latency, as well as to have a distinct copy for concurrently
1572 // executing target regions. The array is declared with common linkage so
1573 // as to be shared across compilation units.
1574 const char *TransferMediumName =
1575 "__openmp_nvptx_data_transfer_temporary_storage";
1576 llvm::GlobalVariable *TransferMedium =
1577 M.getGlobalVariable(TransferMediumName);
1578 if (!TransferMedium) {
1579 auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
1580 unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
1581 TransferMedium = new llvm::GlobalVariable(
1582 M, Ty,
1583 /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
1584 llvm::Constant::getNullValue(Ty), TransferMediumName,
1585 /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
1586 SharedAddressSpace);
1587 }
1588
1589 // Get the CUDA thread id of the current OpenMP thread on the GPU.
1590 auto *ThreadID = getNVPTXThreadID(CGF);
1591 // nvptx_lane_id = nvptx_id % warpsize
1592 auto *LaneID = getNVPTXLaneID(CGF);
1593 // nvptx_warp_id = nvptx_id / warpsize
1594 auto *WarpID = getNVPTXWarpID(CGF);
1595
1596 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1597 Address LocalReduceList(
1598 Bld.CreatePointerBitCastOrAddrSpaceCast(
1599 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1600 C.VoidPtrTy, SourceLocation()),
1601 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1602 CGF.getPointerAlign());
1603
1604 unsigned Idx = 0;
1605 for (auto &Private : Privates) {
1606 //
1607 // Warp master copies reduce element to transfer medium in __shared__
1608 // memory.
1609 //
1610 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1611 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1612 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1613
1614 // if (lane_id == 0)
1615 auto IsWarpMaster =
1616 Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
1617 Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
1618 CGF.EmitBlock(ThenBB);
1619
1620 // Reduce element = LocalReduceList[i]
1621 Address ElemPtrPtrAddr =
1622 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
1623 llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
1624 ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1625 // elemptr = (type[i]*)(elemptrptr)
1626 Address ElemPtr =
1627 Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
1628 ElemPtr = Bld.CreateElementBitCast(
1629 ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
1630 // elem = *elemptr
1631 llvm::Value *Elem = CGF.EmitLoadOfScalar(
1632 ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
1633
1634 // Get pointer to location in transfer medium.
1635 // MediumPtr = &medium[warp_id]
1636 llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
1637 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
1638 Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
1639 // Casting to actual data type.
1640 // MediumPtr = (type[i]*)MediumPtrAddr;
1641 MediumPtr = Bld.CreateElementBitCast(
1642 MediumPtr, CGF.ConvertTypeForMem(Private->getType()));
1643
1644    // *MediumPtr = elem
1645 Bld.CreateStore(Elem, MediumPtr);
1646
1647 Bld.CreateBr(MergeBB);
1648
1649 CGF.EmitBlock(ElseBB);
1650 Bld.CreateBr(MergeBB);
1651
1652 CGF.EmitBlock(MergeBB);
1653
1654 Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
1655 llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
1656 AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());
1657
1658 auto *NumActiveThreads = Bld.CreateNSWMul(
1659 NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
1660 // named_barrier_sync(ParallelBarrierID, num_active_threads)
1661 syncParallelThreads(CGF, NumActiveThreads);
1662
1663 //
1664 // Warp 0 copies reduce element from transfer medium.
1665 //
1666 llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
1667 llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
1668 llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");
1669
1670 // Up to 32 threads in warp 0 are active.
1671 auto IsActiveThread =
1672 Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
1673 Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);
1674
1675 CGF.EmitBlock(W0ThenBB);
1676
1677 // SrcMediumPtr = &medium[tid]
1678 llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
1679 TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
1680 Address SrcMediumPtr(SrcMediumPtrVal,
1681 C.getTypeAlignInChars(Private->getType()));
1682 // SrcMediumVal = *SrcMediumPtr;
1683 SrcMediumPtr = Bld.CreateElementBitCast(
1684 SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
1685 llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
1686 SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());
1687
1688 // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
1689 Address TargetElemPtrPtr =
1690 Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
1691 llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
1692 TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
1693 Address TargetElemPtr =
1694 Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
1695 TargetElemPtr = Bld.CreateElementBitCast(
1696 TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));
1697
1698 // *TargetElemPtr = SrcMediumVal;
1699 CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
1700 Private->getType());
1701 Bld.CreateBr(W0MergeBB);
1702
1703 CGF.EmitBlock(W0ElseBB);
1704 Bld.CreateBr(W0MergeBB);
1705
1706 CGF.EmitBlock(W0MergeBB);
1707
1708 // While warp 0 copies values from transfer medium, all other warps must
1709 // wait.
1710 syncParallelThreads(CGF, NumActiveThreads);
1711 Idx++;
1712 }
1713
1714 CGF.FinishFunction();
1715 return Fn;
1716}
1717
1718/// Emit a helper that reduces data across two OpenMP threads (lanes)
1719/// in the same warp. It uses shuffle instructions to copy over data from
1720/// a remote lane's stack. The reduction algorithm performed is specified
1721/// by the fourth parameter.
1722///
1723/// Algorithm Versions.
1724/// Full Warp Reduce (argument value 0):
1725/// This algorithm assumes that all 32 lanes are active and gathers
1726/// data from these 32 lanes, producing a single resultant value.
1727/// Contiguous Partial Warp Reduce (argument value 1):
1728/// This algorithm assumes that only a *contiguous* subset of lanes
1729/// are active. This happens for the last warp in a parallel region
1730/// when the user specified num_threads is not an integer multiple of
1731/// 32. This contiguous subset always starts with the zeroth lane.
1732/// Partial Warp Reduce (argument value 2):
1733/// This algorithm gathers data from any number of lanes at any position.
1734/// All reduced values are stored in the lowest possible lane. The set
1735/// of problems every algorithm addresses is a superset of those
1736/// addressable by algorithms with a lower version number. Overhead
1737/// increases as algorithm version increases.
1738///
1739/// Terminology
1740/// Reduce element:
1741/// Reduce element refers to the individual data field with primitive
1742/// data types to be combined and reduced across threads.
1743/// Reduce list:
1744/// Reduce list refers to a collection of local, thread-private
1745/// reduce elements.
1746/// Remote Reduce list:
1747/// Remote Reduce list refers to a collection of remote (relative to
1748/// the current thread) reduce elements.
1749///
1750/// We distinguish between three states of threads that are important to
1751/// the implementation of this function.
1752/// Alive threads:
1753/// Threads in a warp executing the SIMT instruction, as distinguished from
1754/// threads that are inactive due to divergent control flow.
1755/// Active threads:
1756/// The minimal set of threads that has to be alive upon entry to this
1757/// function. The computation is correct iff active threads are alive.
1758/// Some threads are alive but they are not active because they do not
1759/// contribute to the computation in any useful manner. Turning them off
1760/// may introduce control flow overheads without any tangible benefits.
1761/// Effective threads:
1762/// In order to comply with the argument requirements of the shuffle
1763/// function, we must keep all lanes holding data alive. But at most
1764/// half of them perform value aggregation; we refer to this half of
1765/// threads as effective. The other half is simply handing off their
1766/// data.
1767///
1768/// Procedure
1769/// Value shuffle:
1770/// In this step active threads transfer data from higher lane positions
1771/// in the warp to lower lane positions, creating Remote Reduce list.
1772/// Value aggregation:
1773/// In this step, effective threads combine their thread local Reduce list
1774/// with Remote Reduce list and store the result in the thread local
1775/// Reduce list.
1776/// Value copy:
1777/// In this step, we deal with the assumption made by algorithm 2
1778/// (i.e. contiguity assumption). When we have an odd number of lanes
1779/// active, say 2k+1, only k threads will be effective and therefore k
1780/// new values will be produced. However, the Reduce list owned by the
1781/// (2k+1)th thread is ignored in the value aggregation. Therefore
1782/// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
1783/// that the contiguity assumption still holds.
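///
/// As a compact sketch, the helper generated below behaves as follows
/// (names are illustrative; the predicate matches the one emitted further
/// down in this function):
///
///   shuffle_and_reduce(reduce_list, lane_id, offset, algo_ver):
///     remote_reduce_list = shuffle_down(reduce_list, offset)
///     if (algo_ver == 0) ||
///        (algo_ver == 1 && lane_id < offset) ||
///        (algo_ver == 2 && lane_id % 2 == 0 && offset > 0):
///       reduce_list = reduce_function(reduce_list, remote_reduce_list)
///     if (algo_ver == 1 && lane_id >= offset):
///       reduce_list = remote_reduce_list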
1784static llvm::Value *
1785emitShuffleAndReduceFunction(CodeGenModule &CGM,
1786 ArrayRef<const Expr *> Privates,
1787 QualType ReductionArrayTy, llvm::Value *ReduceFn) {
1788 auto &C = CGM.getContext();
1789
1790 // Thread local Reduce list used to host the values of data to be reduced.
Alexey Bataev56223232017-06-09 13:40:18 +00001791 ImplicitParamDecl ReduceListArg(C, C.VoidPtrTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001792 // Current lane id; could be logical.
Alexey Bataev56223232017-06-09 13:40:18 +00001793 ImplicitParamDecl LaneIDArg(C, C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001794 // Offset of the remote source lane relative to the current lane.
Alexey Bataev56223232017-06-09 13:40:18 +00001795 ImplicitParamDecl RemoteLaneOffsetArg(C, C.ShortTy,
1796 ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001797 // Algorithm version. This is expected to be known at compile time.
Alexey Bataev56223232017-06-09 13:40:18 +00001798 ImplicitParamDecl AlgoVerArg(C, C.ShortTy, ImplicitParamDecl::Other);
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001799 FunctionArgList Args;
1800 Args.push_back(&ReduceListArg);
1801 Args.push_back(&LaneIDArg);
1802 Args.push_back(&RemoteLaneOffsetArg);
1803 Args.push_back(&AlgoVerArg);
1804
1805 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1806 auto *Fn = llvm::Function::Create(
1807 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1808 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
1809 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
1810 CodeGenFunction CGF(CGM);
1811 // We don't need debug information in this function as nothing here refers to
1812 // user code.
1813 CGF.disableDebugInfo();
1814 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1815
1816 auto &Bld = CGF.Builder;
1817
1818 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1819 Address LocalReduceList(
1820 Bld.CreatePointerBitCastOrAddrSpaceCast(
1821 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1822 C.VoidPtrTy, SourceLocation()),
1823 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1824 CGF.getPointerAlign());
1825
1826 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
1827 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
1828 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1829
1830 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
1831 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
1832 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1833
1834 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
1835 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
1836 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1837
1838 // Create a local thread-private variable to host the Reduce list
1839 // from a remote lane.
1840 Address RemoteReduceList =
1841 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
1842
1843 // This loop iterates through the list of reduce elements and copies,
1844 // element by element, from a remote lane in the warp to RemoteReduceList,
1845 // hosted on the thread's stack.
1846 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
1847 LocalReduceList, RemoteReduceList,
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00001848 {/*RemoteLaneOffset=*/RemoteLaneOffsetArgVal,
1849 /*ScratchpadIndex=*/nullptr,
1850 /*ScratchpadWidth=*/nullptr});
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00001851
1852  // The actions to be performed on the Remote Reduce list depend on the
1853  // algorithm version.
1854 //
1855 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
1856 // LaneId % 2 == 0 && Offset > 0):
1857 // do the reduction value aggregation
1858 //
1859 // The thread local variable Reduce list is mutated in place to host the
1860 // reduced data, which is the aggregated value produced from local and
1861 // remote lanes.
1862 //
1863 // Note that AlgoVer is expected to be a constant integer known at compile
1864 // time.
1865  // When AlgoVer==0, the first conjunction evaluates to true, making
1866  // the entire predicate true at compile time.
1867  // When AlgoVer==1, only the second part of the second conjunction is
1868  // evaluated at runtime; the other conjunctions evaluate to false at
1869  // compile time.
1870  // When AlgoVer==2, only the second part of the third conjunction is
1871  // evaluated at runtime; the other conjunctions evaluate to false at
1872  // compile time.
1873 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1874
1875 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1876 auto CondAlgo1 = Bld.CreateAnd(
1877 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1878
1879 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
1880 auto CondAlgo2 = Bld.CreateAnd(
1881 Algo2,
1882 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1883 Bld.getInt16(0)));
1884 CondAlgo2 = Bld.CreateAnd(
1885 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1886
1887 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1888 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1889
1890 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1891 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1892 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1893 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1894
1895 CGF.EmitBlock(ThenBB);
1896 // reduce_function(LocalReduceList, RemoteReduceList)
1897 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1898 LocalReduceList.getPointer(), CGF.VoidPtrTy);
1899 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1900 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1901 CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
1902 Bld.CreateBr(MergeBB);
1903
1904 CGF.EmitBlock(ElseBB);
1905 Bld.CreateBr(MergeBB);
1906
1907 CGF.EmitBlock(MergeBB);
1908
1909 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
1910 // Reduce list.
1911 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1912 auto CondCopy = Bld.CreateAnd(
1913 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1914
1915 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
1916 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
1917 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
1918 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1919
1920 CGF.EmitBlock(CpyThenBB);
1921 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1922 RemoteReduceList, LocalReduceList);
1923 Bld.CreateBr(CpyMergeBB);
1924
1925 CGF.EmitBlock(CpyElseBB);
1926 Bld.CreateBr(CpyMergeBB);
1927
1928 CGF.EmitBlock(CpyMergeBB);
1929
1930 CGF.FinishFunction();
1931 return Fn;
1932}
1933
1934///
1935/// Design of OpenMP reductions on the GPU
1936///
1937/// Consider a typical OpenMP program with one or more reduction
1938/// clauses:
1939///
1940/// float foo;
1941/// double bar;
1942/// #pragma omp target teams distribute parallel for \
1943/// reduction(+:foo) reduction(*:bar)
1944/// for (int i = 0; i < N; i++) {
1945/// foo += A[i]; bar *= B[i];
1946/// }
1947///
1948/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1949/// all teams. In our OpenMP implementation on the NVPTX device an
1950/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1951/// within a team are mapped to CUDA threads within a threadblock.
1952/// Our goal is to efficiently aggregate values across all OpenMP
1953/// threads such that:
1954///
1955/// - the compiler and runtime are logically concise, and
1956/// - the reduction is performed efficiently in a hierarchical
1957/// manner as follows: within OpenMP threads in the same warp,
1958/// across warps in a threadblock, and finally across teams on
1959/// the NVPTX device.
1960///
1961/// Introduction to Decoupling
1962///
1963/// We would like to decouple the compiler and the runtime so that the
1964/// latter is ignorant of the reduction variables (number, data types)
1965/// and the reduction operators. This allows a simpler interface
1966/// and implementation while still attaining good performance.
1967///
1968/// Pseudocode for the aforementioned OpenMP program generated by the
1969/// compiler is as follows:
1970///
1971/// 1. Create private copies of reduction variables on each OpenMP
1972/// thread: 'foo_private', 'bar_private'
1973/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1974/// to it and writes the result in 'foo_private' and 'bar_private'
1975/// respectively.
1976/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1977/// and store the result on the team master:
1978///
1979/// __kmpc_nvptx_parallel_reduce_nowait(...,
1980/// reduceData, shuffleReduceFn, interWarpCpyFn)
1981///
1982/// where:
1983/// struct ReduceData {
1984/// double *foo;
1985/// double *bar;
1986/// } reduceData
1987/// reduceData.foo = &foo_private
1988/// reduceData.bar = &bar_private
1989///
1990/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1991/// auxiliary functions generated by the compiler that operate on
1992/// variables of type 'ReduceData'. They help the runtime perform
1993/// algorithmic steps in a data-agnostic manner.
1994///
1995/// 'shuffleReduceFn' is a pointer to a function that reduces data
1996/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1997/// same warp. It takes the following arguments as input:
1998///
1999/// a. variable of type 'ReduceData' on the calling lane,
2000/// b. its lane_id,
2001/// c. an offset relative to the current lane_id to generate a
2002/// remote_lane_id. The remote lane contains the second
2003/// variable of type 'ReduceData' that is to be reduced.
2004/// d. an algorithm version parameter determining which reduction
2005/// algorithm to use.
2006///
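/// As emitted by emitShuffleAndReduceFunction in this file (see above),
/// the helper has, in effect, the signature (parameter names are
/// illustrative):
///
///   void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                        int16_t remote_lane_offset, int16_t algo_version);
///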
2007/// 'shuffleReduceFn' retrieves data from the remote lane using
2008/// efficient GPU shuffle intrinsics and reduces, using the
2009/// algorithm specified by the 4th parameter, the two operands
2010/// element-wise. The result is written to the first operand.
2011///
2012/// Different reduction algorithms are implemented in different
2013/// runtime functions, all calling 'shuffleReduceFn' to perform
2014/// the essential reduction step. Therefore, based on the 4th
2015/// parameter, this function behaves slightly differently to
2016/// cooperate with the runtime to ensure correctness under
2017/// different circumstances.
2018///
2019/// 'InterWarpCpyFn' is a pointer to a function that transfers
2020/// reduced variables across warps. It tunnels, through CUDA
2021/// shared memory, the thread-private data of type 'ReduceData'
2022/// from lane 0 of each warp to a lane in the first warp.
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002023/// 4. Call the OpenMP runtime on the GPU to reduce across teams.
2024/// The last team writes the global reduced value to memory.
2025///
2026/// ret = __kmpc_nvptx_teams_reduce_nowait(...,
2027/// reduceData, shuffleReduceFn, interWarpCpyFn,
2028/// scratchpadCopyFn, loadAndReduceFn)
2029///
2030/// 'scratchpadCopyFn' is a helper that stores reduced
2031/// data from the team master to a scratchpad array in
2032/// global memory.
2033///
2034/// 'loadAndReduceFn' is a helper that loads data from
2035/// the scratchpad array and reduces it with the input
2036/// operand.
2037///
2038/// These compiler generated functions hide address
2039/// calculation and alignment information from the runtime.
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002040/// 5. if ret == 1:
2041/// The team master of the last team stores the reduced
2042/// result to the globals in memory.
2043/// foo += reduceData.foo; bar *= reduceData.bar
2044///
2045///
2046/// Warp Reduction Algorithms
2047///
2048/// On the warp level, we have three algorithms implemented in the
2049/// OpenMP runtime depending on the number of active lanes:
2050///
2051/// Full Warp Reduction
2052///
2053/// The reduce algorithm within a warp where all lanes are active
2054/// is implemented in the runtime as follows:
2055///
2056/// full_warp_reduce(void *reduce_data,
2057/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2058/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
2059/// ShuffleReduceFn(reduce_data, 0, offset, 0);
2060/// }
2061///
2062/// The algorithm completes in log(2, WARPSIZE) steps.
2063///
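/// For example, with WARPSIZE == 32 the loop invokes 'ShuffleReduceFn'
/// with offsets 16, 8, 4, 2 and 1, i.e. five shuffle-and-reduce rounds.
///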
2064/// 'ShuffleReduceFn' is used here with lane_id set to 0 because the value
2065/// is not used; this saves the instructions otherwise needed to retrieve
2066/// lane_id from the corresponding special registers.
2067/// represents the version of the algorithm being used, is set to 0 to
2068/// signify full warp reduction.
2069///
2070/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2071///
2072/// #reduce_elem refers to an element in the local lane's data structure
2073/// #remote_elem is retrieved from a remote lane
2074/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2075/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
2076///
2077/// Contiguous Partial Warp Reduction
2078///
2079/// This reduce algorithm is used within a warp where only the first
2080/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
2081/// number of OpenMP threads in a parallel region is not a multiple of
2082/// WARPSIZE. The algorithm is implemented in the runtime as follows:
2083///
2084/// void
2085/// contiguous_partial_reduce(void *reduce_data,
2086/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
2087/// int size, int lane_id) {
2088/// int curr_size;
2089/// int offset;
2090/// curr_size = size;
2091///    offset = curr_size/2;
2092/// while (offset>0) {
2093/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
2094/// curr_size = (curr_size+1)/2;
2095/// offset = curr_size/2;
2096/// }
2097/// }
2098///
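/// As a worked example, for size == 5 the loop calls 'ShuffleReduceFn'
/// with offsets 2, 1 and 1; after the final call, lane 0 holds the
/// reduction of all five lanes.
///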
2099/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2100///
2101/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2102/// if (lane_id < offset)
2103/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2104/// else
2105/// reduce_elem = remote_elem
2106///
2107/// This algorithm assumes that the data to be reduced are located in a
2108/// contiguous subset of lanes starting from the first. When there is
2109/// an odd number of active lanes, the data in the last lane is not
2110/// aggregated with any other lane's data but is instead copied over.
2111///
2112/// Dispersed Partial Warp Reduction
2113///
2114/// This algorithm is used within a warp when any discontiguous subset of
2115/// lanes are active. It is used to implement the reduction operation
2116/// across lanes in an OpenMP simd region or in a nested parallel region.
2117///
2118/// void
2119/// dispersed_partial_reduce(void *reduce_data,
2120/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
2121/// int size, remote_id;
2122/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
2123/// do {
2124/// remote_id = next_active_lane_id_right_after_me();
2125///      # the above function returns 0 if no active lane
2126/// # is present right after the current lane.
2127/// size = number_of_active_lanes_in_this_warp();
2128/// logical_lane_id /= 2;
2129/// ShuffleReduceFn(reduce_data, logical_lane_id,
2130/// remote_id-1-threadIdx.x, 2);
2131/// } while (logical_lane_id % 2 == 0 && size > 1);
2132/// }
2133///
2134/// There is no assumption made about the initial state of the reduction.
2135/// Any number of lanes (>=1) could be active at any position. The reduction
2136/// result is returned in the first active lane.
2137///
2138/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
2139///
2140/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
2141/// if (lane_id % 2 == 0 && offset > 0)
2142/// reduce_elem = reduce_elem REDUCE_OP remote_elem
2143/// else
2144/// reduce_elem = remote_elem
2145///
2146///
2147/// Intra-Team Reduction
2148///
2149/// This function, as implemented in the runtime call
2150/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
2151/// threads in a team. It first reduces within a warp using the
2152/// aforementioned algorithms. We then proceed to gather all such
2153/// reduced values at the first warp.
2154///
2155/// The runtime makes use of the function 'InterWarpCpyFn', which copies
2156/// data from each of the "warp masters" (zeroth lane of each warp, where
2157/// warp-reduced data is held) to the zeroth warp. This step reduces (in
2158/// a mathematical sense) the problem of reduction across warp masters in
2159/// a block to the problem of warp reduction.
2160///
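/// A sketch of this combined flow (illustrative only; the runtime's actual
/// control flow may differ):
///
///   warp-reduce reduce_data using one of the three algorithms above
///   InterWarpCpyFn(reduce_data, num_warps)
///     # lane 0 of each warp writes through shared memory; warp 0 reads
///   if (warp_id == 0):
///     warp-reduce the gathered per-warp values into a single value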
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002161///
2162/// Inter-Team Reduction
2163///
2164/// Once a team has reduced its data to a single value, it is stored in
2165/// a global scratchpad array. Since each team has a distinct slot, this
2166/// can be done without locking.
2167///
2168/// The last team to write to the scratchpad array proceeds to reduce the
2169/// scratchpad array. One or more workers in the last team use the helper
2170/// 'loadAndReduceFn' to load and reduce values from the array, i.e.,
2171/// the k'th worker reduces every k'th element.
2172///
2173/// Finally, a call is made to '__kmpc_nvptx_parallel_reduce_nowait' to
2174/// reduce across workers and compute a globally reduced value.
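///
/// A sketch of this inter-team step (illustrative only; the exact runtime
/// protocol may differ):
///
///   scratchpadCopyFn(reduce_data, scratchpad, team_id, num_teams)
///   if this team is the last one to write its value:
///     loadAndReduceFn(local, scratchpad, thread_id, num_teams, /*reduce=*/0)
///     for (i = thread_id + num_threads; i < num_teams; i += num_threads):
///       loadAndReduceFn(local, scratchpad, i, num_teams, /*reduce=*/1)
///     __kmpc_nvptx_parallel_reduce_nowait(...)  # reduce across workers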
2175///
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002176void CGOpenMPRuntimeNVPTX::emitReduction(
2177 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
2178 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
2179 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
2180 if (!CGF.HaveInsertPoint())
2181 return;
2182
2183 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002184 bool TeamsReduction = isOpenMPTeamsDirective(Options.ReductionKind);
2185 // FIXME: Add support for simd reduction.
2186 assert((TeamsReduction || ParallelReduction) &&
2187 "Invalid reduction selection in emitReduction.");
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002188
2189 auto &C = CGM.getContext();
2190
2191 // 1. Build a list of reduction variables.
2192 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
2193 auto Size = RHSExprs.size();
2194 for (auto *E : Privates) {
2195 if (E->getType()->isVariablyModifiedType())
2196 // Reserve place for array size.
2197 ++Size;
2198 }
2199 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
2200 QualType ReductionArrayTy =
2201 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
2202 /*IndexTypeQuals=*/0);
2203 Address ReductionList =
2204 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
2205 auto IPriv = Privates.begin();
2206 unsigned Idx = 0;
2207 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
2208 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2209 CGF.getPointerSize());
2210 CGF.Builder.CreateStore(
2211 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2212 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
2213 Elem);
2214 if ((*IPriv)->getType()->isVariablyModifiedType()) {
2215 // Store array size.
2216 ++Idx;
2217 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
2218 CGF.getPointerSize());
2219 llvm::Value *Size = CGF.Builder.CreateIntCast(
2220 CGF.getVLASize(
2221 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
2222 .first,
2223 CGF.SizeTy, /*isSigned=*/false);
2224 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
2225 Elem);
2226 }
2227 }
2228
2229 // 2. Emit reduce_func().
2230 auto *ReductionFn = emitReductionFunction(
2231 CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
2232 LHSExprs, RHSExprs, ReductionOps);
2233
2234  // 3. Build res = __kmpc_reduce{_nowait}(<gtid>, <n>, sizeof(RedList),
2235 // RedList, shuffle_reduce_func, interwarp_copy_func);
2236 auto *ThreadId = getThreadID(CGF, Loc);
2237 auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
2238 auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2239 ReductionList.getPointer(), CGF.VoidPtrTy);
2240
2241 auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
2242 CGM, Privates, ReductionArrayTy, ReductionFn);
2243 auto *InterWarpCopyFn =
2244 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
2245
2246 llvm::Value *Res = nullptr;
2247 if (ParallelReduction) {
2248 llvm::Value *Args[] = {ThreadId,
2249 CGF.Builder.getInt32(RHSExprs.size()),
2250 ReductionArrayTySize,
2251 RL,
2252 ShuffleAndReduceFn,
2253 InterWarpCopyFn};
2254
2255 Res = CGF.EmitRuntimeCall(
2256 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
2257 Args);
2258 }
2259
Arpith Chacko Jacobfc711b12017-02-16 16:48:49 +00002260 if (TeamsReduction) {
2261 auto *ScratchPadCopyFn =
2262 emitCopyToScratchpad(CGM, Privates, ReductionArrayTy);
2263 auto *LoadAndReduceFn = emitReduceScratchpadFunction(
2264 CGM, Privates, ReductionArrayTy, ReductionFn);
2265
2266 llvm::Value *Args[] = {ThreadId,
2267 CGF.Builder.getInt32(RHSExprs.size()),
2268 ReductionArrayTySize,
2269 RL,
2270 ShuffleAndReduceFn,
2271 InterWarpCopyFn,
2272 ScratchPadCopyFn,
2273 LoadAndReduceFn};
2274 Res = CGF.EmitRuntimeCall(
2275 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_teams_reduce_nowait),
2276 Args);
2277 }
2278
Arpith Chacko Jacob101e8fb2017-02-16 16:20:16 +00002279  // 4. Build switch(res)
2280 auto *DefaultBB = CGF.createBasicBlock(".omp.reduction.default");
2281 auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
2282
2283  // 5. Build case 1: where we have reduced values in the master
2284 // thread in each team.
2285 // __kmpc_end_reduce{_nowait}(<gtid>);
2286 // break;
2287 auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
2288 SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
2289 CGF.EmitBlock(Case1BB);
2290
2291 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
2292 llvm::Value *EndArgs[] = {ThreadId};
2293 auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
2294 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
2295 auto IPriv = Privates.begin();
2296 auto ILHS = LHSExprs.begin();
2297 auto IRHS = RHSExprs.begin();
2298 for (auto *E : ReductionOps) {
2299 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
2300 cast<DeclRefExpr>(*IRHS));
2301 ++IPriv;
2302 ++ILHS;
2303 ++IRHS;
2304 }
2305 };
2306 RegionCodeGenTy RCG(CodeGen);
2307 NVPTXActionTy Action(
2308 nullptr, llvm::None,
2309 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
2310 EndArgs);
2311 RCG.setAction(Action);
2312 RCG(CGF);
2313 CGF.EmitBranch(DefaultBB);
2314 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
2315}
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002316
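/// Translate a reference-typed parameter of the native outlined function
/// into the form used on the NVPTX device: a restrict-qualified pointer in
/// the local address space (with a globalized pointee for map captures),
/// returned as a freshly created declaration of the adjusted type.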
2317const VarDecl *
2318CGOpenMPRuntimeNVPTX::translateParameter(const FieldDecl *FD,
2319 const VarDecl *NativeParam) const {
2320 if (!NativeParam->getType()->isReferenceType())
2321 return NativeParam;
2322 QualType ArgType = NativeParam->getType();
2323 QualifierCollector QC;
2324 const Type *NonQualTy = QC.strip(ArgType);
2325 QualType PointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2326 if (const auto *Attr = FD->getAttr<OMPCaptureKindAttr>()) {
2327 if (Attr->getCaptureKind() == OMPC_map) {
2328 PointeeTy = CGM.getContext().getAddrSpaceQualType(PointeeTy,
2329 LangAS::opencl_global);
2330 }
2331 }
2332 ArgType = CGM.getContext().getPointerType(PointeeTy);
2333 QC.addRestrict();
2334 enum { NVPTX_local_addr = 5 };
Alexander Richardson6d989432017-10-15 18:48:14 +00002335 QC.addAddressSpace(getLangASFromTargetAS(NVPTX_local_addr));
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002336 ArgType = QC.apply(CGM.getContext(), ArgType);
Alexey Bataevb45d43c2017-11-22 16:02:03 +00002337 if (isa<ImplicitParamDecl>(NativeParam)) {
2338 return ImplicitParamDecl::Create(
2339 CGM.getContext(), /*DC=*/nullptr, NativeParam->getLocation(),
2340 NativeParam->getIdentifier(), ArgType, ImplicitParamDecl::Other);
2341 }
2342 return ParmVarDecl::Create(
2343 CGM.getContext(),
2344 const_cast<DeclContext *>(NativeParam->getDeclContext()),
2345 NativeParam->getLocStart(), NativeParam->getLocation(),
2346 NativeParam->getIdentifier(), ArgType,
2347 /*TInfo=*/nullptr, SC_None, /*DefArg=*/nullptr);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002348}
2349
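/// Materialize the native address of a reference parameter: load the
/// target parameter's pointer, cast it from the generic address space to
/// the native pointee address space, and return a temporary that holds the
/// result.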
2350Address
2351CGOpenMPRuntimeNVPTX::getParameterAddress(CodeGenFunction &CGF,
2352 const VarDecl *NativeParam,
2353 const VarDecl *TargetParam) const {
2354 assert(NativeParam != TargetParam &&
2355 NativeParam->getType()->isReferenceType() &&
2356 "Native arg must not be the same as target arg.");
2357 Address LocalAddr = CGF.GetAddrOfLocalVar(TargetParam);
2358 QualType NativeParamType = NativeParam->getType();
2359 QualifierCollector QC;
2360 const Type *NonQualTy = QC.strip(NativeParamType);
2361 QualType NativePointeeTy = cast<ReferenceType>(NonQualTy)->getPointeeType();
2362 unsigned NativePointeeAddrSpace =
Alexander Richardson6d989432017-10-15 18:48:14 +00002363 CGF.getContext().getTargetAddressSpace(NativePointeeTy);
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00002364 QualType TargetTy = TargetParam->getType();
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002365 llvm::Value *TargetAddr = CGF.EmitLoadOfScalar(
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00002366 LocalAddr, /*Volatile=*/false, TargetTy, SourceLocation());
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002367 // First cast to generic.
2368 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2369 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2370 /*AddrSpace=*/0));
2371 // Cast from generic to native address space.
2372 TargetAddr = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
2373 TargetAddr, TargetAddr->getType()->getPointerElementType()->getPointerTo(
2374 NativePointeeAddrSpace));
2375 Address NativeParamAddr = CGF.CreateMemTemp(NativeParamType);
2376 CGF.EmitStoreOfScalar(TargetAddr, NativeParamAddr, /*Volatile=*/false,
Alexey Bataev36f2c4d2017-09-13 20:20:59 +00002377 NativeParamType);
Alexey Bataev3b8d5582017-08-08 18:04:06 +00002378 return NativeParamAddr;
2379}
2380
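/// Forward a call to an outlined function, first casting pointer arguments
/// to the generic address space so that they match the callee's parameter
/// types; for a variadic callee, trailing arguments are passed through
/// unchanged.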
void CGOpenMPRuntimeNVPTX::emitOutlinedFunctionCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> Args) const {
  SmallVector<llvm::Value *, 4> TargetArgs;
  TargetArgs.reserve(Args.size());
  auto *FnType =
      cast<llvm::FunctionType>(OutlinedFn->getType()->getPointerElementType());
  for (unsigned I = 0, E = Args.size(); I < E; ++I) {
    // Forward the variadic tail unchanged.
    if (FnType->isVarArg() && FnType->getNumParams() <= I) {
      TargetArgs.append(std::next(Args.begin(), I), Args.end());
      break;
    }
    llvm::Type *TargetType = FnType->getParamType(I);
    llvm::Value *NativeArg = Args[I];
    if (!TargetType->isPointerTy()) {
      TargetArgs.emplace_back(NativeArg);
      continue;
    }
    // Route pointer arguments through the generic address space, then cast
    // them to the exact parameter type the outlined function expects.
    llvm::Value *TargetArg = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
        NativeArg, NativeArg->getType()->getPointerElementType()->getPointerTo(
                       /*AddrSpace=*/0));
    TargetArgs.emplace_back(
        CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(TargetArg, TargetType));
  }
  CGOpenMPRuntime::emitOutlinedFunctionCall(CGF, Loc, OutlinedFn, TargetArgs);
}

/// Emit a wrapper function that calls the outlined parallel region and
/// controls the arguments that are passed to it. The wrapper ensures that
/// the outlined function is invoked with the correct arguments when data
/// is shared.
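///
/// A sketch of the generated wrapper, assuming the outlined region is named
/// @foo (names and capture types are illustrative):
/// \code
///   define internal void @foo_wrapper(i16 %parallel_level, i32 %source_tid,
///                                     i8** %shared_args) {
///     ; Load each capture's address from %shared_args, then:
///     call void @foo(i32* null, i32* null, <typed capture pointers>)
///     ret void
///   }
/// \endcode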
llvm::Function *CGOpenMPRuntimeNVPTX::createDataSharingWrapper(
    llvm::Function *OutlinedParallelFn, const OMPExecutableDirective &D) {
  ASTContext &Ctx = CGM.getContext();
  const auto &CS = *cast<CapturedStmt>(D.getAssociatedStmt());

  // Create a wrapper function that takes the parallel level, the source
  // thread, and the list of shared arguments.
  FunctionArgList WrapperArgs;
  QualType Int16QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/16, /*Signed=*/false);
  QualType Int32QTy =
      Ctx.getIntTypeForBitwidth(/*DestWidth=*/32, /*Signed=*/false);
  QualType Int32PtrQTy = Ctx.getPointerType(Int32QTy);
  QualType VoidPtrPtrQTy = Ctx.getPointerType(Ctx.VoidPtrTy);
  ImplicitParamDecl ParallelLevelArg(Ctx, Int16QTy, ImplicitParamDecl::Other);
  ImplicitParamDecl WrapperArg(Ctx, Int32QTy, ImplicitParamDecl::Other);
  ImplicitParamDecl SharedArgsList(Ctx, VoidPtrPtrQTy,
                                   ImplicitParamDecl::Other);
  WrapperArgs.emplace_back(&ParallelLevelArg);
  WrapperArgs.emplace_back(&WrapperArg);
  WrapperArgs.emplace_back(&SharedArgsList);

  auto &CGFI =
      CGM.getTypes().arrangeBuiltinFunctionDeclaration(Ctx.VoidTy, WrapperArgs);

  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      OutlinedParallelFn->getName() + "_wrapper", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
  Fn->setLinkage(llvm::GlobalValue::InternalLinkage);

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, Fn, CGFI, WrapperArgs);

  const auto *RD = CS.getCapturedRecordDecl();
  auto CurField = RD->field_begin();

  // Build the argument list for the call to the outlined function.
  SmallVector<llvm::Value *, 8> Args;

  // TODO: support SIMD and pass actual values.
  // For now, pass null placeholders for the global and bound thread ID
  // pointers that the outlined function expects as its first two arguments.
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));
  Args.emplace_back(llvm::ConstantPointerNull::get(
      CGM.Int32Ty->getPointerTo()));

  CGBuilderTy &Bld = CGF.Builder;
  auto CI = CS.capture_begin();

  // Load the start of the shared argument list.
  auto SharedArgs =
      CGF.EmitLoadOfPointer(CGF.GetAddrOfLocalVar(&SharedArgsList),
                            VoidPtrPtrQTy->castAs<PointerType>());

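  // Assumed layout of the shared argument list (a sketch): SharedArgs points
  // to an array of void*, one slot per capture in capture order; slot I holds
  // the address of the I-th capture's storage (for by-copy captures, of the
  // shared copy of the data), so each slot is loaded back as a typed pointer
  // before being forwarded to the outlined function.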
  // For each captured variable:
  for (unsigned I = 0; I < CS.capture_size(); ++I, ++CI, ++CurField) {
    // Name of the captured variable.
    StringRef Name;
    if (CI->capturesThis())
      Name = "this";
    else
      Name = CI->getCapturedVar()->getName();

    // Retrieve the Clang type of the captured variable; it determines the
    // typed pointer used to load the argument from the shared list below.
    QualType ElemTy = CurField->getType();
    // If this is a capture by copy, the element type has to be the pointer
    // to the data.
    if (CI->capturesVariableByCopy())
      ElemTy = Ctx.getPointerType(ElemTy);

    // Get the shared address of the captured variable.
    Address ArgAddress = Bld.CreateConstInBoundsGEP(
        SharedArgs, I, CGF.getPointerSize());
    Address TypedArgAddress = Bld.CreateBitCast(
        ArgAddress, CGF.ConvertTypeForMem(Ctx.getPointerType(ElemTy)));
    llvm::Value *Arg = CGF.EmitLoadOfScalar(TypedArgAddress,
        /*Volatile=*/false, Int32PtrQTy, SourceLocation());
    Args.emplace_back(Arg);
  }

  // Forward the unpacked arguments to the outlined parallel function.
  emitCall(CGF, OutlinedParallelFn, Args);
  CGF.FinishFunction();
  return Fn;
}