blob: fe0e2acdfdbf2f6ee100ac1d55a5c37a0079b6fd [file] [log] [blame]
//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to NVPTX
// targets.
//
//===----------------------------------------------------------------------===//
14
15#include "CGOpenMPRuntimeNVPTX.h"
Alexey Bataevc5b1d322016-03-04 09:22:22 +000016#include "clang/AST/DeclOpenMP.h"
Carlo Bertollic6872252016-04-04 15:55:02 +000017#include "CodeGenFunction.h"
18#include "clang/AST/StmtOpenMP.h"
Samuel Antao45bfe4c2016-02-08 15:59:20 +000019
20using namespace clang;
21using namespace CodeGen;
22
namespace {
/// Identifiers for the NVPTX device runtime entry points that
/// createNVPTXRuntimeFunction() knows how to declare.
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 omp_handle,
  /// kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
};

/// Address space numbers used when creating globals for NVPTX.
enum AddressSpace {
  /// Address space given to the master-worker control globals.
  AddressSpaceShared = 3,
};
} // namespace
35
36/// Get the GPU warp size.
37static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000038 CGBuilderTy &Bld = CGF.Builder;
39 return Bld.CreateCall(
40 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000041 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000042 llvm::None, "nvptx_warp_size");
43}
44
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000045/// Get the id of the current thread on the GPU.
46static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000047 CGBuilderTy &Bld = CGF.Builder;
48 return Bld.CreateCall(
49 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000050 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000051 llvm::None, "nvptx_tid");
52}
53
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000054/// Get the maximum number of threads in a block of the GPU.
55static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000056 CGBuilderTy &Bld = CGF.Builder;
57 return Bld.CreateCall(
58 llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000059 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000060 llvm::None, "nvptx_num_threads");
61}
62
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000063/// Get barrier to synchronize all threads in a block.
64static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000065 CGBuilderTy &Bld = CGF.Builder;
66 Bld.CreateCall(llvm::Intrinsic::getDeclaration(
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000067 &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000068}
69
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000070/// Synchronize all GPU threads in a block.
71static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000072
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000073/// Get the thread id of the OMP master thread.
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000074/// The master thread id is the first thread (lane) of the last warp in the
75/// GPU block. Warp size is assumed to be some power of 2.
76/// Thread id is 0 indexed.
77/// E.g: If NumThreads is 33, master id is 32.
78/// If NumThreads is 64, master id is 32.
79/// If NumThreads is 1024, master id is 992.
Arpith Chacko Jacobccf2f732017-01-03 20:19:56 +000080static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000081 CGBuilderTy &Bld = CGF.Builder;
82 llvm::Value *NumThreads = getNVPTXNumThreads(CGF);
83
84 // We assume that the warp size is a power of 2.
85 llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));
86
87 return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
88 Bld.CreateNot(Mask), "master_tid");
89}
90
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000091CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
92 CodeGenModule &CGM)
93 : WorkerFn(nullptr), CGFI(nullptr) {
94 createWorkerFunction(CGM);
Vasileios Kalintirise5c09592016-03-22 10:41:20 +000095}
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +000096
97void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
98 CodeGenModule &CGM) {
99 // Create an worker function with no arguments.
100 CGFI = &CGM.getTypes().arrangeNullaryFunction();
101
102 WorkerFn = llvm::Function::Create(
103 CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
104 /* placeholder */ "_worker", &CGM.getModule());
105 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000106 WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage);
107 WorkerFn->addFnAttr(llvm::Attribute::NoInline);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000108}
109
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000110void CGOpenMPRuntimeNVPTX::initializeEnvironment() {
111 //
112 // Initialize master-worker control state in shared memory.
113 //
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000114
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000115 auto DL = CGM.getDataLayout();
116 ActiveWorkers = new llvm::GlobalVariable(
117 CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false,
118 llvm::GlobalValue::CommonLinkage,
119 llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0,
120 llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared);
121 ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000122
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000123 WorkID = new llvm::GlobalVariable(
124 CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false,
125 llvm::GlobalValue::CommonLinkage,
126 llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0,
127 llvm::GlobalVariable::NotThreadLocal, AddressSpaceShared);
128 WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty));
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000129}
130
131void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
132 auto &Ctx = CGM.getContext();
133
134 CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
135 CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
136 emitWorkerLoop(CGF, WST);
137 CGF.FinishFunction();
138}
139
140void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
141 WorkerFunctionState &WST) {
142 //
143 // The workers enter this loop and wait for parallel work from the master.
144 // When the master encounters a parallel region it sets up the work + variable
145 // arguments, and wakes up the workers. The workers first check to see if
146 // they are required for the parallel region, i.e., within the # of requested
147 // parallel threads. The activated workers load the variable arguments and
148 // execute the parallel work.
149 //
150
151 CGBuilderTy &Bld = CGF.Builder;
152
153 llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
154 llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
155 llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
156 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
157 llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
158 llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");
159
160 CGF.EmitBranch(AwaitBB);
161
162 // Workers wait for work from master.
163 CGF.EmitBlock(AwaitBB);
164 // Wait for parallel work
165 syncCTAThreads(CGF);
166 // On termination condition (workid == 0), exit loop.
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000167 llvm::Value *ShouldTerminate = Bld.CreateICmpEQ(
168 Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()),
169 llvm::Constant::getNullValue(WorkID->getType()->getElementType()),
170 "should_terminate");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000171 Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);
172
173 // Activate requested workers.
174 CGF.EmitBlock(SelectWorkersBB);
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000175 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
176 llvm::Value *ActiveThread = Bld.CreateICmpSLT(
177 ThreadID,
178 Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()),
179 "active_thread");
180 Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB);
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000181
182 // Signal start of parallel region.
183 CGF.EmitBlock(ExecuteBB);
184 // TODO: Add parallel work.
185
186 // Signal end of parallel region.
187 CGF.EmitBlock(TerminateBB);
188 CGF.EmitBranch(BarrierBB);
189
190 // All active and inactive workers wait at a barrier after parallel region.
191 CGF.EmitBlock(BarrierBB);
192 // Barrier after parallel region.
193 syncCTAThreads(CGF);
194 CGF.EmitBranch(AwaitBB);
195
196 // Exit target region.
197 CGF.EmitBlock(ExitBB);
198}
199
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000200// Setup NVPTX threads for master-worker OpenMP scheme.
201void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF,
202 EntryFunctionState &EST,
203 WorkerFunctionState &WST) {
204 CGBuilderTy &Bld = CGF.Builder;
205
206 // Get the master thread id.
207 llvm::Value *MasterID = getMasterThreadID(CGF);
208 // Current thread's identifier.
209 llvm::Value *ThreadID = getNVPTXThreadID(CGF);
210
211 // Setup BBs in entry function.
212 llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker");
213 llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
214 llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
215 EST.ExitBB = CGF.createBasicBlock(".exit");
216
217 // The head (master thread) marches on while its body of companion threads in
218 // the warp go to sleep.
219 llvm::Value *ShouldDie =
220 Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp");
221 Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB);
222
223 // Select worker threads...
224 CGF.EmitBlock(WorkerCheckBB);
225 llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker");
226 Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB);
227
228 // ... and send to worker loop, awaiting parallel invocation.
229 CGF.EmitBlock(WorkerBB);
230 CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
231 CGF.EmitBranch(EST.ExitBB);
232
233 // Only master thread executes subsequent serial code.
234 CGF.EmitBlock(MasterBB);
235
236 // First action in sequential region:
237 // Initialize the state of the OpenMP runtime library on the GPU.
238 llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)};
239 CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init),
240 Args);
241}
242
243void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF,
244 EntryFunctionState &EST) {
245 if (!EST.ExitBB)
246 EST.ExitBB = CGF.createBasicBlock(".exit");
247
248 CGBuilderTy &Bld = CGF.Builder;
249 llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
250 CGF.EmitBranch(TerminateBB);
251
252 CGF.EmitBlock(TerminateBB);
253 // Signal termination condition.
254 Bld.CreateAlignedStore(
255 llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID,
256 WorkID->getAlignment());
257 // Barrier to terminate worker threads.
258 syncCTAThreads(CGF);
259 // Master thread jumps to exit point.
260 CGF.EmitBranch(EST.ExitBB);
261
262 CGF.EmitBlock(EST.ExitBB);
263 EST.ExitBB = nullptr;
264}
265
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000266/// \brief Returns specified OpenMP runtime function for the current OpenMP
267/// implementation. Specialized for the NVPTX device.
268/// \param Function OpenMP runtime function.
269/// \return Specified function.
270llvm::Constant *
271CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
272 llvm::Constant *RTLFn = nullptr;
273 switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
274 case OMPRTL_NVPTX__kmpc_kernel_init: {
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000275 // Build void __kmpc_kernel_init(kmp_int32 omp_handle,
276 // kmp_int32 thread_limit);
277 llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty};
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000278 llvm::FunctionType *FnTy =
279 llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
280 RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
281 break;
282 }
283 }
284 return RTLFn;
285}
286
287void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
288 llvm::Constant *Addr,
289 uint64_t Size) {
290 auto *F = dyn_cast<llvm::Function>(Addr);
291 // TODO: Add support for global variables on the device after declare target
292 // support.
293 if (!F)
294 return;
295 llvm::Module *M = F->getParent();
296 llvm::LLVMContext &Ctx = M->getContext();
297
298 // Get "nvvm.annotations" metadata node
299 llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");
300
301 llvm::Metadata *MDVals[] = {
302 llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
303 llvm::ConstantAsMetadata::get(
304 llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
305 // Append metadata to nvvm.annotations
306 MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
307}
308
309void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
310 const OMPExecutableDirective &D, StringRef ParentName,
311 llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
Alexey Bataev14fa1c62016-03-29 05:34:15 +0000312 bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000313 if (!IsOffloadEntry) // Nothing to do.
314 return;
315
316 assert(!ParentName.empty() && "Invalid target region parent name!");
317
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000318 EntryFunctionState EST;
319 WorkerFunctionState WST(CGM);
320
321 // Emit target region as a standalone region.
322 class NVPTXPrePostActionTy : public PrePostActionTy {
323 CGOpenMPRuntimeNVPTX &RT;
324 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
325 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;
326
327 public:
328 NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
329 CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
330 CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
331 : RT(RT), EST(EST), WST(WST) {}
332 void Enter(CodeGenFunction &CGF) override {
333 RT.emitEntryHeader(CGF, EST, WST);
334 }
335 void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); }
336 } Action(*this, EST, WST);
337 CodeGen.setAction(Action);
338 emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
339 IsOffloadEntry, CodeGen);
340
341 // Create the worker function
342 emitWorkerFunction(WST);
343
344 // Now change the name of the worker function to correspond to this target
345 // region's entry function.
346 WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000347}
348
Samuel Antao45bfe4c2016-02-08 15:59:20 +0000349CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000350 : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) {
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000351 if (!CGM.getLangOpts().OpenMPIsDevice)
352 llvm_unreachable("OpenMP NVPTX can only handle device code.");
Arpith Chacko Jacobb0d96f52017-01-04 19:14:43 +0000353
354 // Called once per module during initialization.
355 initializeEnvironment();
Arpith Chacko Jacob5c309e42016-03-22 01:48:56 +0000356}
Carlo Bertollic6872252016-04-04 15:55:02 +0000357
358void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
359 const Expr *NumTeams,
360 const Expr *ThreadLimit,
361 SourceLocation Loc) {}
362
363llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction(
364 const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
365 OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
366
367 llvm::Function *OutlinedFun = nullptr;
368 if (isa<OMPTeamsDirective>(D)) {
369 llvm::Value *OutlinedFunVal =
370 CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction(
371 D, ThreadIDVar, InnermostKind, CodeGen);
372 OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
Chandler Carruthfcd33142016-12-23 01:24:49 +0000373 OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
Carlo Bertollic6872252016-04-04 15:55:02 +0000374 OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);
375 } else
376 llvm_unreachable("parallel directive is not yet supported for nvptx "
377 "backend.");
378
379 return OutlinedFun;
380}
381
382void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
383 const OMPExecutableDirective &D,
384 SourceLocation Loc,
385 llvm::Value *OutlinedFn,
386 ArrayRef<llvm::Value *> CapturedVars) {
387 if (!CGF.HaveInsertPoint())
388 return;
389
390 Address ZeroAddr =
391 CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
392 /*Name*/ ".zero.addr");
393 CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
394 llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
395 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
396 OutlinedFnArgs.push_back(ZeroAddr.getPointer());
397 OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
398 CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
399}