Pirama Arumuga Nainar | 4967a71 | 2016-09-19 22:19:55 -0700 | [diff] [blame] | 1 | //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This provides a class for OpenMP runtime code generation specialized to NVPTX |
| 11 | // targets. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "CGOpenMPRuntimeNVPTX.h" |
| 16 | #include "clang/AST/DeclOpenMP.h" |
| 17 | #include "CodeGenFunction.h" |
| 18 | #include "clang/AST/StmtOpenMP.h" |
| 19 | |
| 20 | using namespace clang; |
| 21 | using namespace CodeGen; |
| 22 | |
| 23 | /// \brief Get the GPU warp size. |
| 24 | llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXWarpSize(CodeGenFunction &CGF) { |
| 25 | CGBuilderTy &Bld = CGF.Builder; |
| 26 | return Bld.CreateCall( |
| 27 | llvm::Intrinsic::getDeclaration( |
| 28 | &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), |
| 29 | llvm::None, "nvptx_warp_size"); |
| 30 | } |
| 31 | |
| 32 | /// \brief Get the id of the current thread on the GPU. |
| 33 | llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXThreadID(CodeGenFunction &CGF) { |
| 34 | CGBuilderTy &Bld = CGF.Builder; |
| 35 | return Bld.CreateCall( |
| 36 | llvm::Intrinsic::getDeclaration( |
| 37 | &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), |
| 38 | llvm::None, "nvptx_tid"); |
| 39 | } |
| 40 | |
| 41 | // \brief Get the maximum number of threads in a block of the GPU. |
| 42 | llvm::Value *CGOpenMPRuntimeNVPTX::getNVPTXNumThreads(CodeGenFunction &CGF) { |
| 43 | CGBuilderTy &Bld = CGF.Builder; |
| 44 | return Bld.CreateCall( |
| 45 | llvm::Intrinsic::getDeclaration( |
| 46 | &CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), |
| 47 | llvm::None, "nvptx_num_threads"); |
| 48 | } |
| 49 | |
| 50 | /// \brief Get barrier to synchronize all threads in a block. |
| 51 | void CGOpenMPRuntimeNVPTX::getNVPTXCTABarrier(CodeGenFunction &CGF) { |
| 52 | CGBuilderTy &Bld = CGF.Builder; |
| 53 | Bld.CreateCall(llvm::Intrinsic::getDeclaration( |
| 54 | &CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); |
| 55 | } |
| 56 | |
| 57 | // \brief Synchronize all GPU threads in a block. |
| 58 | void CGOpenMPRuntimeNVPTX::syncCTAThreads(CodeGenFunction &CGF) { |
| 59 | getNVPTXCTABarrier(CGF); |
| 60 | } |
| 61 | |
| 62 | /// \brief Get the thread id of the OMP master thread. |
| 63 | /// The master thread id is the first thread (lane) of the last warp in the |
| 64 | /// GPU block. Warp size is assumed to be some power of 2. |
| 65 | /// Thread id is 0 indexed. |
| 66 | /// E.g: If NumThreads is 33, master id is 32. |
| 67 | /// If NumThreads is 64, master id is 32. |
| 68 | /// If NumThreads is 1024, master id is 992. |
| 69 | llvm::Value *CGOpenMPRuntimeNVPTX::getMasterThreadID(CodeGenFunction &CGF) { |
| 70 | CGBuilderTy &Bld = CGF.Builder; |
| 71 | llvm::Value *NumThreads = getNVPTXNumThreads(CGF); |
| 72 | |
| 73 | // We assume that the warp size is a power of 2. |
| 74 | llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); |
| 75 | |
| 76 | return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), |
| 77 | Bld.CreateNot(Mask), "master_tid"); |
| 78 | } |
| 79 | |
namespace {

/// Identifiers of the NVPTX-specific OpenMP runtime library entry points that
/// createNVPTXRuntimeFunction() knows how to declare.
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to
  /// void __kmpc_kernel_init(kmp_int32 omp_handle, kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
};

/// NVPTX address space numbers used by this file.
enum ADDRESS_SPACE {
  /// Address space used for the master-worker control globals.
  ADDRESS_SPACE_SHARED = 3,
};

} // namespace
| 92 | |
| 93 | CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( |
| 94 | CodeGenModule &CGM) |
| 95 | : WorkerFn(nullptr), CGFI(nullptr) { |
| 96 | createWorkerFunction(CGM); |
| 97 | } |
| 98 | |
| 99 | void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( |
| 100 | CodeGenModule &CGM) { |
| 101 | // Create an worker function with no arguments. |
| 102 | CGFI = &CGM.getTypes().arrangeNullaryFunction(); |
| 103 | |
| 104 | WorkerFn = llvm::Function::Create( |
| 105 | CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, |
| 106 | /* placeholder */ "_worker", &CGM.getModule()); |
| 107 | CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); |
| 108 | WorkerFn->setLinkage(llvm::GlobalValue::InternalLinkage); |
| 109 | WorkerFn->addFnAttr(llvm::Attribute::NoInline); |
| 110 | } |
| 111 | |
| 112 | void CGOpenMPRuntimeNVPTX::initializeEnvironment() { |
| 113 | // |
| 114 | // Initialize master-worker control state in shared memory. |
| 115 | // |
| 116 | |
| 117 | auto DL = CGM.getDataLayout(); |
| 118 | ActiveWorkers = new llvm::GlobalVariable( |
| 119 | CGM.getModule(), CGM.Int32Ty, /*isConstant=*/false, |
| 120 | llvm::GlobalValue::CommonLinkage, |
| 121 | llvm::Constant::getNullValue(CGM.Int32Ty), "__omp_num_threads", 0, |
| 122 | llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); |
| 123 | ActiveWorkers->setAlignment(DL.getPrefTypeAlignment(CGM.Int32Ty)); |
| 124 | |
| 125 | WorkID = new llvm::GlobalVariable( |
| 126 | CGM.getModule(), CGM.Int64Ty, /*isConstant=*/false, |
| 127 | llvm::GlobalValue::CommonLinkage, |
| 128 | llvm::Constant::getNullValue(CGM.Int64Ty), "__tgt_work_id", 0, |
| 129 | llvm::GlobalVariable::NotThreadLocal, ADDRESS_SPACE_SHARED); |
| 130 | WorkID->setAlignment(DL.getPrefTypeAlignment(CGM.Int64Ty)); |
| 131 | } |
| 132 | |
| 133 | void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { |
| 134 | auto &Ctx = CGM.getContext(); |
| 135 | |
| 136 | CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
| 137 | CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); |
| 138 | emitWorkerLoop(CGF, WST); |
| 139 | CGF.FinishFunction(); |
| 140 | } |
| 141 | |
| 142 | void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, |
| 143 | WorkerFunctionState &WST) { |
| 144 | // |
| 145 | // The workers enter this loop and wait for parallel work from the master. |
| 146 | // When the master encounters a parallel region it sets up the work + variable |
| 147 | // arguments, and wakes up the workers. The workers first check to see if |
| 148 | // they are required for the parallel region, i.e., within the # of requested |
| 149 | // parallel threads. The activated workers load the variable arguments and |
| 150 | // execute the parallel work. |
| 151 | // |
| 152 | |
| 153 | CGBuilderTy &Bld = CGF.Builder; |
| 154 | |
| 155 | llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); |
| 156 | llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); |
| 157 | llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); |
| 158 | llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); |
| 159 | llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); |
| 160 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| 161 | |
| 162 | CGF.EmitBranch(AwaitBB); |
| 163 | |
| 164 | // Workers wait for work from master. |
| 165 | CGF.EmitBlock(AwaitBB); |
| 166 | // Wait for parallel work |
| 167 | syncCTAThreads(CGF); |
| 168 | // On termination condition (workid == 0), exit loop. |
| 169 | llvm::Value *ShouldTerminate = Bld.CreateICmpEQ( |
| 170 | Bld.CreateAlignedLoad(WorkID, WorkID->getAlignment()), |
| 171 | llvm::Constant::getNullValue(WorkID->getType()->getElementType()), |
| 172 | "should_terminate"); |
| 173 | Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); |
| 174 | |
| 175 | // Activate requested workers. |
| 176 | CGF.EmitBlock(SelectWorkersBB); |
| 177 | llvm::Value *ThreadID = getNVPTXThreadID(CGF); |
| 178 | llvm::Value *ActiveThread = Bld.CreateICmpSLT( |
| 179 | ThreadID, |
| 180 | Bld.CreateAlignedLoad(ActiveWorkers, ActiveWorkers->getAlignment()), |
| 181 | "active_thread"); |
| 182 | Bld.CreateCondBr(ActiveThread, ExecuteBB, BarrierBB); |
| 183 | |
| 184 | // Signal start of parallel region. |
| 185 | CGF.EmitBlock(ExecuteBB); |
| 186 | // TODO: Add parallel work. |
| 187 | |
| 188 | // Signal end of parallel region. |
| 189 | CGF.EmitBlock(TerminateBB); |
| 190 | CGF.EmitBranch(BarrierBB); |
| 191 | |
| 192 | // All active and inactive workers wait at a barrier after parallel region. |
| 193 | CGF.EmitBlock(BarrierBB); |
| 194 | // Barrier after parallel region. |
| 195 | syncCTAThreads(CGF); |
| 196 | CGF.EmitBranch(AwaitBB); |
| 197 | |
| 198 | // Exit target region. |
| 199 | CGF.EmitBlock(ExitBB); |
| 200 | } |
| 201 | |
| 202 | // Setup NVPTX threads for master-worker OpenMP scheme. |
| 203 | void CGOpenMPRuntimeNVPTX::emitEntryHeader(CodeGenFunction &CGF, |
| 204 | EntryFunctionState &EST, |
| 205 | WorkerFunctionState &WST) { |
| 206 | CGBuilderTy &Bld = CGF.Builder; |
| 207 | |
| 208 | // Get the master thread id. |
| 209 | llvm::Value *MasterID = getMasterThreadID(CGF); |
| 210 | // Current thread's identifier. |
| 211 | llvm::Value *ThreadID = getNVPTXThreadID(CGF); |
| 212 | |
| 213 | // Setup BBs in entry function. |
| 214 | llvm::BasicBlock *WorkerCheckBB = CGF.createBasicBlock(".check.for.worker"); |
| 215 | llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); |
| 216 | llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); |
| 217 | EST.ExitBB = CGF.createBasicBlock(".exit"); |
| 218 | |
| 219 | // The head (master thread) marches on while its body of companion threads in |
| 220 | // the warp go to sleep. |
| 221 | llvm::Value *ShouldDie = |
| 222 | Bld.CreateICmpUGT(ThreadID, MasterID, "excess_in_master_warp"); |
| 223 | Bld.CreateCondBr(ShouldDie, EST.ExitBB, WorkerCheckBB); |
| 224 | |
| 225 | // Select worker threads... |
| 226 | CGF.EmitBlock(WorkerCheckBB); |
| 227 | llvm::Value *IsWorker = Bld.CreateICmpULT(ThreadID, MasterID, "is_worker"); |
| 228 | Bld.CreateCondBr(IsWorker, WorkerBB, MasterBB); |
| 229 | |
| 230 | // ... and send to worker loop, awaiting parallel invocation. |
| 231 | CGF.EmitBlock(WorkerBB); |
| 232 | CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); |
| 233 | CGF.EmitBranch(EST.ExitBB); |
| 234 | |
| 235 | // Only master thread executes subsequent serial code. |
| 236 | CGF.EmitBlock(MasterBB); |
| 237 | |
| 238 | // First action in sequential region: |
| 239 | // Initialize the state of the OpenMP runtime library on the GPU. |
| 240 | llvm::Value *Args[] = {Bld.getInt32(/*OmpHandle=*/0), getNVPTXThreadID(CGF)}; |
| 241 | CGF.EmitRuntimeCall(createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), |
| 242 | Args); |
| 243 | } |
| 244 | |
| 245 | void CGOpenMPRuntimeNVPTX::emitEntryFooter(CodeGenFunction &CGF, |
| 246 | EntryFunctionState &EST) { |
| 247 | CGBuilderTy &Bld = CGF.Builder; |
| 248 | llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); |
| 249 | CGF.EmitBranch(TerminateBB); |
| 250 | |
| 251 | CGF.EmitBlock(TerminateBB); |
| 252 | // Signal termination condition. |
| 253 | Bld.CreateAlignedStore( |
| 254 | llvm::Constant::getNullValue(WorkID->getType()->getElementType()), WorkID, |
| 255 | WorkID->getAlignment()); |
| 256 | // Barrier to terminate worker threads. |
| 257 | syncCTAThreads(CGF); |
| 258 | // Master thread jumps to exit point. |
| 259 | CGF.EmitBranch(EST.ExitBB); |
| 260 | |
| 261 | CGF.EmitBlock(EST.ExitBB); |
| 262 | } |
| 263 | |
| 264 | /// \brief Returns specified OpenMP runtime function for the current OpenMP |
| 265 | /// implementation. Specialized for the NVPTX device. |
| 266 | /// \param Function OpenMP runtime function. |
| 267 | /// \return Specified function. |
| 268 | llvm::Constant * |
| 269 | CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { |
| 270 | llvm::Constant *RTLFn = nullptr; |
| 271 | switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { |
| 272 | case OMPRTL_NVPTX__kmpc_kernel_init: { |
| 273 | // Build void __kmpc_kernel_init(kmp_int32 omp_handle, |
| 274 | // kmp_int32 thread_limit); |
| 275 | llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int32Ty}; |
| 276 | llvm::FunctionType *FnTy = |
| 277 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 278 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); |
| 279 | break; |
| 280 | } |
| 281 | } |
| 282 | return RTLFn; |
| 283 | } |
| 284 | |
| 285 | void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, |
| 286 | llvm::Constant *Addr, |
| 287 | uint64_t Size) { |
| 288 | auto *F = dyn_cast<llvm::Function>(Addr); |
| 289 | // TODO: Add support for global variables on the device after declare target |
| 290 | // support. |
| 291 | if (!F) |
| 292 | return; |
| 293 | llvm::Module *M = F->getParent(); |
| 294 | llvm::LLVMContext &Ctx = M->getContext(); |
| 295 | |
| 296 | // Get "nvvm.annotations" metadata node |
| 297 | llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); |
| 298 | |
| 299 | llvm::Metadata *MDVals[] = { |
| 300 | llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), |
| 301 | llvm::ConstantAsMetadata::get( |
| 302 | llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; |
| 303 | // Append metadata to nvvm.annotations |
| 304 | MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); |
| 305 | } |
| 306 | |
| 307 | void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( |
| 308 | const OMPExecutableDirective &D, StringRef ParentName, |
| 309 | llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
| 310 | bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
| 311 | if (!IsOffloadEntry) // Nothing to do. |
| 312 | return; |
| 313 | |
| 314 | assert(!ParentName.empty() && "Invalid target region parent name!"); |
| 315 | |
| 316 | EntryFunctionState EST; |
| 317 | WorkerFunctionState WST(CGM); |
| 318 | |
| 319 | // Emit target region as a standalone region. |
| 320 | class NVPTXPrePostActionTy : public PrePostActionTy { |
| 321 | CGOpenMPRuntimeNVPTX &RT; |
| 322 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; |
| 323 | CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; |
| 324 | |
| 325 | public: |
| 326 | NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, |
| 327 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, |
| 328 | CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) |
| 329 | : RT(RT), EST(EST), WST(WST) {} |
| 330 | void Enter(CodeGenFunction &CGF) override { |
| 331 | RT.emitEntryHeader(CGF, EST, WST); |
| 332 | } |
| 333 | void Exit(CodeGenFunction &CGF) override { RT.emitEntryFooter(CGF, EST); } |
| 334 | } Action(*this, EST, WST); |
| 335 | CodeGen.setAction(Action); |
| 336 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| 337 | IsOffloadEntry, CodeGen); |
| 338 | |
| 339 | // Create the worker function |
| 340 | emitWorkerFunction(WST); |
| 341 | |
| 342 | // Now change the name of the worker function to correspond to this target |
| 343 | // region's entry function. |
| 344 | WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); |
| 345 | } |
| 346 | |
| 347 | CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) |
| 348 | : CGOpenMPRuntime(CGM), ActiveWorkers(nullptr), WorkID(nullptr) { |
| 349 | if (!CGM.getLangOpts().OpenMPIsDevice) |
| 350 | llvm_unreachable("OpenMP NVPTX can only handle device code."); |
| 351 | |
| 352 | // Called once per module during initialization. |
| 353 | initializeEnvironment(); |
| 354 | } |
| 355 | |
| 356 | void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, |
| 357 | const Expr *NumTeams, |
| 358 | const Expr *ThreadLimit, |
| 359 | SourceLocation Loc) {} |
| 360 | |
| 361 | llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOrTeamsOutlinedFunction( |
| 362 | const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| 363 | OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| 364 | |
| 365 | llvm::Function *OutlinedFun = nullptr; |
| 366 | if (isa<OMPTeamsDirective>(D)) { |
| 367 | llvm::Value *OutlinedFunVal = |
| 368 | CGOpenMPRuntime::emitParallelOrTeamsOutlinedFunction( |
| 369 | D, ThreadIDVar, InnermostKind, CodeGen); |
| 370 | OutlinedFun = cast<llvm::Function>(OutlinedFunVal); |
| 371 | OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); |
| 372 | } else |
| 373 | llvm_unreachable("parallel directive is not yet supported for nvptx " |
| 374 | "backend."); |
| 375 | |
| 376 | return OutlinedFun; |
| 377 | } |
| 378 | |
| 379 | void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, |
| 380 | const OMPExecutableDirective &D, |
| 381 | SourceLocation Loc, |
| 382 | llvm::Value *OutlinedFn, |
| 383 | ArrayRef<llvm::Value *> CapturedVars) { |
| 384 | if (!CGF.HaveInsertPoint()) |
| 385 | return; |
| 386 | |
| 387 | Address ZeroAddr = |
| 388 | CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), |
| 389 | /*Name*/ ".zero.addr"); |
| 390 | CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
| 391 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| 392 | OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| 393 | OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| 394 | OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| 395 | CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); |
| 396 | } |