Samuel Antao | 45bfe4c | 2016-02-08 15:59:20 +0000 | [diff] [blame] | 1 | //===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | // |
| 10 | // This provides a class for OpenMP runtime code generation specialized to NVPTX |
| 11 | // targets. |
| 12 | // |
| 13 | //===----------------------------------------------------------------------===// |
| 14 | |
| 15 | #include "CGOpenMPRuntimeNVPTX.h" |
Alexey Bataev | c5b1d32 | 2016-03-04 09:22:22 +0000 | [diff] [blame] | 16 | #include "clang/AST/DeclOpenMP.h" |
Carlo Bertolli | c687225 | 2016-04-04 15:55:02 +0000 | [diff] [blame] | 17 | #include "CodeGenFunction.h" |
| 18 | #include "clang/AST/StmtOpenMP.h" |
Samuel Antao | 45bfe4c | 2016-02-08 15:59:20 +0000 | [diff] [blame] | 19 | |
| 20 | using namespace clang; |
| 21 | using namespace CodeGen; |
| 22 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 23 | namespace { |
/// Identifiers of the NVPTX-specific OpenMP runtime entry points that this
/// file declares through createNVPTXRuntimeFunction(). The comment on each
/// enumerator gives the prototype of the runtime function it stands for.
enum OpenMPRTLFunctionNVPTX {
  /// void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  ///                              short RequiresOMPRuntime,
  ///                              short RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// void __kmpc_kernel_prepare_parallel(void *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
};
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 48 | |
| 49 | /// Pre(post)-action for different OpenMP constructs specialized for NVPTX. |
| 50 | class NVPTXActionTy final : public PrePostActionTy { |
| 51 | llvm::Value *EnterCallee; |
| 52 | ArrayRef<llvm::Value *> EnterArgs; |
| 53 | llvm::Value *ExitCallee; |
| 54 | ArrayRef<llvm::Value *> ExitArgs; |
| 55 | bool Conditional; |
| 56 | llvm::BasicBlock *ContBlock = nullptr; |
| 57 | |
| 58 | public: |
| 59 | NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs, |
| 60 | llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs, |
| 61 | bool Conditional = false) |
| 62 | : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee), |
| 63 | ExitArgs(ExitArgs), Conditional(Conditional) {} |
| 64 | void Enter(CodeGenFunction &CGF) override { |
| 65 | llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs); |
| 66 | if (Conditional) { |
| 67 | llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes); |
| 68 | auto *ThenBlock = CGF.createBasicBlock("omp_if.then"); |
| 69 | ContBlock = CGF.createBasicBlock("omp_if.end"); |
| 70 | // Generate the branch (If-stmt) |
| 71 | CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock); |
| 72 | CGF.EmitBlock(ThenBlock); |
| 73 | } |
| 74 | } |
| 75 | void Done(CodeGenFunction &CGF) { |
| 76 | // Emit the rest of blocks/branches |
| 77 | CGF.EmitBranch(ContBlock); |
| 78 | CGF.EmitBlock(ContBlock, true); |
| 79 | } |
| 80 | void Exit(CodeGenFunction &CGF) override { |
| 81 | CGF.EmitRuntimeCall(ExitCallee, ExitArgs); |
| 82 | } |
| 83 | }; |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 84 | |
| 85 | // A class to track the execution mode when codegening directives within |
| 86 | // a target region. The appropriate mode (generic/spmd) is set on entry |
| 87 | // to the target region and used by containing directives such as 'parallel' |
| 88 | // to emit optimized code. |
| 89 | class ExecutionModeRAII { |
| 90 | private: |
| 91 | CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode; |
| 92 | CGOpenMPRuntimeNVPTX::ExecutionMode &Mode; |
| 93 | |
| 94 | public: |
| 95 | ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode, |
| 96 | CGOpenMPRuntimeNVPTX::ExecutionMode NewMode) |
| 97 | : Mode(Mode) { |
| 98 | SavedMode = Mode; |
| 99 | Mode = NewMode; |
| 100 | } |
| 101 | ~ExecutionModeRAII() { Mode = SavedMode; } |
| 102 | }; |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 103 | } // anonymous namespace |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 104 | |
| 105 | /// Get the GPU warp size. |
| 106 | static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 107 | CGBuilderTy &Bld = CGF.Builder; |
| 108 | return Bld.CreateCall( |
| 109 | llvm::Intrinsic::getDeclaration( |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 110 | &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize), |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 111 | llvm::None, "nvptx_warp_size"); |
| 112 | } |
| 113 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 114 | /// Get the id of the current thread on the GPU. |
| 115 | static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 116 | CGBuilderTy &Bld = CGF.Builder; |
| 117 | return Bld.CreateCall( |
| 118 | llvm::Intrinsic::getDeclaration( |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 119 | &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x), |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 120 | llvm::None, "nvptx_tid"); |
| 121 | } |
| 122 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 123 | /// Get the maximum number of threads in a block of the GPU. |
| 124 | static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 125 | CGBuilderTy &Bld = CGF.Builder; |
| 126 | return Bld.CreateCall( |
| 127 | llvm::Intrinsic::getDeclaration( |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 128 | &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x), |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 129 | llvm::None, "nvptx_num_threads"); |
| 130 | } |
| 131 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 132 | /// Get barrier to synchronize all threads in a block. |
| 133 | static void getNVPTXCTABarrier(CodeGenFunction &CGF) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 134 | CGBuilderTy &Bld = CGF.Builder; |
| 135 | Bld.CreateCall(llvm::Intrinsic::getDeclaration( |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 136 | &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0)); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 137 | } |
| 138 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 139 | /// Synchronize all GPU threads in a block. |
| 140 | static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); } |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 141 | |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 142 | /// Get the value of the thread_limit clause in the teams directive. |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 143 | /// For the 'generic' execution mode, the runtime encodes thread_limit in |
| 144 | /// the launch parameters, always starting thread_limit+warpSize threads per |
| 145 | /// CTA. The threads in the last warp are reserved for master execution. |
| 146 | /// For the 'spmd' execution mode, all threads in a CTA are part of the team. |
| 147 | static llvm::Value *getThreadLimit(CodeGenFunction &CGF, |
| 148 | bool IsInSpmdExecutionMode = false) { |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 149 | CGBuilderTy &Bld = CGF.Builder; |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 150 | return IsInSpmdExecutionMode |
| 151 | ? getNVPTXNumThreads(CGF) |
| 152 | : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF), |
| 153 | "thread_limit"); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 154 | } |
| 155 | |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 156 | /// Get the thread id of the OMP master thread. |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 157 | /// The master thread id is the first thread (lane) of the last warp in the |
| 158 | /// GPU block. Warp size is assumed to be some power of 2. |
| 159 | /// Thread id is 0 indexed. |
| 160 | /// E.g: If NumThreads is 33, master id is 32. |
| 161 | /// If NumThreads is 64, master id is 32. |
| 162 | /// If NumThreads is 1024, master id is 992. |
Arpith Chacko Jacob | ccf2f73 | 2017-01-03 20:19:56 +0000 | [diff] [blame] | 163 | static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 164 | CGBuilderTy &Bld = CGF.Builder; |
| 165 | llvm::Value *NumThreads = getNVPTXNumThreads(CGF); |
| 166 | |
| 167 | // We assume that the warp size is a power of 2. |
| 168 | llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1)); |
| 169 | |
| 170 | return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)), |
| 171 | Bld.CreateNot(Mask), "master_tid"); |
| 172 | } |
| 173 | |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 174 | CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState( |
| 175 | CodeGenModule &CGM) |
| 176 | : WorkerFn(nullptr), CGFI(nullptr) { |
| 177 | createWorkerFunction(CGM); |
Vasileios Kalintiris | e5c0959 | 2016-03-22 10:41:20 +0000 | [diff] [blame] | 178 | } |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 179 | |
| 180 | void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction( |
| 181 | CodeGenModule &CGM) { |
| 182 | // Create an worker function with no arguments. |
| 183 | CGFI = &CGM.getTypes().arrangeNullaryFunction(); |
| 184 | |
| 185 | WorkerFn = llvm::Function::Create( |
| 186 | CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage, |
| 187 | /* placeholder */ "_worker", &CGM.getModule()); |
| 188 | CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 189 | } |
| 190 | |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 191 | bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const { |
| 192 | return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd; |
| 193 | } |
| 194 | |
| 195 | static CGOpenMPRuntimeNVPTX::ExecutionMode |
| 196 | getExecutionModeForDirective(CodeGenModule &CGM, |
| 197 | const OMPExecutableDirective &D) { |
| 198 | OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind(); |
| 199 | switch (DirectiveKind) { |
| 200 | case OMPD_target: |
| 201 | return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic; |
| 202 | case OMPD_target_parallel: |
| 203 | return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd; |
| 204 | default: |
| 205 | llvm_unreachable("Unsupported directive on NVPTX device."); |
| 206 | } |
| 207 | llvm_unreachable("Unsupported directive on NVPTX device."); |
| 208 | } |
| 209 | |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 210 | void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D, |
| 211 | StringRef ParentName, |
| 212 | llvm::Function *&OutlinedFn, |
| 213 | llvm::Constant *&OutlinedFnID, |
| 214 | bool IsOffloadEntry, |
| 215 | const RegionCodeGenTy &CodeGen) { |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 216 | ExecutionModeRAII ModeRAII(CurrentExecutionMode, |
| 217 | CGOpenMPRuntimeNVPTX::ExecutionMode::Generic); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 218 | EntryFunctionState EST; |
| 219 | WorkerFunctionState WST(CGM); |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 220 | Work.clear(); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 221 | |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 222 | // Emit target region as a standalone region. |
| 223 | class NVPTXPrePostActionTy : public PrePostActionTy { |
| 224 | CGOpenMPRuntimeNVPTX &RT; |
| 225 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; |
| 226 | CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST; |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 227 | |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 228 | public: |
| 229 | NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, |
| 230 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, |
| 231 | CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST) |
| 232 | : RT(RT), EST(EST), WST(WST) {} |
| 233 | void Enter(CodeGenFunction &CGF) override { |
| 234 | RT.emitGenericEntryHeader(CGF, EST, WST); |
| 235 | } |
| 236 | void Exit(CodeGenFunction &CGF) override { |
| 237 | RT.emitGenericEntryFooter(CGF, EST); |
| 238 | } |
| 239 | } Action(*this, EST, WST); |
| 240 | CodeGen.setAction(Action); |
| 241 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| 242 | IsOffloadEntry, CodeGen); |
| 243 | |
| 244 | // Create the worker function |
| 245 | emitWorkerFunction(WST); |
| 246 | |
| 247 | // Now change the name of the worker function to correspond to this target |
| 248 | // region's entry function. |
| 249 | WST.WorkerFn->setName(OutlinedFn->getName() + "_worker"); |
| 250 | } |
| 251 | |
| 252 | // Setup NVPTX threads for master-worker OpenMP scheme. |
| 253 | void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF, |
| 254 | EntryFunctionState &EST, |
| 255 | WorkerFunctionState &WST) { |
| 256 | CGBuilderTy &Bld = CGF.Builder; |
| 257 | |
| 258 | llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker"); |
| 259 | llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck"); |
| 260 | llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master"); |
| 261 | EST.ExitBB = CGF.createBasicBlock(".exit"); |
| 262 | |
| 263 | auto *IsWorker = |
| 264 | Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF)); |
| 265 | Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB); |
| 266 | |
| 267 | CGF.EmitBlock(WorkerBB); |
| 268 | CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None); |
| 269 | CGF.EmitBranch(EST.ExitBB); |
| 270 | |
| 271 | CGF.EmitBlock(MasterCheckBB); |
| 272 | auto *IsMaster = |
| 273 | Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF)); |
| 274 | Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB); |
| 275 | |
| 276 | CGF.EmitBlock(MasterBB); |
| 277 | // First action in sequential region: |
| 278 | // Initialize the state of the OpenMP runtime library on the GPU. |
| 279 | llvm::Value *Args[] = {getThreadLimit(CGF)}; |
| 280 | CGF.EmitRuntimeCall( |
| 281 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args); |
| 282 | } |
| 283 | |
| 284 | void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF, |
| 285 | EntryFunctionState &EST) { |
| 286 | if (!EST.ExitBB) |
| 287 | EST.ExitBB = CGF.createBasicBlock(".exit"); |
| 288 | |
| 289 | llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier"); |
| 290 | CGF.EmitBranch(TerminateBB); |
| 291 | |
| 292 | CGF.EmitBlock(TerminateBB); |
| 293 | // Signal termination condition. |
| 294 | CGF.EmitRuntimeCall( |
| 295 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None); |
| 296 | // Barrier to terminate worker threads. |
| 297 | syncCTAThreads(CGF); |
| 298 | // Master thread jumps to exit point. |
| 299 | CGF.EmitBranch(EST.ExitBB); |
| 300 | |
| 301 | CGF.EmitBlock(EST.ExitBB); |
| 302 | EST.ExitBB = nullptr; |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 303 | } |
| 304 | |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 305 | void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D, |
| 306 | StringRef ParentName, |
| 307 | llvm::Function *&OutlinedFn, |
| 308 | llvm::Constant *&OutlinedFnID, |
| 309 | bool IsOffloadEntry, |
| 310 | const RegionCodeGenTy &CodeGen) { |
| 311 | ExecutionModeRAII ModeRAII(CurrentExecutionMode, |
| 312 | CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd); |
| 313 | EntryFunctionState EST; |
| 314 | |
| 315 | // Emit target region as a standalone region. |
| 316 | class NVPTXPrePostActionTy : public PrePostActionTy { |
| 317 | CGOpenMPRuntimeNVPTX &RT; |
| 318 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST; |
| 319 | const OMPExecutableDirective &D; |
| 320 | |
| 321 | public: |
| 322 | NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT, |
| 323 | CGOpenMPRuntimeNVPTX::EntryFunctionState &EST, |
| 324 | const OMPExecutableDirective &D) |
| 325 | : RT(RT), EST(EST), D(D) {} |
| 326 | void Enter(CodeGenFunction &CGF) override { |
| 327 | RT.emitSpmdEntryHeader(CGF, EST, D); |
| 328 | } |
| 329 | void Exit(CodeGenFunction &CGF) override { |
| 330 | RT.emitSpmdEntryFooter(CGF, EST); |
| 331 | } |
| 332 | } Action(*this, EST, D); |
| 333 | CodeGen.setAction(Action); |
| 334 | emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID, |
| 335 | IsOffloadEntry, CodeGen); |
| 336 | return; |
| 337 | } |
| 338 | |
| 339 | void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader( |
| 340 | CodeGenFunction &CGF, EntryFunctionState &EST, |
| 341 | const OMPExecutableDirective &D) { |
| 342 | auto &Bld = CGF.Builder; |
| 343 | |
| 344 | // Setup BBs in entry function. |
| 345 | llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute"); |
| 346 | EST.ExitBB = CGF.createBasicBlock(".exit"); |
| 347 | |
| 348 | // Initialize the OMP state in the runtime; called by all active threads. |
| 349 | // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters |
| 350 | // based on code analysis of the target region. |
| 351 | llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true), |
| 352 | /*RequiresOMPRuntime=*/Bld.getInt16(1), |
| 353 | /*RequiresDataSharing=*/Bld.getInt16(1)}; |
| 354 | CGF.EmitRuntimeCall( |
| 355 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args); |
| 356 | CGF.EmitBranch(ExecuteBB); |
| 357 | |
| 358 | CGF.EmitBlock(ExecuteBB); |
| 359 | } |
| 360 | |
| 361 | void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF, |
| 362 | EntryFunctionState &EST) { |
| 363 | if (!EST.ExitBB) |
| 364 | EST.ExitBB = CGF.createBasicBlock(".exit"); |
| 365 | |
| 366 | llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit"); |
| 367 | CGF.EmitBranch(OMPDeInitBB); |
| 368 | |
| 369 | CGF.EmitBlock(OMPDeInitBB); |
| 370 | // DeInitialize the OMP state in the runtime; called by all active threads. |
| 371 | CGF.EmitRuntimeCall( |
| 372 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None); |
| 373 | CGF.EmitBranch(EST.ExitBB); |
| 374 | |
| 375 | CGF.EmitBlock(EST.ExitBB); |
| 376 | EST.ExitBB = nullptr; |
| 377 | } |
| 378 | |
| 379 | // Create a unique global variable to indicate the execution mode of this target |
| 380 | // region. The execution mode is either 'generic', or 'spmd' depending on the |
| 381 | // target directive. This variable is picked up by the offload library to setup |
| 382 | // the device appropriately before kernel launch. If the execution mode is |
| 383 | // 'generic', the runtime reserves one warp for the master, otherwise, all |
| 384 | // warps participate in parallel work. |
| 385 | static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name, |
| 386 | CGOpenMPRuntimeNVPTX::ExecutionMode Mode) { |
| 387 | (void)new llvm::GlobalVariable( |
| 388 | CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true, |
| 389 | llvm::GlobalValue::WeakAnyLinkage, |
| 390 | llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode")); |
| 391 | } |
| 392 | |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 393 | void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) { |
| 394 | auto &Ctx = CGM.getContext(); |
| 395 | |
| 396 | CodeGenFunction CGF(CGM, /*suppressNewContext=*/true); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 397 | CGF.disableDebugInfo(); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 398 | CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {}); |
| 399 | emitWorkerLoop(CGF, WST); |
| 400 | CGF.FinishFunction(); |
| 401 | } |
| 402 | |
| 403 | void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF, |
| 404 | WorkerFunctionState &WST) { |
| 405 | // |
| 406 | // The workers enter this loop and wait for parallel work from the master. |
| 407 | // When the master encounters a parallel region it sets up the work + variable |
| 408 | // arguments, and wakes up the workers. The workers first check to see if |
| 409 | // they are required for the parallel region, i.e., within the # of requested |
| 410 | // parallel threads. The activated workers load the variable arguments and |
| 411 | // execute the parallel work. |
| 412 | // |
| 413 | |
| 414 | CGBuilderTy &Bld = CGF.Builder; |
| 415 | |
| 416 | llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work"); |
| 417 | llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers"); |
| 418 | llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel"); |
| 419 | llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel"); |
| 420 | llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel"); |
| 421 | llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit"); |
| 422 | |
| 423 | CGF.EmitBranch(AwaitBB); |
| 424 | |
| 425 | // Workers wait for work from master. |
| 426 | CGF.EmitBlock(AwaitBB); |
| 427 | // Wait for parallel work |
| 428 | syncCTAThreads(CGF); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 429 | |
| 430 | Address WorkFn = |
| 431 | CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn"); |
| 432 | Address ExecStatus = |
| 433 | CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status"); |
| 434 | CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0)); |
| 435 | CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy)); |
| 436 | |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 437 | llvm::Value *Args[] = {WorkFn.getPointer()}; |
| 438 | llvm::Value *Ret = CGF.EmitRuntimeCall( |
| 439 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args); |
| 440 | Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 441 | |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 442 | // On termination condition (workid == 0), exit loop. |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 443 | llvm::Value *ShouldTerminate = |
| 444 | Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate"); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 445 | Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB); |
| 446 | |
| 447 | // Activate requested workers. |
| 448 | CGF.EmitBlock(SelectWorkersBB); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 449 | llvm::Value *IsActive = |
| 450 | Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active"); |
| 451 | Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 452 | |
| 453 | // Signal start of parallel region. |
| 454 | CGF.EmitBlock(ExecuteBB); |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 455 | |
| 456 | // Process work items: outlined parallel functions. |
| 457 | for (auto *W : Work) { |
| 458 | // Try to match this outlined function. |
| 459 | auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy); |
| 460 | |
| 461 | llvm::Value *WorkFnMatch = |
| 462 | Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match"); |
| 463 | |
| 464 | llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn"); |
| 465 | llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next"); |
| 466 | Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB); |
| 467 | |
| 468 | // Execute this outlined function. |
| 469 | CGF.EmitBlock(ExecuteFNBB); |
| 470 | |
| 471 | // Insert call to work function. |
| 472 | // FIXME: Pass arguments to outlined function from master thread. |
| 473 | auto *Fn = cast<llvm::Function>(W); |
| 474 | Address ZeroAddr = |
| 475 | CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr"); |
| 476 | CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0)); |
| 477 | llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()}; |
| 478 | CGF.EmitCallOrInvoke(Fn, FnArgs); |
| 479 | |
| 480 | // Go to end of parallel region. |
| 481 | CGF.EmitBranch(TerminateBB); |
| 482 | |
| 483 | CGF.EmitBlock(CheckNextBB); |
| 484 | } |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 485 | |
| 486 | // Signal end of parallel region. |
| 487 | CGF.EmitBlock(TerminateBB); |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 488 | CGF.EmitRuntimeCall( |
| 489 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel), |
| 490 | llvm::None); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 491 | CGF.EmitBranch(BarrierBB); |
| 492 | |
| 493 | // All active and inactive workers wait at a barrier after parallel region. |
| 494 | CGF.EmitBlock(BarrierBB); |
| 495 | // Barrier after parallel region. |
| 496 | syncCTAThreads(CGF); |
| 497 | CGF.EmitBranch(AwaitBB); |
| 498 | |
| 499 | // Exit target region. |
| 500 | CGF.EmitBlock(ExitBB); |
| 501 | } |
| 502 | |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 503 | /// \brief Returns specified OpenMP runtime function for the current OpenMP |
| 504 | /// implementation. Specialized for the NVPTX device. |
| 505 | /// \param Function OpenMP runtime function. |
| 506 | /// \return Specified function. |
| 507 | llvm::Constant * |
| 508 | CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) { |
| 509 | llvm::Constant *RTLFn = nullptr; |
| 510 | switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) { |
| 511 | case OMPRTL_NVPTX__kmpc_kernel_init: { |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 512 | // Build void __kmpc_kernel_init(kmp_int32 thread_limit); |
| 513 | llvm::Type *TypeParams[] = {CGM.Int32Ty}; |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 514 | llvm::FunctionType *FnTy = |
| 515 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 516 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init"); |
| 517 | break; |
| 518 | } |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 519 | case OMPRTL_NVPTX__kmpc_kernel_deinit: { |
| 520 | // Build void __kmpc_kernel_deinit(); |
| 521 | llvm::FunctionType *FnTy = |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 522 | llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
Arpith Chacko Jacob | 406acdb | 2017-01-05 15:24:05 +0000 | [diff] [blame] | 523 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit"); |
| 524 | break; |
| 525 | } |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 526 | case OMPRTL_NVPTX__kmpc_spmd_kernel_init: { |
| 527 | // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit, |
| 528 | // short RequiresOMPRuntime, short RequiresDataSharing); |
| 529 | llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty}; |
| 530 | llvm::FunctionType *FnTy = |
| 531 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 532 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init"); |
| 533 | break; |
| 534 | } |
| 535 | case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: { |
| 536 | // Build void __kmpc_spmd_kernel_deinit(); |
| 537 | llvm::FunctionType *FnTy = |
| 538 | llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| 539 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit"); |
| 540 | break; |
| 541 | } |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 542 | case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: { |
| 543 | /// Build void __kmpc_kernel_prepare_parallel( |
| 544 | /// void *outlined_function); |
| 545 | llvm::Type *TypeParams[] = {CGM.Int8PtrTy}; |
| 546 | llvm::FunctionType *FnTy = |
| 547 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 548 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel"); |
| 549 | break; |
| 550 | } |
| 551 | case OMPRTL_NVPTX__kmpc_kernel_parallel: { |
| 552 | /// Build bool __kmpc_kernel_parallel(void **outlined_function); |
| 553 | llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy}; |
| 554 | llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy); |
| 555 | llvm::FunctionType *FnTy = |
| 556 | llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false); |
| 557 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel"); |
| 558 | break; |
| 559 | } |
| 560 | case OMPRTL_NVPTX__kmpc_kernel_end_parallel: { |
| 561 | /// Build void __kmpc_kernel_end_parallel(); |
| 562 | llvm::FunctionType *FnTy = |
| 563 | llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false); |
| 564 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel"); |
| 565 | break; |
| 566 | } |
| 567 | case OMPRTL_NVPTX__kmpc_serialized_parallel: { |
| 568 | // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32 |
| 569 | // global_tid); |
| 570 | llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| 571 | llvm::FunctionType *FnTy = |
| 572 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 573 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel"); |
| 574 | break; |
| 575 | } |
| 576 | case OMPRTL_NVPTX__kmpc_end_serialized_parallel: { |
| 577 | // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32 |
| 578 | // global_tid); |
| 579 | llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty}; |
| 580 | llvm::FunctionType *FnTy = |
| 581 | llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false); |
| 582 | RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel"); |
| 583 | break; |
| 584 | } |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 585 | } |
| 586 | return RTLFn; |
| 587 | } |
| 588 | |
| 589 | void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID, |
| 590 | llvm::Constant *Addr, |
Samuel Antao | f83efdb | 2017-01-05 16:02:49 +0000 | [diff] [blame] | 591 | uint64_t Size, int32_t) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 592 | auto *F = dyn_cast<llvm::Function>(Addr); |
| 593 | // TODO: Add support for global variables on the device after declare target |
| 594 | // support. |
| 595 | if (!F) |
| 596 | return; |
| 597 | llvm::Module *M = F->getParent(); |
| 598 | llvm::LLVMContext &Ctx = M->getContext(); |
| 599 | |
| 600 | // Get "nvvm.annotations" metadata node |
| 601 | llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations"); |
| 602 | |
| 603 | llvm::Metadata *MDVals[] = { |
| 604 | llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"), |
| 605 | llvm::ConstantAsMetadata::get( |
| 606 | llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))}; |
| 607 | // Append metadata to nvvm.annotations |
| 608 | MD->addOperand(llvm::MDNode::get(Ctx, MDVals)); |
| 609 | } |
| 610 | |
| 611 | void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction( |
| 612 | const OMPExecutableDirective &D, StringRef ParentName, |
| 613 | llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID, |
Alexey Bataev | 14fa1c6 | 2016-03-29 05:34:15 +0000 | [diff] [blame] | 614 | bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 615 | if (!IsOffloadEntry) // Nothing to do. |
| 616 | return; |
| 617 | |
| 618 | assert(!ParentName.empty() && "Invalid target region parent name!"); |
| 619 | |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 620 | CGOpenMPRuntimeNVPTX::ExecutionMode Mode = |
| 621 | getExecutionModeForDirective(CGM, D); |
| 622 | switch (Mode) { |
| 623 | case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic: |
| 624 | emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| 625 | CodeGen); |
| 626 | break; |
| 627 | case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd: |
| 628 | emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry, |
| 629 | CodeGen); |
| 630 | break; |
| 631 | case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown: |
| 632 | llvm_unreachable( |
| 633 | "Unknown programming model for OpenMP directive on NVPTX target."); |
| 634 | } |
| 635 | |
| 636 | setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 637 | } |
| 638 | |
Samuel Antao | 45bfe4c | 2016-02-08 15:59:20 +0000 | [diff] [blame] | 639 | CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM) |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 640 | : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) { |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 641 | if (!CGM.getLangOpts().OpenMPIsDevice) |
| 642 | llvm_unreachable("OpenMP NVPTX can only handle device code."); |
Arpith Chacko Jacob | 5c309e4 | 2016-03-22 01:48:56 +0000 | [diff] [blame] | 643 | } |
Carlo Bertolli | c687225 | 2016-04-04 15:55:02 +0000 | [diff] [blame] | 644 | |
Arpith Chacko Jacob | 2cd6eea | 2017-01-25 16:55:10 +0000 | [diff] [blame] | 645 | void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF, |
| 646 | OpenMPProcBindClauseKind ProcBind, |
| 647 | SourceLocation Loc) { |
| 648 | // Do nothing in case of Spmd mode and L0 parallel. |
| 649 | // TODO: If in Spmd mode and L1 parallel emit the clause. |
| 650 | if (isInSpmdExecutionMode()) |
| 651 | return; |
| 652 | |
| 653 | CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc); |
| 654 | } |
| 655 | |
Arpith Chacko Jacob | e04da5d | 2017-01-25 01:18:34 +0000 | [diff] [blame] | 656 | void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF, |
| 657 | llvm::Value *NumThreads, |
| 658 | SourceLocation Loc) { |
| 659 | // Do nothing in case of Spmd mode and L0 parallel. |
| 660 | // TODO: If in Spmd mode and L1 parallel emit the clause. |
| 661 | if (isInSpmdExecutionMode()) |
| 662 | return; |
| 663 | |
| 664 | CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc); |
| 665 | } |
| 666 | |
Carlo Bertolli | c687225 | 2016-04-04 15:55:02 +0000 | [diff] [blame] | 667 | void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF, |
| 668 | const Expr *NumTeams, |
| 669 | const Expr *ThreadLimit, |
| 670 | SourceLocation Loc) {} |
| 671 | |
Arpith Chacko Jacob | 19b911c | 2017-01-18 18:18:53 +0000 | [diff] [blame] | 672 | llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction( |
| 673 | const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| 674 | OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| 675 | return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar, |
| 676 | InnermostKind, CodeGen); |
| 677 | } |
| 678 | |
| 679 | llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction( |
Carlo Bertolli | c687225 | 2016-04-04 15:55:02 +0000 | [diff] [blame] | 680 | const OMPExecutableDirective &D, const VarDecl *ThreadIDVar, |
| 681 | OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) { |
| 682 | |
Arpith Chacko Jacob | 19b911c | 2017-01-18 18:18:53 +0000 | [diff] [blame] | 683 | llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction( |
| 684 | D, ThreadIDVar, InnermostKind, CodeGen); |
| 685 | llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal); |
| 686 | OutlinedFun->removeFnAttr(llvm::Attribute::NoInline); |
| 687 | OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline); |
Carlo Bertolli | c687225 | 2016-04-04 15:55:02 +0000 | [diff] [blame] | 688 | |
| 689 | return OutlinedFun; |
| 690 | } |
| 691 | |
| 692 | void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF, |
| 693 | const OMPExecutableDirective &D, |
| 694 | SourceLocation Loc, |
| 695 | llvm::Value *OutlinedFn, |
| 696 | ArrayRef<llvm::Value *> CapturedVars) { |
| 697 | if (!CGF.HaveInsertPoint()) |
| 698 | return; |
| 699 | |
| 700 | Address ZeroAddr = |
| 701 | CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4), |
| 702 | /*Name*/ ".zero.addr"); |
| 703 | CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0)); |
| 704 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| 705 | OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| 706 | OutlinedFnArgs.push_back(ZeroAddr.getPointer()); |
| 707 | OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| 708 | CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); |
| 709 | } |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 710 | |
| 711 | void CGOpenMPRuntimeNVPTX::emitParallelCall( |
| 712 | CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| 713 | ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| 714 | if (!CGF.HaveInsertPoint()) |
| 715 | return; |
| 716 | |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 717 | if (isInSpmdExecutionMode()) |
| 718 | emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); |
| 719 | else |
| 720 | emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond); |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 721 | } |
| 722 | |
| 723 | void CGOpenMPRuntimeNVPTX::emitGenericParallelCall( |
| 724 | CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| 725 | ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| 726 | llvm::Function *Fn = cast<llvm::Function>(OutlinedFn); |
| 727 | |
Malcolm Parsons | c6e4583 | 2017-01-13 18:55:32 +0000 | [diff] [blame] | 728 | auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) { |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 729 | CGBuilderTy &Bld = CGF.Builder; |
| 730 | |
| 731 | // Prepare for parallel region. Indicate the outlined function. |
| 732 | llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)}; |
| 733 | CGF.EmitRuntimeCall( |
| 734 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel), |
| 735 | Args); |
| 736 | |
| 737 | // Activate workers. This barrier is used by the master to signal |
| 738 | // work for the workers. |
| 739 | syncCTAThreads(CGF); |
| 740 | |
| 741 | // OpenMP [2.5, Parallel Construct, p.49] |
| 742 | // There is an implied barrier at the end of a parallel region. After the |
| 743 | // end of a parallel region, only the master thread of the team resumes |
| 744 | // execution of the enclosing task region. |
| 745 | // |
| 746 | // The master waits at this barrier until all workers are done. |
| 747 | syncCTAThreads(CGF); |
| 748 | |
| 749 | // Remember for post-processing in worker loop. |
| 750 | Work.push_back(Fn); |
| 751 | }; |
| 752 | |
| 753 | auto *RTLoc = emitUpdateLocation(CGF, Loc); |
| 754 | auto *ThreadID = getThreadID(CGF, Loc); |
| 755 | llvm::Value *Args[] = {RTLoc, ThreadID}; |
| 756 | |
| 757 | auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF, |
| 758 | PrePostActionTy &) { |
Malcolm Parsons | c6e4583 | 2017-01-13 18:55:32 +0000 | [diff] [blame] | 759 | auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF, |
| 760 | PrePostActionTy &Action) { |
Arpith Chacko Jacob | bb36fe8 | 2017-01-10 15:42:51 +0000 | [diff] [blame] | 761 | Action.Enter(CGF); |
| 762 | |
| 763 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| 764 | OutlinedFnArgs.push_back( |
| 765 | llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| 766 | OutlinedFnArgs.push_back( |
| 767 | llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| 768 | OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| 769 | CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs); |
| 770 | }; |
| 771 | |
| 772 | RegionCodeGenTy RCG(CodeGen); |
| 773 | NVPTXActionTy Action( |
| 774 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel), |
| 775 | Args, |
| 776 | createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel), |
| 777 | Args); |
| 778 | RCG.setAction(Action); |
| 779 | RCG(CGF); |
| 780 | }; |
| 781 | |
| 782 | if (IfCond) |
| 783 | emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen); |
| 784 | else { |
| 785 | CodeGenFunction::RunCleanupsScope Scope(CGF); |
| 786 | RegionCodeGenTy ThenRCG(L0ParallelGen); |
| 787 | ThenRCG(CGF); |
| 788 | } |
| 789 | } |
Arpith Chacko Jacob | 44a87c9 | 2017-01-18 19:35:00 +0000 | [diff] [blame] | 790 | |
| 791 | void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall( |
| 792 | CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn, |
| 793 | ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) { |
| 794 | // Just call the outlined function to execute the parallel region. |
| 795 | // OutlinedFn(>id, &zero, CapturedStruct); |
| 796 | // |
| 797 | // TODO: Do something with IfCond when support for the 'if' clause |
| 798 | // is added on Spmd target directives. |
| 799 | llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs; |
| 800 | OutlinedFnArgs.push_back( |
| 801 | llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| 802 | OutlinedFnArgs.push_back( |
| 803 | llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo())); |
| 804 | OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end()); |
| 805 | CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs); |
| 806 | } |