//===---- CGOpenMPRuntimeNVPTX.cpp - Interface to OpenMP NVPTX Runtimes ---===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
//
// This provides a class for OpenMP runtime code generation specialized to
// NVPTX targets.
//
//===----------------------------------------------------------------------===//

#include "CGOpenMPRuntimeNVPTX.h"
#include "clang/AST/DeclOpenMP.h"
#include "CodeGenFunction.h"
#include "clang/AST/StmtOpenMP.h"

using namespace clang;
using namespace CodeGen;

namespace {
enum OpenMPRTLFunctionNVPTX {
  /// \brief Call to void __kmpc_kernel_init(kmp_int32 thread_limit);
  OMPRTL_NVPTX__kmpc_kernel_init,
  /// \brief Call to void __kmpc_kernel_deinit();
  OMPRTL_NVPTX__kmpc_kernel_deinit,
  /// \brief Call to void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
  /// short RequiresOMPRuntime, short RequiresDataSharing);
  OMPRTL_NVPTX__kmpc_spmd_kernel_init,
  /// \brief Call to void __kmpc_spmd_kernel_deinit();
  OMPRTL_NVPTX__kmpc_spmd_kernel_deinit,
  /// \brief Call to void __kmpc_kernel_prepare_parallel(void
  /// *outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_prepare_parallel,
  /// \brief Call to bool __kmpc_kernel_parallel(void **outlined_function);
  OMPRTL_NVPTX__kmpc_kernel_parallel,
  /// \brief Call to void __kmpc_kernel_end_parallel();
  OMPRTL_NVPTX__kmpc_kernel_end_parallel,
  /// Call to void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_serialized_parallel,
  /// Call to void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
  /// global_tid);
  OMPRTL_NVPTX__kmpc_end_serialized_parallel,
  /// \brief Call to int32_t __kmpc_shuffle_int32(int32_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int32,
  /// \brief Call to int64_t __kmpc_shuffle_int64(int64_t element,
  /// int16_t lane_offset, int16_t warp_size);
  OMPRTL_NVPTX__kmpc_shuffle_int64,
  /// \brief Call to __kmpc_nvptx_parallel_reduce_nowait(kmp_int32
  /// global_tid, kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
  /// void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
  /// lane_offset, int16_t shortCircuit),
  /// void (*kmp_InterWarpCopyFctPtr)(void* src, int32_t warp_num));
  OMPRTL_NVPTX__kmpc_parallel_reduce_nowait,
  /// \brief Call to __kmpc_nvptx_end_reduce_nowait(int32_t global_tid);
  OMPRTL_NVPTX__kmpc_end_reduce_nowait
};

/// Pre(post)-action for different OpenMP constructs specialized for NVPTX.
class NVPTXActionTy final : public PrePostActionTy {
  llvm::Value *EnterCallee;
  ArrayRef<llvm::Value *> EnterArgs;
  llvm::Value *ExitCallee;
  ArrayRef<llvm::Value *> ExitArgs;
  bool Conditional;
  llvm::BasicBlock *ContBlock = nullptr;

public:
  NVPTXActionTy(llvm::Value *EnterCallee, ArrayRef<llvm::Value *> EnterArgs,
                llvm::Value *ExitCallee, ArrayRef<llvm::Value *> ExitArgs,
                bool Conditional = false)
      : EnterCallee(EnterCallee), EnterArgs(EnterArgs), ExitCallee(ExitCallee),
        ExitArgs(ExitArgs), Conditional(Conditional) {}
  void Enter(CodeGenFunction &CGF) override {
    llvm::Value *EnterRes = CGF.EmitRuntimeCall(EnterCallee, EnterArgs);
    if (Conditional) {
      llvm::Value *CallBool = CGF.Builder.CreateIsNotNull(EnterRes);
      auto *ThenBlock = CGF.createBasicBlock("omp_if.then");
      ContBlock = CGF.createBasicBlock("omp_if.end");
      // Generate the branch (If-stmt)
      CGF.Builder.CreateCondBr(CallBool, ThenBlock, ContBlock);
      CGF.EmitBlock(ThenBlock);
    }
  }
  void Done(CodeGenFunction &CGF) {
    // Emit the rest of blocks/branches
    CGF.EmitBranch(ContBlock);
    CGF.EmitBlock(ContBlock, true);
  }
  void Exit(CodeGenFunction &CGF) override {
    CGF.EmitRuntimeCall(ExitCallee, ExitArgs);
  }
};

// A class to track the execution mode when codegening directives within
// a target region. The appropriate mode (generic/spmd) is set on entry
// to the target region and used by containing directives such as 'parallel'
// to emit optimized code.
class ExecutionModeRAII {
private:
  CGOpenMPRuntimeNVPTX::ExecutionMode SavedMode;
  CGOpenMPRuntimeNVPTX::ExecutionMode &Mode;

public:
  ExecutionModeRAII(CGOpenMPRuntimeNVPTX::ExecutionMode &Mode,
                    CGOpenMPRuntimeNVPTX::ExecutionMode NewMode)
      : Mode(Mode) {
    SavedMode = Mode;
    Mode = NewMode;
  }
  ~ExecutionModeRAII() { Mode = SavedMode; }
};

/// GPU Configuration: This information can be derived from cuda registers,
/// however, providing compile time constants helps generate more efficient
/// code. For all practical purposes this is fine because the configuration
/// is the same for all known NVPTX architectures.
enum MachineConfiguration : unsigned {
  WarpSize = 32,
  /// Number of bits required to represent a lane identifier, which is
  /// computed as log_2(WarpSize).
  LaneIDBits = 5,
  LaneIDMask = WarpSize - 1,
};

enum NamedBarrier : unsigned {
  /// Synchronize on this barrier #ID using a named barrier primitive.
  /// Only the subset of active threads in a parallel region arrive at the
  /// barrier.
  NB_Parallel = 1,
};
} // anonymous namespace

/// Get the GPU warp size.
static llvm::Value *getNVPTXWarpSize(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_warpsize),
      llvm::None, "nvptx_warp_size");
}

/// Get the id of the current thread on the GPU.
static llvm::Value *getNVPTXThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_tid_x),
      llvm::None, "nvptx_tid");
}

/// Get the id of the warp in the block.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
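/// E.g., with LaneIDBits == 5, the thread with id 37 belongs to warp 1
/// (37 >> 5 == 1).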
static llvm::Value *getNVPTXWarpID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAShr(getNVPTXThreadID(CGF), LaneIDBits, "nvptx_warp_id");
}

/// Get the id of the current lane in the Warp.
/// We assume that the warp size is 32, which is always the case
/// on the NVPTX device, to generate more efficient code.
static llvm::Value *getNVPTXLaneID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateAnd(getNVPTXThreadID(CGF), Bld.getInt32(LaneIDMask),
                       "nvptx_lane_id");
}

/// Get the maximum number of threads in a block of the GPU.
static llvm::Value *getNVPTXNumThreads(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  return Bld.CreateCall(
      llvm::Intrinsic::getDeclaration(
          &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_read_ptx_sreg_ntid_x),
      llvm::None, "nvptx_num_threads");
}

/// Get barrier to synchronize all threads in a block.
static void getNVPTXCTABarrier(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(
      &CGF.CGM.getModule(), llvm::Intrinsic::nvvm_barrier0));
}

/// Get barrier #ID to synchronize selected (multiple of warp size) threads in
/// a CTA.
static void getNVPTXBarrier(CodeGenFunction &CGF, int ID,
                            llvm::Value *NumThreads) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *Args[] = {Bld.getInt32(ID), NumThreads};
  Bld.CreateCall(llvm::Intrinsic::getDeclaration(&CGF.CGM.getModule(),
                                                 llvm::Intrinsic::nvvm_barrier),
                 Args);
}

/// Synchronize all GPU threads in a block.
static void syncCTAThreads(CodeGenFunction &CGF) { getNVPTXCTABarrier(CGF); }

/// Synchronize worker threads in a parallel region.
static void syncParallelThreads(CodeGenFunction &CGF, llvm::Value *NumThreads) {
  return getNVPTXBarrier(CGF, NB_Parallel, NumThreads);
}

/// Get the value of the thread_limit clause in the teams directive.
/// For the 'generic' execution mode, the runtime encodes thread_limit in
/// the launch parameters, always starting thread_limit+warpSize threads per
/// CTA. The threads in the last warp are reserved for master execution.
/// For the 'spmd' execution mode, all threads in a CTA are part of the team.
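/// For example, a CTA launched with 128 threads yields a thread_limit of 96
/// in generic mode (the last warp is reserved for the master) and 128 in
/// spmd mode.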
static llvm::Value *getThreadLimit(CodeGenFunction &CGF,
                                   bool IsInSpmdExecutionMode = false) {
  CGBuilderTy &Bld = CGF.Builder;
  return IsInSpmdExecutionMode
             ? getNVPTXNumThreads(CGF)
             : Bld.CreateSub(getNVPTXNumThreads(CGF), getNVPTXWarpSize(CGF),
                             "thread_limit");
}

/// Get the thread id of the OMP master thread.
/// The master thread id is the first thread (lane) of the last warp in the
/// GPU block. Warp size is assumed to be some power of 2.
/// Thread id is 0 indexed.
/// E.g: If NumThreads is 33, master id is 32.
///      If NumThreads is 64, master id is 32.
///      If NumThreads is 1024, master id is 992.
static llvm::Value *getMasterThreadID(CodeGenFunction &CGF) {
  CGBuilderTy &Bld = CGF.Builder;
  llvm::Value *NumThreads = getNVPTXNumThreads(CGF);

  // We assume that the warp size is a power of 2.
  llvm::Value *Mask = Bld.CreateSub(getNVPTXWarpSize(CGF), Bld.getInt32(1));

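  // master_tid = (NumThreads - 1) & ~(WarpSize - 1), i.e. the first lane of
  // the last warp in the block.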
  return Bld.CreateAnd(Bld.CreateSub(NumThreads, Bld.getInt32(1)),
                       Bld.CreateNot(Mask), "master_tid");
}

CGOpenMPRuntimeNVPTX::WorkerFunctionState::WorkerFunctionState(
    CodeGenModule &CGM)
    : WorkerFn(nullptr), CGFI(nullptr) {
  createWorkerFunction(CGM);
}

void CGOpenMPRuntimeNVPTX::WorkerFunctionState::createWorkerFunction(
    CodeGenModule &CGM) {
  // Create a worker function with no arguments.
  CGFI = &CGM.getTypes().arrangeNullaryFunction();

  WorkerFn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(*CGFI), llvm::GlobalValue::InternalLinkage,
      /* placeholder */ "_worker", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*D=*/nullptr, WorkerFn, *CGFI);
}

bool CGOpenMPRuntimeNVPTX::isInSpmdExecutionMode() const {
  return CurrentExecutionMode == CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
}

static CGOpenMPRuntimeNVPTX::ExecutionMode
getExecutionModeForDirective(CodeGenModule &CGM,
                             const OMPExecutableDirective &D) {
  OpenMPDirectiveKind DirectiveKind = D.getDirectiveKind();
  switch (DirectiveKind) {
  case OMPD_target:
  case OMPD_target_teams:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Generic;
  case OMPD_target_parallel:
    return CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd;
  default:
    llvm_unreachable("Unsupported directive on NVPTX device.");
  }
  llvm_unreachable("Unsupported directive on NVPTX device.");
}

void CGOpenMPRuntimeNVPTX::emitGenericKernel(const OMPExecutableDirective &D,
                                             StringRef ParentName,
                                             llvm::Function *&OutlinedFn,
                                             llvm::Constant *&OutlinedFnID,
                                             bool IsOffloadEntry,
                                             const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Generic);
  EntryFunctionState EST;
  WorkerFunctionState WST(CGM);
  Work.clear();

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         CGOpenMPRuntimeNVPTX::WorkerFunctionState &WST)
        : RT(RT), EST(EST), WST(WST) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitGenericEntryHeader(CGF, EST, WST);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitGenericEntryFooter(CGF, EST);
    }
  } Action(*this, EST, WST);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);

  // Create the worker function
  emitWorkerFunction(WST);

  // Now change the name of the worker function to correspond to this target
  // region's entry function.
  WST.WorkerFn->setName(OutlinedFn->getName() + "_worker");
}

// Setup NVPTX threads for master-worker OpenMP scheme.
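// The emitted entry code splits the CTA: worker threads branch to the worker
// loop and wait for parallel work, the single master thread initializes the
// OpenMP runtime state and runs the sequential part of the target region,
// and the remaining threads of the last warp jump straight to the exit block.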
void CGOpenMPRuntimeNVPTX::emitGenericEntryHeader(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST,
                                                  WorkerFunctionState &WST) {
  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *WorkerBB = CGF.createBasicBlock(".worker");
  llvm::BasicBlock *MasterCheckBB = CGF.createBasicBlock(".mastercheck");
  llvm::BasicBlock *MasterBB = CGF.createBasicBlock(".master");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  auto *IsWorker =
      Bld.CreateICmpULT(getNVPTXThreadID(CGF), getThreadLimit(CGF));
  Bld.CreateCondBr(IsWorker, WorkerBB, MasterCheckBB);

  CGF.EmitBlock(WorkerBB);
  CGF.EmitCallOrInvoke(WST.WorkerFn, llvm::None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(MasterCheckBB);
  auto *IsMaster =
      Bld.CreateICmpEQ(getNVPTXThreadID(CGF), getMasterThreadID(CGF));
  Bld.CreateCondBr(IsMaster, MasterBB, EST.ExitBB);

  CGF.EmitBlock(MasterBB);
  // First action in sequential region:
  // Initialize the state of the OpenMP runtime library on the GPU.
  llvm::Value *Args[] = {getThreadLimit(CGF)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_init), Args);
}

void CGOpenMPRuntimeNVPTX::emitGenericEntryFooter(CodeGenFunction &CGF,
                                                  EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".termination.notifier");
  CGF.EmitBranch(TerminateBB);

  CGF.EmitBlock(TerminateBB);
  // Signal termination condition.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_deinit), None);
  // Barrier to terminate worker threads.
  syncCTAThreads(CGF);
  // Master thread jumps to exit point.
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

void CGOpenMPRuntimeNVPTX::emitSpmdKernel(const OMPExecutableDirective &D,
                                          StringRef ParentName,
                                          llvm::Function *&OutlinedFn,
                                          llvm::Constant *&OutlinedFnID,
                                          bool IsOffloadEntry,
                                          const RegionCodeGenTy &CodeGen) {
  ExecutionModeRAII ModeRAII(CurrentExecutionMode,
                             CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd);
  EntryFunctionState EST;

  // Emit target region as a standalone region.
  class NVPTXPrePostActionTy : public PrePostActionTy {
    CGOpenMPRuntimeNVPTX &RT;
    CGOpenMPRuntimeNVPTX::EntryFunctionState &EST;
    const OMPExecutableDirective &D;

  public:
    NVPTXPrePostActionTy(CGOpenMPRuntimeNVPTX &RT,
                         CGOpenMPRuntimeNVPTX::EntryFunctionState &EST,
                         const OMPExecutableDirective &D)
        : RT(RT), EST(EST), D(D) {}
    void Enter(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryHeader(CGF, EST, D);
    }
    void Exit(CodeGenFunction &CGF) override {
      RT.emitSpmdEntryFooter(CGF, EST);
    }
  } Action(*this, EST, D);
  CodeGen.setAction(Action);
  emitTargetOutlinedFunctionHelper(D, ParentName, OutlinedFn, OutlinedFnID,
                                   IsOffloadEntry, CodeGen);
  return;
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryHeader(
    CodeGenFunction &CGF, EntryFunctionState &EST,
    const OMPExecutableDirective &D) {
  auto &Bld = CGF.Builder;

  // Setup BBs in entry function.
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute");
  EST.ExitBB = CGF.createBasicBlock(".exit");

  // Initialize the OMP state in the runtime; called by all active threads.
  // TODO: Set RequiresOMPRuntime and RequiresDataSharing parameters
  // based on code analysis of the target region.
  llvm::Value *Args[] = {getThreadLimit(CGF, /*IsInSpmdExecutionMode=*/true),
                         /*RequiresOMPRuntime=*/Bld.getInt16(1),
                         /*RequiresDataSharing=*/Bld.getInt16(1)};
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_init), Args);
  CGF.EmitBranch(ExecuteBB);

  CGF.EmitBlock(ExecuteBB);
}

void CGOpenMPRuntimeNVPTX::emitSpmdEntryFooter(CodeGenFunction &CGF,
                                               EntryFunctionState &EST) {
  if (!EST.ExitBB)
    EST.ExitBB = CGF.createBasicBlock(".exit");

  llvm::BasicBlock *OMPDeInitBB = CGF.createBasicBlock(".omp.deinit");
  CGF.EmitBranch(OMPDeInitBB);

  CGF.EmitBlock(OMPDeInitBB);
  // DeInitialize the OMP state in the runtime; called by all active threads.
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_spmd_kernel_deinit), None);
  CGF.EmitBranch(EST.ExitBB);

  CGF.EmitBlock(EST.ExitBB);
  EST.ExitBB = nullptr;
}

// Create a unique global variable to indicate the execution mode of this
// target region. The execution mode is either 'generic', or 'spmd' depending
// on the target directive. This variable is picked up by the offload library
// to setup the device appropriately before kernel launch. If the execution
// mode is 'generic', the runtime reserves one warp for the master, otherwise,
// all warps participate in parallel work.
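// The global is named '<outlined function name>_exec_mode' and holds the
// ExecutionMode enumerator value for the region.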
static void setPropertyExecutionMode(CodeGenModule &CGM, StringRef Name,
                                     CGOpenMPRuntimeNVPTX::ExecutionMode Mode) {
  (void)new llvm::GlobalVariable(
      CGM.getModule(), CGM.Int8Ty, /*isConstant=*/true,
      llvm::GlobalValue::WeakAnyLinkage,
      llvm::ConstantInt::get(CGM.Int8Ty, Mode), Name + Twine("_exec_mode"));
}

void CGOpenMPRuntimeNVPTX::emitWorkerFunction(WorkerFunctionState &WST) {
  auto &Ctx = CGM.getContext();

  CodeGenFunction CGF(CGM, /*suppressNewContext=*/true);
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), Ctx.VoidTy, WST.WorkerFn, *WST.CGFI, {});
  emitWorkerLoop(CGF, WST);
  CGF.FinishFunction();
}

void CGOpenMPRuntimeNVPTX::emitWorkerLoop(CodeGenFunction &CGF,
                                          WorkerFunctionState &WST) {
  //
  // The workers enter this loop and wait for parallel work from the master.
  // When the master encounters a parallel region it sets up the work + variable
  // arguments, and wakes up the workers. The workers first check to see if
  // they are required for the parallel region, i.e., within the # of requested
  // parallel threads. The activated workers load the variable arguments and
  // execute the parallel work.
  //

  CGBuilderTy &Bld = CGF.Builder;

  llvm::BasicBlock *AwaitBB = CGF.createBasicBlock(".await.work");
  llvm::BasicBlock *SelectWorkersBB = CGF.createBasicBlock(".select.workers");
  llvm::BasicBlock *ExecuteBB = CGF.createBasicBlock(".execute.parallel");
  llvm::BasicBlock *TerminateBB = CGF.createBasicBlock(".terminate.parallel");
  llvm::BasicBlock *BarrierBB = CGF.createBasicBlock(".barrier.parallel");
  llvm::BasicBlock *ExitBB = CGF.createBasicBlock(".exit");

  CGF.EmitBranch(AwaitBB);

  // Workers wait for work from master.
  CGF.EmitBlock(AwaitBB);
  // Wait for parallel work
  syncCTAThreads(CGF);

  Address WorkFn =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8PtrTy, /*Name=*/"work_fn");
  Address ExecStatus =
      CGF.CreateDefaultAlignTempAlloca(CGF.Int8Ty, /*Name=*/"exec_status");
  CGF.InitTempAlloca(ExecStatus, Bld.getInt8(/*C=*/0));
  CGF.InitTempAlloca(WorkFn, llvm::Constant::getNullValue(CGF.Int8PtrTy));

  llvm::Value *Args[] = {WorkFn.getPointer()};
  llvm::Value *Ret = CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_parallel), Args);
  Bld.CreateStore(Bld.CreateZExt(Ret, CGF.Int8Ty), ExecStatus);

  // On termination condition (workid == 0), exit loop.
  llvm::Value *ShouldTerminate =
      Bld.CreateIsNull(Bld.CreateLoad(WorkFn), "should_terminate");
  Bld.CreateCondBr(ShouldTerminate, ExitBB, SelectWorkersBB);

  // Activate requested workers.
  CGF.EmitBlock(SelectWorkersBB);
  llvm::Value *IsActive =
      Bld.CreateIsNotNull(Bld.CreateLoad(ExecStatus), "is_active");
  Bld.CreateCondBr(IsActive, ExecuteBB, BarrierBB);

  // Signal start of parallel region.
  CGF.EmitBlock(ExecuteBB);

  // Process work items: outlined parallel functions.
  for (auto *W : Work) {
    // Try to match this outlined function.
    auto *ID = Bld.CreatePointerBitCastOrAddrSpaceCast(W, CGM.Int8PtrTy);

    llvm::Value *WorkFnMatch =
        Bld.CreateICmpEQ(Bld.CreateLoad(WorkFn), ID, "work_match");

    llvm::BasicBlock *ExecuteFNBB = CGF.createBasicBlock(".execute.fn");
    llvm::BasicBlock *CheckNextBB = CGF.createBasicBlock(".check.next");
    Bld.CreateCondBr(WorkFnMatch, ExecuteFNBB, CheckNextBB);

    // Execute this outlined function.
    CGF.EmitBlock(ExecuteFNBB);

    // Insert call to work function.
    // FIXME: Pass arguments to outlined function from master thread.
    auto *Fn = cast<llvm::Function>(W);
    Address ZeroAddr =
        CGF.CreateDefaultAlignTempAlloca(CGF.Int32Ty, /*Name=*/".zero.addr");
    CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C=*/0));
    llvm::Value *FnArgs[] = {ZeroAddr.getPointer(), ZeroAddr.getPointer()};
    CGF.EmitCallOrInvoke(Fn, FnArgs);

    // Go to end of parallel region.
    CGF.EmitBranch(TerminateBB);

    CGF.EmitBlock(CheckNextBB);
  }

  // Signal end of parallel region.
  CGF.EmitBlock(TerminateBB);
  CGF.EmitRuntimeCall(
      createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_end_parallel),
      llvm::None);
  CGF.EmitBranch(BarrierBB);

  // All active and inactive workers wait at a barrier after parallel region.
  CGF.EmitBlock(BarrierBB);
  // Barrier after parallel region.
  syncCTAThreads(CGF);
  CGF.EmitBranch(AwaitBB);

  // Exit target region.
  CGF.EmitBlock(ExitBB);
}

/// \brief Returns specified OpenMP runtime function for the current OpenMP
/// implementation. Specialized for the NVPTX device.
/// \param Function OpenMP runtime function.
/// \return Specified function.
llvm::Constant *
CGOpenMPRuntimeNVPTX::createNVPTXRuntimeFunction(unsigned Function) {
  llvm::Constant *RTLFn = nullptr;
  switch (static_cast<OpenMPRTLFunctionNVPTX>(Function)) {
  case OMPRTL_NVPTX__kmpc_kernel_init: {
    // Build void __kmpc_kernel_init(kmp_int32 thread_limit);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_deinit: {
    // Build void __kmpc_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_init: {
    // Build void __kmpc_spmd_kernel_init(kmp_int32 thread_limit,
    // short RequiresOMPRuntime, short RequiresDataSharing);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_init");
    break;
  }
  case OMPRTL_NVPTX__kmpc_spmd_kernel_deinit: {
    // Build void __kmpc_spmd_kernel_deinit();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_spmd_kernel_deinit");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_prepare_parallel: {
    /// Build void __kmpc_kernel_prepare_parallel(
    /// void *outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrTy};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_prepare_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_parallel: {
    /// Build bool __kmpc_kernel_parallel(void **outlined_function);
    llvm::Type *TypeParams[] = {CGM.Int8PtrPtrTy};
    llvm::Type *RetTy = CGM.getTypes().ConvertType(CGM.getContext().BoolTy);
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(RetTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_kernel_end_parallel: {
    /// Build void __kmpc_kernel_end_parallel();
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, llvm::None, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_kernel_end_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_serialized_parallel: {
    // Build void __kmpc_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_serialized_parallel: {
    // Build void __kmpc_end_serialized_parallel(ident_t *loc, kmp_int32
    // global_tid);
    llvm::Type *TypeParams[] = {getIdentTyPointerTy(), CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_end_serialized_parallel");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int32: {
    // Build int32_t __kmpc_shuffle_int32(int32_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int32Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int32");
    break;
  }
  case OMPRTL_NVPTX__kmpc_shuffle_int64: {
    // Build int64_t __kmpc_shuffle_int64(int64_t element,
    // int16_t lane_offset, int16_t warp_size);
    llvm::Type *TypeParams[] = {CGM.Int64Ty, CGM.Int16Ty, CGM.Int16Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int64Ty, TypeParams, /*isVarArg*/ false);
    RTLFn = CGM.CreateRuntimeFunction(FnTy, "__kmpc_shuffle_int64");
    break;
  }
  case OMPRTL_NVPTX__kmpc_parallel_reduce_nowait: {
    // Build int32_t kmpc_nvptx_parallel_reduce_nowait(kmp_int32 global_tid,
    // kmp_int32 num_vars, size_t reduce_size, void* reduce_data,
    // void (*kmp_ShuffleReductFctPtr)(void *rhsData, int16_t lane_id, int16_t
    // lane_offset, int16_t Algorithm Version),
    // void (*kmp_InterWarpCopyFctPtr)(void* src, int warp_num));
    llvm::Type *ShuffleReduceTypeParams[] = {CGM.VoidPtrTy, CGM.Int16Ty,
                                             CGM.Int16Ty, CGM.Int16Ty};
    auto *ShuffleReduceFnTy =
        llvm::FunctionType::get(CGM.VoidTy, ShuffleReduceTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *InterWarpCopyTypeParams[] = {CGM.VoidPtrTy, CGM.Int32Ty};
    auto *InterWarpCopyFnTy =
        llvm::FunctionType::get(CGM.VoidTy, InterWarpCopyTypeParams,
                                /*isVarArg=*/false);
    llvm::Type *TypeParams[] = {CGM.Int32Ty,
                                CGM.Int32Ty,
                                CGM.SizeTy,
                                CGM.VoidPtrTy,
                                ShuffleReduceFnTy->getPointerTo(),
                                InterWarpCopyFnTy->getPointerTo()};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.Int32Ty, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_parallel_reduce_nowait");
    break;
  }
  case OMPRTL_NVPTX__kmpc_end_reduce_nowait: {
    // Build __kmpc_end_reduce_nowait(kmp_int32 global_tid);
    llvm::Type *TypeParams[] = {CGM.Int32Ty};
    llvm::FunctionType *FnTy =
        llvm::FunctionType::get(CGM.VoidTy, TypeParams, /*isVarArg=*/false);
    RTLFn = CGM.CreateRuntimeFunction(
        FnTy, /*Name=*/"__kmpc_nvptx_end_reduce_nowait");
    break;
  }
  }
  return RTLFn;
}

void CGOpenMPRuntimeNVPTX::createOffloadEntry(llvm::Constant *ID,
                                              llvm::Constant *Addr,
                                              uint64_t Size, int32_t) {
  auto *F = dyn_cast<llvm::Function>(Addr);
  // TODO: Add support for global variables on the device after declare target
  // support.
  if (!F)
    return;
  llvm::Module *M = F->getParent();
  llvm::LLVMContext &Ctx = M->getContext();

  // Get "nvvm.annotations" metadata node
  llvm::NamedMDNode *MD = M->getOrInsertNamedMetadata("nvvm.annotations");

  llvm::Metadata *MDVals[] = {
      llvm::ConstantAsMetadata::get(F), llvm::MDString::get(Ctx, "kernel"),
      llvm::ConstantAsMetadata::get(
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 1))};
  // Append metadata to nvvm.annotations
  MD->addOperand(llvm::MDNode::get(Ctx, MDVals));
}

void CGOpenMPRuntimeNVPTX::emitTargetOutlinedFunction(
    const OMPExecutableDirective &D, StringRef ParentName,
    llvm::Function *&OutlinedFn, llvm::Constant *&OutlinedFnID,
    bool IsOffloadEntry, const RegionCodeGenTy &CodeGen) {
  if (!IsOffloadEntry) // Nothing to do.
    return;

  assert(!ParentName.empty() && "Invalid target region parent name!");

  CGOpenMPRuntimeNVPTX::ExecutionMode Mode =
      getExecutionModeForDirective(CGM, D);
  switch (Mode) {
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Generic:
    emitGenericKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                      CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Spmd:
    emitSpmdKernel(D, ParentName, OutlinedFn, OutlinedFnID, IsOffloadEntry,
                   CodeGen);
    break;
  case CGOpenMPRuntimeNVPTX::ExecutionMode::Unknown:
    llvm_unreachable(
        "Unknown programming model for OpenMP directive on NVPTX target.");
  }

  setPropertyExecutionMode(CGM, OutlinedFn->getName(), Mode);
}

CGOpenMPRuntimeNVPTX::CGOpenMPRuntimeNVPTX(CodeGenModule &CGM)
    : CGOpenMPRuntime(CGM), CurrentExecutionMode(ExecutionMode::Unknown) {
  if (!CGM.getLangOpts().OpenMPIsDevice)
    llvm_unreachable("OpenMP NVPTX can only handle device code.");
}

void CGOpenMPRuntimeNVPTX::emitProcBindClause(CodeGenFunction &CGF,
                                              OpenMPProcBindClauseKind ProcBind,
                                              SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitProcBindClause(CGF, ProcBind, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumThreadsClause(CodeGenFunction &CGF,
                                                llvm::Value *NumThreads,
                                                SourceLocation Loc) {
  // Do nothing in case of Spmd mode and L0 parallel.
  // TODO: If in Spmd mode and L1 parallel emit the clause.
  if (isInSpmdExecutionMode())
    return;

  CGOpenMPRuntime::emitNumThreadsClause(CGF, NumThreads, Loc);
}

void CGOpenMPRuntimeNVPTX::emitNumTeamsClause(CodeGenFunction &CGF,
                                              const Expr *NumTeams,
                                              const Expr *ThreadLimit,
                                              SourceLocation Loc) {}

llvm::Value *CGOpenMPRuntimeNVPTX::emitParallelOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {
  return CGOpenMPRuntime::emitParallelOutlinedFunction(D, ThreadIDVar,
                                                       InnermostKind, CodeGen);
}

llvm::Value *CGOpenMPRuntimeNVPTX::emitTeamsOutlinedFunction(
    const OMPExecutableDirective &D, const VarDecl *ThreadIDVar,
    OpenMPDirectiveKind InnermostKind, const RegionCodeGenTy &CodeGen) {

  llvm::Value *OutlinedFunVal = CGOpenMPRuntime::emitTeamsOutlinedFunction(
      D, ThreadIDVar, InnermostKind, CodeGen);
  llvm::Function *OutlinedFun = cast<llvm::Function>(OutlinedFunVal);
  OutlinedFun->removeFnAttr(llvm::Attribute::NoInline);
  OutlinedFun->addFnAttr(llvm::Attribute::AlwaysInline);

  return OutlinedFun;
}

void CGOpenMPRuntimeNVPTX::emitTeamsCall(CodeGenFunction &CGF,
                                         const OMPExecutableDirective &D,
                                         SourceLocation Loc,
                                         llvm::Value *OutlinedFn,
                                         ArrayRef<llvm::Value *> CapturedVars) {
  if (!CGF.HaveInsertPoint())
    return;

  Address ZeroAddr =
      CGF.CreateTempAlloca(CGF.Int32Ty, CharUnits::fromQuantity(4),
                           /*Name*/ ".zero.addr");
  CGF.InitTempAlloca(ZeroAddr, CGF.Builder.getInt32(/*C*/ 0));
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.push_back(ZeroAddr.getPointer());
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

void CGOpenMPRuntimeNVPTX::emitParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  if (!CGF.HaveInsertPoint())
    return;

  if (isInSpmdExecutionMode())
    emitSpmdParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
  else
    emitGenericParallelCall(CGF, Loc, OutlinedFn, CapturedVars, IfCond);
}

void CGOpenMPRuntimeNVPTX::emitGenericParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  llvm::Function *Fn = cast<llvm::Function>(OutlinedFn);

  auto &&L0ParallelGen = [this, Fn](CodeGenFunction &CGF, PrePostActionTy &) {
    CGBuilderTy &Bld = CGF.Builder;

    // Prepare for parallel region. Indicate the outlined function.
    llvm::Value *Args[] = {Bld.CreateBitOrPointerCast(Fn, CGM.Int8PtrTy)};
    CGF.EmitRuntimeCall(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_kernel_prepare_parallel),
        Args);

    // Activate workers. This barrier is used by the master to signal
    // work for the workers.
    syncCTAThreads(CGF);

    // OpenMP [2.5, Parallel Construct, p.49]
    // There is an implied barrier at the end of a parallel region. After the
    // end of a parallel region, only the master thread of the team resumes
    // execution of the enclosing task region.
    //
    // The master waits at this barrier until all workers are done.
    syncCTAThreads(CGF);

    // Remember for post-processing in worker loop.
    Work.push_back(Fn);
  };

  auto *RTLoc = emitUpdateLocation(CGF, Loc);
  auto *ThreadID = getThreadID(CGF, Loc);
  llvm::Value *Args[] = {RTLoc, ThreadID};

  auto &&SeqGen = [this, Fn, &CapturedVars, &Args](CodeGenFunction &CGF,
                                                   PrePostActionTy &) {
    auto &&CodeGen = [this, Fn, &CapturedVars](CodeGenFunction &CGF,
                                               PrePostActionTy &Action) {
      Action.Enter(CGF);

      llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.push_back(
          llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
      OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
      CGF.EmitCallOrInvoke(Fn, OutlinedFnArgs);
    };

    RegionCodeGenTy RCG(CodeGen);
    NVPTXActionTy Action(
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_serialized_parallel),
        Args,
        createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_serialized_parallel),
        Args);
    RCG.setAction(Action);
    RCG(CGF);
  };

  if (IfCond)
    emitOMPIfClause(CGF, IfCond, L0ParallelGen, SeqGen);
  else {
    CodeGenFunction::RunCleanupsScope Scope(CGF);
    RegionCodeGenTy ThenRCG(L0ParallelGen);
    ThenRCG(CGF);
  }
}

void CGOpenMPRuntimeNVPTX::emitSpmdParallelCall(
    CodeGenFunction &CGF, SourceLocation Loc, llvm::Value *OutlinedFn,
    ArrayRef<llvm::Value *> CapturedVars, const Expr *IfCond) {
  // Just call the outlined function to execute the parallel region.
  // OutlinedFn(&GTid, &zero, CapturedStruct);
  //
  // TODO: Do something with IfCond when support for the 'if' clause
  // is added on Spmd target directives.
  llvm::SmallVector<llvm::Value *, 16> OutlinedFnArgs;
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.push_back(
      llvm::ConstantPointerNull::get(CGM.Int32Ty->getPointerTo()));
  OutlinedFnArgs.append(CapturedVars.begin(), CapturedVars.end());
  CGF.EmitCallOrInvoke(OutlinedFn, OutlinedFnArgs);
}

/// This function creates calls to one of two shuffle functions to copy
/// variables between lanes in a warp.
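/// Values of at most 4 bytes are moved with __kmpc_shuffle_int32; larger
/// values (up to 8 bytes) use __kmpc_shuffle_int64.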
static llvm::Value *createRuntimeShuffleFunction(CodeGenFunction &CGF,
                                                 QualType ElemTy,
                                                 llvm::Value *Elem,
                                                 llvm::Value *Offset) {
  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;
  CGOpenMPRuntimeNVPTX &RT =
      *(static_cast<CGOpenMPRuntimeNVPTX *>(&CGM.getOpenMPRuntime()));

  unsigned Size = CGM.getContext().getTypeSizeInChars(ElemTy).getQuantity();
  assert(Size <= 8 && "Unsupported bitwidth in shuffle instruction.");

  OpenMPRTLFunctionNVPTX ShuffleFn = Size <= 4
                                         ? OMPRTL_NVPTX__kmpc_shuffle_int32
                                         : OMPRTL_NVPTX__kmpc_shuffle_int64;

  // Cast all types to 32- or 64-bit values before calling shuffle routines.
  auto CastTy = Size <= 4 ? CGM.Int32Ty : CGM.Int64Ty;
  auto *ElemCast = Bld.CreateSExtOrBitCast(Elem, CastTy);
  auto *WarpSize = CGF.EmitScalarConversion(
      getNVPTXWarpSize(CGF), C.getIntTypeForBitwidth(32, /* Signed */ true),
      C.getIntTypeForBitwidth(16, /* Signed */ true), SourceLocation());

  auto *ShuffledVal =
      CGF.EmitRuntimeCall(RT.createNVPTXRuntimeFunction(ShuffleFn),
                          {ElemCast, Offset, WarpSize});

  return Bld.CreateTruncOrBitCast(ShuffledVal, CGF.ConvertTypeForMem(ElemTy));
}

namespace {
enum CopyAction : unsigned {
  // RemoteLaneToThread: Copy over a Reduce list from a remote lane in
  // the warp using shuffle instructions.
  RemoteLaneToThread,
  // ThreadCopy: Make a copy of a Reduce list on the thread's stack.
  ThreadCopy,
};
} // namespace

/// Emit instructions to copy a Reduce list, which contains partially
/// aggregated values, in the specified direction.
static void emitReductionListCopy(CopyAction Action, CodeGenFunction &CGF,
                                  QualType ReductionArrayTy,
                                  ArrayRef<const Expr *> Privates,
                                  Address SrcBase, Address DestBase,
                                  llvm::Value *RemoteLaneOffset = nullptr) {

  auto &CGM = CGF.CGM;
  auto &C = CGM.getContext();
  auto &Bld = CGF.Builder;

  // Iterates, element-by-element, through the source Reduce list and
  // make a copy.
  unsigned Idx = 0;
  for (auto &Private : Privates) {
    Address SrcElementAddr = Address::invalid();
    Address DestElementAddr = Address::invalid();
    Address DestElementPtrAddr = Address::invalid();
    // Should we shuffle in an element from a remote lane?
    bool ShuffleInElement = false;
    // Set to true to update the pointer in the dest Reduce list to a
    // newly created element.
    bool UpdateDestListPtr = false;

    switch (Action) {
    case RemoteLaneToThread: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Create a temporary to store the element in the destination
      // Reduce list.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      DestElementAddr =
          CGF.CreateMemTemp(Private->getType(), ".omp.reduction.element");
      ShuffleInElement = true;
      UpdateDestListPtr = true;
      break;
    }
    case ThreadCopy: {
      // Step 1.1: Get the address for the src element in the Reduce list.
      Address SrcElementPtrAddr =
          Bld.CreateConstArrayGEP(SrcBase, Idx, CGF.getPointerSize());
      llvm::Value *SrcElementPtrPtr = CGF.EmitLoadOfScalar(
          SrcElementPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
      SrcElementAddr =
          Address(SrcElementPtrPtr, C.getTypeAlignInChars(Private->getType()));

      // Step 1.2: Get the address for dest element. The destination
      // element has already been created on the thread's stack.
      DestElementPtrAddr =
          Bld.CreateConstArrayGEP(DestBase, Idx, CGF.getPointerSize());
      llvm::Value *DestElementPtr =
          CGF.EmitLoadOfScalar(DestElementPtrAddr, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation());
      Address DestElemAddr =
          Address(DestElementPtr, C.getTypeAlignInChars(Private->getType()));
      DestElementAddr = Bld.CreateElementBitCast(
          DestElemAddr, CGF.ConvertTypeForMem(Private->getType()));
      break;
    }
    }

    // Regardless of src and dest of copy, we emit the load of src
    // element as this is required in all directions
    SrcElementAddr = Bld.CreateElementBitCast(
        SrcElementAddr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *Elem =
        CGF.EmitLoadOfScalar(SrcElementAddr, /*Volatile=*/false,
                             Private->getType(), SourceLocation());

    // Now that all active lanes have read the element in the
    // Reduce list, shuffle over the value from the remote lane.
    if (ShuffleInElement) {
      Elem = createRuntimeShuffleFunction(CGF, Private->getType(), Elem,
                                          RemoteLaneOffset);
    }

    // Store the source element value to the dest element address.
    CGF.EmitStoreOfScalar(Elem, DestElementAddr, /*Volatile=*/false,
                          Private->getType());

    // Step 3.1: Modify reference in dest Reduce list as needed.
    // Modifying the reference in Reduce list to point to the newly
    // created element. The element is live in the current function
    // scope and that of functions it invokes (i.e., reduce_function).
    // RemoteReduceData[i] = (void*)&RemoteElem
    if (UpdateDestListPtr) {
      CGF.EmitStoreOfScalar(Bld.CreatePointerBitCastOrAddrSpaceCast(
                                DestElementAddr.getPointer(), CGF.VoidPtrTy),
                            DestElementPtrAddr, /*Volatile=*/false,
                            C.VoidPtrTy);
    }

    Idx++;
  }
}

/// This function emits a helper that gathers Reduce lists from the first
/// lane of every active warp to lanes in the first warp.
///
/// void inter_warp_copy_func(void* reduce_data, num_warps)
///   shared smem[warp_size];
///   For all data entries D in reduce_data:
///     If (I am the first lane in each warp)
///       Copy my local D to smem[warp_id]
///     sync
///     if (I am the first warp)
///       Copy smem[thread_id] to my local D
///     sync
static llvm::Value *emitInterWarpCopyFunction(CodeGenModule &CGM,
                                              ArrayRef<const Expr *> Privates,
                                              QualType ReductionArrayTy) {
  auto &C = CGM.getContext();
  auto &M = CGM.getModule();

  // ReduceList: thread local Reduce list.
  // At the stage of the computation when this function is called, partially
  // aggregated values reside in the first lane of every active warp.
  ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
                                  /*Id=*/nullptr, C.VoidPtrTy);
  // NumWarps: number of warps active in the parallel region. This could
  // be smaller than 32 (max warps in a CTA) for partial block reduction.
  ImplicitParamDecl NumWarpsArg(C, /*DC=*/nullptr, SourceLocation(),
                                /*Id=*/nullptr,
                                C.getIntTypeForBitwidth(32, /* Signed */ true));
  FunctionArgList Args;
  Args.push_back(&ReduceListArg);
  Args.push_back(&NumWarpsArg);

  auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
  auto *Fn = llvm::Function::Create(
      CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
      "_omp_reduction_inter_warp_copy_func", &CGM.getModule());
  CGM.SetInternalFunctionAttributes(/*DC=*/nullptr, Fn, CGFI);
  CodeGenFunction CGF(CGM);
  // We don't need debug information in this function as nothing here refers to
  // user code.
  CGF.disableDebugInfo();
  CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);

  auto &Bld = CGF.Builder;

  // This array is used as a medium to transfer, one reduce element at a time,
  // the data from the first lane of every warp to lanes in the first warp
  // in order to perform the final step of a reduction in a parallel region
  // (reduction across warps). The array is placed in NVPTX __shared__ memory
  // for reduced latency, as well as to have a distinct copy for concurrently
  // executing target regions. The array is declared with common linkage so
  // as to be shared across compilation units.
  const char *TransferMediumName =
      "__openmp_nvptx_data_transfer_temporary_storage";
  llvm::GlobalVariable *TransferMedium =
      M.getGlobalVariable(TransferMediumName);
  if (!TransferMedium) {
    auto *Ty = llvm::ArrayType::get(CGM.Int64Ty, WarpSize);
    unsigned SharedAddressSpace = C.getTargetAddressSpace(LangAS::cuda_shared);
    TransferMedium = new llvm::GlobalVariable(
        M, Ty,
        /*isConstant=*/false, llvm::GlobalVariable::CommonLinkage,
        llvm::Constant::getNullValue(Ty), TransferMediumName,
        /*InsertBefore=*/nullptr, llvm::GlobalVariable::NotThreadLocal,
        SharedAddressSpace);
  }

  // Get the CUDA thread id of the current OpenMP thread on the GPU.
  auto *ThreadID = getNVPTXThreadID(CGF);
  // nvptx_lane_id = nvptx_id % warpsize
  auto *LaneID = getNVPTXLaneID(CGF);
  // nvptx_warp_id = nvptx_id / warpsize
  auto *WarpID = getNVPTXWarpID(CGF);

  Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
  Address LocalReduceList(
      Bld.CreatePointerBitCastOrAddrSpaceCast(
          CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
                               C.VoidPtrTy, SourceLocation()),
          CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
      CGF.getPointerAlign());

  unsigned Idx = 0;
  for (auto &Private : Privates) {
    //
    // Warp master copies reduce element to transfer medium in __shared__
    // memory.
    //
    llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");

    // if (lane_id == 0)
    auto IsWarpMaster =
        Bld.CreateICmpEQ(LaneID, Bld.getInt32(0), "warp_master");
    Bld.CreateCondBr(IsWarpMaster, ThenBB, ElseBB);
    CGF.EmitBlock(ThenBB);

    // Reduce element = LocalReduceList[i]
    Address ElemPtrPtrAddr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *ElemPtrPtr = CGF.EmitLoadOfScalar(
        ElemPtrPtrAddr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    // elemptr = (type[i]*)(elemptrptr)
    Address ElemPtr =
        Address(ElemPtrPtr, C.getTypeAlignInChars(Private->getType()));
    ElemPtr = Bld.CreateElementBitCast(
        ElemPtr, CGF.ConvertTypeForMem(Private->getType()));
    // elem = *elemptr
    llvm::Value *Elem = CGF.EmitLoadOfScalar(
        ElemPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // Get pointer to location in transfer medium.
    // MediumPtr = &medium[warp_id]
    llvm::Value *MediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), WarpID});
    Address MediumPtr(MediumPtrVal, C.getTypeAlignInChars(Private->getType()));
    // Casting to actual data type.
    // MediumPtr = (type[i]*)MediumPtrAddr;
    MediumPtr = Bld.CreateElementBitCast(
        MediumPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *MediumPtr = elem
    Bld.CreateStore(Elem, MediumPtr);

    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(ElseBB);
    Bld.CreateBr(MergeBB);

    CGF.EmitBlock(MergeBB);

    Address AddrNumWarpsArg = CGF.GetAddrOfLocalVar(&NumWarpsArg);
    llvm::Value *NumWarpsVal = CGF.EmitLoadOfScalar(
        AddrNumWarpsArg, /*Volatile=*/false, C.IntTy, SourceLocation());

    auto *NumActiveThreads = Bld.CreateNSWMul(
        NumWarpsVal, getNVPTXWarpSize(CGF), "num_active_threads");
    // named_barrier_sync(ParallelBarrierID, num_active_threads)
    syncParallelThreads(CGF, NumActiveThreads);

    //
    // Warp 0 copies reduce element from transfer medium.
    //
    llvm::BasicBlock *W0ThenBB = CGF.createBasicBlock("then");
    llvm::BasicBlock *W0ElseBB = CGF.createBasicBlock("else");
    llvm::BasicBlock *W0MergeBB = CGF.createBasicBlock("ifcont");

    // Up to 32 threads in warp 0 are active.
    auto IsActiveThread =
        Bld.CreateICmpULT(ThreadID, NumWarpsVal, "is_active_thread");
    Bld.CreateCondBr(IsActiveThread, W0ThenBB, W0ElseBB);

    CGF.EmitBlock(W0ThenBB);

    // SrcMediumPtr = &medium[tid]
    llvm::Value *SrcMediumPtrVal = Bld.CreateInBoundsGEP(
        TransferMedium, {llvm::Constant::getNullValue(CGM.Int64Ty), ThreadID});
    Address SrcMediumPtr(SrcMediumPtrVal,
                         C.getTypeAlignInChars(Private->getType()));
    // SrcMediumVal = *SrcMediumPtr;
    SrcMediumPtr = Bld.CreateElementBitCast(
        SrcMediumPtr, CGF.ConvertTypeForMem(Private->getType()));
    llvm::Value *SrcMediumValue = CGF.EmitLoadOfScalar(
        SrcMediumPtr, /*Volatile=*/false, Private->getType(), SourceLocation());

    // TargetElemPtr = (type[i]*)(SrcDataAddr[i])
    Address TargetElemPtrPtr =
        Bld.CreateConstArrayGEP(LocalReduceList, Idx, CGF.getPointerSize());
    llvm::Value *TargetElemPtrVal = CGF.EmitLoadOfScalar(
        TargetElemPtrPtr, /*Volatile=*/false, C.VoidPtrTy, SourceLocation());
    Address TargetElemPtr =
        Address(TargetElemPtrVal, C.getTypeAlignInChars(Private->getType()));
    TargetElemPtr = Bld.CreateElementBitCast(
        TargetElemPtr, CGF.ConvertTypeForMem(Private->getType()));

    // *TargetElemPtr = SrcMediumVal;
    CGF.EmitStoreOfScalar(SrcMediumValue, TargetElemPtr, /*Volatile=*/false,
                          Private->getType());
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0ElseBB);
    Bld.CreateBr(W0MergeBB);

    CGF.EmitBlock(W0MergeBB);

    // While warp 0 copies values from transfer medium, all other warps must
    // wait.
    syncParallelThreads(CGF, NumActiveThreads);
    Idx++;
  }

  CGF.FinishFunction();
  return Fn;
}
1271
1272/// Emit a helper that reduces data across two OpenMP threads (lanes)
1273/// in the same warp. It uses shuffle instructions to copy over data from
1274/// a remote lane's stack. The reduction algorithm performed is specified
1275/// by the fourth parameter.
1276///
1277/// Algorithm Versions.
1278/// Full Warp Reduce (argument value 0):
1279/// This algorithm assumes that all 32 lanes are active and gathers
1280/// data from these 32 lanes, producing a single resultant value.
1281/// Contiguous Partial Warp Reduce (argument value 1):
1282/// This algorithm assumes that only a *contiguous* subset of lanes
1283/// are active. This happens for the last warp in a parallel region
1284/// when the user-specified num_threads is not an integer multiple of
1285/// 32. This contiguous subset always starts with the zeroth lane.
1286/// Partial Warp Reduce (argument value 2):
1287/// This algorithm gathers data from any number of lanes at any position.
1288/// All reduced values are stored in the lowest possible lane. The set
1289/// of problems every algorithm addresses is a superset of those
1290/// addressable by algorithms with a lower version number. Overhead
1291/// increases as algorithm version increases.
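///
///   For instance, a parallel region with num_threads(40) leaves the second
///   warp with only its first eight lanes active: that warp uses version 1
///   while the fully populated first warp uses version 0.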
1292///
1293/// Terminology
1294/// Reduce element:
1295/// Reduce element refers to the individual data field with primitive
1296/// data types to be combined and reduced across threads.
1297/// Reduce list:
1298/// Reduce list refers to a collection of local, thread-private
1299/// reduce elements.
1300/// Remote Reduce list:
1301/// Remote Reduce list refers to a collection of remote (relative to
1302/// the current thread) reduce elements.
1303///
1304/// We distinguish between three states of threads that are important to
1305/// the implementation of this function.
1306/// Alive threads:
1307/// Threads in a warp executing the SIMT instruction, as distinguished from
1308/// threads that are inactive due to divergent control flow.
1309/// Active threads:
1310/// The minimal set of threads that has to be alive upon entry to this
1311/// function. The computation is correct iff active threads are alive.
1312/// Some threads are alive but they are not active because they do not
1313/// contribute to the computation in any useful manner. Turning them off
1314/// may introduce control flow overheads without any tangible benefits.
1315/// Effective threads:
1316/// In order to comply with the argument requirements of the shuffle
1317/// function, we must keep all lanes holding data alive. But at most
1318/// half of them perform value aggregation; we refer to this half of
1319/// threads as effective. The other half simply hands off its
1320/// data.
1321///
1322/// Procedure
1323/// Value shuffle:
1324/// In this step active threads transfer data from higher lane positions
1325/// in the warp to lower lane positions, creating Remote Reduce list.
1326/// Value aggregation:
1327/// In this step, effective threads combine their thread local Reduce list
1328/// with Remote Reduce list and store the result in the thread local
1329/// Reduce list.
1330/// Value copy:
1331/// In this step, we deal with the assumption made by algorithm 2
1332/// (i.e. contiguity assumption). When we have an odd number of lanes
1333/// active, say 2k+1, only k threads will be effective and therefore k
1334/// new values will be produced. However, the Reduce list owned by the
1335/// (2k+1)th thread is ignored in the value aggregation. Therefore
1336/// we copy the Reduce list from the (2k+1)th lane to the (k+1)th lane so
1337/// that the contiguity assumption still holds.
1338static llvm::Value *
1339emitShuffleAndReduceFunction(CodeGenModule &CGM,
1340 ArrayRef<const Expr *> Privates,
1341 QualType ReductionArrayTy, llvm::Value *ReduceFn) {
1342 auto &C = CGM.getContext();
1343
1344 // Thread local Reduce list used to host the values of data to be reduced.
1345 ImplicitParamDecl ReduceListArg(C, /*DC=*/nullptr, SourceLocation(),
1346 /*Id=*/nullptr, C.VoidPtrTy);
1347 // Current lane id; could be logical.
1348 ImplicitParamDecl LaneIDArg(C, /*DC=*/nullptr, SourceLocation(),
1349 /*Id=*/nullptr, C.ShortTy);
1350 // Offset of the remote source lane relative to the current lane.
1351 ImplicitParamDecl RemoteLaneOffsetArg(C, /*DC=*/nullptr, SourceLocation(),
1352 /*Id=*/nullptr, C.ShortTy);
1353 // Algorithm version. This is expected to be known at compile time.
1354 ImplicitParamDecl AlgoVerArg(C, /*DC=*/nullptr, SourceLocation(),
1355 /*Id=*/nullptr, C.ShortTy);
1356 FunctionArgList Args;
1357 Args.push_back(&ReduceListArg);
1358 Args.push_back(&LaneIDArg);
1359 Args.push_back(&RemoteLaneOffsetArg);
1360 Args.push_back(&AlgoVerArg);
1361
1362 auto &CGFI = CGM.getTypes().arrangeBuiltinFunctionDeclaration(C.VoidTy, Args);
1363 auto *Fn = llvm::Function::Create(
1364 CGM.getTypes().GetFunctionType(CGFI), llvm::GlobalValue::InternalLinkage,
1365 "_omp_reduction_shuffle_and_reduce_func", &CGM.getModule());
1366 CGM.SetInternalFunctionAttributes(/*D=*/nullptr, Fn, CGFI);
1367 CodeGenFunction CGF(CGM);
1368 // We don't need debug information in this function as nothing here refers to
1369 // user code.
1370 CGF.disableDebugInfo();
1371 CGF.StartFunction(GlobalDecl(), C.VoidTy, Fn, CGFI, Args);
1372
1373 auto &Bld = CGF.Builder;
1374
1375 Address AddrReduceListArg = CGF.GetAddrOfLocalVar(&ReduceListArg);
1376 Address LocalReduceList(
1377 Bld.CreatePointerBitCastOrAddrSpaceCast(
1378 CGF.EmitLoadOfScalar(AddrReduceListArg, /*Volatile=*/false,
1379 C.VoidPtrTy, SourceLocation()),
1380 CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo()),
1381 CGF.getPointerAlign());
1382
1383 Address AddrLaneIDArg = CGF.GetAddrOfLocalVar(&LaneIDArg);
1384 llvm::Value *LaneIDArgVal = CGF.EmitLoadOfScalar(
1385 AddrLaneIDArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1386
1387 Address AddrRemoteLaneOffsetArg = CGF.GetAddrOfLocalVar(&RemoteLaneOffsetArg);
1388 llvm::Value *RemoteLaneOffsetArgVal = CGF.EmitLoadOfScalar(
1389 AddrRemoteLaneOffsetArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1390
1391 Address AddrAlgoVerArg = CGF.GetAddrOfLocalVar(&AlgoVerArg);
1392 llvm::Value *AlgoVerArgVal = CGF.EmitLoadOfScalar(
1393 AddrAlgoVerArg, /*Volatile=*/false, C.ShortTy, SourceLocation());
1394
1395 // Create a local thread-private variable to host the Reduce list
1396 // from a remote lane.
1397 Address RemoteReduceList =
1398 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.remote_reduce_list");
1399
1400 // This loop iterates through the list of reduce elements and copies,
1401 // element by element, from a remote lane in the warp to RemoteReduceList,
1402 // hosted on the thread's stack.
1403 emitReductionListCopy(RemoteLaneToThread, CGF, ReductionArrayTy, Privates,
1404 LocalReduceList, RemoteReduceList,
1405 RemoteLaneOffsetArgVal);
1406
1407 // The action to be performed on the Remote Reduce list depends on the
1408 // algorithm version.
1409 //
1410 // if (AlgoVer==0) || (AlgoVer==1 && (LaneId < Offset)) || (AlgoVer==2 &&
1411 // LaneId % 2 == 0 && Offset > 0):
1412 // do the reduction value aggregation
1413 //
1414 // The thread local variable Reduce list is mutated in place to host the
1415 // reduced data, which is the aggregated value produced from local and
1416 // remote lanes.
1417 //
1418 // Note that AlgoVer is expected to be a constant integer known at compile
1419 // time.
1420 // When AlgoVer==0, the first conjunction evaluates to true, making
1421 // the entire predicate true at compile time.
1422 // When AlgoVer==1, only the second part of the second conjunction
1423 // requires a run-time check; the other conjunctions evaluate to false
1424 // at compile time.
1425 // When AlgoVer==2, only the second part of the third conjunction
1426 // requires a run-time check; the other conjunctions evaluate to false
1427 // at compile time.
1428 auto CondAlgo0 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(0));
1429
1430 auto Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1431 auto CondAlgo1 = Bld.CreateAnd(
1432 Algo1, Bld.CreateICmpULT(LaneIDArgVal, RemoteLaneOffsetArgVal));
1433
1434 auto Algo2 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(2));
1435 auto CondAlgo2 = Bld.CreateAnd(
1436 Algo2,
1437 Bld.CreateICmpEQ(Bld.CreateAnd(LaneIDArgVal, Bld.getInt16(1)),
1438 Bld.getInt16(0)));
1439 CondAlgo2 = Bld.CreateAnd(
1440 CondAlgo2, Bld.CreateICmpSGT(RemoteLaneOffsetArgVal, Bld.getInt16(0)));
1441
1442 auto CondReduce = Bld.CreateOr(CondAlgo0, CondAlgo1);
1443 CondReduce = Bld.CreateOr(CondReduce, CondAlgo2);
1444
1445 llvm::BasicBlock *ThenBB = CGF.createBasicBlock("then");
1446 llvm::BasicBlock *ElseBB = CGF.createBasicBlock("else");
1447 llvm::BasicBlock *MergeBB = CGF.createBasicBlock("ifcont");
1448 Bld.CreateCondBr(CondReduce, ThenBB, ElseBB);
1449
1450 CGF.EmitBlock(ThenBB);
1451 // reduce_function(LocalReduceList, RemoteReduceList)
1452 llvm::Value *LocalReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1453 LocalReduceList.getPointer(), CGF.VoidPtrTy);
1454 llvm::Value *RemoteReduceListPtr = Bld.CreatePointerBitCastOrAddrSpaceCast(
1455 RemoteReduceList.getPointer(), CGF.VoidPtrTy);
1456 CGF.EmitCallOrInvoke(ReduceFn, {LocalReduceListPtr, RemoteReduceListPtr});
1457 Bld.CreateBr(MergeBB);
1458
1459 CGF.EmitBlock(ElseBB);
1460 Bld.CreateBr(MergeBB);
1461
1462 CGF.EmitBlock(MergeBB);
1463
1464 // if (AlgoVer==1 && (LaneId >= Offset)) copy Remote Reduce list to local
1465 // Reduce list.
1466 Algo1 = Bld.CreateICmpEQ(AlgoVerArgVal, Bld.getInt16(1));
1467 auto CondCopy = Bld.CreateAnd(
1468 Algo1, Bld.CreateICmpUGE(LaneIDArgVal, RemoteLaneOffsetArgVal));
1469
1470 llvm::BasicBlock *CpyThenBB = CGF.createBasicBlock("then");
1471 llvm::BasicBlock *CpyElseBB = CGF.createBasicBlock("else");
1472 llvm::BasicBlock *CpyMergeBB = CGF.createBasicBlock("ifcont");
1473 Bld.CreateCondBr(CondCopy, CpyThenBB, CpyElseBB);
1474
1475 CGF.EmitBlock(CpyThenBB);
1476 emitReductionListCopy(ThreadCopy, CGF, ReductionArrayTy, Privates,
1477 RemoteReduceList, LocalReduceList);
1478 Bld.CreateBr(CpyMergeBB);
1479
1480 CGF.EmitBlock(CpyElseBB);
1481 Bld.CreateBr(CpyMergeBB);
1482
1483 CGF.EmitBlock(CpyMergeBB);
1484
1485 CGF.FinishFunction();
1486 return Fn;
1487}
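
// For reference, the helper emitted above corresponds roughly to the
// following pseudocode (a sketch; 'reduce_function' stands for the ReduceFn
// argument, other names are illustrative):
//
//   void shuffle_and_reduce(void *local_list, short lane_id,
//                           short remote_lane_offset, short algo_ver) {
//     // Gather each element from lane (lane_id + remote_lane_offset) via
//     // shuffle into a stack-local remote_list.
//     remote_list = shuffle_from_remote_lane(local_list, remote_lane_offset);
//     if (algo_ver == 0 ||
//         (algo_ver == 1 && lane_id < remote_lane_offset) ||
//         (algo_ver == 2 && lane_id % 2 == 0 && remote_lane_offset > 0))
//       reduce_function(local_list, remote_list);
//     if (algo_ver == 1 && lane_id >= remote_lane_offset)
//       local_list = remote_list;  // preserve the contiguity assumption
//   }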
1488
1489///
1490/// Design of OpenMP reductions on the GPU
1491///
1492/// Consider a typical OpenMP program with one or more reduction
1493/// clauses:
1494///
1495/// float foo;
1496/// double bar;
1497/// #pragma omp target teams distribute parallel for \
1498/// reduction(+:foo) reduction(*:bar)
1499/// for (int i = 0; i < N; i++) {
1500/// foo += A[i]; bar *= B[i];
1501/// }
1502///
1503/// where 'foo' and 'bar' are reduced across all OpenMP threads in
1504/// all teams. In our OpenMP implementation on the NVPTX device an
1505/// OpenMP team is mapped to a CUDA threadblock and OpenMP threads
1506/// within a team are mapped to CUDA threads within a threadblock.
1507/// Our goal is to efficiently aggregate values across all OpenMP
1508/// threads such that:
1509///
1510/// - the compiler and runtime are logically concise, and
1511/// - the reduction is performed efficiently in a hierarchical
1512/// manner as follows: within OpenMP threads in the same warp,
1513/// across warps in a threadblock, and finally across teams on
1514/// the NVPTX device.
1515///
1516/// Introduction to Decoupling
1517///
1518/// We would like to decouple the compiler and the runtime so that the
1519/// latter is ignorant of the reduction variables (number, data types)
1520/// and the reduction operators. This allows a simpler interface
1521/// and implementation while still attaining good performance.
1522///
1523/// Pseudocode for the aforementioned OpenMP program generated by the
1524/// compiler is as follows:
1525///
1526/// 1. Create private copies of reduction variables on each OpenMP
1527/// thread: 'foo_private', 'bar_private'
1528/// 2. Each OpenMP thread reduces the chunk of 'A' and 'B' assigned
1529/// to it and writes the result in 'foo_private' and 'bar_private'
1530/// respectively.
1531/// 3. Call the OpenMP runtime on the GPU to reduce within a team
1532/// and store the result on the team master:
1533///
1534/// __kmpc_nvptx_parallel_reduce_nowait(...,
1535/// reduceData, shuffleReduceFn, interWarpCpyFn)
1536///
1537/// where:
1538/// struct ReduceData {
1539///     float *foo;
1540/// double *bar;
1541/// } reduceData
1542/// reduceData.foo = &foo_private
1543/// reduceData.bar = &bar_private
1544///
1545/// 'shuffleReduceFn' and 'interWarpCpyFn' are pointers to two
1546/// auxiliary functions generated by the compiler that operate on
1547/// variables of type 'ReduceData'. They help the runtime perform
1548/// algorithmic steps in a data-agnostic manner.
1549///
1550/// 'shuffleReduceFn' is a pointer to a function that reduces data
1551/// of type 'ReduceData' across two OpenMP threads (lanes) in the
1552/// same warp. It takes the following arguments as input:
1553///
1554/// a. variable of type 'ReduceData' on the calling lane,
1555/// b. its lane_id,
1556/// c. an offset relative to the current lane_id to generate a
1557/// remote_lane_id. The remote lane contains the second
1558/// variable of type 'ReduceData' that is to be reduced.
1559/// d. an algorithm version parameter determining which reduction
1560/// algorithm to use.
1561///
1562/// 'shuffleReduceFn' retrieves data from the remote lane using
1563/// efficient GPU shuffle intrinsics and reduces, using the
1564/// algorithm specified by the 4th parameter, the two operands
1565/// element-wise. The result is written to the first operand.
1566///
1567/// Different reduction algorithms are implemented in different
1568/// runtime functions, all calling 'shuffleReduceFn' to perform
1569/// the essential reduction step. Therefore, based on the 4th
1570/// parameter, this function behaves slightly differently to
1571/// cooperate with the runtime to ensure correctness under
1572/// different circumstances.
1573///
1574/// 'InterWarpCpyFn' is a pointer to a function that transfers
1575/// reduced variables across warps. It tunnels, through CUDA
1576/// shared memory, the thread-private data of type 'ReduceData'
1577/// from lane 0 of each warp to a lane in the first warp.
1578/// 4. if ret == 1:
1579/// The team master of the last team stores the reduced
1580/// result to the globals in memory.
1581/// foo += reduceData.foo; bar *= reduceData.bar
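///
///    A rough sketch of the code the compiler emits for steps 3 and 4 above
///    (illustrative only; the two declarations merely show the shapes of the
///    compiler-generated helpers):
///
///      void shuffleReduceFn(void *reduce_data, int16_t lane_id,
///                           int16_t lane_offset, int16_t algo_version);
///      void interWarpCpyFn(void *reduce_data, int32_t num_warps);
///
///      int32_t ret = __kmpc_nvptx_parallel_reduce_nowait(
///          gtid, 2, sizeof(reduceData), &reduceData,
///          shuffleReduceFn, interWarpCpyFn);
///      if (ret == 1) {
///        foo += reduceData.foo; bar *= reduceData.bar;
///        __kmpc_nvptx_end_reduce_nowait(gtid);
///      }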
1582///
1583///
1584/// Warp Reduction Algorithms
1585///
1586/// On the warp level, we have three algorithms implemented in the
1587/// OpenMP runtime depending on the number of active lanes:
1588///
1589/// Full Warp Reduction
1590///
1591/// The reduce algorithm within a warp where all lanes are active
1592/// is implemented in the runtime as follows:
1593///
1594/// full_warp_reduce(void *reduce_data,
1595/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1596/// for (int offset = WARPSIZE/2; offset > 0; offset /= 2)
1597/// ShuffleReduceFn(reduce_data, 0, offset, 0);
1598/// }
1599///
1600/// The algorithm completes in log(2, WARPSIZE) steps.
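/// For example, with WARPSIZE == 32 the loop runs with offsets 16, 8, 4, 2
/// and 1, i.e. five shuffle-and-reduce steps.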
1601///
1602/// 'ShuffleReduceFn' is used here with lane_id set to 0 because it is
1603/// not used; we therefore save instructions by not retrieving lane_id
1604/// from the corresponding special registers. The 4th parameter, which
1605/// represents the version of the algorithm being used, is set to 0 to
1606/// signify full warp reduction.
1607///
1608/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1609///
1610/// #reduce_elem refers to an element in the local lane's data structure
1611/// #remote_elem is retrieved from a remote lane
1612/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1613/// reduce_elem = reduce_elem REDUCE_OP remote_elem;
1614///
1615/// Contiguous Partial Warp Reduction
1616///
1617/// This reduce algorithm is used within a warp where only the first
1618/// 'n' (n <= WARPSIZE) lanes are active. It is typically used when the
1619/// number of OpenMP threads in a parallel region is not a multiple of
1620/// WARPSIZE. The algorithm is implemented in the runtime as follows:
1621///
1622/// void
1623/// contiguous_partial_reduce(void *reduce_data,
1624/// kmp_ShuffleReductFctPtr ShuffleReduceFn,
1625/// int size, int lane_id) {
1626/// int curr_size;
1627/// int offset;
1628/// curr_size = size;
1629/// offset = curr_size/2;
1630/// while (offset>0) {
1631/// ShuffleReduceFn(reduce_data, lane_id, offset, 1);
1632/// curr_size = (curr_size+1)/2;
1633/// offset = curr_size/2;
1634/// }
1635/// }
1636///
1637/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1638///
1639/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1640/// if (lane_id < offset)
1641/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1642/// else
1643/// reduce_elem = remote_elem
1644///
1645/// This algorithm assumes that the data to be reduced are located in a
1646/// contiguous subset of lanes starting from the first. When there is
1647/// an odd number of active lanes, the data in the last lane is not
1648/// aggregated with any other lane's data but is instead copied over.
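///
/// For example, with 7 active lanes the loop runs with offsets 3, 2 and 1:
/// after the first step lanes 0-2 hold aggregated values and lane 3 holds a
/// copy of lane 6's data, after the second step the live data sits in lanes
/// 0-1, and after the final step the fully reduced value resides in lane 0.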
1649///
1650/// Dispersed Partial Warp Reduction
1651///
1652/// This algorithm is used within a warp when any discontiguous subset of
1653/// lanes is active. It is used to implement the reduction operation
1654/// across lanes in an OpenMP simd region or in a nested parallel region.
1655///
1656/// void
1657/// dispersed_partial_reduce(void *reduce_data,
1658/// kmp_ShuffleReductFctPtr ShuffleReduceFn) {
1659/// int size, remote_id;
1660/// int logical_lane_id = number_of_active_lanes_before_me() * 2;
1661/// do {
1662/// remote_id = next_active_lane_id_right_after_me();
1663/// # the above function returns 0 if no active lane
1664/// # is present right after the current lane.
1665/// size = number_of_active_lanes_in_this_warp();
1666/// logical_lane_id /= 2;
1667/// ShuffleReduceFn(reduce_data, logical_lane_id,
1668/// remote_id-1-threadIdx.x, 2);
1669/// } while (logical_lane_id % 2 == 0 && size > 1);
1670/// }
1671///
1672/// There is no assumption made about the initial state of the reduction.
1673/// Any number of lanes (>=1) could be active at any position. The reduction
1674/// result is returned in the first active lane.
1675///
1676/// In this version, 'ShuffleReduceFn' behaves, per element, as follows:
1677///
1678/// remote_elem = shuffle_down(reduce_elem, offset, WARPSIZE);
1679/// if (lane_id % 2 == 0 && offset > 0)
1680/// reduce_elem = reduce_elem REDUCE_OP remote_elem
1681/// else
1682/// reduce_elem = remote_elem
1683///
1684///
1685/// Intra-Team Reduction
1686///
1687/// This function, as implemented in the runtime call
1688/// '__kmpc_nvptx_parallel_reduce_nowait', aggregates data across OpenMP
1689/// threads in a team. It first reduces within a warp using the
1690/// aforementioned algorithms. We then proceed to gather all such
1691/// reduced values at the first warp.
1692///
1693/// The runtime makes use of the function 'InterWarpCpyFn', which copies
1694/// data from each "warp master" (the zeroth lane of each warp, where
1695/// warp-reduced data is held) to the zeroth warp. This step reduces (in
1696/// a mathematical sense) the problem of reduction across warp masters in
1697/// a block to the problem of warp reduction.
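///
/// A plausible shape for this routine, as implied by the description above
/// (a sketch only, not the actual runtime source):
///
///    int32_t __kmpc_nvptx_parallel_reduce_nowait(...) {
///      reduce within my warp (one of the three warp algorithms above);
///      InterWarpCpyFn(reduce_data, num_warps);  // warp masters -> warp 0
///      if (thread_id < num_warps)
///        reduce within warp 0 (Contiguous Partial Warp Reduction);
///      return thread_id == 0;  // 1 only on the team master
///    }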
1698///
1699void CGOpenMPRuntimeNVPTX::emitReduction(
1700 CodeGenFunction &CGF, SourceLocation Loc, ArrayRef<const Expr *> Privates,
1701 ArrayRef<const Expr *> LHSExprs, ArrayRef<const Expr *> RHSExprs,
1702 ArrayRef<const Expr *> ReductionOps, ReductionOptionsTy Options) {
1703 if (!CGF.HaveInsertPoint())
1704 return;
1705
1706 bool ParallelReduction = isOpenMPParallelDirective(Options.ReductionKind);
1707 assert(ParallelReduction && "Invalid reduction selection in emitReduction.");
1708
1709 auto &C = CGM.getContext();
1710
1711 // 1. Build a list of reduction variables.
1712 // void *RedList[<n>] = {<ReductionVars>[0], ..., <ReductionVars>[<n>-1]};
1713 auto Size = RHSExprs.size();
1714 for (auto *E : Privates) {
1715 if (E->getType()->isVariablyModifiedType())
1716 // Reserve place for array size.
1717 ++Size;
1718 }
1719 llvm::APInt ArraySize(/*unsigned int numBits=*/32, Size);
1720 QualType ReductionArrayTy =
1721 C.getConstantArrayType(C.VoidPtrTy, ArraySize, ArrayType::Normal,
1722 /*IndexTypeQuals=*/0);
1723 Address ReductionList =
1724 CGF.CreateMemTemp(ReductionArrayTy, ".omp.reduction.red_list");
1725 auto IPriv = Privates.begin();
1726 unsigned Idx = 0;
1727 for (unsigned I = 0, E = RHSExprs.size(); I < E; ++I, ++IPriv, ++Idx) {
1728 Address Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
1729 CGF.getPointerSize());
1730 CGF.Builder.CreateStore(
1731 CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1732 CGF.EmitLValue(RHSExprs[I]).getPointer(), CGF.VoidPtrTy),
1733 Elem);
1734 if ((*IPriv)->getType()->isVariablyModifiedType()) {
1735 // Store array size.
1736 ++Idx;
1737 Elem = CGF.Builder.CreateConstArrayGEP(ReductionList, Idx,
1738 CGF.getPointerSize());
1739 llvm::Value *Size = CGF.Builder.CreateIntCast(
1740 CGF.getVLASize(
1741 CGF.getContext().getAsVariableArrayType((*IPriv)->getType()))
1742 .first,
1743 CGF.SizeTy, /*isSigned=*/false);
1744 CGF.Builder.CreateStore(CGF.Builder.CreateIntToPtr(Size, CGF.VoidPtrTy),
1745 Elem);
1746 }
1747 }
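  // For instance, with one scalar and one variable-length array to reduce,
  // the emitted list has the shape (illustrative names):
  //   void *RedList[3] = {&scalar_priv, &vla_priv[0], (void *)vla_length};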
1748
1749 // 2. Emit reduce_func().
1750 auto *ReductionFn = emitReductionFunction(
1751 CGM, CGF.ConvertTypeForMem(ReductionArrayTy)->getPointerTo(), Privates,
1752 LHSExprs, RHSExprs, ReductionOps);
1753
1754 // 3. Build res = __kmpc_nvptx_parallel_reduce_nowait(<gtid>, <n>,
1755 // sizeof(RedList), RedList, shuffle_and_reduce_func, interwarp_copy_func);
1756 auto *ThreadId = getThreadID(CGF, Loc);
1757 auto *ReductionArrayTySize = CGF.getTypeSize(ReductionArrayTy);
1758 auto *RL = CGF.Builder.CreatePointerBitCastOrAddrSpaceCast(
1759 ReductionList.getPointer(), CGF.VoidPtrTy);
1760
1761 auto *ShuffleAndReduceFn = emitShuffleAndReduceFunction(
1762 CGM, Privates, ReductionArrayTy, ReductionFn);
1763 auto *InterWarpCopyFn =
1764 emitInterWarpCopyFunction(CGM, Privates, ReductionArrayTy);
1765
1766 llvm::Value *Res = nullptr;
1767 if (ParallelReduction) {
1768 llvm::Value *Args[] = {ThreadId,
1769 CGF.Builder.getInt32(RHSExprs.size()),
1770 ReductionArrayTySize,
1771 RL,
1772 ShuffleAndReduceFn,
1773 InterWarpCopyFn};
1774
1775 Res = CGF.EmitRuntimeCall(
1776 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_parallel_reduce_nowait),
1777 Args);
1778 }
1779
1780 // 5. Build switch(res)
1781 // 4. Build switch(res)
1782 auto *SwInst = CGF.Builder.CreateSwitch(Res, DefaultBB, /*NumCases=*/1);
1783
1784 // 5. Build case 1: where we have reduced values in the master
1785 // thread in each team.
1786 // __kmpc_end_reduce{_nowait}(<gtid>);
1787 // break;
1788 auto *Case1BB = CGF.createBasicBlock(".omp.reduction.case1");
1789 SwInst->addCase(CGF.Builder.getInt32(1), Case1BB);
1790 CGF.EmitBlock(Case1BB);
1791
1792 // Add emission of __kmpc_end_reduce{_nowait}(<gtid>);
1793 llvm::Value *EndArgs[] = {ThreadId};
1794 auto &&CodeGen = [&Privates, &LHSExprs, &RHSExprs, &ReductionOps,
1795 this](CodeGenFunction &CGF, PrePostActionTy &Action) {
1796 auto IPriv = Privates.begin();
1797 auto ILHS = LHSExprs.begin();
1798 auto IRHS = RHSExprs.begin();
1799 for (auto *E : ReductionOps) {
1800 emitSingleReductionCombiner(CGF, E, *IPriv, cast<DeclRefExpr>(*ILHS),
1801 cast<DeclRefExpr>(*IRHS));
1802 ++IPriv;
1803 ++ILHS;
1804 ++IRHS;
1805 }
1806 };
1807 RegionCodeGenTy RCG(CodeGen);
1808 NVPTXActionTy Action(
1809 nullptr, llvm::None,
1810 createNVPTXRuntimeFunction(OMPRTL_NVPTX__kmpc_end_reduce_nowait),
1811 EndArgs);
1812 RCG.setAction(Action);
1813 RCG(CGF);
1814 CGF.EmitBranch(DefaultBB);
1815 CGF.EmitBlock(DefaultBB, /*IsFinished=*/true);
1816}