//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
//-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include <complex.h>
#include <stdio.h>

#include "omptarget-nvptx.h"

// may eventually remove this
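// Returns 1 only when the whole thread block can reduce together: the number
// of OpenMP threads must equal blockDim.x and the calling warp must be fully
// active; otherwise returns 0.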
EXTERN
int32_t __gpu_block_reduce() {
  bool isSPMDExecutionMode = isSPMDMode();
  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  int nt =
      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
  if (nt != blockDim.x)
    return 0;
  unsigned tnum = __ACTIVEMASK();
  if (tnum != (~0x0)) // assume warpSize is 32
    return 0;
  return 1;
}

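// Return values as implemented below: 1 when this thread performs the
// reduction itself (single-thread case, or thread 0 of a block that can
// reduce as a whole), 2 when a block-wide reduction is not possible
// (presumably signalling the caller to fall back to another scheme), and 0
// for all other threads.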
EXTERN
int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
                          size_t reduce_size, void *reduce_data,
                          void *reduce_array_size, kmp_ReductFctPtr *reductFct,
                          kmp_CriticalName *lck) {
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  int numthread;
  if (currTaskDescr->IsParallelConstruct()) {
    numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
                                      checkRuntimeUninitialized(loc));
  } else {
    numthread = GetNumberOfOmpTeams();
  }

  if (numthread == 1)
    return 1;
  if (!__gpu_block_reduce())
    return 2;
  if (threadIdx.x == 0)
    return 1;
  return 0;
}

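// Only the team master (threadIdx.x == 0) returns a non-zero value (2); all
// other threads return 0 and skip the reduction.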
EXTERN
int32_t __kmpc_reduce_combined(kmp_Ident *loc) {
  return threadIdx.x == 0 ? 2 : 0;
}

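// Lane 0 of each warp returns 1 and handles the reduction for its SIMD group;
// the remaining lanes return 0.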
EXTERN
int32_t __kmpc_reduce_simd(kmp_Ident *loc) {
  return (threadIdx.x % 32 == 0) ? 1 : 0;
}

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

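// Shuffle helpers used by the compiler-generated shuffle-and-reduce
// functions: move a value down from the lane 'delta' positions higher within
// a group of 'size' lanes.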
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
}

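// 64-bit variant: split the value into two 32-bit halves with inline PTX,
// shuffle each half independently, then recombine.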
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  int lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
  hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
  lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}

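// Illustrative sketch only (not part of this runtime): a shuffle-and-reduce
// callback such as the compiler might generate for a scalar
// 'reduction(+ : int)' clause. The struct and function names are made up for
// illustration; the parameter list mirrors how kmp_ShuffleReductFctPtr is
// invoked below, and a real callback also consults lane_id and algo_version
// to decide whether to accumulate or merely copy the remote value.
//
//   struct ReduceData { int sum; };
//   static void sum_shuffle_reduce(void *data, int16_t lane_id,
//                                  int16_t lane_offset, int16_t algo_version) {
//     ReduceData *d = (ReduceData *)data;
//     // Fetch the partner lane's partial sum and fold it into ours.
//     int remote = __kmpc_shuffle_int32(d->sum, lane_offset, WARPSIZE);
//     d->sum += remote;
//   }

// Tree-reduce across a fully active warp: halve the shuffle offset each step
// so that after log2(WARPSIZE) rounds lane 0 holds the combined value.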
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

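// Reduce across a contiguous group of 'size' active lanes that need not be a
// power of two; the partial results converge onto the lane with tid == 0.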
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

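// Reduce across the active lanes of a warp when they may be dispersed (the
// L2/SIMD case): each step pairs a lane with the next higher active lane
// using the %lanemask PTX registers, and the function returns true on the
// lowest active lane, which ends up holding the result.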
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t lanemask_lt;
  uint32_t lanemask_gt;
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t Liveness = __ACTIVEMASK();
  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
  do {
    Liveness = __ACTIVEMASK();
    remote_id = __ffs(Liveness & lanemask_gt);
    size = __popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

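// Reduction across the lanes of a simd construct: a fully active warp uses
// the regular tree reduction, otherwise fall back to the irregular SIMD
// reduction for dispersed lanes.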
EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                        size_t reduce_size, void *reduce_data,
                                        kmp_ShuffleReductFctPtr shflFct,
                                        kmp_InterWarpCopyFctPtr cpyFct) {
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) {
    gpu_regular_warp_reduce(reduce_data, shflFct);
    return GetThreadIdInBlock() % WARPSIZE ==
           0; // Result on lane 0 of the simd warp.
  } else {
    return gpu_irregular_simd_reduce(
        reduce_data, shflFct); // Result on the first active lane.
  }
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(
      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the
                                       OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

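// Same as the deprecated entry point above, except that the source location
// is passed in so the execution mode can be derived from it.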
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

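// "Simple" entry points for the case where the runtime is known to be
// uninitialized: the execution mode is hard-coded instead of being queried.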
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/true,
                                      /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/false,
                                      /*isRuntimeUninitialized=*/true);
}

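// Inter-team reduction: team masters write their partial results to a global
// scratchpad, the last team to arrive (tracked with atomicInc on a global
// counter) loads and reduces all scratchpad entries, and the final value ends
// up in thread 0 of that team, which is the only thread returning 1.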
INLINE
static int32_t nvptx_teams_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      isSPMDExecutionMode
          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
                                  isRuntimeUninitialized)
          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  __shared__ volatile bool IsLastTeam;

  // Team masters of all teams write to the scratchpad.
  if (ThreadId == 0) {
    unsigned int *timestamp = GetTeamsReductionTimestamp();
    char *scratchpad = GetTeamsReductionScratchpad();

    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
    __threadfence();

    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
    // It resets 'timestamp' back to 0 once the last team increments
    // this counter.
    unsigned val = atomicInc(timestamp, NumTeams - 1);
    IsLastTeam = val == NumTeams - 1;
  }

  // We have to wait on L1 barrier because in GENERIC mode the workers
  // are waiting on barrier 0 for work.
  //
  // If we guard this barrier as follows it leads to deadlock, probably
  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

  // If this team is not the last, quit.
  if (/* Volatile read by all threads */ !IsLastTeam)
    return 0;

  //
  // Last team processing.
  //

  // Threads in excess of #teams do not participate in reduction of the
  // scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t ActiveThreads = NumThreads;
  if (NumTeams < NumThreads) {
    ActiveThreads =
        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
  }
  if (ThreadId >= ActiveThreads)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = ThreadId / WARPSIZE;

  // Reduce across warps to the warp master.
  if ((ActiveThreads % WARPSIZE == 0) ||
      (WarpId < WarpsNeeded - 1)) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
    // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/ActiveThreads % WARPSIZE,
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  if (ActiveThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#else
  if (ThreadId >= NumTeams)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  // Reduce across warps to the warp master.
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
  if (ActiveThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = ThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#endif // __CUDA_ARCH__ >= 700

  return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct,
                                         kmp_CopyToScratchpadFctPtr scratchFct,
                                         kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/true,
                                   /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/false,
                                   /*isRuntimeUninitialized=*/true);
}

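// Simplified teams reduction based on a global critical section: the team
// master spins on atomicCAS until it owns the lock named by 'crit', the
// caller performs the reduction inside the critical section, and the matching
// end routine below releases the lock.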
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
                                                       int32_t global_tid,
                                                       kmp_CriticalName *crit) {
  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
    return 0;
  // The master thread of the team actually does the reduction.
  while (atomicCAS((uint32_t *)crit, 0, 1))
    ;
  return 1;
}

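// Make the master's updates visible to all other teams before releasing the
// critical lock taken in __kmpc_nvptx_teams_reduce_nowait_simple.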
EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                            kmp_CriticalName *crit) {
  __threadfence_system();
  (void)atomicExch((uint32_t *)crit, 0);
}