//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
//-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include <complex.h>
#include <stdio.h>

#include "omptarget-nvptx.h"

// may eventually remove this
EXTERN
int32_t __gpu_block_reduce() {
  bool isSPMDExecutionMode = isSPMDMode();
  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  int nt =
      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
  if (nt != blockDim.x)
    return 0;
  unsigned tnum = __ACTIVEMASK();
  if (tnum != (~0x0)) // assume warpSize is 32
    return 0;
  return 1;
}

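// The return value appears to follow the usual __kmpc_reduce convention:
// 1 = the calling thread should combine the partial results, 2 = fall back
// to an atomic reduction, 0 = nothing more for this thread to do.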
35EXTERN
Gheorghe-Teodor Berceaad8632a2018-11-27 19:45:10 +000036int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
George Rokos0dd6ed72018-01-29 13:59:35 +000037 size_t reduce_size, void *reduce_data,
38 void *reduce_array_size, kmp_ReductFctPtr *reductFct,
39 kmp_CriticalName *lck) {
Alexey Bataevdcf2edc2019-01-04 17:09:12 +000040 int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
George Rokos0dd6ed72018-01-29 13:59:35 +000041 omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
42 int numthread;
43 if (currTaskDescr->IsParallelConstruct()) {
44 numthread =
Gheorghe-Teodor Berceaad8632a2018-11-27 19:45:10 +000045 GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
46 checkRuntimeUninitialized(loc));
George Rokos0dd6ed72018-01-29 13:59:35 +000047 } else {
48 numthread = GetNumberOfOmpTeams();
49 }
50
51 if (numthread == 1)
52 return 1;
Alexey Bataev37d41562018-07-23 13:52:12 +000053 if (!__gpu_block_reduce())
George Rokos0dd6ed72018-01-29 13:59:35 +000054 return 2;
Alexey Bataev37d41562018-07-23 13:52:12 +000055 if (threadIdx.x == 0)
56 return 1;
57 return 0;
George Rokos0dd6ed72018-01-29 13:59:35 +000058}
59
EXTERN
int32_t __kmpc_reduce_combined(kmp_Ident *loc) {
  return threadIdx.x == 0 ? 2 : 0;
}

EXTERN
int32_t __kmpc_reduce_simd(kmp_Ident *loc) {
  return (threadIdx.x % 32 == 0) ? 1 : 0;
}

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

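// Thin wrapper over the 32-bit warp shuffle-down primitive; 'delta' is the
// lane offset and 'size' the width of the logical warp segment being
// shuffled within.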
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
}

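// The shuffle primitive moves 32 bits at a time, so a 64-bit value is split
// into its low and high halves, each half is shuffled down independently,
// and the two halves are then reassembled.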
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  int lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
  hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
  lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}

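// Tree reduction over a fully active warp: the shuffle offset is halved on
// every step, so after log2(WARPSIZE) rounds lane 0 holds the combined value.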
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

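// Reduction over a contiguous group of 'size' active lanes, where 'size' need
// not be a power of two. The remaining element count is rounded up on every
// halving step so that no contribution is dropped; the result ends up on the
// lane with tid 0.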
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

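// Reduction over dispersed (non-contiguous) active lanes: on every iteration
// each remaining lane locates the closest active lane above it (via
// %lanemask_gt) and folds that lane's value into its own. Returns true only
// on the first active lane, which ends up holding the reduced value.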
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t lanemask_lt;
  uint32_t lanemask_gt;
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t Liveness = __ACTIVEMASK();
  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
  do {
    Liveness = __ACTIVEMASK();
    remote_id = __ffs(Liveness & lanemask_gt);
    size = __popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

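// Warp-level (simd) reduction entry point: a fully active warp uses the
// regular tree reduction, a partially active warp goes through the
// dispersed-lane reduction above.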
EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                        size_t reduce_size, void *reduce_data,
                                        kmp_ShuffleReductFctPtr shflFct,
                                        kmp_InterWarpCopyFctPtr cpyFct) {
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) {
    gpu_regular_warp_reduce(reduce_data, shflFct);
    return GetThreadIdInBlock() % WARPSIZE ==
           0; // Result on lane 0 of the simd warp.
  } else {
    return gpu_irregular_simd_reduce(
        reduce_data, shflFct); // Result on the first active lane.
  }
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(
      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
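  // Pre-Volta path: classify the warp by its active mask (fully active,
  // partially active but contiguous, or dispersed lanes) and pick the
  // matching warp-level reduction below.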
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/true,
                                      /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/false,
                                      /*isRuntimeUninitialized=*/true);
}

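// Teams reduction scheme: every team master copies its partial result to a
// global scratchpad and bumps a shared counter; the last team to arrive (and
// only that team) loads all scratchpad entries back, reduces them with the
// warp/block machinery above, and reports the result on its thread 0.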
INLINE
static int32_t nvptx_teams_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      isSPMDExecutionMode
          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
                                  isRuntimeUninitialized)
          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  __shared__ volatile bool IsLastTeam;

  // Team masters of all teams write to the scratchpad.
  if (ThreadId == 0) {
    unsigned int *timestamp = GetTeamsReductionTimestamp();
    char *scratchpad = GetTeamsReductionScratchpad();

    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
    __threadfence();

    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
    // It resets 'timestamp' back to 0 once the last team increments
    // this counter.
    unsigned val = atomicInc(timestamp, NumTeams - 1);
    IsLastTeam = val == NumTeams - 1;
  }

  // We have to wait on L1 barrier because in GENERIC mode the workers
  // are waiting on barrier 0 for work.
  //
  // If we guard this barrier as follows it leads to deadlock, probably
  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

  // If this team is not the last, quit.
  if (/* Volatile read by all threads */ !IsLastTeam)
    return 0;

  //
  // Last team processing.
  //

  // Threads in excess of #teams do not participate in reduction of the
  // scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t ActiveThreads = NumThreads;
  if (NumTeams < NumThreads) {
    ActiveThreads =
        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
  }
  if (ThreadId >= ActiveThreads)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = ThreadId / WARPSIZE;

  // Reduce across warps to the warp master.
  if ((ActiveThreads % WARPSIZE == 0) ||
      (WarpId < WarpsNeeded - 1)) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
    // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/ActiveThreads % WARPSIZE,
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  if (ActiveThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#else
  if (ThreadId >= NumTeams)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  // Reduce across warps to the warp master.
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
  if (ActiveThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = ThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#endif // __CUDA_ARCH__ >= 700

  return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct,
                                         kmp_CopyToScratchpadFctPtr scratchFct,
                                         kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/true,
                                   /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/false,
                                   /*isRuntimeUninitialized=*/true);
}

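// "Simple" teams reduction: serialize the participating team masters with a
// global lock. The lock in 'crit' is acquired here with an atomicCAS spin
// and released in __kmpc_nvptx_teams_end_reduce_nowait_simple below, so only
// one team at a time executes the guarded reduction code.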
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
                                                       int32_t global_tid,
                                                       kmp_CriticalName *crit) {
  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
    return 0;
  // The master thread of the team actually does the reduction.
  while (atomicCAS((uint32_t *)crit, 0, 1))
    ;
  return 1;
}

EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                            kmp_CriticalName *crit) {
  __threadfence_system();
  (void)atomicExch((uint32_t *)crit, 0);
}
