//===---- reduction.cu - NVPTX OpenMP reduction implementation ---- CUDA
//-*-===//
//
// The LLVM Compiler Infrastructure
//
// This file is dual licensed under the MIT and the University of Illinois Open
// Source Licenses. See LICENSE.txt for details.
//
//===----------------------------------------------------------------------===//
//
// This file contains the implementation of reduction with KMPC interface.
//
//===----------------------------------------------------------------------===//

#include <complex.h>
#include <stdio.h>

#include "omptarget-nvptx.h"

// may eventually remove this
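// Returns 1 only when the whole thread block can reduce together: the number
// of OpenMP threads must equal blockDim.x and the calling warp must be fully
// active; otherwise returns 0.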
EXTERN
int32_t __gpu_block_reduce() {
  bool isSPMDExecutionMode = isSPMDMode();
  int tid = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  int nt =
      GetNumberOfOmpThreads(tid, isSPMDExecutionMode, isRuntimeUninitialized());
  if (nt != blockDim.x)
    return 0;
  unsigned tnum = __ACTIVEMASK();
  if (tnum != (~0x0)) // assume warpSize is 32
    return 0;
  return 1;
}

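// Return values as implemented below: 1 when this thread performs the
// reduction itself (single-thread case, or thread 0 of a block that can
// reduce as a whole), 2 when a block-wide reduction is not possible
// (presumably signalling the caller to fall back to another scheme), and 0
// for all other threads.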
EXTERN
int32_t __kmpc_reduce_gpu(kmp_Ident *loc, int32_t global_tid, int32_t num_vars,
                          size_t reduce_size, void *reduce_data,
                          void *reduce_array_size, kmp_ReductFctPtr *reductFct,
                          kmp_CriticalName *lck) {
  int threadId = GetLogicalThreadIdInBlock(checkSPMDMode(loc));
  omptarget_nvptx_TaskDescr *currTaskDescr = getMyTopTaskDescriptor(threadId);
  int numthread;
  if (currTaskDescr->IsParallelConstruct()) {
    numthread = GetNumberOfOmpThreads(threadId, checkSPMDMode(loc),
                                      checkRuntimeUninitialized(loc));
  } else {
    numthread = GetNumberOfOmpTeams();
  }

  if (numthread == 1)
    return 1;
  if (!__gpu_block_reduce())
    return 2;
  if (threadIdx.x == 0)
    return 1;
  return 0;
}

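// Only the team master (threadIdx.x == 0) returns a non-zero value (2); all
// other threads return 0 and skip the reduction.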
EXTERN
int32_t __kmpc_reduce_combined(kmp_Ident *loc) {
  return threadIdx.x == 0 ? 2 : 0;
}

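// Lane 0 of each warp returns 1 and handles the reduction for its SIMD group;
// the remaining lanes return 0.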
EXTERN
int32_t __kmpc_reduce_simd(kmp_Ident *loc) {
  return (threadIdx.x % 32 == 0) ? 1 : 0;
}

EXTERN
void __kmpc_nvptx_end_reduce(int32_t global_tid) {}

EXTERN
void __kmpc_nvptx_end_reduce_nowait(int32_t global_tid) {}

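// Shuffle helpers used by the compiler-generated shuffle-and-reduce
// functions: move a value down from the lane 'delta' positions higher within
// a group of 'size' lanes.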
EXTERN int32_t __kmpc_shuffle_int32(int32_t val, int16_t delta, int16_t size) {
  return __SHFL_DOWN_SYNC(0xFFFFFFFF, val, delta, size);
}

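// 64-bit variant: split the value into two 32-bit halves with inline PTX,
// shuffle each half independently, then recombine.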
EXTERN int64_t __kmpc_shuffle_int64(int64_t val, int16_t delta, int16_t size) {
  int lo, hi;
  asm volatile("mov.b64 {%0,%1}, %2;" : "=r"(lo), "=r"(hi) : "l"(val));
  hi = __SHFL_DOWN_SYNC(0xFFFFFFFF, hi, delta, size);
  lo = __SHFL_DOWN_SYNC(0xFFFFFFFF, lo, delta, size);
  asm volatile("mov.b64 %0, {%1,%2};" : "=l"(val) : "r"(lo), "r"(hi));
  return val;
}

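// Illustrative sketch only (not part of this runtime): a shuffle-and-reduce
// callback such as the compiler might generate for a scalar
// 'reduction(+ : int)' clause. The struct and function names are made up for
// illustration; the parameter list mirrors how kmp_ShuffleReductFctPtr is
// invoked below, and a real callback also consults lane_id and algo_version
// to decide whether to accumulate or merely copy the remote value.
//
//   struct ReduceData { int sum; };
//   static void sum_shuffle_reduce(void *data, int16_t lane_id,
//                                  int16_t lane_offset, int16_t algo_version) {
//     ReduceData *d = (ReduceData *)data;
//     // Fetch the partner lane's partial sum and fold it into ours.
//     int remote = __kmpc_shuffle_int32(d->sum, lane_offset, WARPSIZE);
//     d->sum += remote;
//   }

// Tree-reduce across a fully active warp: halve the shuffle offset each step
// so that after log2(WARPSIZE) rounds lane 0 holds the combined value.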
INLINE static void gpu_regular_warp_reduce(void *reduce_data,
                                           kmp_ShuffleReductFctPtr shflFct) {
  for (uint32_t mask = WARPSIZE / 2; mask > 0; mask /= 2) {
    shflFct(reduce_data, /*LaneId - not used= */ 0,
            /*Offset = */ mask, /*AlgoVersion=*/0);
  }
}

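// Reduce across a contiguous group of 'size' active lanes that need not be a
// power of two; the partial results converge onto the lane with tid == 0.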
INLINE static void gpu_irregular_warp_reduce(void *reduce_data,
                                             kmp_ShuffleReductFctPtr shflFct,
                                             uint32_t size, uint32_t tid) {
  uint32_t curr_size;
  uint32_t mask;
  curr_size = size;
  mask = curr_size / 2;
  while (mask > 0) {
    shflFct(reduce_data, /*LaneId = */ tid, /*Offset=*/mask, /*AlgoVersion=*/1);
    curr_size = (curr_size + 1) / 2;
    mask = curr_size / 2;
  }
}

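// Reduce across the active lanes of a warp when they may be dispersed (the
// L2/SIMD case): each step pairs a lane with the next higher active lane
// using the %lanemask PTX registers, and the function returns true on the
// lowest active lane, which ends up holding the result.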
INLINE static uint32_t
gpu_irregular_simd_reduce(void *reduce_data, kmp_ShuffleReductFctPtr shflFct) {
  uint32_t lanemask_lt;
  uint32_t lanemask_gt;
  uint32_t size, remote_id, physical_lane_id;
  physical_lane_id = GetThreadIdInBlock() % WARPSIZE;
  asm("mov.u32 %0, %%lanemask_lt;" : "=r"(lanemask_lt));
  uint32_t Liveness = __ACTIVEMASK();
  uint32_t logical_lane_id = __popc(Liveness & lanemask_lt) * 2;
  asm("mov.u32 %0, %%lanemask_gt;" : "=r"(lanemask_gt));
  do {
    Liveness = __ACTIVEMASK();
    remote_id = __ffs(Liveness & lanemask_gt);
    size = __popc(Liveness);
    logical_lane_id /= 2;
    shflFct(reduce_data, /*LaneId =*/logical_lane_id,
            /*Offset=*/remote_id - 1 - physical_lane_id, /*AlgoVersion=*/2);
  } while (logical_lane_id % 2 == 0 && size > 1);
  return (logical_lane_id == 0);
}

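// Reduction across the lanes of a simd construct: a fully active warp uses
// the regular tree reduction, otherwise fall back to the irregular SIMD
// reduction for dispersed lanes.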
EXTERN
int32_t __kmpc_nvptx_simd_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                        size_t reduce_size, void *reduce_data,
                                        kmp_ShuffleReductFctPtr shflFct,
                                        kmp_InterWarpCopyFctPtr cpyFct) {
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) {
    gpu_regular_warp_reduce(reduce_data, shflFct);
    return GetThreadIdInBlock() % WARPSIZE ==
           0; // Result on lane 0 of the simd warp.
  } else {
    return gpu_irregular_simd_reduce(
        reduce_data, shflFct); // Result on the first active lane.
  }
}

INLINE
static int32_t nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t BlockThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  uint32_t NumThreads = GetNumberOfOmpThreads(
      BlockThreadId, isSPMDExecutionMode, isRuntimeUninitialized);
  if (NumThreads == 1)
    return 1;
  /*
   * This reduce function handles reduction within a team. It handles
   * parallel regions in both L1 and L2 parallelism levels. It also
   * supports Generic, SPMD, and NoOMP modes.
   *
   * 1. Reduce within a warp.
   * 2. Warp master copies value to warp 0 via shared memory.
   * 3. Warp 0 reduces to a single value.
   * 4. The reduced value is available in the thread that returns 1.
   */

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = BlockThreadId / WARPSIZE;

  // Volta execution model:
  // For the Generic execution mode a parallel region either has 1 thread and
  // beyond that, always a multiple of 32. For the SPMD execution mode we may
  // have any number of threads.
  if ((NumThreads % WARPSIZE == 0) || (WarpId < WarpsNeeded - 1))
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (NumThreads > 1) // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/NumThreads % WARPSIZE,
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);
  }
  return BlockThreadId == 0;
#else
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (!(Liveness & (Liveness + 1))) // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/GetThreadIdInBlock() % WARPSIZE);
  else if (!isRuntimeUninitialized) // Dispersed lanes. Only threads in L2
                                    // parallel region may enter here; return
                                    // early.
    return gpu_irregular_simd_reduce(reduce_data, shflFct);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  //
  // Only L1 parallel region can enter this if condition.
  if (NumThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (NumThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = BlockThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded,
                                BlockThreadId);

    return BlockThreadId == 0;
  } else if (isRuntimeUninitialized /* Never an L2 parallel region without the
                                       OMP runtime */) {
    return BlockThreadId == 0;
  }

  // Get the OMP thread Id. This is different from BlockThreadId in the case of
  // an L2 parallel region.
  return global_tid == 0;
#endif // __CUDA_ARCH__ >= 700
}

EXTERN __attribute__((deprecated)) int32_t __kmpc_nvptx_parallel_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

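// Same as the deprecated entry point above, except that the source location
// is passed in so the execution mode can be derived from it.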
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_v2(
    kmp_Ident *loc, int32_t global_tid, int32_t num_vars, size_t reduce_size,
    void *reduce_data, kmp_ShuffleReductFctPtr shflFct,
    kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      checkSPMDMode(loc), checkRuntimeUninitialized(loc));
}

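// "Simple" entry points for the case where the runtime is known to be
// uninitialized: the execution mode is hard-coded instead of being queried.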
EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/true,
                                      /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_parallel_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct) {
  return nvptx_parallel_reduce_nowait(global_tid, num_vars, reduce_size,
                                      reduce_data, shflFct, cpyFct,
                                      /*isSPMDExecutionMode=*/false,
                                      /*isRuntimeUninitialized=*/true);
}

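// Inter-team reduction: team masters write their partial results to a global
// scratchpad, the last team to arrive (tracked with atomicInc on a global
// counter) loads and reduces all scratchpad entries, and the final value ends
// up in thread 0 of that team, which is the only thread returning 1.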
INLINE
static int32_t nvptx_teams_reduce_nowait(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct,
    bool isSPMDExecutionMode, bool isRuntimeUninitialized) {
  uint32_t ThreadId = GetLogicalThreadIdInBlock(isSPMDExecutionMode);
  // In non-generic mode all workers participate in the teams reduction.
  // In generic mode only the team master participates in the teams
  // reduction because the workers are waiting for parallel work.
  uint32_t NumThreads =
      isSPMDExecutionMode
          ? GetNumberOfOmpThreads(ThreadId, /*isSPMDExecutionMode=*/true,
                                  isRuntimeUninitialized)
          : /*Master thread only*/ 1;
  uint32_t TeamId = GetBlockIdInKernel();
  uint32_t NumTeams = GetNumberOfBlocksInKernel();
  __shared__ volatile bool IsLastTeam;

  // Team masters of all teams write to the scratchpad.
  if (ThreadId == 0) {
    unsigned int *timestamp = GetTeamsReductionTimestamp();
    char *scratchpad = GetTeamsReductionScratchpad();

    scratchFct(reduce_data, scratchpad, TeamId, NumTeams);
    __threadfence();

    // atomicInc increments 'timestamp' and has a range [0, NumTeams-1].
    // It resets 'timestamp' back to 0 once the last team increments
    // this counter.
    unsigned val = atomicInc(timestamp, NumTeams - 1);
    IsLastTeam = val == NumTeams - 1;
  }

  // We have to wait on L1 barrier because in GENERIC mode the workers
  // are waiting on barrier 0 for work.
  //
  // If we guard this barrier as follows it leads to deadlock, probably
  // because of a compiler bug: if (!IsGenericMode()) __syncthreads();
  uint16_t SyncWarps = (NumThreads + WARPSIZE - 1) / WARPSIZE;
  named_sync(L1_BARRIER, SyncWarps * WARPSIZE);

  // If this team is not the last, quit.
  if (/* Volatile read by all threads */ !IsLastTeam)
    return 0;

  //
  // Last team processing.
  //

  // Threads in excess of #teams do not participate in reduction of the
  // scratchpad values.
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 700
  uint32_t ActiveThreads = NumThreads;
  if (NumTeams < NumThreads) {
    ActiveThreads =
        (NumTeams < WARPSIZE) ? 1 : NumTeams & ~((uint16_t)WARPSIZE - 1);
  }
  if (ThreadId >= ActiveThreads)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = ActiveThreads + ThreadId; i < NumTeams; i += ActiveThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
  uint32_t WarpId = ThreadId / WARPSIZE;

  // Reduce across warps to the warp master.
  if ((ActiveThreads % WARPSIZE == 0) ||
      (WarpId < WarpsNeeded - 1)) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else if (ActiveThreads > 1) // Partial warp but contiguous lanes
    // Only SPMD execution mode comes thru this case.
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/ActiveThreads % WARPSIZE,
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  if (ActiveThreads > WARPSIZE) {
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#else
  if (ThreadId >= NumTeams)
    return 0;

  // Load from scratchpad and reduce.
  char *scratchpad = GetTeamsReductionScratchpad();
  ldFct(reduce_data, scratchpad, ThreadId, NumTeams, /*Load only*/ 0);
  for (uint32_t i = NumThreads + ThreadId; i < NumTeams; i += NumThreads)
    ldFct(reduce_data, scratchpad, i, NumTeams, /*Load and reduce*/ 1);

  // Reduce across warps to the warp master.
  uint32_t Liveness = __ACTIVEMASK();
  if (Liveness == 0xffffffff) // Full warp
    gpu_regular_warp_reduce(reduce_data, shflFct);
  else // Partial warp but contiguous lanes
    gpu_irregular_warp_reduce(reduce_data, shflFct,
                              /*LaneCount=*/__popc(Liveness),
                              /*LaneId=*/ThreadId % WARPSIZE);

  // When we have more than [warpsize] number of threads
  // a block reduction is performed here.
  uint32_t ActiveThreads = NumTeams < NumThreads ? NumTeams : NumThreads;
  if (ActiveThreads > WARPSIZE) {
    uint32_t WarpsNeeded = (ActiveThreads + WARPSIZE - 1) / WARPSIZE;
    // Gather all the reduced values from each warp
    // to the first warp.
    cpyFct(reduce_data, WarpsNeeded);

    uint32_t WarpId = ThreadId / WARPSIZE;
    if (WarpId == 0)
      gpu_irregular_warp_reduce(reduce_data, shflFct, WarpsNeeded, ThreadId);
  }
#endif // __CUDA_ARCH__ >= 700

  return ThreadId == 0;
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait(int32_t global_tid, int32_t num_vars,
                                         size_t reduce_size, void *reduce_data,
                                         kmp_ShuffleReductFctPtr shflFct,
                                         kmp_InterWarpCopyFctPtr cpyFct,
                                         kmp_CopyToScratchpadFctPtr scratchFct,
                                         kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(
      global_tid, num_vars, reduce_size, reduce_data, shflFct, cpyFct,
      scratchFct, ldFct, /*isSPMDExecutionMode=*/isSPMDMode(),
      /*isRuntimeUninitialized=*/isRuntimeUninitialized());
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_spmd(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/true,
                                   /*isRuntimeUninitialized=*/true);
}

EXTERN
int32_t __kmpc_nvptx_teams_reduce_nowait_simple_generic(
    int32_t global_tid, int32_t num_vars, size_t reduce_size, void *reduce_data,
    kmp_ShuffleReductFctPtr shflFct, kmp_InterWarpCopyFctPtr cpyFct,
    kmp_CopyToScratchpadFctPtr scratchFct, kmp_LoadReduceFctPtr ldFct) {
  return nvptx_teams_reduce_nowait(global_tid, num_vars, reduce_size,
                                   reduce_data, shflFct, cpyFct, scratchFct,
                                   ldFct,
                                   /*isSPMDExecutionMode=*/false,
                                   /*isRuntimeUninitialized=*/true);
}

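// Simplified teams reduction based on a global critical section: the team
// master spins on atomicCAS until it owns the lock named by 'crit', the
// caller performs the reduction inside the critical section, and the matching
// end routine below releases the lock.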
EXTERN int32_t __kmpc_nvptx_teams_reduce_nowait_simple(kmp_Ident *loc,
                                                       int32_t global_tid,
                                                       kmp_CriticalName *crit) {
  if (checkSPMDMode(loc) && GetThreadIdInBlock() != 0)
    return 0;
  // The master thread of the team actually does the reduction.
  while (atomicCAS((uint32_t *)crit, 0, 1))
    ;
  return 1;
}

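// Make the master's updates visible to all other teams before releasing the
// critical lock taken in __kmpc_nvptx_teams_reduce_nowait_simple.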
EXTERN void
__kmpc_nvptx_teams_end_reduce_nowait_simple(kmp_Ident *loc, int32_t global_tid,
                                            kmp_CriticalName *crit) {
  __threadfence_system();
  (void)atomicExch((uint32_t *)crit, 0);
}