Logan Chien | 2833ffb | 2018-10-09 10:03:24 +0800 | [diff] [blame] | 1 | /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---=== |
| 2 | * |
Logan Chien | df4f766 | 2019-09-04 16:45:23 -0700 | [diff] [blame] | 3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
| 4 | * See https://llvm.org/LICENSE.txt for license information. |
| 5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
Logan Chien | 2833ffb | 2018-10-09 10:03:24 +0800 | [diff] [blame] | 6 | * |
| 7 | *===-----------------------------------------------------------------------=== |
| 8 | */ |
| 9 | #ifndef __CLANG_CUDA_INTRINSICS_H__ |
| 10 | #define __CLANG_CUDA_INTRINSICS_H__ |
| 11 | #ifndef __CUDA__ |
| 12 | #error "This file is for CUDA compilation only." |
| 13 | #endif |
| 14 | |
| 15 | // sm_30 intrinsics: __shfl_{up,down,xor}. |
| 16 | |
| 17 | #define __SM_30_INTRINSICS_H__ |
| 18 | #define __SM_30_INTRINSICS_HPP__ |
| 19 | |
| 20 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 21 | |
#pragma push_macro("__MAKE_SHUFFLES")
// Expands into the full overload set for one shuffle flavor (__FnName):
//   * int / float      - forwarded directly to the NVVM builtins, packing
//                        ((warpSize - __width) << 8) | __Mask into the
//                        control operand (sub-segment size in the second
//                        byte, maxLane/mask in the low byte, per the PTX
//                        `shfl` encoding).
//   * 64-bit payloads  - split into two 32-bit halves with memcpy, shuffled
//                        independently, then reassembled.
//   * remaining widths - value-preserving casts that dispatch to one of the
//                        overloads above.  The unsigned 64-bit overload must
//                        cast to `long long` (the signed same-width type):
//                        casting to its own type would make the qualified
//                        call an exact match for itself and recurse forever.
// Parameters:
//   __FnName         - public name of the overload set (e.g. __shfl_down).
//   __IntIntrinsic   - NVVM builtin for the 32-bit integer payload.
//   __FloatIntrinsic - NVVM builtin for the 32-bit float payload.
//   __Mask           - maxLane field of the control word.
//   __Type           - type of the offset/lane parameter.
// NOTE: comments cannot appear inside the macro body itself -- a '//' would
// swallow the trailing line-continuation backslash -- so it is documented
// here in full.
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
                        __Type)                                                \
  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, __Type __offset,               \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ long __FnName(long __val, __Type __offset,                 \
                                  int __width = warpSize) {                    \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<int>(__val), __offset, __width));             \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned long __val, __Type __offset, int __width = warpSize) {          \
    return static_cast<unsigned long>(                                         \
        ::__FnName(static_cast<long>(__val), __offset, __width));              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
    return static_cast<unsigned long long>(                                    \
        ::__FnName(static_cast<long long>(__val), __offset, __width));         \
  }                                                                            \
  inline __device__ double __FnName(double __val, __Type __offset,             \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
| 98 | |
| 99 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 100 | |
Logan Chien | 55afb0a | 2018-10-15 10:42:14 +0800 | [diff] [blame] | 101 | #if CUDA_VERSION >= 9000 |
| 102 | #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) |
| 103 | // __shfl_sync_* variants available in CUDA-9 |
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
// CUDA-9 `*_sync` counterpart of __MAKE_SHUFFLES: every overload takes the
// participating-lane mask as its first argument and forwards it to the
// `__nvvm_shfl_sync_*` builtins.  Overload strategy is identical to the
// non-sync macro: int/float go straight to the builtins with the control
// word ((warpSize - __width) << 8) | __Mask; 64-bit payloads are split into
// two 32-bit halves via memcpy and shuffled independently; the remaining
// widths cast and dispatch to one of those overloads.  The unsigned 64-bit
// overload must cast to `long long` -- casting to its own type would make
// the qualified call an exact match for itself and recurse forever.
// NOTE: comments cannot appear inside the macro body itself -- a '//' would
// swallow the trailing line-continuation backslash -- so it is documented
// here in full.
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                             __Mask, __Type)                                   \
  inline __device__ int __FnName(unsigned int __mask, int __val,               \
                                 __Type __offset, int __width = warpSize) {    \
    return __IntIntrinsic(__mask, __val, __offset,                             \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(unsigned int __mask, float __val,           \
                                   __Type __offset, int __width = warpSize) {  \
    return __FloatIntrinsic(__mask, __val, __offset,                           \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
  }                                                                            \
  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
                                       __Type __offset,                        \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned int __mask, unsigned long long __val, __Type __offset,          \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long long>(::__FnName(                         \
        __mask, static_cast<long long>(__val), __offset, __width));            \
  }                                                                            \
  inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                  __Type __offset, int __width = warpSize) {   \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(::__FnName(                                     \
          __mask, static_cast<long long>(__val), __offset, __width));          \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned int __mask, unsigned long __val, __Type __offset,               \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long>(                                         \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
  }                                                                            \
  inline __device__ double __FnName(unsigned int __mask, double __val,         \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
| 184 | |
// Warp-level barrier: forwards to the __nvvm_bar_warp_sync builtin.  The
// default mask names all 32 lanes.  The parameter uses a reserved
// identifier so a user-defined `mask` macro cannot break this system
// header (renaming is invisible to callers; C++ has no named arguments).
inline __device__ void __syncwarp(unsigned int __mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(__mask);
}
| 188 | |
// Named-barrier synchronization: forwards to the __nvvm_barrier_sync
// builtin with barrier resource __id.  Reserved parameter name guards
// against user macro collisions in this system header.
inline __device__ void __barrier_sync(unsigned int __id) {
  __nvvm_barrier_sync(__id);
}
| 192 | |
// Named barrier with an explicit arrival count: forwards to the
// __nvvm_barrier_sync_cnt builtin with barrier resource __id and thread
// count __count.  Reserved parameter names guard against user macro
// collisions in this system header.
inline __device__ void __barrier_sync_count(unsigned int __id,
                                            unsigned int __count) {
  __nvvm_barrier_sync_cnt(__id, __count);
}
| 197 | |
// Warp vote (all): non-zero iff __pred is non-zero for every thread named
// in __mask.  Forwards to __nvvm_vote_all_sync.
inline __device__ int __all_sync(unsigned int __mask, int __pred) {
  return __nvvm_vote_all_sync(__mask, __pred);
}
| 201 | |
// Warp vote (any): non-zero iff __pred is non-zero for at least one thread
// named in __mask.  Forwards to __nvvm_vote_any_sync.
inline __device__ int __any_sync(unsigned int __mask, int __pred) {
  return __nvvm_vote_any_sync(__mask, __pred);
}
| 205 | |
// Warp vote (uniform): non-zero iff __pred has the same truth value on all
// threads named in __mask.  Forwards to __nvvm_vote_uni_sync.
inline __device__ int __uni_sync(unsigned int __mask, int __pred) {
  return __nvvm_vote_uni_sync(__mask, __pred);
}
| 209 | |
// Warp ballot: returns a bitmask with bit N set iff lane N is named in
// __mask and its __pred is non-zero.  Forwards to __nvvm_vote_ballot_sync.
inline __device__ unsigned int __ballot_sync(unsigned int __mask, int __pred) {
  return __nvvm_vote_ballot_sync(__mask, __pred);
}
| 213 | |
// Returns the mask of currently active (converged) lanes in the warp.
// NOTE(review): under Volta+ independent thread scheduling this reflects
// only the lanes converged at this instruction -- prefer an explicitly
// computed mask for the *_sync intrinsics.  The local uses a reserved
// name so a user-defined `mask` macro cannot break this system header.
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  // CUDA headers before 9.2 expose no activemask operation; a ballot in
  // which every active lane contributes `1` produces the same bit set.
  return __nvvm_vote_ballot(1);
#else
  unsigned int __mask;
  asm volatile("activemask.b32 %0;" : "=r"(__mask));
  return __mask;
#endif
}
Logan Chien | 55afb0a | 2018-10-15 10:42:14 +0800 | [diff] [blame] | 223 | |
// Find-nth-set: forwards to the __nvvm_fns builtin (PTX `fns`), locating a
// set bit in __mask relative to __base by signed __offset.  Reserved
// parameter names guard against user macro collisions.
inline __device__ unsigned int __fns(unsigned __mask, unsigned __base,
                                     int __offset) {
  return __nvvm_fns(__mask, __base, __offset);
}
| 227 | |
| 228 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 229 | |
| 230 | // Define __match* builtins CUDA-9 headers expect to see. |
| 231 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 |
// Match (any, 32-bit): forwards to __nvvm_match_any_sync_i32, returning
// the mask of threads in __mask whose __value equals this thread's.
// Requires sm_70+ (guarded by the enclosing #if).
inline __device__ unsigned int __match32_any_sync(unsigned int __mask,
                                                  unsigned int __value) {
  return __nvvm_match_any_sync_i32(__mask, __value);
}
| 236 | |
// Match (any, 64-bit): forwards to __nvvm_match_any_sync_i64.  Requires
// sm_70+ (guarded by the enclosing #if).
inline __device__ unsigned long long
__match64_any_sync(unsigned int __mask, unsigned long long __value) {
  return __nvvm_match_any_sync_i64(__mask, __value);
}
| 241 | |
// Match (all, 32-bit): forwards to __nvvm_match_all_sync_i32p; *__pred
// receives the all-equal predicate.  Requires sm_70+ (guarded by the
// enclosing #if).
inline __device__ unsigned int
__match32_all_sync(unsigned int __mask, unsigned int __value, int *__pred) {
  return __nvvm_match_all_sync_i32p(__mask, __value, __pred);
}
| 246 | |
// Match (all, 64-bit): forwards to __nvvm_match_all_sync_i64p; *__pred
// receives the all-equal predicate.  Requires sm_70+ (guarded by the
// enclosing #if).
inline __device__ unsigned long long
__match64_all_sync(unsigned int __mask, unsigned long long __value,
                   int *__pred) {
  return __nvvm_match_all_sync_i64p(__mask, __value, __pred);
}
| 251 | #include "crt/sm_70_rt.hpp" |
| 252 | |
| 253 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 |
#endif // CUDA_VERSION >= 9000
| 255 | |
Logan Chien | 2833ffb | 2018-10-09 10:03:24 +0800 | [diff] [blame] | 256 | // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. |
| 257 | |
| 258 | // Prevent the vanilla sm_32 intrinsics header from being included. |
| 259 | #define __SM_32_INTRINSICS_H__ |
| 260 | #define __SM_32_INTRINSICS_HPP__ |
| 261 | |
| 262 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
| 263 | |
// __ldg: load through the GPU's read-only (non-coherent) data cache.
// Overload set covering every scalar type with a matching __nvvm_ldg_*
// builtin.  Requires sm_32+ (guarded by the enclosing #if).  Pointer
// parameters use reserved names so user-defined macros cannot corrupt
// this system header.
inline __device__ char __ldg(const char *__ptr) { return __nvvm_ldg_c(__ptr); }
inline __device__ short __ldg(const short *__ptr) {
  return __nvvm_ldg_s(__ptr);
}
inline __device__ int __ldg(const int *__ptr) { return __nvvm_ldg_i(__ptr); }
inline __device__ long __ldg(const long *__ptr) { return __nvvm_ldg_l(__ptr); }
inline __device__ long long __ldg(const long long *__ptr) {
  return __nvvm_ldg_ll(__ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *__ptr) {
  return __nvvm_ldg_uc(__ptr);
}
// There is no signed-char builtin; load through the unsigned-char builtin
// and let the implicit conversion restore the signed value (same bits).
inline __device__ signed char __ldg(const signed char *__ptr) {
  return __nvvm_ldg_uc((const unsigned char *)__ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *__ptr) {
  return __nvvm_ldg_us(__ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *__ptr) {
  return __nvvm_ldg_ui(__ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *__ptr) {
  return __nvvm_ldg_ul(__ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *__ptr) {
  return __nvvm_ldg_ull(__ptr);
}
inline __device__ float __ldg(const float *__ptr) {
  return __nvvm_ldg_f(__ptr);
}
inline __device__ double __ldg(const double *__ptr) {
  return __nvvm_ldg_d(__ptr);
}
| 291 | |
// Vector __ldg overloads (signed element types).  Each one reinterprets the
// CUDA vector-struct pointer as a clang ext_vector_type of the same element
// type and lane count, loads it through the read-only cache with the
// matching __nvvm_ldg_* builtin, and copies the lanes back into the CUDA
// struct member-by-member.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
| 357 | |
// Vector __ldg overloads (unsigned and floating-point element types).
// Same reinterpret-load-repack pattern as the signed-element overloads:
// the CUDA vector-struct pointer is reinterpreted as an ext_vector_type,
// loaded via the corresponding __nvvm_ldg_* builtin, and the lanes copied
// into the result struct.  The cast is only safe while the ext vector's
// alignment does not exceed the CUDA vector type's alignment.
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
| 447 | |
// Funnel shifts: shift the 64-bit concatenation (high32:low32) and return
// a 32-bit window, via the PTX `shf` instruction (sm_32+).  `.wrap` takes
// the shift amount modulo 32; `.clamp` clamps it to [0, 32].  Locals are
// uniformly named __ret (one of these previously used `result`, another
// `ret`) and parameters use reserved names for system-header hygiene.
// TODO: Implement these as intrinsics, so the backend can work its magic on
// these.  Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned __low32, unsigned __high32,
                                           unsigned __shiftWidth) {
  unsigned __ret;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(__low32), "r"(__high32), "r"(__shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_lc(unsigned __low32,
                                            unsigned __high32,
                                            unsigned __shiftWidth) {
  unsigned __ret;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(__low32), "r"(__high32), "r"(__shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_r(unsigned __low32, unsigned __high32,
                                           unsigned __shiftWidth) {
  unsigned __ret;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(__low32), "r"(__high32), "r"(__shiftWidth));
  return __ret;
}
inline __device__ unsigned __funnelshift_rc(unsigned __low32,
                                            unsigned __high32,
                                            unsigned __shiftWidth) {
  unsigned __ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(__ret)
      : "r"(__low32), "r"(__high32), "r"(__shiftWidth));
  return __ret;
}
| 483 | |
| 484 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
| 485 | |
Pirama Arumuga Nainar | ec8c89d | 2022-02-23 09:26:16 -0800 | [diff] [blame] | 486 | #if CUDA_VERSION >= 11000 |
extern "C" {
// Generic <-> specific address-space conversion helpers that CUDA-11
// headers expect.  Each casts through a pointer qualified with the NVVM
// address space (1 = global, 3 = shared, 4 = constant, 5 = local) so the
// backend emits the corresponding cvta/cvta.to conversion.  C linkage
// matches the declarations in NVIDIA's headers.
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
// Returns the shared-memory address as a 32-bit value (the generic->shared
// result is narrowed to uint32_t).
__device__ inline uint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
| 516 | #endif // CUDA_VERSION >= 11000 |
| 517 | |
Logan Chien | 2833ffb | 2018-10-09 10:03:24 +0800 | [diff] [blame] | 518 | #endif // defined(__CLANG_CUDA_INTRINSICS_H__) |