Justin Lebar | 2e4ecfd | 2016-05-19 22:49:13 +0000 | [diff] [blame] | 1 | /*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---=== |
| 2 | * |
| 3 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
| 4 | * of this software and associated documentation files (the "Software"), to deal |
| 5 | * in the Software without restriction, including without limitation the rights |
| 6 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
| 7 | * copies of the Software, and to permit persons to whom the Software is |
| 8 | * furnished to do so, subject to the following conditions: |
| 9 | * |
| 10 | * The above copyright notice and this permission notice shall be included in |
| 11 | * all copies or substantial portions of the Software. |
| 12 | * |
| 13 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| 14 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| 15 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
| 16 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| 17 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
| 18 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
| 19 | * THE SOFTWARE. |
| 20 | * |
| 21 | *===-----------------------------------------------------------------------=== |
| 22 | */ |
| 23 | #ifndef __CLANG_CUDA_INTRINSICS_H__ |
| 24 | #define __CLANG_CUDA_INTRINSICS_H__ |
| 25 | #ifndef __CUDA__ |
| 26 | #error "This file is for CUDA compilation only." |
| 27 | #endif |
| 28 | |
Justin Lebar | 4fb5711 | 2016-06-09 20:04:57 +0000 | [diff] [blame] | 29 | // sm_30 intrinsics: __shfl_{up,down,xor}. |
| 30 | |
| 31 | #define __SM_30_INTRINSICS_H__ |
| 32 | #define __SM_30_INTRINSICS_HPP__ |
| 33 | |
| 34 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 35 | |
#pragma push_macro("__MAKE_SHUFFLES")
// __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, __Type)
//
// Stamps out the overload set of one legacy (non-_sync) warp shuffle named
// __FnName, taking (__val, __offset, __width = warpSize):
//   * int / float call __IntIntrinsic / __FloatIntrinsic directly, passing
//     ((warpSize - __width) << 8) | __Mask as the last operand.
//   * unsigned int forwards to the int overload.
//   * long long is split into two ints which are shuffled separately and
//     then reassembled.
//   * long / unsigned long / unsigned long long / double forward to the
//     overload of matching width.
// __Type is the type of the __offset parameter.
//
// Fixes relative to the previous revision:
//   * The long long overload copied __tmp into __val (memcpy arguments were
//     swapped), so it shuffled uninitialized data.  It now copies __val into
//     __tmp.
//   * The unsigned long long overload cast its argument to unsigned long long
//     and therefore called itself forever.  It now casts to long long so the
//     call resolves to the long long overload.
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,    \
                        __Type)                                                \
  inline __device__ int __FnName(int __val, __Type __offset,                   \
                                 int __width = warpSize) {                     \
    return __IntIntrinsic(__val, __offset,                                     \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(float __val, __Type __offset,               \
                                   int __width = warpSize) {                   \
    return __FloatIntrinsic(__val, __offset,                                   \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(static_cast<int>(__val), __offset, __width));               \
  }                                                                            \
  inline __device__ long long __FnName(long long __val, __Type __offset,       \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    /* memcpy is (dst, src, n): load the input INTO the scratch struct. */     \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width);                      \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width);                      \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ long __FnName(long __val, __Type __offset,                 \
                                  int __width = warpSize) {                    \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<long long>(__val), __offset, __width));       \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(static_cast<int>(__val), __offset, __width));             \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned long __val, __Type __offset, int __width = warpSize) {          \
    return static_cast<unsigned long>(                                         \
        ::__FnName(static_cast<long>(__val), __offset, __width));              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned long long __val, __Type __offset, int __width = warpSize) {     \
    /* Cast to the SIGNED type so this dispatches to the long long overload */ \
    /* instead of recursing into itself. */                                    \
    return static_cast<unsigned long long>(                                    \
        ::__FnName(static_cast<long long>(__val), __offset, __width));         \
  }                                                                            \
  inline __device__ double __FnName(double __val, __Type __offset,             \
                                    int __width = warpSize) {                  \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__tmp, __offset, __width);                              \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
| 112 | |
| 113 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 114 | |
Artem Belevich | bab95c7 | 2017-09-26 17:07:23 +0000 | [diff] [blame] | 115 | #if CUDA_VERSION >= 9000 |
| 116 | #if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300) |
Artem Belevich | 4654dc8 | 2017-09-20 21:23:07 +0000 | [diff] [blame] | 117 | // __shfl_sync_* variants available in CUDA-9 |
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
// __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask,
//                      __Type)
//
// Stamps out the overload set of one *_sync warp shuffle named __FnName,
// taking (__mask, __val, __offset, __width = warpSize):
//   * int / float call __IntIntrinsic / __FloatIntrinsic directly, passing
//     __mask first and ((warpSize - __width) << 8) | __Mask as the last
//     operand.
//   * unsigned int forwards to the int overload.
//   * long long is split into two ints which are shuffled separately and
//     then reassembled.
//   * unsigned long long / long / unsigned long / double forward to the
//     overload of matching width.
// __Type is the type of the __offset parameter.
//
// Fixes relative to the previous revision:
//   * The long long overload copied __tmp into __val (memcpy arguments were
//     swapped), so it shuffled uninitialized data.  It now copies __val into
//     __tmp.
//   * The unsigned long long overload cast its argument to unsigned long long
//     and therefore called itself forever.  It now casts to long long so the
//     call resolves to the long long overload.
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic,       \
                             __Mask, __Type)                                   \
  inline __device__ int __FnName(unsigned int __mask, int __val,               \
                                 __Type __offset, int __width = warpSize) {    \
    return __IntIntrinsic(__mask, __val, __offset,                             \
                          ((warpSize - __width) << 8) | (__Mask));             \
  }                                                                            \
  inline __device__ float __FnName(unsigned int __mask, float __val,           \
                                   __Type __offset, int __width = warpSize) {  \
    return __FloatIntrinsic(__mask, __val, __offset,                           \
                            ((warpSize - __width) << 8) | (__Mask));           \
  }                                                                            \
  inline __device__ unsigned int __FnName(unsigned int __mask,                 \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) {            \
    return static_cast<unsigned int>(                                          \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width));       \
  }                                                                            \
  inline __device__ long long __FnName(unsigned int __mask, long long __val,   \
                                       __Type __offset,                        \
                                       int __width = warpSize) {               \
    struct __Bits {                                                            \
      int __a, __b;                                                            \
    };                                                                         \
    _Static_assert(sizeof(__val) == sizeof(__Bits));                           \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int));                         \
    __Bits __tmp;                                                              \
    /* memcpy is (dst, src, n): load the input INTO the scratch struct. */     \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width);              \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width);              \
    long long __ret;                                                           \
    memcpy(&__ret, &__tmp, sizeof(__tmp));                                     \
    return __ret;                                                              \
  }                                                                            \
  inline __device__ unsigned long long __FnName(                               \
      unsigned int __mask, unsigned long long __val, __Type __offset,          \
      int __width = warpSize) {                                                \
    /* Cast to the SIGNED type so this dispatches to the long long overload */ \
    /* instead of recursing into itself. */                                    \
    return static_cast<unsigned long long>(::__FnName(                         \
        __mask, static_cast<long long>(__val), __offset, __width));            \
  }                                                                            \
  inline __device__ long __FnName(unsigned int __mask, long __val,             \
                                  __Type __offset, int __width = warpSize) {   \
    _Static_assert(sizeof(long) == sizeof(long long) ||                        \
                   sizeof(long) == sizeof(int));                               \
    if (sizeof(long) == sizeof(long long)) {                                   \
      return static_cast<long>(::__FnName(                                     \
          __mask, static_cast<long long>(__val), __offset, __width));          \
    } else if (sizeof(long) == sizeof(int)) {                                  \
      return static_cast<long>(                                                \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width));     \
    }                                                                          \
  }                                                                            \
  inline __device__ unsigned long __FnName(                                    \
      unsigned int __mask, unsigned long __val, __Type __offset,               \
      int __width = warpSize) {                                                \
    return static_cast<unsigned long>(                                         \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width));      \
  }                                                                            \
  inline __device__ double __FnName(unsigned int __mask, double __val,         \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp;                                                           \
    _Static_assert(sizeof(__tmp) == sizeof(__val));                            \
    memcpy(&__tmp, &__val, sizeof(__val));                                     \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width);                      \
    double __ret;                                                              \
    memcpy(&__ret, &__tmp, sizeof(__ret));                                     \
    return __ret;                                                              \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
| 198 | |
// Forwards `mask` (all 32 bits set when omitted) to __nvvm_bar_warp_sync.
inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  __nvvm_bar_warp_sync(mask);
}
| 202 | |
// Forwards the barrier `id` to __nvvm_barrier_sync.
inline __device__ void __barrier_sync(unsigned int id) {
  return __nvvm_barrier_sync(id);
}
| 206 | |
// Forwards the barrier `id` and `count` to __nvvm_barrier_sync_cnt.
inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  return __nvvm_barrier_sync_cnt(id, count);
}
| 211 | |
// Returns the result of __nvvm_vote_all_sync(mask, pred) as an int.
inline __device__ int __all_sync(unsigned int mask, int pred) {
  const int vote = __nvvm_vote_all_sync(mask, pred);
  return vote;
}
| 215 | |
// Returns the result of __nvvm_vote_any_sync(mask, pred) as an int.
inline __device__ int __any_sync(unsigned int mask, int pred) {
  const int vote = __nvvm_vote_any_sync(mask, pred);
  return vote;
}
| 219 | |
// Returns the result of __nvvm_vote_uni_sync(mask, pred) as an int.
inline __device__ int __uni_sync(unsigned int mask, int pred) {
  const int vote = __nvvm_vote_uni_sync(mask, pred);
  return vote;
}
| 223 | |
// Returns the result of __nvvm_vote_ballot_sync(mask, pred) as an unsigned
// int.
inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  const unsigned int ballot = __nvvm_vote_ballot_sync(mask, pred);
  return ballot;
}
| 227 | |
// Returns a bitmask of the lanes that are active at the point of the call.
//
// The non-.sync vote.ballot used by __nvvm_vote_ballot is deprecated in PTX
// and not available for newer targets / PTX versions, and vote.sync.ballot
// cannot be used here because it needs a member mask.  PTX 6.2 (shipped with
// CUDA 9.2) provides activemask.b32 for exactly this purpose, so use it
// whenever the CUDA version allows and keep the old lowering otherwise.
inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  unsigned int mask;
  // volatile: the result depends on the dynamic execution state, so the asm
  // must not be hoisted, merged or removed as if it were a pure computation.
  asm volatile("activemask.b32 %0;" : "=r"(mask));
  return mask;
#endif
}
Artem Belevich | 42960b4 | 2017-09-21 18:44:49 +0000 | [diff] [blame] | 229 | |
// Thin wrapper: passes (mask, base, offset) straight to __nvvm_fns and
// returns its result.
inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  const unsigned int found = __nvvm_fns(mask, base, offset);
  return found;
}
| 233 | |
Artem Belevich | bab95c7 | 2017-09-26 17:07:23 +0000 | [diff] [blame] | 234 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300 |
| 235 | |
| 236 | // Define __match* builtins CUDA-9 headers expect to see. |
| 237 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 |
// 32-bit match-any: forwards (mask, value) to __nvvm_match_any_sync_i32.
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  const unsigned int matched = __nvvm_match_any_sync_i32(mask, value);
  return matched;
}
| 242 | |
// 64-bit match-any: forwards (mask, value) to __nvvm_match_any_sync_i64.
inline __device__ unsigned long long
__match64_any_sync(unsigned int mask, unsigned long long value) {
  const unsigned long long matched = __nvvm_match_any_sync_i64(mask, value);
  return matched;
}
| 247 | |
// 32-bit match-all: forwards (mask, value, pred) to
// __nvvm_match_all_sync_i32p; `pred` is handed to the builtin unchanged.
inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  const unsigned int matched = __nvvm_match_all_sync_i32p(mask, value, pred);
  return matched;
}
| 252 | |
// 64-bit match-all: forwards (mask, value, pred) to
// __nvvm_match_all_sync_i64p; `pred` is handed to the builtin unchanged.
inline __device__ unsigned long long
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  const unsigned long long matched =
      __nvvm_match_all_sync_i64p(mask, value, pred);
  return matched;
}
| 257 | #include "crt/sm_70_rt.hpp" |
| 258 | |
| 259 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700 |
| 260 | #endif // __CUDA_VERSION >= 9000 |
Artem Belevich | 4654dc8 | 2017-09-20 21:23:07 +0000 | [diff] [blame] | 261 | |
Justin Lebar | 2e4ecfd | 2016-05-19 22:49:13 +0000 | [diff] [blame] | 262 | // sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}. |
| 263 | |
| 264 | // Prevent the vanilla sm_32 intrinsics header from being included. |
| 265 | #define __SM_32_INTRINSICS_H__ |
| 266 | #define __SM_32_INTRINSICS_HPP__ |
| 267 | |
| 268 | #if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
| 269 | |
// Scalar __ldg overloads.  Each one hands `ptr` to the __nvvm_ldg_* builtin
// whose suffix matches the pointee type and returns the loaded value.

// Signed integer types.
inline __device__ char __ldg(const char *ptr) {
  return __nvvm_ldg_c(ptr);
}
inline __device__ short __ldg(const short *ptr) {
  return __nvvm_ldg_s(ptr);
}
inline __device__ int __ldg(const int *ptr) {
  return __nvvm_ldg_i(ptr);
}
inline __device__ long __ldg(const long *ptr) {
  return __nvvm_ldg_l(ptr);
}
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}

// Unsigned integer types.
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}

// Floating-point types.
inline __device__ float __ldg(const float *ptr) {
  return __nvvm_ldg_f(ptr);
}
inline __device__ double __ldg(const double *ptr) {
  return __nvvm_ldg_d(ptr);
}
| 294 | |
// char2 load: reads through __nvvm_ldg_c2 and repacks the two elements into
// a char2.
inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment.  This is only
  // safe if alignof(c2) <= alignof(char2).
  const c2 vec = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// char4 load: reads through __nvvm_ldg_c4 and repacks the four elements into
// a char4.
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  const c4 vec = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// short2 load: reads through __nvvm_ldg_s2 and repacks the two elements into
// a short2.
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  const s2 vec = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// short4 load: reads through __nvvm_ldg_s4 and repacks the four elements
// into a short4.
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  const s4 vec = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// int2 load: reads through __nvvm_ldg_i2 and repacks the two elements into
// an int2.
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  const i2 vec = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// int4 load: reads through __nvvm_ldg_i4 and repacks the four elements into
// an int4.
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  const i4 vec = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// longlong2 load: reads through __nvvm_ldg_ll2 and repacks the two elements
// into a longlong2.
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  const ll2 vec = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
| 360 | |
// uchar2 load: reads through __nvvm_ldg_uc2 and repacks the two elements
// into a uchar2.
inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  const uc2 vec = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// uchar4 load: reads through __nvvm_ldg_uc4 and repacks the four elements
// into a uchar4.
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  const uc4 vec = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// ushort2 load: reads through __nvvm_ldg_us2 and repacks the two elements
// into a ushort2.
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  const us2 vec = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// ushort4 load: reads through __nvvm_ldg_us4 and repacks the four elements
// into a ushort4.
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  const us4 vec = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// uint2 load: reads through __nvvm_ldg_ui2 and repacks the two elements into
// a uint2.
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  const ui2 vec = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// uint4 load: reads through __nvvm_ldg_ui4 and repacks the four elements
// into a uint4.
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  const ui4 vec = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// ulonglong2 load: reads through __nvvm_ldg_ull2 and repacks the two
// elements into a ulonglong2.
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  const ull2 vec = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
| 423 | |
// float2 load: reads through __nvvm_ldg_f2 and repacks the two elements into
// a float2.
inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  const f2 vec = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
// float4 load: reads through __nvvm_ldg_f4 and repacks the four elements
// into a float4.
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  const f4 vec = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 out;
  out.x = vec.x;
  out.y = vec.y;
  out.z = vec.z;
  out.w = vec.w;
  return out;
}
// double2 load: reads through __nvvm_ldg_d2 and repacks the two elements
// into a double2.
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  const d2 vec = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 out;
  out.x = vec.x;
  out.y = vec.y;
  return out;
}
| 450 | |
| 451 | // TODO: Implement these as intrinsics, so the backend can work its magic on |
| 452 | // these. Alternatively, we could implement these as plain C and try to get |
| 453 | // llvm to recognize the relevant patterns. |
// Emits the PTX instruction shf.l.wrap.b32 with operands
// (low32, high32, shiftWidth) and returns its 32-bit result.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned shifted;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(shifted)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return shifted;
}
// Emits the PTX instruction shf.l.clamp.b32 with operands
// (low32, high32, shiftWidth) and returns its 32-bit result.
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned shifted;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(shifted)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return shifted;
}
// Emits the PTX instruction shf.r.wrap.b32 with operands
// (low32, high32, shiftWidth) and returns its 32-bit result.
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned shifted;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(shifted)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return shifted;
}
// Emits the PTX instruction shf.r.clamp.b32 with operands
// (low32, high32, shiftWidth) and returns its 32-bit result.
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned shifted;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(shifted)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return shifted;
}
| 486 | |
| 487 | #endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320 |
| 488 | |
| 489 | #endif // defined(__CLANG_CUDA_INTRINSICS_H__) |