Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 1 | //===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===// |
| 2 | // |
| 3 | // The LLVM Compiler Infrastructure |
| 4 | // |
| 5 | // This file is distributed under the University of Illinois Open Source |
| 6 | // License. See LICENSE.TXT for details. |
| 7 | // |
| 8 | //===----------------------------------------------------------------------===// |
| 9 | /// |
| 10 | /// This file defines the standard CUDA implementation of the Acxxel API. |
| 11 | /// |
| 12 | //===----------------------------------------------------------------------===// |
| 13 | |
| 14 | #include "acxxel.h" |
| 15 | |
| 16 | #include "cuda.h" |
| 17 | #include "cuda_runtime.h" |
| 18 | |
| 19 | #include <array> |
| 20 | #include <cassert> |
| 21 | #include <sstream> |
| 22 | #include <vector> |
| 23 | |
| 24 | namespace acxxel { |
| 25 | |
| 26 | namespace { |
| 27 | |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 28 | static std::string getCUErrorMessage(CUresult Result) { |
| 29 | if (!Result) |
| 30 | return "success"; |
| 31 | const char *ErrorName = "UNKNOWN_ERROR_NAME"; |
| 32 | const char *ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION"; |
| 33 | cuGetErrorName(Result, &ErrorName); |
| 34 | cuGetErrorString(Result, &ErrorDescription); |
| 35 | std::ostringstream OutStream; |
| 36 | OutStream << "CUDA driver error: code = " << Result |
| 37 | << ", name = " << ErrorName |
| 38 | << ", description = " << ErrorDescription; |
| 39 | return OutStream.str(); |
| 40 | } |
| 41 | |
| 42 | static Status getCUError(CUresult Result, const std::string &Message) { |
| 43 | if (!Result) |
| 44 | return Status(); |
| 45 | std::ostringstream OutStream; |
| 46 | OutStream << getCUErrorMessage(Result) << ", message = " << Message; |
| 47 | return Status(OutStream.str()); |
| 48 | } |
| 49 | |
| 50 | static std::string getCUDAErrorMessage(cudaError_t E) { |
| 51 | if (!E) |
| 52 | return "success"; |
| 53 | std::ostringstream OutStream; |
| 54 | OutStream << "CUDA runtime error: code = " << E |
| 55 | << ", name = " << cudaGetErrorName(E) |
| 56 | << ", description = " << cudaGetErrorString(E); |
| 57 | return OutStream.str(); |
| 58 | } |
| 59 | |
| 60 | static Status getCUDAError(cudaError_t E, const std::string &Message) { |
| 61 | if (!E) |
| 62 | return Status(); |
| 63 | std::ostringstream OutStream; |
| 64 | OutStream << getCUDAErrorMessage(E) << ", message = " << Message; |
| 65 | return Status(OutStream.str()); |
| 66 | } |
| 67 | |
| 68 | static void logCUWarning(CUresult Result, const std::string &Message) { |
| 69 | if (Result) { |
| 70 | std::ostringstream OutStream; |
| 71 | OutStream << Message << ": " << getCUErrorMessage(Result); |
| 72 | logWarning(OutStream.str()); |
| 73 | } |
| 74 | } |
| 75 | |
| 76 | /// A CUDA Platform implementation. |
| 77 | class CUDAPlatform : public Platform { |
| 78 | public: |
| 79 | ~CUDAPlatform() override = default; |
| 80 | |
| 81 | static Expected<CUDAPlatform> create(); |
| 82 | |
| 83 | Expected<int> getDeviceCount() override; |
| 84 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 85 | Expected<Stream> createStream(int DeviceIndex) override; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 86 | |
| 87 | Status streamSync(void *Stream) override; |
| 88 | |
| 89 | Status streamWaitOnEvent(void *Stream, void *Event) override; |
| 90 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 91 | Expected<Event> createEvent(int DeviceIndex) override; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 92 | |
| 93 | protected: |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 94 | Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 95 | HandleDestructor getDeviceMemoryHandleDestructor() override; |
| 96 | void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize, |
| 97 | size_t ByteOffset) override; |
| 98 | virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override; |
| 99 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 100 | Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol, |
| 101 | int DeviceIndex) override; |
| 102 | Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol, |
| 103 | int DeviceIndex) override; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 104 | |
| 105 | Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override; |
| 106 | HandleDestructor getUnregisterHostMemoryHandleDestructor() override; |
| 107 | |
| 108 | Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override; |
| 109 | HandleDestructor getFreeHostMemoryHandleDestructor() override; |
| 110 | |
| 111 | Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset, |
| 112 | void *DeviceDst, ptrdiff_t DeviceDstByteOffset, |
| 113 | ptrdiff_t ByteCount, void *Stream) override; |
| 114 | Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset, |
| 115 | void *HostDst, ptrdiff_t ByteCount, |
| 116 | void *Stream) override; |
| 117 | Status asyncCopyHToD(const void *HostSrc, void *DeviceDst, |
| 118 | ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount, |
| 119 | void *Stream) override; |
| 120 | |
| 121 | Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, |
| 122 | ptrdiff_t ByteCount, char ByteValue, |
| 123 | void *Stream) override; |
| 124 | |
| 125 | Status addStreamCallback(Stream &Stream, StreamCallback Callback) override; |
| 126 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 127 | Expected<Program> createProgramFromSource(Span<const char> Source, |
| 128 | int DeviceIndex) override; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 129 | |
| 130 | Status enqueueEvent(void *Event, void *Stream) override; |
| 131 | bool eventIsDone(void *Event) override; |
| 132 | Status eventSync(void *Event) override; |
| 133 | Expected<float> getSecondsBetweenEvents(void *StartEvent, |
| 134 | void *EndEvent) override; |
| 135 | |
| 136 | Expected<void *> rawCreateKernel(void *Program, |
| 137 | const std::string &Name) override; |
| 138 | HandleDestructor getKernelHandleDestructor() override; |
| 139 | |
| 140 | Status rawEnqueueKernelLaunch(void *Stream, void *Kernel, |
| 141 | KernelLaunchDimensions LaunchDimensions, |
| 142 | Span<void *> Arguments, |
| 143 | Span<size_t> ArgumentSizes, |
| 144 | size_t SharedMemoryBytes) override; |
| 145 | |
| 146 | private: |
| 147 | explicit CUDAPlatform(const std::vector<CUcontext> &Contexts) |
| 148 | : TheContexts(Contexts) {} |
| 149 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 150 | Status setContext(int DeviceIndex) { |
| 151 | if (DeviceIndex < 0 || |
| 152 | static_cast<size_t>(DeviceIndex) >= TheContexts.size()) |
| 153 | return Status("invalid deivce index " + std::to_string(DeviceIndex)); |
| 154 | return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]), |
| 155 | "cuCtxSetCurrent"); |
| 156 | } |
| 157 | |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 158 | // Vector of contexts for each device. |
| 159 | std::vector<CUcontext> TheContexts; |
| 160 | }; |
| 161 | |
| 162 | Expected<CUDAPlatform> CUDAPlatform::create() { |
| 163 | std::vector<CUcontext> Contexts; |
| 164 | if (CUresult Result = cuInit(0)) |
| 165 | return getCUError(Result, "cuInit"); |
| 166 | |
| 167 | int DeviceCount = 0; |
| 168 | if (CUresult Result = cuDeviceGetCount(&DeviceCount)) |
| 169 | return getCUError(Result, "cuDeviceGetCount"); |
| 170 | |
| 171 | for (int I = 0; I < DeviceCount; ++I) { |
| 172 | CUdevice Device; |
| 173 | if (CUresult Result = cuDeviceGet(&Device, I)) |
| 174 | return getCUError(Result, "cuDeviceGet"); |
| 175 | CUcontext Context; |
| 176 | if (CUresult Result = cuDevicePrimaryCtxRetain(&Context, Device)) |
| 177 | return getCUError(Result, "cuDevicePrimaryCtxRetain"); |
| 178 | if (CUresult Result = cuCtxSetCurrent(Context)) |
| 179 | return getCUError(Result, "cuCtxSetCurrent"); |
| 180 | Contexts.emplace_back(Context); |
| 181 | } |
| 182 | |
| 183 | return CUDAPlatform(Contexts); |
| 184 | } |
| 185 | |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 186 | Expected<int> CUDAPlatform::getDeviceCount() { |
| 187 | int Count = 0; |
| 188 | if (CUresult Result = cuDeviceGetCount(&Count)) |
| 189 | return getCUError(Result, "cuDeviceGetCount"); |
| 190 | return Count; |
| 191 | } |
| 192 | |
| 193 | static void cudaDestroyStream(void *H) { |
| 194 | logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)), |
| 195 | "cuStreamDestroy"); |
| 196 | } |
| 197 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 198 | Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) { |
| 199 | Status S = setContext(DeviceIndex); |
| 200 | if (S.isError()) |
| 201 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 202 | unsigned int Flags = CU_STREAM_DEFAULT; |
| 203 | CUstream Handle; |
| 204 | if (CUresult Result = cuStreamCreate(&Handle, Flags)) |
| 205 | return getCUError(Result, "cuStreamCreate"); |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 206 | return constructStream(this, DeviceIndex, Handle, cudaDestroyStream); |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 207 | } |
| 208 | |
| 209 | Status CUDAPlatform::streamSync(void *Stream) { |
| 210 | return getCUError(cuStreamSynchronize(static_cast<CUstream_st *>(Stream)), |
| 211 | "cuStreamSynchronize"); |
| 212 | } |
| 213 | |
| 214 | Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) { |
| 215 | // CUDA docs says flags must be 0. |
| 216 | unsigned int Flags = 0u; |
| 217 | return getCUError(cuStreamWaitEvent(static_cast<CUstream_st *>(Stream), |
| 218 | static_cast<CUevent_st *>(Event), Flags), |
| 219 | "cuStreamWaitEvent"); |
| 220 | } |
| 221 | |
| 222 | static void cudaDestroyEvent(void *H) { |
| 223 | logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy"); |
| 224 | } |
| 225 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 226 | Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) { |
| 227 | Status S = setContext(DeviceIndex); |
| 228 | if (S.isError()) |
| 229 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 230 | unsigned int Flags = CU_EVENT_DEFAULT; |
| 231 | CUevent Handle; |
| 232 | if (CUresult Result = cuEventCreate(&Handle, Flags)) |
| 233 | return getCUError(Result, "cuEventCreate"); |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 234 | return constructEvent(this, DeviceIndex, Handle, cudaDestroyEvent); |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 235 | } |
| 236 | |
| 237 | Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) { |
| 238 | return getCUError(cuEventRecord(static_cast<CUevent_st *>(Event), |
| 239 | static_cast<CUstream_st *>(Stream)), |
| 240 | "cuEventRecord"); |
| 241 | } |
| 242 | |
| 243 | bool CUDAPlatform::eventIsDone(void *Event) { |
| 244 | return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY; |
| 245 | } |
| 246 | |
| 247 | Status CUDAPlatform::eventSync(void *Event) { |
| 248 | return getCUError(cuEventSynchronize(static_cast<CUevent_st *>(Event)), |
| 249 | "cuEventSynchronize"); |
| 250 | } |
| 251 | |
| 252 | Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent, |
| 253 | void *EndEvent) { |
| 254 | float Milliseconds; |
| 255 | if (CUresult Result = cuEventElapsedTime( |
| 256 | &Milliseconds, static_cast<CUevent_st *>(StartEvent), |
| 257 | static_cast<CUevent_st *>(EndEvent))) |
| 258 | return getCUError(Result, "cuEventElapsedTime"); |
| 259 | return Milliseconds * 1e-6; |
| 260 | } |
| 261 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 262 | Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount, |
| 263 | int DeviceIndex) { |
| 264 | Status S = setContext(DeviceIndex); |
| 265 | if (S.isError()) |
| 266 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 267 | if (!ByteCount) |
| 268 | return nullptr; |
| 269 | CUdeviceptr Pointer; |
| 270 | if (CUresult Result = cuMemAlloc(&Pointer, ByteCount)) |
| 271 | return getCUError(Result, "cuMemAlloc"); |
| 272 | return reinterpret_cast<void *>(Pointer); |
| 273 | } |
| 274 | |
| 275 | static void cudaDestroyDeviceMemory(void *H) { |
| 276 | logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree"); |
| 277 | } |
| 278 | |
| 279 | HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() { |
| 280 | return cudaDestroyDeviceMemory; |
| 281 | } |
| 282 | |
| 283 | void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t, |
| 284 | size_t ByteOffset) { |
| 285 | return static_cast<char *>(BaseHandle) + ByteOffset; |
| 286 | } |
| 287 | |
| 288 | void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) { |
| 289 | // Do nothing for this platform. |
| 290 | } |
| 291 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 292 | Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol, |
| 293 | int DeviceIndex) { |
| 294 | Status S = setContext(DeviceIndex); |
| 295 | if (S.isError()) |
| 296 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 297 | void *Address; |
| 298 | if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol)) |
| 299 | return getCUDAError(Status, "cudaGetSymbolAddress"); |
| 300 | return Address; |
| 301 | } |
| 302 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 303 | Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol, |
| 304 | int DeviceIndex) { |
| 305 | Status S = setContext(DeviceIndex); |
| 306 | if (S.isError()) |
| 307 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 308 | size_t Size; |
| 309 | if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol)) |
| 310 | return getCUDAError(Status, "cudaGetSymbolSize"); |
| 311 | return Size; |
| 312 | } |
| 313 | |
/// Returns \p Ptr advanced by \p ByteOffset bytes (const overload).
static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
  const char *Bytes = static_cast<const char *>(Ptr);
  return Bytes + ByteOffset;
}
| 317 | |
/// Returns \p Ptr advanced by \p ByteOffset bytes (mutable overload).
static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
  char *Bytes = static_cast<char *>(Ptr);
  return Bytes + ByteOffset;
}
| 321 | |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 322 | Status CUDAPlatform::rawRegisterHostMem(const void *Memory, |
| 323 | ptrdiff_t ByteCount) { |
| 324 | unsigned int Flags = 0; |
| 325 | return getCUError( |
| 326 | cuMemHostRegister(const_cast<void *>(Memory), ByteCount, Flags), |
| 327 | "cuMemHostRegiser"); |
| 328 | } |
| 329 | |
| 330 | static void cudaUnregisterHostMemoryHandleDestructor(void *H) { |
| 331 | logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister"); |
| 332 | } |
| 333 | |
| 334 | HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() { |
| 335 | return cudaUnregisterHostMemoryHandleDestructor; |
| 336 | } |
| 337 | |
| 338 | Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) { |
| 339 | unsigned int Flags = 0; |
| 340 | void *Memory; |
| 341 | if (CUresult Result = cuMemHostAlloc(&Memory, ByteCount, Flags)) |
| 342 | return getCUError(Result, "cuMemHostAlloc"); |
| 343 | return Memory; |
| 344 | } |
| 345 | |
| 346 | static void cudaFreeHostMemoryHandleDestructor(void *H) { |
| 347 | logCUWarning(cuMemFreeHost(H), "cuMemFreeHost"); |
| 348 | } |
| 349 | |
| 350 | HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() { |
| 351 | return cudaFreeHostMemoryHandleDestructor; |
| 352 | } |
| 353 | |
| 354 | Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc, |
| 355 | ptrdiff_t DeviceSrcByteOffset, |
| 356 | void *DeviceDst, |
| 357 | ptrdiff_t DeviceDstByteOffset, |
| 358 | ptrdiff_t ByteCount, void *Stream) { |
| 359 | return getCUError( |
| 360 | cuMemcpyDtoDAsync(reinterpret_cast<CUdeviceptr>( |
| 361 | offsetVoidPtr(DeviceDst, DeviceDstByteOffset)), |
| 362 | reinterpret_cast<CUdeviceptr>( |
| 363 | offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)), |
| 364 | ByteCount, static_cast<CUstream_st *>(Stream)), |
| 365 | "cuMemcpyDtoDAsync"); |
| 366 | } |
| 367 | |
| 368 | Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc, |
| 369 | ptrdiff_t DeviceSrcByteOffset, void *HostDst, |
| 370 | ptrdiff_t ByteCount, void *Stream) { |
| 371 | return getCUError( |
| 372 | cuMemcpyDtoHAsync(HostDst, reinterpret_cast<CUdeviceptr>(offsetVoidPtr( |
| 373 | DeviceSrc, DeviceSrcByteOffset)), |
| 374 | ByteCount, static_cast<CUstream_st *>(Stream)), |
| 375 | "cuMemcpyDtoHAsync"); |
| 376 | } |
| 377 | |
| 378 | Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst, |
| 379 | ptrdiff_t DeviceDstByteOffset, |
| 380 | ptrdiff_t ByteCount, void *Stream) { |
| 381 | return getCUError( |
| 382 | cuMemcpyHtoDAsync(reinterpret_cast<CUdeviceptr>( |
| 383 | offsetVoidPtr(DeviceDst, DeviceDstByteOffset)), |
| 384 | HostSrc, ByteCount, static_cast<CUstream_st *>(Stream)), |
| 385 | "cuMemcpyHtoDAsync"); |
| 386 | } |
| 387 | |
| 388 | Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset, |
| 389 | ptrdiff_t ByteCount, char ByteValue, |
| 390 | void *Stream) { |
| 391 | return getCUError( |
| 392 | cuMemsetD8Async( |
| 393 | reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset)), |
| 394 | ByteValue, ByteCount, static_cast<CUstream_st *>(Stream)), |
| 395 | "cuMemsetD8Async"); |
| 396 | } |
| 397 | |
| 398 | struct StreamCallbackUserData { |
| 399 | StreamCallbackUserData(Stream &Stream, StreamCallback Function) |
| 400 | : TheStream(Stream), TheFunction(std::move(Function)) {} |
| 401 | |
| 402 | Stream &TheStream; |
| 403 | StreamCallback TheFunction; |
| 404 | }; |
| 405 | |
| 406 | static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status, |
| 407 | void *UserData) { |
| 408 | std::unique_ptr<StreamCallbackUserData> Data( |
| 409 | static_cast<StreamCallbackUserData *>(UserData)); |
| 410 | Stream &TheStream = Data->TheStream; |
| 411 | assert(static_cast<CUstream_st *>(TheStream) == HStream); |
| 412 | Data->TheFunction(TheStream, |
| 413 | getCUError(Status, "stream callback error state")); |
| 414 | } |
| 415 | |
| 416 | Status CUDAPlatform::addStreamCallback(Stream &Stream, |
| 417 | StreamCallback Callback) { |
| 418 | // CUDA docs say flags must always be 0 here. |
| 419 | unsigned int Flags = 0u; |
| 420 | std::unique_ptr<StreamCallbackUserData> UserData( |
| 421 | new StreamCallbackUserData(Stream, std::move(Callback))); |
| 422 | return getCUError(cuStreamAddCallback(Stream, cuStreamCallbackShim, |
| 423 | UserData.release(), Flags), |
| 424 | "cuStreamAddCallback"); |
| 425 | } |
| 426 | |
| 427 | static void cudaDestroyProgram(void *H) { |
| 428 | logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload"); |
| 429 | } |
| 430 | |
Jason Henline | bdc410b | 2016-10-28 00:54:02 +0000 | [diff] [blame] | 431 | Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source, |
| 432 | int DeviceIndex) { |
| 433 | Status S = setContext(DeviceIndex); |
| 434 | if (S.isError()) |
| 435 | return S; |
Jason Henline | ac232dd | 2016-10-25 20:18:56 +0000 | [diff] [blame] | 436 | CUmodule Module; |
| 437 | constexpr int LogBufferSizeBytes = 1024; |
| 438 | char InfoLogBuffer[LogBufferSizeBytes]; |
| 439 | char ErrorLogBuffer[LogBufferSizeBytes]; |
| 440 | constexpr size_t OptionsCount = 4; |
| 441 | std::array<CUjit_option, OptionsCount> OptionNames = { |
| 442 | {CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES, |
| 443 | CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}}; |
| 444 | std::array<void *, OptionsCount> OptionValues = { |
| 445 | {InfoLogBuffer, const_cast<int *>(&LogBufferSizeBytes), ErrorLogBuffer, |
| 446 | const_cast<int *>(&LogBufferSizeBytes)}}; |
| 447 | if (CUresult Result = |
| 448 | cuModuleLoadDataEx(&Module, Source.data(), OptionsCount, |
| 449 | OptionNames.data(), OptionValues.data())) { |
| 450 | InfoLogBuffer[LogBufferSizeBytes - 1] = '\0'; |
| 451 | ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0'; |
| 452 | std::ostringstream OutStream; |
| 453 | OutStream << "Error creating program from source: " |
| 454 | << getCUErrorMessage(Result) |
| 455 | << "\nINFO MESSAGES\n================\n" |
| 456 | << InfoLogBuffer << "\nERROR MESSAGES\n==================\n" |
| 457 | << ErrorLogBuffer; |
| 458 | return Status(OutStream.str()); |
| 459 | } |
| 460 | return constructProgram(this, Module, cudaDestroyProgram); |
| 461 | } |
| 462 | |
| 463 | Expected<void *> CUDAPlatform::rawCreateKernel(void *Program, |
| 464 | const std::string &Name) { |
| 465 | CUmodule Module = static_cast<CUmodule>(Program); |
| 466 | CUfunction Kernel; |
| 467 | if (CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str())) |
| 468 | return getCUError(Result, "cuModuleGetFunction"); |
| 469 | return Kernel; |
| 470 | } |
| 471 | |
/// Handle destructor for kernels; intentionally a no-op on this platform.
static void cudaDestroyKernel(void *) {}
| 475 | |
| 476 | HandleDestructor CUDAPlatform::getKernelHandleDestructor() { |
| 477 | return cudaDestroyKernel; |
| 478 | } |
| 479 | |
| 480 | Status CUDAPlatform::rawEnqueueKernelLaunch( |
| 481 | void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions, |
| 482 | Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) { |
| 483 | return getCUError( |
| 484 | cuLaunchKernel(static_cast<CUfunction>(Kernel), LaunchDimensions.GridX, |
| 485 | LaunchDimensions.GridY, LaunchDimensions.GridZ, |
| 486 | LaunchDimensions.BlockX, LaunchDimensions.BlockY, |
| 487 | LaunchDimensions.BlockZ, SharedMemoryBytes, |
| 488 | static_cast<CUstream>(Stream), Arguments.data(), nullptr), |
| 489 | "cuLaunchKernel"); |
| 490 | } |
| 491 | |
| 492 | } // namespace |
| 493 | |
| 494 | namespace cuda { |
| 495 | |
| 496 | /// Gets the CUDAPlatform instance and returns it as an unowned pointer to a |
| 497 | /// Platform. |
| 498 | Expected<Platform *> getPlatform() { |
| 499 | static auto MaybePlatform = []() -> Expected<CUDAPlatform *> { |
| 500 | Expected<CUDAPlatform> CreationResult = CUDAPlatform::create(); |
| 501 | if (CreationResult.isError()) |
| 502 | return CreationResult.getError(); |
| 503 | else |
| 504 | return new CUDAPlatform(CreationResult.takeValue()); |
| 505 | }(); |
| 506 | return MaybePlatform; |
| 507 | } |
| 508 | |
| 509 | } // namespace cuda |
| 510 | |
| 511 | } // namespace acxxel |