//===--- cuda_acxxel.cpp - CUDA implementation of the Acxxel API ----------===//
//
//                     The LLVM Compiler Infrastructure
//
// This file is distributed under the University of Illinois Open Source
// License. See LICENSE.TXT for details.
//
//===----------------------------------------------------------------------===//
///
/// This file defines the standard CUDA implementation of the Acxxel API.
///
//===----------------------------------------------------------------------===//

13
14#include "acxxel.h"
15
16#include "cuda.h"
17#include "cuda_runtime.h"
18
19#include <array>
20#include <cassert>
21#include <sstream>
22#include <vector>
23
24namespace acxxel {
25
26namespace {
27
Jason Henlineac232dd2016-10-25 20:18:56 +000028static std::string getCUErrorMessage(CUresult Result) {
29 if (!Result)
30 return "success";
31 const char *ErrorName = "UNKNOWN_ERROR_NAME";
32 const char *ErrorDescription = "UNKNOWN_ERROR_DESCRIPTION";
33 cuGetErrorName(Result, &ErrorName);
34 cuGetErrorString(Result, &ErrorDescription);
35 std::ostringstream OutStream;
36 OutStream << "CUDA driver error: code = " << Result
37 << ", name = " << ErrorName
38 << ", description = " << ErrorDescription;
39 return OutStream.str();
40}
41
42static Status getCUError(CUresult Result, const std::string &Message) {
43 if (!Result)
44 return Status();
45 std::ostringstream OutStream;
46 OutStream << getCUErrorMessage(Result) << ", message = " << Message;
47 return Status(OutStream.str());
48}
49
50static std::string getCUDAErrorMessage(cudaError_t E) {
51 if (!E)
52 return "success";
53 std::ostringstream OutStream;
54 OutStream << "CUDA runtime error: code = " << E
55 << ", name = " << cudaGetErrorName(E)
56 << ", description = " << cudaGetErrorString(E);
57 return OutStream.str();
58}
59
60static Status getCUDAError(cudaError_t E, const std::string &Message) {
61 if (!E)
62 return Status();
63 std::ostringstream OutStream;
64 OutStream << getCUDAErrorMessage(E) << ", message = " << Message;
65 return Status(OutStream.str());
66}
67
68static void logCUWarning(CUresult Result, const std::string &Message) {
69 if (Result) {
70 std::ostringstream OutStream;
71 OutStream << Message << ": " << getCUErrorMessage(Result);
72 logWarning(OutStream.str());
73 }
74}
75
76/// A CUDA Platform implementation.
77class CUDAPlatform : public Platform {
78public:
79 ~CUDAPlatform() override = default;
80
81 static Expected<CUDAPlatform> create();
82
83 Expected<int> getDeviceCount() override;
84
Jason Henlinebdc410b2016-10-28 00:54:02 +000085 Expected<Stream> createStream(int DeviceIndex) override;
Jason Henlineac232dd2016-10-25 20:18:56 +000086
87 Status streamSync(void *Stream) override;
88
89 Status streamWaitOnEvent(void *Stream, void *Event) override;
90
Jason Henlinebdc410b2016-10-28 00:54:02 +000091 Expected<Event> createEvent(int DeviceIndex) override;
Jason Henlineac232dd2016-10-25 20:18:56 +000092
93protected:
Jason Henlinebdc410b2016-10-28 00:54:02 +000094 Expected<void *> rawMallocD(ptrdiff_t ByteCount, int DeviceIndex) override;
Jason Henlineac232dd2016-10-25 20:18:56 +000095 HandleDestructor getDeviceMemoryHandleDestructor() override;
96 void *getDeviceMemorySpanHandle(void *BaseHandle, size_t ByteSize,
97 size_t ByteOffset) override;
98 virtual void rawDestroyDeviceMemorySpanHandle(void *Handle) override;
99
Jason Henlinebdc410b2016-10-28 00:54:02 +0000100 Expected<void *> rawGetDeviceSymbolAddress(const void *Symbol,
101 int DeviceIndex) override;
102 Expected<ptrdiff_t> rawGetDeviceSymbolSize(const void *Symbol,
103 int DeviceIndex) override;
Jason Henlineac232dd2016-10-25 20:18:56 +0000104
105 Status rawRegisterHostMem(const void *Memory, ptrdiff_t ByteCount) override;
106 HandleDestructor getUnregisterHostMemoryHandleDestructor() override;
107
108 Expected<void *> rawMallocRegisteredH(ptrdiff_t ByteCount) override;
109 HandleDestructor getFreeHostMemoryHandleDestructor() override;
110
111 Status asyncCopyDToD(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
112 void *DeviceDst, ptrdiff_t DeviceDstByteOffset,
113 ptrdiff_t ByteCount, void *Stream) override;
114 Status asyncCopyDToH(const void *DeviceSrc, ptrdiff_t DeviceSrcByteOffset,
115 void *HostDst, ptrdiff_t ByteCount,
116 void *Stream) override;
117 Status asyncCopyHToD(const void *HostSrc, void *DeviceDst,
118 ptrdiff_t DeviceDstByteOffset, ptrdiff_t ByteCount,
119 void *Stream) override;
120
121 Status asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
122 ptrdiff_t ByteCount, char ByteValue,
123 void *Stream) override;
124
125 Status addStreamCallback(Stream &Stream, StreamCallback Callback) override;
126
Jason Henlinebdc410b2016-10-28 00:54:02 +0000127 Expected<Program> createProgramFromSource(Span<const char> Source,
128 int DeviceIndex) override;
Jason Henlineac232dd2016-10-25 20:18:56 +0000129
130 Status enqueueEvent(void *Event, void *Stream) override;
131 bool eventIsDone(void *Event) override;
132 Status eventSync(void *Event) override;
133 Expected<float> getSecondsBetweenEvents(void *StartEvent,
134 void *EndEvent) override;
135
136 Expected<void *> rawCreateKernel(void *Program,
137 const std::string &Name) override;
138 HandleDestructor getKernelHandleDestructor() override;
139
140 Status rawEnqueueKernelLaunch(void *Stream, void *Kernel,
141 KernelLaunchDimensions LaunchDimensions,
142 Span<void *> Arguments,
143 Span<size_t> ArgumentSizes,
144 size_t SharedMemoryBytes) override;
145
146private:
147 explicit CUDAPlatform(const std::vector<CUcontext> &Contexts)
148 : TheContexts(Contexts) {}
149
Jason Henlinebdc410b2016-10-28 00:54:02 +0000150 Status setContext(int DeviceIndex) {
151 if (DeviceIndex < 0 ||
152 static_cast<size_t>(DeviceIndex) >= TheContexts.size())
153 return Status("invalid deivce index " + std::to_string(DeviceIndex));
154 return getCUError(cuCtxSetCurrent(TheContexts[DeviceIndex]),
155 "cuCtxSetCurrent");
156 }
157
Jason Henlineac232dd2016-10-25 20:18:56 +0000158 // Vector of contexts for each device.
159 std::vector<CUcontext> TheContexts;
160};
161
162Expected<CUDAPlatform> CUDAPlatform::create() {
163 std::vector<CUcontext> Contexts;
164 if (CUresult Result = cuInit(0))
165 return getCUError(Result, "cuInit");
166
167 int DeviceCount = 0;
168 if (CUresult Result = cuDeviceGetCount(&DeviceCount))
169 return getCUError(Result, "cuDeviceGetCount");
170
171 for (int I = 0; I < DeviceCount; ++I) {
172 CUdevice Device;
173 if (CUresult Result = cuDeviceGet(&Device, I))
174 return getCUError(Result, "cuDeviceGet");
175 CUcontext Context;
176 if (CUresult Result = cuDevicePrimaryCtxRetain(&Context, Device))
177 return getCUError(Result, "cuDevicePrimaryCtxRetain");
178 if (CUresult Result = cuCtxSetCurrent(Context))
179 return getCUError(Result, "cuCtxSetCurrent");
180 Contexts.emplace_back(Context);
181 }
182
183 return CUDAPlatform(Contexts);
184}
185
Jason Henlineac232dd2016-10-25 20:18:56 +0000186Expected<int> CUDAPlatform::getDeviceCount() {
187 int Count = 0;
188 if (CUresult Result = cuDeviceGetCount(&Count))
189 return getCUError(Result, "cuDeviceGetCount");
190 return Count;
191}
192
193static void cudaDestroyStream(void *H) {
194 logCUWarning(cuStreamDestroy(static_cast<CUstream_st *>(H)),
195 "cuStreamDestroy");
196}
197
Jason Henlinebdc410b2016-10-28 00:54:02 +0000198Expected<Stream> CUDAPlatform::createStream(int DeviceIndex) {
199 Status S = setContext(DeviceIndex);
200 if (S.isError())
201 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000202 unsigned int Flags = CU_STREAM_DEFAULT;
203 CUstream Handle;
204 if (CUresult Result = cuStreamCreate(&Handle, Flags))
205 return getCUError(Result, "cuStreamCreate");
Jason Henlinebdc410b2016-10-28 00:54:02 +0000206 return constructStream(this, DeviceIndex, Handle, cudaDestroyStream);
Jason Henlineac232dd2016-10-25 20:18:56 +0000207}
208
209Status CUDAPlatform::streamSync(void *Stream) {
210 return getCUError(cuStreamSynchronize(static_cast<CUstream_st *>(Stream)),
211 "cuStreamSynchronize");
212}
213
214Status CUDAPlatform::streamWaitOnEvent(void *Stream, void *Event) {
215 // CUDA docs says flags must be 0.
216 unsigned int Flags = 0u;
217 return getCUError(cuStreamWaitEvent(static_cast<CUstream_st *>(Stream),
218 static_cast<CUevent_st *>(Event), Flags),
219 "cuStreamWaitEvent");
220}
221
222static void cudaDestroyEvent(void *H) {
223 logCUWarning(cuEventDestroy(static_cast<CUevent_st *>(H)), "cuEventDestroy");
224}
225
Jason Henlinebdc410b2016-10-28 00:54:02 +0000226Expected<Event> CUDAPlatform::createEvent(int DeviceIndex) {
227 Status S = setContext(DeviceIndex);
228 if (S.isError())
229 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000230 unsigned int Flags = CU_EVENT_DEFAULT;
231 CUevent Handle;
232 if (CUresult Result = cuEventCreate(&Handle, Flags))
233 return getCUError(Result, "cuEventCreate");
Jason Henlinebdc410b2016-10-28 00:54:02 +0000234 return constructEvent(this, DeviceIndex, Handle, cudaDestroyEvent);
Jason Henlineac232dd2016-10-25 20:18:56 +0000235}
236
237Status CUDAPlatform::enqueueEvent(void *Event, void *Stream) {
238 return getCUError(cuEventRecord(static_cast<CUevent_st *>(Event),
239 static_cast<CUstream_st *>(Stream)),
240 "cuEventRecord");
241}
242
243bool CUDAPlatform::eventIsDone(void *Event) {
244 return cuEventQuery(static_cast<CUevent_st *>(Event)) != CUDA_ERROR_NOT_READY;
245}
246
247Status CUDAPlatform::eventSync(void *Event) {
248 return getCUError(cuEventSynchronize(static_cast<CUevent_st *>(Event)),
249 "cuEventSynchronize");
250}
251
252Expected<float> CUDAPlatform::getSecondsBetweenEvents(void *StartEvent,
253 void *EndEvent) {
254 float Milliseconds;
255 if (CUresult Result = cuEventElapsedTime(
256 &Milliseconds, static_cast<CUevent_st *>(StartEvent),
257 static_cast<CUevent_st *>(EndEvent)))
258 return getCUError(Result, "cuEventElapsedTime");
259 return Milliseconds * 1e-6;
260}
261
Jason Henlinebdc410b2016-10-28 00:54:02 +0000262Expected<void *> CUDAPlatform::rawMallocD(ptrdiff_t ByteCount,
263 int DeviceIndex) {
264 Status S = setContext(DeviceIndex);
265 if (S.isError())
266 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000267 if (!ByteCount)
268 return nullptr;
269 CUdeviceptr Pointer;
270 if (CUresult Result = cuMemAlloc(&Pointer, ByteCount))
271 return getCUError(Result, "cuMemAlloc");
272 return reinterpret_cast<void *>(Pointer);
273}
274
275static void cudaDestroyDeviceMemory(void *H) {
276 logCUWarning(cuMemFree(reinterpret_cast<CUdeviceptr>(H)), "cuMemFree");
277}
278
279HandleDestructor CUDAPlatform::getDeviceMemoryHandleDestructor() {
280 return cudaDestroyDeviceMemory;
281}
282
283void *CUDAPlatform::getDeviceMemorySpanHandle(void *BaseHandle, size_t,
284 size_t ByteOffset) {
285 return static_cast<char *>(BaseHandle) + ByteOffset;
286}
287
288void CUDAPlatform::rawDestroyDeviceMemorySpanHandle(void *) {
289 // Do nothing for this platform.
290}
291
Jason Henlinebdc410b2016-10-28 00:54:02 +0000292Expected<void *> CUDAPlatform::rawGetDeviceSymbolAddress(const void *Symbol,
293 int DeviceIndex) {
294 Status S = setContext(DeviceIndex);
295 if (S.isError())
296 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000297 void *Address;
298 if (cudaError_t Status = cudaGetSymbolAddress(&Address, Symbol))
299 return getCUDAError(Status, "cudaGetSymbolAddress");
300 return Address;
301}
302
Jason Henlinebdc410b2016-10-28 00:54:02 +0000303Expected<ptrdiff_t> CUDAPlatform::rawGetDeviceSymbolSize(const void *Symbol,
304 int DeviceIndex) {
305 Status S = setContext(DeviceIndex);
306 if (S.isError())
307 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000308 size_t Size;
309 if (cudaError_t Status = cudaGetSymbolSize(&Size, Symbol))
310 return getCUDAError(Status, "cudaGetSymbolSize");
311 return Size;
312}
313
/// Advances a read-only untyped pointer by \p ByteOffset bytes.
static const void *offsetVoidPtr(const void *Ptr, ptrdiff_t ByteOffset) {
  const char *Bytes = static_cast<const char *>(Ptr);
  Bytes += ByteOffset;
  return Bytes;
}
317
/// Advances a writable untyped pointer by \p ByteOffset bytes.
static void *offsetVoidPtr(void *Ptr, ptrdiff_t ByteOffset) {
  char *Bytes = static_cast<char *>(Ptr);
  Bytes += ByteOffset;
  return Bytes;
}
321
Jason Henlineac232dd2016-10-25 20:18:56 +0000322Status CUDAPlatform::rawRegisterHostMem(const void *Memory,
323 ptrdiff_t ByteCount) {
324 unsigned int Flags = 0;
325 return getCUError(
326 cuMemHostRegister(const_cast<void *>(Memory), ByteCount, Flags),
327 "cuMemHostRegiser");
328}
329
330static void cudaUnregisterHostMemoryHandleDestructor(void *H) {
331 logCUWarning(cuMemHostUnregister(H), "cuMemHostUnregister");
332}
333
334HandleDestructor CUDAPlatform::getUnregisterHostMemoryHandleDestructor() {
335 return cudaUnregisterHostMemoryHandleDestructor;
336}
337
338Expected<void *> CUDAPlatform::rawMallocRegisteredH(ptrdiff_t ByteCount) {
339 unsigned int Flags = 0;
340 void *Memory;
341 if (CUresult Result = cuMemHostAlloc(&Memory, ByteCount, Flags))
342 return getCUError(Result, "cuMemHostAlloc");
343 return Memory;
344}
345
346static void cudaFreeHostMemoryHandleDestructor(void *H) {
347 logCUWarning(cuMemFreeHost(H), "cuMemFreeHost");
348}
349
350HandleDestructor CUDAPlatform::getFreeHostMemoryHandleDestructor() {
351 return cudaFreeHostMemoryHandleDestructor;
352}
353
354Status CUDAPlatform::asyncCopyDToD(const void *DeviceSrc,
355 ptrdiff_t DeviceSrcByteOffset,
356 void *DeviceDst,
357 ptrdiff_t DeviceDstByteOffset,
358 ptrdiff_t ByteCount, void *Stream) {
359 return getCUError(
360 cuMemcpyDtoDAsync(reinterpret_cast<CUdeviceptr>(
361 offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
362 reinterpret_cast<CUdeviceptr>(
363 offsetVoidPtr(DeviceSrc, DeviceSrcByteOffset)),
364 ByteCount, static_cast<CUstream_st *>(Stream)),
365 "cuMemcpyDtoDAsync");
366}
367
368Status CUDAPlatform::asyncCopyDToH(const void *DeviceSrc,
369 ptrdiff_t DeviceSrcByteOffset, void *HostDst,
370 ptrdiff_t ByteCount, void *Stream) {
371 return getCUError(
372 cuMemcpyDtoHAsync(HostDst, reinterpret_cast<CUdeviceptr>(offsetVoidPtr(
373 DeviceSrc, DeviceSrcByteOffset)),
374 ByteCount, static_cast<CUstream_st *>(Stream)),
375 "cuMemcpyDtoHAsync");
376}
377
378Status CUDAPlatform::asyncCopyHToD(const void *HostSrc, void *DeviceDst,
379 ptrdiff_t DeviceDstByteOffset,
380 ptrdiff_t ByteCount, void *Stream) {
381 return getCUError(
382 cuMemcpyHtoDAsync(reinterpret_cast<CUdeviceptr>(
383 offsetVoidPtr(DeviceDst, DeviceDstByteOffset)),
384 HostSrc, ByteCount, static_cast<CUstream_st *>(Stream)),
385 "cuMemcpyHtoDAsync");
386}
387
388Status CUDAPlatform::asyncMemsetD(void *DeviceDst, ptrdiff_t ByteOffset,
389 ptrdiff_t ByteCount, char ByteValue,
390 void *Stream) {
391 return getCUError(
392 cuMemsetD8Async(
393 reinterpret_cast<CUdeviceptr>(offsetVoidPtr(DeviceDst, ByteOffset)),
394 ByteValue, ByteCount, static_cast<CUstream_st *>(Stream)),
395 "cuMemsetD8Async");
396}
397
398struct StreamCallbackUserData {
399 StreamCallbackUserData(Stream &Stream, StreamCallback Function)
400 : TheStream(Stream), TheFunction(std::move(Function)) {}
401
402 Stream &TheStream;
403 StreamCallback TheFunction;
404};
405
406static void CUDA_CB cuStreamCallbackShim(CUstream HStream, CUresult Status,
407 void *UserData) {
408 std::unique_ptr<StreamCallbackUserData> Data(
409 static_cast<StreamCallbackUserData *>(UserData));
410 Stream &TheStream = Data->TheStream;
411 assert(static_cast<CUstream_st *>(TheStream) == HStream);
412 Data->TheFunction(TheStream,
413 getCUError(Status, "stream callback error state"));
414}
415
416Status CUDAPlatform::addStreamCallback(Stream &Stream,
417 StreamCallback Callback) {
418 // CUDA docs say flags must always be 0 here.
419 unsigned int Flags = 0u;
420 std::unique_ptr<StreamCallbackUserData> UserData(
421 new StreamCallbackUserData(Stream, std::move(Callback)));
422 return getCUError(cuStreamAddCallback(Stream, cuStreamCallbackShim,
423 UserData.release(), Flags),
424 "cuStreamAddCallback");
425}
426
427static void cudaDestroyProgram(void *H) {
428 logCUWarning(cuModuleUnload(static_cast<CUmod_st *>(H)), "cuModuleUnload");
429}
430
Jason Henlinebdc410b2016-10-28 00:54:02 +0000431Expected<Program> CUDAPlatform::createProgramFromSource(Span<const char> Source,
432 int DeviceIndex) {
433 Status S = setContext(DeviceIndex);
434 if (S.isError())
435 return S;
Jason Henlineac232dd2016-10-25 20:18:56 +0000436 CUmodule Module;
437 constexpr int LogBufferSizeBytes = 1024;
438 char InfoLogBuffer[LogBufferSizeBytes];
439 char ErrorLogBuffer[LogBufferSizeBytes];
440 constexpr size_t OptionsCount = 4;
441 std::array<CUjit_option, OptionsCount> OptionNames = {
442 {CU_JIT_INFO_LOG_BUFFER, CU_JIT_INFO_LOG_BUFFER_SIZE_BYTES,
443 CU_JIT_ERROR_LOG_BUFFER, CU_JIT_ERROR_LOG_BUFFER_SIZE_BYTES}};
444 std::array<void *, OptionsCount> OptionValues = {
445 {InfoLogBuffer, const_cast<int *>(&LogBufferSizeBytes), ErrorLogBuffer,
446 const_cast<int *>(&LogBufferSizeBytes)}};
447 if (CUresult Result =
448 cuModuleLoadDataEx(&Module, Source.data(), OptionsCount,
449 OptionNames.data(), OptionValues.data())) {
450 InfoLogBuffer[LogBufferSizeBytes - 1] = '\0';
451 ErrorLogBuffer[LogBufferSizeBytes - 1] = '\0';
452 std::ostringstream OutStream;
453 OutStream << "Error creating program from source: "
454 << getCUErrorMessage(Result)
455 << "\nINFO MESSAGES\n================\n"
456 << InfoLogBuffer << "\nERROR MESSAGES\n==================\n"
457 << ErrorLogBuffer;
458 return Status(OutStream.str());
459 }
460 return constructProgram(this, Module, cudaDestroyProgram);
461}
462
463Expected<void *> CUDAPlatform::rawCreateKernel(void *Program,
464 const std::string &Name) {
465 CUmodule Module = static_cast<CUmodule>(Program);
466 CUfunction Kernel;
467 if (CUresult Result = cuModuleGetFunction(&Kernel, Module, Name.c_str()))
468 return getCUError(Result, "cuModuleGetFunction");
469 return Kernel;
470}
471
/// Handle destructor for kernels. Intentionally a no-op.
static void cudaDestroyKernel(void * /*H*/) {}
475
476HandleDestructor CUDAPlatform::getKernelHandleDestructor() {
477 return cudaDestroyKernel;
478}
479
480Status CUDAPlatform::rawEnqueueKernelLaunch(
481 void *Stream, void *Kernel, KernelLaunchDimensions LaunchDimensions,
482 Span<void *> Arguments, Span<size_t>, size_t SharedMemoryBytes) {
483 return getCUError(
484 cuLaunchKernel(static_cast<CUfunction>(Kernel), LaunchDimensions.GridX,
485 LaunchDimensions.GridY, LaunchDimensions.GridZ,
486 LaunchDimensions.BlockX, LaunchDimensions.BlockY,
487 LaunchDimensions.BlockZ, SharedMemoryBytes,
488 static_cast<CUstream>(Stream), Arguments.data(), nullptr),
489 "cuLaunchKernel");
490}
491
492} // namespace
493
494namespace cuda {
495
496/// Gets the CUDAPlatform instance and returns it as an unowned pointer to a
497/// Platform.
498Expected<Platform *> getPlatform() {
499 static auto MaybePlatform = []() -> Expected<CUDAPlatform *> {
500 Expected<CUDAPlatform> CreationResult = CUDAPlatform::create();
501 if (CreationResult.isError())
502 return CreationResult.getError();
503 else
504 return new CUDAPlatform(CreationResult.takeValue());
505 }();
506 return MaybePlatform;
507}
508
509} // namespace cuda
510
511} // namespace acxxel