Add support for Semaphores (gpu waiting on gpu) in Ganesh

BUG=skia:

Change-Id: I4324b65bc50a3dfd90372459899870d5f1952fdc
Reviewed-on: https://skia-review.googlesource.com/9120
Commit-Queue: Greg Daniel <egdaniel@google.com>
Reviewed-by: Brian Salomon <bsalomon@google.com>
Reviewed-by: Brian Osman <brianosman@google.com>
diff --git a/src/gpu/GrGpu.h b/src/gpu/GrGpu.h
index f710810..201b22c 100644
--- a/src/gpu/GrGpu.h
+++ b/src/gpu/GrGpu.h
@@ -33,6 +33,7 @@
 class GrPipeline;
 class GrPrimitiveProcessor;
 class GrRenderTarget;
+class GrSemaphore;
 class GrStencilAttachment;
 class GrStencilSettings;
 class GrSurface;
@@ -378,10 +379,14 @@
     // Provides a hook for post-flush actions (e.g. PLS reset and Vulkan command buffer submits).
     virtual void finishOpList() {}
 
-    virtual GrFence SK_WARN_UNUSED_RESULT insertFence() const = 0;
-    virtual bool waitFence(GrFence, uint64_t timeout = 1000) const = 0;
+    virtual GrFence SK_WARN_UNUSED_RESULT insertFence() = 0;
+    virtual bool waitFence(GrFence, uint64_t timeout = 1000) = 0;
     virtual void deleteFence(GrFence) const = 0;
 
+    virtual sk_sp<GrSemaphore> SK_WARN_UNUSED_RESULT makeSemaphore() = 0;
+    virtual void insertSemaphore(sk_sp<GrSemaphore> semaphore) = 0;
+    virtual void waitSemaphore(sk_sp<GrSemaphore> semaphore) = 0;
+
     // Ensures that all queued up driver-level commands have been sent to the GPU. For example, on
     // OpenGL, this calls glFlush.
     virtual void flush() = 0;
diff --git a/src/gpu/GrSemaphore.h b/src/gpu/GrSemaphore.h
new file mode 100644
index 0000000..bdeff09
--- /dev/null
+++ b/src/gpu/GrSemaphore.h
@@ -0,0 +1,29 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrSemaphore_DEFINED
+#define GrSemaphore_DEFINED
+
+#include "SkRefCnt.h"
+
+class GrGpu;
+
+class GrSemaphore : public SkRefCnt {  // Base of the GL and Vulkan semaphore types.
+public:
+    // This function should only be used in the case of exporting and importing a GrSemaphore object
+    // from one GrContext to another. When exporting, the GrSemaphore should be set to a null GrGpu,
+    // and when importing it should be set to the GrGpu of the current context. Once exported, a
+    // GrSemaphore should not be used with its old context.
+    void resetGpu(const GrGpu* gpu) { fGpu = gpu; }
+
+protected:
+    explicit GrSemaphore(const GrGpu* gpu) : fGpu(gpu) {}  // only subclasses construct these
+
+    const GrGpu* fGpu;  // null once exported; subclass destructors test it
+};
+
+#endif
diff --git a/src/gpu/gl/GrGLAssembleInterface.cpp b/src/gpu/gl/GrGLAssembleInterface.cpp
index 3dbff4c..efd1536 100644
--- a/src/gpu/gl/GrGLAssembleInterface.cpp
+++ b/src/gpu/gl/GrGLAssembleInterface.cpp
@@ -530,6 +530,7 @@
     if (glVer >= GR_GL_VER(3, 2) || extensions.has("GL_ARB_sync")) {
         GET_PROC(FenceSync);
         GET_PROC(ClientWaitSync);
+        GET_PROC(WaitSync);
         GET_PROC(DeleteSync);
     }
 
@@ -937,6 +938,7 @@
     if (version >= GR_GL_VER(3, 0)) {
         GET_PROC(FenceSync);
         GET_PROC(ClientWaitSync);
+        GET_PROC(WaitSync);
         GET_PROC(DeleteSync);
     }
 
diff --git a/src/gpu/gl/GrGLDefines.h b/src/gpu/gl/GrGLDefines.h
index 997726c..522a036 100644
--- a/src/gpu/gl/GrGLDefines.h
+++ b/src/gpu/gl/GrGLDefines.h
@@ -1055,6 +1055,7 @@
 #define GR_GL_CONDITION_SATISFIED                           0x911C
 #define GR_GL_WAIT_FAILED                                   0x911D
 #define GR_GL_SYNC_FLUSH_COMMANDS_BIT                       0x00000001
+#define GR_GL_TIMEOUT_IGNORED                               0xFFFFFFFFFFFFFFFFull
 
 /* EGL Defines */
 #define GR_EGL_NO_DISPLAY                                   ((GrEGLDisplay)0)
diff --git a/src/gpu/gl/GrGLGpu.cpp b/src/gpu/gl/GrGLGpu.cpp
index 5dce7d1..cfbef9f 100644
--- a/src/gpu/gl/GrGLGpu.cpp
+++ b/src/gpu/gl/GrGLGpu.cpp
@@ -11,6 +11,7 @@
 #include "GrFixedClip.h"
 #include "GrGLBuffer.h"
 #include "GrGLGpuCommandBuffer.h"
+#include "GrGLSemaphore.h"
 #include "GrGLStencilAttachment.h"
 #include "GrGLTextureRenderTarget.h"
 #include "GrGpuResourcePriv.h"
@@ -4726,20 +4727,43 @@
     return false;
 }
 
-GrFence SK_WARN_UNUSED_RESULT GrGLGpu::insertFence() const {
-    GrGLsync fence;
-    GL_CALL_RET(fence, FenceSync(GR_GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
-    return (GrFence)fence;
+GrFence SK_WARN_UNUSED_RESULT GrGLGpu::insertFence() {
+    GrGLsync sync;
+    GL_CALL_RET(sync, FenceSync(GR_GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
+    GR_STATIC_ASSERT(sizeof(GrFence) >= sizeof(GrGLsync));
+    return (GrFence)sync;
 }
 
-bool GrGLGpu::waitFence(GrFence fence, uint64_t timeout) const {
+bool GrGLGpu::waitFence(GrFence fence, uint64_t timeout) {
     GrGLenum result;
     GL_CALL_RET(result, ClientWaitSync((GrGLsync)fence, GR_GL_SYNC_FLUSH_COMMANDS_BIT, timeout));
     return (GR_GL_CONDITION_SATISFIED == result);
 }
 
 void GrGLGpu::deleteFence(GrFence fence) const {
-    GL_CALL(DeleteSync((GrGLsync)fence));
+    this->deleteSync((GrGLsync)fence);
+}
+
+sk_sp<GrSemaphore> SK_WARN_UNUSED_RESULT GrGLGpu::makeSemaphore() {
+    return GrGLSemaphore::Make(this);  // the new semaphore starts with a zero sync
+}
+
+void GrGLGpu::insertSemaphore(sk_sp<GrSemaphore> semaphore) {
+    GrGLSemaphore* glSem = static_cast<GrGLSemaphore*>(semaphore.get());
+    // Unchecked downcast: semaphore must be a non-null GrGLSemaphore.
+    GrGLsync sync;
+    GL_CALL_RET(sync, FenceSync(GR_GL_SYNC_GPU_COMMANDS_COMPLETE, 0));
+    glSem->setSync(sync);  // NOTE(review): overwrites any earlier sync without deleting it
+}
+
+void GrGLGpu::waitSemaphore(sk_sp<GrSemaphore> semaphore) {
+    GrGLSemaphore* glSem = static_cast<GrGLSemaphore*>(semaphore.get());
+    // NOTE(review): sync() is still 0 if insertSemaphore() never ran on this object -- confirm.
+    GL_CALL(WaitSync(glSem->sync(), 0, GR_GL_TIMEOUT_IGNORED));
+}
+
+void GrGLGpu::deleteSync(GrGLsync sync) const {
+    GL_CALL(DeleteSync(sync));
 }
 
 void GrGLGpu::flush() {
diff --git a/src/gpu/gl/GrGLGpu.h b/src/gpu/gl/GrGLGpu.h
index b6ca4f6..57068fb 100644
--- a/src/gpu/gl/GrGLGpu.h
+++ b/src/gpu/gl/GrGLGpu.h
@@ -146,10 +146,16 @@
 
     void finishOpList() override;
 
-    GrFence SK_WARN_UNUSED_RESULT insertFence() const override;
-    bool waitFence(GrFence, uint64_t timeout) const override;
+    GrFence SK_WARN_UNUSED_RESULT insertFence() override;
+    bool waitFence(GrFence, uint64_t timeout) override;
     void deleteFence(GrFence) const override;
 
+    sk_sp<GrSemaphore> SK_WARN_UNUSED_RESULT makeSemaphore() override;
+    void insertSemaphore(sk_sp<GrSemaphore> semaphore) override;
+    void waitSemaphore(sk_sp<GrSemaphore> semaphore) override;
+
+    void deleteSync(GrGLsync) const;
+
     void flush() override;
 
 private:
diff --git a/src/gpu/gl/GrGLInterface.cpp b/src/gpu/gl/GrGLInterface.cpp
index 3714dcb..a62d353 100644
--- a/src/gpu/gl/GrGLInterface.cpp
+++ b/src/gpu/gl/GrGLInterface.cpp
@@ -768,6 +768,7 @@
         if (glVer >= GR_GL_VER(3, 2) || fExtensions.has("GL_ARB_sync")) {
             if (nullptr == fFunctions.fFenceSync ||
                 nullptr == fFunctions.fClientWaitSync ||
+                nullptr == fFunctions.fWaitSync ||
                 nullptr == fFunctions.fDeleteSync) {
                 RETURN_FALSE_INTERFACE
             }
@@ -776,6 +777,7 @@
         if (glVer >= GR_GL_VER(3, 0)) {
             if (nullptr == fFunctions.fFenceSync ||
                 nullptr == fFunctions.fClientWaitSync ||
+                nullptr == fFunctions.fWaitSync ||
                 nullptr == fFunctions.fDeleteSync) {
                 RETURN_FALSE_INTERFACE
             }
diff --git a/src/gpu/gl/GrGLSemaphore.h b/src/gpu/gl/GrGLSemaphore.h
new file mode 100644
index 0000000..40dc53b
--- /dev/null
+++ b/src/gpu/gl/GrGLSemaphore.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrGLSemaphore_DEFINED
+#define GrGLSemaphore_DEFINED
+
+#include "GrSemaphore.h"
+
+#include "GrGLGpu.h"
+
+class GrGLSemaphore : public GrSemaphore {  // GrSemaphore that carries a GrGLsync.
+public:
+    static sk_sp<GrGLSemaphore> Make(const GrGLGpu* gpu) {
+        return sk_sp<GrGLSemaphore>(new GrGLSemaphore(gpu));
+    }
+
+    ~GrGLSemaphore() override {
+        if (fGpu) {  // skipped when the GrGpu pointer is null (see GrSemaphore::resetGpu)
+            static_cast<const GrGLGpu*>(fGpu)->deleteSync(fSync);
+        }
+    }
+
+    GrGLsync sync() const { return fSync; }
+    void setSync(const GrGLsync& sync) { fSync = sync; }  // previous fSync is not deleted here
+
+private:
+    GrGLSemaphore(const GrGLGpu* gpu) : INHERITED(gpu), fSync(0) {}
+
+    GrGLsync fSync;  // 0 until setSync() is called
+
+    typedef GrSemaphore INHERITED;
+};
+
+#endif
diff --git a/src/gpu/gl/GrGLTestInterface.cpp b/src/gpu/gl/GrGLTestInterface.cpp
index d871ef6..83fc663 100644
--- a/src/gpu/gl/GrGLTestInterface.cpp
+++ b/src/gpu/gl/GrGLTestInterface.cpp
@@ -314,6 +314,7 @@
     fFunctions.fTextureBuffer = bind_to_member(this, &GrGLTestInterface::textureBuffer);
     fFunctions.fFenceSync = bind_to_member(this, &GrGLTestInterface::fenceSync);
     fFunctions.fClientWaitSync = bind_to_member(this, &GrGLTestInterface::clientWaitSync);
+    fFunctions.fWaitSync = bind_to_member(this, &GrGLTestInterface::waitSync);
     fFunctions.fDeleteSync = bind_to_member(this, &GrGLTestInterface::deleteSync);
     fFunctions.fDebugMessageControl = bind_to_member(this, &GrGLTestInterface::debugMessageControl);
     fFunctions.fDebugMessageInsert = bind_to_member(this, &GrGLTestInterface::debugMessageInsert);
diff --git a/src/gpu/gl/GrGLTestInterface.h b/src/gpu/gl/GrGLTestInterface.h
index ef00df3..5918690 100644
--- a/src/gpu/gl/GrGLTestInterface.h
+++ b/src/gpu/gl/GrGLTestInterface.h
@@ -319,6 +319,7 @@
     virtual GrGLvoid textureBuffer(GrGLuint texture, GrGLenum target, GrGLenum internalformat, GrGLuint buffer) {}
     virtual GrGLsync fenceSync(GrGLenum condition, GrGLbitfield flags) { return nullptr;  }
     virtual GrGLenum clientWaitSync(GrGLsync sync, GrGLbitfield flags, GrGLuint64 timeout) { return GR_GL_WAIT_FAILED;  }
+    virtual GrGLvoid waitSync(GrGLsync sync, GrGLbitfield flags, GrGLuint64 timeout) {}
     virtual GrGLvoid deleteSync(GrGLsync sync) {}
     virtual GrGLvoid debugMessageControl(GrGLenum source, GrGLenum type, GrGLenum severity, GrGLsizei count, const GrGLuint* ids, GrGLboolean enabled) {}
     virtual GrGLvoid debugMessageInsert(GrGLenum source, GrGLenum type, GrGLuint id, GrGLenum severity, GrGLsizei length,  const GrGLchar* buf) {}
diff --git a/src/gpu/vk/GrVkCommandBuffer.cpp b/src/gpu/vk/GrVkCommandBuffer.cpp
index b1d20e2..64c1c88 100644
--- a/src/gpu/vk/GrVkCommandBuffer.cpp
+++ b/src/gpu/vk/GrVkCommandBuffer.cpp
@@ -398,9 +398,12 @@
     this->invalidateState();
 }
 
-void GrVkPrimaryCommandBuffer::submitToQueue(const GrVkGpu* gpu,
-                                             VkQueue queue,
-                                             GrVkGpu::SyncQueue sync) {
+void GrVkPrimaryCommandBuffer::submitToQueue(
+        const GrVkGpu* gpu,
+        VkQueue queue,
+        GrVkGpu::SyncQueue sync,
+        const GrVkSemaphore::Resource* signalSemaphore,
+        SkTArray<const GrVkSemaphore::Resource*>& waitSemaphores) {
     SkASSERT(!fIsActive);
 
     VkResult err;
@@ -415,17 +418,36 @@
         GR_VK_CALL(gpu->vkInterface(), ResetFences(gpu->device(), 1, &fSubmitFence));
     }
 
+    if (signalSemaphore) {
+        this->addResource(signalSemaphore);
+    }
+
+    int waitCount = waitSemaphores.count();
+    SkTArray<VkSemaphore> vkWaitSems(waitCount);
+    SkTArray<VkPipelineStageFlags> vkWaitStages(waitCount);
+    if (waitCount) {
+        for (int i = 0; i < waitCount; ++i) {
+            this->addResource(waitSemaphores[i]);
+            vkWaitSems.push_back(waitSemaphores[i]->semaphore());
+            vkWaitStages.push_back(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT);
+        }
+    }
+    SkTArray<VkSemaphore> vkSignalSem;
+    if (signalSemaphore) {
+        vkSignalSem.push_back(signalSemaphore->semaphore());
+    }
+
     VkSubmitInfo submitInfo;
     memset(&submitInfo, 0, sizeof(VkSubmitInfo));
     submitInfo.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO;
     submitInfo.pNext = nullptr;
-    submitInfo.waitSemaphoreCount = 0;
-    submitInfo.pWaitSemaphores = nullptr;
-    submitInfo.pWaitDstStageMask = 0;
+    submitInfo.waitSemaphoreCount = waitCount;
+    submitInfo.pWaitSemaphores = vkWaitSems.begin();
+    submitInfo.pWaitDstStageMask = vkWaitStages.begin();
     submitInfo.commandBufferCount = 1;
     submitInfo.pCommandBuffers = &fCmdBuffer;
-    submitInfo.signalSemaphoreCount = 0;
-    submitInfo.pSignalSemaphores = nullptr;
+    submitInfo.signalSemaphoreCount = vkSignalSem.count();
+    submitInfo.pSignalSemaphores = vkSignalSem.begin();
     GR_VK_CALL_ERRCHECK(gpu->vkInterface(), QueueSubmit(queue, 1, &submitInfo, fSubmitFence));
 
     if (GrVkGpu::kForce_SyncQueue == sync) {
diff --git a/src/gpu/vk/GrVkCommandBuffer.h b/src/gpu/vk/GrVkCommandBuffer.h
index e28df44..e156861 100644
--- a/src/gpu/vk/GrVkCommandBuffer.h
+++ b/src/gpu/vk/GrVkCommandBuffer.h
@@ -10,6 +10,7 @@
 
 #include "GrVkGpu.h"
 #include "GrVkResource.h"
+#include "GrVkSemaphore.h"
 #include "GrVkUtil.h"
 #include "vk/GrVkDefines.h"
 
@@ -298,7 +299,9 @@
                       uint32_t regionCount,
                       const VkImageResolve* regions);
 
-    void submitToQueue(const GrVkGpu* gpu, VkQueue queue, GrVkGpu::SyncQueue sync);
+    void submitToQueue(const GrVkGpu* gpu, VkQueue queue, GrVkGpu::SyncQueue sync,
+                       const GrVkSemaphore::Resource* signalSemaphore,
+                       SkTArray<const GrVkSemaphore::Resource*>& waitSemaphores);
     bool finished(const GrVkGpu* gpu) const;
 
 #ifdef SK_TRACE_VK_RESOURCES
diff --git a/src/gpu/vk/GrVkGpu.cpp b/src/gpu/vk/GrVkGpu.cpp
index 8569c93..b85aa2d 100644
--- a/src/gpu/vk/GrVkGpu.cpp
+++ b/src/gpu/vk/GrVkGpu.cpp
@@ -25,6 +25,7 @@
 #include "GrVkPipelineState.h"
 #include "GrVkRenderPass.h"
 #include "GrVkResourceProvider.h"
+#include "GrVkSemaphore.h"
 #include "GrVkTexture.h"
 #include "GrVkTextureRenderTarget.h"
 #include "GrVkTransferBuffer.h"
@@ -181,6 +182,11 @@
     SkASSERT(VK_SUCCESS == res || VK_ERROR_DEVICE_LOST == res);
 #endif
 
+    for (int i = 0; i < fSemaphoresToWaitOn.count(); ++i) {
+        fSemaphoresToWaitOn[i]->unref(this);
+    }
+    fSemaphoresToWaitOn.reset();
+
     fCopyManager.destroyResources(this);
 
     // must call this just before we destroy the command pool and VkDevice
@@ -206,11 +212,18 @@
     return new GrVkGpuCommandBuffer(this, colorInfo, stencilInfo);
 }
 
-void GrVkGpu::submitCommandBuffer(SyncQueue sync) {
+void GrVkGpu::submitCommandBuffer(SyncQueue sync,
+                                  const GrVkSemaphore::Resource* signalSemaphore) {
     SkASSERT(fCurrentCmdBuffer);
     fCurrentCmdBuffer->end(this);
 
-    fCurrentCmdBuffer->submitToQueue(this, fQueue, sync);
+    fCurrentCmdBuffer->submitToQueue(this, fQueue, sync, signalSemaphore, fSemaphoresToWaitOn);
+
+    for (int i = 0; i < fSemaphoresToWaitOn.count(); ++i) {
+        fSemaphoresToWaitOn[i]->unref(this);
+    }
+    fSemaphoresToWaitOn.reset();
+
     fResourceProvider.checkCommandBuffers();
 
     // Release old command buffer and create a new one
@@ -1832,32 +1845,48 @@
     this->didWriteToSurface(target, &bounds);
 }
 
-GrFence SK_WARN_UNUSED_RESULT GrVkGpu::insertFence() const {
+GrFence SK_WARN_UNUSED_RESULT GrVkGpu::insertFence() {
     VkFenceCreateInfo createInfo;
     memset(&createInfo, 0, sizeof(VkFenceCreateInfo));
     createInfo.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO;
     createInfo.pNext = nullptr;
     createInfo.flags = 0;
     VkFence fence = VK_NULL_HANDLE;
-    VkResult result = GR_VK_CALL(this->vkInterface(), CreateFence(this->device(), &createInfo,
-                                                                  nullptr, &fence));
-    // TODO: verify that all QueueSubmits before this will finish before this fence signals
-    if (VK_SUCCESS == result) {
-        GR_VK_CALL(this->vkInterface(), QueueSubmit(this->queue(), 0, nullptr, fence));
-    }
+
+    VK_CALL_ERRCHECK(CreateFence(this->device(), &createInfo, nullptr, &fence));
+    VK_CALL(QueueSubmit(this->queue(), 0, nullptr, fence));
+
+    GR_STATIC_ASSERT(sizeof(GrFence) >= sizeof(VkFence));
     return (GrFence)fence;
 }
 
-bool GrVkGpu::waitFence(GrFence fence, uint64_t timeout) const {
-    VkResult result = GR_VK_CALL(this->vkInterface(), WaitForFences(this->device(), 1,
-                                                                    (VkFence*)&fence,
-                                                                    VK_TRUE,
-                                                                    timeout));
+bool GrVkGpu::waitFence(GrFence fence, uint64_t timeout) {
+    SkASSERT(VK_NULL_HANDLE != (VkFence)fence);
+
+    VkResult result = VK_CALL(WaitForFences(this->device(), 1, (VkFence*)&fence, VK_TRUE, timeout));
     return (VK_SUCCESS == result);
 }
 
 void GrVkGpu::deleteFence(GrFence fence) const {
-    GR_VK_CALL(this->vkInterface(), DestroyFence(this->device(), (VkFence)fence, nullptr));
+    VK_CALL(DestroyFence(this->device(), (VkFence)fence, nullptr));
+}
+
+sk_sp<GrSemaphore> SK_WARN_UNUSED_RESULT GrVkGpu::makeSemaphore() {
+    return GrVkSemaphore::Make(this);  // Make() calls CreateSemaphore on this gpu's device
+}
+
+void GrVkGpu::insertSemaphore(sk_sp<GrSemaphore> semaphore) {
+    GrVkSemaphore* vkSem = static_cast<GrVkSemaphore*>(semaphore.get());
+    // Submits the current command buffer now, passing the semaphore as the one to signal.
+    this->submitCommandBuffer(kSkip_SyncQueue, vkSem->getResource());
+}
+
+void GrVkGpu::waitSemaphore(sk_sp<GrSemaphore> semaphore) {
+    GrVkSemaphore* vkSem = static_cast<GrVkSemaphore*>(semaphore.get());
+    // Nothing is submitted here; the resource is queued for the next submitCommandBuffer().
+    const GrVkSemaphore::Resource* resource = vkSem->getResource();
+    resource->ref();  // balanced by the unref() loop over fSemaphoresToWaitOn
+    fSemaphoresToWaitOn.push_back(resource);
 }
 
 void GrVkGpu::flush() {
diff --git a/src/gpu/vk/GrVkGpu.h b/src/gpu/vk/GrVkGpu.h
index c935945..7203bf1 100644
--- a/src/gpu/vk/GrVkGpu.h
+++ b/src/gpu/vk/GrVkGpu.h
@@ -16,6 +16,7 @@
 #include "GrVkIndexBuffer.h"
 #include "GrVkMemory.h"
 #include "GrVkResourceProvider.h"
+#include "GrVkSemaphore.h"
 #include "GrVkVertexBuffer.h"
 #include "GrVkUtil.h"
 #include "vk/GrVkDefines.h"
@@ -129,10 +130,14 @@
 
     void finishOpList() override;
 
-    GrFence SK_WARN_UNUSED_RESULT insertFence() const override;
-    bool waitFence(GrFence, uint64_t timeout) const override;
+    GrFence SK_WARN_UNUSED_RESULT insertFence() override;
+    bool waitFence(GrFence, uint64_t timeout) override;
     void deleteFence(GrFence) const override;
 
+    sk_sp<GrSemaphore> SK_WARN_UNUSED_RESULT makeSemaphore() override;
+    void insertSemaphore(sk_sp<GrSemaphore> semaphore) override;
+    void waitSemaphore(sk_sp<GrSemaphore> semaphore) override;
+
     void flush() override;
 
     void generateMipmap(GrVkTexture* tex);
@@ -202,8 +207,12 @@
 
     // Ends and submits the current command buffer to the queue and then creates a new command
     // buffer and begins it. If sync is set to kForce_SyncQueue, the function will wait for all
-    // work in the queue to finish before returning.
-    void submitCommandBuffer(SyncQueue sync);
+    // work in the queue to finish before returning. If the signalSemaphore is not nullptr,
+    // we will signal the semaphore at the end of this command buffer. If this GrVkGpu object has
+    // any semaphores in fSemaphoresToWaitOn, we will add those wait semaphores to this command
+    // buffer when submitting.
+    void submitCommandBuffer(SyncQueue sync,
+                             const GrVkSemaphore::Resource* signalSemaphore = nullptr);
 
     void internalResolveRenderTarget(GrRenderTarget* target, bool requiresSubmit);
 
@@ -247,18 +256,22 @@
 
     // These Vulkan objects are provided by the client, and also stored in fBackendContext.
     // They're copied here for convenient access.
-    VkDevice                               fDevice;
-    VkQueue                                fQueue;    // Must be Graphics queue
+    VkDevice                                     fDevice;
+    VkQueue                                      fQueue;    // Must be Graphics queue
 
     // Created by GrVkGpu
-    GrVkResourceProvider                   fResourceProvider;
-    VkCommandPool                          fCmdPool;
-    GrVkPrimaryCommandBuffer*              fCurrentCmdBuffer;
-    VkPhysicalDeviceMemoryProperties       fPhysDevMemProps;
+    GrVkResourceProvider                         fResourceProvider;
+    VkCommandPool                                fCmdPool;
 
-    std::unique_ptr<GrVkHeap>              fHeaps[kHeapCount];
+    GrVkPrimaryCommandBuffer*                    fCurrentCmdBuffer;
 
-    GrVkCopyManager                        fCopyManager;
+    SkSTArray<1, const GrVkSemaphore::Resource*> fSemaphoresToWaitOn;
+
+    VkPhysicalDeviceMemoryProperties             fPhysDevMemProps;
+
+    std::unique_ptr<GrVkHeap>                    fHeaps[kHeapCount];
+
+    GrVkCopyManager                              fCopyManager;
 
 #ifdef SK_ENABLE_VK_LAYERS
     // For reporting validation layer errors
diff --git a/src/gpu/vk/GrVkSemaphore.cpp b/src/gpu/vk/GrVkSemaphore.cpp
new file mode 100644
index 0000000..d84635f
--- /dev/null
+++ b/src/gpu/vk/GrVkSemaphore.cpp
@@ -0,0 +1,47 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#include "GrVkSemaphore.h"
+
+#include "GrVkGpu.h"
+#include "GrVkUtil.h"
+
+#ifdef VK_USE_PLATFORM_WIN32_KHR
+// windows wants to define this as CreateSemaphoreA or CreateSemaphoreW
+#undef CreateSemaphore
+#endif
+
+sk_sp<GrVkSemaphore> GrVkSemaphore::Make(const GrVkGpu* gpu) {
+    VkSemaphoreCreateInfo createInfo;
+    memset(&createInfo, 0, sizeof(VkSemaphoreCreateInfo));  // size by the struct being cleared
+    createInfo.sType = VK_STRUCTURE_TYPE_SEMAPHORE_CREATE_INFO;
+    createInfo.pNext = nullptr;
+    createInfo.flags = 0;
+    VkSemaphore semaphore = VK_NULL_HANDLE;
+    GR_VK_CALL_ERRCHECK(gpu->vkInterface(),
+                        CreateSemaphore(gpu->device(), &createInfo, nullptr, &semaphore));
+    // Hand the raw handle to a GrVkSemaphore, whose Resource destroys it in freeGPUData().
+    return sk_sp<GrVkSemaphore>(new GrVkSemaphore(gpu, semaphore));
+}
+
+GrVkSemaphore::GrVkSemaphore(const GrVkGpu* gpu, VkSemaphore semaphore) : INHERITED(gpu) {
+    fResource = new Resource(semaphore);  // released in ~GrVkSemaphore()
+}
+
+GrVkSemaphore::~GrVkSemaphore() {
+    if (fGpu) {
+        fResource->unref(static_cast<const GrVkGpu*>(fGpu));
+    } else {  // no GrGpu set (see GrSemaphore::resetGpu), so drop the ref without a gpu
+        fResource->unrefAndAbandon();
+    }
+}
+
+void GrVkSemaphore::Resource::freeGPUData(const GrVkGpu* gpu) const {
+    GR_VK_CALL(gpu->vkInterface(),  // destroys the wrapped VkSemaphore on gpu's device
+               DestroySemaphore(gpu->device(), fSemaphore, nullptr));
+}
+
diff --git a/src/gpu/vk/GrVkSemaphore.h b/src/gpu/vk/GrVkSemaphore.h
new file mode 100644
index 0000000..0a3bc17
--- /dev/null
+++ b/src/gpu/vk/GrVkSemaphore.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright 2017 Google Inc.
+ *
+ * Use of this source code is governed by a BSD-style license that can be
+ * found in the LICENSE file.
+ */
+
+#ifndef GrVkSemaphore_DEFINED
+#define GrVkSemaphore_DEFINED
+
+#include "GrSemaphore.h"
+#include "GrVkResource.h"
+
+#include "vk/GrVkTypes.h"
+
+class GrVkGpu;
+
+class GrVkSemaphore : public GrSemaphore {  // GrSemaphore backed by a VkSemaphore.
+public:
+    static sk_sp<GrVkSemaphore> Make(const GrVkGpu* gpu);
+
+    ~GrVkSemaphore() override;
+
+    class Resource : public GrVkResource {  // Holds the VkSemaphore handle.
+    public:
+        Resource(VkSemaphore semaphore) : INHERITED(), fSemaphore(semaphore) {}
+
+        ~Resource() override {}
+
+        VkSemaphore semaphore() const { return fSemaphore; }
+
+#ifdef SK_TRACE_VK_RESOURCES
+        void dumpInfo() const override {  // NOTE(review): %d assumes an int-sized handle -- confirm
+            SkDebugf("GrVkSemaphore: %d (%d refs)\n", fSemaphore, this->getRefCnt());
+        }
+#endif
+    private:
+        void freeGPUData(const GrVkGpu* gpu) const override;  // calls DestroySemaphore
+
+        VkSemaphore fSemaphore;
+
+        typedef GrVkResource INHERITED;
+    };
+
+    const Resource* getResource() const { return fResource; }  // no ref is added for the caller
+
+private:
+    GrVkSemaphore(const GrVkGpu* gpu, VkSemaphore semaphore);
+
+    const Resource* fResource;  // allocated in the constructor, unref'd in the destructor
+
+    typedef GrSemaphore INHERITED;
+};
+
+#endif