Vulkan: Implement occlusion queries

Begin and end queries insert an execution barrier in the command graph
to ensure the commands around them are not reordered w.r.t to the query.

Also, these commands cannot be recorded in separate secondary command
buffers.  Therefore, special-function nodes are created to perform the
begin and end query calls on the final primary command buffer.

Bug: angleproject:2855
Change-Id: Ie216dfdd6a2009deaaf744fd15d2db6899dd93e9
Reviewed-on: https://chromium-review.googlesource.com/c/1259762
Reviewed-by: Geoff Lang <geofflang@chromium.org>
Commit-Queue: Shahbaz Youssefi <syoussefi@chromium.org>
diff --git a/src/libANGLE/renderer/vulkan/CommandGraph.cpp b/src/libANGLE/renderer/vulkan/CommandGraph.cpp
index 84b3c31..fa90703 100644
--- a/src/libANGLE/renderer/vulkan/CommandGraph.cpp
+++ b/src/libANGLE/renderer/vulkan/CommandGraph.cpp
@@ -57,6 +57,8 @@
             return "Framebuffer";
         case CommandGraphResourceType::Image:
             return "Image";
+        case CommandGraphResourceType::Query:
+            return "Query";
         default:
             UNREACHABLE();
             return "";
@@ -89,6 +91,13 @@
     return renderer->isSerialInUse(mStoredQueueSerial);
 }
 
+bool CommandGraphResource::hasPendingWork(RendererVk *renderer) const
+{
+    // If the renderer has a queue serial higher than the stored one, the command buffers recorded
+    // by this resource have already been submitted, so there is no pending work.
+    return mStoredQueueSerial == renderer->getCurrentQueueSerial();
+}
+
 Serial CommandGraphResource::getStoredQueueSerial() const
 {
     return mStoredQueueSerial;
@@ -101,7 +110,7 @@
 
     if (!hasChildlessWritingNode() || hasStartedRenderPass())
     {
-        finishCurrentCommands(context->getRenderer());
+        startNewCommands(context->getRenderer(), CommandGraphNodeFunction::Generic);
         return mCurrentWritingNode->beginOutsideRenderPassRecording(
             context, context->getRenderer()->getCommandPool(), commandBufferOut);
     }
@@ -146,8 +155,14 @@
                                                     const gl::Rectangle &renderArea,
                                                     const RenderPassDesc &renderPassDesc,
                                                     const std::vector<VkClearValue> &clearValues,
-                                                    CommandBuffer **commandBufferOut) const
+                                                    CommandBuffer **commandBufferOut)
 {
+    // If a barrier has been inserted in the meantime, stop the command buffer.
+    if (!hasChildlessWritingNode())
+    {
+        startNewCommands(context->getRenderer(), CommandGraphNodeFunction::Generic);
+    }
+
     // Hard-code RenderPass to clear the first render target to the current clear value.
     // TODO(jmadill): Proper clear value implementation. http://anglebug.com/2361
     mCurrentWritingNode->storeRenderPassInfo(framebuffer, renderArea, renderPassDesc, clearValues);
@@ -155,9 +170,31 @@
     return mCurrentWritingNode->beginInsideRenderPassRecording(context, commandBufferOut);
 }
 
+void CommandGraphResource::beginQuery(Context *context,
+                                      const QueryPool *queryPool,
+                                      uint32_t queryIndex)
+{
+    startNewCommands(context->getRenderer(), CommandGraphNodeFunction::BeginQuery);
+    mCurrentWritingNode->setQueryPool(queryPool, queryIndex);
+}
+
+void CommandGraphResource::endQuery(Context *context,
+                                    const QueryPool *queryPool,
+                                    uint32_t queryIndex)
+{
+    startNewCommands(context->getRenderer(), CommandGraphNodeFunction::EndQuery);
+    mCurrentWritingNode->setQueryPool(queryPool, queryIndex);
+}
+
 void CommandGraphResource::finishCurrentCommands(RendererVk *renderer)
 {
-    CommandGraphNode *newCommands = renderer->getCommandGraph()->allocateNode();
+    startNewCommands(renderer, CommandGraphNodeFunction::Generic);
+}
+
+void CommandGraphResource::startNewCommands(RendererVk *renderer, CommandGraphNodeFunction function)
+{
+    bool isBarrier                = function != CommandGraphNodeFunction::Generic;
+    CommandGraphNode *newCommands = renderer->getCommandGraph()->allocateNode(isBarrier, function);
     newCommands->setDiagnosticInfo(mResourceType, reinterpret_cast<uintptr_t>(this));
     onWriteImpl(newCommands, renderer->getCurrentQueueSerial());
 }
@@ -177,7 +214,8 @@
     // Make sure any open reads and writes finish before we execute 'writingNode'.
     if (!mCurrentReadingNodes.empty())
     {
-        CommandGraphNode::SetHappensBeforeDependencies(mCurrentReadingNodes, writingNode);
+        CommandGraphNode::SetHappensBeforeDependencies(mCurrentReadingNodes.data(),
+                                                       mCurrentReadingNodes.size(), writingNode);
         mCurrentReadingNodes.clear();
     }
 
@@ -207,8 +245,13 @@
 }
 
 // CommandGraphNode implementation.
-CommandGraphNode::CommandGraphNode()
-    : mRenderPassClearValues{}, mHasChildren(false), mVisitedState(VisitedState::Unvisited)
+CommandGraphNode::CommandGraphNode(CommandGraphNodeFunction function)
+    : mRenderPassClearValues{},
+      mFunction(function),
+      mQueryPool(VK_NULL_HANDLE),
+      mQueryIndex(0),
+      mHasChildren(false),
+      mVisitedState(VisitedState::Unvisited)
 {
 }
 
@@ -238,7 +281,8 @@
     inheritanceInfo.renderPass           = VK_NULL_HANDLE;
     inheritanceInfo.subpass              = 0;
     inheritanceInfo.framebuffer          = VK_NULL_HANDLE;
-    inheritanceInfo.occlusionQueryEnable = VK_FALSE;
+    inheritanceInfo.occlusionQueryEnable =
+        context->getRenderer()->getPhysicalDeviceFeatures().inheritedQueries;
     inheritanceInfo.queryFlags           = 0;
     inheritanceInfo.pipelineStatistics   = 0;
 
@@ -255,7 +299,7 @@
     ASSERT(!mHasChildren);
 
     // Get a compatible RenderPass from the cache so we can initialize the inheritance info.
-    // TODO(jmadill): Support query for compatible/conformant render pass. htto://anglebug.com/2361
+    // TODO(jmadill): Support query for compatible/conformant render pass. http://anglebug.com/2361
     RenderPass *compatibleRenderPass;
     ANGLE_TRY(context->getRenderer()->getCompatibleRenderPass(context, mRenderPassDesc,
                                                               &compatibleRenderPass));
@@ -265,7 +309,8 @@
     inheritanceInfo.renderPass           = compatibleRenderPass->getHandle();
     inheritanceInfo.subpass              = 0;
     inheritanceInfo.framebuffer          = mRenderPassFramebuffer.getHandle();
-    inheritanceInfo.occlusionQueryEnable = VK_FALSE;
+    inheritanceInfo.occlusionQueryEnable =
+        context->getRenderer()->getPhysicalDeviceFeatures().inheritedQueries;
     inheritanceInfo.queryFlags           = 0;
     inheritanceInfo.pipelineStatistics   = 0;
 
@@ -298,18 +343,29 @@
 }
 
 // static
-void CommandGraphNode::SetHappensBeforeDependencies(
-    const std::vector<CommandGraphNode *> &beforeNodes,
-    CommandGraphNode *afterNode)
+void CommandGraphNode::SetHappensBeforeDependencies(CommandGraphNode **beforeNodes,
+                                                    size_t beforeNodesCount,
+                                                    CommandGraphNode *afterNode)
 {
-    afterNode->mParents.insert(afterNode->mParents.end(), beforeNodes.begin(), beforeNodes.end());
+    afterNode->mParents.insert(afterNode->mParents.end(), beforeNodes,
+                               beforeNodes + beforeNodesCount);
 
     // TODO(jmadill): is there a faster way to do this?
-    for (CommandGraphNode *beforeNode : beforeNodes)
+    for (size_t i = 0; i < beforeNodesCount; ++i)
     {
-        beforeNode->setHasChildren();
+        beforeNodes[i]->setHasChildren();
 
-        ASSERT(beforeNode != afterNode && !beforeNode->isChildOf(afterNode));
+        ASSERT(beforeNodes[i] != afterNode && !beforeNodes[i]->isChildOf(afterNode));
+    }
+}
+
+void CommandGraphNode::SetHappensBeforeDependencies(CommandGraphNode *beforeNode,
+                                                    CommandGraphNode **afterNodes,
+                                                    size_t afterNodesCount)
+{
+    for (size_t i = 0; i < afterNodesCount; ++i)
+    {
+        SetHappensBeforeDependency(beforeNode, afterNodes[i]);
     }
 }
 
@@ -318,6 +374,14 @@
     return !mParents.empty();
 }
 
+void CommandGraphNode::setQueryPool(const QueryPool *queryPool, uint32_t queryIndex)
+{
+    ASSERT(mFunction == CommandGraphNodeFunction::BeginQuery ||
+           mFunction == CommandGraphNodeFunction::EndQuery);
+    mQueryPool  = queryPool->getHandle();
+    mQueryIndex = queryIndex;
+}
+
 void CommandGraphNode::setHasChildren()
 {
     mHasChildren = true;
@@ -364,37 +428,66 @@
                                                 RenderPassCache *renderPassCache,
                                                 CommandBuffer *primaryCommandBuffer)
 {
-    if (mOutsideRenderPassCommands.valid())
+    switch (mFunction)
     {
-        ANGLE_TRY(mOutsideRenderPassCommands.end(context));
-        primaryCommandBuffer->executeCommands(1, &mOutsideRenderPassCommands);
-    }
+        case CommandGraphNodeFunction::Generic:
+            ASSERT(mQueryPool == VK_NULL_HANDLE);
 
-    if (mInsideRenderPassCommands.valid())
-    {
-        // Pull a compatible RenderPass from the cache.
-        // TODO(jmadill): Insert real ops and layout transitions.
-        RenderPass *renderPass = nullptr;
-        ANGLE_TRY(renderPassCache->getCompatibleRenderPass(context, serial, mRenderPassDesc,
-                                                           &renderPass));
+            if (mOutsideRenderPassCommands.valid())
+            {
+                ANGLE_TRY(mOutsideRenderPassCommands.end(context));
+                primaryCommandBuffer->executeCommands(1, &mOutsideRenderPassCommands);
+            }
 
-        ANGLE_TRY(mInsideRenderPassCommands.end(context));
+            if (mInsideRenderPassCommands.valid())
+            {
+                // Pull a compatible RenderPass from the cache.
+                // TODO(jmadill): Insert real ops and layout transitions.
+                RenderPass *renderPass = nullptr;
+                ANGLE_TRY(renderPassCache->getCompatibleRenderPass(context, serial, mRenderPassDesc,
+                                                                   &renderPass));
 
-        VkRenderPassBeginInfo beginInfo    = {};
-        beginInfo.sType                    = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
-        beginInfo.renderPass               = renderPass->getHandle();
-        beginInfo.framebuffer              = mRenderPassFramebuffer.getHandle();
-        beginInfo.renderArea.offset.x      = static_cast<uint32_t>(mRenderPassRenderArea.x);
-        beginInfo.renderArea.offset.y      = static_cast<uint32_t>(mRenderPassRenderArea.y);
-        beginInfo.renderArea.extent.width  = static_cast<uint32_t>(mRenderPassRenderArea.width);
-        beginInfo.renderArea.extent.height = static_cast<uint32_t>(mRenderPassRenderArea.height);
-        beginInfo.clearValueCount          = mRenderPassDesc.attachmentCount();
-        beginInfo.pClearValues             = mRenderPassClearValues.data();
+                ANGLE_TRY(mInsideRenderPassCommands.end(context));
 
-        primaryCommandBuffer->beginRenderPass(beginInfo,
-                                              VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS);
-        primaryCommandBuffer->executeCommands(1, &mInsideRenderPassCommands);
-        primaryCommandBuffer->endRenderPass();
+                VkRenderPassBeginInfo beginInfo = {};
+                beginInfo.sType                 = VK_STRUCTURE_TYPE_RENDER_PASS_BEGIN_INFO;
+                beginInfo.renderPass            = renderPass->getHandle();
+                beginInfo.framebuffer           = mRenderPassFramebuffer.getHandle();
+                beginInfo.renderArea.offset.x   = static_cast<uint32_t>(mRenderPassRenderArea.x);
+                beginInfo.renderArea.offset.y   = static_cast<uint32_t>(mRenderPassRenderArea.y);
+                beginInfo.renderArea.extent.width =
+                    static_cast<uint32_t>(mRenderPassRenderArea.width);
+                beginInfo.renderArea.extent.height =
+                    static_cast<uint32_t>(mRenderPassRenderArea.height);
+                beginInfo.clearValueCount = mRenderPassDesc.attachmentCount();
+                beginInfo.pClearValues    = mRenderPassClearValues.data();
+
+                primaryCommandBuffer->beginRenderPass(
+                    beginInfo, VK_SUBPASS_CONTENTS_SECONDARY_COMMAND_BUFFERS);
+                primaryCommandBuffer->executeCommands(1, &mInsideRenderPassCommands);
+                primaryCommandBuffer->endRenderPass();
+            }
+            break;
+
+        case CommandGraphNodeFunction::BeginQuery:
+            ASSERT(!mOutsideRenderPassCommands.valid() && !mInsideRenderPassCommands.valid());
+            ASSERT(mQueryPool != VK_NULL_HANDLE);
+
+            primaryCommandBuffer->resetQueryPool(mQueryPool, mQueryIndex, 1);
+            primaryCommandBuffer->beginQuery(mQueryPool, mQueryIndex, 0);
+
+            break;
+
+        case CommandGraphNodeFunction::EndQuery:
+            ASSERT(!mOutsideRenderPassCommands.valid() && !mInsideRenderPassCommands.valid());
+            ASSERT(mQueryPool != VK_NULL_HANDLE);
+
+            primaryCommandBuffer->endQuery(mQueryPool, mQueryIndex);
+
+            break;
+
+        default:
+            UNREACHABLE();
     }
 
     mVisitedState = VisitedState::Visited;
@@ -420,7 +513,7 @@
 
 // CommandGraph implementation.
 CommandGraph::CommandGraph(bool enableGraphDiagnostics)
-    : mEnableGraphDiagnostics(enableGraphDiagnostics)
+    : mEnableGraphDiagnostics(enableGraphDiagnostics), mLastBarrierIndex(kInvalidNodeIndex)
 {
 }
 
@@ -429,20 +522,56 @@
     ASSERT(empty());
 }
 
-CommandGraphNode *CommandGraph::allocateNode()
+CommandGraphNode *CommandGraph::allocateNode(bool isBarrier, CommandGraphNodeFunction function)
 {
     // TODO(jmadill): Use a pool allocator for the CPU node allocations.
-    CommandGraphNode *newCommands = new CommandGraphNode();
+    CommandGraphNode *newCommands = new CommandGraphNode(function);
     mNodes.emplace_back(newCommands);
+
+    if (isBarrier)
+    {
+        setNewBarrier(newCommands);
+    }
+
     return newCommands;
 }
 
+void CommandGraph::setNewBarrier(CommandGraphNode *newBarrier)
+{
+    size_t previousBarrierIndex       = 0;
+    CommandGraphNode *previousBarrier = getLastBarrierNode(&previousBarrierIndex);
+
+    // Add a dependency from previousBarrier to all nodes in (previousBarrier, newBarrier].
+    if (previousBarrier && previousBarrierIndex + 1 < mNodes.size())
+    {
+        size_t afterNodesCount = mNodes.size() - (previousBarrierIndex + 1);
+        CommandGraphNode::SetHappensBeforeDependencies(
+            previousBarrier, &mNodes[previousBarrierIndex + 1], afterNodesCount);
+    }
+
+    // Add a dependency from all nodes in (previousBarrier, newBarrier) to newBarrier.
+    addDependenciesToNextBarrier(previousBarrierIndex + 1, mNodes.size() - 1, newBarrier);
+
+    mLastBarrierIndex = mNodes.size() - 1;
+}
+
 angle::Result CommandGraph::submitCommands(Context *context,
                                            Serial serial,
                                            RenderPassCache *renderPassCache,
                                            CommandPool *commandPool,
                                            CommandBuffer *primaryCommandBufferOut)
 {
+    size_t previousBarrierIndex       = 0;
+    CommandGraphNode *previousBarrier = getLastBarrierNode(&previousBarrierIndex);
+
+    // Add a dependency from previousBarrier to all nodes in (previousBarrier, end].
+    if (previousBarrier && previousBarrierIndex + 1 < mNodes.size())
+    {
+        size_t afterNodesCount = mNodes.size() - (previousBarrierIndex + 1);
+        CommandGraphNode::SetHappensBeforeDependencies(
+            previousBarrier, &mNodes[previousBarrierIndex + 1], afterNodesCount);
+    }
+
     VkCommandBufferAllocateInfo primaryInfo = {};
     primaryInfo.sType              = VK_STRUCTURE_TYPE_COMMAND_BUFFER_ALLOCATE_INFO;
     primaryInfo.commandPool        = commandPool->getHandle();
@@ -511,6 +640,7 @@
         delete node;
     }
     mNodes.clear();
+    mLastBarrierIndex = kInvalidNodeIndex;
 
     return angle::Result::Continue();
 }
@@ -537,6 +667,7 @@
     int bufferIDCounter      = 1;
     int framebufferIDCounter = 1;
     int imageIDCounter       = 1;
+    int queryIDCounter       = 1;
 
     out << "digraph {" << std::endl;
 
@@ -568,6 +699,9 @@
                 case CommandGraphResourceType::Image:
                     id = imageIDCounter++;
                     break;
+                case CommandGraphResourceType::Query:
+                    id = queryIDCounter++;
+                    break;
                 default:
                     UNREACHABLE();
                     break;
@@ -595,5 +729,27 @@
 
     out << "}" << std::endl;
 }
+
+CommandGraphNode *CommandGraph::getLastBarrierNode(size_t *indexOut)
+{
+    *indexOut = mLastBarrierIndex == kInvalidNodeIndex ? 0 : mLastBarrierIndex;
+    return mLastBarrierIndex == kInvalidNodeIndex ? nullptr : mNodes[mLastBarrierIndex];
+}
+
+void CommandGraph::addDependenciesToNextBarrier(size_t begin,
+                                                size_t end,
+                                                CommandGraphNode *nextBarrier)
+{
+    for (size_t i = begin; i < end; ++i)
+    {
+        // As a small optimization, only add edges to childless nodes.  The others have an
+        // indirect dependency.
+        if (!mNodes[i]->hasChildren())
+        {
+            CommandGraphNode::SetHappensBeforeDependency(mNodes[i], nextBarrier);
+        }
+    }
+}
+
 }  // namespace vk
 }  // namespace rx