Support use of non-coherent memory allocations in Vulkan.

BUG=skia:5034
GOLD_TRYBOT_URL= https://gold.skia.org/search?issue=2348523002

Review-Url: https://codereview.chromium.org/2348523002
diff --git a/src/gpu/vk/GrVkBuffer.cpp b/src/gpu/vk/GrVkBuffer.cpp
index b8efb39..82674b4 100644
--- a/src/gpu/vk/GrVkBuffer.cpp
+++ b/src/gpu/vk/GrVkBuffer.cpp
@@ -180,6 +180,7 @@
     SkASSERT(this->vkIsMapped());
 
     if (fDesc.fDynamic) {
+        GrVkMemory::FlushMappedAlloc(gpu, this->alloc());
         VK_CALL(gpu, UnmapMemory(gpu->device(), this->alloc().fMemory));
         fMapPtr = nullptr;
     } else {
diff --git a/src/gpu/vk/GrVkGpu.cpp b/src/gpu/vk/GrVkGpu.cpp
index 4d410a7..a892e1b 100644
--- a/src/gpu/vk/GrVkGpu.cpp
+++ b/src/gpu/vk/GrVkGpu.cpp
@@ -494,6 +494,7 @@
         }
     }
 
+    GrVkMemory::FlushMappedAlloc(this, alloc);
     GR_VK_CALL(interface, UnmapMemory(fDevice, alloc.fMemory));
 
     return true;
@@ -606,6 +607,7 @@
         currentHeight = SkTMax(1, currentHeight/2);
     }
 
+    // No need to flush non-coherent memory here; unmap() will do that for us.
     transferBuffer->unmap();
 
     // Change layout of our target so it can be copied to
@@ -963,12 +965,12 @@
 
 ////////////////////////////////////////////////////////////////////////////////
 
-bool copy_testing_data(GrVkGpu* gpu, void* srcData, GrVkAlloc* alloc,
+bool copy_testing_data(GrVkGpu* gpu, void* srcData, const GrVkAlloc& alloc,
                        size_t srcRowBytes, size_t dstRowBytes, int h) {
     void* mapPtr;
     VkResult err = GR_VK_CALL(gpu->vkInterface(), MapMemory(gpu->device(),
-                                                            alloc->fMemory,
-                                                            alloc->fOffset,
+                                                            alloc.fMemory,
+                                                            alloc.fOffset,
                                                             dstRowBytes * h,
                                                             0,
                                                             &mapPtr));
@@ -984,7 +986,8 @@
         SkRectMemcpy(mapPtr, static_cast<size_t>(dstRowBytes), srcData, srcRowBytes,
                      srcRowBytes, h);
     }
-    GR_VK_CALL(gpu->vkInterface(), UnmapMemory(gpu->device(), alloc->fMemory));
+    GrVkMemory::FlushMappedAlloc(gpu, alloc);
+    GR_VK_CALL(gpu->vkInterface(), UnmapMemory(gpu->device(), alloc.fMemory));
     return true;
 }
 
@@ -1019,7 +1022,7 @@
     }
 
     VkImage image = VK_NULL_HANDLE;
-    GrVkAlloc alloc = { VK_NULL_HANDLE, 0, 0 };
+    GrVkAlloc alloc = { VK_NULL_HANDLE, 0, 0, 0 };
 
     VkImageTiling imageTiling = linearTiling ? VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
     VkImageLayout initialLayout = (VK_IMAGE_TILING_LINEAR == imageTiling)
@@ -1070,7 +1073,7 @@
 
             VK_CALL(GetImageSubresourceLayout(fDevice, image, &subres, &layout));
 
-            if (!copy_testing_data(this, srcData, &alloc, rowCopyBytes,
+            if (!copy_testing_data(this, srcData, alloc, rowCopyBytes,
                                    static_cast<size_t>(layout.rowPitch), h)) {
                 GrVkMemory::FreeImageMemory(this, linearTiling, alloc);
                 VK_CALL(DestroyImage(fDevice, image, nullptr));
@@ -1098,7 +1101,7 @@
                 return 0;
             }
 
-            GrVkAlloc bufferAlloc = { VK_NULL_HANDLE, 0, 0 };
+            GrVkAlloc bufferAlloc = { VK_NULL_HANDLE, 0, 0, 0 };
             if (!GrVkMemory::AllocAndBindBufferMemory(this, buffer, GrVkBuffer::kCopyRead_Type,
                                                       true, &bufferAlloc)) {
                 GrVkMemory::FreeImageMemory(this, linearTiling, alloc);
@@ -1107,7 +1110,7 @@
                 return 0;
             }
 
-            if (!copy_testing_data(this, srcData, &bufferAlloc, rowCopyBytes, rowCopyBytes, h)) {
+            if (!copy_testing_data(this, srcData, bufferAlloc, rowCopyBytes, rowCopyBytes, h)) {
                 GrVkMemory::FreeImageMemory(this, linearTiling, alloc);
                 VK_CALL(DestroyImage(fDevice, image, nullptr));
                 GrVkMemory::FreeBufferMemory(this, GrVkBuffer::kCopyRead_Type, bufferAlloc);
@@ -1756,7 +1759,7 @@
     // We need to submit the current command buffer to the Queue and make sure it finishes before
     // we can copy the data out of the buffer.
     this->submitCommandBuffer(kForce_SyncQueue);
-
+    GrVkMemory::InvalidateMappedAlloc(this, transferBuffer->alloc());
     void* mappedMemory = transferBuffer->map();
 
     if (copyFromOrigin) {
diff --git a/src/gpu/vk/GrVkMemory.cpp b/src/gpu/vk/GrVkMemory.cpp
index 48bea9c..98b2f89 100644
--- a/src/gpu/vk/GrVkMemory.cpp
+++ b/src/gpu/vk/GrVkMemory.cpp
@@ -10,13 +10,12 @@
 #include "GrVkGpu.h"
 #include "GrVkUtil.h"
 
-static bool get_valid_memory_type_index(VkPhysicalDeviceMemoryProperties physDevMemProps,
+static bool get_valid_memory_type_index(const VkPhysicalDeviceMemoryProperties& physDevMemProps,
                                         uint32_t typeBits,
                                         VkMemoryPropertyFlags requestedMemFlags,
                                         uint32_t* typeIndex) {
-    uint32_t checkBit = 1;
-    for (uint32_t i = 0; i < 32; ++i) {
-        if (typeBits & checkBit) {
+    for (uint32_t i = 0; i < physDevMemProps.memoryTypeCount; ++i) {
+        if (typeBits & (1 << i)) {
             uint32_t supportedFlags = physDevMemProps.memoryTypes[i].propertyFlags &
                                       requestedMemFlags;
             if (supportedFlags == requestedMemFlags) {
@@ -24,7 +23,6 @@
                 return true;
             }
         }
-        checkBit <<= 1;
     }
     return false;
 }
@@ -57,21 +55,32 @@
     VkMemoryRequirements memReqs;
     GR_VK_CALL(iface, GetBufferMemoryRequirements(device, buffer, &memReqs));
 
-    VkMemoryPropertyFlags desiredMemProps = dynamic ? VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                                      VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
-                                                      VK_MEMORY_PROPERTY_HOST_CACHED_BIT
-                                                    : VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT;
     uint32_t typeIndex = 0;
-    if (!get_valid_memory_type_index(gpu->physicalDeviceMemoryProperties(),
-                                     memReqs.memoryTypeBits,
-                                     desiredMemProps,
-                                     &typeIndex)) {
-        // this memory type should always be available
-        SkASSERT_RELEASE(get_valid_memory_type_index(gpu->physicalDeviceMemoryProperties(),
+    const VkPhysicalDeviceMemoryProperties& phDevMemProps = gpu->physicalDeviceMemoryProperties();
+    if (dynamic) {
+        // try to get cached and ideally non-coherent memory first
+        if (!get_valid_memory_type_index(phDevMemProps,
+                                         memReqs.memoryTypeBits,
+                                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
+                                         VK_MEMORY_PROPERTY_HOST_CACHED_BIT,
+                                         &typeIndex)) {
+            // some sort of host-visible memory type should always be available for dynamic buffers
+            SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps,
+                                                         memReqs.memoryTypeBits,
+                                                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
+                                                         &typeIndex));
+        }
+
+        VkMemoryPropertyFlags mpf = phDevMemProps.memoryTypes[typeIndex].propertyFlags;
+        alloc->fFlags = mpf & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT ? 0x0
+                                                                   : GrVkAlloc::kNoncoherent_Flag;
+    } else {
+        // device-local memory should always be available for static buffers
+        SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps,
                                                      memReqs.memoryTypeBits,
-                                                     VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                                     VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+                                                     VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                                      &typeIndex));
+        alloc->fFlags = 0x0;
     }
 
     GrVkHeap* heap = gpu->getHeap(buffer_type_to_heap(type));
@@ -81,7 +90,7 @@
         return false;
     }
 
-    // Bind Memory to device
+    // Bind buffer
     VkResult err = GR_VK_CALL(iface, BindBufferMemory(device, buffer,
                                                       alloc->fMemory, alloc->fOffset));
     if (err) {
@@ -122,25 +131,27 @@
 
     uint32_t typeIndex = 0;
     GrVkHeap* heap;
+    const VkPhysicalDeviceMemoryProperties& phDevMemProps = gpu->physicalDeviceMemoryProperties();
     if (linearTiling) {
         VkMemoryPropertyFlags desiredMemProps = VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                                VK_MEMORY_PROPERTY_HOST_COHERENT_BIT |
                                                 VK_MEMORY_PROPERTY_HOST_CACHED_BIT;
-        if (!get_valid_memory_type_index(gpu->physicalDeviceMemoryProperties(),
+        if (!get_valid_memory_type_index(phDevMemProps,
                                          memReqs.memoryTypeBits,
                                          desiredMemProps,
                                          &typeIndex)) {
-            // this memory type should always be available
-            SkASSERT_RELEASE(get_valid_memory_type_index(gpu->physicalDeviceMemoryProperties(),
+            // some sort of host-visible memory type should always be available
+            SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps,
                                                          memReqs.memoryTypeBits,
-                                                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT |
-                                                         VK_MEMORY_PROPERTY_HOST_COHERENT_BIT,
+                                                         VK_MEMORY_PROPERTY_HOST_VISIBLE_BIT,
                                                          &typeIndex));
         }
         heap = gpu->getHeap(GrVkGpu::kLinearImage_Heap);
+        VkMemoryPropertyFlags mpf = phDevMemProps.memoryTypes[typeIndex].propertyFlags;
+        alloc->fFlags = mpf & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT ? 0x0
+                                                                   : GrVkAlloc::kNoncoherent_Flag;
     } else {
         // this memory type should always be available
-        SkASSERT_RELEASE(get_valid_memory_type_index(gpu->physicalDeviceMemoryProperties(),
+        SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps,
                                                      memReqs.memoryTypeBits,
                                                      VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT,
                                                      &typeIndex));
@@ -149,6 +160,7 @@
         } else {
             heap = gpu->getHeap(GrVkGpu::kOptimalImage_Heap);
         }
+        alloc->fFlags = 0x0;
     }
 
     if (!heap->alloc(memReqs.size, memReqs.alignment, typeIndex, alloc)) {
@@ -156,7 +168,7 @@
         return false;
     }
 
-    // Bind Memory to device
+    // Bind image
     VkResult err = GR_VK_CALL(iface, BindImageMemory(device, image,
                               alloc->fMemory, alloc->fOffset));
     if (err) {
@@ -244,6 +256,32 @@
     return flags;
 }
 
+void GrVkMemory::FlushMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc) {
+    if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) {  // no-op unless flagged non-coherent
+        VkMappedMemoryRange mappedMemoryRange;  // the single range passed to the flush call
+        memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange));  // zero unset fields
+        mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+        mappedMemoryRange.memory = alloc.fMemory;
+        mappedMemoryRange.offset = alloc.fOffset;  // same offset callers pass to MapMemory
+        mappedMemoryRange.size = alloc.fSize;  // NOTE(review): confirm <= mapped range
+        GR_VK_CALL(gpu->vkInterface(), FlushMappedMemoryRanges(gpu->device(),  // result unused
+                                                               1, &mappedMemoryRange));
+    }
+}
+
+void GrVkMemory::InvalidateMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc) {
+    if (alloc.fFlags & GrVkAlloc::kNoncoherent_Flag) {  // no-op unless flagged non-coherent
+        VkMappedMemoryRange mappedMemoryRange;  // NOTE(review): confirm fMemory is mapped by now
+        memset(&mappedMemoryRange, 0, sizeof(VkMappedMemoryRange));  // zero unset fields
+        mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
+        mappedMemoryRange.memory = alloc.fMemory;
+        mappedMemoryRange.offset = alloc.fOffset;  // same offset callers pass to MapMemory
+        mappedMemoryRange.size = alloc.fSize;  // NOTE(review): confirm <= mapped range
+        GR_VK_CALL(gpu->vkInterface(), InvalidateMappedMemoryRanges(gpu->device(),  // result unused
+                                                               1, &mappedMemoryRange));
+    }
+}
+
 bool GrVkFreeListAlloc::alloc(VkDeviceSize requestedSize,
                               VkDeviceSize* allocOffset, VkDeviceSize* allocSize) {
     VkDeviceSize alignedSize = align_size(requestedSize, fAlignment);
diff --git a/src/gpu/vk/GrVkMemory.h b/src/gpu/vk/GrVkMemory.h
index 644d7d4..f8d5fdf 100644
--- a/src/gpu/vk/GrVkMemory.h
+++ b/src/gpu/vk/GrVkMemory.h
@@ -37,6 +37,9 @@
     VkPipelineStageFlags LayoutToPipelineStageFlags(const VkImageLayout layout);
 
     VkAccessFlags LayoutToSrcAccessMask(const VkImageLayout layout);
+
+    void FlushMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc);
+    void InvalidateMappedAlloc(const GrVkGpu* gpu, const GrVkAlloc& alloc);
 }
 
 class GrVkFreeListAlloc {