Fix alignment issues with mapped Vulkan memory.

For non-coherent allocations, the offset and size of a mapped range must be
aligned to VkPhysicalDeviceLimits::nonCoherentAtomSize: align allocation
offsets in the allocator, pad map sizes, and flush/invalidate with
VK_WHOLE_SIZE.

Bug: skia:
Change-Id: Ida9813fe774580a6d157b8eb8d330488c8e8c4bc
Reviewed-on: https://skia-review.googlesource.com/109483
Commit-Queue: Greg Daniel <egdaniel@google.com>
Reviewed-by: Jim Van Verth <jvanverth@google.com>
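
Every hunk below applies the same arithmetic: round a map offset down, and a
map size up, to VkPhysicalDeviceLimits::nonCoherentAtomSize. A minimal sketch
of that arithmetic, assuming the atom size is a power of two (which this CL
asserts via SkIsPow2); alignDown/alignUp are hypothetical names, not Skia API:

    #include <vulkan/vulkan.h>

    // Only valid when atomSize is a power of two: clearing the low bits rounds
    // down, and adding atomSize - 1 first makes the same mask round up.
    static inline VkDeviceSize alignDown(VkDeviceSize v, VkDeviceSize atomSize) {
        return v & ~(atomSize - 1);
    }
    static inline VkDeviceSize alignUp(VkDeviceSize v, VkDeviceSize atomSize) {
        return (v + atomSize - 1) & ~(atomSize - 1);
    }

For example, with an atom size of 64, an offset of 100 rounds down to 64 (a
difference of 36 that the mapped pointer is later bumped forward by) and a
size of 100 rounds up to 128.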
diff --git a/src/gpu/vk/GrVkBuffer.cpp b/src/gpu/vk/GrVkBuffer.cpp
index 5aa3fb0..64f2630 100644
--- a/src/gpu/vk/GrVkBuffer.cpp
+++ b/src/gpu/vk/GrVkBuffer.cpp
@@ -169,6 +169,21 @@
 
     if (fDesc.fDynamic) {
         const GrVkAlloc& alloc = this->alloc();
+        SkASSERT(alloc.fSize > 0);
+
+        // For non-coherent memory we want to make sure both the offset and size of the range we
+        // map are aligned to the nonCoherentAtomSize limit. The offset should already have been
+        // aligned by our memory allocator, so here we only pad the size out to that limit.
+        if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) {
+            // Currently the internal offset is always 0.
+            SkASSERT(0 == fOffset);
+            VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize;
+            SkASSERT(0 == (alloc.fOffset & (alignment - 1)));
+
+            // Round the map size up to a multiple of nonCoherentAtomSize.
+            size = (size + alignment - 1) & ~(alignment - 1);
+        }
+        SkASSERT(size + fOffset <= alloc.fSize);
         VkResult err = VK_CALL(gpu, MapMemory(gpu->device(), alloc.fMemory,
                                               alloc.fOffset + fOffset,
                                               size, 0, &fMapPtr));
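
In the buffer path above only the size needs padding: the allocator change in
GrVkMemory.cpp below guarantees that alloc.fOffset is already atom-aligned, and
the internal fOffset is asserted to be 0. A sketch of the resulting map call,
using raw vkMapMemory rather than Skia's VK_CALL wrapper (gpu, alloc, and size
stand in for the locals in the hunk):

    VkDeviceSize atom = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize;
    VkDeviceSize paddedSize = (size + atom - 1) & ~(atom - 1);  // round size up
    void* mapPtr = nullptr;
    // alloc.fOffset is already a multiple of atom, so only the size changes.
    VkResult err = vkMapMemory(gpu->device(), alloc.fMemory, alloc.fOffset,
                               paddedSize, 0, &mapPtr);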
diff --git a/src/gpu/vk/GrVkGpu.cpp b/src/gpu/vk/GrVkGpu.cpp
index 2ef765d..c5a03f9 100644
--- a/src/gpu/vk/GrVkGpu.cpp
+++ b/src/gpu/vk/GrVkGpu.cpp
@@ -130,6 +130,7 @@
                                fBackendContext->fFeatures, fBackendContext->fExtensions));
     fCaps.reset(SkRef(fVkCaps.get()));
 
+    VK_CALL(GetPhysicalDeviceProperties(fBackendContext->fPhysicalDevice, &fPhysDevProps));
     VK_CALL(GetPhysicalDeviceMemoryProperties(fBackendContext->fPhysicalDevice, &fPhysDevMemProps));
 
     const VkCommandPoolCreateInfo cmdPoolInfo = {
@@ -578,12 +579,27 @@
     int texTop = kBottomLeft_GrSurfaceOrigin == texOrigin ? tex->height() - top - height : top;
     const GrVkAlloc& alloc = tex->alloc();
     VkDeviceSize offset = alloc.fOffset + texTop*layout.rowPitch + left*bpp;
+    VkDeviceSize offsetDiff = 0;
     VkDeviceSize size = height*layout.rowPitch;
+    // For non-coherent memory we want to make sure both the offset and size of the range we map
+    // are aligned to the nonCoherentAtomSize limit. We may have to move the initial offset back
+    // to meet the alignment requirement; we track how far back we moved and then adjust the
+    // mapped ptr forward by that amount so the adjustment is opaque to the caller.
+    if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) {
+        VkDeviceSize alignment = this->physicalDeviceProperties().limits.nonCoherentAtomSize;
+        offsetDiff = offset & (alignment - 1);
+        offset = offset - offsetDiff;
+        // Round the map size up to a multiple of nonCoherentAtomSize.
+        size = (size + alignment - 1) & ~(alignment - 1);
+    }
+    SkASSERT(offset >= alloc.fOffset);
+    SkASSERT(offset + size <= alloc.fOffset + alloc.fSize);
     void* mapPtr;
     err = GR_VK_CALL(interface, MapMemory(fDevice, alloc.fMemory, offset, size, 0, &mapPtr));
     if (err) {
         return false;
     }
+    mapPtr = reinterpret_cast<char*>(mapPtr) + offsetDiff;
 
     if (kBottomLeft_GrSurfaceOrigin == texOrigin) {
         // copy into buffer by rows
@@ -1108,13 +1124,30 @@
 
 bool copy_testing_data(GrVkGpu* gpu, void* srcData, const GrVkAlloc& alloc, size_t bufferOffset,
                        size_t srcRowBytes, size_t dstRowBytes, int h) {
+    // For non-coherent memory we want to make sure both the offset and size of the range we map
+    // are aligned to the nonCoherentAtomSize limit. We may have to move the initial offset back
+    // to meet the alignment requirement; we track how far back we moved and then adjust the
+    // mapped ptr forward by that amount so the adjustment is opaque to the caller.
+    VkDeviceSize mapSize = dstRowBytes * h;
+    VkDeviceSize mapOffset = alloc.fOffset + bufferOffset;
+    VkDeviceSize offsetDiff = 0;
+    if (SkToBool(alloc.fFlags & GrVkAlloc::kNoncoherent_Flag)) {
+        VkDeviceSize alignment = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize;
+        offsetDiff = mapOffset & (alignment - 1);
+        mapOffset = mapOffset - offsetDiff;
+        // Round the map size up to a multiple of nonCoherentAtomSize.
+        mapSize = (mapSize + alignment - 1) & ~(alignment - 1);
+    }
+    SkASSERT(mapOffset >= alloc.fOffset);
+    SkASSERT(mapSize + mapOffset <= alloc.fOffset + alloc.fSize);
     void* mapPtr;
     VkResult err = GR_VK_CALL(gpu->vkInterface(), MapMemory(gpu->device(),
                                                             alloc.fMemory,
-                                                            alloc.fOffset + bufferOffset,
-                                                            dstRowBytes * h,
+                                                            mapOffset,
+                                                            mapSize,
                                                             0,
                                                             &mapPtr));
    if (err) {
        return false;
    }
+    mapPtr = reinterpret_cast<char*>(mapPtr) + offsetDiff;
@@ -1179,7 +1212,7 @@
     }
 
     VkImage image = VK_NULL_HANDLE;
-    GrVkAlloc alloc = { VK_NULL_HANDLE, 0, 0, 0 };
+    GrVkAlloc alloc;
 
     VkImageTiling imageTiling = linearTiling ? VK_IMAGE_TILING_LINEAR : VK_IMAGE_TILING_OPTIMAL;
     VkImageLayout initialLayout = (VK_IMAGE_TILING_LINEAR == imageTiling)
@@ -1224,7 +1257,7 @@
     }
 
     // We need to declare these early so that we can delete them at the end outside of the if block.
-    GrVkAlloc bufferAlloc = { VK_NULL_HANDLE, 0, 0, 0 };
+    GrVkAlloc bufferAlloc;
     VkBuffer buffer = VK_NULL_HANDLE;
 
     VkResult err;
@@ -1978,8 +2011,8 @@
     // We need to submit the current command buffer to the Queue and make sure it finishes before
     // we can copy the data out of the buffer.
     this->submitCommandBuffer(kForce_SyncQueue);
-    GrVkMemory::InvalidateMappedAlloc(this, transferBuffer->alloc());
     void* mappedMemory = transferBuffer->map();
+    GrVkMemory::InvalidateMappedAlloc(this, transferBuffer->alloc());
 
     if (copyFromOrigin) {
         uint32_t skipRows = region.imageExtent.height - height;
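
Both GrVkGpu hunks use the same round-down-and-fix-up idiom. A condensed
sketch with raw vkMapMemory in place of GR_VK_CALL (device, memory, offset,
and size are stand-ins for the surrounding locals):

    VkDeviceSize atom = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize;
    VkDeviceSize offsetDiff = offset & (atom - 1);             // bytes past the last atom boundary
    VkDeviceSize mapOffset  = offset - offsetDiff;             // round the start down
    VkDeviceSize mapSize    = (size + atom - 1) & ~(atom - 1); // round the length up
    void* mapPtr = nullptr;
    if (vkMapMemory(device, memory, mapOffset, mapSize, 0, &mapPtr) == VK_SUCCESS) {
        // Hand back a pointer to the bytes the caller asked for, hiding the fix-up.
        mapPtr = reinterpret_cast<char*>(mapPtr) + offsetDiff;
    }

The fix-up must come after a successful map, since mapPtr is undefined when
vkMapMemory fails.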
diff --git a/src/gpu/vk/GrVkGpu.h b/src/gpu/vk/GrVkGpu.h
index 3833c5f..0b52147f 100644
--- a/src/gpu/vk/GrVkGpu.h
+++ b/src/gpu/vk/GrVkGpu.h
@@ -51,6 +51,9 @@
     VkDevice device() const { return fDevice; }
     VkQueue  queue() const { return fQueue; }
     VkCommandPool cmdPool() const { return fCmdPool; }
+    VkPhysicalDeviceProperties physicalDeviceProperties() const {
+        return fPhysDevProps;
+    }
     VkPhysicalDeviceMemoryProperties physicalDeviceMemoryProperties() const {
         return fPhysDevMemProps;
     }
@@ -253,6 +256,7 @@
     SkSTArray<1, GrVkSemaphore::Resource*>       fSemaphoresToWaitOn;
     SkSTArray<1, GrVkSemaphore::Resource*>       fSemaphoresToSignal;
 
+    VkPhysicalDeviceProperties                   fPhysDevProps;
     VkPhysicalDeviceMemoryProperties             fPhysDevMemProps;
 
     std::unique_ptr<GrVkHeap>                    fHeaps[kHeapCount];
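
Caching the full VkPhysicalDeviceProperties struct at construction lets the
hot map paths read the limit without re-querying the driver, e.g.
(illustrative usage):

    VkDeviceSize atom = gpu->physicalDeviceProperties().limits.nonCoherentAtomSize;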
diff --git a/src/gpu/vk/GrVkMemory.cpp b/src/gpu/vk/GrVkMemory.cpp
index a90533e..e27e260 100644
--- a/src/gpu/vk/GrVkMemory.cpp
+++ b/src/gpu/vk/GrVkMemory.cpp
@@ -68,6 +68,7 @@
     uint32_t typeIndex = 0;
     uint32_t heapIndex = 0;
     const VkPhysicalDeviceMemoryProperties& phDevMemProps = gpu->physicalDeviceMemoryProperties();
+    const VkPhysicalDeviceProperties& phDevProps = gpu->physicalDeviceProperties();
     if (dynamic) {
         // try to get cached and ideally non-coherent memory first
         if (!get_valid_memory_type_index(phDevMemProps,
@@ -87,6 +88,11 @@
         VkMemoryPropertyFlags mpf = phDevMemProps.memoryTypes[typeIndex].propertyFlags;
         alloc->fFlags = mpf & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT ? 0x0
                                                                    : GrVkAlloc::kNoncoherent_Flag;
+        if (SkToBool(alloc->fFlags & GrVkAlloc::kNoncoherent_Flag)) {
+            SkASSERT(SkIsPow2(memReqs.alignment));
+            SkASSERT(SkIsPow2(phDevProps.limits.nonCoherentAtomSize));
+            memReqs.alignment = SkTMax(memReqs.alignment, phDevProps.limits.nonCoherentAtomSize);
+        }
     } else {
         // device-local memory should always be available for static buffers
         SkASSERT_RELEASE(get_valid_memory_type_index(phDevMemProps,
@@ -293,7 +299,7 @@
         mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
         mappedMemoryRange.memory = alloc.fMemory;
         mappedMemoryRange.offset = alloc.fOffset;
-        mappedMemoryRange.size = alloc.fSize;
+        mappedMemoryRange.size = VK_WHOLE_SIZE; // extends to the end of the current mapping
         GR_VK_CALL(gpu->vkInterface(), FlushMappedMemoryRanges(gpu->device(),
                                                                1, &mappedMemoryRange));
     }
@@ -306,7 +312,7 @@
         mappedMemoryRange.sType = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
         mappedMemoryRange.memory = alloc.fMemory;
         mappedMemoryRange.offset = alloc.fOffset;
-        mappedMemoryRange.size = alloc.fSize;
+        mappedMemoryRange.size = VK_WHOLE_SIZE; // extends to the end of the current mapping
         GR_VK_CALL(gpu->vkInterface(), InvalidateMappedMemoryRanges(gpu->device(),
                                                                1, &mappedMemoryRange));
     }
@@ -519,7 +525,7 @@
         VkMemoryAllocateInfo allocInfo = {
             VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO,      // sType
             nullptr,                                     // pNext
-            size,                                        // allocationSize
+            alignedSize,                                 // allocationSize
             memoryTypeIndex,                             // memoryTypeIndex
         };
 
@@ -531,7 +537,8 @@
             return false;
         }
         alloc->fOffset = 0;
-        alloc->fSize = 0;    // hint that this is not a subheap allocation
+        alloc->fSize = alignedSize;
+        alloc->fUsesSystemHeap = true;
 #ifdef SK_DEBUG
         gHeapUsage[VK_MAX_MEMORY_HEAPS] += alignedSize;
 #endif
@@ -624,7 +631,7 @@
 
 bool GrVkHeap::free(const GrVkAlloc& alloc) {
-    // a size of 0 means we're using the system heap
-    if (0 == alloc.fSize) {
+    // fUsesSystemHeap means the memory was allocated directly from the system heap
+    if (alloc.fUsesSystemHeap) {
         const GrVkInterface* iface = fGpu->vkInterface();
         GR_VK_CALL(iface, FreeMemory(fGpu->device(), alloc.fMemory, nullptr));
         return true;
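
Two details in this file are easy to miss. Bumping memReqs.alignment with
SkTMax is enough because both operands are powers of two, so the larger is
automatically a multiple of the smaller. And the Vulkan spec requires a
VkMappedMemoryRange passed to vkFlushMappedMemoryRanges or
vkInvalidateMappedMemoryRanges to have an atom-aligned offset and a size that
is an atom multiple, reaches the end of the allocation, or is VK_WHOLE_SIZE,
with the whole range inside the current mapping; that is why the map calls
above pad their offset and size too. A sketch of a conforming flush (raw
Vulkan, no GR_VK_CALL wrapper):

    VkMappedMemoryRange range = {};
    range.sType  = VK_STRUCTURE_TYPE_MAPPED_MEMORY_RANGE;
    range.memory = alloc.fMemory;
    range.offset = alloc.fOffset;  // atom-aligned by the allocator change above
    range.size   = VK_WHOLE_SIZE;  // extends to the end of the current mapping
    vkFlushMappedMemoryRanges(device, 1, &range);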
diff --git a/src/gpu/vk/GrVkMemory.h b/src/gpu/vk/GrVkMemory.h
index a8f3771..88cc47b 100644
--- a/src/gpu/vk/GrVkMemory.h
+++ b/src/gpu/vk/GrVkMemory.h
@@ -141,6 +141,7 @@
     bool alloc(VkDeviceSize size, VkDeviceSize alignment, uint32_t memoryTypeIndex,
                uint32_t heapIndex, GrVkAlloc* alloc) {
         SkASSERT(size > 0);
+        alloc->fUsesSystemHeap = false;
         return (*this.*fAllocFunc)(size, alignment, memoryTypeIndex, heapIndex, alloc);
     }
     bool free(const GrVkAlloc& alloc);