dax: fix race between simultaneous faults If two threads write-fault on the same hole at the same time, the winner of the race will return to userspace and complete their store, only to have the loser overwrite their store with zeroes. Fix this for now by taking the i_mmap_sem for write instead of read, and do so outside the call to get_block(). Now the loser of the race will see the block has already been zeroed, and will not zero it again. This severely limits our scalability. I have ideas for improving it, but those can wait for a later patch. Signed-off-by: Matthew Wilcox <willy@linux.intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>

commit: 843172978bb92997310d2f7fbc172ece423cfc02 [log] [tgz]
author: Matthew Wilcox <willy@linux.intel.com> Tue Sep 08 14:59:25 2015 -0700
committer: Linus Torvalds <torvalds@linux-foundation.org> Tue Sep 08 15:35:28 2015 -0700
tree: 3dd6214e78238293b1ac4612e7fc35775b567da0
parent: 01a33b4ace68bc35679a347f21d5ed6e222e30dc [diff] [blame]
diff --git a/fs/dax.c b/fs/dax.c
index c694117..9593f4b 100644
--- a/fs/dax.c
+++ b/fs/dax.c

@@ -272,7 +272,6 @@
 static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh,
 			struct vm_area_struct *vma, struct vm_fault *vmf)
 {
-	struct address_space *mapping = inode->i_mapping;
 	sector_t sector = bh->b_blocknr << (inode->i_blkbits - 9);
 	unsigned long vaddr = (unsigned long)vmf->virtual_address;
 	void *addr;
@@ -280,8 +279,6 @@
 	pgoff_t size;
 	int error;
 
-	i_mmap_lock_read(mapping);
-
 	/*
 	 * Check truncate didn't happen while we were allocating a block.
 	 * If it did, this block may or may not be still allocated to the
@@ -309,8 +306,6 @@
 	error = vm_insert_mixed(vma, vaddr, pfn);
 
  out:
-	i_mmap_unlock_read(mapping);
-
 	return error;
 }
 
@@ -372,15 +367,17 @@
 			 * from a read fault and we've raced with a truncate
 			 */
 			error = -EIO;
-			goto unlock_page;
+			goto unlock;
 		}
+	} else {
+		i_mmap_lock_write(mapping);
 	}
 
 	error = get_block(inode, block, &bh, 0);
 	if (!error && (bh.b_size < PAGE_SIZE))
 		error = -EIO;		/* fs corruption? */
 	if (error)
-		goto unlock_page;
+		goto unlock;
 
 	if (!buffer_mapped(&bh) && !buffer_unwritten(&bh) && !vmf->cow_page) {
 		if (vmf->flags & FAULT_FLAG_WRITE) {
@@ -391,8 +388,9 @@
 			if (!error && (bh.b_size < PAGE_SIZE))
 				error = -EIO;
 			if (error)
-				goto unlock_page;
+				goto unlock;
 		} else {
+			i_mmap_unlock_write(mapping);
 			return dax_load_hole(mapping, page, vmf);
 		}
 	}
@@ -404,17 +402,15 @@
 		else
 			clear_user_highpage(new_page, vaddr);
 		if (error)
-			goto unlock_page;
+			goto unlock;
 		vmf->page = page;
 		if (!page) {
-			i_mmap_lock_read(mapping);
 			/* Check we didn't race with truncate */
 			size = (i_size_read(inode) + PAGE_SIZE - 1) >>
 								PAGE_SHIFT;
 			if (vmf->pgoff >= size) {
-				i_mmap_unlock_read(mapping);
 				error = -EIO;
-				goto out;
+				goto unlock;
 			}
 		}
 		return VM_FAULT_LOCKED;
@@ -450,6 +446,8 @@
 			WARN_ON_ONCE(!(vmf->flags & FAULT_FLAG_WRITE));
 	}
 
+	if (!page)
+		i_mmap_unlock_write(mapping);
  out:
 	if (error == -ENOMEM)
 		return VM_FAULT_OOM | major;
@@ -458,11 +456,14 @@
 		return VM_FAULT_SIGBUS | major;
 	return VM_FAULT_NOPAGE | major;
 
- unlock_page:
+ unlock:
 	if (page) {
 		unlock_page(page);
 		page_cache_release(page);
+	} else {
+		i_mmap_unlock_write(mapping);
 	}
+
 	goto out;
 }
 EXPORT_SYMBOL(__dax_fault);
@@ -540,10 +541,10 @@
 	block = (sector_t)pgoff << (PAGE_SHIFT - blkbits);
 
 	bh.b_size = PMD_SIZE;
+	i_mmap_lock_write(mapping);
 	length = get_block(inode, block, &bh, write);
 	if (length)
 		return VM_FAULT_SIGBUS;
-	i_mmap_lock_read(mapping);
 
 	/*
 	 * If the filesystem isn't willing to tell us the length of a hole,
@@ -607,11 +608,11 @@
 	}
 
  out:
-	i_mmap_unlock_read(mapping);
-
 	if (buffer_unwritten(&bh))
 		complete_unwritten(&bh, !(result & VM_FAULT_ERROR));
 
+	i_mmap_unlock_write(mapping);
+
 	return result;
 
  fallback:
commit	843172978bb92997310d2f7fbc172ece423cfc02	[log] [tgz]
author	Matthew Wilcox <willy@linux.intel.com>	Tue Sep 08 14:59:25 2015 -0700
committer	Linus Torvalds <torvalds@linux-foundation.org>	Tue Sep 08 15:35:28 2015 -0700
tree	3dd6214e78238293b1ac4612e7fc35775b567da0
parent	01a33b4ace68bc35679a347f21d5ed6e222e30dc [diff] [blame]