ocfs2: serialize unaligned aio

Fix a corruption that can happen when we have (two or more) outstanding
aio's to an overlapping unaligned region.  Ext4
(e9e3bcecf44c04b9e6b505fd8e2eb9cea58fb94d) and xfs recently had to fix
similar issues.

In our case what happens is that we can have an outstanding aio on a region
and if a write comes in with some bytes overlapping the original aio we may
decide to read that region into a page before continuing (typically because
of buffered-io fallback).  Since we have no ordering guarantees with the
aio, we can read stale or bad data into the page and then write it back out.

If the i/o is page and block aligned, then we avoid this issue as there
won't be any need to read data from disk.

I took the same approach as Eric in the ext4 patch and introduced some
serialization of unaligned async direct i/o.  I don't expect this to have an
effect on the most common cases of AIO.  Unaligned aio will be slower
though, but that's far more acceptable than data corruption.

Signed-off-by: Mark Fasheh <mfasheh@suse.com>
Signed-off-by: Joel Becker <jlbec@evilplan.org>
diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c
index b1e35a3..145f453 100644
--- a/fs/ocfs2/file.c
+++ b/fs/ocfs2/file.c
@@ -2038,6 +2038,23 @@
 	return ret;
 }
 
+static void ocfs2_aiodio_wait(struct inode *inode)
+{
+	wait_queue_head_t *wq = ocfs2_ioend_wq(inode);
+
+	wait_event(*wq, (atomic_read(&OCFS2_I(inode)->ip_unaligned_aio) == 0));
+}
+
+static int ocfs2_is_io_unaligned(struct inode *inode, size_t count, loff_t pos)
+{
+	int blockmask = inode->i_sb->s_blocksize - 1;
+	loff_t final_size = pos + count;
+
+	if ((pos & blockmask) || (final_size & blockmask))
+		return 1;
+	return 0;
+}
+
 static int ocfs2_prepare_inode_for_refcount(struct inode *inode,
 					    struct file *file,
 					    loff_t pos, size_t count,
@@ -2216,6 +2233,7 @@
 	struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
 	int full_coherency = !(osb->s_mount_opt &
 			       OCFS2_MOUNT_COHERENCY_BUFFERED);
+	int unaligned_dio = 0;
 
 	trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry,
 		(unsigned long long)OCFS2_I(inode)->ip_blkno,
@@ -2284,6 +2302,10 @@
 		goto out;
 	}
 
+	if (direct_io && !is_sync_kiocb(iocb))
+		unaligned_dio = ocfs2_is_io_unaligned(inode, iocb->ki_left,
+						      *ppos);
+
 	/*
 	 * We can't complete the direct I/O as requested, fall back to
 	 * buffered I/O.
@@ -2299,6 +2321,18 @@
 		goto relock;
 	}
 
+	if (unaligned_dio) {
+		/*
+		 * Wait on previous unaligned aio to complete before
+		 * proceeding.
+		 */
+		ocfs2_aiodio_wait(inode);
+
+		/* Mark the iocb as needing a decrement in ocfs2_dio_end_io */
+		atomic_inc(&OCFS2_I(inode)->ip_unaligned_aio);
+		ocfs2_iocb_set_unaligned_aio(iocb);
+	}
+
 	/*
 	 * To later detect whether a journal commit for sync writes is
 	 * necessary, we sample i_size, and cluster count here.
@@ -2371,8 +2405,12 @@
 	if ((ret == -EIOCBQUEUED) || (!ocfs2_iocb_is_rw_locked(iocb))) {
 		rw_level = -1;
 		have_alloc_sem = 0;
+		unaligned_dio = 0;
 	}
 
+	if (unaligned_dio)
+		atomic_dec(&OCFS2_I(inode)->ip_unaligned_aio);
+
 out:
 	if (rw_level != -1)
 		ocfs2_rw_unlock(inode, rw_level);