xfs: Non-blocking inode locking in IO completion The introduction of barriers to loop devices has created a new IO order completion dependency that XFS does not handle. The loop device implements barriers using fsync and so turns a log IO in the XFS filesystem on the loop device into a data IO in the backing filesystem. That is, the completion of log IOs in the loop filesystem are now dependent on completion of data IO in the backing filesystem. This can cause deadlocks when a flush daemon issues a log force with an inode locked because the IO completion of IO on the inode is blocked by the inode lock. This in turn prevents further data IO completion from occuring on all XFS filesystems on that CPU (due to the shared nature of the completion queues). This then prevents the log IO from completing because the log is waiting for data IO completion as well. The fix for this new completion order dependency issue is to make the IO completion inode locking non-blocking. If the inode lock can't be grabbed, simply requeue the IO completion back to the work queue so that it can be processed later. This prevents the completion queue from being blocked and allows data IO completion on other inodes to proceed, hence avoiding completion order dependent deadlocks. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de> Signed-off-by: Alex Elder <aelder@sgi.com>

commit: 77d7a0c2eeb285c9069e15396703d0cb9690ac50 [log] [tgz]
author: Dave Chinner <david@fromorbit.com> Wed Feb 17 05:36:29 2010 +0000
committer: Alex Elder <aelder@sgi.com> Mon Mar 01 16:34:52 2010 -0600
tree: 22de501446dd5ba08581b04616408f90449f7211
parent: 66d834ea603d61bd90fedad90300ca91c5bba0a3 [diff]
diff --git a/fs/xfs/linux-2.6/xfs_aops.c b/fs/xfs/linux-2.6/xfs_aops.c
index ce369a8..b493c63 100644
--- a/fs/xfs/linux-2.6/xfs_aops.c
+++ b/fs/xfs/linux-2.6/xfs_aops.c

@@ -163,14 +163,17 @@
 }
 
 /*
- * Update on-disk file size now that data has been written to disk.
- * The current in-memory file size is i_size.  If a write is beyond
- * eof i_new_size will be the intended file size until i_size is
- * updated.  If this write does not extend all the way to the valid
- * file size then restrict this update to the end of the write.
+ * Update on-disk file size now that data has been written to disk.  The
+ * current in-memory file size is i_size.  If a write is beyond eof i_new_size
+ * will be the intended file size until i_size is updated.  If this write does
+ * not extend all the way to the valid file size then restrict this update to
+ * the end of the write.
+ *
+ * This function does not block as blocking on the inode lock in IO completion
+ * can lead to IO completion order dependency deadlocks.. If it can't get the
+ * inode ilock it will return EAGAIN. Callers must handle this.
  */
-
-STATIC void
+STATIC int
 xfs_setfilesize(
 	xfs_ioend_t		*ioend)
 {
@@ -181,9 +184,11 @@
 	ASSERT(ioend->io_type != IOMAP_READ);
 
 	if (unlikely(ioend->io_error))
-		return;
+		return 0;
 
-	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	if (!xfs_ilock_nowait(ip, XFS_ILOCK_EXCL))
+		return EAGAIN;
+
 	isize = xfs_ioend_new_eof(ioend);
 	if (isize) {
 		ip->i_d.di_size = isize;
@@ -191,40 +196,7 @@
 	}
 
 	xfs_iunlock(ip, XFS_ILOCK_EXCL);
-}
-
-/*
- * IO write completion.
- */
-STATIC void
-xfs_end_io(
-	struct work_struct	*work)
-{
-	xfs_ioend_t		*ioend =
-		container_of(work, xfs_ioend_t, io_work);
-	struct xfs_inode	*ip = XFS_I(ioend->io_inode);
-
-	/*
-	 * For unwritten extents we need to issue transactions to convert a
-	 * range to normal written extens after the data I/O has finished.
-	 */
-	if (ioend->io_type == IOMAP_UNWRITTEN &&
-	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
-		int error;
-
-		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
-						 ioend->io_size);
-		if (error)
-			ioend->io_error = error;
-	}
-
-	/*
-	 * We might have to update the on-disk file size after extending
-	 * writes.
-	 */
-	if (ioend->io_type != IOMAP_READ)
-		xfs_setfilesize(ioend);
-	xfs_destroy_ioend(ioend);
+	return 0;
 }
 
 /*
@@ -249,6 +221,53 @@
 }
 
 /*
+ * IO write completion.
+ */
+STATIC void
+xfs_end_io(
+	struct work_struct *work)
+{
+	xfs_ioend_t	*ioend = container_of(work, xfs_ioend_t, io_work);
+	struct xfs_inode *ip = XFS_I(ioend->io_inode);
+	int		error;
+
+	/*
+	 * For unwritten extents we need to issue transactions to convert a
+	 * range to normal written extens after the data I/O has finished.
+	 */
+	if (ioend->io_type == IOMAP_UNWRITTEN &&
+	    likely(!ioend->io_error && !XFS_FORCED_SHUTDOWN(ip->i_mount))) {
+
+		error = xfs_iomap_write_unwritten(ip, ioend->io_offset,
+						 ioend->io_size);
+		if (error)
+			ioend->io_error = error;
+	}
+
+	/*
+	 * We might have to update the on-disk file size after extending
+	 * writes.
+	 */
+	if (ioend->io_type != IOMAP_READ) {
+		error = xfs_setfilesize(ioend);
+		ASSERT(!error || error == EAGAIN);
+	}
+
+	/*
+	 * If we didn't complete processing of the ioend, requeue it to the
+	 * tail of the workqueue for another attempt later. Otherwise destroy
+	 * it.
+	 */
+	if (error == EAGAIN) {
+		atomic_inc(&ioend->io_remaining);
+		xfs_finish_ioend(ioend, 0);
+		/* ensure we don't spin on blocked ioends */
+		delay(1);
+	} else
+		xfs_destroy_ioend(ioend);
+}
+
+/*
  * Allocate and initialise an IO completion structure.
  * We need to track unwritten extent write completion here initially.
  * We'll need to extend this for updating the ondisk inode size later
commit	77d7a0c2eeb285c9069e15396703d0cb9690ac50	[log] [tgz]
author	Dave Chinner <david@fromorbit.com>	Wed Feb 17 05:36:29 2010 +0000
committer	Alex Elder <aelder@sgi.com>	Mon Mar 01 16:34:52 2010 -0600
tree	22de501446dd5ba08581b04616408f90449f7211
parent	66d834ea603d61bd90fedad90300ca91c5bba0a3 [diff]