xfs: Don't issue buffer IO direct from AIL push V2 All buffers logged into the AIL are marked as delayed write. When the AIL needs to push the buffer out, it issues an async write of the buffer. This means that IO patterns are dependent on the order of buffers in the AIL. Instead of flushing the buffer, promote the buffer in the delayed write list so that the next time the xfsbufd is run the buffer will be flushed by the xfsbufd. Return the state to the xfsaild that the buffer was promoted so that the xfsaild knows that it needs to cause the xfsbufd to run to flush the buffers that were promoted. Using the xfsbufd for issuing the IO allows us to dispatch all buffer IO from the one queue. This means that we can make much more enlightened decisions on what order to flush buffers to disk as we don't have multiple places issuing IO. Optimisations to xfsbufd will be in a future patch. Version 2 - kill XFS_ITEM_FLUSHING as it is now unused. Signed-off-by: Dave Chinner <david@fromorbit.com> Reviewed-by: Christoph Hellwig <hch@lst.de>

commit: d808f617ad00a413585b806de340feda5ad9a2da [log] [tgz]
author: Dave Chinner <david@fromorbit.com> Tue Feb 02 10:13:42 2010 +1100
committer: Dave Chinner <david@fromorbit.com> Tue Feb 02 10:13:42 2010 +1100
tree: ed03d4d019a9d8b566ffd454e112e9fbce70bad8
parent: c854363e80b49dd04a4de18ebc379eb8c8806674 [diff] [blame]
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 207553e..d4dc063 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c

@@ -602,33 +602,20 @@
 
 	if (!xfs_iflock_nowait(ip)) {
 		/*
-		 * If someone else isn't already trying to push the inode
-		 * buffer, we get to do it.
+		 * inode has already been flushed to the backing buffer,
+		 * leave it locked in shared mode, pushbuf routine will
+		 * unlock it.
 		 */
-		if (iip->ili_pushbuf_flag == 0) {
-			iip->ili_pushbuf_flag = 1;
-#ifdef DEBUG
-			iip->ili_push_owner = current_pid();
-#endif
-			/*
-			 * Inode is left locked in shared mode.
-			 * Pushbuf routine gets to unlock it.
-			 */
-			return XFS_ITEM_PUSHBUF;
-		} else {
-			/*
-			 * We hold the AIL lock, so we must specify the
-			 * NONOTIFY flag so that we won't double trip.
-			 */
-			xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
-			return XFS_ITEM_FLUSHING;
-		}
-		/* NOTREACHED */
+		return XFS_ITEM_PUSHBUF;
 	}
 
 	/* Stale items should force out the iclog */
 	if (ip->i_flags & XFS_ISTALE) {
 		xfs_ifunlock(ip);
+		/*
+		 * we hold the AIL lock - notify the unlock routine of this
+		 * so it doesn't try to get the lock again.
+		 */
 		xfs_iunlock(ip, XFS_ILOCK_SHARED|XFS_IUNLOCK_NONOTIFY);
 		return XFS_ITEM_PINNED;
 	}
@@ -746,11 +733,8 @@
  * This gets called by xfs_trans_push_ail(), when IOP_TRYLOCK
  * failed to get the inode flush lock but did get the inode locked SHARED.
  * Here we're trying to see if the inode buffer is incore, and if so whether it's
- * marked delayed write. If that's the case, we'll initiate a bawrite on that
- * buffer to expedite the process.
- *
- * We aren't holding the AIL lock (or the flush lock) when this gets called,
- * so it is inherently race-y.
+ * marked delayed write. If that's the case, we'll promote it and that will
+ * allow the caller to write the buffer by triggering the xfsbufd to run.
  */
 STATIC void
 xfs_inode_item_pushbuf(
@@ -759,26 +743,16 @@
 	xfs_inode_t	*ip;
 	xfs_mount_t	*mp;
 	xfs_buf_t	*bp;
-	uint		dopush;
 
 	ip = iip->ili_inode;
-
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED));
 
 	/*
-	 * The ili_pushbuf_flag keeps others from
-	 * trying to duplicate our effort.
-	 */
-	ASSERT(iip->ili_pushbuf_flag != 0);
-	ASSERT(iip->ili_push_owner == current_pid());
-
-	/*
 	 * If a flush is not in progress anymore, chances are that the
 	 * inode was taken off the AIL. So, just get out.
 	 */
 	if (completion_done(&ip->i_flush) ||
 	    ((iip->ili_item.li_flags & XFS_LI_IN_AIL) == 0)) {
-		iip->ili_pushbuf_flag = 0;
 		xfs_iunlock(ip, XFS_ILOCK_SHARED);
 		return;
 	}
@@ -787,53 +761,12 @@
 	bp = xfs_incore(mp->m_ddev_targp, iip->ili_format.ilf_blkno,
 		    iip->ili_format.ilf_len, XBF_TRYLOCK);
 
-	if (bp != NULL) {
-		if (XFS_BUF_ISDELAYWRITE(bp)) {
-			/*
-			 * We were racing with iflush because we don't hold
-			 * the AIL lock or the flush lock. However, at this point,
-			 * we have the buffer, and we know that it's dirty.
-			 * So, it's possible that iflush raced with us, and
-			 * this item is already taken off the AIL.
-			 * If not, we can flush it async.
-			 */
-			dopush = ((iip->ili_item.li_flags & XFS_LI_IN_AIL) &&
-				  !completion_done(&ip->i_flush));
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-
-			trace_xfs_inode_item_push(bp, _RET_IP_);
-
-			if (XFS_BUF_ISPINNED(bp))
-				xfs_log_force(mp, 0);
-
-			if (dopush) {
-				int	error;
-				error = xfs_bawrite(mp, bp);
-				if (error)
-					xfs_fs_cmn_err(CE_WARN, mp,
-		"xfs_inode_item_pushbuf: pushbuf error %d on iip %p, bp %p",
-							error, iip, bp);
-			} else {
-				xfs_buf_relse(bp);
-			}
-		} else {
-			iip->ili_pushbuf_flag = 0;
-			xfs_iunlock(ip, XFS_ILOCK_SHARED);
-			xfs_buf_relse(bp);
-		}
-		return;
-	}
-	/*
-	 * We have to be careful about resetting pushbuf flag too early (above).
-	 * Even though in theory we can do it as soon as we have the buflock,
-	 * we don't want others to be doing work needlessly. They'll come to
-	 * this function thinking that pushing the buffer is their
-	 * responsibility only to find that the buffer is still locked by
-	 * another doing the same thing
-	 */
-	iip->ili_pushbuf_flag = 0;
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+	if (!bp)
+		return;
+	if (XFS_BUF_ISDELAYWRITE(bp))
+		xfs_buf_delwri_promote(bp);
+	xfs_buf_relse(bp);
 	return;
 }
 
@@ -937,7 +870,6 @@
 	/*
 	   We have zeroed memory. No need ...
 	   iip->ili_extents_buf = NULL;
-	   iip->ili_pushbuf_flag = 0;
 	 */
 
 	iip->ili_format.ilf_type = XFS_LI_INODE;
commit	d808f617ad00a413585b806de340feda5ad9a2da	[log] [tgz]
author	Dave Chinner <david@fromorbit.com>	Tue Feb 02 10:13:42 2010 +1100
committer	Dave Chinner <david@fromorbit.com>	Tue Feb 02 10:13:42 2010 +1100
tree	ed03d4d019a9d8b566ffd454e112e9fbce70bad8
parent	c854363e80b49dd04a4de18ebc379eb8c8806674 [diff] [blame]