xfs: Introduce writeback context for writepages

xfs_vm_writepages() calls generic_writepages to writeback a range of
a file, but then xfs_vm_writepage() clusters pages itself as it does
not have any context it can pass between->writepage calls from
__write_cache_pages().

Introduce a writeback context for xfs_vm_writepages() and call
__write_cache_pages directly with our own writepage callback so that
we can pass that context to each writepage invocation. This
encapsulates the current mapping, whether it is valid or not, the
current ioend and it's IO type and the ioend chain being built.

This requires us to move the ioend submission up to the level where
the writepage context is declared. This does mean we do not submit
IO until we packaged the entire writeback range, but with the block
plugging in the writepages call this is the way IO is submitted,
anyway.

It also means that we need to handle discontiguous page ranges.  If
the pages sent down by write_cache_pages to the writepage callback
are discontiguous, we need to detect this and put each discontiguous
page range into individual ioends. This is needed to ensure that the
ioend accurately represents the range of the file that it covers so
that file size updates during IO completion set the size correctly.
Failure to take into account the discontiguous ranges results in
files being too small when writeback patterns are non-sequential.

Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Christoph Hellwig <hch@lst.de>
Signed-off-by: Dave Chinner <david@fromorbit.com>

diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 245c448..e0be341 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -36,6 +36,18 @@
 #include <linux/pagevec.h>
 #include <linux/writeback.h>
 
+/*
+ * structure owned by writepages passed to individual writepage calls
+ */
+struct xfs_writepage_ctx {
+	struct xfs_bmbt_irec    imap;
+	bool			imap_valid;
+	unsigned int		io_type;
+	struct xfs_ioend	*iohead;
+	struct xfs_ioend	*ioend;
+	sector_t		last_block;
+};
+
 void
 xfs_count_page_state(
 	struct page		*page,
@@ -335,7 +347,7 @@
 	return 0;
 }
 
-STATIC int
+STATIC bool
 xfs_imap_valid(
 	struct inode		*inode,
 	struct xfs_bmbt_irec	*imap,
@@ -532,29 +544,27 @@
 	struct inode		*inode,
 	struct buffer_head	*bh,
 	xfs_off_t		offset,
-	unsigned int		type,
-	xfs_ioend_t		**result,
-	int			need_ioend)
+	struct xfs_writepage_ctx *wpc)
 {
-	xfs_ioend_t		*ioend = *result;
+	if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
+	    bh->b_blocknr != wpc->last_block + 1) {
+		struct xfs_ioend	*new;
 
-	if (!ioend || need_ioend || type != ioend->io_type) {
-		xfs_ioend_t	*previous = *result;
-
-		ioend = xfs_alloc_ioend(inode, type);
-		ioend->io_offset = offset;
-		ioend->io_buffer_head = bh;
-		ioend->io_buffer_tail = bh;
-		if (previous)
-			previous->io_list = ioend;
-		*result = ioend;
+		new = xfs_alloc_ioend(inode, wpc->io_type);
+		new->io_offset = offset;
+		new->io_buffer_head = bh;
+		new->io_buffer_tail = bh;
+		if (wpc->ioend)
+			wpc->ioend->io_list = new;
+		wpc->ioend = new;
 	} else {
-		ioend->io_buffer_tail->b_private = bh;
-		ioend->io_buffer_tail = bh;
+		wpc->ioend->io_buffer_tail->b_private = bh;
+		wpc->ioend->io_buffer_tail = bh;
 	}
 
 	bh->b_private = NULL;
-	ioend->io_size += bh->b_size;
+	wpc->ioend->io_size += bh->b_size;
+	wpc->last_block = bh->b_blocknr;
 }
 
 STATIC void
@@ -651,17 +661,15 @@
 	struct inode		*inode,
 	struct page		*page,
 	loff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
+	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc)
 {
 	struct buffer_head	*bh, *head;
 	xfs_off_t		end_offset;
 	unsigned long		p_offset;
-	unsigned int		type;
 	int			len, page_dirty;
 	int			count = 0, done = 0, uptodate = 1;
- 	xfs_off_t		offset = page_offset(page);
+	xfs_off_t		offset = page_offset(page);
 
 	if (page->index != tindex)
 		goto fail;
@@ -671,7 +679,7 @@
 		goto fail_unlock_page;
 	if (page->mapping != inode->i_mapping)
 		goto fail_unlock_page;
-	if (!xfs_check_page_type(page, (*ioendp)->io_type, false))
+	if (!xfs_check_page_type(page, wpc->ioend->io_type, false))
 		goto fail_unlock_page;
 
 	/*
@@ -707,7 +715,7 @@
 	 * writeback.  Hence for more optimal IO patterns, we should always
 	 * avoid partial page writeback due to multiple mappings on a page here.
 	 */
-	if (!xfs_imap_valid(inode, imap, end_offset))
+	if (!xfs_imap_valid(inode, &wpc->imap, end_offset))
 		goto fail_unlock_page;
 
 	len = 1 << inode->i_blkbits;
@@ -739,23 +747,22 @@
 		if (buffer_unwritten(bh) || buffer_delay(bh) ||
 		    buffer_mapped(bh)) {
 			if (buffer_unwritten(bh))
-				type = XFS_IO_UNWRITTEN;
+				wpc->io_type = XFS_IO_UNWRITTEN;
 			else if (buffer_delay(bh))
-				type = XFS_IO_DELALLOC;
+				wpc->io_type = XFS_IO_DELALLOC;
 			else
-				type = XFS_IO_OVERWRITE;
+				wpc->io_type = XFS_IO_OVERWRITE;
 
 			/*
 			 * imap should always be valid because of the above
 			 * partial page end_offset check on the imap.
 			 */
-			ASSERT(xfs_imap_valid(inode, imap, offset));
+			ASSERT(xfs_imap_valid(inode, &wpc->imap, offset));
 
 			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type,
-					 ioendp, done);
+			if (wpc->io_type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, wpc);
 
 			page_dirty--;
 			count++;
@@ -790,8 +797,7 @@
 xfs_cluster_write(
 	struct inode		*inode,
 	pgoff_t			tindex,
-	struct xfs_bmbt_irec	*imap,
-	xfs_ioend_t		**ioendp,
+	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc,
 	pgoff_t			tlast)
 {
@@ -807,7 +813,7 @@
 
 		for (i = 0; i < pagevec_count(&pvec); i++) {
 			done = xfs_convert_page(inode, pvec.pages[i], tindex++,
-					imap, ioendp, wbc);
+						wpc, wbc);
 			if (done)
 				break;
 		}
@@ -895,21 +901,20 @@
 
 static int
 xfs_writepage_submit(
-	struct xfs_ioend	*ioend,
-	struct xfs_ioend	*iohead,
+	struct xfs_writepage_ctx *wpc,
 	struct writeback_control *wbc,
 	int			status)
 {
 	struct blk_plug		plug;
 
 	/* Reserve log space if we might write beyond the on-disk inode size. */
-	if (!status && ioend && ioend->io_type != XFS_IO_UNWRITTEN &&
-	    xfs_ioend_is_append(ioend))
-		status = xfs_setfilesize_trans_alloc(ioend);
+	if (!status && wpc->ioend && wpc->ioend->io_type != XFS_IO_UNWRITTEN &&
+	    xfs_ioend_is_append(wpc->ioend))
+		status = xfs_setfilesize_trans_alloc(wpc->ioend);
 
-	if (iohead) {
+	if (wpc->iohead) {
 		blk_start_plug(&plug);
-		xfs_submit_ioend(wbc, iohead, status);
+		xfs_submit_ioend(wbc, wpc->iohead, status);
 		blk_finish_plug(&plug);
 	}
 	return status;
@@ -924,20 +929,19 @@
  * For any other dirty buffer heads on the page we should flush them.
  */
 STATIC int
-xfs_vm_writepage(
+xfs_do_writepage(
 	struct page		*page,
-	struct writeback_control *wbc)
+	struct writeback_control *wbc,
+	void			*data)
 {
+	struct xfs_writepage_ctx *wpc = data;
 	struct inode		*inode = page->mapping->host;
 	struct buffer_head	*bh, *head;
-	struct xfs_bmbt_irec	imap;
-	xfs_ioend_t		*ioend = NULL, *iohead = NULL;
 	loff_t			offset;
-	unsigned int		type;
 	__uint64_t              end_offset;
 	pgoff_t                 end_index, last_index;
 	ssize_t			len;
-	int			err, imap_valid = 0, uptodate = 1;
+	int			err, uptodate = 1;
 	int			count = 0;
 
 	trace_xfs_writepage(inode, page, 0, 0);
@@ -1036,11 +1040,8 @@
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
-	type = XFS_IO_OVERWRITE;
 
 	do {
-		int new_ioend = 0;
-
 		if (offset >= end_offset)
 			break;
 		if (!buffer_uptodate(bh))
@@ -1053,24 +1054,24 @@
 		 * buffers covering holes here.
 		 */
 		if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
-			imap_valid = 0;
+			wpc->imap_valid = false;
 			continue;
 		}
 
 		if (buffer_unwritten(bh)) {
-			if (type != XFS_IO_UNWRITTEN) {
-				type = XFS_IO_UNWRITTEN;
-				imap_valid = 0;
+			if (wpc->io_type != XFS_IO_UNWRITTEN) {
+				wpc->io_type = XFS_IO_UNWRITTEN;
+				wpc->imap_valid = false;
 			}
 		} else if (buffer_delay(bh)) {
-			if (type != XFS_IO_DELALLOC) {
-				type = XFS_IO_DELALLOC;
-				imap_valid = 0;
+			if (wpc->io_type != XFS_IO_DELALLOC) {
+				wpc->io_type = XFS_IO_DELALLOC;
+				wpc->imap_valid = false;
 			}
 		} else if (buffer_uptodate(bh)) {
-			if (type != XFS_IO_OVERWRITE) {
-				type = XFS_IO_OVERWRITE;
-				imap_valid = 0;
+			if (wpc->io_type != XFS_IO_OVERWRITE) {
+				wpc->io_type = XFS_IO_OVERWRITE;
+				wpc->imap_valid = false;
 			}
 		} else {
 			if (PageUptodate(page))
@@ -1081,38 +1082,29 @@
 			 * subsequent writeable buffers into a new
 			 * ioend.
 			 */
-			imap_valid = 0;
+			wpc->imap_valid = 0;
 			continue;
 		}
 
-		if (imap_valid)
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
-		if (!imap_valid) {
-			/*
-			 * If we didn't have a valid mapping then we need to
-			 * put the new mapping into a separate ioend structure.
-			 * This ensures non-contiguous extents always have
-			 * separate ioends, which is particularly important
-			 * for unwritten extent conversion at I/O completion
-			 * time.
-			 */
-			new_ioend = 1;
-			err = xfs_map_blocks(inode, offset, &imap, type);
+		if (wpc->imap_valid)
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
+		if (!wpc->imap_valid) {
+			err = xfs_map_blocks(inode, offset, &wpc->imap,
+					     wpc->io_type);
 			if (err)
 				goto error;
-			imap_valid = xfs_imap_valid(inode, &imap, offset);
+			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
 		}
-		if (imap_valid) {
+		if (wpc->imap_valid) {
 			lock_buffer(bh);
-			if (type != XFS_IO_OVERWRITE)
-				xfs_map_at_offset(inode, bh, &imap, offset);
-			xfs_add_to_ioend(inode, bh, offset, type, &ioend,
-					 new_ioend);
+			if (wpc->io_type != XFS_IO_OVERWRITE)
+				xfs_map_at_offset(inode, bh, &wpc->imap, offset);
+			xfs_add_to_ioend(inode, bh, offset, wpc);
 			count++;
 		}
 
-		if (!iohead)
-			iohead = ioend;
+		if (!wpc->iohead)
+			wpc->iohead = wpc->ioend;
 
 	} while (offset += len, ((bh = bh->b_this_page) != head));
 
@@ -1122,10 +1114,10 @@
 	xfs_start_page_writeback(page, 1, count);
 
 	/* if there is no IO to be submitted for this page, we are done */
-	if (!ioend)
+	if (!count)
 		return 0;
 
-	ASSERT(iohead);
+	ASSERT(wpc->iohead);
 	ASSERT(err == 0);
 
 	/*
@@ -1133,10 +1125,10 @@
 	 * completion path as we have marked the initial page as under writeback
 	 * and unlocked it.
 	 */
-	if (imap_valid) {
+	if (wpc->imap_valid) {
 		xfs_off_t		end_index;
 
-		end_index = imap.br_startoff + imap.br_blockcount;
+		end_index = wpc->imap.br_startoff + wpc->imap.br_blockcount;
 
 		/* to bytes */
 		end_index <<= inode->i_blkbits;
@@ -1148,32 +1140,30 @@
 		if (end_index > last_index)
 			end_index = last_index;
 
-		xfs_cluster_write(inode, page->index + 1, &imap, &ioend,
-				  wbc, end_index);
+		xfs_cluster_write(inode, page->index + 1, wpc, wbc, end_index);
 	}
-
-	return xfs_writepage_submit(ioend, iohead, wbc, 0);
+	return 0;
 
 error:
 	/*
 	 * On error, we have to fail the iohead here because we buffers locked
 	 * in the ioend chain. If we don't do this, we'll deadlock invalidating
 	 * the page as that tries to lock the buffers on the page. Also, because
-	 * we may have set pages under writeback, we have to run IO completion to
-	 * mark the error state of the IO appropriately, so we can't cancel the
-	 * ioend directly here. That means we have to mark this page as under
-	 * writeback if we included any buffers from it in the ioend chain.
+	 * we may have set pages under writeback, we have to make sure we run
+	 * IO completion to mark the error state of the IO appropriately, so we
+	 * can't cancel the ioend directly here. That means we have to mark this
+	 * page as under writeback if we included any buffers from it in the
+	 * ioend chain so that completion treats it correctly.
+	 *
+	 * If we didn't include the page in the ioend, then we can simply
+	 * discard and unlock it as there are no other users of the page or it's
+	 * buffers right now. The caller will still need to trigger submission
+	 * of outstanding ioends on the writepage context so they are treated
+	 * correctly on error.
 	 */
 	if (count)
 		xfs_start_page_writeback(page, 0, count);
-	xfs_writepage_submit(ioend, iohead, wbc, err);
-
-	/*
-	 * We can only discard the page we had the IO error on if we haven't
-	 * included it in the ioend above. If it has already been errored out,
-	 * the it is unlocked and we can't touch it here.
-	 */
-	if (!count) {
+	else {
 		xfs_aops_discard_page(page);
 		ClearPageUptodate(page);
 		unlock_page(page);
@@ -1188,12 +1178,32 @@
 }
 
 STATIC int
+xfs_vm_writepage(
+	struct page		*page,
+	struct writeback_control *wbc)
+{
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
+	ret = xfs_do_writepage(page, wbc, &wpc);
+	return xfs_writepage_submit(&wpc, wbc, ret);
+}
+
+STATIC int
 xfs_vm_writepages(
 	struct address_space	*mapping,
 	struct writeback_control *wbc)
 {
+	struct xfs_writepage_ctx wpc = {
+		.io_type = XFS_IO_INVALID,
+	};
+	int			ret;
+
 	xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
-	return generic_writepages(mapping, wbc);
+	ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
+	return xfs_writepage_submit(&wpc, wbc, ret);
 }
 
 /*
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index f6ffc9a..3c3f1a3 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -24,12 +24,14 @@
  * Types of I/O for bmap clustering and I/O completion tracking.
  */
 enum {
+	XFS_IO_INVALID,		/* initial state */
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
 };
 
 #define XFS_IO_TYPES \
+	{ XFS_IO_INVALID,		"invalid" }, \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
 	{ XFS_IO_OVERWRITE,		"overwrite" }