->splice_write() via ->write_iter()

iter_file_splice_write() - a ->splice_write() instance that gathers the
pipe buffers, builds a bio_vec-based iov_iter covering those and feeds
it to ->write_iter().  A bunch of simple cases coverted to that...

[AV: fixed the braino spotted by Cyrill]

Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 4e36b8e..e68e150 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -1583,7 +1583,7 @@
 	.compat_ioctl	= compat_blkdev_ioctl,
 #endif
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
 
 int ioctl_by_bdev(struct block_device *bdev, unsigned cmd, unsigned long arg)
diff --git a/fs/exofs/file.c b/fs/exofs/file.c
index 5b7f6be..71bf8e4 100644
--- a/fs/exofs/file.c
+++ b/fs/exofs/file.c
@@ -77,7 +77,7 @@
 	.fsync		= exofs_file_fsync,
 	.flush		= exofs_flush,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
 
 const struct inode_operations exofs_file_inode_operations = {
diff --git a/fs/ext2/file.c b/fs/ext2/file.c
index 970c6ac..7c87b22 100644
--- a/fs/ext2/file.c
+++ b/fs/ext2/file.c
@@ -75,7 +75,7 @@
 	.release	= ext2_release_file,
 	.fsync		= ext2_fsync,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
 
 #ifdef CONFIG_EXT2_FS_XIP
diff --git a/fs/ext3/file.c b/fs/ext3/file.c
index c833b12..a062fa1 100644
--- a/fs/ext3/file.c
+++ b/fs/ext3/file.c
@@ -63,7 +63,7 @@
 	.release	= ext3_release_file,
 	.fsync		= ext3_sync_file,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
 
 const struct inode_operations ext3_file_inode_operations = {
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 48383a5..708aad7 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -599,7 +599,7 @@
 	.release	= ext4_release_file,
 	.fsync		= ext4_sync_file,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.fallocate	= ext4_fallocate,
 };
 
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 22f4900..e4ba4b9 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -692,5 +692,5 @@
 	.compat_ioctl	= f2fs_compat_ioctl,
 #endif
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 };
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index ca932cd..01b4c5b 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -1068,7 +1068,7 @@
 	.lock		= gfs2_lock,
 	.flock		= gfs2_flock,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.setlease	= gfs2_setlease,
 	.fallocate	= gfs2_fallocate,
 };
@@ -1098,7 +1098,7 @@
 	.release	= gfs2_release,
 	.fsync		= gfs2_fsync,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.setlease	= generic_setlease,
 	.fallocate	= gfs2_fallocate,
 };
diff --git a/fs/jfs/file.c b/fs/jfs/file.c
index cc744ec..33aa0cc 100644
--- a/fs/jfs/file.c
+++ b/fs/jfs/file.c
@@ -157,7 +157,7 @@
 	.write_iter	= generic_file_write_iter,
 	.mmap		= generic_file_mmap,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.fsync		= jfs_fsync,
 	.release	= jfs_release,
 	.unlocked_ioctl = jfs_ioctl,
diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c
index 6ea0b97..4f56de8 100644
--- a/fs/ramfs/file-mmu.c
+++ b/fs/ramfs/file-mmu.c
@@ -38,7 +38,7 @@
 	.mmap		= generic_file_mmap,
 	.fsync		= noop_fsync,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.llseek		= generic_file_llseek,
 };
 
diff --git a/fs/ramfs/file-nommu.c b/fs/ramfs/file-nommu.c
index 9ed420f..dda012a 100644
--- a/fs/ramfs/file-nommu.c
+++ b/fs/ramfs/file-nommu.c
@@ -43,7 +43,7 @@
 	.write_iter		= generic_file_write_iter,
 	.fsync			= noop_fsync,
 	.splice_read		= generic_file_splice_read,
-	.splice_write		= generic_file_splice_write,
+	.splice_write		= iter_file_splice_write,
 	.llseek			= generic_file_llseek,
 };
 
diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c
index 7c8ecd6..f070cc8 100644
--- a/fs/reiserfs/file.c
+++ b/fs/reiserfs/file.c
@@ -248,7 +248,7 @@
 	.read_iter = generic_file_read_iter,
 	.write_iter = generic_file_write_iter,
 	.splice_read = generic_file_splice_read,
-	.splice_write = generic_file_splice_write,
+	.splice_write = iter_file_splice_write,
 	.llseek = generic_file_llseek,
 };
 
diff --git a/fs/splice.c b/fs/splice.c
index f99e420..f195a9b 100644
--- a/fs/splice.c
+++ b/fs/splice.c
@@ -32,6 +32,7 @@
 #include <linux/gfp.h>
 #include <linux/socket.h>
 #include <linux/compat.h>
+#include <linux/aio.h>
 #include "internal.h"
 
 /*
@@ -1052,6 +1053,145 @@
 
 EXPORT_SYMBOL(generic_file_splice_write);
 
+/**
+ * iter_file_splice_write - splice data from a pipe to a file
+ * @pipe:	pipe info
+ * @out:	file to write to
+ * @ppos:	position in @out
+ * @len:	number of bytes to splice
+ * @flags:	splice modifier flags
+ *
+ * Description:
+ *    Will either move or copy pages (determined by @flags options) from
+ *    the given pipe inode to the given file.
+ *    This one is ->write_iter-based.
+ *
+ */
+ssize_t
+iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out,
+			  loff_t *ppos, size_t len, unsigned int flags)
+{
+	struct splice_desc sd = {
+		.total_len = len,
+		.flags = flags,
+		.pos = *ppos,
+		.u.file = out,
+	};
+	int nbufs = pipe->buffers;
+	struct bio_vec *array = kcalloc(nbufs, sizeof(struct bio_vec),
+					GFP_KERNEL);
+	ssize_t ret;
+
+	if (unlikely(!array))
+		return -ENOMEM;
+
+	pipe_lock(pipe);
+
+	splice_from_pipe_begin(&sd);
+	while (sd.total_len) {
+		struct iov_iter from;
+		struct kiocb kiocb;
+		size_t left;
+		int n, idx;
+
+		ret = splice_from_pipe_next(pipe, &sd);
+		if (ret <= 0)
+			break;
+
+		if (unlikely(nbufs < pipe->buffers)) {
+			kfree(array);
+			nbufs = pipe->buffers;
+			array = kcalloc(nbufs, sizeof(struct bio_vec),
+					GFP_KERNEL);
+			if (!array) {
+				ret = -ENOMEM;
+				break;
+			}
+		}
+
+		/* build the vector */
+		left = sd.total_len;
+		for (n = 0, idx = pipe->curbuf; left && n < pipe->nrbufs; n++, idx++) {
+			struct pipe_buffer *buf = pipe->bufs + idx;
+			size_t this_len = buf->len;
+
+			if (this_len > left)
+				this_len = left;
+
+			if (idx == pipe->buffers - 1)
+				idx = -1;
+
+			ret = buf->ops->confirm(pipe, buf);
+			if (unlikely(ret)) {
+				if (ret == -ENODATA)
+					ret = 0;
+				goto done;
+			}
+
+			array[n].bv_page = buf->page;
+			array[n].bv_len = this_len;
+			array[n].bv_offset = buf->offset;
+			left -= this_len;
+		}
+
+		/* ... iov_iter */
+		from.type = ITER_BVEC | WRITE;
+		from.bvec = array;
+		from.nr_segs = n;
+		from.count = sd.total_len - left;
+		from.iov_offset = 0;
+
+		/* ... and iocb */
+		init_sync_kiocb(&kiocb, out);
+		kiocb.ki_pos = sd.pos;
+		kiocb.ki_nbytes = sd.total_len - left;
+
+		/* now, send it */
+		ret = out->f_op->write_iter(&kiocb, &from);
+		if (-EIOCBQUEUED == ret)
+			ret = wait_on_sync_kiocb(&kiocb);
+
+		if (ret <= 0)
+			break;
+
+		sd.num_spliced += ret;
+		sd.total_len -= ret;
+		*ppos = sd.pos = kiocb.ki_pos;
+
+		/* dismiss the fully eaten buffers, adjust the partial one */
+		while (ret) {
+			struct pipe_buffer *buf = pipe->bufs + pipe->curbuf;
+			if (ret >= buf->len) {
+				const struct pipe_buf_operations *ops = buf->ops;
+				ret -= buf->len;
+				buf->len = 0;
+				buf->ops = NULL;
+				ops->release(pipe, buf);
+				pipe->curbuf = (pipe->curbuf + 1) & (pipe->buffers - 1);
+				pipe->nrbufs--;
+				if (pipe->files)
+					sd.need_wakeup = true;
+			} else {
+				buf->offset += ret;
+				buf->len -= ret;
+				ret = 0;
+			}
+		}
+	}
+done:
+	kfree(array);
+	splice_from_pipe_end(pipe, &sd);
+
+	pipe_unlock(pipe);
+
+	if (sd.num_spliced)
+		ret = sd.num_spliced;
+
+	return ret;
+}
+
+EXPORT_SYMBOL(iter_file_splice_write);
+
 static int write_pipe_buf(struct pipe_inode_info *pipe, struct pipe_buffer *buf,
 			  struct splice_desc *sd)
 {
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index 6bc4e8e..0888502 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -1585,7 +1585,7 @@
 	.fsync          = ubifs_fsync,
 	.unlocked_ioctl = ubifs_ioctl,
 	.splice_read	= generic_file_splice_read,
-	.splice_write	= generic_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl   = ubifs_compat_ioctl,
 #endif
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5446e86..b1c489c 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -343,47 +343,6 @@
 }
 
 /*
- * xfs_file_splice_write() does not use xfs_rw_ilock() because
- * generic_file_splice_write() takes the i_mutex itself. This, in theory,
- * couuld cause lock inversions between the aio_write path and the splice path
- * if someone is doing concurrent splice(2) based writes and write(2) based
- * writes to the same inode. The only real way to fix this is to re-implement
- * the generic code here with correct locking orders.
- */
-STATIC ssize_t
-xfs_file_splice_write(
-	struct pipe_inode_info	*pipe,
-	struct file		*outfilp,
-	loff_t			*ppos,
-	size_t			count,
-	unsigned int		flags)
-{
-	struct inode		*inode = outfilp->f_mapping->host;
-	struct xfs_inode	*ip = XFS_I(inode);
-	int			ioflags = 0;
-	ssize_t			ret;
-
-	XFS_STATS_INC(xs_write_calls);
-
-	if (outfilp->f_mode & FMODE_NOCMTIME)
-		ioflags |= IO_INVIS;
-
-	if (XFS_FORCED_SHUTDOWN(ip->i_mount))
-		return -EIO;
-
-	xfs_ilock(ip, XFS_IOLOCK_EXCL);
-
-	trace_xfs_file_splice_write(ip, count, *ppos, ioflags);
-
-	ret = generic_file_splice_write(pipe, outfilp, ppos, count, flags);
-	if (ret > 0)
-		XFS_STATS_ADD(xs_write_bytes, ret);
-
-	xfs_iunlock(ip, XFS_IOLOCK_EXCL);
-	return ret;
-}
-
-/*
  * This routine is called to handle zeroing any space in the last block of the
  * file that is beyond the EOF.  We do this since the size is being increased
  * without writing anything to that block and we don't want to read the
@@ -1442,7 +1401,7 @@
 	.read_iter	= xfs_file_read_iter,
 	.write_iter	= xfs_file_write_iter,
 	.splice_read	= xfs_file_splice_read,
-	.splice_write	= xfs_file_splice_write,
+	.splice_write	= iter_file_splice_write,
 	.unlocked_ioctl	= xfs_file_ioctl,
 #ifdef CONFIG_COMPAT
 	.compat_ioctl	= xfs_file_compat_ioctl,
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 65d8c79..53182f9 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -1060,7 +1060,6 @@
 DEFINE_RW_EVENT(xfs_file_buffered_write);
 DEFINE_RW_EVENT(xfs_file_direct_write);
 DEFINE_RW_EVENT(xfs_file_splice_read);
-DEFINE_RW_EVENT(xfs_file_splice_write);
 
 DECLARE_EVENT_CLASS(xfs_page_class,
 	TP_PROTO(struct inode *inode, struct page *page, unsigned long off,
diff --git a/include/linux/fs.h b/include/linux/fs.h
index a644884..8bd8ed3 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2434,6 +2434,8 @@
 		struct pipe_inode_info *, size_t, unsigned int);
 extern ssize_t generic_file_splice_write(struct pipe_inode_info *,
 		struct file *, loff_t *, size_t, unsigned int);
+extern ssize_t iter_file_splice_write(struct pipe_inode_info *,
+		struct file *, loff_t *, size_t, unsigned int);
 extern ssize_t generic_splice_sendpage(struct pipe_inode_info *pipe,
 		struct file *out, loff_t *, size_t len, unsigned int flags);