nfs42: add NFS_IOC_CLONE ioctl

It can be called by user space to CLONE two files.
Follow btrfs lead and define NFS_IOC_CLONE same as BTRFS_IOC_CLONE.
Thus we don't mess up userspace with too many ioctls.

Signed-off-by: Peng Tao <tao.peng@primarydata.com>
Signed-off-by: Trond Myklebust <trond.myklebust@primarydata.com>
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index b0dbe0a..fc68ba5 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -4,6 +4,7 @@
  *  Copyright (C) 1992  Rick Sladkey
  */
 #include <linux/fs.h>
+#include <linux/file.h>
 #include <linux/falloc.h>
 #include <linux/nfs_fs.h>
 #include "delegation.h"
@@ -192,8 +193,104 @@
 		return nfs42_proc_deallocate(filep, offset, len);
 	return nfs42_proc_allocate(filep, offset, len);
 }
+
+static noinline long
+nfs42_ioctl_clone(struct file *dst_file, unsigned long srcfd,
+		  u64 src_off, u64 dst_off, u64 count)
+{
+	struct inode *dst_inode = file_inode(dst_file);
+	struct fd src_file;
+	struct inode *src_inode;
+	int ret;
+
+	/* dst file must be opened for writing */
+	if (!(dst_file->f_mode & FMODE_WRITE))
+		return -EINVAL;
+
+	ret = mnt_want_write_file(dst_file);
+	if (ret)
+		return ret;
+
+	src_file = fdget(srcfd);
+	if (!src_file.file) {
+		ret = -EBADF;
+		goto out_drop_write;
+	}
+
+	src_inode = file_inode(src_file.file);
+
+	/* src and dst must be different files */
+	ret = -EINVAL;
+	if (src_inode == dst_inode)
+		goto out_fput;
+
+	/* src file must be opened for reading */
+	if (!(src_file.file->f_mode & FMODE_READ))
+		goto out_fput;
+
+	/* src and dst must be regular files */
+	ret = -EISDIR;
+	if (!S_ISREG(src_inode->i_mode) || !S_ISREG(dst_inode->i_mode))
+		goto out_fput;
+
+	ret = -EXDEV;
+	if (src_file.file->f_path.mnt != dst_file->f_path.mnt ||
+	    src_inode->i_sb != dst_inode->i_sb)
+		goto out_fput;
+
+	/* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? */
+	if (dst_inode < src_inode) {
+		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD);
+	} else {
+		mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT);
+		mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD);
+	}
+
+	/* flush all pending writes on both src and dst so that server
+	 * has the latest data */
+	ret = nfs_sync_inode(src_inode);
+	if (ret)
+		goto out_unlock;
+	ret = nfs_sync_inode(dst_inode);
+	if (ret)
+		goto out_unlock;
+
+	ret = nfs42_proc_clone(src_file.file, dst_file, src_off, dst_off, count);
+
+	/* truncate inode page cache of the dst range so that future reads can fetch
+	 * new data from server */
+	if (!ret)
+		truncate_inode_pages_range(&dst_inode->i_data, dst_off, dst_off + count - 1);
+
+out_unlock:
+	if (dst_inode < src_inode) {
+		mutex_unlock(&src_inode->i_mutex);
+		mutex_unlock(&dst_inode->i_mutex);
+	} else {
+		mutex_unlock(&dst_inode->i_mutex);
+		mutex_unlock(&src_inode->i_mutex);
+	}
+out_fput:
+	fdput(src_file);
+out_drop_write:
+	mnt_drop_write_file(dst_file);
+	return ret;
+}
 #endif /* CONFIG_NFS_V4_2 */
 
+long nfs4_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+	switch (cmd) {
+#ifdef CONFIG_NFS_V4_2
+	case NFS_IOC_CLONE:
+		return nfs42_ioctl_clone(file, arg, 0, 0, 0);
+#endif
+	}
+
+	return -ENOTTY;
+}
+
 const struct file_operations nfs4_file_operations = {
 #ifdef CONFIG_NFS_V4_2
 	.llseek		= nfs4_file_llseek,
@@ -216,4 +313,9 @@
 #endif /* CONFIG_NFS_V4_2 */
 	.check_flags	= nfs_check_flags,
 	.setlease	= simple_nosetlease,
+#ifdef CONFIG_COMPAT
+	.unlocked_ioctl = nfs4_ioctl,
+#else
+	.compat_ioctl	= nfs4_ioctl,
+#endif /* CONFIG_COMPAT */
 };
diff --git a/include/uapi/linux/nfs.h b/include/uapi/linux/nfs.h
index 5199a36..d85748d 100644
--- a/include/uapi/linux/nfs.h
+++ b/include/uapi/linux/nfs.h
@@ -31,6 +31,10 @@
 
 #define NFS_PIPE_DIRNAME "nfs"
 
+/* NFS ioctls */
+/* Let's follow btrfs lead on CLONE to avoid messing userspace */
+#define NFS_IOC_CLONE	_IOW(0x94, 9, int)
+
 /*
  * NFS stats. The good thing with these values is that NFSv3 errors are
  * a superset of NFSv2 errors (with the exception of NFSERR_WFLUSH which