NFS: support large reads and writes on the wire
Most NFS server implementations allow up to 64KB reads and writes on the
wire. The Solaris NFS server allows up to a megabyte, for instance.
Now the Linux NFS client supports transfer sizes up to 1MB, too. This will
help reduce protocol and context switch overhead on read/write intensive NFS
workloads, and support larger atomic read and write operations on servers
that support them.
Test-plan:
Connectathon and iozone on mount point with wsize=rsize>32768 over TCP.
Tests with NFS over UDP to verify the maximum RPC payload size cap.
Signed-off-by: Chuck Lever <cel@netapp.com>
Signed-off-by: Trond Myklebust <Trond.Myklebust@netapp.com>
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 4dff705..d38010b 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -38,9 +38,6 @@
# define NFS_DEBUG
#endif
-#define NFS_MAX_FILE_IO_BUFFER_SIZE 32768
-#define NFS_DEF_FILE_IO_BUFFER_SIZE 4096
-
/* Default timeout values */
#define NFS_MAX_UDP_TIMEOUT (60*HZ)
#define NFS_MAX_TCP_TIMEOUT (600*HZ)
@@ -462,18 +459,33 @@
*/
extern mempool_t *nfs_wdata_mempool;
-static inline struct nfs_write_data *nfs_writedata_alloc(void)
+static inline struct nfs_write_data *nfs_writedata_alloc(unsigned int pagecount)
{
struct nfs_write_data *p = mempool_alloc(nfs_wdata_mempool, SLAB_NOFS);
+
if (p) {
memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
+ if (pagecount < NFS_PAGEVEC_SIZE)
+ p->pagevec = &p->page_array[0];
+ else {
+ size_t size = ++pagecount * sizeof(struct page *);
+ p->pagevec = kmalloc(size, GFP_NOFS);
+ if (p->pagevec) {
+ memset(p->pagevec, 0, size);
+ } else {
+ mempool_free(p, nfs_wdata_mempool);
+ p = NULL;
+ }
+ }
}
return p;
}
static inline void nfs_writedata_free(struct nfs_write_data *p)
{
+ if (p && (p->pagevec != &p->page_array[0]))
+ kfree(p->pagevec);
mempool_free(p, nfs_wdata_mempool);
}
@@ -492,16 +504,33 @@
*/
extern mempool_t *nfs_rdata_mempool;
-static inline struct nfs_read_data *nfs_readdata_alloc(void)
+static inline struct nfs_read_data *nfs_readdata_alloc(unsigned int pagecount)
{
struct nfs_read_data *p = mempool_alloc(nfs_rdata_mempool, SLAB_NOFS);
- if (p)
+
+ if (p) {
memset(p, 0, sizeof(*p));
+ INIT_LIST_HEAD(&p->pages);
+ if (pagecount < NFS_PAGEVEC_SIZE)
+ p->pagevec = &p->page_array[0];
+ else {
+ size_t size = ++pagecount * sizeof(struct page *);
+ p->pagevec = kmalloc(size, GFP_NOFS);
+ if (p->pagevec) {
+ memset(p->pagevec, 0, size);
+ } else {
+ mempool_free(p, nfs_rdata_mempool);
+ p = NULL;
+ }
+ }
+ }
return p;
}
static inline void nfs_readdata_free(struct nfs_read_data *p)
{
+ if (p && (p->pagevec != &p->page_array[0]))
+ kfree(p->pagevec);
mempool_free(p, nfs_rdata_mempool);
}
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index b8b0eed..9f422fd 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -4,6 +4,16 @@
#include <linux/sunrpc/xprt.h>
#include <linux/nfsacl.h>
+/*
+ * To change the maximum rsize and wsize supported by the NFS client, adjust
+ * NFS_MAX_FILE_IO_SIZE. 64KB is a typical maximum, but some servers can
+ * support a megabyte or more. The default is left at 4096 bytes, which is
+ * reasonable for NFS over UDP.
+ */
+#define NFS_MAX_FILE_IO_SIZE (1048576U)
+#define NFS_DEF_FILE_IO_SIZE (4096U)
+#define NFS_MIN_FILE_IO_SIZE (1024U)
+
struct nfs4_fsid {
__u64 major;
__u64 minor;
@@ -215,12 +225,6 @@
/*
* Arguments to the read call.
*/
-
-#define NFS_READ_MAXIOV (9U)
-#if (NFS_READ_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_READ_MAXIOV is too large"
-#endif
-
struct nfs_readargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
@@ -239,11 +243,6 @@
/*
* Arguments to the write call.
*/
-#define NFS_WRITE_MAXIOV (9U)
-#if (NFS_WRITE_MAXIOV > (MAX_IOVEC -2))
-#error "NFS_WRITE_MAXIOV is too large"
-#endif
-
struct nfs_writeargs {
struct nfs_fh * fh;
struct nfs_open_context *context;
@@ -674,6 +673,8 @@
struct nfs_page;
+#define NFS_PAGEVEC_SIZE (8U)
+
struct nfs_read_data {
int flags;
struct rpc_task task;
@@ -682,13 +683,14 @@
struct nfs_fattr fattr; /* fattr storage */
struct list_head pages; /* Coalesced read requests */
struct nfs_page *req; /* multi ops per nfs_page */
- struct page *pagevec[NFS_READ_MAXIOV];
+ struct page **pagevec;
struct nfs_readargs args;
struct nfs_readres res;
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_read_data *, int);
+ struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};
struct nfs_write_data {
@@ -700,13 +702,14 @@
struct nfs_writeverf verf;
struct list_head pages; /* Coalesced requests we wish to flush */
struct nfs_page *req; /* multi ops per nfs_page */
- struct page *pagevec[NFS_WRITE_MAXIOV];
+ struct page **pagevec;
struct nfs_writeargs args; /* argument struct */
struct nfs_writeres res; /* result struct */
#ifdef CONFIG_NFS_V4
unsigned long timestamp; /* For lease renewal */
#endif
void (*complete) (struct nfs_write_data *, int);
+ struct page *page_array[NFS_PAGEVEC_SIZE + 1];
};
struct nfs_access_entry;
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 5da9687..5676794 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -135,11 +135,6 @@
}
/*
- * Maximum number of iov's we use.
- */
-#define MAX_IOVEC (12)
-
-/*
* XDR buffer helper functions
*/
extern void xdr_shift_buf(struct xdr_buf *, size_t);