nfsd4: allow encoding across page boundaries

After this we can handle for example getattr of very large ACLs.

Read, readdir, readlink are still special cases with their own limits.

Also we can't handle a new operation starting close to the end of a
page.

Signed-off-by: J. Bruce Fields <bfields@redhat.com>
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 3ce431b..5d8f915 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1264,6 +1264,10 @@
 	xdr->end = head->iov_base + PAGE_SIZE - 2 * RPC_MAX_AUTH_SIZE;
 	/* Tail and page_len should be zero at this point: */
 	buf->len = buf->head[0].iov_len;
+	xdr->scratch.iov_len = 0;
+	xdr->page_ptr = buf->pages;
+	buf->buflen = PAGE_SIZE * (1 + rqstp->rq_page_end - buf->pages)
+		- 2 * RPC_MAX_AUTH_SIZE;
 }
 
 /*
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index bd529e5..d3a576d 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1624,6 +1624,7 @@
 		 * the head and tail in another page:
 		 */
 		return 2 * PAGE_SIZE;
+	case OP_GETATTR:
 	case OP_READ:
 		return INT_MAX;
 	default:
@@ -2560,21 +2561,31 @@
 	goto out;
 }
 
+static void svcxdr_init_encode_from_buffer(struct xdr_stream *xdr,
+				struct xdr_buf *buf, __be32 *p, int bytes)
+{
+	xdr->scratch.iov_len = 0;
+	memset(buf, 0, sizeof(struct xdr_buf));
+	buf->head[0].iov_base = p;
+	buf->head[0].iov_len = 0;
+	buf->len = 0;
+	xdr->buf = buf;
+	xdr->iov = buf->head;
+	xdr->p = p;
+	xdr->end = (void *)p + bytes;
+	buf->buflen = bytes;
+}
+
 __be32 nfsd4_encode_fattr_to_buf(__be32 **p, int words,
 			struct svc_fh *fhp, struct svc_export *exp,
 			struct dentry *dentry, u32 *bmval,
 			struct svc_rqst *rqstp, int ignore_crossmnt)
 {
-	struct xdr_buf dummy = {
-			.head[0] = {
-				.iov_base = *p,
-			},
-			.buflen = words << 2,
-		};
+	struct xdr_buf dummy;
 	struct xdr_stream xdr;
 	__be32 ret;
 
-	xdr_init_encode(&xdr, &dummy, NULL);
+	svcxdr_init_encode_from_buffer(&xdr, &dummy, *p, words << 2);
 	ret = nfsd4_encode_fattr(&xdr, fhp, exp, dentry, bmval, rqstp,
 							ignore_crossmnt);
 	*p = xdr.p;
@@ -3064,8 +3075,6 @@
 
 	if (nfserr)
 		return nfserr;
-	if (resp->xdr.buf->page_len)
-		return nfserr_resource;
 
 	p = xdr_reserve_space(xdr, 8); /* eof flag and byte count */
 	if (!p)
@@ -3075,6 +3084,9 @@
 	if (xdr->end - xdr->p < 1)
 		return nfserr_resource;
 
+	if (resp->xdr.buf->page_len)
+		return nfserr_resource;
+
 	maxcount = svc_max_payload(resp->rqstp);
 	if (maxcount > read->rd_length)
 		maxcount = read->rd_length;
@@ -3119,6 +3131,8 @@
 				- (char *)resp->xdr.buf->head[0].iov_base);
 	resp->xdr.buf->page_len = maxcount;
 	xdr->buf->len += maxcount;
+	xdr->page_ptr += v;
+	xdr->buf->buflen = maxcount + PAGE_SIZE - 2 * RPC_MAX_AUTH_SIZE;
 	xdr->iov = xdr->buf->tail;
 
 	/* Use rest of head for padding and remaining ops: */
@@ -3145,6 +3159,11 @@
 
 	if (nfserr)
 		return nfserr;
+
+	p = xdr_reserve_space(xdr, 4);
+	if (!p)
+		return nfserr_resource;
+
 	if (resp->xdr.buf->page_len)
 		return nfserr_resource;
 	if (!*resp->rqstp->rq_next_page)
@@ -3154,10 +3173,6 @@
 
 	maxcount = PAGE_SIZE;
 
-	p = xdr_reserve_space(xdr, 4);
-	if (!p)
-		return nfserr_resource;
-
 	if (xdr->end - xdr->p < 1)
 		return nfserr_resource;
 
@@ -3180,6 +3195,8 @@
 				- (char *)resp->xdr.buf->head[0].iov_base;
 	resp->xdr.buf->page_len = maxcount;
 	xdr->buf->len += maxcount;
+	xdr->page_ptr += 1;
+	xdr->buf->buflen -= PAGE_SIZE;
 	xdr->iov = xdr->buf->tail;
 
 	/* Use rest of head for padding and remaining ops: */
@@ -3206,15 +3223,16 @@
 
 	if (nfserr)
 		return nfserr;
-	if (resp->xdr.buf->page_len)
-		return nfserr_resource;
-	if (!*resp->rqstp->rq_next_page)
-		return nfserr_resource;
 
 	p = xdr_reserve_space(xdr, NFS4_VERIFIER_SIZE);
 	if (!p)
 		return nfserr_resource;
 
+	if (resp->xdr.buf->page_len)
+		return nfserr_resource;
+	if (!*resp->rqstp->rq_next_page)
+		return nfserr_resource;
+
 	/* XXX: Following NFSv3, we ignore the READDIR verifier for now. */
 	WRITE32(0);
 	WRITE32(0);
@@ -3266,6 +3284,10 @@
 
 	xdr->iov = xdr->buf->tail;
 
+	xdr->page_ptr++;
+	xdr->buf->buflen -= PAGE_SIZE;
+	xdr->iov = xdr->buf->tail;
+
 	/* Use rest of head for padding and remaining ops: */
 	resp->xdr.buf->tail[0].iov_base = tailbase;
 	resp->xdr.buf->tail[0].iov_len = 0;
@@ -3800,6 +3822,8 @@
 	       !nfsd4_enc_ops[op->opnum]);
 	encoder = nfsd4_enc_ops[op->opnum];
 	op->status = encoder(resp, op->status, &op->u);
+	xdr_commit_encode(xdr);
+
 	/* nfsd4_check_resp_size guarantees enough room for error status */
 	if (!op->status) {
 		int space_needed = 0;
@@ -3919,6 +3943,8 @@
 	WARN_ON_ONCE(buf->len != buf->head[0].iov_len + buf->page_len +
 				 buf->tail[0].iov_len);
 
+	rqstp->rq_next_page = resp->xdr.page_ptr + 1;
+
 	p = resp->tagp;
 	*p++ = htonl(resp->taglen);
 	memcpy(p, resp->tag, resp->taglen);
diff --git a/include/linux/sunrpc/svc.h b/include/linux/sunrpc/svc.h
index a0dbbd1..85cb647 100644
--- a/include/linux/sunrpc/svc.h
+++ b/include/linux/sunrpc/svc.h
@@ -244,6 +244,7 @@
 	struct page *		rq_pages[RPCSVC_MAXPAGES];
 	struct page *		*rq_respages;	/* points into rq_pages */
 	struct page *		*rq_next_page; /* next reply page to use */
+	struct page *		*rq_page_end;  /* one past the last page */
 
 	struct kvec		rq_vec[RPCSVC_MAXPAGES]; /* generally useful.. */
 
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index e7bb2e3..b23d69f 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -215,6 +215,7 @@
 
 extern void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p);
 extern __be32 *xdr_reserve_space(struct xdr_stream *xdr, size_t nbytes);
+extern void xdr_commit_encode(struct xdr_stream *xdr);
 extern void xdr_truncate_encode(struct xdr_stream *xdr, size_t len);
 extern void xdr_write_pages(struct xdr_stream *xdr, struct page **pages,
 		unsigned int base, unsigned int len);
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index 29772e0..b4737fb 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -597,6 +597,7 @@
 			}
 			rqstp->rq_pages[i] = p;
 		}
+	rqstp->rq_page_end = &rqstp->rq_pages[i];
 	rqstp->rq_pages[i++] = NULL; /* this might be seen in nfs_read_actor */
 
 	/* Make arg->head point to first page and arg->pages point to rest */
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 352f3b3..2b546e8 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -462,6 +462,7 @@
 	struct kvec *iov = buf->head;
 	int scratch_len = buf->buflen - buf->page_len - buf->tail[0].iov_len;
 
+	xdr_set_scratch_buffer(xdr, NULL, 0);
 	BUG_ON(scratch_len < 0);
 	xdr->buf = buf;
 	xdr->iov = iov;
@@ -482,6 +483,74 @@
 EXPORT_SYMBOL_GPL(xdr_init_encode);
 
 /**
+ * xdr_commit_encode - Ensure all data is written to buffer
+ * @xdr: pointer to xdr_stream
+ *
+ * We handle encoding across page boundaries by giving the caller a
+ * temporary location to write to, then later copying the data into
+ * place; xdr_commit_encode does that copying.
+ *
+ * Normally the caller doesn't need to call this directly, as the
+ * following xdr_reserve_space will do it.  But an explicit call may be
+ * required at the end of encoding, or any other time when the xdr_buf
+ * data might be read.
+ */
+void xdr_commit_encode(struct xdr_stream *xdr)
+{
+	int shift = xdr->scratch.iov_len;
+	void *page;
+
+	if (shift == 0)
+		return;
+	page = page_address(*xdr->page_ptr);
+	memcpy(xdr->scratch.iov_base, page, shift);
+	memmove(page, page + shift, (void *)xdr->p - page);
+	xdr->scratch.iov_len = 0;
+}
+EXPORT_SYMBOL_GPL(xdr_commit_encode);
+
+__be32 *xdr_get_next_encode_buffer(struct xdr_stream *xdr, size_t nbytes)
+{
+	static __be32 *p;
+	int space_left;
+	int frag1bytes, frag2bytes;
+
+	if (nbytes > PAGE_SIZE)
+		return NULL; /* Bigger buffers require special handling */
+	if (xdr->buf->len + nbytes > xdr->buf->buflen)
+		return NULL; /* Sorry, we're totally out of space */
+	frag1bytes = (xdr->end - xdr->p) << 2;
+	frag2bytes = nbytes - frag1bytes;
+	if (xdr->iov)
+		xdr->iov->iov_len += frag1bytes;
+	else {
+		xdr->buf->page_len += frag1bytes;
+		xdr->page_ptr++;
+	}
+	xdr->iov = NULL;
+	/*
+	 * If the last encode didn't end exactly on a page boundary, the
+	 * next one will straddle boundaries.  Encode into the next
+	 * page, then copy it back later in xdr_commit_encode.  We use
+	 * the "scratch" iov to track any temporarily unused fragment of
+	 * space at the end of the previous buffer:
+	 */
+	xdr->scratch.iov_base = xdr->p;
+	xdr->scratch.iov_len = frag1bytes;
+	p = page_address(*xdr->page_ptr);
+	/*
+	 * Note this is where the next encode will start after we've
+	 * shifted this one back:
+	 */
+	xdr->p = (void *)p + frag2bytes;
+	space_left = xdr->buf->buflen - xdr->buf->len;
+	xdr->end = (void *)p + min_t(int, space_left, PAGE_SIZE);
+	xdr->buf->page_len += frag2bytes;
+	xdr->buf->len += nbytes;
+	return p;
+}
+
+/**
  * xdr_reserve_space - Reserve buffer space for sending
  * @xdr: pointer to xdr_stream
  * @nbytes: number of bytes to reserve
@@ -495,14 +564,18 @@
 	__be32 *p = xdr->p;
 	__be32 *q;
 
+	xdr_commit_encode(xdr);
 	/* align nbytes on the next 32-bit boundary */
 	nbytes += 3;
 	nbytes &= ~3;
 	q = p + (nbytes >> 2);
 	if (unlikely(q > xdr->end || q < p))
-		return NULL;
+		return xdr_get_next_encode_buffer(xdr, nbytes);
 	xdr->p = q;
-	xdr->iov->iov_len += nbytes;
+	if (xdr->iov)
+		xdr->iov->iov_len += nbytes;
+	else
+		xdr->buf->page_len += nbytes;
 	xdr->buf->len += nbytes;
 	return p;
 }
@@ -539,6 +612,7 @@
 		WARN_ON_ONCE(1);
 		return;
 	}
+	xdr_commit_encode(xdr);
 
 	fraglen = min_t(int, buf->len - len, tail->iov_len);
 	tail->iov_len -= fraglen;