Merge branch 'layoutfixes'
* layoutfixes:
NFSv4.1/pnfs: Remove redundant wakeup in pnfs_send_layoutreturn()
NFSv4.1/pnfs: Remove redundant check in pnfs_layoutgets_blocked()
NFSv4.1/pnfs: Remove redundant lo->plh_block_lgets in layoutreturn
NFSv4.1/pnfs: Don't prevent layoutgets when doing return-on-close
NFSv4.1/pnfs: Fix serialisation of layout return and layoutget
NFSv4.1/pnfs: Remove redundant checks in pnfs_layoutgets_blocked()
pNFS: Tighten up locking around DS commit buckets
diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c
index bac3fb4..30eb245 100644
--- a/drivers/infiniband/core/verbs.c
+++ b/drivers/infiniband/core/verbs.c
@@ -1144,73 +1144,6 @@
}
EXPORT_SYMBOL(ib_get_dma_mr);
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_mr *mr;
- int err;
-
- err = ib_check_mr_access(mr_access_flags);
- if (err)
- return ERR_PTR(err);
-
- if (!pd->device->reg_phys_mr)
- return ERR_PTR(-ENOSYS);
-
- mr = pd->device->reg_phys_mr(pd, phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!IS_ERR(mr)) {
- mr->device = pd->device;
- mr->pd = pd;
- mr->uobject = NULL;
- atomic_inc(&pd->usecnt);
- atomic_set(&mr->usecnt, 0);
- }
-
- return mr;
-}
-EXPORT_SYMBOL(ib_reg_phys_mr);
-
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start)
-{
- struct ib_pd *old_pd;
- int ret;
-
- ret = ib_check_mr_access(mr_access_flags);
- if (ret)
- return ret;
-
- if (!mr->device->rereg_phys_mr)
- return -ENOSYS;
-
- if (atomic_read(&mr->usecnt))
- return -EBUSY;
-
- old_pd = mr->pd;
-
- ret = mr->device->rereg_phys_mr(mr, mr_rereg_mask, pd,
- phys_buf_array, num_phys_buf,
- mr_access_flags, iova_start);
-
- if (!ret && (mr_rereg_mask & IB_MR_REREG_PD)) {
- atomic_dec(&old_pd->usecnt);
- atomic_inc(&pd->usecnt);
- }
-
- return ret;
-}
-EXPORT_SYMBOL(ib_rereg_phys_mr);
-
int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
{
return mr->device->query_mr ?
diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h
index 92dca9e..c556640 100644
--- a/fs/nfs/blocklayout/blocklayout.h
+++ b/fs/nfs/blocklayout/blocklayout.h
@@ -46,13 +46,6 @@
struct pnfs_block_dev;
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
-
#define PNFS_BLOCK_MAX_UUIDS 4
#define PNFS_BLOCK_MAX_DEVICES 64
@@ -117,13 +110,6 @@
struct pnfs_block_dev_map *map);
};
-enum exstate4 {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */
- PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */
-};
-
/* sector_t fields are all in 512-byte sectors */
struct pnfs_block_extent {
union {
@@ -134,15 +120,12 @@
sector_t be_f_offset; /* the starting offset in the file */
sector_t be_length; /* the size of the extent */
sector_t be_v_offset; /* the starting offset in the volume */
- enum exstate4 be_state; /* the state of this extent */
+ enum pnfs_block_extent_state be_state; /* the state of this extent */
#define EXTENT_WRITTEN 1
#define EXTENT_COMMITTING 2
unsigned int be_tag;
};
-/* on the wire size of the extent */
-#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
-
struct pnfs_block_layout {
struct pnfs_layout_hdr bl_layout;
struct rb_root bl_ext_rw;
diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c
index e535599..a861bbd 100644
--- a/fs/nfs/blocklayout/dev.c
+++ b/fs/nfs/blocklayout/dev.c
@@ -22,7 +22,7 @@
kfree(dev->children);
} else {
if (dev->bdev)
- blkdev_put(dev->bdev, FMODE_READ);
+ blkdev_put(dev->bdev, FMODE_READ | FMODE_WRITE);
}
}
@@ -65,6 +65,11 @@
return -EIO;
p = xdr_decode_hyper(p, &b->simple.sigs[i].offset);
b->simple.sigs[i].sig_len = be32_to_cpup(p++);
+ if (b->simple.sigs[i].sig_len > PNFS_BLOCK_UUID_LEN) {
+ pr_info("signature too long: %d\n",
+ b->simple.sigs[i].sig_len);
+ return -EIO;
+ }
p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len);
if (!p)
@@ -195,7 +200,7 @@
if (!dev)
return -EIO;
- d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL);
+ d->bdev = blkdev_get_by_dev(dev, FMODE_READ | FMODE_WRITE, NULL);
if (IS_ERR(d->bdev)) {
printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n",
MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev));
diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c
index 31d0b5e..c59a59c 100644
--- a/fs/nfs/blocklayout/extent_tree.c
+++ b/fs/nfs/blocklayout/extent_tree.c
@@ -462,6 +462,12 @@
return err;
}
+static size_t ext_tree_layoutupdate_size(size_t count)
+{
+ return sizeof(__be32) /* number of entries */ +
+ PNFS_BLOCK_EXTENT_SIZE * count;
+}
+
static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
size_t buffer_size)
{
@@ -489,7 +495,7 @@
continue;
(*count)++;
- if (*count * BL_EXTENT_SIZE > buffer_size) {
+ if (ext_tree_layoutupdate_size(*count) > buffer_size) {
/* keep counting.. */
ret = -ENOSPC;
continue;
@@ -530,7 +536,7 @@
if (unlikely(ret)) {
ext_tree_free_commitdata(arg, buffer_size);
- buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ buffer_size = ext_tree_layoutupdate_size(count);
count = 0;
arg->layoutupdate_pages =
@@ -549,17 +555,14 @@
}
*start_p = cpu_to_be32(count);
- arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count;
+ arg->layoutupdate_len = ext_tree_layoutupdate_size(count);
if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
- __be32 *p = start_p;
+ void *p = start_p, *end = p + arg->layoutupdate_len;
int i = 0;
- for (p = start_p;
- p < start_p + arg->layoutupdate_len;
- p += PAGE_SIZE) {
+ for ( ; p < end; p += PAGE_SIZE)
arg->layoutupdate_pages[i++] = vmalloc_to_page(p);
- }
}
dprintk("%s found %zu ranges\n", __func__, count);
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index 29e3c1b..624bef7 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -554,7 +554,7 @@
status = htonl(NFS4_OK);
nfs41_set_target_slotid(fc_tbl, args->crsa_target_highest_slotid);
- nfs41_server_notify_target_slotid_update(cps->clp);
+ nfs41_notify_server(cps->clp);
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 4a90c9b..57c5a02 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -20,6 +20,7 @@
#include <linux/stat.h>
#include <linux/errno.h>
#include <linux/unistd.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/metrics.h>
@@ -285,116 +286,6 @@
}
EXPORT_SYMBOL_GPL(nfs_put_client);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-/*
- * Test if two ip6 socket addresses refer to the same socket by
- * comparing relevant fields. The padding bytes specifically, are not
- * compared. sin6_flowinfo is not compared because it only affects QoS
- * and sin6_scope_id is only compared if the address is "link local"
- * because "link local" addresses need only be unique to a specific
- * link. Conversely, ordinary unicast addresses might have different
- * sin6_scope_id.
- *
- * The caller should ensure both socket addresses are AF_INET6.
- */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- if (!ipv6_addr_equal(&sin1->sin6_addr, &sin2->sin6_addr))
- return 0;
- else if (ipv6_addr_type(&sin1->sin6_addr) & IPV6_ADDR_LINKLOCAL)
- return sin1->sin6_scope_id == sin2->sin6_scope_id;
-
- return 1;
-}
-#else /* !defined(CONFIG_IPV6) && !defined(CONFIG_IPV6_MODULE) */
-static int nfs_sockaddr_match_ipaddr6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- return 0;
-}
-#endif
-
-/*
- * Test if two ip4 socket addresses refer to the same socket, by
- * comparing relevant fields. The padding bytes specifically, are
- * not compared.
- *
- * The caller should ensure both socket addresses are AF_INET.
- */
-static int nfs_sockaddr_match_ipaddr4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return sin1->sin_addr.s_addr == sin2->sin_addr.s_addr;
-}
-
-static int nfs_sockaddr_cmp_ip6(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sa1;
- const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sa2;
-
- return nfs_sockaddr_match_ipaddr6(sa1, sa2) &&
- (sin1->sin6_port == sin2->sin6_port);
-}
-
-static int nfs_sockaddr_cmp_ip4(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sa1;
- const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sa2;
-
- return nfs_sockaddr_match_ipaddr4(sa1, sa2) &&
- (sin1->sin_port == sin2->sin_port);
-}
-
-#if defined(CONFIG_NFS_V4_1)
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, excluding the port number.
- */
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_match_ipaddr4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_match_ipaddr6(sa1, sa2);
- }
- return 0;
-}
-EXPORT_SYMBOL_GPL(nfs_sockaddr_match_ipaddr);
-#endif /* CONFIG_NFS_V4_1 */
-
-/*
- * Test if two socket addresses represent the same actual socket,
- * by comparing (only) relevant fields, including the port number.
- */
-static int nfs_sockaddr_cmp(const struct sockaddr *sa1,
- const struct sockaddr *sa2)
-{
- if (sa1->sa_family != sa2->sa_family)
- return 0;
-
- switch (sa1->sa_family) {
- case AF_INET:
- return nfs_sockaddr_cmp_ip4(sa1, sa2);
- case AF_INET6:
- return nfs_sockaddr_cmp_ip6(sa1, sa2);
- }
- return 0;
-}
-
/*
* Find an nfs_client on the list that matches the initialisation data
* that is supplied.
@@ -421,7 +312,7 @@
if (clp->cl_minorversion != data->minorversion)
continue;
/* Match the full socket address */
- if (!nfs_sockaddr_cmp(sap, clap))
+ if (!rpc_cmp_addr_port(sap, clap))
continue;
atomic_inc(&clp->cl_count);
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 547308a..3d8e4ff 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -583,26 +583,19 @@
}
static
-void nfs_readdir_free_pagearray(struct page **pages, unsigned int npages)
+void nfs_readdir_free_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
for (i = 0; i < npages; i++)
put_page(pages[i]);
}
-static
-void nfs_readdir_free_large_page(void *ptr, struct page **pages,
- unsigned int npages)
-{
- nfs_readdir_free_pagearray(pages, npages);
-}
-
/*
* nfs_readdir_large_page will allocate pages that must be freed with a call
- * to nfs_readdir_free_large_page
+ * to nfs_readdir_free_pagearray
*/
static
-int nfs_readdir_large_page(struct page **pages, unsigned int npages)
+int nfs_readdir_alloc_pages(struct page **pages, unsigned int npages)
{
unsigned int i;
@@ -615,7 +608,7 @@
return 0;
out_freepages:
- nfs_readdir_free_pagearray(pages, i);
+ nfs_readdir_free_pages(pages, i);
return -ENOMEM;
}
@@ -623,7 +616,6 @@
int nfs_readdir_xdr_to_array(nfs_readdir_descriptor_t *desc, struct page *page, struct inode *inode)
{
struct page *pages[NFS_MAX_READDIR_PAGES];
- void *pages_ptr = NULL;
struct nfs_entry entry;
struct file *file = desc->file;
struct nfs_cache_array *array;
@@ -653,7 +645,7 @@
memset(array, 0, sizeof(struct nfs_cache_array));
array->eof_index = -1;
- status = nfs_readdir_large_page(pages, array_size);
+ status = nfs_readdir_alloc_pages(pages, array_size);
if (status < 0)
goto out_release_array;
do {
@@ -671,7 +663,7 @@
}
} while (array->eof_index < 0);
- nfs_readdir_free_large_page(pages_ptr, pages, array_size);
+ nfs_readdir_free_pages(pages, array_size);
out_release_array:
nfs_readdir_release_array(page);
out_label_free:
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index cc4fa1e..7538a85 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -82,7 +82,8 @@
dprintk("NFS: release(%pD2)\n", filp);
nfs_inc_stats(inode, NFSIOS_VFSRELEASE);
- return nfs_release(inode, filp);
+ nfs_file_clear_open_context(filp);
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_file_release);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 2a93bec..13fe64b4 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -730,8 +730,6 @@
return FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg);
/* no lseg means that pnfs is not in use, so no mirroring here */
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
nfs_pageio_reset_write_mds(pgio);
return 1;
}
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 0adc7d2..382c8a4 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -888,7 +888,7 @@
return ctx;
}
-static void nfs_file_clear_open_context(struct file *filp)
+void nfs_file_clear_open_context(struct file *filp)
{
struct nfs_open_context *ctx = nfs_file_open_context(filp);
@@ -919,12 +919,6 @@
return 0;
}
-int nfs_release(struct inode *inode, struct file *filp)
-{
- nfs_file_clear_open_context(filp);
- return 0;
-}
-
/*
* This function is called whenever some part of NFS notices that
* the cached attributes have to be refreshed.
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 1dad181..9ab3b1c 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -219,10 +219,6 @@
}
#endif
-#ifdef CONFIG_NFS_V4_1
-int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *);
-#endif
-
/* callback_xdr.c */
extern struct svc_version nfs4_callback_version1;
extern struct svc_version nfs4_callback_version4;
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index 9b04c2e..267126d 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1103,6 +1103,7 @@
{
encode_diropargs3(xdr, args->fromfh, args->fromname, args->fromlen);
encode_symlinkdata3(xdr, args);
+ xdr->buf->flags |= XDRBUF_WRITE;
}
/*
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index ea3bee9..50cfc4c 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -405,9 +405,7 @@
int nfs41_discover_server_trunking(struct nfs_client *clp,
struct nfs_client **, struct rpc_cred *);
extern void nfs4_schedule_session_recovery(struct nfs4_session *, int);
-extern void nfs41_server_notify_target_slotid_update(struct nfs_client *clp);
-extern void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp);
-
+extern void nfs41_notify_server(struct nfs_client *);
#else
static inline void nfs4_schedule_session_recovery(struct nfs4_session *session, int err)
{
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 3aa6a9b..223bedd 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -729,10 +729,7 @@
return false;
/* Match only the IP address, not the port number */
- if (!nfs_sockaddr_match_ipaddr(addr, clap))
- return false;
-
- return true;
+ return rpc_cmp_addr(addr, clap);
}
/*
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index 535dfc6..2e49022 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -184,7 +184,7 @@
.read = user_read,
};
-static int nfs_idmap_init_keyring(void)
+int nfs_idmap_init(void)
{
struct cred *cred;
struct key *keyring;
@@ -230,7 +230,7 @@
return ret;
}
-static void nfs_idmap_quit_keyring(void)
+void nfs_idmap_quit(void)
{
key_revoke(id_resolver_cache->thread_keyring);
unregister_key_type(&key_type_id_resolver);
@@ -492,16 +492,6 @@
kfree(idmap);
}
-int nfs_idmap_init(void)
-{
- return nfs_idmap_init_keyring();
-}
-
-void nfs_idmap_quit(void)
-{
- nfs_idmap_quit_keyring();
-}
-
static int nfs_idmap_prepare_message(char *desc, struct idmap *idmap,
struct idmap_msg *im,
struct rpc_pipe_msg *msg)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 15ee8bd..f4e5816 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -586,7 +586,7 @@
spin_unlock(&tbl->slot_tbl_lock);
res->sr_slot = NULL;
if (send_new_highest_used_slotid)
- nfs41_server_notify_highest_slotid_update(session->clp);
+ nfs41_notify_server(session->clp);
}
int nfs41_sequence_done(struct rpc_task *task, struct nfs4_sequence_res *res)
@@ -8661,6 +8661,7 @@
.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
.state_renewal_ops = &nfs41_state_renewal_ops,
+ .mig_recovery_ops = &nfs41_mig_recovery_ops,
};
#endif
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f2e2ad8..da73bc4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2152,23 +2152,13 @@
}
EXPORT_SYMBOL_GPL(nfs4_schedule_session_recovery);
-static void nfs41_ping_server(struct nfs_client *clp)
+void nfs41_notify_server(struct nfs_client *clp)
{
/* Use CHECK_LEASE to ping the server with a SEQUENCE */
set_bit(NFS4CLNT_CHECK_LEASE, &clp->cl_state);
nfs4_schedule_state_manager(clp);
}
-void nfs41_server_notify_target_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
-void nfs41_server_notify_highest_slotid_update(struct nfs_client *clp)
-{
- nfs41_ping_server(clp);
-}
-
static void nfs4_reset_all_state(struct nfs_client *clp)
{
if (test_and_set_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) == 0) {
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 558cd65d..c42459e 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1154,7 +1154,9 @@
case NF4LNK:
p = reserve_space(xdr, 4);
*p = cpu_to_be32(create->u.symlink.len);
- xdr_write_pages(xdr, create->u.symlink.pages, 0, create->u.symlink.len);
+ xdr_write_pages(xdr, create->u.symlink.pages, 0,
+ create->u.symlink.len);
+ xdr->buf->flags |= XDRBUF_WRITE;
break;
case NF4BLK: case NF4CHR:
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 4984bbe..7c5718b 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -77,8 +77,8 @@
void nfs_set_pgio_error(struct nfs_pgio_header *hdr, int error, loff_t pos)
{
spin_lock(&hdr->lock);
- if (pos < hdr->io_start + hdr->good_bytes) {
- set_bit(NFS_IOHDR_ERROR, &hdr->flags);
+ if (!test_and_set_bit(NFS_IOHDR_ERROR, &hdr->flags)
+ || pos < hdr->io_start + hdr->good_bytes) {
clear_bit(NFS_IOHDR_EOF, &hdr->flags);
hdr->good_bytes = pos - hdr->io_start;
hdr->error = error;
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index cd3c040..bbd407b 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -373,26 +373,31 @@
return false;
}
+/*
+ * Checks if 'dsaddrs1' contains a subset of 'dsaddrs2'. If it does,
+ * declare a match.
+ */
static bool
_same_data_server_addrs_locked(const struct list_head *dsaddrs1,
const struct list_head *dsaddrs2)
{
struct nfs4_pnfs_ds_addr *da1, *da2;
+ struct sockaddr *sa1, *sa2;
+ bool match = false;
- /* step through both lists, comparing as we go */
- for (da1 = list_first_entry(dsaddrs1, typeof(*da1), da_node),
- da2 = list_first_entry(dsaddrs2, typeof(*da2), da_node);
- da1 != NULL && da2 != NULL;
- da1 = list_entry(da1->da_node.next, typeof(*da1), da_node),
- da2 = list_entry(da2->da_node.next, typeof(*da2), da_node)) {
- if (!same_sockaddr((struct sockaddr *)&da1->da_addr,
- (struct sockaddr *)&da2->da_addr))
- return false;
+ list_for_each_entry(da1, dsaddrs1, da_node) {
+ sa1 = (struct sockaddr *)&da1->da_addr;
+ match = false;
+ list_for_each_entry(da2, dsaddrs2, da_node) {
+ sa2 = (struct sockaddr *)&da2->da_addr;
+ match = same_sockaddr(sa1, sa2);
+ if (match)
+ break;
+ }
+ if (!match)
+ break;
}
- if (da1 == NULL && da2 == NULL)
- return true;
-
- return false;
+ return match;
}
/*
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index fdee927..388f480 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -1812,7 +1812,7 @@
return res;
}
-static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc)
+int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
{
struct nfs_inode *nfsi = NFS_I(inode);
int flags = FLUSH_SYNC;
@@ -1847,11 +1847,6 @@
__mark_inode_dirty(inode, I_DIRTY_DATASYNC);
return ret;
}
-
-int nfs_write_inode(struct inode *inode, struct writeback_control *wbc)
-{
- return nfs_commit_unstable_pages(inode, wbc);
-}
EXPORT_SYMBOL_GPL(nfs_write_inode);
/*
diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c
index 9aa2796..6d834dc 100644
--- a/fs/nfsd/blocklayoutxdr.c
+++ b/fs/nfsd/blocklayoutxdr.c
@@ -101,7 +101,7 @@
}
nr_iomaps = be32_to_cpup(p++);
- expected = sizeof(__be32) + nr_iomaps * NFS4_BLOCK_EXTENT_SIZE;
+ expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE;
if (len != expected) {
dprintk("%s: extent array size mismatch: %u/%u\n",
__func__, len, expected);
diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h
index fdc7903..6de925f 100644
--- a/fs/nfsd/blocklayoutxdr.h
+++ b/fs/nfsd/blocklayoutxdr.h
@@ -7,13 +7,6 @@
struct iomap;
struct xdr_stream;
-enum pnfs_block_extent_state {
- PNFS_BLOCK_READWRITE_DATA = 0,
- PNFS_BLOCK_READ_DATA = 1,
- PNFS_BLOCK_INVALID_DATA = 2,
- PNFS_BLOCK_NONE_DATA = 3,
-};
-
struct pnfs_block_extent {
struct nfsd4_deviceid vol_id;
u64 foff;
@@ -21,14 +14,6 @@
u64 soff;
enum pnfs_block_extent_state es;
};
-#define NFS4_BLOCK_EXTENT_SIZE 44
-
-enum pnfs_block_volume_type {
- PNFS_BLOCK_VOLUME_SIMPLE = 0,
- PNFS_BLOCK_VOLUME_SLICE = 1,
- PNFS_BLOCK_VOLUME_CONCAT = 2,
- PNFS_BLOCK_VOLUME_STRIPE = 3,
-};
/*
* Random upper cap for the uuid length to avoid unbounded allocation.
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index b8e72aa..00121f2 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -547,6 +547,24 @@
NOTIFY_DEVICEID4_DELETE = 1 << 2,
};
+enum pnfs_block_volume_type {
+ PNFS_BLOCK_VOLUME_SIMPLE = 0,
+ PNFS_BLOCK_VOLUME_SLICE = 1,
+ PNFS_BLOCK_VOLUME_CONCAT = 2,
+ PNFS_BLOCK_VOLUME_STRIPE = 3,
+};
+
+enum pnfs_block_extent_state {
+ PNFS_BLOCK_READWRITE_DATA = 0,
+ PNFS_BLOCK_READ_DATA = 1,
+ PNFS_BLOCK_INVALID_DATA = 2,
+ PNFS_BLOCK_NONE_DATA = 3,
+};
+
+/* on the wire size of a block layout extent */
+#define PNFS_BLOCK_EXTENT_SIZE \
+ (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE)
+
#define NFL4_UFLG_MASK 0x0000003F
#define NFL4_UFLG_DENSE 0x00000001
#define NFL4_UFLG_COMMIT_THRU_MDS 0x00000002
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 874b772..c0e9614 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -353,7 +353,6 @@
extern void nfs_access_set_mask(struct nfs_access_entry *, u32);
extern int nfs_permission(struct inode *, int);
extern int nfs_open(struct inode *, struct file *);
-extern int nfs_release(struct inode *, struct file *);
extern int nfs_attribute_timeout(struct inode *inode);
extern int nfs_attribute_cache_expired(struct inode *inode);
extern int nfs_revalidate_inode(struct nfs_server *server, struct inode *inode);
@@ -371,6 +370,7 @@
extern struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry, fmode_t f_mode);
extern void nfs_inode_attach_open_context(struct nfs_open_context *ctx);
extern void nfs_file_set_open_context(struct file *filp, struct nfs_open_context *ctx);
+extern void nfs_file_clear_open_context(struct file *flip);
extern struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx);
extern void nfs_put_lock_context(struct nfs_lock_context *l_ctx);
extern u64 nfs_compat_user_ino64(u64 fileid);
diff --git a/include/linux/sunrpc/addr.h b/include/linux/sunrpc/addr.h
index 07d8e53..9bc3538 100644
--- a/include/linux/sunrpc/addr.h
+++ b/include/linux/sunrpc/addr.h
@@ -46,8 +46,8 @@
#define IPV6_SCOPE_DELIMITER '%'
#define IPV6_SCOPE_ID_LEN sizeof("%nnnnnnnnnn")
-static inline bool __rpc_cmp_addr4(const struct sockaddr *sap1,
- const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr4(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
{
const struct sockaddr_in *sin1 = (const struct sockaddr_in *)sap1;
const struct sockaddr_in *sin2 = (const struct sockaddr_in *)sap2;
@@ -67,8 +67,8 @@
}
#if IS_ENABLED(CONFIG_IPV6)
-static inline bool __rpc_cmp_addr6(const struct sockaddr *sap1,
- const struct sockaddr *sap2)
+static inline bool rpc_cmp_addr6(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
{
const struct sockaddr_in6 *sin1 = (const struct sockaddr_in6 *)sap1;
const struct sockaddr_in6 *sin2 = (const struct sockaddr_in6 *)sap2;
@@ -122,15 +122,28 @@
if (sap1->sa_family == sap2->sa_family) {
switch (sap1->sa_family) {
case AF_INET:
- return __rpc_cmp_addr4(sap1, sap2);
+ return rpc_cmp_addr4(sap1, sap2);
case AF_INET6:
- return __rpc_cmp_addr6(sap1, sap2);
+ return rpc_cmp_addr6(sap1, sap2);
}
}
return false;
}
/**
+ * rpc_cmp_addr_port - compare the address and port number of two sockaddrs.
+ * @sap1: first sockaddr
+ * @sap2: second sockaddr
+ */
+static inline bool rpc_cmp_addr_port(const struct sockaddr *sap1,
+ const struct sockaddr *sap2)
+{
+ if (!rpc_cmp_addr(sap1, sap2))
+ return false;
+ return rpc_get_port(sap1) == rpc_get_port(sap2);
+}
+
+/**
* rpc_copy_addr - copy the address portion of one sockaddr to another
* @dst: destination sockaddr
* @src: source sockaddr
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index b176130..b7b279b 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -49,7 +49,7 @@
* a single chunk type per message is supported currently.
*/
#define RPCRDMA_MIN_SLOT_TABLE (2U)
-#define RPCRDMA_DEF_SLOT_TABLE (32U)
+#define RPCRDMA_DEF_SLOT_TABLE (128U)
#define RPCRDMA_MAX_SLOT_TABLE (256U)
#define RPCRDMA_DEF_INLINE (1024) /* default inline max */
diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h
index b0f898e..43c1cf0 100644
--- a/include/rdma/ib_verbs.h
+++ b/include/rdma/ib_verbs.h
@@ -2760,52 +2760,6 @@
}
/**
- * ib_reg_phys_mr - Prepares a virtually addressed memory region for use
- * by an HCA.
- * @pd: The protection domain associated assigned to the registered region.
- * @phys_buf_array: Specifies a list of physical buffers to use in the
- * memory region.
- * @num_phys_buf: Specifies the size of the phys_buf_array.
- * @mr_access_flags: Specifies the memory access rights.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-struct ib_mr *ib_reg_phys_mr(struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
- * ib_rereg_phys_mr - Modifies the attributes of an existing memory region.
- * Conceptually, this call performs the functions deregister memory region
- * followed by register physical memory region. Where possible,
- * resources are reused instead of deallocated and reallocated.
- * @mr: The memory region to modify.
- * @mr_rereg_mask: A bit-mask used to indicate which of the following
- * properties of the memory region are being modified.
- * @pd: If %IB_MR_REREG_PD is set in mr_rereg_mask, this field specifies
- * the new protection domain to associated with the memory region,
- * otherwise, this parameter is ignored.
- * @phys_buf_array: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies a list of physical buffers to use in the new
- * translation, otherwise, this parameter is ignored.
- * @num_phys_buf: If %IB_MR_REREG_TRANS is set in mr_rereg_mask, this
- * field specifies the size of the phys_buf_array, otherwise, this
- * parameter is ignored.
- * @mr_access_flags: If %IB_MR_REREG_ACCESS is set in mr_rereg_mask, this
- * field specifies the new memory access rights, otherwise, this
- * parameter is ignored.
- * @iova_start: The offset of the region's starting I/O virtual address.
- */
-int ib_rereg_phys_mr(struct ib_mr *mr,
- int mr_rereg_mask,
- struct ib_pd *pd,
- struct ib_phys_buf *phys_buf_array,
- int num_phys_buf,
- int mr_access_flags,
- u64 *iova_start);
-
-/**
* ib_query_mr - Retrieves information about a specific memory region.
* @mr: The memory region to retrieve information about.
* @mr_attr: The attributes of the specified memory region.
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index f1e8daf..cb25c89 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -39,6 +39,25 @@
fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ struct ib_mr *mr;
+
+ /* Obtain an lkey to use for the regbufs, which are
+ * protected from remote access.
+ */
+ if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+ } else {
+ mr = ib_get_dma_mr(ia->ri_pd, IB_ACCESS_LOCAL_WRITE);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+ ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+ ia->ri_dma_mr = mr;
+ }
+
return 0;
}
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 04ea914..63f282e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -189,6 +189,11 @@
struct ib_device_attr *devattr = &ia->ri_devattr;
int depth, delta;
+ /* Obtain an lkey to use for the regbufs, which are
+ * protected from remote access.
+ */
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
devattr->max_fast_reg_page_list_len);
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 41985d0..72cf8b1 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -23,6 +23,29 @@
physical_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
+ struct ib_device_attr *devattr = &ia->ri_devattr;
+ struct ib_mr *mr;
+
+ /* Obtain an rkey to use for RPC data payloads.
+ */
+ mr = ib_get_dma_mr(ia->ri_pd,
+ IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_READ);
+ if (IS_ERR(mr)) {
+ pr_err("%s: ib_get_dma_mr for failed with %lX\n",
+ __func__, PTR_ERR(mr));
+ return -ENOMEM;
+ }
+ ia->ri_dma_mr = mr;
+
+ /* Obtain an lkey to use for regbufs.
+ */
+ if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)
+ ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
+ else
+ ia->ri_dma_lkey = ia->ri_dma_mr->lkey;
+
return 0;
}
@@ -51,7 +74,7 @@
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
- seg->mr_rkey = ia->ri_bind_mem->rkey;
+ seg->mr_rkey = ia->ri_dma_mr->rkey;
seg->mr_base = seg->mr_dma;
seg->mr_nsegs = 1;
return 1;
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 84ea37d..bc8bd65 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -71,6 +71,67 @@
};
#endif
+/* The client can send a request inline as long as the RPCRDMA header
+ * plus the RPC call fit under the transport's inline limit. If the
+ * combined call message size exceeds that limit, the client must use
+ * the read chunk list for this operation.
+ */
+static bool rpcrdma_args_inline(struct rpc_rqst *rqst)
+{
+ unsigned int callsize = RPCRDMA_HDRLEN_MIN + rqst->rq_snd_buf.len;
+
+ return callsize <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+}
+
+/* The client can't know how large the actual reply will be. Thus it
+ * plans for the largest possible reply for that particular ULP
+ * operation. If the maximum combined reply message size exceeds that
+ * limit, the client must provide a write list or a reply chunk for
+ * this request.
+ */
+static bool rpcrdma_results_inline(struct rpc_rqst *rqst)
+{
+ unsigned int repsize = RPCRDMA_HDRLEN_MIN + rqst->rq_rcv_buf.buflen;
+
+ return repsize <= RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+}
+
+static int
+rpcrdma_tail_pullup(struct xdr_buf *buf)
+{
+ size_t tlen = buf->tail[0].iov_len;
+ size_t skip = tlen & 3;
+
+ /* Do not include the tail if it is only an XDR pad */
+ if (tlen < 4)
+ return 0;
+
+ /* xdr_write_pages() adds a pad at the beginning of the tail
+ * if the content in "buf->pages" is unaligned. Force the
+ * tail's actual content to land at the next XDR position
+ * after the head instead.
+ */
+ if (skip) {
+ unsigned char *src, *dst;
+ unsigned int count;
+
+ src = buf->tail[0].iov_base;
+ dst = buf->head[0].iov_base;
+ dst += buf->head[0].iov_len;
+
+ src += skip;
+ tlen -= skip;
+
+ dprintk("RPC: %s: skip=%zu, memmove(%p, %p, %zu)\n",
+ __func__, skip, dst, src, tlen);
+
+ for (count = tlen; count; count--)
+ *dst++ = *src++;
+ }
+
+ return tlen;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -122,6 +183,10 @@
if (len && n == nsegs)
return -EIO;
+ /* When encoding the read list, the tail is always sent inline */
+ if (type == rpcrdma_readch)
+ return n;
+
if (xdrbuf->tail[0].iov_len) {
/* the rpcrdma protocol allows us to omit any trailing
* xdr pad bytes, saving the server an RDMA operation. */
@@ -297,8 +362,7 @@
* pre-registered memory buffer for this request. For small amounts
* of data, this is efficient. The cutoff value is tunable.
*/
-static int
-rpcrdma_inline_pullup(struct rpc_rqst *rqst, int pad)
+static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
{
int i, npages, curlen;
int copy_len;
@@ -310,16 +374,9 @@
destp = rqst->rq_svec[0].iov_base;
curlen = rqst->rq_svec[0].iov_len;
destp += curlen;
- /*
- * Do optional padding where it makes sense. Alignment of write
- * payload can help the server, if our setting is accurate.
- */
- pad -= (curlen + 36/*sizeof(struct rpcrdma_msg_padded)*/);
- if (pad < 0 || rqst->rq_slen - curlen < RPCRDMA_INLINE_PAD_THRESH)
- pad = 0; /* don't pad this request */
- dprintk("RPC: %s: pad %d destp 0x%p len %d hdrlen %d\n",
- __func__, pad, destp, rqst->rq_slen, curlen);
+ dprintk("RPC: %s: destp 0x%p len %d hdrlen %d\n",
+ __func__, destp, rqst->rq_slen, curlen);
copy_len = rqst->rq_snd_buf.page_len;
@@ -355,7 +412,6 @@
page_base = 0;
}
/* header now contains entire send message */
- return pad;
}
/*
@@ -380,7 +436,7 @@
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
char *base;
- size_t rpclen, padlen;
+ size_t rpclen;
ssize_t hdrlen;
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
@@ -402,28 +458,15 @@
/*
* Chunks needed for results?
*
+ * o Read ops return data as write chunk(s), header as inline.
* o If the expected result is under the inline threshold, all ops
- * return as inline (but see later).
+ * return as inline.
* o Large non-read ops return as a single reply chunk.
- * o Large read ops return data as write chunk(s), header as inline.
- *
- * Note: the NFS code sending down multiple result segments implies
- * the op is one of read, readdir[plus], readlink or NFSv4 getacl.
*/
-
- /*
- * This code can handle read chunks, write chunks OR reply
- * chunks -- only one type. If the request is too big to fit
- * inline, then we will choose read chunks. If the request is
- * a READ, then use write chunks to separate the file data
- * into pages; otherwise use reply chunks.
- */
- if (rqst->rq_rcv_buf.buflen <= RPCRDMA_INLINE_READ_THRESHOLD(rqst))
- wtype = rpcrdma_noch;
- else if (rqst->rq_rcv_buf.page_len == 0)
- wtype = rpcrdma_replych;
- else if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
+ if (rqst->rq_rcv_buf.flags & XDRBUF_READ)
wtype = rpcrdma_writech;
+ else if (rpcrdma_results_inline(rqst))
+ wtype = rpcrdma_noch;
else
wtype = rpcrdma_replych;
@@ -432,21 +475,25 @@
*
* o If the total request is under the inline threshold, all ops
* are sent as inline.
- * o Large non-write ops are sent with the entire message as a
- * single read chunk (protocol 0-position special case).
* o Large write ops transmit data as read chunk(s), header as
* inline.
+ * o Large non-write ops are sent with the entire message as a
+ * single read chunk (protocol 0-position special case).
*
- * Note: the NFS code sending down multiple argument segments
- * implies the op is a write.
- * TBD check NFSv4 setacl
+ * This assumes that the upper layer does not present a request
+ * that both has a data payload, and whose non-data arguments
+ * by themselves are larger than the inline threshold.
*/
- if (rqst->rq_snd_buf.len <= RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
+ if (rpcrdma_args_inline(rqst)) {
rtype = rpcrdma_noch;
- else if (rqst->rq_snd_buf.page_len == 0)
- rtype = rpcrdma_areadch;
- else
+ } else if (rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
rtype = rpcrdma_readch;
+ } else {
+ r_xprt->rx_stats.nomsg_call_count++;
+ headerp->rm_type = htonl(RDMA_NOMSG);
+ rtype = rpcrdma_areadch;
+ rpclen = 0;
+ }
/* The following simplification is not true forever */
if (rtype != rpcrdma_noch && wtype == rpcrdma_replych)
@@ -458,7 +505,6 @@
}
hdrlen = RPCRDMA_HDRLEN_MIN;
- padlen = 0;
/*
* Pull up any extra send data into the preregistered buffer.
@@ -467,45 +513,15 @@
*/
if (rtype == rpcrdma_noch) {
- padlen = rpcrdma_inline_pullup(rqst,
- RPCRDMA_INLINE_PAD_VALUE(rqst));
+ rpcrdma_inline_pullup(rqst);
- if (padlen) {
- headerp->rm_type = rdma_msgp;
- headerp->rm_body.rm_padded.rm_align =
- cpu_to_be32(RPCRDMA_INLINE_PAD_VALUE(rqst));
- headerp->rm_body.rm_padded.rm_thresh =
- cpu_to_be32(RPCRDMA_INLINE_PAD_THRESH);
- headerp->rm_body.rm_padded.rm_pempty[0] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[1] = xdr_zero;
- headerp->rm_body.rm_padded.rm_pempty[2] = xdr_zero;
- hdrlen += 2 * sizeof(u32); /* extra words in padhdr */
- if (wtype != rpcrdma_noch) {
- dprintk("RPC: %s: invalid chunk list\n",
- __func__);
- return -EIO;
- }
- } else {
- headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
- headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
- /* new length after pullup */
- rpclen = rqst->rq_svec[0].iov_len;
- /*
- * Currently we try to not actually use read inline.
- * Reply chunks have the desirable property that
- * they land, packed, directly in the target buffers
- * without headers, so they require no fixup. The
- * additional RDMA Write op sends the same amount
- * of data, streams on-the-wire and adds no overhead
- * on receive. Therefore, we request a reply chunk
- * for non-writes wherever feasible and efficient.
- */
- if (wtype == rpcrdma_noch)
- wtype = rpcrdma_replych;
- }
- }
-
+ headerp->rm_body.rm_nochunks.rm_empty[0] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[1] = xdr_zero;
+ headerp->rm_body.rm_nochunks.rm_empty[2] = xdr_zero;
+ /* new length after pullup */
+ rpclen = rqst->rq_svec[0].iov_len;
+ } else if (rtype == rpcrdma_readch)
+ rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
if (rtype != rpcrdma_noch) {
hdrlen = rpcrdma_create_chunks(rqst, &rqst->rq_snd_buf,
headerp, rtype);
@@ -518,9 +534,9 @@
if (hdrlen < 0)
return hdrlen;
- dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd padlen %zd"
+ dprintk("RPC: %s: %s: hdrlen %zd rpclen %zd"
" headerp 0x%p base 0x%p lkey 0x%x\n",
- __func__, transfertypes[wtype], hdrlen, rpclen, padlen,
+ __func__, transfertypes[wtype], hdrlen, rpclen,
headerp, base, rdmab_lkey(req->rl_rdmabuf));
/*
@@ -534,26 +550,15 @@
req->rl_send_iov[0].length = hdrlen;
req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+ req->rl_niovs = 1;
+ if (rtype == rpcrdma_areadch)
+ return 0;
+
req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
req->rl_send_iov[1].length = rpclen;
req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
req->rl_niovs = 2;
-
- if (padlen) {
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- req->rl_send_iov[2].addr = rdmab_addr(ep->rep_padbuf);
- req->rl_send_iov[2].length = padlen;
- req->rl_send_iov[2].lkey = rdmab_lkey(ep->rep_padbuf);
-
- req->rl_send_iov[3].addr = req->rl_send_iov[1].addr + rpclen;
- req->rl_send_iov[3].length = rqst->rq_slen - rpclen;
- req->rl_send_iov[3].lkey = rdmab_lkey(req->rl_sendbuf);
-
- req->rl_niovs = 4;
- }
-
return 0;
}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 680f888..64443eb 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -175,10 +175,8 @@
}
static void
-xprt_rdma_format_addresses(struct rpc_xprt *xprt)
+xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
- struct sockaddr *sap = (struct sockaddr *)
- &rpcx_to_rdmad(xprt).addr;
char buf[128];
switch (sap->sa_family) {
@@ -302,7 +300,7 @@
struct rpc_xprt *xprt;
struct rpcrdma_xprt *new_xprt;
struct rpcrdma_ep *new_ep;
- struct sockaddr_in *sin;
+ struct sockaddr *sap;
int rc;
if (args->addrlen > sizeof(xprt->addr)) {
@@ -333,26 +331,20 @@
* Set up RDMA-specific connect data.
*/
- /* Put server RDMA address in local cdata */
- memcpy(&cdata.addr, args->dstaddr, args->addrlen);
+ sap = (struct sockaddr *)&cdata.addr;
+ memcpy(sap, args->dstaddr, args->addrlen);
/* Ensure xprt->addr holds valid server TCP (not RDMA)
* address, for any side protocols which peek at it */
xprt->prot = IPPROTO_TCP;
xprt->addrlen = args->addrlen;
- memcpy(&xprt->addr, &cdata.addr, xprt->addrlen);
+ memcpy(&xprt->addr, sap, xprt->addrlen);
- sin = (struct sockaddr_in *)&cdata.addr;
- if (ntohs(sin->sin_port) != 0)
+ if (rpc_get_port(sap))
xprt_set_bound(xprt);
- dprintk("RPC: %s: %pI4:%u\n",
- __func__, &sin->sin_addr.s_addr, ntohs(sin->sin_port));
-
- /* Set max requests */
cdata.max_requests = xprt->max_reqs;
- /* Set some length limits */
cdata.rsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA write max */
cdata.wsize = RPCRDMA_MAX_SEGS * PAGE_SIZE; /* RDMA read max */
@@ -375,8 +367,7 @@
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt, (struct sockaddr *) &cdata.addr,
- xprt_rdma_memreg_strategy);
+ rc = rpcrdma_ia_open(new_xprt, sap, xprt_rdma_memreg_strategy);
if (rc)
goto out1;
@@ -409,7 +400,7 @@
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
- xprt_rdma_format_addresses(xprt);
+ xprt_rdma_format_addresses(xprt, sap);
xprt->max_payload = new_xprt->rx_ia.ri_ops->ro_maxpages(new_xprt);
if (xprt->max_payload == 0)
goto out4;
@@ -420,6 +411,9 @@
if (!try_module_get(THIS_MODULE))
goto out4;
+ dprintk("RPC: %s: %s:%s\n", __func__,
+ xprt->address_strings[RPC_DISPLAY_ADDR],
+ xprt->address_strings[RPC_DISPLAY_PORT]);
return xprt;
out4:
@@ -653,31 +647,30 @@
if (xprt_connected(xprt))
idle_time = (long)(jiffies - xprt->last_used) / HZ;
- seq_printf(seq,
- "\txprt:\trdma %u %lu %lu %lu %ld %lu %lu %lu %Lu %Lu "
- "%lu %lu %lu %Lu %Lu %Lu %Lu %lu %lu %lu\n",
-
- 0, /* need a local port? */
- xprt->stat.bind_count,
- xprt->stat.connect_count,
- xprt->stat.connect_time,
- idle_time,
- xprt->stat.sends,
- xprt->stat.recvs,
- xprt->stat.bad_xids,
- xprt->stat.req_u,
- xprt->stat.bklog_u,
-
- r_xprt->rx_stats.read_chunk_count,
- r_xprt->rx_stats.write_chunk_count,
- r_xprt->rx_stats.reply_chunk_count,
- r_xprt->rx_stats.total_rdma_request,
- r_xprt->rx_stats.total_rdma_reply,
- r_xprt->rx_stats.pullup_copy_count,
- r_xprt->rx_stats.fixup_copy_count,
- r_xprt->rx_stats.hardway_register_count,
- r_xprt->rx_stats.failed_marshal_count,
- r_xprt->rx_stats.bad_reply_count);
+ seq_puts(seq, "\txprt:\trdma ");
+ seq_printf(seq, "%u %lu %lu %lu %ld %lu %lu %lu %llu %llu ",
+ 0, /* need a local port? */
+ xprt->stat.bind_count,
+ xprt->stat.connect_count,
+ xprt->stat.connect_time,
+ idle_time,
+ xprt->stat.sends,
+ xprt->stat.recvs,
+ xprt->stat.bad_xids,
+ xprt->stat.req_u,
+ xprt->stat.bklog_u);
+ seq_printf(seq, "%lu %lu %lu %llu %llu %llu %llu %lu %lu %lu %lu\n",
+ r_xprt->rx_stats.read_chunk_count,
+ r_xprt->rx_stats.write_chunk_count,
+ r_xprt->rx_stats.reply_chunk_count,
+ r_xprt->rx_stats.total_rdma_request,
+ r_xprt->rx_stats.total_rdma_reply,
+ r_xprt->rx_stats.pullup_copy_count,
+ r_xprt->rx_stats.fixup_copy_count,
+ r_xprt->rx_stats.hardway_register_count,
+ r_xprt->rx_stats.failed_marshal_count,
+ r_xprt->rx_stats.bad_reply_count,
+ r_xprt->rx_stats.nomsg_call_count);
}
static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 891c4ed..f73d7a7 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -52,6 +52,7 @@
#include <linux/prefetch.h>
#include <linux/sunrpc/addr.h>
#include <asm/bitops.h>
+#include <linux/module.h> /* try_module_get()/module_put() */
#include "xprt_rdma.h"
@@ -414,6 +415,14 @@
return 0;
}
+static void rpcrdma_destroy_id(struct rdma_cm_id *id)
+{
+ if (id) {
+ module_put(id->device->owner);
+ rdma_destroy_id(id);
+ }
+}
+
static struct rdma_cm_id *
rpcrdma_create_id(struct rpcrdma_xprt *xprt,
struct rpcrdma_ia *ia, struct sockaddr *addr)
@@ -440,6 +449,17 @@
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
+
+ /* FIXME:
+ * Until xprtrdma supports DEVICE_REMOVAL, the provider must
+ * be pinned while there are active NFS/RDMA mounts to prevent
+ * hangs and crashes at umount time.
+ */
+ if (!ia->ri_async_rc && !try_module_get(id->device->owner)) {
+ dprintk("RPC: %s: Failed to get device module\n",
+ __func__);
+ ia->ri_async_rc = -ENODEV;
+ }
rc = ia->ri_async_rc;
if (rc)
goto out;
@@ -449,16 +469,17 @@
if (rc) {
dprintk("RPC: %s: rdma_resolve_route() failed %i\n",
__func__, rc);
- goto out;
+ goto put;
}
wait_for_completion_interruptible_timeout(&ia->ri_done,
msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1);
rc = ia->ri_async_rc;
if (rc)
- goto out;
+ goto put;
return id;
-
+put:
+ module_put(id->device->owner);
out:
rdma_destroy_id(id);
return ERR_PTR(rc);
@@ -493,9 +514,11 @@
int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
- int rc, mem_priv;
struct rpcrdma_ia *ia = &xprt->rx_ia;
struct ib_device_attr *devattr = &ia->ri_devattr;
+ int rc;
+
+ ia->ri_dma_mr = NULL;
ia->ri_id = rpcrdma_create_id(xprt, ia, addr);
if (IS_ERR(ia->ri_id)) {
@@ -519,11 +542,6 @@
goto out3;
}
- if (devattr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) {
- ia->ri_have_dma_lkey = 1;
- ia->ri_dma_lkey = ia->ri_device->local_dma_lkey;
- }
-
if (memreg == RPCRDMA_FRMR) {
/* Requires both frmr reg and local dma lkey */
if (((devattr->device_cap_flags &
@@ -539,42 +557,19 @@
if (!ia->ri_device->alloc_fmr) {
dprintk("RPC: %s: MTHCAFMR registration "
"not supported by HCA\n", __func__);
- memreg = RPCRDMA_ALLPHYSICAL;
+ goto out3;
}
}
- /*
- * Optionally obtain an underlying physical identity mapping in
- * order to do a memory window-based bind. This base registration
- * is protected from remote access - that is enabled only by binding
- * for the specific bytes targeted during each RPC operation, and
- * revoked after the corresponding completion similar to a storage
- * adapter.
- */
switch (memreg) {
case RPCRDMA_FRMR:
ia->ri_ops = &rpcrdma_frwr_memreg_ops;
break;
case RPCRDMA_ALLPHYSICAL:
ia->ri_ops = &rpcrdma_physical_memreg_ops;
- mem_priv = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_REMOTE_READ;
- goto register_setup;
+ break;
case RPCRDMA_MTHCAFMR:
ia->ri_ops = &rpcrdma_fmr_memreg_ops;
- if (ia->ri_have_dma_lkey)
- break;
- mem_priv = IB_ACCESS_LOCAL_WRITE;
- register_setup:
- ia->ri_bind_mem = ib_get_dma_mr(ia->ri_pd, mem_priv);
- if (IS_ERR(ia->ri_bind_mem)) {
- printk(KERN_ALERT "%s: ib_get_dma_mr for "
- "phys register failed with %lX\n",
- __func__, PTR_ERR(ia->ri_bind_mem));
- rc = -ENOMEM;
- goto out3;
- }
break;
default:
printk(KERN_ERR "RPC: Unsupported memory "
@@ -592,7 +587,7 @@
ib_dealloc_pd(ia->ri_pd);
ia->ri_pd = NULL;
out2:
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
out1:
return rc;
@@ -606,19 +601,11 @@
void
rpcrdma_ia_close(struct rpcrdma_ia *ia)
{
- int rc;
-
dprintk("RPC: %s: entering\n", __func__);
- if (ia->ri_bind_mem != NULL) {
- rc = ib_dereg_mr(ia->ri_bind_mem);
- dprintk("RPC: %s: ib_dereg_mr returned %i\n",
- __func__, rc);
- }
-
if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
if (ia->ri_id->qp)
rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
+ rpcrdma_destroy_id(ia->ri_id);
ia->ri_id = NULL;
}
@@ -639,6 +626,12 @@
struct ib_cq_init_attr cq_attr = {};
int rc, err;
+ if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ dprintk("RPC: %s: insufficient sge's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+
/* check provider's send/recv wr limits */
if (cdata->max_requests > devattr->max_qp_wr)
cdata->max_requests = devattr->max_qp_wr;
@@ -651,21 +644,13 @@
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
- ep->rep_attr.cap.max_send_sge = (cdata->padding ? 4 : 2);
+ ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
ep->rep_attr.qp_type = IB_QPT_RC;
ep->rep_attr.port_num = ~0;
- if (cdata->padding) {
- ep->rep_padbuf = rpcrdma_alloc_regbuf(ia, cdata->padding,
- GFP_KERNEL);
- if (IS_ERR(ep->rep_padbuf))
- return PTR_ERR(ep->rep_padbuf);
- } else
- ep->rep_padbuf = NULL;
-
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
@@ -748,7 +733,8 @@
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, err);
out1:
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
+ if (ia->ri_dma_mr)
+ ib_dereg_mr(ia->ri_dma_mr);
return rc;
}
@@ -775,8 +761,6 @@
ia->ri_id->qp = NULL;
}
- rpcrdma_free_regbuf(ia, ep->rep_padbuf);
-
rpcrdma_clean_cq(ep->rep_attr.recv_cq);
rc = ib_destroy_cq(ep->rep_attr.recv_cq);
if (rc)
@@ -788,6 +772,12 @@
if (rc)
dprintk("RPC: %s: ib_destroy_cq returned %i\n",
__func__, rc);
+
+ if (ia->ri_dma_mr) {
+ rc = ib_dereg_mr(ia->ri_dma_mr);
+ dprintk("RPC: %s: ib_dereg_mr returned %i\n",
+ __func__, rc);
+ }
}
/*
@@ -825,7 +815,7 @@
if (ia->ri_device != id->device) {
printk("RPC: %s: can't reconnect on "
"different device!\n", __func__);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -834,7 +824,7 @@
if (rc) {
dprintk("RPC: %s: rdma_create_qp failed %i\n",
__func__, rc);
- rdma_destroy_id(id);
+ rpcrdma_destroy_id(id);
rc = -ENETUNREACH;
goto out;
}
@@ -845,7 +835,7 @@
write_unlock(&ia->ri_qplock);
rdma_destroy_qp(old);
- rdma_destroy_id(old);
+ rpcrdma_destroy_id(old);
} else {
dprintk("RPC: %s: connecting...\n", __func__);
rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &ep->rep_attr);
@@ -1229,75 +1219,6 @@
(unsigned long long)seg->mr_dma, seg->mr_dmalen);
}
-static int
-rpcrdma_register_internal(struct rpcrdma_ia *ia, void *va, int len,
- struct ib_mr **mrp, struct ib_sge *iov)
-{
- struct ib_phys_buf ipb;
- struct ib_mr *mr;
- int rc;
-
- /*
- * All memory passed here was kmalloc'ed, therefore phys-contiguous.
- */
- iov->addr = ib_dma_map_single(ia->ri_device,
- va, len, DMA_BIDIRECTIONAL);
- if (ib_dma_mapping_error(ia->ri_device, iov->addr))
- return -ENOMEM;
-
- iov->length = len;
-
- if (ia->ri_have_dma_lkey) {
- *mrp = NULL;
- iov->lkey = ia->ri_dma_lkey;
- return 0;
- } else if (ia->ri_bind_mem != NULL) {
- *mrp = NULL;
- iov->lkey = ia->ri_bind_mem->lkey;
- return 0;
- }
-
- ipb.addr = iov->addr;
- ipb.size = iov->length;
- mr = ib_reg_phys_mr(ia->ri_pd, &ipb, 1,
- IB_ACCESS_LOCAL_WRITE, &iov->addr);
-
- dprintk("RPC: %s: phys convert: 0x%llx "
- "registered 0x%llx length %d\n",
- __func__, (unsigned long long)ipb.addr,
- (unsigned long long)iov->addr, len);
-
- if (IS_ERR(mr)) {
- *mrp = NULL;
- rc = PTR_ERR(mr);
- dprintk("RPC: %s: failed with %i\n", __func__, rc);
- } else {
- *mrp = mr;
- iov->lkey = mr->lkey;
- rc = 0;
- }
-
- return rc;
-}
-
-static int
-rpcrdma_deregister_internal(struct rpcrdma_ia *ia,
- struct ib_mr *mr, struct ib_sge *iov)
-{
- int rc;
-
- ib_dma_unmap_single(ia->ri_device,
- iov->addr, iov->length, DMA_BIDIRECTIONAL);
-
- if (NULL == mr)
- return 0;
-
- rc = ib_dereg_mr(mr);
- if (rc)
- dprintk("RPC: %s: ib_dereg_mr failed %i\n", __func__, rc);
- return rc;
-}
-
/**
* rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
* @ia: controlling rpcrdma_ia
@@ -1317,26 +1238,29 @@
rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
{
struct rpcrdma_regbuf *rb;
- int rc;
+ struct ib_sge *iov;
- rc = -ENOMEM;
rb = kmalloc(sizeof(*rb) + size, flags);
if (rb == NULL)
goto out;
- rb->rg_size = size;
- rb->rg_owner = NULL;
- rc = rpcrdma_register_internal(ia, rb->rg_base, size,
- &rb->rg_mr, &rb->rg_iov);
- if (rc)
+ iov = &rb->rg_iov;
+ iov->addr = ib_dma_map_single(ia->ri_device,
+ (void *)rb->rg_base, size,
+ DMA_BIDIRECTIONAL);
+ if (ib_dma_mapping_error(ia->ri_device, iov->addr))
goto out_free;
+ iov->length = size;
+ iov->lkey = ia->ri_dma_lkey;
+ rb->rg_size = size;
+ rb->rg_owner = NULL;
return rb;
out_free:
kfree(rb);
out:
- return ERR_PTR(rc);
+ return ERR_PTR(-ENOMEM);
}
/**
@@ -1347,10 +1271,15 @@
void
rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
{
- if (rb) {
- rpcrdma_deregister_internal(ia, rb->rg_mr, &rb->rg_iov);
- kfree(rb);
- }
+ struct ib_sge *iov;
+
+ if (!rb)
+ return;
+
+ iov = &rb->rg_iov;
+ ib_dma_unmap_single(ia->ri_device,
+ iov->addr, iov->length, DMA_BIDIRECTIONAL);
+ kfree(rb);
}
/*
@@ -1363,9 +1292,11 @@
struct rpcrdma_ep *ep,
struct rpcrdma_req *req)
{
+ struct ib_device *device = ia->ri_device;
struct ib_send_wr send_wr, *send_wr_fail;
struct rpcrdma_rep *rep = req->rl_reply;
- int rc;
+ struct ib_sge *iov = req->rl_send_iov;
+ int i, rc;
if (rep) {
rc = rpcrdma_ep_post_recv(ia, ep, rep);
@@ -1376,22 +1307,15 @@
send_wr.next = NULL;
send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
- send_wr.sg_list = req->rl_send_iov;
+ send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
- if (send_wr.num_sge == 4) /* no need to sync any pad (constant) */
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[3].addr,
- req->rl_send_iov[3].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[1].addr,
- req->rl_send_iov[1].length,
- DMA_TO_DEVICE);
- ib_dma_sync_single_for_device(ia->ri_device,
- req->rl_send_iov[0].addr,
- req->rl_send_iov[0].length,
- DMA_TO_DEVICE);
+
+ for (i = 0; i < send_wr.num_sge; i++)
+ ib_dma_sync_single_for_device(device, iov[i].addr,
+ iov[i].length, DMA_TO_DEVICE);
+ dprintk("RPC: %s: posting %d s/g entries\n",
+ __func__, send_wr.num_sge);
if (DECR_CQCOUNT(ep) > 0)
send_wr.send_flags = 0;
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index f49dd8b..d252457 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,9 +65,8 @@
struct ib_device *ri_device;
struct rdma_cm_id *ri_id;
struct ib_pd *ri_pd;
- struct ib_mr *ri_bind_mem;
+ struct ib_mr *ri_dma_mr;
u32 ri_dma_lkey;
- int ri_have_dma_lkey;
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
@@ -89,7 +88,6 @@
int rep_connected;
struct ib_qp_init_attr rep_attr;
wait_queue_head_t rep_connect_wait;
- struct rpcrdma_regbuf *rep_padbuf;
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
@@ -119,7 +117,6 @@
struct rpcrdma_regbuf {
size_t rg_size;
struct rpcrdma_req *rg_owner;
- struct ib_mr *rg_mr;
struct ib_sge rg_iov;
__be32 rg_base[0] __attribute__ ((aligned(256)));
};
@@ -165,8 +162,7 @@
* struct rpcrdma_buffer. N is the max number of outstanding requests.
*/
-/* temporary static scatter/gather max */
-#define RPCRDMA_MAX_DATA_SEGS (64) /* max scatter/gather */
+#define RPCRDMA_MAX_DATA_SEGS ((1 * 1024 * 1024) / PAGE_SIZE)
#define RPCRDMA_MAX_SEGS (RPCRDMA_MAX_DATA_SEGS + 2) /* head+tail = 2 */
struct rpcrdma_buffer;
@@ -258,16 +254,18 @@
char *mr_offset; /* kva if no page, else offset */
};
+#define RPCRDMA_MAX_IOVS (2)
+
struct rpcrdma_req {
- unsigned int rl_niovs; /* 0, 2 or 4 */
- unsigned int rl_nchunks; /* non-zero if chunks */
- unsigned int rl_connect_cookie; /* retry detection */
- struct rpcrdma_buffer *rl_buffer; /* home base for this structure */
+ unsigned int rl_niovs;
+ unsigned int rl_nchunks;
+ unsigned int rl_connect_cookie;
+ struct rpcrdma_buffer *rl_buffer;
struct rpcrdma_rep *rl_reply;/* holder for reply buffer */
- struct ib_sge rl_send_iov[4]; /* for active requests */
- struct rpcrdma_regbuf *rl_rdmabuf;
- struct rpcrdma_regbuf *rl_sendbuf;
- struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+ struct ib_sge rl_send_iov[RPCRDMA_MAX_IOVS];
+ struct rpcrdma_regbuf *rl_rdmabuf;
+ struct rpcrdma_regbuf *rl_sendbuf;
+ struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
};
static inline struct rpcrdma_req *
@@ -342,6 +340,7 @@
unsigned long hardway_register_count;
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
+ unsigned long nomsg_call_count;
};
/*
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 0030376..86ed778 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -2279,13 +2279,14 @@
WARN_ON_ONCE(!xprt_lock_connect(xprt, task, transport));
- /* Start by resetting any existing state */
- xs_reset_transport(transport);
-
- if (transport->sock != NULL && !RPC_IS_SOFTCONN(task)) {
+ if (transport->sock != NULL) {
dprintk("RPC: xs_connect delayed xprt %p for %lu "
"seconds\n",
xprt, xprt->reestablish_timeout / HZ);
+
+ /* Start by resetting any existing state */
+ xs_reset_transport(transport);
+
queue_delayed_work(rpciod_workqueue,
&transport->connect_worker,
xprt->reestablish_timeout);