IB/srp: Prevent mapping failures
If both max_sectors and the queue_depth are high enough it can
happen that the MR pool is depleted temporarily. This causes
the SRP initiator to report mapping failures. Although the SRP
initiator recovers from such mapping failures, prevent that
this can happen by allocating more memory regions.
Additionally, only enable memory registration if at least two
pages can be registered per memory region.
Reported-by: Laurence Oberman <loberman@redhat.com>
Signed-off-by: Bart Van Assche <bart.vanassche@sandisk.com>
Tested-by: Laurence Oberman <loberman@redhat.com>
Cc: Christoph Hellwig <hch@lst.de>
Cc: Sagi Grimberg <sagi@grimberg.me>
Signed-off-by: Doug Ledford <dledford@redhat.com>
diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c
index a37a1f9..6a5ccd4c 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.c
+++ b/drivers/infiniband/ulp/srp/ib_srp.c
@@ -468,7 +468,7 @@
struct ib_qp *qp;
struct ib_fmr_pool *fmr_pool = NULL;
struct srp_fr_pool *fr_pool = NULL;
- const int m = dev->use_fast_reg ? 3 : 1;
+ const int m = 1 + dev->use_fast_reg * target->mr_per_cmd * 2;
int ret;
init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL);
@@ -849,7 +849,7 @@
for (i = 0; i < target->req_ring_size; ++i) {
req = &ch->req_ring[i];
- mr_list = kmalloc(target->cmd_sg_cnt * sizeof(void *),
+ mr_list = kmalloc(target->mr_per_cmd * sizeof(void *),
GFP_KERNEL);
if (!mr_list)
goto out;
@@ -1298,9 +1298,16 @@
srp_handle_qp_err(cq, wc, "FAST REG");
}
+/*
+ * Map up to sg_nents elements of state->sg where *sg_offset_p is the offset
+ * where to start in the first element. If sg_offset_p != NULL then
+ * *sg_offset_p is updated to the offset in state->sg[retval] of the first
+ * byte that has not yet been mapped.
+ */
static int srp_map_finish_fr(struct srp_map_state *state,
struct srp_request *req,
- struct srp_rdma_ch *ch, int sg_nents)
+ struct srp_rdma_ch *ch, int sg_nents,
+ unsigned int *sg_offset_p)
{
struct srp_target_port *target = ch->target;
struct srp_device *dev = target->srp_host->srp_dev;
@@ -1316,9 +1323,13 @@
WARN_ON_ONCE(!dev->use_fast_reg);
if (sg_nents == 1 && target->global_mr) {
- srp_map_desc(state, sg_dma_address(state->sg),
- sg_dma_len(state->sg),
+ unsigned int sg_offset = sg_offset_p ? *sg_offset_p : 0;
+
+ srp_map_desc(state, sg_dma_address(state->sg) + sg_offset,
+ sg_dma_len(state->sg) - sg_offset,
target->global_mr->rkey);
+ if (sg_offset_p)
+ *sg_offset_p = 0;
return 1;
}
@@ -1329,15 +1340,18 @@
rkey = ib_inc_rkey(desc->mr->rkey);
ib_update_fast_reg_key(desc->mr, rkey);
- n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, NULL, dev->mr_page_size);
+ n = ib_map_mr_sg(desc->mr, state->sg, sg_nents, sg_offset_p,
+ dev->mr_page_size);
if (unlikely(n < 0)) {
srp_fr_pool_put(ch->fr_pool, &desc, 1);
- pr_debug("%s: ib_map_mr_sg(%d) returned %d.\n",
+ pr_debug("%s: ib_map_mr_sg(%d, %d) returned %d.\n",
dev_name(&req->scmnd->device->sdev_gendev), sg_nents,
- n);
+ sg_offset_p ? *sg_offset_p : -1, n);
return n;
}
+ WARN_ON_ONCE(desc->mr->length == 0);
+
req->reg_cqe.done = srp_reg_mr_err_done;
wr.wr.next = NULL;
@@ -1358,8 +1372,10 @@
desc->mr->length, desc->mr->rkey);
err = ib_post_send(ch->qp, &wr.wr, &bad_wr);
- if (unlikely(err))
+ if (unlikely(err)) {
+ WARN_ON_ONCE(err == -ENOMEM);
return err;
+ }
return n;
}
@@ -1416,7 +1432,7 @@
state->pages = req->map_page;
state->fmr.next = req->fmr_list;
- state->fmr.end = req->fmr_list + ch->target->cmd_sg_cnt;
+ state->fmr.end = req->fmr_list + ch->target->mr_per_cmd;
for_each_sg(scat, sg, count, i) {
ret = srp_map_sg_entry(state, ch, sg, i);
@@ -1435,9 +1451,11 @@
struct srp_request *req, struct scatterlist *scat,
int count)
{
+ unsigned int sg_offset = 0;
+
state->desc = req->indirect_desc;
state->fr.next = req->fr_list;
- state->fr.end = req->fr_list + ch->target->cmd_sg_cnt;
+ state->fr.end = req->fr_list + ch->target->mr_per_cmd;
state->sg = scat;
if (count == 0)
@@ -1446,7 +1464,7 @@
while (count) {
int i, n;
- n = srp_map_finish_fr(state, req, ch, count);
+ n = srp_map_finish_fr(state, req, ch, count, &sg_offset);
if (unlikely(n < 0))
return n;
@@ -1511,9 +1529,10 @@
#ifdef CONFIG_NEED_SG_DMA_LENGTH
idb_sg->dma_length = idb_sg->length; /* hack^2 */
#endif
- ret = srp_map_finish_fr(&state, req, ch, 1);
+ ret = srp_map_finish_fr(&state, req, ch, 1, NULL);
if (ret < 0)
return ret;
+ WARN_ON_ONCE(ret < 1);
} else if (dev->use_fmr) {
state.pages = idb_pages;
state.pages[0] = (req->indirect_dma_addr &
@@ -1531,6 +1550,32 @@
return 0;
}
+#if defined(DYNAMIC_DATA_DEBUG)
+static void srp_check_mapping(struct srp_map_state *state,
+ struct srp_rdma_ch *ch, struct srp_request *req,
+ struct scatterlist *scat, int count)
+{
+ struct srp_device *dev = ch->target->srp_host->srp_dev;
+ struct srp_fr_desc **pfr;
+ u64 desc_len = 0, mr_len = 0;
+ int i;
+
+ for (i = 0; i < state->ndesc; i++)
+ desc_len += be32_to_cpu(req->indirect_desc[i].len);
+ if (dev->use_fast_reg)
+ for (i = 0, pfr = req->fr_list; i < state->nmdesc; i++, pfr++)
+ mr_len += (*pfr)->mr->length;
+ else if (dev->use_fmr)
+ for (i = 0; i < state->nmdesc; i++)
+ mr_len += be32_to_cpu(req->indirect_desc[i].len);
+ if (desc_len != scsi_bufflen(req->scmnd) ||
+ mr_len > scsi_bufflen(req->scmnd))
+ pr_err("Inconsistent: scsi len %d <> desc len %lld <> mr len %lld; ndesc %d; nmdesc = %d\n",
+ scsi_bufflen(req->scmnd), desc_len, mr_len,
+ state->ndesc, state->nmdesc);
+}
+#endif
+
/**
* srp_map_data() - map SCSI data buffer onto an SRP request
* @scmnd: SCSI command to map
@@ -1616,6 +1661,15 @@
if (ret < 0)
goto unmap;
+#if defined(DYNAMIC_DEBUG)
+ {
+ DEFINE_DYNAMIC_DEBUG_METADATA(ddm,
+ "Memory mapping consistency check");
+ if (unlikely(ddm.flags & _DPRINTK_FLAGS_PRINT))
+ srp_check_mapping(&state, ch, req, scat, count);
+ }
+#endif
+
/* We've mapped the request, now pull as much of the indirect
* descriptor table as we can into the command buffer. If this
* target is not using an external indirect table, we are
@@ -2580,6 +2634,20 @@
return srp_reconnect_rport(target->rport) == 0 ? SUCCESS : FAILED;
}
+static int srp_slave_alloc(struct scsi_device *sdev)
+{
+ struct Scsi_Host *shost = sdev->host;
+ struct srp_target_port *target = host_to_target(shost);
+ struct srp_device *srp_dev = target->srp_host->srp_dev;
+ struct ib_device *ibdev = srp_dev->dev;
+
+ if (!(ibdev->attrs.device_cap_flags & IB_DEVICE_SG_GAPS_REG))
+ blk_queue_virt_boundary(sdev->request_queue,
+ ~srp_dev->mr_page_mask);
+
+ return 0;
+}
+
static int srp_slave_configure(struct scsi_device *sdev)
{
struct Scsi_Host *shost = sdev->host;
@@ -2771,6 +2839,7 @@
.module = THIS_MODULE,
.name = "InfiniBand SRP initiator",
.proc_name = DRV_NAME,
+ .slave_alloc = srp_slave_alloc,
.slave_configure = srp_slave_configure,
.info = srp_target_info,
.queuecommand = srp_queuecommand,
@@ -3177,6 +3246,7 @@
struct srp_device *srp_dev = host->srp_dev;
struct ib_device *ibdev = srp_dev->dev;
int ret, node_idx, node, cpu, i;
+ unsigned int max_sectors_per_mr, mr_per_cmd = 0;
bool multich = false;
target_host = scsi_host_alloc(&srp_template,
@@ -3233,8 +3303,33 @@
target->sg_tablesize = target->cmd_sg_cnt;
}
+ if (srp_dev->use_fast_reg || srp_dev->use_fmr) {
+ /*
+ * FR and FMR can only map one HCA page per entry. If the
+ * start address is not aligned on a HCA page boundary two
+ * entries will be used for the head and the tail although
+ * these two entries combined contain at most one HCA page of
+ * data. Hence the "+ 1" in the calculation below.
+ *
+ * The indirect data buffer descriptor is contiguous so the
+ * memory for that buffer will only be registered if
+ * register_always is true. Hence add one to mr_per_cmd if
+ * register_always has been set.
+ */
+ max_sectors_per_mr = srp_dev->max_pages_per_mr <<
+ (ilog2(srp_dev->mr_page_size) - 9);
+ mr_per_cmd = register_always +
+ (target->scsi_host->max_sectors + 1 +
+ max_sectors_per_mr - 1) / max_sectors_per_mr;
+ pr_debug("max_sectors = %u; max_pages_per_mr = %u; mr_page_size = %u; max_sectors_per_mr = %u; mr_per_cmd = %u\n",
+ target->scsi_host->max_sectors,
+ srp_dev->max_pages_per_mr, srp_dev->mr_page_size,
+ max_sectors_per_mr, mr_per_cmd);
+ }
+
target_host->sg_tablesize = target->sg_tablesize;
- target->mr_pool_size = target->scsi_host->can_queue;
+ target->mr_pool_size = target->scsi_host->can_queue * mr_per_cmd;
+ target->mr_per_cmd = mr_per_cmd;
target->indirect_size = target->sg_tablesize *
sizeof (struct srp_direct_buf);
target->max_iu_len = sizeof (struct srp_cmd) +
@@ -3441,6 +3536,9 @@
srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1);
max_pages_per_mr = device->attrs.max_mr_size;
do_div(max_pages_per_mr, srp_dev->mr_page_size);
+ pr_debug("%s: %llu / %u = %llu <> %u\n", __func__,
+ device->attrs.max_mr_size, srp_dev->mr_page_size,
+ max_pages_per_mr, SRP_MAX_PAGES_PER_MR);
srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR,
max_pages_per_mr);
@@ -3448,12 +3546,13 @@
device->map_phys_fmr && device->unmap_fmr);
srp_dev->has_fr = (device->attrs.device_cap_flags &
IB_DEVICE_MEM_MGT_EXTENSIONS);
- if (!srp_dev->has_fmr && !srp_dev->has_fr)
+ if (!srp_dev->has_fmr && !srp_dev->has_fr) {
dev_warn(&device->dev, "neither FMR nor FR is supported\n");
-
- srp_dev->use_fast_reg = (srp_dev->has_fr &&
- (!srp_dev->has_fmr || prefer_fr));
- srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+ } else if (device->attrs.max_mr_size >= 2 * srp_dev->mr_page_size) {
+ srp_dev->use_fast_reg = (srp_dev->has_fr &&
+ (!srp_dev->has_fmr || prefer_fr));
+ srp_dev->use_fmr = !srp_dev->use_fast_reg && srp_dev->has_fmr;
+ }
if (srp_dev->use_fast_reg) {
srp_dev->max_pages_per_mr =
diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h
index a00914c..26bb9b0 100644
--- a/drivers/infiniband/ulp/srp/ib_srp.h
+++ b/drivers/infiniband/ulp/srp/ib_srp.h
@@ -203,6 +203,7 @@
unsigned int scsi_id;
unsigned int sg_tablesize;
int mr_pool_size;
+ int mr_per_cmd;
int queue_size;
int req_ring_size;
int comp_vector;