IB/iser: Add more RX CQs to scale out processing of SCSI responses

RX/TX CQs will now be selected from a per HCA pool.  For the RX flow
this has the effect of using different interrupt vectors when using
low level drivers (such as mlx4) that map the "vector" param provided
by the ULP on CQ creation to a dedicated IRQ/MSI-X vector.  This
allows the RX flow processing of IO responses to be distributed across
multiple CPUs.

QPs (--> iSER sessions) are assigned to CQs in round robin order using
the CQ with the minimum number of sessions attached to it.

Signed-off-by: Or Gerlitz <ogerlitz@mellanox.com>
Signed-off-by: Alex Tabachnik <alext@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c
index 2dddabd..95a49af 100644
--- a/drivers/infiniband/ulp/iser/iser_verbs.c
+++ b/drivers/infiniband/ulp/iser/iser_verbs.c
@@ -70,32 +70,50 @@
  */
 static int iser_create_device_ib_res(struct iser_device *device)
 {
+	int i, j;
+	struct iser_cq_desc *cq_desc;
+
+	device->cqs_used = min(ISER_MAX_CQ, device->ib_device->num_comp_vectors);
+	iser_err("using %d CQs, device %s supports %d vectors\n", device->cqs_used,
+		 device->ib_device->name, device->ib_device->num_comp_vectors);
+
+	device->cq_desc = kmalloc(sizeof(struct iser_cq_desc) * device->cqs_used,
+				  GFP_KERNEL);
+	if (device->cq_desc == NULL)
+		goto cq_desc_err;
+	cq_desc = device->cq_desc;
+
 	device->pd = ib_alloc_pd(device->ib_device);
 	if (IS_ERR(device->pd))
 		goto pd_err;
 
-	device->rx_cq = ib_create_cq(device->ib_device,
-				  iser_cq_callback,
-				  iser_cq_event_callback,
-				  (void *)device,
-				  ISER_MAX_RX_CQ_LEN, 0);
-	if (IS_ERR(device->rx_cq))
-		goto rx_cq_err;
+	for (i = 0; i < device->cqs_used; i++) {
+		cq_desc[i].device   = device;
+		cq_desc[i].cq_index = i;
 
-	device->tx_cq = ib_create_cq(device->ib_device,
-				  NULL, iser_cq_event_callback,
-				  (void *)device,
-				  ISER_MAX_TX_CQ_LEN, 0);
+		device->rx_cq[i] = ib_create_cq(device->ib_device,
+					  iser_cq_callback,
+					  iser_cq_event_callback,
+					  (void *)&cq_desc[i],
+					  ISER_MAX_RX_CQ_LEN, i);
+		if (IS_ERR(device->rx_cq[i]))
+			goto cq_err;
 
-	if (IS_ERR(device->tx_cq))
-		goto tx_cq_err;
+		device->tx_cq[i] = ib_create_cq(device->ib_device,
+					  NULL, iser_cq_event_callback,
+					  (void *)&cq_desc[i],
+					  ISER_MAX_TX_CQ_LEN, i);
 
-	if (ib_req_notify_cq(device->rx_cq, IB_CQ_NEXT_COMP))
-		goto cq_arm_err;
+		if (IS_ERR(device->tx_cq[i]))
+			goto cq_err;
 
-	tasklet_init(&device->cq_tasklet,
-		     iser_cq_tasklet_fn,
-		     (unsigned long)device);
+		if (ib_req_notify_cq(device->rx_cq[i], IB_CQ_NEXT_COMP))
+			goto cq_err;
+
+		tasklet_init(&device->cq_tasklet[i],
+			     iser_cq_tasklet_fn,
+			(unsigned long)&cq_desc[i]);
+	}
 
 	device->mr = ib_get_dma_mr(device->pd, IB_ACCESS_LOCAL_WRITE |
 				   IB_ACCESS_REMOTE_WRITE |
@@ -113,14 +131,19 @@
 handler_err:
 	ib_dereg_mr(device->mr);
 dma_mr_err:
-	tasklet_kill(&device->cq_tasklet);
-cq_arm_err:
-	ib_destroy_cq(device->tx_cq);
-tx_cq_err:
-	ib_destroy_cq(device->rx_cq);
-rx_cq_err:
+	for (j = 0; j < device->cqs_used; j++)
+		tasklet_kill(&device->cq_tasklet[j]);
+cq_err:
+	for (j = 0; j < i; j++) {
+		if (device->tx_cq[j])
+			ib_destroy_cq(device->tx_cq[j]);
+		if (device->rx_cq[j])
+			ib_destroy_cq(device->rx_cq[j]);
+	}
 	ib_dealloc_pd(device->pd);
 pd_err:
+	kfree(device->cq_desc);
+cq_desc_err:
 	iser_err("failed to allocate an IB resource\n");
 	return -1;
 }
@@ -131,18 +154,24 @@
  */
 static void iser_free_device_ib_res(struct iser_device *device)
 {
+	int i;
 	BUG_ON(device->mr == NULL);
 
-	tasklet_kill(&device->cq_tasklet);
+	for (i = 0; i < device->cqs_used; i++) {
+		tasklet_kill(&device->cq_tasklet[i]);
+		(void)ib_destroy_cq(device->tx_cq[i]);
+		(void)ib_destroy_cq(device->rx_cq[i]);
+		device->tx_cq[i] = NULL;
+		device->rx_cq[i] = NULL;
+	}
+
 	(void)ib_unregister_event_handler(&device->event_handler);
 	(void)ib_dereg_mr(device->mr);
-	(void)ib_destroy_cq(device->tx_cq);
-	(void)ib_destroy_cq(device->rx_cq);
 	(void)ib_dealloc_pd(device->pd);
 
+	kfree(device->cq_desc);
+
 	device->mr = NULL;
-	device->tx_cq = NULL;
-	device->rx_cq = NULL;
 	device->pd = NULL;
 }
 
@@ -157,6 +186,7 @@
 	struct ib_qp_init_attr	init_attr;
 	int			req_err, resp_err, ret = -ENOMEM;
 	struct ib_fmr_pool_param params;
+	int index, min_index = 0;
 
 	BUG_ON(ib_conn->device == NULL);
 
@@ -220,10 +250,20 @@
 
 	memset(&init_attr, 0, sizeof init_attr);
 
+	mutex_lock(&ig.connlist_mutex);
+	/* select the CQ with the minimal number of usages */
+	for (index = 0; index < device->cqs_used; index++)
+		if (device->cq_active_qps[index] <
+		    device->cq_active_qps[min_index])
+			min_index = index;
+	device->cq_active_qps[min_index]++;
+	mutex_unlock(&ig.connlist_mutex);
+	iser_err("cq index %d used for ib_conn %p\n", min_index, ib_conn);
+
 	init_attr.event_handler = iser_qp_event_callback;
 	init_attr.qp_context	= (void *)ib_conn;
-	init_attr.send_cq	= device->tx_cq;
-	init_attr.recv_cq	= device->rx_cq;
+	init_attr.send_cq	= device->tx_cq[min_index];
+	init_attr.recv_cq	= device->rx_cq[min_index];
 	init_attr.cap.max_send_wr  = ISER_QP_MAX_REQ_DTOS;
 	init_attr.cap.max_recv_wr  = ISER_QP_MAX_RECV_DTOS;
 	init_attr.cap.max_send_sge = 2;
@@ -252,6 +292,7 @@
  */
 static int iser_free_ib_conn_res(struct iser_conn *ib_conn, int can_destroy_id)
 {
+	int cq_index;
 	BUG_ON(ib_conn == NULL);
 
 	iser_err("freeing conn %p cma_id %p fmr pool %p qp %p\n",
@@ -262,9 +303,12 @@
 	if (ib_conn->fmr_pool != NULL)
 		ib_destroy_fmr_pool(ib_conn->fmr_pool);
 
-	if (ib_conn->qp != NULL)
-		rdma_destroy_qp(ib_conn->cma_id);
+	if (ib_conn->qp != NULL) {
+		cq_index = ((struct iser_cq_desc *)ib_conn->qp->recv_cq->cq_context)->cq_index;
+		ib_conn->device->cq_active_qps[cq_index]--;
 
+		rdma_destroy_qp(ib_conn->cma_id);
+	}
 	/* if cma handler context, the caller acts s.t the cma destroy the id */
 	if (ib_conn->cma_id != NULL && can_destroy_id)
 		rdma_destroy_id(ib_conn->cma_id);
@@ -791,9 +835,9 @@
 	}
 }
 
-static int iser_drain_tx_cq(struct iser_device  *device)
+static int iser_drain_tx_cq(struct iser_device  *device, int cq_index)
 {
-	struct ib_cq  *cq = device->tx_cq;
+	struct ib_cq  *cq = device->tx_cq[cq_index];
 	struct ib_wc  wc;
 	struct iser_tx_desc *tx_desc;
 	struct iser_conn *ib_conn;
@@ -822,8 +866,10 @@
 
 static void iser_cq_tasklet_fn(unsigned long data)
 {
-	 struct iser_device  *device = (struct iser_device *)data;
-	 struct ib_cq	     *cq = device->rx_cq;
+	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)data;
+	struct iser_device  *device = cq_desc->device;
+	int cq_index = cq_desc->cq_index;
+	struct ib_cq	     *cq = device->rx_cq[cq_index];
 	 struct ib_wc	     wc;
 	 struct iser_rx_desc *desc;
 	 unsigned long	     xfer_len;
@@ -851,19 +897,21 @@
 		}
 		completed_rx++;
 		if (!(completed_rx & 63))
-			completed_tx += iser_drain_tx_cq(device);
+			completed_tx += iser_drain_tx_cq(device, cq_index);
 	}
 	/* #warning "it is assumed here that arming CQ only once its empty" *
 	 * " would not cause interrupts to be missed"                       */
 	ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
 
-	completed_tx += iser_drain_tx_cq(device);
+	completed_tx += iser_drain_tx_cq(device, cq_index);
 	iser_dbg("got %d rx %d tx completions\n", completed_rx, completed_tx);
 }
 
 static void iser_cq_callback(struct ib_cq *cq, void *cq_context)
 {
-	struct iser_device  *device = (struct iser_device *)cq_context;
+	struct iser_cq_desc *cq_desc = (struct iser_cq_desc *)cq_context;
+	struct iser_device  *device = cq_desc->device;
+	int cq_index = cq_desc->cq_index;
 
-	tasklet_schedule(&device->cq_tasklet);
+	tasklet_schedule(&device->cq_tasklet[cq_index]);
 }