IB/ipath: Fix many locking issues when switching to error state

The send DMA hardware queue invalidated a number of prior assumptions
about when a send is complete, which led to completions being generated
out of order.  There were also a number of locking issues when switching
the QP to the error or reset states; this patch fixes those and also
implements the IB_QPS_SQD state.

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 22bb42d..e0ec540 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -111,16 +111,24 @@
 module_param_named(disable_sma, ib_ipath_disable_sma, uint, S_IWUSR | S_IRUGO);
 MODULE_PARM_DESC(disable_sma, "Disable the SMA");
 
+/*
+ * Note that it is OK to post send work requests in the SQE and ERR
+ * states; ipath_do_send() will process them and generate error
+ * completions as per IB 1.2 C10-96.
+ */
 const int ib_ipath_state_ops[IB_QPS_ERR + 1] = {
 	[IB_QPS_RESET] = 0,
 	[IB_QPS_INIT] = IPATH_POST_RECV_OK,
 	[IB_QPS_RTR] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
 	[IB_QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
+	    IPATH_PROCESS_NEXT_SEND_OK,
 	[IB_QPS_SQD] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
-	    IPATH_POST_SEND_OK,
-	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK,
-	[IB_QPS_ERR] = 0,
+	    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK,
+	[IB_QPS_SQE] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
+	[IB_QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
+	    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
 };
 
 struct ipath_ucontext {
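
The table above is the heart of the patch: each QP state now maps to a
bitmask describing what may be posted, what the send and receive engines
may process, and what must be flushed.  A minimal stand-alone model of
the idiom follows; the flag values and the reduced state enum are
illustrative only (the driver's real definitions live in ipath_verbs.h):

#include <stdio.h>

enum {
	IPATH_POST_RECV_OK		= 1 << 0,
	IPATH_PROCESS_RECV_OK		= 1 << 1,
	IPATH_POST_SEND_OK		= 1 << 2,
	IPATH_PROCESS_SEND_OK		= 1 << 3,
	IPATH_PROCESS_NEXT_SEND_OK	= 1 << 4,
	IPATH_FLUSH_SEND		= 1 << 5,
	IPATH_FLUSH_RECV		= 1 << 6,
};

enum qps { QPS_RESET, QPS_INIT, QPS_RTR, QPS_RTS, QPS_SQD, QPS_SQE, QPS_ERR };

/* States not listed default to 0, i.e. nothing allowed (RESET). */
static const int state_ops[QPS_ERR + 1] = {
	[QPS_RTS] = IPATH_POST_RECV_OK | IPATH_PROCESS_RECV_OK |
		    IPATH_POST_SEND_OK | IPATH_PROCESS_SEND_OK |
		    IPATH_PROCESS_NEXT_SEND_OK,
	[QPS_ERR] = IPATH_POST_RECV_OK | IPATH_FLUSH_RECV |
		    IPATH_POST_SEND_OK | IPATH_FLUSH_SEND,
};

int main(void)
{
	int ops = state_ops[QPS_ERR];

	/* Posting a send in ERR now succeeds; the flush happens later. */
	if (ops & IPATH_POST_SEND_OK)
		printf("post accepted; flush pending: %s\n",
		       ops & IPATH_FLUSH_SEND ? "yes" : "no");
	return 0;
}
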
@@ -230,18 +238,6 @@
 	}
 }
 
-static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-	struct ib_wc wc;
-
-	memset(&wc, 0, sizeof(wc));
-	wc.wr_id = wr->wr_id;
-	wc.status = IB_WC_WR_FLUSH_ERR;
-	wc.opcode = ib_ipath_wc_opcode[wr->opcode];
-	wc.qp = &qp->ibqp;
-	ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
-}
-
 /*
  * Count the number of DMA descriptors needed to send length bytes of data.
  * Don't modify the ipath_sge_state to get the count.
@@ -347,14 +343,8 @@
 	spin_lock_irqsave(&qp->s_lock, flags);
 
 	/* Check that state is OK to post send. */
-	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) {
-		if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR)
-			goto bail_inval;
-		/* C10-96 says generate a flushed completion entry. */
-		ipath_flush_wqe(qp, wr);
-		ret = 0;
-		goto bail;
-	}
+	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)))
+		goto bail_inval;
 
 	/* IB spec says that num_sge == 0 is OK. */
 	if (wr->num_sge > qp->s_max_sge)
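
With the table-driven check above, ipath_post_send() no longer
fabricates the flushed completion itself: posting in SQE/ERR succeeds
because IPATH_POST_SEND_OK is now set for those states, and the send
engine later retires each queued WQE with IB_WC_WR_FLUSH_ERR under
s_lock, which is why ipath_flush_wqe() could be deleted.  A stand-alone
sketch of that retirement, with illustrative names and a plain ring
buffer standing in for the send queue:

#include <stdio.h>

#define QSIZE 8

struct swqe { unsigned long wr_id; };

static struct swqe sq[QSIZE];
static unsigned int s_last, s_head;	/* consumer / producer indices */

int main(void)
{
	unsigned int i;

	for (i = 0; i < 3; i++)			/* post three sends */
		sq[s_head++ % QSIZE].wr_id = 100 + i;

	/* QP enters SQE/ERR: flush everything still queued (C10-96). */
	while (s_last != s_head)
		printf("wr_id %lu -> IB_WC_WR_FLUSH_ERR\n",
		       sq[s_last++ % QSIZE].wr_id);
	return 0;
}
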
@@ -677,6 +667,7 @@
 static void ipath_ib_timer(struct ipath_ibdev *dev)
 {
 	struct ipath_qp *resend = NULL;
+	struct ipath_qp *rnr = NULL;
 	struct list_head *last;
 	struct ipath_qp *qp;
 	unsigned long flags;
@@ -703,7 +694,9 @@
 		if (--qp->s_rnr_timeout == 0) {
 			do {
 				list_del_init(&qp->timerwait);
-				tasklet_hi_schedule(&qp->s_task);
+				qp->timer_next = rnr;
+				rnr = qp;
+				atomic_inc(&qp->refcount);
 				if (list_empty(last))
 					break;
 				qp = list_entry(last->next, struct ipath_qp,
@@ -743,9 +736,13 @@
 	spin_unlock_irqrestore(&dev->pending_lock, flags);
 
 	/* XXX What if timer fires again while this is running? */
-	for (qp = resend; qp != NULL; qp = qp->timer_next) {
+	while (resend != NULL) {
+		qp = resend;
+		resend = qp->timer_next;
+
 		spin_lock_irqsave(&qp->s_lock, flags);
-		if (qp->s_last != qp->s_tail && qp->state == IB_QPS_RTS) {
+		if (qp->s_last != qp->s_tail &&
+		    ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
 			dev->n_timeouts++;
 			ipath_restart_rc(qp, qp->s_last_psn + 1);
 		}
@@ -755,6 +752,19 @@
 		if (atomic_dec_and_test(&qp->refcount))
 			wake_up(&qp->wait);
 	}
+	while (rnr != NULL) {
+		qp = rnr;
+		rnr = qp->timer_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
 }
 
 static void update_sge(struct ipath_sge_state *ss, u32 length)
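
Both loops above follow the same pattern, which the patch also applies
to ipath_ib_piobufavail() further down: QPs are unhooked from the shared
list and pinned with a reference while dev->pending_lock is held, then
each one is handled under its own s_lock with the list lock already
dropped, so qp->s_lock is never taken while pending_lock is held.  A
stand-alone model of the idiom, with pthreads and C11 atomics standing
in for the kernel primitives and one next pointer doing double duty for
both lists:

#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

struct qp {
	struct qp *timer_next;		/* list link, as in the patch */
	atomic_int refcount;
	pthread_mutex_t s_lock;
	int send_ok;			/* models IPATH_PROCESS_SEND_OK */
};

static pthread_mutex_t pending_lock = PTHREAD_MUTEX_INITIALIZER;
static struct qp *pending;		/* models the shared wait list */

static void timer_pass(void)
{
	struct qp *local = NULL;
	struct qp *qp;

	pthread_mutex_lock(&pending_lock);
	while ((qp = pending) != NULL) {	/* unhook and pin... */
		pending = qp->timer_next;
		qp->timer_next = local;
		local = qp;
		atomic_fetch_add(&qp->refcount, 1);
	}
	pthread_mutex_unlock(&pending_lock);

	while ((qp = local) != NULL) {	/* ...then work without pending_lock */
		local = qp->timer_next;
		pthread_mutex_lock(&qp->s_lock);
		if (qp->send_ok)
			puts("schedule send");	/* ipath_schedule_send() */
		pthread_mutex_unlock(&qp->s_lock);
		/* Notify a waiting destroyer, as wake_up(&qp->wait) does. */
		if (atomic_fetch_sub(&qp->refcount, 1) == 1)
			puts("last reference dropped");
	}
}

int main(void)
{
	struct qp a = { .refcount = 1, .s_lock = PTHREAD_MUTEX_INITIALIZER,
			.send_ok = 1 };

	pending = &a;
	timer_pass();
	return 0;
}
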
@@ -1010,13 +1020,24 @@
 	struct ipath_verbs_txreq *tx = cookie;
 	struct ipath_qp *qp = tx->qp;
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	unsigned long flags;
+	enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
+		IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
 
-	/* Generate a completion queue entry if needed */
-	if (qp->ibqp.qp_type != IB_QPT_RC && tx->wqe) {
-		enum ib_wc_status ibs = status == IPATH_SDMA_TXREQ_S_OK ?
-			IB_WC_SUCCESS : IB_WC_WR_FLUSH_ERR;
-
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (tx->wqe)
+			ipath_send_complete(qp, tx->wqe, ibs);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	} else if (tx->wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
 		ipath_send_complete(qp, tx->wqe, ibs);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
 	}
 
 	if (tx->txreq.flags & IPATH_SDMA_TXREQ_F_FREEBUF)
@@ -1027,6 +1048,21 @@
 		wake_up(&qp->wait);
 }
 
+static void decrement_dma_busy(struct ipath_qp *qp)
+{
+	unsigned long flags;
+
+	if (atomic_dec_and_test(&qp->s_dma_busy)) {
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if ((ib_ipath_state_ops[qp->state] & IPATH_FLUSH_SEND &&
+		     qp->s_last != qp->s_head) ||
+		    (qp->s_flags & IPATH_S_WAIT_DMA))
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		wake_up(&qp->wait_dma);
+	}
+}
+
 /*
  * Compute the number of clock cycles of delay before sending the next packet.
  * The multipliers reflect the number of clocks for the fastest rate so
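
decrement_dma_busy() pairs with the atomic_inc(&qp->s_dma_busy) calls
added around every ipath_sdma_verbs_send() below: the counter is bumped
before the hardware submit and dropped exactly once per descriptor,
either by the completion callback or by the error path when the submit
never queued anything.  Whichever drop reaches zero kicks the flush (if
one is pending) and wakes wait_dma so teardown can drain in-flight DMA.
A stand-alone model under those assumptions:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int s_dma_busy;
static bool flush_pending;  /* models IPATH_FLUSH_SEND / IPATH_S_WAIT_DMA */

static void decrement_dma_busy(void)
{
	if (atomic_fetch_sub(&s_dma_busy, 1) == 1) {	/* hit zero */
		if (flush_pending)
			puts("schedule send engine"); /* ipath_schedule_send() */
		puts("wake wait_dma");	/* wake_up(&qp->wait_dma) */
	}
}

static int submit(bool hw_accepts)
{
	atomic_fetch_add(&s_dma_busy, 1);	/* before the submit... */
	if (!hw_accepts) {
		decrement_dma_busy();	/* ...undone only if never queued */
		return -1;
	}
	return 0;		/* the callback will decrement later */
}

int main(void)
{
	flush_pending = true;
	submit(false);		/* submit fails: error path drops the count */
	if (submit(true) == 0)
		decrement_dma_busy();	/* stands in for the SDMA callback */
	return 0;
}
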
@@ -1065,9 +1101,12 @@
 	if (tx) {
 		qp->s_tx = NULL;
 		/* resend previously constructed packet */
+		atomic_inc(&qp->s_dma_busy);
 		ret = ipath_sdma_verbs_send(dd, tx->ss, tx->len, tx);
-		if (ret)
+		if (ret) {
 			qp->s_tx = tx;
+			decrement_dma_busy(qp);
+		}
 		goto bail;
 	}
 
@@ -1118,12 +1157,14 @@
 		tx->txreq.sg_count = ndesc;
 		tx->map_len = (hdrwords + 2) << 2;
 		tx->txreq.map_addr = &tx->hdr;
+		atomic_inc(&qp->s_dma_busy);
 		ret = ipath_sdma_verbs_send(dd, ss, dwords, tx);
 		if (ret) {
 			/* save ss and length in dwords */
 			tx->ss = ss;
 			tx->len = dwords;
 			qp->s_tx = tx;
+			decrement_dma_busy(qp);
 		}
 		goto bail;
 	}
@@ -1144,6 +1185,7 @@
 	memcpy(piobuf, hdr, hdrwords << 2);
 	ipath_copy_from_sge(piobuf + hdrwords, ss, len);
 
+	atomic_inc(&qp->s_dma_busy);
 	ret = ipath_sdma_verbs_send(dd, NULL, 0, tx);
 	/*
 	 * If we couldn't queue the DMA request, save the info
@@ -1154,6 +1196,7 @@
 		tx->ss = NULL;
 		tx->len = 0;
 		qp->s_tx = tx;
+		decrement_dma_busy(qp);
 	}
 	dev->n_unaligned++;
 	goto bail;
@@ -1177,6 +1220,7 @@
 	unsigned flush_wc;
 	u32 control;
 	int ret;
+	unsigned long flags;
 
 	piobuf = ipath_getpiobuf(dd, plen, NULL);
 	if (unlikely(piobuf == NULL)) {
@@ -1247,8 +1291,11 @@
 	}
 	copy_io(piobuf, ss, len, flush_wc);
 done:
-	if (qp->s_wqe)
+	if (qp->s_wqe) {
+		spin_lock_irqsave(&qp->s_lock, flags);
 		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+	}
 	ret = 0;
 bail:
 	return ret;
@@ -1281,19 +1328,12 @@
 	 * can defer SDMA restart until link goes ACTIVE without
 	 * worrying about just how we got there.
 	 */
-	if (qp->ibqp.qp_type == IB_QPT_SMI)
+	if (qp->ibqp.qp_type == IB_QPT_SMI ||
+	    !(dd->ipath_flags & IPATH_HAS_SEND_DMA))
 		ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
 					   plen, dwords);
-	/* All non-VL15 packets are dropped if link is not ACTIVE */
-	else if (!(dd->ipath_flags & IPATH_LINKACTIVE)) {
-		if (qp->s_wqe)
-			ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
-		ret = 0;
-	} else if (dd->ipath_flags & IPATH_HAS_SEND_DMA)
-		ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
-					   plen, dwords);
 	else
-		ret = ipath_verbs_send_pio(qp, hdr, hdrwords, ss, len,
+		ret = ipath_verbs_send_dma(qp, hdr, hdrwords, ss, len,
 					   plen, dwords);
 
 	return ret;
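
Two things changed in the dispatch above: the "silently complete
non-VL15 sends while the link is down" branch is gone (with send DMA,
packets can simply wait in the queue until the link goes ACTIVE, as the
comment before this hunk explains), and both remaining paths now retire
WQEs only while holding s_lock, the PIO path inline and the SDMA path
later from its interrupt-level callback.  A stand-alone sketch of that
shared locking contract, with a pthread mutex standing in for
spin_lock_irqsave():

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t s_lock = PTHREAD_MUTEX_INITIALIZER;

/* Models ipath_send_complete(); in the patch the caller holds s_lock,
 * here the helper takes it itself for brevity. */
static void send_complete(const char *who)
{
	pthread_mutex_lock(&s_lock);
	printf("WQE retired by %s under s_lock\n", who);
	pthread_mutex_unlock(&s_lock);
}

static void send_pio(void)
{
	send_complete("PIO path (inline)");
}

static void sdma_callback(void)
{
	/* In the driver this runs later, at interrupt level. */
	send_complete("SDMA completion callback");
}

int main(void)
{
	send_pio();
	sdma_callback();
	return 0;
}
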
@@ -1401,27 +1441,46 @@
  * This is called from ipath_intr() at interrupt level when a PIO buffer is
  * available after ipath_verbs_send() returned an error that no buffers were
  * available.  Return 1 if we consumed all the PIO buffers and we still have
- * QPs waiting for buffers (for now, just do a tasklet_hi_schedule and
+ * QPs waiting for buffers (for now, just restart the send tasklet and
  * return zero).
  */
 int ipath_ib_piobufavail(struct ipath_ibdev *dev)
 {
+	struct list_head *list;
+	struct ipath_qp *qplist;
 	struct ipath_qp *qp;
 	unsigned long flags;
 
 	if (dev == NULL)
 		goto bail;
 
+	list = &dev->piowait;
+	qplist = NULL;
+
 	spin_lock_irqsave(&dev->pending_lock, flags);
-	while (!list_empty(&dev->piowait)) {
-		qp = list_entry(dev->piowait.next, struct ipath_qp,
-				piowait);
+	while (!list_empty(list)) {
+		qp = list_entry(list->next, struct ipath_qp, piowait);
 		list_del_init(&qp->piowait);
-		clear_bit(IPATH_S_BUSY, &qp->s_busy);
-		tasklet_hi_schedule(&qp->s_task);
+		qp->pio_next = qplist;
+		qplist = qp;
+		atomic_inc(&qp->refcount);
 	}
 	spin_unlock_irqrestore(&dev->pending_lock, flags);
 
+	while (qplist != NULL) {
+		qp = qplist;
+		qplist = qp->pio_next;
+
+		spin_lock_irqsave(&qp->s_lock, flags);
+		if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)
+			ipath_schedule_send(qp);
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+
+		/* Notify ipath_destroy_qp() if it is waiting. */
+		if (atomic_dec_and_test(&qp->refcount))
+			wake_up(&qp->wait);
+	}
+
 bail:
 	return 0;
 }
@@ -2143,11 +2202,12 @@
 void ipath_unregister_ib_device(struct ipath_ibdev *dev)
 {
 	struct ib_device *ibdev = &dev->ibdev;
-
-	disable_timer(dev->dd);
+	u32 qps_inuse;
 
 	ib_unregister_device(ibdev);
 
+	disable_timer(dev->dd);
+
 	if (!list_empty(&dev->pending[0]) ||
 	    !list_empty(&dev->pending[1]) ||
 	    !list_empty(&dev->pending[2]))
@@ -2162,7 +2222,10 @@
 	 * Note that ipath_unregister_ib_device() can be called before all
 	 * the QPs are destroyed!
 	 */
-	ipath_free_all_qps(&dev->qp_table);
+	qps_inuse = ipath_free_all_qps(&dev->qp_table);
+	if (qps_inuse)
+		ipath_dev_err(dev->dd, "QP memory leak! %u still in use\n",
+			qps_inuse);
 	kfree(dev->qp_table.table);
 	kfree(dev->lk_table.table);
 	kfree(dev->txreq_bufs);
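
ipath_free_all_qps() now returns how many QPs were still referenced when
the table was torn down, so an unbalanced refcount shows up as a loud
error instead of silent memory corruption.  A stand-alone model of that
accounting (names and table layout illustrative):

#include <stdio.h>

#define TABLE_SIZE 4

/* Nonzero means someone still holds a reference to that QP slot. */
static int refcount[TABLE_SIZE] = { 0, 2, 0, 1 };

static unsigned int free_all_qps(void)
{
	unsigned int qps_inuse = 0;
	int i;

	for (i = 0; i < TABLE_SIZE; i++) {
		if (refcount[i])
			qps_inuse++;	/* leaked: cannot be freed */
		else
			refcount[i] = -1;	/* freed */
	}
	return qps_inuse;
}

int main(void)
{
	unsigned int qps_inuse = free_all_qps();

	if (qps_inuse)
		fprintf(stderr, "QP memory leak! %u still in use\n",
			qps_inuse);
	return 0;
}
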
@@ -2213,17 +2276,14 @@
 		      "RC OTH NAKs %d\n"
 		      "RC timeouts %d\n"
 		      "RC RDMA dup %d\n"
-		      "RC stalls   %d\n"
 		      "piobuf wait %d\n"
-		      "no piobuf   %d\n"
 		      "unaligned   %d\n"
 		      "PKT drops   %d\n"
 		      "WQE errs    %d\n",
 		      dev->n_rc_resends, dev->n_rc_qacks, dev->n_rc_acks,
 		      dev->n_seq_naks, dev->n_rdma_seq, dev->n_rnr_naks,
 		      dev->n_other_naks, dev->n_timeouts,
-		      dev->n_rdma_dup_busy, dev->n_rc_stalls, dev->n_piowait,
-		      dev->n_no_piobuf, dev->n_unaligned,
+		      dev->n_rdma_dup_busy, dev->n_piowait, dev->n_unaligned,
 		      dev->n_pkt_drops, dev->n_wqe_errs);
 	for (i = 0; i < ARRAY_SIZE(dev->opstats); i++) {
 		const struct ipath_opcode_stats *si = &dev->opstats[i];