IB/ipath: Fix many locking issues when switching to error state

The send DMA hardware queue voided a number of prior assumptions about
when a send is complete, which led to completions being generated out of
order.  There were also a number of locking issues when switching the QP
to the error or reset states; this patch fixes those and also implements
the IB_QPS_SQD state.

Signed-off-by: Ralph Campbell <ralph.campbell@qlogic.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
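
For readers unfamiliar with the driver, the stand-alone user-space sketch
below (illustrative only; qp_model, S_BUSY, S_ANY_WAIT, send_ok and the
function names are all invented for this example and are not driver code)
models the locking rule the patch adopts: the busy/wait flags are tested
and changed only while the send lock is held, so a QP cannot be dispatched
twice and stops being dispatched once it leaves a send-capable state.

    /* busy_flag_sketch.c - illustrative model, not part of the patch. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>

    #define S_BUSY      0x1u
    #define S_ANY_WAIT  0x2u

    struct qp_model {
            pthread_mutex_t s_lock;
            unsigned int s_flags;
            bool send_ok;        /* stands in for a send-capable QP state */
    };

    /* Claim the send engine for this QP; flags are checked under the lock. */
    static bool try_start_send(struct qp_model *qp)
    {
            bool claimed = false;

            pthread_mutex_lock(&qp->s_lock);
            if (!(qp->s_flags & (S_BUSY | S_ANY_WAIT)) && qp->send_ok) {
                    qp->s_flags |= S_BUSY;
                    claimed = true;
            }
            pthread_mutex_unlock(&qp->s_lock);
            return claimed;
    }

    /* Release the engine; the busy flag is cleared under the same lock. */
    static void finish_send(struct qp_model *qp)
    {
            pthread_mutex_lock(&qp->s_lock);
            qp->s_flags &= ~S_BUSY;
            pthread_mutex_unlock(&qp->s_lock);
    }

    int main(void)
    {
            struct qp_model qp = {
                    .s_lock = PTHREAD_MUTEX_INITIALIZER,
                    .s_flags = 0,
                    .send_ok = true,
            };

            if (try_start_send(&qp)) {
                    /* A second dispatch while busy is refused. */
                    printf("second start refused: %d\n", !try_start_send(&qp));
                    finish_send(&qp);
            }
            return 0;
    }

In the driver itself the same rule is applied with spin_lock_irqsave() on
qp->s_lock and the IPATH_S_BUSY / IPATH_S_ANY_WAIT flags, as seen in the
ipath_do_send() and ipath_ruc_loopback() changes below.
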
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index c716a03..a4b5521 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -78,6 +78,7 @@
  * ipath_insert_rnr_queue - put QP on the RNR timeout list for the device
  * @qp: the QP
  *
+ * Called with the QP s_lock held and interrupts disabled.
  * XXX Use a simple list for now.  We might need a priority
  * queue if we have lots of QPs waiting for RNR timeouts
  * but that should be rare.
@@ -85,9 +86,9 @@
 void ipath_insert_rnr_queue(struct ipath_qp *qp)
 {
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	unsigned long flags;
 
-	spin_lock_irqsave(&dev->pending_lock, flags);
+	/* We already did a spin_lock_irqsave(), so just use spin_lock */
+	spin_lock(&dev->pending_lock);
 	if (list_empty(&dev->rnrwait))
 		list_add(&qp->timerwait, &dev->rnrwait);
 	else {
@@ -109,7 +110,7 @@
 			nqp->s_rnr_timeout -= qp->s_rnr_timeout;
 		list_add(&qp->timerwait, l);
 	}
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
+	spin_unlock(&dev->pending_lock);
 }
 
 /**
@@ -185,6 +186,11 @@
 	}
 
 	spin_lock_irqsave(&rq->lock, flags);
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		ret = 0;
+		goto unlock;
+	}
+
 	wq = rq->wq;
 	tail = wq->tail;
 	/* Validate tail before using it since it is user writable. */
@@ -192,9 +198,8 @@
 		tail = 0;
 	do {
 		if (unlikely(tail == wq->head)) {
-			spin_unlock_irqrestore(&rq->lock, flags);
 			ret = 0;
-			goto bail;
+			goto unlock;
 		}
 		/* Make sure entry is read after head index is read. */
 		smp_rmb();
@@ -207,7 +212,7 @@
 	wq->tail = tail;
 
 	ret = 1;
-	qp->r_wrid_valid = 1;
+	set_bit(IPATH_R_WRID_VALID, &qp->r_aflags);
 	if (handler) {
 		u32 n;
 
@@ -234,8 +239,8 @@
 			goto bail;
 		}
 	}
+unlock:
 	spin_unlock_irqrestore(&rq->lock, flags);
-
 bail:
 	return ret;
 }
@@ -263,35 +268,59 @@
 	atomic64_t *maddr;
 	enum ib_wc_status send_status;
 
+	/*
+	 * Note that we check the responder QP state after
+	 * checking the requester's state.
+	 */
 	qp = ipath_lookup_qpn(&dev->qp_table, sqp->remote_qpn);
-	if (!qp) {
-		dev->n_pkt_drops++;
-		return;
-	}
 
-again:
 	spin_lock_irqsave(&sqp->s_lock, flags);
 
-	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_SEND_OK) ||
-	    sqp->s_rnr_timeout) {
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		goto done;
-	}
+	/* Return if we are already busy processing a work request. */
+	if ((sqp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		goto unlock;
 
-	/* Get the next send request. */
-	if (sqp->s_last == sqp->s_head) {
-		/* Send work queue is empty. */
-		spin_unlock_irqrestore(&sqp->s_lock, flags);
-		goto done;
+	sqp->s_flags |= IPATH_S_BUSY;
+
+again:
+	if (sqp->s_last == sqp->s_head)
+		goto clr_busy;
+	wqe = get_swqe_ptr(sqp, sqp->s_last);
+
+	/* Return if it is not OK to start a new work request. */
+	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_NEXT_SEND_OK)) {
+		if (!(ib_ipath_state_ops[sqp->state] & IPATH_FLUSH_SEND))
+			goto clr_busy;
+		/* We are in the error state, flush the work request. */
+		send_status = IB_WC_WR_FLUSH_ERR;
+		goto flush_send;
 	}
 
 	/*
 	 * We can rely on the entry not changing without the s_lock
 	 * being held until we update s_last.
+	 * We increment s_cur to indicate s_last is in progress.
 	 */
-	wqe = get_swqe_ptr(sqp, sqp->s_last);
+	if (sqp->s_last == sqp->s_cur) {
+		if (++sqp->s_cur >= sqp->s_size)
+			sqp->s_cur = 0;
+	}
 	spin_unlock_irqrestore(&sqp->s_lock, flags);
 
+	if (!qp || !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_RECV_OK)) {
+		dev->n_pkt_drops++;
+		/*
+		 * For RC, the requester would time out and retry, so
+		 * shortcut the timeouts and just signal too many retries.
+		 */
+		if (sqp->ibqp.qp_type == IB_QPT_RC)
+			send_status = IB_WC_RETRY_EXC_ERR;
+		else
+			send_status = IB_WC_SUCCESS;
+		goto serr;
+	}
+
 	memset(&wc, 0, sizeof wc);
 	send_status = IB_WC_SUCCESS;
 
@@ -396,8 +425,7 @@
 		sqp->s_len -= len;
 	}
 
-	if (wqe->wr.opcode == IB_WR_RDMA_WRITE ||
-	    wqe->wr.opcode == IB_WR_RDMA_READ)
+	if (!test_and_clear_bit(IPATH_R_WRID_VALID, &qp->r_aflags))
 		goto send_comp;
 
 	if (wqe->wr.opcode == IB_WR_RDMA_WRITE_WITH_IMM)
@@ -417,6 +445,8 @@
 		       wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
+	spin_lock_irqsave(&sqp->s_lock, flags);
+flush_send:
 	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
 	ipath_send_complete(sqp, wqe, send_status);
 	goto again;
@@ -437,11 +467,12 @@
 		sqp->s_rnr_retry--;
 	spin_lock_irqsave(&sqp->s_lock, flags);
 	if (!(ib_ipath_state_ops[sqp->state] & IPATH_PROCESS_RECV_OK))
-		goto unlock;
+		goto clr_busy;
+	sqp->s_flags |= IPATH_S_WAITING;
 	dev->n_rnr_naks++;
 	sqp->s_rnr_timeout = ib_ipath_rnr_table[qp->r_min_rnr_timer];
 	ipath_insert_rnr_queue(sqp);
-	goto unlock;
+	goto clr_busy;
 
 inv_err:
 	send_status = IB_WC_REM_INV_REQ_ERR;
@@ -473,17 +504,19 @@
 		}
 		goto done;
 	}
+clr_busy:
+	sqp->s_flags &= ~IPATH_S_BUSY;
 unlock:
 	spin_unlock_irqrestore(&sqp->s_lock, flags);
 done:
-	if (atomic_dec_and_test(&qp->refcount))
+	if (qp && atomic_dec_and_test(&qp->refcount))
 		wake_up(&qp->wait);
 }
 
 static void want_buffer(struct ipath_devdata *dd, struct ipath_qp *qp)
 {
 	if (!(dd->ipath_flags & IPATH_HAS_SEND_DMA) ||
-		qp->ibqp.qp_type == IB_QPT_SMI) {
+	    qp->ibqp.qp_type == IB_QPT_SMI) {
 		unsigned long flags;
 
 		spin_lock_irqsave(&dd->ipath_sendctrl_lock, flags);
@@ -501,26 +534,36 @@
  * @dev: the device we ran out of buffers on
  *
  * Called when we run out of PIO buffers.
+ * If we are now in the error state, return zero to flush the
+ * send work request.
  */
-static void ipath_no_bufs_available(struct ipath_qp *qp,
+static int ipath_no_bufs_available(struct ipath_qp *qp,
 				    struct ipath_ibdev *dev)
 {
 	unsigned long flags;
+	int ret = 1;
 
 	/*
 	 * Note that as soon as want_buffer() is called and
 	 * possibly before it returns, ipath_ib_piobufavail()
-	 * could be called.  If we are still in the tasklet function,
-	 * tasklet_hi_schedule() will not call us until the next time
-	 * tasklet_hi_schedule() is called.
-	 * We leave the busy flag set so that another post send doesn't
-	 * try to put the same QP on the piowait list again.
+	 * could be called. Therefore, put the QP on the piowait list before
+	 * enabling the PIO avail interrupt.
 	 */
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	list_add_tail(&qp->piowait, &dev->piowait);
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
-	want_buffer(dev->dd, qp);
-	dev->n_piowait++;
+	spin_lock_irqsave(&qp->s_lock, flags);
+	if (ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) {
+		dev->n_piowait++;
+		qp->s_flags |= IPATH_S_WAITING;
+		qp->s_flags &= ~IPATH_S_BUSY;
+		spin_lock(&dev->pending_lock);
+		if (list_empty(&qp->piowait))
+			list_add_tail(&qp->piowait, &dev->piowait);
+		spin_unlock(&dev->pending_lock);
+	} else
+		ret = 0;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (ret)
+		want_buffer(dev->dd, qp);
+	return ret;
 }
 
 /**
@@ -596,15 +639,13 @@
 	struct ipath_qp *qp = (struct ipath_qp *)data;
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 	int (*make_req)(struct ipath_qp *qp);
-
-	if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
-		goto bail;
+	unsigned long flags;
 
 	if ((qp->ibqp.qp_type == IB_QPT_RC ||
 	     qp->ibqp.qp_type == IB_QPT_UC) &&
 	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
 		ipath_ruc_loopback(qp);
-		goto clear;
+		goto bail;
 	}
 
 	if (qp->ibqp.qp_type == IB_QPT_RC)
@@ -614,6 +655,19 @@
 	else
 	       make_req = ipath_make_ud_req;
 
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Return if we are already busy processing a work request. */
+	if ((qp->s_flags & (IPATH_S_BUSY | IPATH_S_ANY_WAIT)) ||
+	    !(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND)) {
+		spin_unlock_irqrestore(&qp->s_lock, flags);
+		goto bail;
+	}
+
+	qp->s_flags |= IPATH_S_BUSY;
+
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+
 again:
 	/* Check for a constructed packet to be sent. */
 	if (qp->s_hdrwords != 0) {
@@ -623,8 +677,8 @@
 		 */
 		if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
 				     qp->s_cur_sge, qp->s_cur_size)) {
-			ipath_no_bufs_available(qp, dev);
-			goto bail;
+			if (ipath_no_bufs_available(qp, dev))
+				goto bail;
 		}
 		dev->n_unicast_xmit++;
 		/* Record that we sent the packet and s_hdr is empty. */
@@ -633,16 +687,20 @@
 
 	if (make_req(qp))
 		goto again;
-clear:
-	clear_bit(IPATH_S_BUSY, &qp->s_busy);
+
 bail:;
 }
 
+/*
+ * This should be called with s_lock held.
+ */
 void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
 			 enum ib_wc_status status)
 {
-	unsigned long flags;
-	u32 last;
+	u32 old_last, last;
+
+	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_OR_FLUSH_SEND))
+		return;
 
 	/* See ch. 11.2.4.1 and 10.7.3.1 */
 	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
@@ -661,10 +719,14 @@
 			       status != IB_WC_SUCCESS);
 	}
 
-	spin_lock_irqsave(&qp->s_lock, flags);
-	last = qp->s_last;
+	old_last = last = qp->s_last;
 	if (++last >= qp->s_size)
 		last = 0;
 	qp->s_last = last;
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+	if (qp->s_cur == old_last)
+		qp->s_cur = last;
+	if (qp->s_tail == old_last)
+		qp->s_tail = last;
+	if (qp->state == IB_QPS_SQD && last == qp->s_cur)
+		qp->s_draining = 0;
 }