IB/mlx4: Fix memory ordering problem when posting LSO sends

The current work request posting code writes the LSO segment before
writing any data segments.  This leaves a window where the LSO segment
overwrites the stamping in one cacheline that the HCA prefetches
before the rest of the cacheline is filled with the correct data
segments.  When the HCA processes this work request, a local
protection error may result.

Fix this by saving the LSO header size field off and writing it only
after all data segments are written.  This fix is a cleaned-up version
of a patch from Jack Morgenstein <jackm@dev.mellanox.co.il>.

This fixes <https://bugs.openfabrics.org/show_bug.cgi?id=1383>.

Reported-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 39167a7..a91cb4c 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1462,7 +1462,8 @@
 }
 
 static int build_lso_seg(struct mlx4_wqe_lso_seg *wqe, struct ib_send_wr *wr,
-			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len)
+			 struct mlx4_ib_qp *qp, unsigned *lso_seg_len,
+			 __be32 *lso_hdr_sz)
 {
 	unsigned halign = ALIGN(sizeof *wqe + wr->wr.ud.hlen, 16);
 
@@ -1479,12 +1480,8 @@
 
 	memcpy(wqe->header, wr->wr.ud.header, wr->wr.ud.hlen);
 
-	/* make sure LSO header is written before overwriting stamping */
-	wmb();
-
-	wqe->mss_hdr_size = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
-					wr->wr.ud.hlen);
-
+	*lso_hdr_sz  = cpu_to_be32((wr->wr.ud.mss - wr->wr.ud.hlen) << 16 |
+				   wr->wr.ud.hlen);
 	*lso_seg_len = halign;
 	return 0;
 }
@@ -1518,6 +1515,9 @@
 	int uninitialized_var(stamp);
 	int uninitialized_var(size);
 	unsigned uninitialized_var(seglen);
+	__be32 dummy;
+	__be32 *lso_wqe;
+	__be32 uninitialized_var(lso_hdr_sz);
 	int i;
 
 	spin_lock_irqsave(&qp->sq.lock, flags);
@@ -1525,6 +1525,8 @@
 	ind = qp->sq_next_wqe;
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
+		lso_wqe = &dummy;
+
 		if (mlx4_wq_overflow(&qp->sq, nreq, qp->ibqp.send_cq)) {
 			err = -ENOMEM;
 			*bad_wr = wr;
@@ -1606,11 +1608,12 @@
 			size += sizeof (struct mlx4_wqe_datagram_seg) / 16;
 
 			if (wr->opcode == IB_WR_LSO) {
-				err = build_lso_seg(wqe, wr, qp, &seglen);
+				err = build_lso_seg(wqe, wr, qp, &seglen, &lso_hdr_sz);
 				if (unlikely(err)) {
 					*bad_wr = wr;
 					goto out;
 				}
+				lso_wqe = (__be32 *) wqe;
 				wqe  += seglen;
 				size += seglen / 16;
 			}
@@ -1652,6 +1655,14 @@
 		for (i = wr->num_sge - 1; i >= 0; --i, --dseg)
 			set_data_seg(dseg, wr->sg_list + i);
 
+		/*
+		 * Possibly overwrite stamping in cacheline with LSO
+		 * segment only after making sure all data segments
+		 * are written.
+		 */
+		wmb();
+		*lso_wqe = lso_hdr_sz;
+
 		ctrl->fence_size = (wr->send_flags & IB_SEND_FENCE ?
 				    MLX4_WQE_CTRL_FENCE : 0) | size;
 
@@ -1686,7 +1697,6 @@
 			stamp_send_wqe(qp, stamp, size * 16);
 			ind = pad_wraparound(qp, ind);
 		}
-
 	}
 
 out: