IB: split struct ib_send_wr

This patch split up struct ib_send_wr so that all non-trivial verbs
use their own structure which embedds struct ib_send_wr.  This dramaticly
shrinks the size of a WR for most common operations:

sizeof(struct ib_send_wr) (old):	96

sizeof(struct ib_send_wr):		48
sizeof(struct ib_rdma_wr):		64
sizeof(struct ib_atomic_wr):		96
sizeof(struct ib_ud_wr):		88
sizeof(struct ib_fast_reg_wr):		88
sizeof(struct ib_bind_mw_wr):		96
sizeof(struct ib_sig_handover_wr):	80

And with Sagi's pending MR rework the fast registration WR will also be
down to a reasonable size:

sizeof(struct ib_fastreg_wr):		64

Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Bart Van Assche <bart.vanassche@sandisk.com> [srp, srpt]
Reviewed-by: Chuck Lever <chuck.lever@oracle.com> [sunrpc]
Tested-by: Haggai Eran <haggaie@mellanox.com>
Tested-by: Sagi Grimberg <sagig@mellanox.com>
Tested-by: Steve Wise <swise@opengridcomputing.com>
diff --git a/drivers/infiniband/hw/qib/qib_ud.c b/drivers/infiniband/hw/qib/qib_ud.c
index 26243b7..59193f6 100644
--- a/drivers/infiniband/hw/qib/qib_ud.c
+++ b/drivers/infiniband/hw/qib/qib_ud.c
@@ -59,7 +59,7 @@
 	u32 length;
 	enum ib_qp_type sqptype, dqptype;
 
-	qp = qib_lookup_qpn(ibp, swqe->wr.wr.ud.remote_qpn);
+	qp = qib_lookup_qpn(ibp, swqe->ud_wr.remote_qpn);
 	if (!qp) {
 		ibp->n_pkt_drops++;
 		return;
@@ -76,7 +76,7 @@
 		goto drop;
 	}
 
-	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
+	ah_attr = &to_iah(swqe->ud_wr.ah)->attr;
 	ppd = ppd_from_ibp(ibp);
 
 	if (qp->ibqp.qp_num > 1) {
@@ -106,8 +106,8 @@
 	if (qp->ibqp.qp_num) {
 		u32 qkey;
 
-		qkey = (int)swqe->wr.wr.ud.remote_qkey < 0 ?
-			sqp->qkey : swqe->wr.wr.ud.remote_qkey;
+		qkey = (int)swqe->ud_wr.remote_qkey < 0 ?
+			sqp->qkey : swqe->ud_wr.remote_qkey;
 		if (unlikely(qkey != qp->qkey)) {
 			u16 lid;
 
@@ -210,7 +210,7 @@
 	wc.qp = &qp->ibqp;
 	wc.src_qp = sqp->ibqp.qp_num;
 	wc.pkey_index = qp->ibqp.qp_type == IB_QPT_GSI ?
-		swqe->wr.wr.ud.pkey_index : 0;
+		swqe->ud_wr.pkey_index : 0;
 	wc.slid = ppd->lid | (ah_attr->src_path_bits & ((1 << ppd->lmc) - 1));
 	wc.sl = ah_attr->sl;
 	wc.dlid_path_bits = ah_attr->dlid & ((1 << ppd->lmc) - 1);
@@ -277,7 +277,7 @@
 	/* Construct the header. */
 	ibp = to_iport(qp->ibqp.device, qp->port_num);
 	ppd = ppd_from_ibp(ibp);
-	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
+	ah_attr = &to_iah(wqe->ud_wr.ah)->attr;
 	if (ah_attr->dlid >= QIB_MULTICAST_LID_BASE) {
 		if (ah_attr->dlid != QIB_PERMISSIVE_LID)
 			this_cpu_inc(ibp->pmastats->n_multicast_xmit);
@@ -363,7 +363,7 @@
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? QIB_DEFAULT_P_KEY :
 		qib_get_pkey(ibp, qp->ibqp.qp_type == IB_QPT_GSI ?
-			     wqe->wr.wr.ud.pkey_index : qp->s_pkey_index);
+			     wqe->ud_wr.pkey_index : qp->s_pkey_index);
 	ohdr->bth[0] = cpu_to_be32(bth0);
 	/*
 	 * Use the multicast QP if the destination LID is a multicast LID.
@@ -371,14 +371,14 @@
 	ohdr->bth[1] = ah_attr->dlid >= QIB_MULTICAST_LID_BASE &&
 		ah_attr->dlid != QIB_PERMISSIVE_LID ?
 		cpu_to_be32(QIB_MULTICAST_QPN) :
-		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
+		cpu_to_be32(wqe->ud_wr.remote_qpn);
 	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & QIB_PSN_MASK);
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
-					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->ud_wr.remote_qkey < 0 ?
+					 qp->qkey : wqe->ud_wr.remote_qkey);
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
 
 done: