RDMA/cxgb4: Fastreg NSMR fixes

- Remove dsgl support - doesn't work in T4.
- Wrap the immediate PBL as needed when building it in the wr.
- Adjust max pbl depth allowed based on ulptx alignment requirements.
- Bump the slots per SQ to 5 to allow up to 128MB fast registers.
- Advertise fastreg support by default.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 17ea5fc..7000442 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -66,7 +66,7 @@
 
 #define T4_EQ_ENTRY_SIZE 64
 
-#define T4_SQ_NUM_SLOTS 4
+#define T4_SQ_NUM_SLOTS 5
 #define T4_SQ_NUM_BYTES (T4_EQ_ENTRY_SIZE * T4_SQ_NUM_SLOTS)
 #define T4_MAX_SEND_SGE ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_send_wr) - \
 			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
@@ -79,7 +79,7 @@
 			sizeof(struct fw_ri_rdma_write_wr) - \
 			sizeof(struct fw_ri_isgl)) / sizeof(struct fw_ri_sge))
 #define T4_MAX_FR_IMMD ((T4_SQ_NUM_BYTES - sizeof(struct fw_ri_fr_nsmr_wr) - \
-			sizeof(struct fw_ri_immd)))
+			sizeof(struct fw_ri_immd)) & ~31UL)
 #define T4_MAX_FR_DEPTH (T4_MAX_FR_IMMD / sizeof(u64))
 
 #define T4_RQ_NUM_SLOTS 2