IB/mlx4: Add support for memory management extensions and local DMA L_Key

Add support for the following operations to mlx4 when device firmware
supports them:

 - Send with invalidate and local invalidate send queue work requests;
 - Allocate/free fast register MRs;
 - Allocate/free fast register MR page lists;
 - Fast register MR send queue work requests;
 - Local DMA L_Key.

Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/net/mlx4/fw.c b/drivers/net/mlx4/fw.c
index 0851ebd..5727822 100644
--- a/drivers/net/mlx4/fw.c
+++ b/drivers/net/mlx4/fw.c
@@ -202,7 +202,7 @@
 #define QUERY_DEV_CAP_C_MPT_ENTRY_SZ_OFFSET	0x8e
 #define QUERY_DEV_CAP_MTT_ENTRY_SZ_OFFSET	0x90
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET	0x92
-#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x97
+#define QUERY_DEV_CAP_BMME_FLAGS_OFFSET		0x94
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET		0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET		0xa0
 
@@ -377,12 +377,8 @@
 		}
 	}
 
-	if (dev_cap->bmme_flags & 1)
-		mlx4_dbg(dev, "Base MM extensions: yes "
-			 "(flags %d, rsvd L_Key %08x)\n",
-			 dev_cap->bmme_flags, dev_cap->reserved_lkey);
-	else
-		mlx4_dbg(dev, "Base MM extensions: no\n");
+	mlx4_dbg(dev, "Base MM extensions: flags %08x, rsvd L_Key %08x\n",
+		 dev_cap->bmme_flags, dev_cap->reserved_lkey);
 
 	/*
 	 * Each UAR has 4 EQ doorbells; so if a UAR is reserved, then
diff --git a/drivers/net/mlx4/fw.h b/drivers/net/mlx4/fw.h
index a0e046c..fbf0e22 100644
--- a/drivers/net/mlx4/fw.h
+++ b/drivers/net/mlx4/fw.h
@@ -98,7 +98,7 @@
 	int cmpt_entry_sz;
 	int mtt_entry_sz;
 	int resize_srq;
-	u8  bmme_flags;
+	u32 bmme_flags;
 	u32 reserved_lkey;
 	u64 max_icm_sz;
 	int max_gso_sz;
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index d373601..8e1d24c 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -158,6 +158,8 @@
 	dev->caps.max_msg_sz         = dev_cap->max_msg_sz;
 	dev->caps.page_size_cap	     = ~(u32) (dev_cap->min_page_sz - 1);
 	dev->caps.flags		     = dev_cap->flags;
+	dev->caps.bmme_flags	     = dev_cap->bmme_flags;
+	dev->caps.reserved_lkey	     = dev_cap->reserved_lkey;
 	dev->caps.stat_rate_support  = dev_cap->stat_rate_support;
 	dev->caps.max_gso_sz	     = dev_cap->max_gso_sz;
 
diff --git a/drivers/net/mlx4/mr.c b/drivers/net/mlx4/mr.c
index b3ea93b..a3c04c5 100644
--- a/drivers/net/mlx4/mr.c
+++ b/drivers/net/mlx4/mr.c
@@ -47,7 +47,7 @@
 	__be32 flags;
 	__be32 qpn;
 	__be32 key;
-	__be32 pd;
+	__be32 pd_flags;
 	__be64 start;
 	__be64 length;
 	__be32 lkey;
@@ -61,11 +61,15 @@
 } __attribute__((packed));
 
 #define MLX4_MPT_FLAG_SW_OWNS	    (0xfUL << 28)
+#define MLX4_MPT_FLAG_FREE	    (0x3UL << 28)
 #define MLX4_MPT_FLAG_MIO	    (1 << 17)
 #define MLX4_MPT_FLAG_BIND_ENABLE   (1 << 15)
 #define MLX4_MPT_FLAG_PHYSICAL	    (1 <<  9)
 #define MLX4_MPT_FLAG_REGION	    (1 <<  8)
 
+#define MLX4_MPT_PD_FLAG_FAST_REG   (1 << 26)
+#define MLX4_MPT_PD_FLAG_EN_INV	    (3 << 24)
+
 #define MLX4_MTT_FLAG_PRESENT		1
 
 #define MLX4_MPT_STATUS_SW		0xF0
@@ -324,21 +328,30 @@
 
 	memset(mpt_entry, 0, sizeof *mpt_entry);
 
-	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS	 |
-				       MLX4_MPT_FLAG_MIO	 |
+	mpt_entry->flags = cpu_to_be32(MLX4_MPT_FLAG_MIO	 |
 				       MLX4_MPT_FLAG_REGION	 |
 				       mr->access);
 
 	mpt_entry->key	       = cpu_to_be32(key_to_hw_index(mr->key));
-	mpt_entry->pd	       = cpu_to_be32(mr->pd);
+	mpt_entry->pd_flags    = cpu_to_be32(mr->pd | MLX4_MPT_PD_FLAG_EN_INV);
 	mpt_entry->start       = cpu_to_be64(mr->iova);
 	mpt_entry->length      = cpu_to_be64(mr->size);
 	mpt_entry->entity_size = cpu_to_be32(mr->mtt.page_shift);
+
 	if (mr->mtt.order < 0) {
 		mpt_entry->flags |= cpu_to_be32(MLX4_MPT_FLAG_PHYSICAL);
 		mpt_entry->mtt_seg = 0;
-	} else
+	} else {
 		mpt_entry->mtt_seg = cpu_to_be64(mlx4_mtt_addr(dev, &mr->mtt));
+	}
+
+	if (mr->mtt.order >= 0 && mr->mtt.page_shift == 0) {
+		/* fast register MR in free state */
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_FREE);
+		mpt_entry->pd_flags |= cpu_to_be32(MLX4_MPT_PD_FLAG_FAST_REG);
+	} else {
+		mpt_entry->flags    |= cpu_to_be32(MLX4_MPT_FLAG_SW_OWNS);
+	}
 
 	err = mlx4_SW2HW_MPT(dev, mailbox,
 			     key_to_hw_index(mr->key) & (dev->caps.num_mpts - 1));