mlx4_core: Support ICM tables in coherent memory

Allow ICM tables to be allocated in coherent memory, and use coherent
memory for the dMPT table.  This allows MPT entries for MRs to be
written both via the SW2HW_MPT command and directly by the driver (for
FMR remapping) without needing to flush caches or worry about cacheline
boundaries.

Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Michael S. Tsirkin <mst@dev.mellanox.co.il>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/net/mlx4/icm.c b/drivers/net/mlx4/icm.c
index b7a4aa8..250e248 100644
--- a/drivers/net/mlx4/icm.c
+++ b/drivers/net/mlx4/icm.c
@@ -34,6 +34,7 @@
 #include <linux/init.h>
 #include <linux/errno.h>
 #include <linux/mm.h>
+#include <linux/scatterlist.h>
 
 #include <linux/mlx4/cmd.h>
 
@@ -50,19 +51,41 @@
 	MLX4_TABLE_CHUNK_SIZE	= 1 << 18
 };
 
-void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm)
+static void mlx4_free_icm_pages(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
 {
-	struct mlx4_icm_chunk *chunk, *tmp;
 	int i;
 
-	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
-		if (chunk->nsg > 0)
-			pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
-				     PCI_DMA_BIDIRECTIONAL);
+	if (chunk->nsg > 0)
+		pci_unmap_sg(dev->pdev, chunk->mem, chunk->npages,
+			     PCI_DMA_BIDIRECTIONAL);
 
-		for (i = 0; i < chunk->npages; ++i)
-			__free_pages(chunk->mem[i].page,
-				     get_order(chunk->mem[i].length));
+	for (i = 0; i < chunk->npages; ++i)
+		__free_pages(chunk->mem[i].page,
+			     get_order(chunk->mem[i].length));
+}
+
+static void mlx4_free_icm_coherent(struct mlx4_dev *dev, struct mlx4_icm_chunk *chunk)
+{
+	int i;
+
+	for (i = 0; i < chunk->npages; ++i)
+		dma_free_coherent(&dev->pdev->dev, chunk->mem[i].length,
+				  lowmem_page_address(chunk->mem[i].page),
+				  sg_dma_address(&chunk->mem[i]));
+}
+
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent)
+{
+	struct mlx4_icm_chunk *chunk, *tmp;
+
+	if (!icm)
+		return;
+
+	list_for_each_entry_safe(chunk, tmp, &icm->chunk_list, list) {
+		if (coherent)
+			mlx4_free_icm_coherent(dev, chunk);
+		else
+			mlx4_free_icm_pages(dev, chunk);
 
 		kfree(chunk);
 	}
@@ -70,16 +93,45 @@
 	kfree(icm);
 }
 
+static int mlx4_alloc_icm_pages(struct scatterlist *mem, int order, gfp_t gfp_mask)
+{
+	mem->page = alloc_pages(gfp_mask, order);
+	if (!mem->page)
+		return -ENOMEM;
+
+	mem->length = PAGE_SIZE << order;
+	mem->offset = 0;
+	return 0;
+}
+
+static int mlx4_alloc_icm_coherent(struct device *dev, struct scatterlist *mem,
+				    int order, gfp_t gfp_mask)
+{
+	void *buf = dma_alloc_coherent(dev, PAGE_SIZE << order,
+				       &sg_dma_address(mem), gfp_mask);
+	if (!buf)
+		return -ENOMEM;
+
+	sg_set_buf(mem, buf, PAGE_SIZE << order);
+	BUG_ON(mem->offset);
+	sg_dma_len(mem) = PAGE_SIZE << order;
+	return 0;
+}
+
 struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
-				gfp_t gfp_mask)
+				gfp_t gfp_mask, int coherent)
 {
 	struct mlx4_icm *icm;
 	struct mlx4_icm_chunk *chunk = NULL;
 	int cur_order;
+	int ret;
+
+	/* We use sg_set_buf for coherent allocs, which assumes low memory */
+	BUG_ON(coherent && (gfp_mask & __GFP_HIGHMEM));
 
 	icm = kmalloc(sizeof *icm, gfp_mask & ~(__GFP_HIGHMEM | __GFP_NOWARN));
 	if (!icm)
-		return icm;
+		return NULL;
 
 	icm->refcount = 0;
 	INIT_LIST_HEAD(&icm->chunk_list);
@@ -101,12 +153,20 @@
 		while (1 << cur_order > npages)
 			--cur_order;
 
-		chunk->mem[chunk->npages].page = alloc_pages(gfp_mask, cur_order);
-		if (chunk->mem[chunk->npages].page) {
-			chunk->mem[chunk->npages].length = PAGE_SIZE << cur_order;
-			chunk->mem[chunk->npages].offset = 0;
+		if (coherent)
+			ret = mlx4_alloc_icm_coherent(&dev->pdev->dev,
+						      &chunk->mem[chunk->npages],
+						      cur_order, gfp_mask);
+		else
+			ret = mlx4_alloc_icm_pages(&chunk->mem[chunk->npages],
+						   cur_order, gfp_mask);
 
-			if (++chunk->npages == MLX4_ICM_CHUNK_LEN) {
+		if (!ret) {
+			++chunk->npages;
+
+			if (coherent)
+				++chunk->nsg;
+			else if (chunk->npages == MLX4_ICM_CHUNK_LEN) {
 				chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
 							chunk->npages,
 							PCI_DMA_BIDIRECTIONAL);
@@ -125,7 +185,7 @@
 		}
 	}
 
-	if (chunk) {
+	if (!coherent && chunk) {
 		chunk->nsg = pci_map_sg(dev->pdev, chunk->mem,
 					chunk->npages,
 					PCI_DMA_BIDIRECTIONAL);
@@ -137,7 +197,7 @@
 	return icm;
 
 fail:
-	mlx4_free_icm(dev, icm);
+	mlx4_free_icm(dev, icm, coherent);
 	return NULL;
 }
 
@@ -202,7 +262,7 @@
 
 	table->icm[i] = mlx4_alloc_icm(dev, MLX4_TABLE_CHUNK_SIZE >> PAGE_SHIFT,
 				       (table->lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
-				       __GFP_NOWARN);
+				       __GFP_NOWARN, table->coherent);
 	if (!table->icm[i]) {
 		ret = -ENOMEM;
 		goto out;
@@ -210,7 +270,7 @@
 
 	if (mlx4_MAP_ICM(dev, table->icm[i], table->virt +
 			 (u64) i * MLX4_TABLE_CHUNK_SIZE)) {
-		mlx4_free_icm(dev, table->icm[i]);
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
 		table->icm[i] = NULL;
 		ret = -ENOMEM;
 		goto out;
@@ -234,7 +294,7 @@
 	if (--table->icm[i]->refcount == 0) {
 		mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
 			       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-		mlx4_free_icm(dev, table->icm[i]);
+		mlx4_free_icm(dev, table->icm[i], table->coherent);
 		table->icm[i] = NULL;
 	}
 
@@ -309,7 +369,7 @@
 
 int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 			u64 virt, int obj_size,	int nobj, int reserved,
-			int use_lowmem)
+			int use_lowmem, int use_coherent)
 {
 	int obj_per_chunk;
 	int num_icm;
@@ -327,6 +387,7 @@
 	table->num_obj  = nobj;
 	table->obj_size = obj_size;
 	table->lowmem   = use_lowmem;
+	table->coherent = use_coherent;
 	mutex_init(&table->mutex);
 
 	for (i = 0; i * MLX4_TABLE_CHUNK_SIZE < reserved * obj_size; ++i) {
@@ -336,11 +397,11 @@
 
 		table->icm[i] = mlx4_alloc_icm(dev, chunk_size >> PAGE_SHIFT,
 					       (use_lowmem ? GFP_KERNEL : GFP_HIGHUSER) |
-					       __GFP_NOWARN);
+					       __GFP_NOWARN, use_coherent);
 		if (!table->icm[i])
 			goto err;
 		if (mlx4_MAP_ICM(dev, table->icm[i], virt + i * MLX4_TABLE_CHUNK_SIZE)) {
-			mlx4_free_icm(dev, table->icm[i]);
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
 			table->icm[i] = NULL;
 			goto err;
 		}
@@ -359,7 +420,7 @@
 		if (table->icm[i]) {
 			mlx4_UNMAP_ICM(dev, virt + i * MLX4_TABLE_CHUNK_SIZE,
 				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-			mlx4_free_icm(dev, table->icm[i]);
+			mlx4_free_icm(dev, table->icm[i], use_coherent);
 		}
 
 	return -ENOMEM;
@@ -373,7 +434,7 @@
 		if (table->icm[i]) {
 			mlx4_UNMAP_ICM(dev, table->virt + i * MLX4_TABLE_CHUNK_SIZE,
 				       MLX4_TABLE_CHUNK_SIZE / MLX4_ICM_PAGE_SIZE);
-			mlx4_free_icm(dev, table->icm[i]);
+			mlx4_free_icm(dev, table->icm[i], table->coherent);
 		}
 
 	kfree(table->icm);
diff --git a/drivers/net/mlx4/icm.h b/drivers/net/mlx4/icm.h
index bea223d..a77db6d 100644
--- a/drivers/net/mlx4/icm.h
+++ b/drivers/net/mlx4/icm.h
@@ -67,8 +67,9 @@
 
 struct mlx4_dev;
 
-struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages, gfp_t gfp_mask);
-void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm);
+struct mlx4_icm *mlx4_alloc_icm(struct mlx4_dev *dev, int npages,
+				gfp_t gfp_mask, int coherent);
+void mlx4_free_icm(struct mlx4_dev *dev, struct mlx4_icm *icm, int coherent);
 
 int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
 void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
@@ -78,7 +79,7 @@
 			  int start, int end);
 int mlx4_init_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table,
 			u64 virt, int obj_size,	int nobj, int reserved,
-			int use_lowmem);
+			int use_lowmem, int use_coherent);
 void mlx4_cleanup_icm_table(struct mlx4_dev *dev, struct mlx4_icm_table *table);
 int mlx4_table_get(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
 void mlx4_table_put(struct mlx4_dev *dev, struct mlx4_icm_table *table, int obj);
diff --git a/drivers/net/mlx4/main.c b/drivers/net/mlx4/main.c
index 9e590e1..07c2847 100644
--- a/drivers/net/mlx4/main.c
+++ b/drivers/net/mlx4/main.c
@@ -168,7 +168,7 @@
 	int err;
 
 	priv->fw.fw_icm = mlx4_alloc_icm(dev, priv->fw.fw_pages,
-					 GFP_HIGHUSER | __GFP_NOWARN);
+					 GFP_HIGHUSER | __GFP_NOWARN, 0);
 	if (!priv->fw.fw_icm) {
 		mlx4_err(dev, "Couldn't allocate FW area, aborting.\n");
 		return -ENOMEM;
@@ -192,7 +192,7 @@
 	mlx4_UNMAP_FA(dev);
 
 err_free:
-	mlx4_free_icm(dev, priv->fw.fw_icm);
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
 	return err;
 }
 
@@ -207,7 +207,7 @@
 				  ((u64) (MLX4_CMPT_TYPE_QP *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz, dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0);
+				  dev->caps.reserved_qps, 0, 0);
 	if (err)
 		goto err;
 
@@ -216,7 +216,7 @@
 				  ((u64) (MLX4_CMPT_TYPE_SRQ *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz, dev->caps.num_srqs,
-				  dev->caps.reserved_srqs, 0);
+				  dev->caps.reserved_srqs, 0, 0);
 	if (err)
 		goto err_qp;
 
@@ -225,7 +225,7 @@
 				  ((u64) (MLX4_CMPT_TYPE_CQ *
 					  cmpt_entry_sz) << MLX4_CMPT_SHIFT),
 				  cmpt_entry_sz, dev->caps.num_cqs,
-				  dev->caps.reserved_cqs, 0);
+				  dev->caps.reserved_cqs, 0, 0);
 	if (err)
 		goto err_srq;
 
@@ -236,7 +236,7 @@
 				  cmpt_entry_sz,
 				  roundup_pow_of_two(MLX4_NUM_EQ +
 						     dev->caps.reserved_eqs),
-				  MLX4_NUM_EQ + dev->caps.reserved_eqs, 0);
+				  MLX4_NUM_EQ + dev->caps.reserved_eqs, 0, 0);
 	if (err)
 		goto err_cq;
 
@@ -275,7 +275,7 @@
 		 (unsigned long long) aux_pages << 2);
 
 	priv->fw.aux_icm = mlx4_alloc_icm(dev, aux_pages,
-					  GFP_HIGHUSER | __GFP_NOWARN);
+					  GFP_HIGHUSER | __GFP_NOWARN, 0);
 	if (!priv->fw.aux_icm) {
 		mlx4_err(dev, "Couldn't allocate aux memory, aborting.\n");
 		return -ENOMEM;
@@ -303,7 +303,7 @@
 				  init_hca->mtt_base,
 				  dev->caps.mtt_entry_sz,
 				  dev->caps.num_mtt_segs,
-				  dev->caps.reserved_mtts, 1);
+				  dev->caps.reserved_mtts, 1, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map MTT context memory, aborting.\n");
 		goto err_unmap_eq;
@@ -313,7 +313,7 @@
 				  init_hca->dmpt_base,
 				  dev_cap->dmpt_entry_sz,
 				  dev->caps.num_mpts,
-				  dev->caps.reserved_mrws, 1);
+				  dev->caps.reserved_mrws, 1, 1);
 	if (err) {
 		mlx4_err(dev, "Failed to map dMPT context memory, aborting.\n");
 		goto err_unmap_mtt;
@@ -323,7 +323,7 @@
 				  init_hca->qpc_base,
 				  dev_cap->qpc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0);
+				  dev->caps.reserved_qps, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map QP context memory, aborting.\n");
 		goto err_unmap_dmpt;
@@ -333,7 +333,7 @@
 				  init_hca->auxc_base,
 				  dev_cap->aux_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0);
+				  dev->caps.reserved_qps, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map AUXC context memory, aborting.\n");
 		goto err_unmap_qp;
@@ -343,7 +343,7 @@
 				  init_hca->altc_base,
 				  dev_cap->altc_entry_sz,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0);
+				  dev->caps.reserved_qps, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map ALTC context memory, aborting.\n");
 		goto err_unmap_auxc;
@@ -353,7 +353,7 @@
 				  init_hca->rdmarc_base,
 				  dev_cap->rdmarc_entry_sz << priv->qp_table.rdmarc_shift,
 				  dev->caps.num_qps,
-				  dev->caps.reserved_qps, 0);
+				  dev->caps.reserved_qps, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map RDMARC context memory, aborting\n");
 		goto err_unmap_altc;
@@ -363,7 +363,7 @@
 				  init_hca->cqc_base,
 				  dev_cap->cqc_entry_sz,
 				  dev->caps.num_cqs,
-				  dev->caps.reserved_cqs, 0);
+				  dev->caps.reserved_cqs, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map CQ context memory, aborting.\n");
 		goto err_unmap_rdmarc;
@@ -373,7 +373,7 @@
 				  init_hca->srqc_base,
 				  dev_cap->srq_entry_sz,
 				  dev->caps.num_srqs,
-				  dev->caps.reserved_srqs, 0);
+				  dev->caps.reserved_srqs, 0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map SRQ context memory, aborting.\n");
 		goto err_unmap_cq;
@@ -388,7 +388,7 @@
 				  init_hca->mc_base, MLX4_MGM_ENTRY_SIZE,
 				  dev->caps.num_mgms + dev->caps.num_amgms,
 				  dev->caps.num_mgms + dev->caps.num_amgms,
-				  0);
+				  0, 0);
 	if (err) {
 		mlx4_err(dev, "Failed to map MCG context memory, aborting.\n");
 		goto err_unmap_srq;
@@ -433,7 +433,7 @@
 	mlx4_UNMAP_ICM_AUX(dev);
 
 err_free_aux:
-	mlx4_free_icm(dev, priv->fw.aux_icm);
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
 
 	return err;
 }
@@ -458,7 +458,7 @@
 	mlx4_unmap_eq_icm(dev);
 
 	mlx4_UNMAP_ICM_AUX(dev);
-	mlx4_free_icm(dev, priv->fw.aux_icm);
+	mlx4_free_icm(dev, priv->fw.aux_icm, 0);
 }
 
 static void mlx4_close_hca(struct mlx4_dev *dev)
@@ -466,7 +466,7 @@
 	mlx4_CLOSE_HCA(dev, 0);
 	mlx4_free_icms(dev);
 	mlx4_UNMAP_FA(dev);
-	mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm);
+	mlx4_free_icm(dev, mlx4_priv(dev)->fw.fw_icm, 0);
 }
 
 static int __devinit mlx4_init_hca(struct mlx4_dev *dev)
@@ -537,7 +537,7 @@
 
 err_stop_fw:
 	mlx4_UNMAP_FA(dev);
-	mlx4_free_icm(dev, priv->fw.fw_icm);
+	mlx4_free_icm(dev, priv->fw.fw_icm, 0);
 
 	return err;
 }
diff --git a/drivers/net/mlx4/mlx4.h b/drivers/net/mlx4/mlx4.h
index b9f8397..2bad045 100644
--- a/drivers/net/mlx4/mlx4.h
+++ b/drivers/net/mlx4/mlx4.h
@@ -129,6 +129,7 @@
 	int			num_obj;
 	int			obj_size;
 	int			lowmem;
+	int			coherent;
 	struct mutex		mutex;
 	struct mlx4_icm	      **icm;
 };