RDMA/cxgb4: Support on-chip SQs

T4 support on-chip SQs to reduce latency.  This patch adds support for
this in iw_cxgb4:

 - Manage ocqp memory like other adapter mem resources.
 - Allocate user mode SQs from ocqp mem if available.
 - Map ocqp mem to user process using write combining.
 - Map PCIE_MA_SYNC reg to user process.

Bump uverbs ABI.

Signed-off-by: Steve Wise <swise@opengridcomputing.com>
Signed-off-by: Roland Dreier <rolandd@cisco.com>
diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c
index 2851bf8..986cfd7 100644
--- a/drivers/infiniband/hw/cxgb4/device.c
+++ b/drivers/infiniband/hw/cxgb4/device.c
@@ -364,7 +364,14 @@
 		printk(KERN_ERR MOD "error %d initializing rqt pool\n", err);
 		goto err3;
 	}
+	err = c4iw_ocqp_pool_create(rdev);
+	if (err) {
+		printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err);
+		goto err4;
+	}
 	return 0;
+err4:
+	c4iw_rqtpool_destroy(rdev);
 err3:
 	c4iw_pblpool_destroy(rdev);
 err2:
@@ -391,6 +398,7 @@
 	idr_destroy(&dev->cqidr);
 	idr_destroy(&dev->qpidr);
 	idr_destroy(&dev->mmidr);
+	iounmap(dev->rdev.oc_mw_kva);
 	ib_dealloc_device(&dev->ibdev);
 }
 
@@ -406,6 +414,17 @@
 	}
 	devp->rdev.lldi = *infop;
 
+	devp->rdev.oc_mw_pa = pci_resource_start(devp->rdev.lldi.pdev, 2) +
+		(pci_resource_len(devp->rdev.lldi.pdev, 2) -
+		 roundup_pow_of_two(devp->rdev.lldi.vr->ocq.size));
+	devp->rdev.oc_mw_kva = ioremap_wc(devp->rdev.oc_mw_pa,
+					       devp->rdev.lldi.vr->ocq.size);
+
+	printk(KERN_INFO MOD "ocq memory: "
+	       "hw_start 0x%x size %u mw_pa 0x%lx mw_kva %p\n",
+	       devp->rdev.lldi.vr->ocq.start, devp->rdev.lldi.vr->ocq.size,
+	       devp->rdev.oc_mw_pa, devp->rdev.oc_mw_kva);
+
 	mutex_lock(&dev_mutex);
 
 	ret = c4iw_rdev_open(&devp->rdev);
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index 7780116..1c26922 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -112,8 +112,11 @@
 	struct c4iw_dev_ucontext uctx;
 	struct gen_pool *pbl_pool;
 	struct gen_pool *rqt_pool;
+	struct gen_pool *ocqp_pool;
 	u32 flags;
 	struct cxgb4_lld_info lldi;
+	unsigned long oc_mw_pa;
+	void __iomem *oc_mw_kva;
 };
 
 static inline int c4iw_fatal_error(struct c4iw_rdev *rdev)
@@ -675,8 +678,10 @@
 int c4iw_init_ctrl_qp(struct c4iw_rdev *rdev);
 int c4iw_pblpool_create(struct c4iw_rdev *rdev);
 int c4iw_rqtpool_create(struct c4iw_rdev *rdev);
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev);
 void c4iw_pblpool_destroy(struct c4iw_rdev *rdev);
 void c4iw_rqtpool_destroy(struct c4iw_rdev *rdev);
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev);
 void c4iw_destroy_resource(struct c4iw_resource *rscp);
 int c4iw_destroy_ctrl_qp(struct c4iw_rdev *rdev);
 int c4iw_register_device(struct c4iw_dev *dev);
@@ -742,6 +747,8 @@
 void c4iw_rqtpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
 u32 c4iw_pblpool_alloc(struct c4iw_rdev *rdev, int size);
 void c4iw_pblpool_free(struct c4iw_rdev *rdev, u32 addr, int size);
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size);
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size);
 int c4iw_ofld_send(struct c4iw_rdev *rdev, struct sk_buff *skb);
 void c4iw_flush_hw_cq(struct t4_cq *cq);
 void c4iw_count_rcqes(struct t4_cq *cq, struct t4_wq *wq, int *count);
diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c
index 8f645c8..a49a9c1 100644
--- a/drivers/infiniband/hw/cxgb4/provider.c
+++ b/drivers/infiniband/hw/cxgb4/provider.c
@@ -149,19 +149,28 @@
 	addr = mm->addr;
 	kfree(mm);
 
-	if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
-	    (addr < (pci_resource_start(rdev->lldi.pdev, 2) +
-		       pci_resource_len(rdev->lldi.pdev, 2)))) {
+	if ((addr >= pci_resource_start(rdev->lldi.pdev, 0)) &&
+	    (addr < (pci_resource_start(rdev->lldi.pdev, 0) +
+		    pci_resource_len(rdev->lldi.pdev, 0)))) {
 
 		/*
-		 * Map T4 DB register.
+		 * MA_SYNC register...
 		 */
-		if (vma->vm_flags & VM_READ)
-			return -EPERM;
-
 		vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
-		vma->vm_flags |= VM_DONTCOPY | VM_DONTEXPAND;
-		vma->vm_flags &= ~VM_MAYREAD;
+		ret = io_remap_pfn_range(vma, vma->vm_start,
+					 addr >> PAGE_SHIFT,
+					 len, vma->vm_page_prot);
+	} else if ((addr >= pci_resource_start(rdev->lldi.pdev, 2)) &&
+		   (addr < (pci_resource_start(rdev->lldi.pdev, 2) +
+		    pci_resource_len(rdev->lldi.pdev, 2)))) {
+
+		/*
+		 * Map user DB or OCQP memory...
+		 */
+		if (addr >= rdev->oc_mw_pa)
+			vma->vm_page_prot = t4_pgprot_wc(vma->vm_page_prot);
+		else
+			vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 		ret = io_remap_pfn_range(vma, vma->vm_start,
 					 addr >> PAGE_SHIFT,
 					 len, vma->vm_page_prot);
@@ -472,6 +481,7 @@
 	dev->ibdev.post_send = c4iw_post_send;
 	dev->ibdev.post_recv = c4iw_post_receive;
 	dev->ibdev.get_protocol_stats = c4iw_get_mib;
+	dev->ibdev.uverbs_abi_ver = C4IW_UVERBS_ABI_VERSION;
 
 	dev->ibdev.iwcm = kmalloc(sizeof(struct iw_cm_verbs), GFP_KERNEL);
 	if (!dev->ibdev.iwcm)
diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c
index 40187e2..7e45f73 100644
--- a/drivers/infiniband/hw/cxgb4/qp.c
+++ b/drivers/infiniband/hw/cxgb4/qp.c
@@ -31,6 +31,55 @@
  */
 #include "iw_cxgb4.h"
 
+static int ocqp_support;
+module_param(ocqp_support, int, 0644);
+MODULE_PARM_DESC(ocqp_support, "Support on-chip SQs (default=0)");
+
+static void dealloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	c4iw_ocqp_pool_free(rdev, sq->dma_addr, sq->memsize);
+}
+
+static void dealloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	dma_free_coherent(&(rdev->lldi.pdev->dev), sq->memsize, sq->queue,
+			  pci_unmap_addr(sq, mapping));
+}
+
+static void dealloc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	if (t4_sq_onchip(sq))
+		dealloc_oc_sq(rdev, sq);
+	else
+		dealloc_host_sq(rdev, sq);
+}
+
+static int alloc_oc_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	if (!ocqp_support || !t4_ocqp_supported())
+		return -ENOSYS;
+	sq->dma_addr = c4iw_ocqp_pool_alloc(rdev, sq->memsize);
+	if (!sq->dma_addr)
+		return -ENOMEM;
+	sq->phys_addr = rdev->oc_mw_pa + sq->dma_addr -
+			rdev->lldi.vr->ocq.start;
+	sq->queue = (__force union t4_wr *)(rdev->oc_mw_kva + sq->dma_addr -
+					    rdev->lldi.vr->ocq.start);
+	sq->flags |= T4_SQ_ONCHIP;
+	return 0;
+}
+
+static int alloc_host_sq(struct c4iw_rdev *rdev, struct t4_sq *sq)
+{
+	sq->queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev), sq->memsize,
+				       &(sq->dma_addr), GFP_KERNEL);
+	if (!sq->queue)
+		return -ENOMEM;
+	sq->phys_addr = virt_to_phys(sq->queue);
+	pci_unmap_addr_set(sq, mapping, sq->dma_addr);
+	return 0;
+}
+
 static int destroy_qp(struct c4iw_rdev *rdev, struct t4_wq *wq,
 		      struct c4iw_dev_ucontext *uctx)
 {
@@ -41,9 +90,7 @@
 	dma_free_coherent(&(rdev->lldi.pdev->dev),
 			  wq->rq.memsize, wq->rq.queue,
 			  dma_unmap_addr(&wq->rq, mapping));
-	dma_free_coherent(&(rdev->lldi.pdev->dev),
-			  wq->sq.memsize, wq->sq.queue,
-			  dma_unmap_addr(&wq->sq, mapping));
+	dealloc_sq(rdev, &wq->sq);
 	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
 	kfree(wq->rq.sw_rq);
 	kfree(wq->sq.sw_sq);
@@ -93,11 +140,12 @@
 	if (!wq->rq.rqt_hwaddr)
 		goto err4;
 
-	wq->sq.queue = dma_alloc_coherent(&(rdev->lldi.pdev->dev),
-					  wq->sq.memsize, &(wq->sq.dma_addr),
-					  GFP_KERNEL);
-	if (!wq->sq.queue)
-		goto err5;
+	if (user) {
+		if (alloc_oc_sq(rdev, &wq->sq) && alloc_host_sq(rdev, &wq->sq))
+			goto err5;
+	} else
+		if (alloc_host_sq(rdev, &wq->sq))
+			goto err5;
 	memset(wq->sq.queue, 0, wq->sq.memsize);
 	dma_unmap_addr_set(&wq->sq, mapping, wq->sq.dma_addr);
 
@@ -158,6 +206,7 @@
 		V_FW_RI_RES_WR_HOSTFCMODE(0) |	/* no host cidx updates */
 		V_FW_RI_RES_WR_CPRIO(0) |	/* don't keep in chip cache */
 		V_FW_RI_RES_WR_PCIECHN(0) |	/* set by uP at ri_init time */
+		t4_sq_onchip(&wq->sq) ? F_FW_RI_RES_WR_ONCHIP : 0 |
 		V_FW_RI_RES_WR_IQID(scq->cqid));
 	res->u.sqrq.dcaen_to_eqsize = cpu_to_be32(
 		V_FW_RI_RES_WR_DCAEN(0) |
@@ -212,9 +261,7 @@
 			  wq->rq.memsize, wq->rq.queue,
 			  dma_unmap_addr(&wq->rq, mapping));
 err6:
-	dma_free_coherent(&(rdev->lldi.pdev->dev),
-			  wq->sq.memsize, wq->sq.queue,
-			  dma_unmap_addr(&wq->sq, mapping));
+	dealloc_sq(rdev, &wq->sq);
 err5:
 	c4iw_rqtpool_free(rdev, wq->rq.rqt_hwaddr, wq->rq.rqt_size);
 err4:
@@ -1361,7 +1408,7 @@
 	int sqsize, rqsize;
 	struct c4iw_ucontext *ucontext;
 	int ret;
-	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4;
+	struct c4iw_mm_entry *mm1, *mm2, *mm3, *mm4, *mm5 = NULL;
 
 	PDBG("%s ib_pd %p\n", __func__, pd);
 
@@ -1459,7 +1506,15 @@
 			ret = -ENOMEM;
 			goto err6;
 		}
-
+		if (t4_sq_onchip(&qhp->wq.sq)) {
+			mm5 = kmalloc(sizeof *mm5, GFP_KERNEL);
+			if (!mm5) {
+				ret = -ENOMEM;
+				goto err7;
+			}
+			uresp.flags = C4IW_QPF_ONCHIP;
+		} else
+			uresp.flags = 0;
 		uresp.qid_mask = rhp->rdev.qpmask;
 		uresp.sqid = qhp->wq.sq.qid;
 		uresp.sq_size = qhp->wq.sq.size;
@@ -1468,6 +1523,10 @@
 		uresp.rq_size = qhp->wq.rq.size;
 		uresp.rq_memsize = qhp->wq.rq.memsize;
 		spin_lock(&ucontext->mmap_lock);
+		if (mm5) {
+			uresp.ma_sync_key = ucontext->key;
+			ucontext->key += PAGE_SIZE;
+		}
 		uresp.sq_key = ucontext->key;
 		ucontext->key += PAGE_SIZE;
 		uresp.rq_key = ucontext->key;
@@ -1479,9 +1538,9 @@
 		spin_unlock(&ucontext->mmap_lock);
 		ret = ib_copy_to_udata(udata, &uresp, sizeof uresp);
 		if (ret)
-			goto err7;
+			goto err8;
 		mm1->key = uresp.sq_key;
-		mm1->addr = virt_to_phys(qhp->wq.sq.queue);
+		mm1->addr = qhp->wq.sq.phys_addr;
 		mm1->len = PAGE_ALIGN(qhp->wq.sq.memsize);
 		insert_mmap(ucontext, mm1);
 		mm2->key = uresp.rq_key;
@@ -1496,6 +1555,13 @@
 		mm4->addr = qhp->wq.rq.udb;
 		mm4->len = PAGE_SIZE;
 		insert_mmap(ucontext, mm4);
+		if (mm5) {
+			mm5->key = uresp.ma_sync_key;
+			mm5->addr = (pci_resource_start(rhp->rdev.lldi.pdev, 0)
+				    + A_PCIE_MA_SYNC) & PAGE_MASK;
+			mm5->len = PAGE_SIZE;
+			insert_mmap(ucontext, mm5);
+		}
 	}
 	qhp->ibqp.qp_num = qhp->wq.sq.qid;
 	init_timer(&(qhp->timer));
@@ -1503,6 +1569,8 @@
 	     __func__, qhp, qhp->attr.sq_num_entries, qhp->attr.rq_num_entries,
 	     qhp->wq.sq.qid);
 	return &qhp->ibqp;
+err8:
+	kfree(mm5);
 err7:
 	kfree(mm4);
 err6:
diff --git a/drivers/infiniband/hw/cxgb4/resource.c b/drivers/infiniband/hw/cxgb4/resource.c
index 26365f6..4fb50d5 100644
--- a/drivers/infiniband/hw/cxgb4/resource.c
+++ b/drivers/infiniband/hw/cxgb4/resource.c
@@ -422,3 +422,59 @@
 {
 	gen_pool_destroy(rdev->rqt_pool);
 }
+
+/*
+ * On-Chip QP Memory.
+ */
+#define MIN_OCQP_SHIFT 12	/* 4KB == min ocqp size */
+
+u32 c4iw_ocqp_pool_alloc(struct c4iw_rdev *rdev, int size)
+{
+	unsigned long addr = gen_pool_alloc(rdev->ocqp_pool, size);
+	PDBG("%s addr 0x%x size %d\n", __func__, (u32)addr, size);
+	return (u32)addr;
+}
+
+void c4iw_ocqp_pool_free(struct c4iw_rdev *rdev, u32 addr, int size)
+{
+	PDBG("%s addr 0x%x size %d\n", __func__, addr, size);
+	gen_pool_free(rdev->ocqp_pool, (unsigned long)addr, size);
+}
+
+int c4iw_ocqp_pool_create(struct c4iw_rdev *rdev)
+{
+	unsigned start, chunk, top;
+
+	rdev->ocqp_pool = gen_pool_create(MIN_OCQP_SHIFT, -1);
+	if (!rdev->ocqp_pool)
+		return -ENOMEM;
+
+	start = rdev->lldi.vr->ocq.start;
+	chunk = rdev->lldi.vr->ocq.size;
+	top = start + chunk;
+
+	while (start < top) {
+		chunk = min(top - start + 1, chunk);
+		if (gen_pool_add(rdev->ocqp_pool, start, chunk, -1)) {
+			PDBG("%s failed to add OCQP chunk (%x/%x)\n",
+			     __func__, start, chunk);
+			if (chunk <= 1024 << MIN_OCQP_SHIFT) {
+				printk(KERN_WARNING MOD
+				       "Failed to add all OCQP chunks (%x/%x)\n",
+				       start, top - start);
+				return 0;
+			}
+			chunk >>= 1;
+		} else {
+			PDBG("%s added OCQP chunk (%x/%x)\n",
+			     __func__, start, chunk);
+			start += chunk;
+		}
+	}
+	return 0;
+}
+
+void c4iw_ocqp_pool_destroy(struct c4iw_rdev *rdev)
+{
+	gen_pool_destroy(rdev->ocqp_pool);
+}
diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h
index 24f3690..17ea5fc 100644
--- a/drivers/infiniband/hw/cxgb4/t4.h
+++ b/drivers/infiniband/hw/cxgb4/t4.h
@@ -52,6 +52,7 @@
 #define T4_STAG_UNSET 0xffffffff
 #define T4_FW_MAJ 0
 #define T4_EQ_STATUS_ENTRIES (L1_CACHE_BYTES > 64 ? 2 : 1)
+#define A_PCIE_MA_SYNC 0x30b4
 
 struct t4_status_page {
 	__be32 rsvd1;	/* flit 0 - hw owns */
@@ -266,10 +267,36 @@
 	u16			idx;
 };
 
+static inline pgprot_t t4_pgprot_wc(pgprot_t prot)
+{
+#if defined(__i386__) || defined(__x86_64__)
+	return pgprot_writecombine(prot);
+#elif defined(CONFIG_PPC64)
+	return __pgprot((pgprot_val(prot) | _PAGE_NO_CACHE) &
+			~(pgprot_t)_PAGE_GUARDED);
+#else
+	return pgprot_noncached(prot);
+#endif
+}
+
+static inline int t4_ocqp_supported(void)
+{
+#if defined(__i386__) || defined(__x86_64__) || defined(CONFIG_PPC64)
+	return 1;
+#else
+	return 0;
+#endif
+}
+
+enum {
+	T4_SQ_ONCHIP = (1<<0),
+};
+
 struct t4_sq {
 	union t4_wr *queue;
 	dma_addr_t dma_addr;
 	DEFINE_DMA_UNMAP_ADDR(mapping);
+	unsigned long phys_addr;
 	struct t4_swsqe *sw_sq;
 	struct t4_swsqe *oldest_read;
 	u64 udb;
@@ -280,6 +307,7 @@
 	u16 cidx;
 	u16 pidx;
 	u16 wq_pidx;
+	u16 flags;
 };
 
 struct t4_swrqe {
@@ -350,6 +378,11 @@
 		wq->rq.cidx = 0;
 }
 
+static inline int t4_sq_onchip(struct t4_sq *sq)
+{
+	return sq->flags & T4_SQ_ONCHIP;
+}
+
 static inline int t4_sq_empty(struct t4_wq *wq)
 {
 	return wq->sq.in_use == 0;
@@ -396,30 +429,27 @@
 
 static inline int t4_wq_in_error(struct t4_wq *wq)
 {
-	return wq->sq.queue[wq->sq.size].status.qp_err;
+	return wq->rq.queue[wq->rq.size].status.qp_err;
 }
 
 static inline void t4_set_wq_in_error(struct t4_wq *wq)
 {
-	wq->sq.queue[wq->sq.size].status.qp_err = 1;
 	wq->rq.queue[wq->rq.size].status.qp_err = 1;
 }
 
 static inline void t4_disable_wq_db(struct t4_wq *wq)
 {
-	wq->sq.queue[wq->sq.size].status.db_off = 1;
 	wq->rq.queue[wq->rq.size].status.db_off = 1;
 }
 
 static inline void t4_enable_wq_db(struct t4_wq *wq)
 {
-	wq->sq.queue[wq->sq.size].status.db_off = 0;
 	wq->rq.queue[wq->rq.size].status.db_off = 0;
 }
 
 static inline int t4_wq_db_enabled(struct t4_wq *wq)
 {
-	return !wq->sq.queue[wq->sq.size].status.db_off;
+	return !wq->rq.queue[wq->rq.size].status.db_off;
 }
 
 struct t4_cq {
diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h
index ed6414a..e6669d5 100644
--- a/drivers/infiniband/hw/cxgb4/user.h
+++ b/drivers/infiniband/hw/cxgb4/user.h
@@ -50,7 +50,13 @@
 	__u32 qid_mask;
 };
 
+
+enum {
+	C4IW_QPF_ONCHIP = (1<<0)
+};
+
 struct c4iw_create_qp_resp {
+	__u64 ma_sync_key;
 	__u64 sq_key;
 	__u64 rq_key;
 	__u64 sq_db_gts_key;
@@ -62,5 +68,6 @@
 	__u32 sq_size;
 	__u32 rq_size;
 	__u32 qid_mask;
+	__u32 flags;
 };
 #endif