IB/ehca: Support small QP queues

eHCA2 supports QP queues that can be as small as 512 bytes. This
greatly reduces memory overhead for consumers that use lots of QPs
with small queues (e.g. RDMA-only QPs). Apart from dealing with
firmware, this code needs to manage bite-sized chunks of kernel pages,
making sure that no kernel page is shared between different protection
domains.

Signed-off-by: Hoang-Nam Nguyen <hnguyen@de.ibm.com>
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
index 9606f13..a090c67 100644
--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
@@ -40,6 +40,11 @@
 
 #include "ehca_tools.h"
 #include "ipz_pt_fn.h"
+#include "ehca_classes.h"
+
+#define PAGES_PER_KPAGE (PAGE_SIZE >> EHCA_PAGESHIFT) /* eHCA pages per kpage */
+
+struct kmem_cache *small_qp_cache; /* for struct ipz_small_queue_page */
 
 void *ipz_qpageit_get_inc(struct ipz_queue *queue)
 {
@@ -49,7 +54,7 @@
 		queue->current_q_offset -= queue->pagesize;
 		ret = NULL;
 	}
-	if (((u64)ret) % EHCA_PAGESIZE) {
+	if (((u64)ret) % queue->pagesize) {
 		ehca_gen_err("ERROR!! not at PAGE-Boundary");
 		return NULL;
 	}
@@ -83,80 +88,195 @@
 	return -EINVAL;
 }
 
-int ipz_queue_ctor(struct ipz_queue *queue,
-		   const u32 nr_of_pages,
-		   const u32 pagesize, const u32 qe_size, const u32 nr_of_sg)
-{
-	int pages_per_kpage = PAGE_SIZE >> EHCA_PAGESHIFT;
-	int f;
+#if PAGE_SHIFT < EHCA_PAGESHIFT
+#error Kernel pages must be at least as large as eHCA pages (4K) !
+#endif
 
-	if (pagesize > PAGE_SIZE) {
-		ehca_gen_err("FATAL ERROR: pagesize=%x is greater "
-			     "than kernel page size", pagesize);
-		return 0;
-	}
-	if (!pages_per_kpage) {
-		ehca_gen_err("FATAL ERROR: invalid kernel page size. "
-			     "pages_per_kpage=%x", pages_per_kpage);
-		return 0;
-	}
-	queue->queue_length = nr_of_pages * pagesize;
-	queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
-	if (!queue->queue_pages) {
-		ehca_gen_err("ERROR!! didn't get the memory");
-		return 0;
-	}
-	memset(queue->queue_pages, 0, nr_of_pages * sizeof(void *));
-	/*
-	 * allocate pages for queue:
-	 * outer loop allocates whole kernel pages (page aligned) and
-	 * inner loop divides a kernel page into smaller hca queue pages
-	 */
-	f = 0;
+/*
+ * Allocate nr_of_pages eHCA queue pages: the outer loop gets zeroed kernel
+ * pages, the inner loop splits each into EHCA_PAGESIZE-sized queue pages.
+ * Returns 1 on success; on failure frees the pages obtained so far, returns 0.
+ */
+static int alloc_queue_pages(struct ipz_queue *queue, const u32 nr_of_pages)
+{
+	int k, f = 0;
+	u8 *kpage;
+
 	while (f < nr_of_pages) {
-		u8 *kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
-		int k;
+		kpage = (u8 *)get_zeroed_page(GFP_KERNEL);
 		if (!kpage)
-			goto ipz_queue_ctor_exit0; /*NOMEM*/
-		for (k = 0; k < pages_per_kpage && f < nr_of_pages; k++) {
-			(queue->queue_pages)[f] = (struct ipz_page *)kpage;
+			goto out;
+
+		for (k = 0; k < PAGES_PER_KPAGE && f < nr_of_pages; k++) {
+			queue->queue_pages[f] = (struct ipz_page *)kpage;
 			kpage += EHCA_PAGESIZE;
 			f++;
 		}
 	}
-
-	queue->current_q_offset = 0;
-	queue->qe_size = qe_size;
-	queue->act_nr_of_sg = nr_of_sg;
-	queue->pagesize = pagesize;
-	queue->toggle_state = 1;
 	return 1;
 
- ipz_queue_ctor_exit0:
-	ehca_gen_err("Couldn't get alloc pages queue=%p f=%x nr_of_pages=%x",
-		     queue, f, nr_of_pages);
-	for (f = 0; f < nr_of_pages; f += pages_per_kpage) {
-		if (!(queue->queue_pages)[f])
-			break;
+out:
+	for (f = 0; f < nr_of_pages && queue->queue_pages[f];
+	     f += PAGES_PER_KPAGE)
 		free_page((unsigned long)(queue->queue_pages)[f]);
-	}
 	return 0;
 }
 
-int ipz_queue_dtor(struct ipz_queue *queue)
+/*
+ * Take one queue->pagesize chunk from a kernel page on pd->free[order], adding
+ * a new zeroed kernel page if that list is empty. Returns 1, or 0 on failure.
+ */
+static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
 {
-	int pages_per_kpage = PAGE_SIZE >> EHCA_PAGESHIFT;
-	int g;
-	int nr_pages;
+	int order = ilog2(queue->pagesize) - 9;	/* 512 bytes -> order 0 */
+	struct ipz_small_queue_page *page;
+	unsigned long bit;
+
+	mutex_lock(&pd->lock);
+	if (!list_empty(&pd->free[order]))
+		page = list_entry(pd->free[order].next,
+				  struct ipz_small_queue_page, list);
+	else {
+		page = kmem_cache_zalloc(small_qp_cache, GFP_KERNEL);
+		if (!page)
+			goto out;
+		page->page = get_zeroed_page(GFP_KERNEL);
+		if (!page->page) {
+			kmem_cache_free(small_qp_cache, page);
+			goto out;
+		}
+		list_add(&page->list, &pd->free[order]);
+	}
+
+	bit = find_first_zero_bit(page->bitmap, IPZ_SPAGE_PER_KPAGE >> order);
+	__set_bit(bit, page->bitmap);
+	page->fill++;
+	if (page->fill == IPZ_SPAGE_PER_KPAGE >> order)
+		list_move(&page->list, &pd->full[order]);
+	mutex_unlock(&pd->lock);
+
+	queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
+	queue->small_page = page;
+	return 1;
+
+out:
+	mutex_unlock(&pd->lock);	/* error paths jump here with the lock held */
+	ehca_err(pd->ib_pd.device, "failed to allocate small queue page");
+	return 0;
+}
+
+/*
+ * Return the queue's small page chunk to its kernel page; the kernel page
+ * and its bookkeeping struct are freed once no chunk of it is in use.
+ */
+static void free_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
+{
+	int order = ilog2(queue->pagesize) - 9;
+	struct ipz_small_queue_page *page = queue->small_page;
+	unsigned long bit;
+	int release = 0;
+
+	bit = ((unsigned long)queue->queue_pages[0] & ~PAGE_MASK)
+		>> (order + 9);	/* offset within kernel page -> chunk index */
+
+	mutex_lock(&pd->lock);
+	__clear_bit(bit, page->bitmap);
+	page->fill--;
+	if (page->fill == 0) {
+		list_del(&page->list);
+		release = 1;
+	} else if (page->fill == (IPZ_SPAGE_PER_KPAGE >> order) - 1) {
+		/* the page was full until we freed the chunk */
+		list_move_tail(&page->list, &pd->free[order]);
+	}
+	mutex_unlock(&pd->lock);
+
+	if (release) {
+		free_page(page->page);
+		kmem_cache_free(small_qp_cache, page);
+	}
+}
+
+/*
+ * Set up *queue and allocate its page pointer list and queue pages: a single
+ * chunk of a pd-owned kernel page if is_small, else nr_of_pages queue pages
+ * cut from whole kernel pages. Returns 1 on success, 0 on failure.
+ */
+int ipz_queue_ctor(struct ehca_pd *pd, struct ipz_queue *queue,
+		   const u32 nr_of_pages, const u32 pagesize,
+		   const u32 qe_size, const u32 nr_of_sg,
+		   int is_small)
+{
+	if (pagesize > PAGE_SIZE) {
+		ehca_gen_err("FATAL ERROR: pagesize=%x "
+			     "is greater than kernel page size", pagesize);
+		return 0;
+	}
+
+	queue->queue_length = nr_of_pages * pagesize;
+	queue->pagesize = pagesize;
+	queue->qe_size = qe_size;
+	queue->act_nr_of_sg = nr_of_sg;
+	queue->current_q_offset = 0;
+	queue->toggle_state = 1;
+	queue->small_page = NULL;
+
+	/* allocate queue page pointers */
+	queue->queue_pages = vmalloc(nr_of_pages * sizeof(void *));
+	if (!queue->queue_pages) {
+		ehca_gen_err("Couldn't allocate queue page list");
+		return 0;
+	}
+	memset(queue->queue_pages, 0, nr_of_pages * sizeof(void *));
+
+	/* allocate actual queue pages */
+	if (is_small ? !alloc_small_queue_page(queue, pd)
+		     : !alloc_queue_pages(queue, nr_of_pages))
+		goto ipz_queue_ctor_exit0;
+	return 1;
+
+ipz_queue_ctor_exit0:
+	ehca_gen_err("Couldn't alloc pages queue=%p "
+		 "nr_of_pages=%x",  queue, nr_of_pages);
+	vfree(queue->queue_pages);
+	queue->queue_pages = NULL;	/* no double vfree in dtor */
+	return 0;
+}
+
+/* Free queue pages and pointer list; returns 0 if nothing to free, else 1 */
+int ipz_queue_dtor(struct ehca_pd *pd, struct ipz_queue *queue)
+{
+	int i, nr_pages;
 
 	if (!queue || !queue->queue_pages) {
 		ehca_gen_dbg("queue or queue_pages is NULL");
 		return 0;
 	}
-	nr_pages = queue->queue_length / queue->pagesize;
-	for (g = 0; g < nr_pages; g += pages_per_kpage)
-		free_page((unsigned long)(queue->queue_pages)[g]);
+
+	if (queue->small_page)
+		free_small_queue_page(queue, pd);
+	else {
+		nr_pages = queue->queue_length / queue->pagesize;
+		for (i = 0; i < nr_pages; i += PAGES_PER_KPAGE)
+			free_page((unsigned long)queue->queue_pages[i]);
+	}
 	vfree(queue->queue_pages);
 
 	return 1;
 }
+
+/* Set up the ipz_small_queue_page slab cache; returns 0 or -ENOMEM. */
+int ehca_init_small_qp_cache(void)
+{
+	small_qp_cache = kmem_cache_create("ehca_cache_small_qp",
+					   sizeof(struct ipz_small_queue_page),
+					   0, SLAB_HWCACHE_ALIGN, NULL);
+
+	return small_qp_cache ? 0 : -ENOMEM;
+}
+
+/* Destroy the slab cache set up by ehca_init_small_qp_cache(). */
+void ehca_cleanup_small_qp_cache(void)
+{
+	kmem_cache_destroy(small_qp_cache);
+}