percpu_ida: make percpu_ida percpu size/batch configurable

Make the percpu_ida per-CPU freelist size and batch size configurable,
instead of hard-coding them in lib/percpu_ida.c. The blk-mq tag code
will use this.
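
The companion header change is not shown in this diff; presumably
include/linux/percpu_ida.h preserves the old behaviour for existing
callers with a static inline wrapper along these lines (a sketch,
assuming the old hard-coded constants move into the header as defaults):

	/* Sketch of the assumed header-side wrapper: the old constants
	 * become defaults that percpu_ida_init() passes through to the
	 * new __percpu_ida_init().
	 */
	#define IDA_DEFAULT_PCPU_BATCH_MOVE	32U
	#define IDA_DEFAULT_PCPU_SIZE	((IDA_DEFAULT_PCPU_BATCH_MOVE * 3) / 2)

	static inline int percpu_ida_init(struct percpu_ida *pool,
					  unsigned long nr_tags)
	{
		return __percpu_ida_init(pool, nr_tags, IDA_DEFAULT_PCPU_SIZE,
					 IDA_DEFAULT_PCPU_BATCH_MOVE);
	}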

With blk-mq using percpu_ida to manage tags, performance improves. The
test was run on a 2-socket machine with 12 processes spread across both
sockets, so any lock contention or IPI overhead should be stressed
heavily. Testing was done with null-blk.

hw_queue_depth	unpatched iops	patched iops
64		~800k/s		~1470k/s
2048		~4470k/s	~4340k/s

Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Shaohua Li <shli@fusionio.com>
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c
index bab1ba2..a601d42 100644
--- a/lib/percpu_ida.c
+++ b/lib/percpu_ida.c
@@ -30,15 +30,6 @@
 #include <linux/spinlock.h>
 #include <linux/percpu_ida.h>
 
-/*
- * Number of tags we move between the percpu freelist and the global freelist at
- * a time
- */
-#define IDA_PCPU_BATCH_MOVE	32U
-
-/* Max size of percpu freelist, */
-#define IDA_PCPU_SIZE		((IDA_PCPU_BATCH_MOVE * 3) / 2)
-
 struct percpu_ida_cpu {
 	/*
 	 * Even though this is percpu, we need a lock for tag stealing by remote
@@ -78,7 +69,7 @@
 	struct percpu_ida_cpu *remote;
 
 	for (cpus_have_tags = cpumask_weight(&pool->cpus_have_tags);
-	     cpus_have_tags * IDA_PCPU_SIZE > pool->nr_tags / 2;
+	     cpus_have_tags * pool->percpu_max_size > pool->nr_tags / 2;
 	     cpus_have_tags--) {
 		cpu = cpumask_next(cpu, &pool->cpus_have_tags);
 
@@ -123,7 +114,7 @@
 {
 	move_tags(tags->freelist, &tags->nr_free,
 		  pool->freelist, &pool->nr_free,
-		  min(pool->nr_free, IDA_PCPU_BATCH_MOVE));
+		  min(pool->nr_free, pool->percpu_batch_size));
 }
 
 static inline unsigned alloc_local_tag(struct percpu_ida *pool,
@@ -245,17 +236,17 @@
 		wake_up(&pool->wait);
 	}
 
-	if (nr_free == IDA_PCPU_SIZE) {
+	if (nr_free == pool->percpu_max_size) {
 		spin_lock(&pool->lock);
 
 		/*
 		 * Global lock held and irqs disabled, don't need percpu
 		 * lock
 		 */
-		if (tags->nr_free == IDA_PCPU_SIZE) {
+		if (tags->nr_free == pool->percpu_max_size) {
 			move_tags(pool->freelist, &pool->nr_free,
 				  tags->freelist, &tags->nr_free,
-				  IDA_PCPU_BATCH_MOVE);
+				  pool->percpu_batch_size);
 
 			wake_up(&pool->wait);
 		}
@@ -292,7 +283,8 @@
  * Allocation is percpu, but sharding is limited by nr_tags - for best
  * performance, the workload should not span more cpus than nr_tags / 128.
  */
-int percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags)
+int __percpu_ida_init(struct percpu_ida *pool, unsigned long nr_tags,
+	unsigned long max_size, unsigned long batch_size)
 {
 	unsigned i, cpu, order;
 
@@ -301,6 +293,8 @@
 	init_waitqueue_head(&pool->wait);
 	spin_lock_init(&pool->lock);
 	pool->nr_tags = nr_tags;
+	pool->percpu_max_size = max_size;
+	pool->percpu_batch_size = batch_size;
 
 	/* Guard against overflow */
 	if (nr_tags > (unsigned) INT_MAX + 1) {
@@ -319,7 +313,7 @@
 	pool->nr_free = nr_tags;
 
 	pool->tag_cpu = __alloc_percpu(sizeof(struct percpu_ida_cpu) +
-				       IDA_PCPU_SIZE * sizeof(unsigned),
+				       pool->percpu_max_size * sizeof(unsigned),
 				       sizeof(unsigned));
 	if (!pool->tag_cpu)
 		goto err;
@@ -332,4 +326,4 @@
 	percpu_ida_destroy(pool);
 	return -ENOMEM;
 }
-EXPORT_SYMBOL_GPL(percpu_ida_init);
+EXPORT_SYMBOL_GPL(__percpu_ida_init);
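
As a usage sketch (hypothetical caller and values, not part of this
patch), a driver with a small per-hardware-queue tag space could size
the per-CPU caches from its queue depth rather than relying on the old
fixed 48/32 split that the removed constants encoded:

	#include <linux/kernel.h>
	#include <linux/percpu_ida.h>

	/* Illustrative only: cache at most a quarter of the tag space per
	 * CPU and move tags between the per-cpu and global freelists in
	 * half-cache batches.
	 */
	static int example_init_tags(struct percpu_ida *pool,
				     unsigned long nr_tags)
	{
		unsigned long max_size = max(nr_tags / 4, 1UL);
		unsigned long batch = max(max_size / 2, 1UL);

		return __percpu_ida_init(pool, nr_tags, max_size, batch);
	}

On teardown the caller would use percpu_ida_destroy() as before; only
initialization changes.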