block: add support for IO CPU affinity

This patch adds support for controlling the IO completion CPU of
either all requests on a queue, or on a per-request basis. We export
a sysfs variable (rq_affinity) which, if set, migrates completions
of requests to the CPU that originally submitted it. A bio helper
(bio_set_completion_cpu()) is also added, so that queuers can ask
for completion on that specific CPU.

In testing, this has been show to cut the system time by as much
as 20-40% on synthetic workloads where CPU affinity is desired.

This requires a little help from the architecture, so it'll only
work as designed for archs that are using the new generic smp
helper infrastructure.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 10aa46c..93204bf 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -17,6 +17,7 @@
 #include <linux/module.h>
 #include <linux/stringify.h>
 #include <linux/bsg.h>
+#include <linux/smp.h>
 
 #include <asm/scatterlist.h>
 
@@ -139,7 +140,8 @@
  */
 struct request {
 	struct list_head queuelist;
-	struct list_head donelist;
+	struct call_single_data csd;
+	int cpu;
 
 	struct request_queue *q;
 
@@ -420,6 +422,7 @@
 #define QUEUE_FLAG_ELVSWITCH	8	/* don't use elevator, just do FIFO */
 #define QUEUE_FLAG_BIDI		9	/* queue supports bidi requests */
 #define QUEUE_FLAG_NOMERGES    10	/* disable merge attempts */
+#define QUEUE_FLAG_SAME_COMP   11	/* force complete on same CPU */
 
 static inline int queue_is_locked(struct request_queue *q)
 {