NVMe: Add a kthread to handle the congestion list

Instead of trying to resubmit I/Os in the I/O completion path (in
interrupt context), wake up a kthread which will resubmit I/O from
user context.  The completion path retried only one congested bio per
completion, and a bio whose retry failed after the last outstanding
command completed would never be retried again.  This allows mke2fs
to run to completion.

Signed-off-by: Matthew Wilcox <matthew.r.wilcox@intel.com>
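---
Note: sq_cong_wait is initialized to name nvme_thread, but nothing in
this patch adds it to a waitqueue; the thread instead wakes on a
one-second timeout.  For reference, the thread lifecycle used below
follows the standard kthread pattern (the API names are real; the loop
body is a minimal sketch):

	#include <linux/kthread.h>
	#include <linux/sched.h>

	static struct task_struct *task;

	static int poll_fn(void *unused)
	{
		while (!kthread_should_stop()) {
			/* one pass of work goes here */
			set_current_state(TASK_INTERRUPTIBLE);
			schedule_timeout(HZ);	/* ~1s, or until woken early */
		}
		return 0;
	}

	static int __init poller_init(void)
	{
		/* kthread_run() creates the thread and immediately wakes it */
		task = kthread_run(poll_fn, NULL, "poller");
		return IS_ERR(task) ? PTR_ERR(task) : 0;
	}

	static void __exit poller_exit(void)
	{
		/* sets the stop flag, wakes the thread, waits for poll_fn */
		kthread_stop(task);
	}

	module_init(poller_init);
	module_exit(poller_exit);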
diff --git a/drivers/block/nvme.c b/drivers/block/nvme.c
index 61a2417..606371e 100644
--- a/drivers/block/nvme.c
+++ b/drivers/block/nvme.c
@@ -26,6 +26,7 @@
 #include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kdev_t.h>
+#include <linux/kthread.h>
 #include <linux/kernel.h>
 #include <linux/mm.h>
 #include <linux/module.h>
@@ -50,10 +51,15 @@
 static int use_threaded_interrupts;
 module_param(use_threaded_interrupts, int, 0);
 
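+/* Devices on this list are polled by nvme_kthread; guarded by dev_list_lock */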
+static DEFINE_SPINLOCK(dev_list_lock);
+static LIST_HEAD(dev_list);
+static struct task_struct *nvme_thread;
+
 /*
  * Represents an NVM Express device.  Each nvme_dev is a PCI function.
  */
 struct nvme_dev {
+	struct list_head node;
 	struct nvme_queue **queues;
 	u32 __iomem *dbs;
 	struct pci_dev *pci_dev;
@@ -97,6 +103,7 @@
 	dma_addr_t sq_dma_addr;
 	dma_addr_t cq_dma_addr;
 	wait_queue_head_t sq_full;
+	wait_queue_t sq_cong_wait;
 	struct bio_list sq_cong;
 	u32 __iomem *q_db;
 	u16 q_depth;
@@ -108,8 +115,6 @@
 	unsigned long cmdid_data[];
 };
 
-static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio);
-
 /*
 * Check we didn't inadvertently grow the command struct
  */
@@ -309,9 +314,6 @@
 			bio_data_dir(bio) ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
 	free_nbio(nvmeq, nbio);
 	bio_endio(bio, status ? -EIO : 0);
-	bio = bio_list_pop(&nvmeq->sq_cong);
-	if (bio)
-		nvme_resubmit_bio(nvmeq, bio);
 }
 
 /* length is in bytes */
@@ -482,16 +484,6 @@
 	return result;
 }
 
-static void nvme_resubmit_bio(struct nvme_queue *nvmeq, struct bio *bio)
-{
-	struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
-	if (nvme_submit_bio_queue(nvmeq, ns, bio))
-		bio_list_add_head(&nvmeq->sq_cong, bio);
-	else if (bio_list_empty(&nvmeq->sq_cong))
-		blk_clear_queue_congested(ns->queue, rw_is_sync(bio->bi_rw));
-	/* XXX: Need to duplicate the logic from __freed_request here */
-}
-
 /*
  * NB: return value of non-zero would mean that we were a stacking driver.
  * make_request must always succeed.
@@ -774,6 +766,7 @@
 	nvmeq->cq_head = 0;
 	nvmeq->cq_phase = 1;
 	init_waitqueue_head(&nvmeq->sq_full);
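+	/* names nvme_thread; not yet added to any waitqueue in this patch */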
+	init_waitqueue_entry(&nvmeq->sq_cong_wait, nvme_thread);
 	bio_list_init(&nvmeq->sq_cong);
 	nvmeq->q_db = &dev->dbs[qid * 2];
 	nvmeq->q_depth = depth;
@@ -1097,6 +1090,43 @@
 	.ioctl		= nvme_ioctl,
 };
 
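+/* Drain the congestion list; on a failed submit, put the bio back and stop */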
+static void nvme_resubmit_bios(struct nvme_queue *nvmeq)
+{
+	while (bio_list_peek(&nvmeq->sq_cong)) {
+		struct bio *bio = bio_list_pop(&nvmeq->sq_cong);
+		struct nvme_ns *ns = bio->bi_bdev->bd_disk->private_data;
+		if (nvme_submit_bio_queue(nvmeq, ns, bio)) {
+			bio_list_add_head(&nvmeq->sq_cong, bio);
+			break;
+		}
+	}
+}
+
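+/*
+ * Once a second, for each queue on each device: reap any completions
+ * the interrupt handler missed, then resubmit congested bios from
+ * process context.
+ */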
+static int nvme_kthread(void *data)
+{
+	struct nvme_dev *dev;
+
+	while (!kthread_should_stop()) {
+		__set_current_state(TASK_RUNNING);
+		spin_lock(&dev_list_lock);
+		list_for_each_entry(dev, &dev_list, node) {
+			int i;
+			for (i = 0; i < dev->queue_count; i++) {
+				struct nvme_queue *nvmeq = dev->queues[i];
+				spin_lock_irq(&nvmeq->q_lock);
+				if (nvme_process_cq(nvmeq))
+					printk(KERN_DEBUG "process_cq did something\n");
+				nvme_resubmit_bios(nvmeq);
+				spin_unlock_irq(&nvmeq->q_lock);
+			}
+		}
+		spin_unlock(&dev_list_lock);
+		set_current_state(TASK_INTERRUPTIBLE);
+		schedule_timeout(HZ);
+	}
+	return 0;
+}
+
 static struct nvme_ns *nvme_alloc_ns(struct nvme_dev *dev, int index,
 			struct nvme_id_ns *id, struct nvme_lba_range_type *rt)
 {
@@ -1307,6 +1337,10 @@
 {
 	struct nvme_ns *ns, *next;
 
+	spin_lock(&dev_list_lock);
+	list_del(&dev->node);
+	spin_unlock(&dev_list_lock);
+
 	/* TODO: wait for all outstanding I/O to finish, or cancel it */
 
 	list_for_each_entry_safe(ns, next, &dev->namespaces, list) {
@@ -1406,6 +1440,11 @@
 	result = nvme_dev_add(dev);
 	if (result)
 		goto delete;
+
+	spin_lock(&dev_list_lock);
+	list_add(&dev->node, &dev_list);
+	spin_unlock(&dev_list_lock);
+
 	return 0;
 
  delete:
@@ -1479,17 +1518,25 @@
 
 static int __init nvme_init(void)
 {
-	int result;
+	int result = -EBUSY;
+
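+	/*
+	 * Create the thread before registering the driver: queue
+	 * initialization in the probe path passes nvme_thread to
+	 * init_waitqueue_entry().
+	 */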
+	nvme_thread = kthread_run(nvme_kthread, NULL, "nvme");
+	if (IS_ERR(nvme_thread))
+		return PTR_ERR(nvme_thread);
 
 	nvme_major = register_blkdev(nvme_major, "nvme");
 	if (nvme_major <= 0)
-		return -EBUSY;
+		goto kill_kthread;
 
 	result = pci_register_driver(&nvme_driver);
-	if (!result)
-		return 0;
+	if (result)
+		goto unregister_blkdev;
+	return 0;
 
+ unregister_blkdev:
 	unregister_blkdev(nvme_major, "nvme");
+ kill_kthread:
+	kthread_stop(nvme_thread);
 	return result;
 }
 
@@ -1497,6 +1544,7 @@
 {
 	pci_unregister_driver(&nvme_driver);
 	unregister_blkdev(nvme_major, "nvme");
+	kthread_stop(nvme_thread);
 }
 
 MODULE_AUTHOR("Matthew Wilcox <willy@linux.intel.com>");