Merge branch 'for-linus2' of git://git.kernel.dk/linux-block
Pull block fixes from Jens Axboe:
"Round 2 of this. I cut back to the bare necessities, the patch is
still larger than it usually would be at this time, due to the number
of NVMe fixes in there. This pull request contains:
- The 4 core fixes from Ming, that fix both problems with exceeding
the virtual boundary limit in case of merging, and the gap checking
for cloned bio's.
- NVMe fixes from Keith and Christoph:
- Regression on larger user commands, causing problems with
reading log pages (for instance). This touches both NVMe,
and the block core since that is now generally utilized also
for these types of commands.
- Hot removal fixes.
- User exploitable issue with passthrough IO commands, if !length
is given, causing us to fault on writing to the zero
page.
- Fix for a hang under error conditions
- And finally, the current series regression for umount with cgroup
writeback, where the final flush would happen async and hence open
up window after umount where the device wasn't consistent. fsck
right after umount would show this. From Tejun"
* 'for-linus2' of git://git.kernel.dk/linux-block:
block: support large requests in blk_rq_map_user_iov
block: fix blk_rq_get_max_sectors for driver private requests
nvme: fix max_segments integer truncation
nvme: set queue limits for the admin queue
writeback: flush inode cgroup wb switches instead of pinning super_block
NVMe: Fix 0-length integrity payload
NVMe: Don't allow unsupported flags
NVMe: Move error handling to failed reset handler
NVMe: Simplify device reset failure
NVMe: Fix namespace removal deadlock
NVMe: Use IDA for namespace disk naming
NVMe: Don't unmap controller registers on reset
block: merge: get the 1st and last bvec via helpers
block: get the 1st and last bvec via helpers
block: check virt boundary in bio_will_gap()
block: bio: introduce helpers to get the 1st and last bvec
diff --git a/block/blk-map.c b/block/blk-map.c
index f565e11..a54f054 100644
--- a/block/blk-map.c
+++ b/block/blk-map.c
@@ -57,6 +57,49 @@
return ret;
}
+static int __blk_rq_map_user_iov(struct request *rq,
+ struct rq_map_data *map_data, struct iov_iter *iter,
+ gfp_t gfp_mask, bool copy)
+{
+ struct request_queue *q = rq->q;
+ struct bio *bio, *orig_bio;
+ int ret;
+
+ if (copy)
+ bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
+ else
+ bio = bio_map_user_iov(q, iter, gfp_mask);
+
+ if (IS_ERR(bio))
+ return PTR_ERR(bio);
+
+ if (map_data && map_data->null_mapped)
+ bio_set_flag(bio, BIO_NULL_MAPPED);
+
+ iov_iter_advance(iter, bio->bi_iter.bi_size);
+ if (map_data)
+ map_data->offset += bio->bi_iter.bi_size;
+
+ orig_bio = bio;
+ blk_queue_bounce(q, &bio);
+
+ /*
+ * We link the bounce buffer in and could have to traverse it
+ * later so we have to get a ref to prevent it from being freed
+ */
+ bio_get(bio);
+
+ ret = blk_rq_append_bio(q, rq, bio);
+ if (ret) {
+ bio_endio(bio);
+ __blk_rq_unmap_user(orig_bio);
+ bio_put(bio);
+ return ret;
+ }
+
+ return 0;
+}
+
/**
* blk_rq_map_user_iov - map user data to a request, for REQ_TYPE_BLOCK_PC usage
* @q: request queue where request should be inserted
@@ -82,10 +125,11 @@
struct rq_map_data *map_data,
const struct iov_iter *iter, gfp_t gfp_mask)
{
- struct bio *bio;
- int unaligned = 0;
- struct iov_iter i;
struct iovec iov, prv = {.iov_base = NULL, .iov_len = 0};
+ bool copy = (q->dma_pad_mask & iter->count) || map_data;
+ struct bio *bio = NULL;
+ struct iov_iter i;
+ int ret;
if (!iter || !iter->count)
return -EINVAL;
@@ -101,42 +145,29 @@
*/
if ((uaddr & queue_dma_alignment(q)) ||
iovec_gap_to_prv(q, &prv, &iov))
- unaligned = 1;
+ copy = true;
prv.iov_base = iov.iov_base;
prv.iov_len = iov.iov_len;
}
- if (unaligned || (q->dma_pad_mask & iter->count) || map_data)
- bio = bio_copy_user_iov(q, map_data, iter, gfp_mask);
- else
- bio = bio_map_user_iov(q, iter, gfp_mask);
-
- if (IS_ERR(bio))
- return PTR_ERR(bio);
-
- if (map_data && map_data->null_mapped)
- bio_set_flag(bio, BIO_NULL_MAPPED);
-
- if (bio->bi_iter.bi_size != iter->count) {
- /*
- * Grab an extra reference to this bio, as bio_unmap_user()
- * expects to be able to drop it twice as it happens on the
- * normal IO completion path
- */
- bio_get(bio);
- bio_endio(bio);
- __blk_rq_unmap_user(bio);
- return -EINVAL;
- }
+ i = *iter;
+ do {
+ ret =__blk_rq_map_user_iov(rq, map_data, &i, gfp_mask, copy);
+ if (ret)
+ goto unmap_rq;
+ if (!bio)
+ bio = rq->bio;
+ } while (iov_iter_count(&i));
if (!bio_flagged(bio, BIO_USER_MAPPED))
rq->cmd_flags |= REQ_COPY_USER;
-
- blk_queue_bounce(q, &bio);
- bio_get(bio);
- blk_rq_bio_prep(q, rq, bio);
return 0;
+
+unmap_rq:
+ __blk_rq_unmap_user(bio);
+ rq->bio = NULL;
+ return -EINVAL;
}
EXPORT_SYMBOL(blk_rq_map_user_iov);
diff --git a/block/blk-merge.c b/block/blk-merge.c
index 888a7fe..2613531 100644
--- a/block/blk-merge.c
+++ b/block/blk-merge.c
@@ -304,7 +304,6 @@
struct bio *nxt)
{
struct bio_vec end_bv = { NULL }, nxt_bv;
- struct bvec_iter iter;
if (!blk_queue_cluster(q))
return 0;
@@ -316,11 +315,8 @@
if (!bio_has_data(bio))
return 1;
- bio_for_each_segment(end_bv, bio, iter)
- if (end_bv.bv_len == iter.bi_size)
- break;
-
- nxt_bv = bio_iovec(nxt);
+ bio_get_last_bvec(bio, &end_bv);
+ bio_get_first_bvec(nxt, &nxt_bv);
if (!BIOVEC_PHYS_MERGEABLE(&end_bv, &nxt_bv))
return 0;
diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c
index 3cd921e..03c4641 100644
--- a/drivers/nvme/host/core.c
+++ b/drivers/nvme/host/core.c
@@ -55,8 +55,9 @@
ns->disk->private_data = NULL;
spin_unlock(&dev_list_lock);
- nvme_put_ctrl(ns->ctrl);
put_disk(ns->disk);
+ ida_simple_remove(&ns->ctrl->ns_ida, ns->instance);
+ nvme_put_ctrl(ns->ctrl);
kfree(ns);
}
@@ -183,7 +184,7 @@
goto out_unmap;
}
- if (meta_buffer) {
+ if (meta_buffer && meta_len) {
struct bio_integrity_payload *bip;
meta = kmalloc(meta_len, GFP_KERNEL);
@@ -373,6 +374,8 @@
if (copy_from_user(&io, uio, sizeof(io)))
return -EFAULT;
+ if (io.flags)
+ return -EINVAL;
switch (io.opcode) {
case nvme_cmd_write:
@@ -424,6 +427,8 @@
return -EACCES;
if (copy_from_user(&cmd, ucmd, sizeof(cmd)))
return -EFAULT;
+ if (cmd.flags)
+ return -EINVAL;
memset(&c, 0, sizeof(c));
c.common.opcode = cmd.opcode;
@@ -556,6 +561,10 @@
u16 old_ms;
unsigned short bs;
+ if (test_bit(NVME_NS_DEAD, &ns->flags)) {
+ set_capacity(disk, 0);
+ return -ENODEV;
+ }
if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) {
dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n",
__func__, ns->ctrl->instance, ns->ns_id);
@@ -831,6 +840,23 @@
return ret;
}
+static void nvme_set_queue_limits(struct nvme_ctrl *ctrl,
+ struct request_queue *q)
+{
+ if (ctrl->max_hw_sectors) {
+ u32 max_segments =
+ (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1;
+
+ blk_queue_max_hw_sectors(q, ctrl->max_hw_sectors);
+ blk_queue_max_segments(q, min_t(u32, max_segments, USHRT_MAX));
+ }
+ if (ctrl->stripe_size)
+ blk_queue_chunk_sectors(q, ctrl->stripe_size >> 9);
+ if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
+ blk_queue_flush(q, REQ_FLUSH | REQ_FUA);
+ blk_queue_virt_boundary(q, ctrl->page_size - 1);
+}
+
/*
* Initialize the cached copies of the Identify data and various controller
* register in our nvme_ctrl structure. This should be called as soon as
@@ -888,6 +914,8 @@
}
}
+ nvme_set_queue_limits(ctrl, ctrl->admin_q);
+
kfree(id);
return 0;
}
@@ -1118,9 +1146,13 @@
if (!ns)
return;
+ ns->instance = ida_simple_get(&ctrl->ns_ida, 1, 0, GFP_KERNEL);
+ if (ns->instance < 0)
+ goto out_free_ns;
+
ns->queue = blk_mq_init_queue(ctrl->tagset);
if (IS_ERR(ns->queue))
- goto out_free_ns;
+ goto out_release_instance;
queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue);
ns->queue->queuedata = ns;
ns->ctrl = ctrl;
@@ -1134,17 +1166,9 @@
ns->disk = disk;
ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */
+
blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift);
- if (ctrl->max_hw_sectors) {
- blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors);
- blk_queue_max_segments(ns->queue,
- (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1);
- }
- if (ctrl->stripe_size)
- blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9);
- if (ctrl->vwc & NVME_CTRL_VWC_PRESENT)
- blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA);
- blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1);
+ nvme_set_queue_limits(ctrl, ns->queue);
disk->major = nvme_major;
disk->first_minor = 0;
@@ -1153,7 +1177,7 @@
disk->queue = ns->queue;
disk->driverfs_dev = ctrl->device;
disk->flags = GENHD_FL_EXT_DEVT;
- sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid);
+ sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, ns->instance);
if (nvme_revalidate_disk(ns->disk))
goto out_free_disk;
@@ -1173,40 +1197,29 @@
kfree(disk);
out_free_queue:
blk_cleanup_queue(ns->queue);
+ out_release_instance:
+ ida_simple_remove(&ctrl->ns_ida, ns->instance);
out_free_ns:
kfree(ns);
}
static void nvme_ns_remove(struct nvme_ns *ns)
{
- bool kill = nvme_io_incapable(ns->ctrl) &&
- !blk_queue_dying(ns->queue);
+ if (test_and_set_bit(NVME_NS_REMOVING, &ns->flags))
+ return;
- lockdep_assert_held(&ns->ctrl->namespaces_mutex);
-
- if (kill) {
- blk_set_queue_dying(ns->queue);
-
- /*
- * The controller was shutdown first if we got here through
- * device removal. The shutdown may requeue outstanding
- * requests. These need to be aborted immediately so
- * del_gendisk doesn't block indefinitely for their completion.
- */
- blk_mq_abort_requeue_list(ns->queue);
- }
if (ns->disk->flags & GENHD_FL_UP) {
if (blk_get_integrity(ns->disk))
blk_integrity_unregister(ns->disk);
sysfs_remove_group(&disk_to_dev(ns->disk)->kobj,
&nvme_ns_attr_group);
del_gendisk(ns->disk);
- }
- if (kill || !blk_queue_dying(ns->queue)) {
blk_mq_abort_requeue_list(ns->queue);
blk_cleanup_queue(ns->queue);
}
+ mutex_lock(&ns->ctrl->namespaces_mutex);
list_del_init(&ns->list);
+ mutex_unlock(&ns->ctrl->namespaces_mutex);
nvme_put_ns(ns);
}
@@ -1300,10 +1313,8 @@
{
struct nvme_ns *ns, *next;
- mutex_lock(&ctrl->namespaces_mutex);
list_for_each_entry_safe(ns, next, &ctrl->namespaces, list)
nvme_ns_remove(ns);
- mutex_unlock(&ctrl->namespaces_mutex);
}
static DEFINE_IDA(nvme_instance_ida);
@@ -1350,6 +1361,7 @@
put_device(ctrl->device);
nvme_release_instance(ctrl);
+ ida_destroy(&ctrl->ns_ida);
ctrl->ops->free_ctrl(ctrl);
}
@@ -1390,6 +1402,7 @@
}
get_device(ctrl->device);
dev_set_drvdata(ctrl->device, ctrl);
+ ida_init(&ctrl->ns_ida);
spin_lock(&dev_list_lock);
list_add_tail(&ctrl->node, &nvme_ctrl_list);
@@ -1402,6 +1415,38 @@
return ret;
}
+/**
+ * nvme_kill_queues(): Ends all namespace queues
+ * @ctrl: the dead controller that needs to end
+ *
+ * Call this function when the driver determines it is unable to get the
+ * controller in a state capable of servicing IO.
+ */
+void nvme_kill_queues(struct nvme_ctrl *ctrl)
+{
+ struct nvme_ns *ns;
+
+ mutex_lock(&ctrl->namespaces_mutex);
+ list_for_each_entry(ns, &ctrl->namespaces, list) {
+ if (!kref_get_unless_zero(&ns->kref))
+ continue;
+
+ /*
+ * Revalidating a dead namespace sets capacity to 0. This will
+ * end buffered writers dirtying pages that can't be synced.
+ */
+ if (!test_and_set_bit(NVME_NS_DEAD, &ns->flags))
+ revalidate_disk(ns->disk);
+
+ blk_set_queue_dying(ns->queue);
+ blk_mq_abort_requeue_list(ns->queue);
+ blk_mq_start_stopped_hw_queues(ns->queue, true);
+
+ nvme_put_ns(ns);
+ }
+ mutex_unlock(&ctrl->namespaces_mutex);
+}
+
void nvme_stop_queues(struct nvme_ctrl *ctrl)
{
struct nvme_ns *ns;
diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h
index 9664d07..fb15ba5 100644
--- a/drivers/nvme/host/nvme.h
+++ b/drivers/nvme/host/nvme.h
@@ -72,6 +72,7 @@
struct mutex namespaces_mutex;
struct device *device; /* char device */
struct list_head node;
+ struct ida ns_ida;
char name[12];
char serial[20];
@@ -102,6 +103,7 @@
struct request_queue *queue;
struct gendisk *disk;
struct kref kref;
+ int instance;
u8 eui[8];
u8 uuid[16];
@@ -112,6 +114,11 @@
bool ext;
u8 pi_type;
int type;
+ unsigned long flags;
+
+#define NVME_NS_REMOVING 0
+#define NVME_NS_DEAD 1
+
u64 mode_select_num_blocks;
u32 mode_select_block_len;
};
@@ -240,6 +247,7 @@
void nvme_stop_queues(struct nvme_ctrl *ctrl);
void nvme_start_queues(struct nvme_ctrl *ctrl);
+void nvme_kill_queues(struct nvme_ctrl *ctrl);
struct request *nvme_alloc_request(struct request_queue *q,
struct nvme_command *cmd, unsigned int flags);
diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c
index a128672..680f578 100644
--- a/drivers/nvme/host/pci.c
+++ b/drivers/nvme/host/pci.c
@@ -86,7 +86,6 @@
static int nvme_reset(struct nvme_dev *dev);
static void nvme_process_cq(struct nvme_queue *nvmeq);
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev);
static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown);
/*
@@ -120,6 +119,7 @@
unsigned long flags;
#define NVME_CTRL_RESETTING 0
+#define NVME_CTRL_REMOVING 1
struct nvme_ctrl ctrl;
struct completion ioq_wait;
@@ -286,6 +286,17 @@
return 0;
}
+static void nvme_queue_scan(struct nvme_dev *dev)
+{
+ /*
+ * Do not queue new scan work when a controller is reset during
+ * removal.
+ */
+ if (test_bit(NVME_CTRL_REMOVING, &dev->flags))
+ return;
+ queue_work(nvme_workq, &dev->scan_work);
+}
+
static void nvme_complete_async_event(struct nvme_dev *dev,
struct nvme_completion *cqe)
{
@@ -300,7 +311,7 @@
switch (result & 0xff07) {
case NVME_AER_NOTICE_NS_CHANGED:
dev_info(dev->dev, "rescanning\n");
- queue_work(nvme_workq, &dev->scan_work);
+ nvme_queue_scan(dev);
default:
dev_warn(dev->dev, "async event result %08x\n", result);
}
@@ -679,7 +690,10 @@
spin_lock_irq(&nvmeq->q_lock);
if (unlikely(nvmeq->cq_vector < 0)) {
- ret = BLK_MQ_RQ_QUEUE_BUSY;
+ if (ns && !test_bit(NVME_NS_DEAD, &ns->flags))
+ ret = BLK_MQ_RQ_QUEUE_BUSY;
+ else
+ ret = BLK_MQ_RQ_QUEUE_ERROR;
spin_unlock_irq(&nvmeq->q_lock);
goto out;
}
@@ -1250,6 +1264,12 @@
static void nvme_dev_remove_admin(struct nvme_dev *dev)
{
if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) {
+ /*
+ * If the controller was reset during removal, it's possible
+ * user requests may be waiting on a stopped queue. Start the
+ * queue to flush these to completion.
+ */
+ blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true);
blk_cleanup_queue(dev->ctrl.admin_q);
blk_mq_free_tag_set(&dev->admin_tagset);
}
@@ -1690,14 +1710,14 @@
return 0;
dev->ctrl.tagset = &dev->tagset;
}
- queue_work(nvme_workq, &dev->scan_work);
+ nvme_queue_scan(dev);
return 0;
}
-static int nvme_dev_map(struct nvme_dev *dev)
+static int nvme_pci_enable(struct nvme_dev *dev)
{
u64 cap;
- int bars, result = -ENOMEM;
+ int result = -ENOMEM;
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pci_enable_device_mem(pdev))
@@ -1705,24 +1725,14 @@
dev->entry[0].vector = pdev->irq;
pci_set_master(pdev);
- bars = pci_select_bars(pdev, IORESOURCE_MEM);
- if (!bars)
- goto disable_pci;
-
- if (pci_request_selected_regions(pdev, bars, "nvme"))
- goto disable_pci;
if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) &&
dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32)))
goto disable;
- dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
- if (!dev->bar)
- goto disable;
-
if (readl(dev->bar + NVME_REG_CSTS) == -1) {
result = -ENODEV;
- goto unmap;
+ goto disable;
}
/*
@@ -1732,7 +1742,7 @@
if (!pdev->irq) {
result = pci_enable_msix(pdev, dev->entry, 1);
if (result < 0)
- goto unmap;
+ goto disable;
}
cap = lo_hi_readq(dev->bar + NVME_REG_CAP);
@@ -1759,18 +1769,20 @@
pci_save_state(pdev);
return 0;
- unmap:
- iounmap(dev->bar);
- dev->bar = NULL;
disable:
- pci_release_regions(pdev);
- disable_pci:
pci_disable_device(pdev);
return result;
}
static void nvme_dev_unmap(struct nvme_dev *dev)
{
+ if (dev->bar)
+ iounmap(dev->bar);
+ pci_release_regions(to_pci_dev(dev->dev));
+}
+
+static void nvme_pci_disable(struct nvme_dev *dev)
+{
struct pci_dev *pdev = to_pci_dev(dev->dev);
if (pdev->msi_enabled)
@@ -1778,12 +1790,6 @@
else if (pdev->msix_enabled)
pci_disable_msix(pdev);
- if (dev->bar) {
- iounmap(dev->bar);
- dev->bar = NULL;
- pci_release_regions(pdev);
- }
-
if (pci_is_enabled(pdev)) {
pci_disable_pcie_error_reporting(pdev);
pci_disable_device(pdev);
@@ -1842,7 +1848,7 @@
nvme_dev_list_remove(dev);
mutex_lock(&dev->shutdown_lock);
- if (dev->bar) {
+ if (pci_is_enabled(to_pci_dev(dev->dev))) {
nvme_stop_queues(&dev->ctrl);
csts = readl(dev->bar + NVME_REG_CSTS);
}
@@ -1855,7 +1861,7 @@
nvme_disable_io_queues(dev);
nvme_disable_admin_queue(dev, shutdown);
}
- nvme_dev_unmap(dev);
+ nvme_pci_disable(dev);
for (i = dev->queue_count - 1; i >= 0; i--)
nvme_clear_queue(dev->queues[i]);
@@ -1899,10 +1905,20 @@
kfree(dev);
}
+static void nvme_remove_dead_ctrl(struct nvme_dev *dev, int status)
+{
+ dev_warn(dev->dev, "Removing after probe failure status: %d\n", status);
+
+ kref_get(&dev->ctrl.kref);
+ nvme_dev_disable(dev, false);
+ if (!schedule_work(&dev->remove_work))
+ nvme_put_ctrl(&dev->ctrl);
+}
+
static void nvme_reset_work(struct work_struct *work)
{
struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work);
- int result;
+ int result = -ENODEV;
if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags)))
goto out;
@@ -1911,37 +1927,37 @@
* If we're called to reset a live controller first shut it down before
* moving on.
*/
- if (dev->bar)
+ if (dev->ctrl.ctrl_config & NVME_CC_ENABLE)
nvme_dev_disable(dev, false);
set_bit(NVME_CTRL_RESETTING, &dev->flags);
- result = nvme_dev_map(dev);
+ result = nvme_pci_enable(dev);
if (result)
goto out;
result = nvme_configure_admin_queue(dev);
if (result)
- goto unmap;
+ goto out;
nvme_init_queue(dev->queues[0], 0);
result = nvme_alloc_admin_tags(dev);
if (result)
- goto disable;
+ goto out;
result = nvme_init_identify(&dev->ctrl);
if (result)
- goto free_tags;
+ goto out;
result = nvme_setup_io_queues(dev);
if (result)
- goto free_tags;
+ goto out;
dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS;
result = nvme_dev_list_add(dev);
if (result)
- goto remove;
+ goto out;
/*
* Keep the controller around but remove all namespaces if we don't have
@@ -1958,19 +1974,8 @@
clear_bit(NVME_CTRL_RESETTING, &dev->flags);
return;
- remove:
- nvme_dev_list_remove(dev);
- free_tags:
- nvme_dev_remove_admin(dev);
- blk_put_queue(dev->ctrl.admin_q);
- dev->ctrl.admin_q = NULL;
- dev->queues[0]->tags = NULL;
- disable:
- nvme_disable_admin_queue(dev, false);
- unmap:
- nvme_dev_unmap(dev);
out:
- nvme_remove_dead_ctrl(dev);
+ nvme_remove_dead_ctrl(dev, result);
}
static void nvme_remove_dead_ctrl_work(struct work_struct *work)
@@ -1978,19 +1983,12 @@
struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work);
struct pci_dev *pdev = to_pci_dev(dev->dev);
+ nvme_kill_queues(&dev->ctrl);
if (pci_get_drvdata(pdev))
pci_stop_and_remove_bus_device_locked(pdev);
nvme_put_ctrl(&dev->ctrl);
}
-static void nvme_remove_dead_ctrl(struct nvme_dev *dev)
-{
- dev_warn(dev->dev, "Removing after probe failure\n");
- kref_get(&dev->ctrl.kref);
- if (!schedule_work(&dev->remove_work))
- nvme_put_ctrl(&dev->ctrl);
-}
-
static int nvme_reset(struct nvme_dev *dev)
{
if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q))
@@ -2042,6 +2040,27 @@
.free_ctrl = nvme_pci_free_ctrl,
};
+static int nvme_dev_map(struct nvme_dev *dev)
+{
+ int bars;
+ struct pci_dev *pdev = to_pci_dev(dev->dev);
+
+ bars = pci_select_bars(pdev, IORESOURCE_MEM);
+ if (!bars)
+ return -ENODEV;
+ if (pci_request_selected_regions(pdev, bars, "nvme"))
+ return -ENODEV;
+
+ dev->bar = ioremap(pci_resource_start(pdev, 0), 8192);
+ if (!dev->bar)
+ goto release;
+
+ return 0;
+ release:
+ pci_release_regions(pdev);
+ return -ENODEV;
+}
+
static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id)
{
int node, result = -ENOMEM;
@@ -2066,6 +2085,10 @@
dev->dev = get_device(&pdev->dev);
pci_set_drvdata(pdev, dev);
+ result = nvme_dev_map(dev);
+ if (result)
+ goto free;
+
INIT_LIST_HEAD(&dev->node);
INIT_WORK(&dev->scan_work, nvme_dev_scan);
INIT_WORK(&dev->reset_work, nvme_reset_work);
@@ -2089,6 +2112,7 @@
nvme_release_prp_pools(dev);
put_pci:
put_device(dev->dev);
+ nvme_dev_unmap(dev);
free:
kfree(dev->queues);
kfree(dev->entry);
@@ -2112,10 +2136,16 @@
nvme_dev_disable(dev, true);
}
+/*
+ * The driver's remove may be called on a device in a partially initialized
+ * state. This function must not have any dependencies on the device state in
+ * order to proceed.
+ */
static void nvme_remove(struct pci_dev *pdev)
{
struct nvme_dev *dev = pci_get_drvdata(pdev);
+ set_bit(NVME_CTRL_REMOVING, &dev->flags);
pci_set_drvdata(pdev, NULL);
flush_work(&dev->scan_work);
nvme_remove_namespaces(&dev->ctrl);
@@ -2126,6 +2156,7 @@
nvme_free_queues(dev, 0);
nvme_release_cmb(dev);
nvme_release_prp_pools(dev);
+ nvme_dev_unmap(dev);
nvme_put_ctrl(&dev->ctrl);
}
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 1f76d89..5c46ed9 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -223,6 +223,9 @@
#define WB_FRN_HIST_MAX_SLOTS (WB_FRN_HIST_THR_SLOTS / 2 + 1)
/* one round can affect upto 5 slots */
+static atomic_t isw_nr_in_flight = ATOMIC_INIT(0);
+static struct workqueue_struct *isw_wq;
+
void __inode_attach_wb(struct inode *inode, struct page *page)
{
struct backing_dev_info *bdi = inode_to_bdi(inode);
@@ -317,7 +320,6 @@
struct inode_switch_wbs_context *isw =
container_of(work, struct inode_switch_wbs_context, work);
struct inode *inode = isw->inode;
- struct super_block *sb = inode->i_sb;
struct address_space *mapping = inode->i_mapping;
struct bdi_writeback *old_wb = inode->i_wb;
struct bdi_writeback *new_wb = isw->new_wb;
@@ -424,8 +426,9 @@
wb_put(new_wb);
iput(inode);
- deactivate_super(sb);
kfree(isw);
+
+ atomic_dec(&isw_nr_in_flight);
}
static void inode_switch_wbs_rcu_fn(struct rcu_head *rcu_head)
@@ -435,7 +438,7 @@
/* needs to grab bh-unsafe locks, bounce to work item */
INIT_WORK(&isw->work, inode_switch_wbs_work_fn);
- schedule_work(&isw->work);
+ queue_work(isw_wq, &isw->work);
}
/**
@@ -471,20 +474,20 @@
/* while holding I_WB_SWITCH, no one else can update the association */
spin_lock(&inode->i_lock);
-
- if (inode->i_state & (I_WB_SWITCH | I_FREEING) ||
- inode_to_wb(inode) == isw->new_wb)
- goto out_unlock;
-
- if (!atomic_inc_not_zero(&inode->i_sb->s_active))
- goto out_unlock;
-
+ if (!(inode->i_sb->s_flags & MS_ACTIVE) ||
+ inode->i_state & (I_WB_SWITCH | I_FREEING) ||
+ inode_to_wb(inode) == isw->new_wb) {
+ spin_unlock(&inode->i_lock);
+ goto out_free;
+ }
inode->i_state |= I_WB_SWITCH;
spin_unlock(&inode->i_lock);
ihold(inode);
isw->inode = inode;
+ atomic_inc(&isw_nr_in_flight);
+
/*
* In addition to synchronizing among switchers, I_WB_SWITCH tells
* the RCU protected stat update paths to grab the mapping's
@@ -494,8 +497,6 @@
call_rcu(&isw->rcu_head, inode_switch_wbs_rcu_fn);
return;
-out_unlock:
- spin_unlock(&inode->i_lock);
out_free:
if (isw->new_wb)
wb_put(isw->new_wb);
@@ -847,6 +848,33 @@
wb_put(last_wb);
}
+/**
+ * cgroup_writeback_umount - flush inode wb switches for umount
+ *
+ * This function is called when a super_block is about to be destroyed and
+ * flushes in-flight inode wb switches. An inode wb switch goes through
+ * RCU and then workqueue, so the two need to be flushed in order to ensure
+ * that all previously scheduled switches are finished. As wb switches are
+ * rare occurrences and synchronize_rcu() can take a while, perform
+ * flushing iff wb switches are in flight.
+ */
+void cgroup_writeback_umount(void)
+{
+ if (atomic_read(&isw_nr_in_flight)) {
+ synchronize_rcu();
+ flush_workqueue(isw_wq);
+ }
+}
+
+static int __init cgroup_writeback_init(void)
+{
+ isw_wq = alloc_workqueue("inode_switch_wbs", 0, 0);
+ if (!isw_wq)
+ return -ENOMEM;
+ return 0;
+}
+fs_initcall(cgroup_writeback_init);
+
#else /* CONFIG_CGROUP_WRITEBACK */
static struct bdi_writeback *
diff --git a/fs/super.c b/fs/super.c
index 1182af8..74914b1 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -415,6 +415,7 @@
sb->s_flags &= ~MS_ACTIVE;
fsnotify_unmount_inodes(sb);
+ cgroup_writeback_umount();
evict_inodes(sb);
diff --git a/include/linux/bio.h b/include/linux/bio.h
index 5349e68..cb68888 100644
--- a/include/linux/bio.h
+++ b/include/linux/bio.h
@@ -310,6 +310,43 @@
bio->bi_flags &= ~(1U << bit);
}
+static inline void bio_get_first_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ *bv = bio_iovec(bio);
+}
+
+static inline void bio_get_last_bvec(struct bio *bio, struct bio_vec *bv)
+{
+ struct bvec_iter iter = bio->bi_iter;
+ int idx;
+
+ if (!bio_flagged(bio, BIO_CLONED)) {
+ *bv = bio->bi_io_vec[bio->bi_vcnt - 1];
+ return;
+ }
+
+ if (unlikely(!bio_multiple_segments(bio))) {
+ *bv = bio_iovec(bio);
+ return;
+ }
+
+ bio_advance_iter(bio, &iter, iter.bi_size);
+
+ if (!iter.bi_bvec_done)
+ idx = iter.bi_idx - 1;
+ else /* in the middle of bvec */
+ idx = iter.bi_idx;
+
+ *bv = bio->bi_io_vec[idx];
+
+ /*
+ * iter.bi_bvec_done records actual length of the last bvec
+ * if this bio ends in the middle of one io vector
+ */
+ if (iter.bi_bvec_done)
+ bv->bv_len = iter.bi_bvec_done;
+}
+
enum bip_flags {
BIP_BLOCK_INTEGRITY = 1 << 0, /* block layer owns integrity data */
BIP_MAPPED_INTEGRITY = 1 << 1, /* ref tag has been remapped */
diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h
index 4571ef1..413c84f 100644
--- a/include/linux/blkdev.h
+++ b/include/linux/blkdev.h
@@ -895,7 +895,7 @@
{
struct request_queue *q = rq->q;
- if (unlikely(rq->cmd_type == REQ_TYPE_BLOCK_PC))
+ if (unlikely(rq->cmd_type != REQ_TYPE_FS))
return q->limits.max_hw_sectors;
if (!q->limits.chunk_sectors || (rq->cmd_flags & REQ_DISCARD))
@@ -1372,6 +1372,13 @@
page_cache_release(p.v);
}
+static inline bool __bvec_gap_to_prev(struct request_queue *q,
+ struct bio_vec *bprv, unsigned int offset)
+{
+ return offset ||
+ ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+}
+
/*
* Check if adding a bio_vec after bprv with offset would create a gap in
* the SG list. Most drivers don't care about this, but some do.
@@ -1381,18 +1388,22 @@
{
if (!queue_virt_boundary(q))
return false;
- return offset ||
- ((bprv->bv_offset + bprv->bv_len) & queue_virt_boundary(q));
+ return __bvec_gap_to_prev(q, bprv, offset);
}
static inline bool bio_will_gap(struct request_queue *q, struct bio *prev,
struct bio *next)
{
- if (!bio_has_data(prev))
- return false;
+ if (bio_has_data(prev) && queue_virt_boundary(q)) {
+ struct bio_vec pb, nb;
- return bvec_gap_to_prev(q, &prev->bi_io_vec[prev->bi_vcnt - 1],
- next->bi_io_vec[0].bv_offset);
+ bio_get_last_bvec(prev, &pb);
+ bio_get_first_bvec(next, &nb);
+
+ return __bvec_gap_to_prev(q, &pb, nb.bv_offset);
+ }
+
+ return false;
}
static inline bool req_gap_back_merge(struct request *req, struct bio *bio)
diff --git a/include/linux/writeback.h b/include/linux/writeback.h
index b333c94..d0b5ca5 100644
--- a/include/linux/writeback.h
+++ b/include/linux/writeback.h
@@ -198,6 +198,7 @@
void wbc_detach_inode(struct writeback_control *wbc);
void wbc_account_io(struct writeback_control *wbc, struct page *page,
size_t bytes);
+void cgroup_writeback_umount(void);
/**
* inode_attach_wb - associate an inode with its wb
@@ -301,6 +302,10 @@
{
}
+static inline void cgroup_writeback_umount(void)
+{
+}
+
#endif /* CONFIG_CGROUP_WRITEBACK */
/*