Merge git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm
* git://git.kernel.org/pub/scm/linux/kernel/git/agk/linux-2.6-dm:
dm kcopyd: fix callback race
dm kcopyd: prepare for callback race fix
dm: implement basic barrier support
dm: remove dm_request loop
dm: rework queueing and suspension
dm: simplify dm_request loop
dm: split DMF_BLOCK_IO flag into two
dm: rearrange dm_wq_work
dm: remove limited barrier support
dm: add integrity support
diff --git a/drivers/md/dm-ioctl.c b/drivers/md/dm-ioctl.c
index f010965..823ceba6 100644
--- a/drivers/md/dm-ioctl.c
+++ b/drivers/md/dm-ioctl.c
@@ -1047,6 +1047,19 @@
return dm_table_complete(table);
}
+static int table_prealloc_integrity(struct dm_table *t,
+ struct mapped_device *md)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *dd;
+
+ list_for_each_entry(dd, devices, list)
+ if (bdev_get_integrity(dd->dm_dev.bdev))
+ return blk_integrity_register(dm_disk(md), NULL);
+
+ return 0;
+}
+
static int table_load(struct dm_ioctl *param, size_t param_size)
{
int r;
@@ -1068,6 +1081,14 @@
goto out;
}
+ r = table_prealloc_integrity(t, md);
+ if (r) {
+ DMERR("%s: could not register integrity profile.",
+ dm_device_name(md));
+ dm_table_destroy(t);
+ goto out;
+ }
+
down_write(&_hash_lock);
hc = dm_get_mdptr(md);
if (!hc || hc->md != md) {
diff --git a/drivers/md/dm-kcopyd.c b/drivers/md/dm-kcopyd.c
index 0a225da..3e3fc06 100644
--- a/drivers/md/dm-kcopyd.c
+++ b/drivers/md/dm-kcopyd.c
@@ -297,7 +297,8 @@
dm_kcopyd_notify_fn fn = job->fn;
struct dm_kcopyd_client *kc = job->kc;
- kcopyd_put_pages(kc, job->pages);
+ if (job->pages)
+ kcopyd_put_pages(kc, job->pages);
mempool_free(job, kc->job_pool);
fn(read_err, write_err, context);
@@ -461,6 +462,7 @@
sector_t progress = 0;
sector_t count = 0;
struct kcopyd_job *job = (struct kcopyd_job *) context;
+ struct dm_kcopyd_client *kc = job->kc;
mutex_lock(&job->lock);
@@ -490,7 +492,7 @@
if (count) {
int i;
- struct kcopyd_job *sub_job = mempool_alloc(job->kc->job_pool,
+ struct kcopyd_job *sub_job = mempool_alloc(kc->job_pool,
GFP_NOIO);
*sub_job = *job;
@@ -509,13 +511,16 @@
} else if (atomic_dec_and_test(&job->sub_jobs)) {
/*
- * To avoid a race we must keep the job around
- * until after the notify function has completed.
- * Otherwise the client may try and stop the job
- * after we've completed.
+ * Queue the completion callback to the kcopyd thread.
+ *
+ * Some callers assume that all the completions are called
+ * from a single thread and don't race with each other.
+ *
+ * We must not call the callback directly here because this
+ * code may not be executing in the thread.
*/
- job->fn(read_err, write_err, job->context);
- mempool_free(job, job->kc->job_pool);
+ push(&kc->complete_jobs, job);
+ wake(kc);
}
}
@@ -528,6 +533,8 @@
{
int i;
+ atomic_inc(&job->kc->nr_jobs);
+
atomic_set(&job->sub_jobs, SPLIT_COUNT);
for (i = 0; i < SPLIT_COUNT; i++)
segment_complete(0, 0u, job);
diff --git a/drivers/md/dm-linear.c b/drivers/md/dm-linear.c
index bfa107f..79fb53e 100644
--- a/drivers/md/dm-linear.c
+++ b/drivers/md/dm-linear.c
@@ -142,7 +142,6 @@
.status = linear_status,
.ioctl = linear_ioctl,
.merge = linear_merge,
- .features = DM_TARGET_SUPPORTS_BARRIERS,
};
int __init dm_linear_init(void)
diff --git a/drivers/md/dm-table.c b/drivers/md/dm-table.c
index e8361b1..429b50b 100644
--- a/drivers/md/dm-table.c
+++ b/drivers/md/dm-table.c
@@ -52,8 +52,6 @@
sector_t *highs;
struct dm_target *targets;
- unsigned barriers_supported:1;
-
/*
* Indicates the rw permissions for the new logical
* device. This should be a combination of FMODE_READ
@@ -243,7 +241,6 @@
INIT_LIST_HEAD(&t->devices);
atomic_set(&t->holders, 0);
- t->barriers_supported = 1;
if (!num_targets)
num_targets = KEYS_PER_NODE;
@@ -751,10 +748,6 @@
/* FIXME: the plan is to combine high here and then have
* the merge fn apply the target level restrictions. */
combine_restrictions_low(&t->limits, &tgt->limits);
-
- if (!(tgt->type->features & DM_TARGET_SUPPORTS_BARRIERS))
- t->barriers_supported = 0;
-
return 0;
bad:
@@ -799,12 +792,6 @@
check_for_valid_limits(&t->limits);
- /*
- * We only support barriers if there is exactly one underlying device.
- */
- if (!list_is_singular(&t->devices))
- t->barriers_supported = 0;
-
/* how many indexes will the btree have ? */
leaf_nodes = dm_div_up(t->num_targets, KEYS_PER_NODE);
t->depth = 1 + int_log(leaf_nodes, CHILDREN_PER_NODE);
@@ -879,6 +866,45 @@
return &t->targets[(KEYS_PER_NODE * n) + k];
}
+/*
+ * Set the integrity profile for this device if all devices used have
+ * matching profiles.
+ */
+static void dm_table_set_integrity(struct dm_table *t)
+{
+ struct list_head *devices = dm_table_get_devices(t);
+ struct dm_dev_internal *prev = NULL, *dd = NULL;
+
+ if (!blk_get_integrity(dm_disk(t->md)))
+ return;
+
+ list_for_each_entry(dd, devices, list) {
+ if (prev &&
+ blk_integrity_compare(prev->dm_dev.bdev->bd_disk,
+ dd->dm_dev.bdev->bd_disk) < 0) {
+ DMWARN("%s: integrity not set: %s and %s mismatch",
+ dm_device_name(t->md),
+ prev->dm_dev.bdev->bd_disk->disk_name,
+ dd->dm_dev.bdev->bd_disk->disk_name);
+ goto no_integrity;
+ }
+ prev = dd;
+ }
+
+ if (!prev || !bdev_get_integrity(prev->dm_dev.bdev))
+ goto no_integrity;
+
+ blk_integrity_register(dm_disk(t->md),
+ bdev_get_integrity(prev->dm_dev.bdev));
+
+ return;
+
+no_integrity:
+ blk_integrity_register(dm_disk(t->md), NULL);
+
+ return;
+}
+
void dm_table_set_restrictions(struct dm_table *t, struct request_queue *q)
{
/*
@@ -899,6 +925,7 @@
else
queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, q);
+ dm_table_set_integrity(t);
}
unsigned int dm_table_get_num_targets(struct dm_table *t)
@@ -1019,12 +1046,6 @@
return t->md;
}
-int dm_table_barrier_ok(struct dm_table *t)
-{
- return t->barriers_supported;
-}
-EXPORT_SYMBOL(dm_table_barrier_ok);
-
EXPORT_SYMBOL(dm_vcalloc);
EXPORT_SYMBOL(dm_get_device);
EXPORT_SYMBOL(dm_put_device);
diff --git a/drivers/md/dm.c b/drivers/md/dm.c
index 788ba96..8a994be 100644
--- a/drivers/md/dm.c
+++ b/drivers/md/dm.c
@@ -89,12 +89,13 @@
/*
* Bits for the md->flags field.
*/
-#define DMF_BLOCK_IO 0
+#define DMF_BLOCK_IO_FOR_SUSPEND 0
#define DMF_SUSPENDED 1
#define DMF_FROZEN 2
#define DMF_FREEING 3
#define DMF_DELETING 4
#define DMF_NOFLUSH_SUSPENDING 5
+#define DMF_QUEUE_IO_TO_THREAD 6
/*
* Work processed by per-device workqueue.
@@ -124,6 +125,11 @@
spinlock_t deferred_lock;
/*
+ * An error from the barrier request currently being processed.
+ */
+ int barrier_error;
+
+ /*
* Processing queue (flush/barriers)
*/
struct workqueue_struct *wq;
@@ -424,6 +430,10 @@
part_stat_add(cpu, &dm_disk(md)->part0, ticks[rw], duration);
part_stat_unlock();
+ /*
+ * After this is decremented the bio must not be touched if it is
+ * a barrier.
+ */
dm_disk(md)->part0.in_flight = pending =
atomic_dec_return(&md->pending);
@@ -435,21 +445,18 @@
/*
* Add the bio to the list of deferred io.
*/
-static int queue_io(struct mapped_device *md, struct bio *bio)
+static void queue_io(struct mapped_device *md, struct bio *bio)
{
down_write(&md->io_lock);
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
- up_write(&md->io_lock);
- return 1;
- }
-
spin_lock_irq(&md->deferred_lock);
bio_list_add(&md->deferred, bio);
spin_unlock_irq(&md->deferred_lock);
+ if (!test_and_set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags))
+ queue_work(md->wq, &md->work);
+
up_write(&md->io_lock);
- return 0; /* deferred successfully */
}
/*
@@ -533,25 +540,35 @@
*/
spin_lock_irqsave(&md->deferred_lock, flags);
if (__noflush_suspending(md))
- bio_list_add(&md->deferred, io->bio);
+ bio_list_add_head(&md->deferred, io->bio);
else
/* noflush suspend was interrupted. */
io->error = -EIO;
spin_unlock_irqrestore(&md->deferred_lock, flags);
}
- end_io_acct(io);
-
io_error = io->error;
bio = io->bio;
- free_io(md, io);
+ if (bio_barrier(bio)) {
+ /*
+ * There can be just one barrier request so we use
+ * a per-device variable for error reporting.
+ * Note that you can't touch the bio after end_io_acct
+ */
+ md->barrier_error = io_error;
+ end_io_acct(io);
+ } else {
+ end_io_acct(io);
- if (io_error != DM_ENDIO_REQUEUE) {
- trace_block_bio_complete(md->queue, bio);
+ if (io_error != DM_ENDIO_REQUEUE) {
+ trace_block_bio_complete(md->queue, bio);
- bio_endio(bio, io_error);
+ bio_endio(bio, io_error);
+ }
}
+
+ free_io(md, io);
}
}
@@ -693,13 +710,19 @@
clone->bi_sector = sector;
clone->bi_bdev = bio->bi_bdev;
- clone->bi_rw = bio->bi_rw;
+ clone->bi_rw = bio->bi_rw & ~(1 << BIO_RW_BARRIER);
clone->bi_vcnt = 1;
clone->bi_size = to_bytes(len);
clone->bi_io_vec->bv_offset = offset;
clone->bi_io_vec->bv_len = clone->bi_size;
clone->bi_flags |= 1 << BIO_CLONED;
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, offset), len);
+ }
+
return clone;
}
@@ -714,6 +737,7 @@
clone = bio_alloc_bioset(GFP_NOIO, bio->bi_max_vecs, bs);
__bio_clone(clone, bio);
+ clone->bi_rw &= ~(1 << BIO_RW_BARRIER);
clone->bi_destructor = dm_bio_destructor;
clone->bi_sector = sector;
clone->bi_idx = idx;
@@ -721,6 +745,14 @@
clone->bi_size = to_bytes(len);
clone->bi_flags &= ~(1 << BIO_SEG_VALID);
+ if (bio_integrity(bio)) {
+ bio_integrity_clone(clone, bio, GFP_NOIO);
+
+ if (idx != bio->bi_idx || clone->bi_size < bio->bi_size)
+ bio_integrity_trim(clone,
+ bio_sector_offset(bio, idx, 0), len);
+ }
+
return clone;
}
@@ -834,14 +866,13 @@
ci.map = dm_get_table(md);
if (unlikely(!ci.map)) {
- bio_io_error(bio);
+ if (!bio_barrier(bio))
+ bio_io_error(bio);
+ else
+ md->barrier_error = -EIO;
return;
}
- if (unlikely(bio_barrier(bio) && !dm_table_barrier_ok(ci.map))) {
- dm_table_put(ci.map);
- bio_endio(bio, -EOPNOTSUPP);
- return;
- }
+
ci.md = md;
ci.bio = bio;
ci.io = alloc_io(md);
@@ -918,7 +949,6 @@
*/
static int dm_request(struct request_queue *q, struct bio *bio)
{
- int r = -EIO;
int rw = bio_data_dir(bio);
struct mapped_device *md = q->queuedata;
int cpu;
@@ -931,34 +961,27 @@
part_stat_unlock();
/*
- * If we're suspended we have to queue
- * this io for later.
+ * If we're suspended or the thread is processing barriers
+ * we have to queue this io for later.
*/
- while (test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (unlikely(test_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags)) ||
+ unlikely(bio_barrier(bio))) {
up_read(&md->io_lock);
- if (bio_rw(bio) != READA)
- r = queue_io(md, bio);
+ if (unlikely(test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) &&
+ bio_rw(bio) == READA) {
+ bio_io_error(bio);
+ return 0;
+ }
- if (r <= 0)
- goto out_req;
+ queue_io(md, bio);
- /*
- * We're in a while loop, because someone could suspend
- * before we get to the following read lock.
- */
- down_read(&md->io_lock);
+ return 0;
}
__split_and_process_bio(md, bio);
up_read(&md->io_lock);
return 0;
-
-out_req:
- if (r < 0)
- bio_io_error(bio);
-
- return 0;
}
static void dm_unplug_all(struct request_queue *q)
@@ -978,7 +1001,7 @@
struct mapped_device *md = congested_data;
struct dm_table *map;
- if (!test_bit(DMF_BLOCK_IO, &md->flags)) {
+ if (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
map = dm_get_table(md);
if (map) {
r = dm_table_any_congested(map, bdi_bits);
@@ -1193,6 +1216,7 @@
mempool_destroy(md->tio_pool);
mempool_destroy(md->io_pool);
bioset_free(md->bs);
+ blk_integrity_unregister(md->disk);
del_gendisk(md->disk);
free_minor(minor);
@@ -1406,6 +1430,36 @@
return r;
}
+static int dm_flush(struct mapped_device *md)
+{
+ dm_wait_for_completion(md, TASK_UNINTERRUPTIBLE);
+ return 0;
+}
+
+static void process_barrier(struct mapped_device *md, struct bio *bio)
+{
+ int error = dm_flush(md);
+
+ if (unlikely(error)) {
+ bio_endio(bio, error);
+ return;
+ }
+ if (bio_empty_barrier(bio)) {
+ bio_endio(bio, 0);
+ return;
+ }
+
+ __split_and_process_bio(md, bio);
+
+ error = dm_flush(md);
+
+ if (!error && md->barrier_error)
+ error = md->barrier_error;
+
+ if (md->barrier_error != DM_ENDIO_REQUEUE)
+ bio_endio(bio, error);
+}
+
/*
* Process the deferred bios
*/
@@ -1417,25 +1471,34 @@
down_write(&md->io_lock);
-next_bio:
- spin_lock_irq(&md->deferred_lock);
- c = bio_list_pop(&md->deferred);
- spin_unlock_irq(&md->deferred_lock);
+ while (!test_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags)) {
+ spin_lock_irq(&md->deferred_lock);
+ c = bio_list_pop(&md->deferred);
+ spin_unlock_irq(&md->deferred_lock);
- if (c) {
- __split_and_process_bio(md, c);
- goto next_bio;
+ if (!c) {
+ clear_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
+ break;
+ }
+
+ up_write(&md->io_lock);
+
+ if (bio_barrier(c))
+ process_barrier(md, c);
+ else
+ __split_and_process_bio(md, c);
+
+ down_write(&md->io_lock);
}
- clear_bit(DMF_BLOCK_IO, &md->flags);
-
up_write(&md->io_lock);
}
static void dm_queue_flush(struct mapped_device *md)
{
+ clear_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ smp_mb__after_clear_bit();
queue_work(md->wq, &md->work);
- flush_workqueue(md->wq);
}
/*
@@ -1553,20 +1616,36 @@
}
/*
- * First we set the BLOCK_IO flag so no more ios will be mapped.
+ * Here we must make sure that no processes are submitting requests
+ * to target drivers i.e. no one may be executing
+ * __split_and_process_bio. This is called from dm_request and
+ * dm_wq_work.
+ *
+ * To get all processes out of __split_and_process_bio in dm_request,
+ * we take the write lock. To prevent any process from reentering
+ * __split_and_process_bio from dm_request, we set
+ * DMF_QUEUE_IO_TO_THREAD.
+ *
+ * To quiesce the thread (dm_wq_work), we set DMF_BLOCK_IO_FOR_SUSPEND
+ * and call flush_workqueue(md->wq). flush_workqueue will wait until
+ * dm_wq_work exits and DMF_BLOCK_IO_FOR_SUSPEND will prevent any
+ * further calls to __split_and_process_bio from dm_wq_work.
*/
down_write(&md->io_lock);
- set_bit(DMF_BLOCK_IO, &md->flags);
-
+ set_bit(DMF_BLOCK_IO_FOR_SUSPEND, &md->flags);
+ set_bit(DMF_QUEUE_IO_TO_THREAD, &md->flags);
up_write(&md->io_lock);
+ flush_workqueue(md->wq);
+
/*
- * Wait for the already-mapped ios to complete.
+ * At this point no more requests are entering target request routines.
+ * We call dm_wait_for_completion to wait for all existing requests
+ * to finish.
*/
r = dm_wait_for_completion(md, TASK_INTERRUPTIBLE);
down_write(&md->io_lock);
-
if (noflush)
clear_bit(DMF_NOFLUSH_SUSPENDING, &md->flags);
up_write(&md->io_lock);
@@ -1579,6 +1658,12 @@
goto out; /* pushback list is already flushed, so skip flush */
}
+ /*
+ * If dm_wait_for_completion returned 0, the device is completely
+ * quiescent now. There is no request-processing activity. All new
+ * requests are being added to md->deferred list.
+ */
+
dm_table_postsuspend_targets(map);
set_bit(DMF_SUSPENDED, &md->flags);
diff --git a/drivers/md/dm.h b/drivers/md/dm.h
index b48397c..a31506d 100644
--- a/drivers/md/dm.h
+++ b/drivers/md/dm.h
@@ -52,7 +52,6 @@
* To check the return value from dm_table_find_target().
*/
#define dm_target_is_valid(t) ((t)->table)
-int dm_table_barrier_ok(struct dm_table *t);
/*-----------------------------------------------------------------
* A registry of target types.
diff --git a/include/linux/device-mapper.h b/include/linux/device-mapper.h
index 66ec05a..ded2d7c 100644
--- a/include/linux/device-mapper.h
+++ b/include/linux/device-mapper.h
@@ -116,7 +116,6 @@
/*
* Target features
*/
-#define DM_TARGET_SUPPORTS_BARRIERS 0x00000001
struct target_type {
uint64_t features;