Merge branch 'drbd-8.4_ed6' into for-3.8-drivers-drbd-8.4_ed6
diff --git a/drivers/block/drbd/Makefile b/drivers/block/drbd/Makefile
index 0d3f337..8b45033 100644
--- a/drivers/block/drbd/Makefile
+++ b/drivers/block/drbd/Makefile
@@ -1,5 +1,7 @@
drbd-y := drbd_bitmap.o drbd_proc.o
drbd-y += drbd_worker.o drbd_receiver.o drbd_req.o drbd_actlog.o
drbd-y += drbd_main.o drbd_strings.o drbd_nl.o
+drbd-y += drbd_interval.o drbd_state.o
+drbd-y += drbd_nla.o
obj-$(CONFIG_BLK_DEV_DRBD) += drbd.o
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index d4dd563..92510f8 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -24,21 +24,73 @@
*/
#include <linux/slab.h>
+#include <linux/crc32c.h>
#include <linux/drbd.h>
+#include <linux/drbd_limits.h>
+#include <linux/dynamic_debug.h>
#include "drbd_int.h"
#include "drbd_wrappers.h"
-/* We maintain a trivial checksum in our on disk activity log.
- * With that we can ensure correct operation even when the storage
- * device might do a partial (last) sector write while losing power.
- */
-struct __packed al_transaction {
- u32 magic;
- u32 tr_number;
- struct __packed {
- u32 pos;
- u32 extent; } updates[1 + AL_EXTENTS_PT];
- u32 xor_sum;
+
+enum al_transaction_types {
+ AL_TR_UPDATE = 0,
+ AL_TR_INITIALIZED = 0xffff
+};
+/* all fields on disc in big endian */
+struct __packed al_transaction_on_disk {
+ /* don't we all like magic */
+ __be32 magic;
+
+ /* to identify the most recent transaction block
+ * in the on disk ring buffer */
+ __be32 tr_number;
+
+ /* checksum on the full 4k block, with this field set to 0. */
+ __be32 crc32c;
+
+ /* type of transaction, special transaction types like:
+ * purge-all, set-all-idle, set-all-active, ... to-be-defined
+ * see also enum al_transaction_types */
+ __be16 transaction_type;
+
+ /* we currently allow only a few thousand extents,
+ * so 16bit will be enough for the slot number. */
+
+ /* how many updates in this transaction */
+ __be16 n_updates;
+
+ /* maximum slot number, "al-extents" in drbd.conf speak.
+ * Having this in each transaction should make reconfiguration
+ * of that parameter easier. */
+ __be16 context_size;
+
+ /* slot number the context starts with */
+ __be16 context_start_slot_nr;
+
+ /* Some reserved bytes. Expected usage is a 64bit counter of
+ * sectors-written since device creation, and other data generation tag
+ * supporting usage */
+ __be32 __reserved[4];
+
+ /* --- 36 byte used --- */
+
+ /* Reserve space for up to AL_UPDATES_PER_TRANSACTION changes
+ * in one transaction, then use the remaining byte in the 4k block for
+ * context information. "Flexible" number of updates per transaction
+ * does not help, as we have to account for the case when all update
+ * slots are used anyways, so it would only complicate code without
+ * additional benefit.
+ */
+ __be16 update_slot_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* but the extent number is 32bit, which at an extent size of 4 MiB
+ * allows to cover device sizes of up to 2**54 Byte (16 PiB) */
+ __be32 update_extent_nr[AL_UPDATES_PER_TRANSACTION];
+
+ /* --- 420 bytes used (36 + 64*6) --- */
+
+ /* 4096 - 420 = 3676 = 919 * 4 */
+ __be32 context[AL_CONTEXT_PER_TRANSACTION];
};
struct update_odbm_work {
@@ -48,22 +100,11 @@
struct update_al_work {
struct drbd_work w;
- struct lc_element *al_ext;
struct completion event;
- unsigned int enr;
- /* if old_enr != LC_FREE, write corresponding bitmap sector, too */
- unsigned int old_enr;
+ int err;
};
-struct drbd_atodb_wait {
- atomic_t count;
- struct completion io_done;
- struct drbd_conf *mdev;
- int error;
-};
-
-
-int w_al_write_transaction(struct drbd_conf *, struct drbd_work *, int);
+static int al_write_transaction(struct drbd_conf *mdev);
void *drbd_md_get_buffer(struct drbd_conf *mdev)
{
@@ -85,12 +126,17 @@
void wait_until_done_or_force_detached(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
unsigned int *done)
{
- long dt = bdev->dc.disk_timeout * HZ / 10;
+ long dt;
+
+ rcu_read_lock();
+ dt = rcu_dereference(bdev->disk_conf)->disk_timeout;
+ rcu_read_unlock();
+ dt = dt * HZ / 10;
if (dt == 0)
dt = MAX_SCHEDULE_TIMEOUT;
dt = wait_event_timeout(mdev->misc_wait,
- *done || drbd_test_flag(mdev, FORCE_DETACH), dt);
+ *done || test_bit(FORCE_DETACH, &mdev->flags), dt);
if (dt == 0) {
dev_err(DEV, "meta-data IO operation timed out\n");
drbd_chk_io_error(mdev, 1, DRBD_FORCE_DETACH);
@@ -103,20 +149,20 @@
int rw, int size)
{
struct bio *bio;
- int ok;
+ int err;
mdev->md_io.done = 0;
mdev->md_io.error = -ENODEV;
- if ((rw & WRITE) && !drbd_test_flag(mdev, MD_NO_FUA))
+ if ((rw & WRITE) && !test_bit(MD_NO_FUA, &mdev->flags))
rw |= REQ_FUA | REQ_FLUSH;
rw |= REQ_SYNC;
bio = bio_alloc_drbd(GFP_NOIO);
bio->bi_bdev = bdev->md_bdev;
bio->bi_sector = sector;
- ok = (bio_add_page(bio, page, size, 0) == size);
- if (!ok)
+ err = -EIO;
+ if (bio_add_page(bio, page, size, 0) != size)
goto out;
bio->bi_private = &mdev->md_io;
bio->bi_end_io = drbd_md_io_complete;
@@ -124,7 +170,7 @@
if (!get_ldev_if_state(mdev, D_ATTACHING)) { /* Corresponding put_ldev in drbd_md_io_complete() */
dev_err(DEV, "ASSERT FAILED: get_ldev_if_state() == 1 in _drbd_md_sync_page_io()\n");
- ok = 0;
+ err = -ENODEV;
goto out;
}
@@ -135,85 +181,46 @@
else
submit_bio(rw, bio);
wait_until_done_or_force_detached(mdev, bdev, &mdev->md_io.done);
- ok = bio_flagged(bio, BIO_UPTODATE) && mdev->md_io.error == 0;
+ if (bio_flagged(bio, BIO_UPTODATE))
+ err = mdev->md_io.error;
out:
bio_put(bio);
- return ok;
+ return err;
}
int drbd_md_sync_page_io(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
sector_t sector, int rw)
{
- int logical_block_size, mask, ok;
- int offset = 0;
+ int err;
struct page *iop = mdev->md_io_page;
D_ASSERT(atomic_read(&mdev->md_io_in_use) == 1);
BUG_ON(!bdev->md_bdev);
- logical_block_size = bdev_logical_block_size(bdev->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- /* in case logical_block_size != 512 [ s390 only? ] */
- if (logical_block_size != MD_SECTOR_SIZE) {
- mask = (logical_block_size / MD_SECTOR_SIZE) - 1;
- D_ASSERT(mask == 1 || mask == 3 || mask == 7);
- D_ASSERT(logical_block_size == (mask+1) * MD_SECTOR_SIZE);
- offset = sector & mask;
- sector = sector & ~mask;
- iop = mdev->md_io_tmpp;
-
- if (rw & WRITE) {
- /* these are GFP_KERNEL pages, pre-allocated
- * on device initialization */
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector,
- READ, logical_block_size);
-
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,"
- "READ [logical_block_size!=512]) failed!\n",
- (unsigned long long)sector);
- return 0;
- }
-
- memcpy(hp + offset*MD_SECTOR_SIZE, p, MD_SECTOR_SIZE);
- }
- }
+ dev_dbg(DEV, "meta_data io: %s [%d]:%s(,%llus,%s)\n",
+ current->comm, current->pid, __func__,
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
if (sector < drbd_md_first_sector(bdev) ||
- sector > drbd_md_last_sector(bdev))
+ sector + 7 > drbd_md_last_sector(bdev))
dev_alert(DEV, "%s [%d]:%s(,%llus,%s) out of range md access!\n",
current->comm, current->pid, __func__,
(unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- ok = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, logical_block_size);
- if (unlikely(!ok)) {
- dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed!\n",
- (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ");
- return 0;
+ err = _drbd_md_sync_page_io(mdev, bdev, iop, sector, rw, MD_BLOCK_SIZE);
+ if (err) {
+ dev_err(DEV, "drbd_md_sync_page_io(,%llus,%s) failed with error %d\n",
+ (unsigned long long)sector, (rw & WRITE) ? "WRITE" : "READ", err);
}
-
- if (logical_block_size != MD_SECTOR_SIZE && !(rw & WRITE)) {
- void *p = page_address(mdev->md_io_page);
- void *hp = page_address(mdev->md_io_tmpp);
-
- memcpy(p, hp + offset*MD_SECTOR_SIZE, MD_SECTOR_SIZE);
- }
-
- return ok;
+ return err;
}
static struct lc_element *_al_get(struct drbd_conf *mdev, unsigned int enr)
{
struct lc_element *al_ext;
struct lc_element *tmp;
- unsigned long al_flags = 0;
int wake;
spin_lock_irq(&mdev->al_lock);
@@ -228,76 +235,92 @@
return NULL;
}
}
- al_ext = lc_get(mdev->act_log, enr);
- al_flags = mdev->act_log->flags;
+ al_ext = lc_get(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
-
- /*
- if (!al_ext) {
- if (al_flags & LC_STARVING)
- dev_warn(DEV, "Have to wait for LRU element (AL too small?)\n");
- if (al_flags & LC_DIRTY)
- dev_warn(DEV, "Ongoing AL update (AL device too slow?)\n");
- }
- */
-
return al_ext;
}
-void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector)
+void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
- struct lc_element *al_ext;
- struct update_al_work al_work;
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
+ bool locked = false;
+
+ D_ASSERT(first <= last);
D_ASSERT(atomic_read(&mdev->local_cnt) > 0);
- wait_event(mdev->al_wait, (al_ext = _al_get(mdev, enr)));
+ for (enr = first; enr <= last; enr++)
+ wait_event(mdev->al_wait, _al_get(mdev, enr) != NULL);
- if (al_ext->lc_number != enr) {
+ /* Serialize multiple transactions.
+ * This uses test_and_set_bit, memory barrier is implicit.
+ */
+ wait_event(mdev->al_wait,
+ mdev->act_log->pending_changes == 0 ||
+ (locked = lc_try_lock_for_transaction(mdev->act_log)));
+
+ if (locked) {
/* drbd_al_write_transaction(mdev,al_ext,enr);
* recurses into generic_make_request(), which
* disallows recursion, bios being serialized on the
* current->bio_tail list now.
* we have to delegate updates to the activity log
* to the worker thread. */
- init_completion(&al_work.event);
- al_work.al_ext = al_ext;
- al_work.enr = enr;
- al_work.old_enr = al_ext->lc_number;
- al_work.w.cb = w_al_write_transaction;
- drbd_queue_work_front(&mdev->data.work, &al_work.w);
- wait_for_completion(&al_work.event);
- mdev->al_writ_cnt++;
+ /* Double check: it may have been committed by someone else,
+ * while we have been waiting for the lock. */
+ if (mdev->act_log->pending_changes) {
+ bool write_al_updates;
- spin_lock_irq(&mdev->al_lock);
- lc_changed(mdev->act_log, al_ext);
- spin_unlock_irq(&mdev->al_lock);
+ rcu_read_lock();
+ write_al_updates = rcu_dereference(mdev->ldev->disk_conf)->al_updates;
+ rcu_read_unlock();
+
+ if (write_al_updates) {
+ al_write_transaction(mdev);
+ mdev->al_writ_cnt++;
+ }
+
+ spin_lock_irq(&mdev->al_lock);
+ /* FIXME
+ if (err)
+ we need an "lc_cancel" here;
+ */
+ lc_committed(mdev->act_log);
+ spin_unlock_irq(&mdev->al_lock);
+ }
+ lc_unlock(mdev->act_log);
wake_up(&mdev->al_wait);
}
}
-void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector)
+void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i)
{
- unsigned int enr = (sector >> (AL_EXTENT_SHIFT-9));
+ /* for bios crossing activity log extent boundaries,
+ * we may need to activate two extents in one go */
+ unsigned first = i->sector >> (AL_EXTENT_SHIFT-9);
+ unsigned last = i->size == 0 ? first : (i->sector + (i->size >> 9) - 1) >> (AL_EXTENT_SHIFT-9);
+ unsigned enr;
struct lc_element *extent;
unsigned long flags;
+ D_ASSERT(first <= last);
spin_lock_irqsave(&mdev->al_lock, flags);
- extent = lc_find(mdev->act_log, enr);
-
- if (!extent) {
- spin_unlock_irqrestore(&mdev->al_lock, flags);
- dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
- return;
+ for (enr = first; enr <= last; enr++) {
+ extent = lc_find(mdev->act_log, enr);
+ if (!extent) {
+ dev_err(DEV, "al_complete_io() called on inactive extent %u\n", enr);
+ continue;
+ }
+ lc_put(mdev->act_log, extent);
}
-
- if (lc_put(mdev->act_log, extent) == 0)
- wake_up(&mdev->al_wait);
-
spin_unlock_irqrestore(&mdev->al_lock, flags);
+ wake_up(&mdev->al_wait);
}
#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)
@@ -323,296 +346,148 @@
return rs_enr >>
/* bit to page */
((PAGE_SHIFT + 3) -
- /* al extent number to bit */
+ /* resync extent number to bit */
(BM_EXT_SHIFT - BM_BLOCK_SHIFT));
}
-int
-w_al_write_transaction(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int
+_al_write_transaction(struct drbd_conf *mdev)
{
- struct update_al_work *aw = container_of(w, struct update_al_work, w);
- struct lc_element *updated = aw->al_ext;
- const unsigned int new_enr = aw->enr;
- const unsigned int evicted = aw->old_enr;
- struct al_transaction *buffer;
+ struct al_transaction_on_disk *buffer;
+ struct lc_element *e;
sector_t sector;
- int i, n, mx;
- unsigned int extent_nr;
- u32 xor_sum = 0;
+ int i, mx;
+ unsigned extent_nr;
+ unsigned crc = 0;
+ int err = 0;
if (!get_ldev(mdev)) {
- dev_err(DEV,
- "disk is %s, cannot start al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
- return 1;
+ dev_err(DEV, "disk is %s, cannot start al transaction\n",
+ drbd_disk_str(mdev->state.disk));
+ return -EIO;
}
- /* do we have to do a bitmap write, first?
- * TODO reduce maximum latency:
- * submit both bios, then wait for both,
- * instead of doing two synchronous sector writes.
- * For now, we must not write the transaction,
- * if we cannot write out the bitmap of the evicted extent. */
- if (mdev->state.conn < C_CONNECTED && evicted != LC_FREE)
- drbd_bm_write_page(mdev, al_extent_to_bm_page(evicted));
/* The bitmap write may have failed, causing a state change. */
if (mdev->state.disk < D_INCONSISTENT) {
dev_err(DEV,
- "disk is %s, cannot write al transaction (-%d +%d)\n",
- drbd_disk_str(mdev->state.disk), evicted, new_enr);
- complete(&((struct update_al_work *)w)->event);
+ "disk is %s, cannot write al transaction\n",
+ drbd_disk_str(mdev->state.disk));
put_ldev(mdev);
- return 1;
+ return -EIO;
}
buffer = drbd_md_get_buffer(mdev); /* protects md_io_buffer, al_tr_cycle, ... */
if (!buffer) {
dev_err(DEV, "disk failed while waiting for md_io buffer\n");
- complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
- return 1;
+ return -ENODEV;
}
- buffer->magic = __constant_cpu_to_be32(DRBD_MAGIC);
+ memset(buffer, 0, sizeof(*buffer));
+ buffer->magic = cpu_to_be32(DRBD_AL_MAGIC);
buffer->tr_number = cpu_to_be32(mdev->al_tr_number);
- n = lc_index_of(mdev->act_log, updated);
+ i = 0;
- buffer->updates[0].pos = cpu_to_be32(n);
- buffer->updates[0].extent = cpu_to_be32(new_enr);
+ /* Even though no one can start to change this list
+ * once we set the LC_LOCKED -- from drbd_al_begin_io(),
+ * lc_try_lock_for_transaction() --, someone may still
+ * be in the process of changing it. */
+ spin_lock_irq(&mdev->al_lock);
+ list_for_each_entry(e, &mdev->act_log->to_be_changed, list) {
+ if (i == AL_UPDATES_PER_TRANSACTION) {
+ i++;
+ break;
+ }
+ buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index);
+ buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number);
+ if (e->lc_number != LC_FREE)
+ drbd_bm_mark_for_writeout(mdev,
+ al_extent_to_bm_page(e->lc_number));
+ i++;
+ }
+ spin_unlock_irq(&mdev->al_lock);
+ BUG_ON(i > AL_UPDATES_PER_TRANSACTION);
- xor_sum ^= new_enr;
+ buffer->n_updates = cpu_to_be16(i);
+ for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) {
+ buffer->update_slot_nr[i] = cpu_to_be16(-1);
+ buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE);
+ }
- mx = min_t(int, AL_EXTENTS_PT,
+ buffer->context_size = cpu_to_be16(mdev->act_log->nr_elements);
+ buffer->context_start_slot_nr = cpu_to_be16(mdev->al_tr_cycle);
+
+ mx = min_t(int, AL_CONTEXT_PER_TRANSACTION,
mdev->act_log->nr_elements - mdev->al_tr_cycle);
for (i = 0; i < mx; i++) {
unsigned idx = mdev->al_tr_cycle + i;
extent_nr = lc_element_by_index(mdev->act_log, idx)->lc_number;
- buffer->updates[i+1].pos = cpu_to_be32(idx);
- buffer->updates[i+1].extent = cpu_to_be32(extent_nr);
- xor_sum ^= extent_nr;
+ buffer->context[i] = cpu_to_be32(extent_nr);
}
- for (; i < AL_EXTENTS_PT; i++) {
- buffer->updates[i+1].pos = __constant_cpu_to_be32(-1);
- buffer->updates[i+1].extent = __constant_cpu_to_be32(LC_FREE);
- xor_sum ^= LC_FREE;
- }
- mdev->al_tr_cycle += AL_EXTENTS_PT;
+ for (; i < AL_CONTEXT_PER_TRANSACTION; i++)
+ buffer->context[i] = cpu_to_be32(LC_FREE);
+
+ mdev->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION;
if (mdev->al_tr_cycle >= mdev->act_log->nr_elements)
mdev->al_tr_cycle = 0;
- buffer->xor_sum = cpu_to_be32(xor_sum);
-
sector = mdev->ldev->md.md_offset
- + mdev->ldev->md.al_offset + mdev->al_tr_pos;
+ + mdev->ldev->md.al_offset
+ + mdev->al_tr_pos * (MD_BLOCK_SIZE>>9);
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE))
+ crc = crc32c(0, buffer, 4096);
+ buffer->crc32c = cpu_to_be32(crc);
+
+ if (drbd_bm_write_hinted(mdev))
+ err = -EIO;
+ /* drbd_chk_io_error done already */
+ else if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ err = -EIO;
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
-
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
-
- D_ASSERT(mdev->al_tr_pos < MD_AL_MAX_SIZE);
- mdev->al_tr_number++;
+ } else {
+ /* advance ringbuffer position and transaction counter */
+ mdev->al_tr_pos = (mdev->al_tr_pos + 1) % (MD_AL_SECTORS*512/MD_BLOCK_SIZE);
+ mdev->al_tr_number++;
+ }
drbd_md_put_buffer(mdev);
-
- complete(&((struct update_al_work *)w)->event);
put_ldev(mdev);
- return 1;
+ return err;
}
-/**
- * drbd_al_read_tr() - Read a single transaction from the on disk activity log
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- * @b: pointer to an al_transaction.
- * @index: On disk slot of the transaction to read.
- *
- * Returns -1 on IO error, 0 on checksum error and 1 upon success.
- */
-static int drbd_al_read_tr(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev,
- struct al_transaction *b,
- int index)
+
+static int w_al_write_transaction(struct drbd_work *w, int unused)
{
- sector_t sector;
- int rv, i;
- u32 xor_sum = 0;
+ struct update_al_work *aw = container_of(w, struct update_al_work, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
- sector = bdev->md.md_offset + bdev->md.al_offset + index;
+ err = _al_write_transaction(mdev);
+ aw->err = err;
+ complete(&aw->event);
- /* Dont process error normally,
- * as this is done before disk is attached! */
- if (!drbd_md_sync_page_io(mdev, bdev, sector, READ))
- return -1;
-
- rv = (be32_to_cpu(b->magic) == DRBD_MAGIC);
-
- for (i = 0; i < AL_EXTENTS_PT + 1; i++)
- xor_sum ^= be32_to_cpu(b->updates[i].extent);
- rv &= (xor_sum == be32_to_cpu(b->xor_sum));
-
- return rv;
+ return err != -EIO ? err : 0;
}
-/**
- * drbd_al_read_log() - Restores the activity log from its on disk representation.
- * @mdev: DRBD device.
- * @bdev: Block device to read form.
- *
- * Returns 1 on success, returns 0 when reading the log failed due to IO errors.
- */
-int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
+/* Calls from worker context (see w_restart_disk_io()) need to write the
+ transaction directly. Others came through generic_make_request(),
+ those need to delegate it to the worker. */
+static int al_write_transaction(struct drbd_conf *mdev)
{
- struct al_transaction *buffer;
- int i;
- int rv;
- int mx;
- int active_extents = 0;
- int transactions = 0;
- int found_valid = 0;
- int from = 0;
- int to = 0;
- u32 from_tnr = 0;
- u32 to_tnr = 0;
- u32 cnr;
+ struct update_al_work al_work;
- mx = div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT);
+ if (current == mdev->tconn->worker.task)
+ return _al_write_transaction(mdev);
- /* lock out all other meta data io for now,
- * and make sure the page is mapped.
- */
- buffer = drbd_md_get_buffer(mdev);
- if (!buffer)
- return 0;
+ init_completion(&al_work.event);
+ al_work.w.cb = w_al_write_transaction;
+ al_work.w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &al_work.w);
+ wait_for_completion(&al_work.event);
- /* Find the valid transaction in the log */
- for (i = 0; i <= mx; i++) {
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- if (rv == 0)
- continue;
- if (rv == -1) {
- drbd_md_put_buffer(mdev);
- return 0;
- }
- cnr = be32_to_cpu(buffer->tr_number);
-
- if (++found_valid == 1) {
- from = i;
- to = i;
- from_tnr = cnr;
- to_tnr = cnr;
- continue;
- }
- if ((int)cnr - (int)from_tnr < 0) {
- D_ASSERT(from_tnr - cnr + i - from == mx+1);
- from = i;
- from_tnr = cnr;
- }
- if ((int)cnr - (int)to_tnr > 0) {
- D_ASSERT(cnr - to_tnr == i - to);
- to = i;
- to_tnr = cnr;
- }
- }
-
- if (!found_valid) {
- dev_warn(DEV, "No usable activity log found.\n");
- drbd_md_put_buffer(mdev);
- return 1;
- }
-
- /* Read the valid transactions.
- * dev_info(DEV, "Reading from %d to %d.\n",from,to); */
- i = from;
- while (1) {
- int j, pos;
- unsigned int extent_nr;
- unsigned int trn;
-
- rv = drbd_al_read_tr(mdev, bdev, buffer, i);
- ERR_IF(rv == 0) goto cancel;
- if (rv == -1) {
- drbd_md_put_buffer(mdev);
- return 0;
- }
-
- trn = be32_to_cpu(buffer->tr_number);
-
- spin_lock_irq(&mdev->al_lock);
-
- /* This loop runs backwards because in the cyclic
- elements there might be an old version of the
- updated element (in slot 0). So the element in slot 0
- can overwrite old versions. */
- for (j = AL_EXTENTS_PT; j >= 0; j--) {
- pos = be32_to_cpu(buffer->updates[j].pos);
- extent_nr = be32_to_cpu(buffer->updates[j].extent);
-
- if (extent_nr == LC_FREE)
- continue;
-
- lc_set(mdev->act_log, extent_nr, pos);
- active_extents++;
- }
- spin_unlock_irq(&mdev->al_lock);
-
- transactions++;
-
-cancel:
- if (i == to)
- break;
- i++;
- if (i > mx)
- i = 0;
- }
-
- mdev->al_tr_number = to_tnr+1;
- mdev->al_tr_pos = to;
- if (++mdev->al_tr_pos >
- div_ceil(mdev->act_log->nr_elements, AL_EXTENTS_PT))
- mdev->al_tr_pos = 0;
-
- /* ok, we are done with it */
- drbd_md_put_buffer(mdev);
-
- dev_info(DEV, "Found %d transactions (%d active extents) in activity log.\n",
- transactions, active_extents);
-
- return 1;
-}
-
-/**
- * drbd_al_apply_to_bm() - Sets the bitmap to diry(1) where covered ba active AL extents
- * @mdev: DRBD device.
- */
-void drbd_al_apply_to_bm(struct drbd_conf *mdev)
-{
- unsigned int enr;
- unsigned long add = 0;
- char ppb[10];
- int i, tmp;
-
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
-
- for (i = 0; i < mdev->act_log->nr_elements; i++) {
- enr = lc_element_by_index(mdev->act_log, i)->lc_number;
- if (enr == LC_FREE)
- continue;
- tmp = drbd_bm_ALe_set_all(mdev, enr);
- dynamic_dev_dbg(DEV, "AL: set %d bits in extent %u\n", tmp, enr);
- add += tmp;
- }
-
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
-
- dev_info(DEV, "Marked additional %s as out-of-sync based on AL.\n",
- ppsize(ppb, Bit2KB(add)));
+ return al_work.err;
}
static int _try_lc_del(struct drbd_conf *mdev, struct lc_element *al_ext)
@@ -642,7 +517,7 @@
struct lc_element *al_ext;
int i;
- D_ASSERT(test_bit(__LC_DIRTY, &mdev->act_log->flags));
+ D_ASSERT(test_bit(__LC_LOCKED, &mdev->act_log->flags));
for (i = 0; i < mdev->act_log->nr_elements; i++) {
al_ext = lc_element_by_index(mdev->act_log, i);
@@ -654,15 +529,17 @@
wake_up(&mdev->al_wait);
}
-static int w_update_odbm(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_update_odbm(struct drbd_work *w, int unused)
{
struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
+ struct drbd_conf *mdev = w->mdev;
+ struct sib_info sib = { .sib_reason = SIB_SYNC_PROGRESS, };
if (!get_ldev(mdev)) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "Can not update on disk bitmap, local IO disabled.\n");
kfree(udw);
- return 1;
+ return 0;
}
drbd_bm_write_page(mdev, rs_extent_to_bm_page(udw->enr));
@@ -680,9 +557,9 @@
break;
}
}
- drbd_bcast_sync_progress(mdev);
+ drbd_bcast_event(mdev, &sib);
- return 1;
+ return 0;
}
@@ -752,7 +629,9 @@
}
ext->rs_left = rs_left;
ext->rs_failed = success ? 0 : count;
- lc_changed(mdev->resync, &ext->lce);
+ /* we don't keep a persistent log of the resync lru,
+ * we can commit any change right away. */
+ lc_committed(mdev->resync);
}
lc_put(mdev->resync, &ext->lce);
/* no race, we are within the al_lock! */
@@ -764,7 +643,8 @@
if (udw) {
udw->enr = ext->lce.lc_number;
udw->w.cb = w_update_odbm;
- drbd_queue_work_front(&mdev->data.work, &udw->w);
+ udw->w.mdev = mdev;
+ drbd_queue_work_front(&mdev->tconn->sender_work, &udw->w);
} else {
dev_warn(DEV, "Could not kmalloc an udw\n");
}
@@ -810,16 +690,22 @@
int wake_up = 0;
unsigned long flags;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_set_in_sync: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
}
+
+ if (!get_ldev(mdev))
+ return; /* no disk, no metadata, no bitmap to clear bits in */
+
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ goto out;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
@@ -827,7 +713,7 @@
* round up start sector, round down end sector. we make sure we only
* clear full, aligned, BM_BLOCK_SIZE (4K) blocks */
if (unlikely(esector < BM_SECT_PER_BIT-1))
- return;
+ goto out;
if (unlikely(esector == (nr_sectors-1)))
ebnr = lbnr;
else
@@ -835,14 +721,14 @@
sbnr = BM_SECT_TO_BIT(sector + BM_SECT_PER_BIT-1);
if (sbnr > ebnr)
- return;
+ goto out;
/*
* ok, (capacity & 7) != 0 sometimes, but who cares...
* we count rs_{total,left} in bits, not sectors.
*/
count = drbd_bm_clear_bits(mdev, sbnr, ebnr);
- if (count && get_ldev(mdev)) {
+ if (count) {
drbd_advance_rs_marks(mdev, drbd_bm_total_weight(mdev));
spin_lock_irqsave(&mdev->al_lock, flags);
drbd_try_clear_on_disk_bm(mdev, sector, count, true);
@@ -851,8 +737,9 @@
/* just wake_up unconditional now, various lc_chaged(),
* lc_put() in drbd_try_clear_on_disk_bm(). */
wake_up = 1;
- put_ldev(mdev);
}
+out:
+ put_ldev(mdev);
if (wake_up)
wake_up(&mdev->al_wait);
}
@@ -868,7 +755,7 @@
int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector, int size,
const char *file, const unsigned int line)
{
- unsigned long sbnr, ebnr, lbnr, flags;
+ unsigned long sbnr, ebnr, flags;
sector_t esector, nr_sectors;
unsigned int enr, count = 0;
struct lc_element *e;
@@ -877,7 +764,7 @@
if (size == 0)
return 0;
- if (size < 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size < 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "sector: %llus, size: %d\n",
(unsigned long long)sector, size);
return 0;
@@ -889,12 +776,10 @@
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors)
+ if (!expect(sector < nr_sectors))
goto out;
- ERR_IF(esector >= nr_sectors)
- esector = (nr_sectors-1);
-
- lbnr = BM_SECT_TO_BIT(nr_sectors-1);
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
/* we set it out of sync,
* we do not need to round anything here */
@@ -937,7 +822,7 @@
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
+ lc_committed(mdev->resync);
wakeup = 1;
}
if (bm_ext->lce.refcnt == 1)
@@ -953,7 +838,7 @@
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
}
return bm_ext;
@@ -961,26 +846,12 @@
static int _is_in_al(struct drbd_conf *mdev, unsigned int enr)
{
- struct lc_element *al_ext;
- int rv = 0;
+ int rv;
spin_lock_irq(&mdev->al_lock);
- if (unlikely(enr == mdev->act_log->new_number))
- rv = 1;
- else {
- al_ext = lc_find(mdev->act_log, enr);
- if (al_ext) {
- if (al_ext->refcnt)
- rv = 1;
- }
- }
+ rv = lc_is_used(mdev->act_log, enr);
spin_unlock_irq(&mdev->al_lock);
- /*
- if (unlikely(rv)) {
- dev_info(DEV, "Delaying sync read until app's write is done\n");
- }
- */
return rv;
}
@@ -1110,13 +981,13 @@
if (rs_flags & LC_STARVING)
dev_warn(DEV, "Have to wait for element"
" (resync LRU too small?)\n");
- BUG_ON(rs_flags & LC_DIRTY);
+ BUG_ON(rs_flags & LC_LOCKED);
goto try_again;
}
if (bm_ext->lce.lc_number != enr) {
bm_ext->rs_left = drbd_bm_e_weight(mdev, enr);
bm_ext->rs_failed = 0;
- lc_changed(mdev->resync, &bm_ext->lce);
+ lc_committed(mdev->resync);
wake_up(&mdev->al_wait);
D_ASSERT(test_bit(BME_LOCKED, &bm_ext->flags) == 0);
}
@@ -1127,8 +998,6 @@
}
check_al:
for (i = 0; i < AL_EXT_PER_BM_SECT; i++) {
- if (unlikely(al_enr+i == mdev->act_log->new_number))
- goto try_again;
if (lc_is_used(mdev->act_log, al_enr+i))
goto try_again;
}
@@ -1263,7 +1132,7 @@
sector_t esector, nr_sectors;
int wake_up = 0;
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "drbd_rs_failed_io: sector=%llus size=%d nonsense!\n",
(unsigned long long)sector, size);
return;
@@ -1271,8 +1140,10 @@
nr_sectors = drbd_get_capacity(mdev->this_bdev);
esector = sector + (size >> 9) - 1;
- ERR_IF(sector >= nr_sectors) return;
- ERR_IF(esector >= nr_sectors) esector = (nr_sectors-1);
+ if (!expect(sector < nr_sectors))
+ return;
+ if (!expect(esector < nr_sectors))
+ esector = nr_sectors - 1;
lbnr = BM_SECT_TO_BIT(nr_sectors-1);
diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index 8d80697..1ab205a 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -119,13 +119,9 @@
if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_err(DEV, "FIXME %s in %s, bitmap locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- func, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_task_to_thread_name(mdev->tconn, current),
+ func, b->bm_why ?: "?",
+ drbd_task_to_thread_name(mdev->tconn, b->bm_task));
}
void drbd_bm_lock(struct drbd_conf *mdev, char *why, enum bm_flag flags)
@@ -142,13 +138,9 @@
if (trylock_failed) {
dev_warn(DEV, "%s going to '%s' but bitmap already locked for '%s' by %s\n",
- current == mdev->receiver.task ? "receiver" :
- current == mdev->asender.task ? "asender" :
- current == mdev->worker.task ? "worker" : current->comm,
- why, b->bm_why ?: "?",
- b->bm_task == mdev->receiver.task ? "receiver" :
- b->bm_task == mdev->asender.task ? "asender" :
- b->bm_task == mdev->worker.task ? "worker" : "?");
+ drbd_task_to_thread_name(mdev->tconn, current),
+ why, b->bm_why ?: "?",
+ drbd_task_to_thread_name(mdev->tconn, b->bm_task));
mutex_lock(&b->bm_change);
}
if (BM_LOCKED_MASK & b->bm_flags)
@@ -196,6 +188,9 @@
/* to mark for lazy writeout once syncer cleared all clearable bits,
* we if bits have been cleared since last IO. */
#define BM_PAGE_LAZY_WRITEOUT 28
+/* pages marked with this "HINT" will be considered for writeout
+ * on activity log transactions */
+#define BM_PAGE_HINT_WRITEOUT 27
/* store_page_idx uses non-atomic assignment. It is only used directly after
* allocating the page. All other bm_set_page_* and bm_clear_page_* need to
@@ -227,8 +222,7 @@
{
struct drbd_bitmap *b = mdev->bitmap;
void *addr = &page_private(b->bm_pages[page_nr]);
- clear_bit(BM_PAGE_IO_LOCK, addr);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(BM_PAGE_IO_LOCK, addr);
wake_up(&mdev->bitmap->bm_io_wait);
}
@@ -246,6 +240,27 @@
set_bit(BM_PAGE_NEED_WRITEOUT, &page_private(page));
}
+/**
+ * drbd_bm_mark_for_writeout() - mark a page with a "hint" to be considered for writeout
+ * @mdev: DRBD device.
+ * @page_nr: the bitmap page to mark with the "hint" flag
+ *
+ * From within an activity log transaction, we mark a few pages with these
+ * hints, then call drbd_bm_write_hinted(), which will only write out changed
+ * pages which are flagged with this mark.
+ */
+void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr)
+{
+ struct page *page;
+ if (page_nr >= mdev->bitmap->bm_number_of_pages) {
+ dev_warn(DEV, "BAD: page_nr: %u, number_of_pages: %u\n",
+ page_nr, (int)mdev->bitmap->bm_number_of_pages);
+ return;
+ }
+ page = mdev->bitmap->bm_pages[page_nr];
+ set_bit(BM_PAGE_HINT_WRITEOUT, &page_private(page));
+}
+
static int bm_test_page_unchanged(struct page *page)
{
volatile const unsigned long *addr = &page_private(page);
@@ -376,7 +391,7 @@
* GFP_NOIO, as this is called while drbd IO is "suspended",
* and during resize or attach on diskless Primary,
* we must not block on IO to ourselves.
- * Context is receiver thread or cqueue thread/dmsetup. */
+ * Context is receiver thread or dmsetup. */
bytes = sizeof(struct page *)*want;
new_pages = kzalloc(bytes, GFP_NOIO);
if (!new_pages) {
@@ -441,7 +456,8 @@
sector_t drbd_bm_capacity(struct drbd_conf *mdev)
{
- ERR_IF(!mdev->bitmap) return 0;
+ if (!expect(mdev->bitmap))
+ return 0;
return mdev->bitmap->bm_dev_capacity;
}
@@ -449,7 +465,8 @@
*/
void drbd_bm_cleanup(struct drbd_conf *mdev)
{
- ERR_IF (!mdev->bitmap) return;
+ if (!expect(mdev->bitmap))
+ return;
bm_free_pages(mdev->bitmap->bm_pages, mdev->bitmap->bm_number_of_pages);
bm_vk_free(mdev->bitmap->bm_pages, (BM_P_VMALLOCED & mdev->bitmap->bm_flags));
kfree(mdev->bitmap);
@@ -612,7 +629,8 @@
int err = 0, growing;
int opages_vmalloced;
- ERR_IF(!b) return -ENOMEM;
+ if (!expect(b))
+ return -ENOMEM;
drbd_bm_lock(mdev, "resize", BM_LOCKED_MASK);
@@ -734,8 +752,10 @@
unsigned long s;
unsigned long flags;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
s = b->bm_set;
@@ -758,8 +778,10 @@
size_t drbd_bm_words(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
return b->bm_words;
}
@@ -767,7 +789,8 @@
unsigned long drbd_bm_bits(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return 0;
+ if (!expect(b))
+ return 0;
return b->bm_bits;
}
@@ -788,8 +811,10 @@
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
if (number == 0)
return;
WARN_ON(offset >= b->bm_words);
@@ -833,8 +858,10 @@
end = offset + number;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
if ((offset >= b->bm_words) ||
@@ -862,8 +889,10 @@
void drbd_bm_set_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0xff, b->bm_words);
@@ -876,8 +905,10 @@
void drbd_bm_clear_all(struct drbd_conf *mdev)
{
struct drbd_bitmap *b = mdev->bitmap;
- ERR_IF(!b) return;
- ERR_IF(!b->bm_pages) return;
+ if (!expect(b))
+ return;
+ if (!expect(b->bm_pages))
+ return;
spin_lock_irq(&b->bm_lock);
bm_memset(b, 0, 0, b->bm_words);
@@ -891,7 +922,8 @@
unsigned int done;
unsigned flags;
#define BM_AIO_COPY_PAGES 1
-#define BM_WRITE_ALL_PAGES 2
+#define BM_AIO_WRITE_HINTED 2
+#define BM_WRITE_ALL_PAGES 4
int error;
struct kref kref;
};
@@ -1062,6 +1094,11 @@
if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
break;
if (rw & WRITE) {
+ if ((flags & BM_AIO_WRITE_HINTED) &&
+ !test_and_clear_bit(BM_PAGE_HINT_WRITEOUT,
+ &page_private(b->bm_pages[i])))
+ continue;
+
if (!(flags & BM_WRITE_ALL_PAGES) &&
bm_test_page_unchanged(b->bm_pages[i])) {
dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
@@ -1094,9 +1131,11 @@
else
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
- dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
- rw == WRITE ? "WRITE" : "READ",
- count, jiffies - now);
+ /* summary for global bitmap IO */
+ if (flags == 0)
+ dev_info(DEV, "bitmap %s of %u pages took %lu jiffies\n",
+ rw == WRITE ? "WRITE" : "READ",
+ count, jiffies - now);
if (ctx->error) {
dev_alert(DEV, "we had at least one MD IO ERROR during bitmap IO\n");
@@ -1117,8 +1156,9 @@
}
now = b->bm_set;
- dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
- ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
+ if (flags == 0)
+ dev_info(DEV, "%s (%lu bits) marked out-of-sync by on disk bit-map.\n",
+ ppsize(ppb, now << (BM_BLOCK_SHIFT-10)), now);
kref_put(&ctx->kref, &bm_aio_ctx_destroy);
return err;
@@ -1181,9 +1221,17 @@
return bm_rw(mdev, WRITE, BM_AIO_COPY_PAGES, 0);
}
+/**
+ * drbd_bm_write_hinted() - Write bitmap pages with "hint" marks, if they have changed.
+ * @mdev: DRBD device.
+ */
+int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local)
+{
+ return bm_rw(mdev, WRITE, BM_AIO_WRITE_HINTED | BM_AIO_COPY_PAGES, 0);
+}
/**
- * drbd_bm_write_page: Writes a PAGE_SIZE aligned piece of bitmap
+ * drbd_bm_write_page() - Writes a PAGE_SIZE aligned piece of bitmap
* @mdev: DRBD device.
* @idx: bitmap page index
*
@@ -1291,8 +1339,10 @@
struct drbd_bitmap *b = mdev->bitmap;
unsigned long i = DRBD_END_OF_BITMAP;
- ERR_IF(!b) return i;
- ERR_IF(!b->bm_pages) return i;
+ if (!expect(b))
+ return i;
+ if (!expect(b->bm_pages))
+ return i;
spin_lock_irq(&b->bm_lock);
if (BM_DONT_TEST & b->bm_flags)
@@ -1393,8 +1443,10 @@
struct drbd_bitmap *b = mdev->bitmap;
int c = 0;
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if ((val ? BM_DONT_SET : BM_DONT_CLEAR) & b->bm_flags)
@@ -1425,13 +1477,21 @@
{
int i;
int bits;
+ int changed = 0;
unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]);
for (i = first_word; i < last_word; i++) {
bits = hweight_long(paddr[i]);
paddr[i] = ~0UL;
- b->bm_set += BITS_PER_LONG - bits;
+ changed += BITS_PER_LONG - bits;
}
kunmap_atomic(paddr);
+ if (changed) {
+ /* We only need lazy writeout, the information is still in the
+ * remote bitmap as well, and is reconstructed during the next
+ * bitmap exchange, if lost locally due to a crash. */
+ bm_set_page_lazy_writeout(b->bm_pages[page_nr]);
+ b->bm_set += changed;
+ }
}
/* Same thing as drbd_bm_set_bits,
@@ -1526,8 +1586,10 @@
unsigned long *p_addr;
int i;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1561,8 +1623,10 @@
* robust in case we screwed up elsewhere, in that case pretend there
* was one dirty bit in the requested area, so we won't try to do a
* local read there (no bitmap probably implies no disk) */
- ERR_IF(!b) return 1;
- ERR_IF(!b->bm_pages) return 1;
+ if (!expect(b))
+ return 1;
+ if (!expect(b->bm_pages))
+ return 1;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1575,11 +1639,10 @@
bm_unmap(p_addr);
p_addr = bm_map_pidx(b, idx);
}
- ERR_IF (bitnr >= b->bm_bits) {
- dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
- } else {
+ if (expect(bitnr < b->bm_bits))
c += (0 != test_bit_le(bitnr - (page_nr << (PAGE_SHIFT+3)), p_addr));
- }
+ else
+ dev_err(DEV, "bitnr=%lu bm_bits=%lu\n", bitnr, b->bm_bits);
}
if (p_addr)
bm_unmap(p_addr);
@@ -1609,8 +1672,10 @@
unsigned long flags;
unsigned long *p_addr, *bm;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
+ if (!expect(b))
+ return 0;
+ if (!expect(b->bm_pages))
+ return 0;
spin_lock_irqsave(&b->bm_lock, flags);
if (BM_DONT_TEST & b->bm_flags)
@@ -1632,47 +1697,3 @@
spin_unlock_irqrestore(&b->bm_lock, flags);
return count;
}
-
-/* Set all bits covered by the AL-extent al_enr.
- * Returns number of bits changed. */
-unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev, unsigned long al_enr)
-{
- struct drbd_bitmap *b = mdev->bitmap;
- unsigned long *p_addr, *bm;
- unsigned long weight;
- unsigned long s, e;
- int count, i, do_now;
- ERR_IF(!b) return 0;
- ERR_IF(!b->bm_pages) return 0;
-
- spin_lock_irq(&b->bm_lock);
- if (BM_DONT_SET & b->bm_flags)
- bm_print_lock_info(mdev);
- weight = b->bm_set;
-
- s = al_enr * BM_WORDS_PER_AL_EXT;
- e = min_t(size_t, s + BM_WORDS_PER_AL_EXT, b->bm_words);
- /* assert that s and e are on the same page */
- D_ASSERT((e-1) >> (PAGE_SHIFT - LN2_BPL + 3)
- == s >> (PAGE_SHIFT - LN2_BPL + 3));
- count = 0;
- if (s < b->bm_words) {
- i = do_now = e-s;
- p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s));
- bm = p_addr + MLPP(s);
- while (i--) {
- count += hweight_long(*bm);
- *bm = -1UL;
- bm++;
- }
- bm_unmap(p_addr);
- b->bm_set += do_now*BITS_PER_LONG - count;
- if (e == b->bm_words)
- b->bm_set -= bm_clear_surplus(b);
- } else {
- dev_err(DEV, "start offset (%lu) too large in drbd_bm_ALe_set_all\n", s);
- }
- weight = b->bm_set - weight;
- spin_unlock_irq(&b->bm_lock);
- return weight;
-}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 277c69c..ef72a72 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -39,9 +39,13 @@
#include <linux/major.h>
#include <linux/blkdev.h>
#include <linux/genhd.h>
+#include <linux/idr.h>
#include <net/tcp.h>
#include <linux/lru_cache.h>
#include <linux/prefetch.h>
+#include <linux/drbd_genl_api.h>
+#include <linux/drbd.h>
+#include "drbd_state.h"
#ifdef __CHECKER__
# define __protected_by(x) __attribute__((require_context(x,1,999,"rdwr")))
@@ -61,7 +65,6 @@
extern unsigned int minor_count;
extern bool disable_sendpage;
extern bool allow_oos;
-extern unsigned int cn_idx;
#ifdef CONFIG_DRBD_FAULT_INJECTION
extern int enable_faults;
@@ -86,34 +89,44 @@
*/
#define DRBD_SIGKILL SIGHUP
-/* All EEs on the free list should have ID_VACANT (== 0)
- * freshly allocated EEs get !ID_VACANT (== 1)
- * so if it says "cannot dereference null pointer at address 0x00000001",
- * it is most likely one of these :( */
-
#define ID_IN_SYNC (4711ULL)
#define ID_OUT_OF_SYNC (4712ULL)
-
#define ID_SYNCER (-1ULL)
-#define ID_VACANT 0
-#define is_syncer_block_id(id) ((id) == ID_SYNCER)
+
#define UUID_NEW_BM_OFFSET ((u64)0x0001000000000000ULL)
struct drbd_conf;
+struct drbd_tconn;
/* to shorten dev_warn(DEV, "msg"); and relatives statements */
#define DEV (disk_to_dev(mdev->vdisk))
+#define conn_printk(LEVEL, TCONN, FMT, ARGS...) \
+ printk(LEVEL "d-con %s: " FMT, TCONN->name , ## ARGS)
+#define conn_alert(TCONN, FMT, ARGS...) conn_printk(KERN_ALERT, TCONN, FMT, ## ARGS)
+#define conn_crit(TCONN, FMT, ARGS...) conn_printk(KERN_CRIT, TCONN, FMT, ## ARGS)
+#define conn_err(TCONN, FMT, ARGS...) conn_printk(KERN_ERR, TCONN, FMT, ## ARGS)
+#define conn_warn(TCONN, FMT, ARGS...) conn_printk(KERN_WARNING, TCONN, FMT, ## ARGS)
+#define conn_notice(TCONN, FMT, ARGS...) conn_printk(KERN_NOTICE, TCONN, FMT, ## ARGS)
+#define conn_info(TCONN, FMT, ARGS...) conn_printk(KERN_INFO, TCONN, FMT, ## ARGS)
+#define conn_dbg(TCONN, FMT, ARGS...) conn_printk(KERN_DEBUG, TCONN, FMT, ## ARGS)
+
#define D_ASSERT(exp) if (!(exp)) \
dev_err(DEV, "ASSERT( " #exp " ) in %s:%d\n", __FILE__, __LINE__)
-#define ERR_IF(exp) if (({ \
- int _b = (exp) != 0; \
- if (_b) dev_err(DEV, "ASSERT FAILED: %s: (%s) in %s:%d\n", \
- __func__, #exp, __FILE__, __LINE__); \
- _b; \
- }))
+/**
+ * expect - Make an assertion
+ *
+ * Unlike the assert macro, this macro returns a boolean result.
+ */
+#define expect(exp) ({ \
+ bool _bool = (exp); \
+ if (!_bool) \
+ dev_err(DEV, "ASSERTION %s FAILED in %s\n", \
+ #exp, __func__); \
+ _bool; \
+ })
/* Defines to control fault insertion */
enum {
@@ -150,15 +163,12 @@
/* usual integer division */
#define div_floor(A, B) ((A)/(B))
-/* drbd_meta-data.c (still in drbd_main.c) */
-/* 4th incarnation of the disk layout. */
-#define DRBD_MD_MAGIC (DRBD_MAGIC+4)
-
-extern struct drbd_conf **minor_table;
extern struct ratelimit_state drbd_ratelimit_state;
+extern struct idr minors; /* RCU, updates: genl_lock() */
+extern struct list_head drbd_tconns; /* RCU, updates: genl_lock() */
/* on the wire */
-enum drbd_packets {
+enum drbd_packet {
/* receiver (data socket) */
P_DATA = 0x00,
P_DATA_REPLY = 0x01, /* Response to P_DATA_REQUEST */
@@ -186,7 +196,7 @@
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
- P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
+ P_SUPERSEDED = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
@@ -207,77 +217,23 @@
P_DELAY_PROBE = 0x27, /* is used on BOTH sockets */
P_OUT_OF_SYNC = 0x28, /* Mark as out of sync (Outrunning), data socket */
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
+ P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
+ P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
+ P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
+ P_PROTOCOL_UPDATE = 0x2d, /* data sock: is used in established connections */
- P_MAX_CMD = 0x2A,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
/* special command ids for handshake */
- P_HAND_SHAKE_M = 0xfff1, /* First Packet on the MetaSock */
- P_HAND_SHAKE_S = 0xfff2, /* First Packet on the Socket */
+ P_INITIAL_META = 0xfff1, /* First Packet on the MetaSock */
+ P_INITIAL_DATA = 0xfff2, /* First Packet on the Socket */
- P_HAND_SHAKE = 0xfffe /* FIXED for the next century! */
+ P_CONNECTION_FEATURES = 0xfffe /* FIXED for the next century! */
};
-static inline const char *cmdname(enum drbd_packets cmd)
-{
- /* THINK may need to become several global tables
- * when we want to support more than
- * one PRO_VERSION */
- static const char *cmdnames[] = {
- [P_DATA] = "Data",
- [P_DATA_REPLY] = "DataReply",
- [P_RS_DATA_REPLY] = "RSDataReply",
- [P_BARRIER] = "Barrier",
- [P_BITMAP] = "ReportBitMap",
- [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
- [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
- [P_UNPLUG_REMOTE] = "UnplugRemote",
- [P_DATA_REQUEST] = "DataRequest",
- [P_RS_DATA_REQUEST] = "RSDataRequest",
- [P_SYNC_PARAM] = "SyncParam",
- [P_SYNC_PARAM89] = "SyncParam89",
- [P_PROTOCOL] = "ReportProtocol",
- [P_UUIDS] = "ReportUUIDs",
- [P_SIZES] = "ReportSizes",
- [P_STATE] = "ReportState",
- [P_SYNC_UUID] = "ReportSyncUUID",
- [P_AUTH_CHALLENGE] = "AuthChallenge",
- [P_AUTH_RESPONSE] = "AuthResponse",
- [P_PING] = "Ping",
- [P_PING_ACK] = "PingAck",
- [P_RECV_ACK] = "RecvAck",
- [P_WRITE_ACK] = "WriteAck",
- [P_RS_WRITE_ACK] = "RSWriteAck",
- [P_DISCARD_ACK] = "DiscardAck",
- [P_NEG_ACK] = "NegAck",
- [P_NEG_DREPLY] = "NegDReply",
- [P_NEG_RS_DREPLY] = "NegRSDReply",
- [P_BARRIER_ACK] = "BarrierAck",
- [P_STATE_CHG_REQ] = "StateChgRequest",
- [P_STATE_CHG_REPLY] = "StateChgReply",
- [P_OV_REQUEST] = "OVRequest",
- [P_OV_REPLY] = "OVReply",
- [P_OV_RESULT] = "OVResult",
- [P_CSUM_RS_REQUEST] = "CsumRSRequest",
- [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
- [P_COMPRESSED_BITMAP] = "CBitmap",
- [P_DELAY_PROBE] = "DelayProbe",
- [P_OUT_OF_SYNC] = "OutOfSync",
- [P_MAX_CMD] = NULL,
- };
-
- if (cmd == P_HAND_SHAKE_M)
- return "HandShakeM";
- if (cmd == P_HAND_SHAKE_S)
- return "HandShakeS";
- if (cmd == P_HAND_SHAKE)
- return "HandShake";
- if (cmd >= P_MAX_CMD)
- return "Unknown";
- return cmdnames[cmd];
-}
+extern const char *cmdname(enum drbd_packet cmd);
/* for sending/receiving the bitmap,
* possibly in some encoding scheme */
@@ -337,37 +293,24 @@
u32 magic;
u16 command;
u16 length; /* bytes of data after this header */
- u8 payload[0];
} __packed;
/* Header for big packets, Used for data packets exceeding 64kB */
struct p_header95 {
u16 magic; /* use DRBD_MAGIC_BIG here */
u16 command;
- u32 length; /* Use only 24 bits of that. Ignore the highest 8 bit. */
- u8 payload[0];
+ u32 length;
} __packed;
-union p_header {
- struct p_header80 h80;
- struct p_header95 h95;
-};
+struct p_header100 {
+ u32 magic;
+ u16 volume;
+ u16 command;
+ u32 length;
+ u32 pad;
+} __packed;
-/*
- * short commands, packets without payload, plain p_header:
- * P_PING
- * P_PING_ACK
- * P_BECOME_SYNC_TARGET
- * P_BECOME_SYNC_SOURCE
- * P_UNPLUG_REMOTE
- */
-
-/*
- * commands with out-of-struct payload:
- * P_BITMAP (no additional fields)
- * P_DATA, P_DATA_REPLY (see p_data)
- * P_COMPRESSED_BITMAP (see receive_compressed_bitmap)
- */
+extern unsigned int drbd_header_size(struct drbd_tconn *tconn);
/* these defines must not be changed without changing the protocol version */
#define DP_HARDBARRIER 1 /* depricated */
@@ -377,9 +320,10 @@
#define DP_FUA 16 /* equals REQ_FUA */
#define DP_FLUSH 32 /* equals REQ_FLUSH */
#define DP_DISCARD 64 /* equals REQ_DISCARD */
+#define DP_SEND_RECEIVE_ACK 128 /* This is a proto B write request */
+#define DP_SEND_WRITE_ACK 256 /* This is a proto C write request */
struct p_data {
- union p_header head;
u64 sector; /* 64 bits sector number */
u64 block_id; /* to identify the request in protocol B&C */
u32 seq_num;
@@ -390,21 +334,18 @@
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
- * P_DISCARD_ACK (proto C, two-primaries conflict detection)
+ * P_SUPERSEDED (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
struct p_block_ack {
- struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
u32 seq_num;
} __packed;
-
struct p_block_req {
- struct p_header80 head;
u64 sector;
u64 block_id;
u32 blksize;
@@ -413,59 +354,52 @@
/*
* commands with their own struct for additional fields:
- * P_HAND_SHAKE
+ * P_CONNECTION_FEATURES
* P_BARRIER
* P_BARRIER_ACK
* P_SYNC_PARAM
* ReportParams
*/
-struct p_handshake {
- struct p_header80 head; /* 8 bytes */
+struct p_connection_features {
u32 protocol_min;
u32 feature_flags;
u32 protocol_max;
/* should be more than enough for future enhancements
- * for now, feature_flags and the reserverd array shall be zero.
+ * for now, feature_flags and the reserved array shall be zero.
*/
u32 _pad;
- u64 reserverd[7];
+ u64 reserved[7];
} __packed;
-/* 80 bytes, FIXED for the next century */
struct p_barrier {
- struct p_header80 head;
u32 barrier; /* barrier number _handle_ only */
u32 pad; /* to multiple of 8 Byte */
} __packed;
struct p_barrier_ack {
- struct p_header80 head;
u32 barrier;
u32 set_size;
} __packed;
struct p_rs_param {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
/* Since protocol version 88 and higher. */
char verify_alg[0];
} __packed;
struct p_rs_param_89 {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
/* protocol version 89: */
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
} __packed;
struct p_rs_param_95 {
- struct p_header80 head;
- u32 rate;
+ u32 resync_rate;
char verify_alg[SHARED_SECRET_MAX];
char csums_alg[SHARED_SECRET_MAX];
u32 c_plan_ahead;
@@ -475,12 +409,11 @@
} __packed;
enum drbd_conn_flags {
- CF_WANT_LOSE = 1,
+ CF_DISCARD_MY_DATA = 1,
CF_DRY_RUN = 2,
};
struct p_protocol {
- struct p_header80 head;
u32 protocol;
u32 after_sb_0p;
u32 after_sb_1p;
@@ -494,17 +427,14 @@
} __packed;
struct p_uuids {
- struct p_header80 head;
u64 uuid[UI_EXTENDED_SIZE];
} __packed;
struct p_rs_uuid {
- struct p_header80 head;
u64 uuid;
} __packed;
struct p_sizes {
- struct p_header80 head;
u64 d_size; /* size of disk */
u64 u_size; /* user requested size */
u64 c_size; /* current exported size */
@@ -514,18 +444,15 @@
} __packed;
struct p_state {
- struct p_header80 head;
u32 state;
} __packed;
struct p_req_state {
- struct p_header80 head;
u32 mask;
u32 val;
} __packed;
struct p_req_state_reply {
- struct p_header80 head;
u32 retcode;
} __packed;
@@ -539,15 +466,7 @@
u32 bit_map_gen[5];
} __packed;
-struct p_discard {
- struct p_header80 head;
- u64 block_id;
- u32 seq_num;
- u32 pad;
-} __packed;
-
struct p_block_desc {
- struct p_header80 head;
u64 sector;
u32 blksize;
u32 pad; /* to multiple of 8 Byte */
@@ -563,7 +482,6 @@
};
struct p_compressed_bm {
- struct p_header80 head;
/* (encoding & 0x0f): actual encoding, see enum drbd_bitmap_code
* (encoding & 0x80): polarity (set/unset) of first runlength
* ((encoding >> 4) & 0x07): pad_bits, number of trailing zero bits
@@ -575,90 +493,22 @@
} __packed;
struct p_delay_probe93 {
- struct p_header80 head;
u32 seq_num; /* sequence number to match the two probe packets */
u32 offset; /* usecs the probe got sent after the reference time point */
} __packed;
-/* DCBP: Drbd Compressed Bitmap Packet ... */
-static inline enum drbd_bitmap_code
-DCBP_get_code(struct p_compressed_bm *p)
-{
- return (enum drbd_bitmap_code)(p->encoding & 0x0f);
-}
-
-static inline void
-DCBP_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
-{
- BUG_ON(code & ~0xf);
- p->encoding = (p->encoding & ~0xf) | code;
-}
-
-static inline int
-DCBP_get_start(struct p_compressed_bm *p)
-{
- return (p->encoding & 0x80) != 0;
-}
-
-static inline void
-DCBP_set_start(struct p_compressed_bm *p, int set)
-{
- p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
-}
-
-static inline int
-DCBP_get_pad_bits(struct p_compressed_bm *p)
-{
- return (p->encoding >> 4) & 0x7;
-}
-
-static inline void
-DCBP_set_pad_bits(struct p_compressed_bm *p, int n)
-{
- BUG_ON(n & ~0x7);
- p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
-}
-
-/* one bitmap packet, including the p_header,
- * should fit within one _architecture independend_ page.
- * so we need to use the fixed size 4KiB page size
- * most architectures have used for a long time.
+/*
+ * Bitmap packets need to fit within a single page on the sender and receiver,
+ * so we are limited to 4 KiB (and not to PAGE_SIZE, which can be bigger).
*/
-#define BM_PACKET_PAYLOAD_BYTES (4096 - sizeof(struct p_header80))
-#define BM_PACKET_WORDS (BM_PACKET_PAYLOAD_BYTES/sizeof(long))
-#define BM_PACKET_VLI_BYTES_MAX (4096 - sizeof(struct p_compressed_bm))
-#if (PAGE_SIZE < 4096)
-/* drbd_send_bitmap / receive_bitmap would break horribly */
-#error "PAGE_SIZE too small"
-#endif
-
-union p_polymorph {
- union p_header header;
- struct p_handshake handshake;
- struct p_data data;
- struct p_block_ack block_ack;
- struct p_barrier barrier;
- struct p_barrier_ack barrier_ack;
- struct p_rs_param_89 rs_param_89;
- struct p_rs_param_95 rs_param_95;
- struct p_protocol protocol;
- struct p_sizes sizes;
- struct p_uuids uuids;
- struct p_state state;
- struct p_req_state req_state;
- struct p_req_state_reply req_state_reply;
- struct p_block_req block_req;
- struct p_delay_probe93 delay_probe93;
- struct p_rs_uuid rs_uuid;
- struct p_block_desc block_desc;
-} __packed;
+#define DRBD_SOCKET_BUFFER_SIZE 4096
/**********************************************************************/
enum drbd_thread_state {
- None,
- Running,
- Exiting,
- Restarting
+ NONE,
+ RUNNING,
+ EXITING,
+ RESTARTING
};
struct drbd_thread {
@@ -667,8 +517,9 @@
struct completion stop;
enum drbd_thread_state t_state;
int (*function) (struct drbd_thread *);
- struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
int reset_cpu_mask;
+ char name[9];
};
static inline enum drbd_thread_state get_t_state(struct drbd_thread *thi)
@@ -681,58 +532,54 @@
return thi->t_state;
}
-struct drbd_work;
-typedef int (*drbd_work_cb)(struct drbd_conf *, struct drbd_work *, int cancel);
struct drbd_work {
struct list_head list;
- drbd_work_cb cb;
+ int (*cb)(struct drbd_work *, int cancel);
+ union {
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
+ };
};
-struct drbd_tl_epoch;
+#include "drbd_interval.h"
+
+extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);
+
struct drbd_request {
struct drbd_work w;
- struct drbd_conf *mdev;
/* if local IO is not allowed, will be NULL.
* if local IO _is_ allowed, holds the locally submitted bio clone,
* or, after local IO completion, the ERR_PTR(error).
- * see drbd_endio_pri(). */
+ * see drbd_request_endio(). */
struct bio *private_bio;
- struct hlist_node collision;
- sector_t sector;
- unsigned int size;
- unsigned int epoch; /* barrier_nr */
+ struct drbd_interval i;
- /* barrier_nr: used to check on "completion" whether this req was in
+ /* epoch: used to check on "completion" whether this req was in
* the current epoch, and we therefore have to close it,
- * starting a new epoch...
+ * causing a p_barrier packet to be send, starting a new epoch.
+ *
+ * This corresponds to "barrier" in struct p_barrier[_ack],
+ * and to "barrier_nr" in struct drbd_epoch (and various
+ * comments/function parameters/local variable names).
*/
+ unsigned int epoch;
struct list_head tl_requests; /* ring list in the transfer log */
struct bio *master_bio; /* master bio pointer */
- unsigned long rq_state; /* see comments above _req_mod() */
unsigned long start_time;
+
+ /* once it hits 0, we may complete the master_bio */
+ atomic_t completion_ref;
+ /* once it hits 0, we may destroy this drbd_request object */
+ struct kref kref;
+
+ unsigned rq_state; /* see comments above _req_mod() */
};
-struct drbd_tl_epoch {
- struct drbd_work w;
- struct list_head requests; /* requests before */
- struct drbd_tl_epoch *next; /* pointer to the next barrier */
- unsigned int br_number; /* the barriers identifier. */
- int n_writes; /* number of requests attached before this barrier */
-};
-
-struct drbd_request;
-
-/* These Tl_epoch_entries may be in one of 6 lists:
- active_ee .. data packet being written
- sync_ee .. syncer block being written
- done_ee .. block written, need to send P_WRITE_ACK
- read_ee .. [RS]P_DATA_REQUEST being read
-*/
-
struct drbd_epoch {
+ struct drbd_tconn *tconn;
struct list_head list;
unsigned int barrier_nr;
atomic_t epoch_size; /* increased on every request added. */
@@ -762,17 +609,14 @@
void *digest;
};
-struct drbd_epoch_entry {
+struct drbd_peer_request {
struct drbd_work w;
- struct hlist_node collision;
struct drbd_epoch *epoch; /* for writes */
- struct drbd_conf *mdev;
struct page *pages;
atomic_t pending_bios;
- unsigned int size;
+ struct drbd_interval i;
/* see comments on ee flag bits below */
unsigned long flags;
- sector_t sector;
union {
u64 block_id;
struct digest_info *digest;
@@ -793,31 +637,37 @@
* we need to resubmit without the barrier flag. */
__EE_RESUBMITTED,
- /* we may have several bios per epoch entry.
+ /* we may have several bios per peer request.
* if any of those fail, we set this flag atomically
* from the endio callback */
__EE_WAS_ERROR,
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
+
+ /* Conflicting local requests need to be restarted after this request */
+ __EE_RESTART_REQUESTS,
+
+ /* The peer wants a write ACK for this (wire proto C) */
+ __EE_SEND_WRITE_ACK,
+
+ /* Is set when net_conf had two_primaries set while creating this peer_req */
+ __EE_IN_INTERVAL_TREE,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
+#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
+#define EE_SEND_WRITE_ACK (1<<__EE_SEND_WRITE_ACK)
+#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
-/* global flag bits */
-enum drbd_flag {
- CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
- SIGNAL_ASENDER, /* whether asender wants to be interrupted */
- SEND_PING, /* whether asender should send a ping asap */
-
+/* flag bits per mdev */
+enum {
UNPLUG_REMOTE, /* sending a "UnplugRemote" could help */
MD_DIRTY, /* current uuids and flags not yet on disk */
- DISCARD_CONCURRENT, /* Set on one node, cleared on the peer! */
USE_DEGR_WFC_T, /* degr-wfc-timeout instead of wfc-timeout. */
- CLUSTER_ST_CHANGE, /* Cluster wide state change going on... */
CL_ST_CHG_SUCCESS,
CL_ST_CHG_FAIL,
CRASHED_PRIMARY, /* This node was a crashed primary.
@@ -835,33 +685,14 @@
WAS_READ_ERROR, /* Local disk READ failed (set additionally to the above) */
FORCE_DETACH, /* Force-detach from local disk, aborting any pending local IO */
RESYNC_AFTER_NEG, /* Resync after online grow after the attach&negotiate finished. */
- NET_CONGESTED, /* The data socket is congested */
-
- CONFIG_PENDING, /* serialization of (re)configuration requests.
- * if set, also prevents the device from dying */
- DEVICE_DYING, /* device became unconfigured,
- * but worker thread is still handling the cleanup.
- * reconfiguring (nl_disk_conf, nl_net_conf) is dissalowed,
- * while this is set. */
RESIZE_PENDING, /* Size change detected locally, waiting for the response from
* the peer, if it changed there as well. */
- CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
- GOT_PING_ACK, /* set when we receive a ping_ack packet, misc wait gets woken */
NEW_CUR_UUID, /* Create new current UUID when thawing IO */
AL_SUSPENDED, /* Activity logging is currently suspended. */
AHEAD_TO_SYNC_SOURCE, /* Ahead -> SyncSource queued */
- STATE_SENT, /* Do not change state/UUIDs while this is set */
-
- CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
- * pending, from drbd worker context.
- * If set, bdi_write_congested() returns true,
- * so shrink_page_list() would not recurse into,
- * and potentially deadlock on, this drbd worker.
- */
- DISCONNECT_SENT, /* Currently the last bit in this 32bit word */
-
- /* keep last */
- DRBD_N_FLAGS,
+ B_RS_H_DONE, /* Before resync handler done (already executed) */
+ DISCARD_MY_DATA, /* discard_my_data flag per volume */
+ READ_BALANCE_RR,
};
struct drbd_bitmap; /* opaque for drbd_conf */
@@ -899,18 +730,17 @@
struct drbd_work_queue {
struct list_head q;
- struct semaphore s; /* producers up it, worker down()s it */
spinlock_t q_lock; /* to protect the list. */
+ wait_queue_head_t q_wait;
};
struct drbd_socket {
- struct drbd_work_queue work;
struct mutex mutex;
struct socket *socket;
/* this way we get our
* send/receive buffers off the stack */
- union p_polymorph sbuf;
- union p_polymorph rbuf;
+ void *sbuf;
+ void *rbuf;
};
struct drbd_md {
@@ -927,24 +757,16 @@
s32 bm_offset; /* signed relative sector offset to bitmap */
/* u32 al_nr_extents; important for restoring the AL
- * is stored into sync_conf.al_extents, which in turn
+ * is stored into ldev->dc.al_extents, which in turn
* gets applied to act_log->nr_elements
*/
};
-/* for sync_conf and other types... */
-#define NL_PACKET(name, number, fields) struct name { fields };
-#define NL_INTEGER(pn,pr,member) int member;
-#define NL_INT64(pn,pr,member) __u64 member;
-#define NL_BIT(pn,pr,member) unsigned member:1;
-#define NL_STRING(pn,pr,member,len) unsigned char member[len]; int member ## _len;
-#include <linux/drbd_nl.h>
-
struct drbd_backing_dev {
struct block_device *backing_bdev;
struct block_device *md_bdev;
struct drbd_md md;
- struct disk_conf dc; /* The user provided config... */
+ struct disk_conf *disk_conf; /* RCU, for updates: mdev->tconn->conf_update */
sector_t known_size; /* last known size of that backing device */
};
@@ -968,17 +790,116 @@
};
struct fifo_buffer {
- int *values;
unsigned int head_index;
unsigned int size;
+ int total; /* sum of all values */
+ int values[0];
+};
+extern struct fifo_buffer *fifo_alloc(int fifo_size);
+
+/* flag bits per tconn */
+enum {
+ NET_CONGESTED, /* The data socket is congested */
+ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! */
+ SEND_PING, /* whether asender should send a ping asap */
+ SIGNAL_ASENDER, /* whether asender wants to be interrupted */
+ GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */
+ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */
+ CONN_WD_ST_CHG_OKAY,
+ CONN_WD_ST_CHG_FAIL,
+ CONN_DRY_RUN, /* Expect disconnect after resync handshake. */
+ CREATE_BARRIER, /* next P_DATA is preceded by a P_BARRIER */
+ STATE_SENT, /* Do not change state/UUIDs while this is set */
+ CALLBACK_PENDING, /* Whether we have a call_usermodehelper(, UMH_WAIT_PROC)
+ * pending, from drbd worker context.
+ * If set, bdi_write_congested() returns true,
+ * so shrink_page_list() would not recurse into,
+ * and potentially deadlock on, this drbd worker.
+ */
+ DISCONNECT_SENT,
+};
+
+struct drbd_tconn { /* is a resource from the config file */
+ char *name; /* Resource name */
+ struct list_head all_tconn; /* linked on global drbd_tconns */
+ struct kref kref;
+ struct idr volumes; /* <tconn, vnr> to mdev mapping */
+ enum drbd_conns cstate; /* Only C_STANDALONE to C_WF_REPORT_PARAMS */
+ unsigned susp:1; /* IO suspended by user */
+ unsigned susp_nod:1; /* IO suspended because no data */
+ unsigned susp_fen:1; /* IO suspended because fence peer handler runs */
+ struct mutex cstate_mutex; /* Protects graceful disconnects */
+
+ unsigned long flags;
+ struct net_conf *net_conf; /* content protected by rcu */
+ struct mutex conf_update; /* mutex for ready-copy-update of net_conf and disk_conf */
+ wait_queue_head_t ping_wait; /* Woken upon reception of a ping, and a state change */
+ struct res_opts res_opts;
+
+ struct sockaddr_storage my_addr;
+ int my_addr_len;
+ struct sockaddr_storage peer_addr;
+ int peer_addr_len;
+
+ struct drbd_socket data; /* data/barrier/cstate/parameter packets */
+ struct drbd_socket meta; /* ping/ack (metadata) packets */
+ int agreed_pro_version; /* actually used protocol version */
+ unsigned long last_received; /* in jiffies, either socket */
+ unsigned int ko_count;
+
+ spinlock_t req_lock;
+
+ struct list_head transfer_log; /* all requests not yet fully processed */
+
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm; /* checksums we compute, updates protected by tconn->data->mutex */
+ struct crypto_hash *peer_integrity_tfm; /* checksums we verify, only accessed from receiver thread */
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *verify_tfm;
+ void *int_dig_in;
+ void *int_dig_vv;
+
+ /* receiver side */
+ struct drbd_epoch *current_epoch;
+ spinlock_t epoch_lock;
+ unsigned int epochs;
+ enum write_ordering_e write_ordering;
+ atomic_t current_tle_nr; /* transfer log epoch number */
+ unsigned current_tle_writes; /* writes seen within this tl epoch */
+
+ unsigned long last_reconnect_jif;
+ struct drbd_thread receiver;
+ struct drbd_thread worker;
+ struct drbd_thread asender;
+ cpumask_var_t cpu_mask;
+
+ /* sender side */
+ struct drbd_work_queue sender_work;
+
+ struct {
+ /* whether this sender thread
+ * has processed a single write yet. */
+ bool seen_any_write_yet;
+
+ /* Which barrier number to send with the next P_BARRIER */
+ int current_epoch_nr;
+
+ /* how many write requests have been sent
+ * with req->epoch == current_epoch_nr.
+ * If none, no P_BARRIER will be sent. */
+ unsigned current_epoch_writes;
+ } send;
};
struct drbd_conf {
- unsigned long drbd_flags[(DRBD_N_FLAGS + BITS_PER_LONG -1)/BITS_PER_LONG];
+ struct drbd_tconn *tconn;
+ int vnr; /* volume number within the connection */
+ struct kref kref;
+
+ /* things that are stored as / read from meta data on disk */
+ unsigned long flags;
/* configured by drbdsetup */
- struct net_conf *net_conf; /* protected by get_net_conf() and put_net_conf() */
- struct syncer_conf sync_conf;
struct drbd_backing_dev *ldev __protected_by(local);
sector_t p_size; /* partner's disk size */
@@ -986,11 +907,7 @@
struct block_device *this_bdev;
struct gendisk *vdisk;
- struct drbd_socket data; /* data/barrier/cstate/parameter packets */
- struct drbd_socket meta; /* ping/ack (metadata) packets */
- int agreed_pro_version; /* actually used protocol version */
- unsigned long last_received; /* in jiffies, either socket */
- unsigned int ko_count;
+ unsigned long last_reattach_jif;
struct drbd_work resync_work,
unplug_work,
go_diskless,
@@ -1010,10 +927,9 @@
/* Used after attach while negotiating new disk state. */
union drbd_state new_state_tmp;
- union drbd_state state;
+ union drbd_dev_state state;
wait_queue_head_t misc_wait;
wait_queue_head_t state_wait; /* upon each state change. */
- wait_queue_head_t net_cnt_wait;
unsigned int send_cnt;
unsigned int recv_cnt;
unsigned int read_cnt;
@@ -1023,17 +939,12 @@
atomic_t ap_bio_cnt; /* Requests we need to complete */
atomic_t ap_pending_cnt; /* AP data packets on the wire, ack expected */
atomic_t rs_pending_cnt; /* RS request/data packets on the wire */
- atomic_t unacked_cnt; /* Need to send replys for */
+ atomic_t unacked_cnt; /* Need to send replies for */
atomic_t local_cnt; /* Waiting for local completion */
- atomic_t net_cnt; /* Users of net_conf */
- spinlock_t req_lock;
- struct drbd_tl_epoch *unused_spare_tle; /* for pre-allocation */
- struct drbd_tl_epoch *newest_tle;
- struct drbd_tl_epoch *oldest_tle;
- struct list_head out_of_sequence_requests;
- struct list_head barrier_acked_requests;
- struct hlist_head *tl_hash;
- unsigned int tl_hash_s;
+
+ /* Interval tree of pending local requests */
+ struct rb_root read_requests;
+ struct rb_root write_requests;
/* blocks to resync in this run [unit BM_BLOCK_SIZE] */
unsigned long rs_total;
@@ -1053,6 +964,7 @@
unsigned long rs_mark_time[DRBD_SYNC_MARKS];
/* current index into rs_mark_{left,time} */
int rs_last_mark;
+ unsigned long rs_last_bcast; /* [unit jiffies] */
/* where does the admin want us to start? (sector) */
sector_t ov_start_sector;
@@ -1064,14 +976,7 @@
/* size of out-of-sync range in sectors. */
sector_t ov_last_oos_size;
unsigned long ov_left; /* in bits */
- struct crypto_hash *csums_tfm;
- struct crypto_hash *verify_tfm;
- unsigned long last_reattach_jif;
- unsigned long last_reconnect_jif;
- struct drbd_thread receiver;
- struct drbd_thread worker;
- struct drbd_thread asender;
struct drbd_bitmap *bitmap;
unsigned long bm_resync_fo; /* bit offset for drbd_bm_find_next */
@@ -1084,29 +989,19 @@
int open_cnt;
u64 *p_uuid;
- struct drbd_epoch *current_epoch;
- spinlock_t epoch_lock;
- unsigned int epochs;
- enum write_ordering_e write_ordering;
+
struct list_head active_ee; /* IO in progress (P_DATA gets written to disk) */
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
- struct list_head done_ee; /* send ack */
- struct list_head read_ee; /* IO in progress (any read) */
+ struct list_head done_ee; /* need to send P_WRITE_ACK */
+ struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
- struct hlist_head *ee_hash; /* is proteced by req_lock! */
- unsigned int ee_hash_s;
-
- /* this one is protected by ee_lock, single thread */
- struct drbd_epoch_entry *last_write_w_barrier;
int next_barrier_nr;
- struct hlist_head *app_reads_hash; /* is proteced by req_lock */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
atomic_t pp_in_use_by_net; /* sendpage()d, still referenced by tcp */
wait_queue_head_t ee_wait;
struct page *md_io_page; /* one page buffer for md_io */
- struct page *md_io_tmpp; /* for logical_block_size != 512 */
struct drbd_md_io md_io;
atomic_t md_io_in_use; /* protects the md_io, md_io_page and md_io_tmpp */
spinlock_t al_lock;
@@ -1115,22 +1010,16 @@
unsigned int al_tr_number;
int al_tr_cycle;
int al_tr_pos; /* position of the next transaction in the journal */
- struct crypto_hash *cram_hmac_tfm;
- struct crypto_hash *integrity_w_tfm; /* to be used by the worker thread */
- struct crypto_hash *integrity_r_tfm; /* to be used by the receiver thread */
- void *int_dig_out;
- void *int_dig_in;
- void *int_dig_vv;
wait_queue_head_t seq_wait;
atomic_t packet_seq;
unsigned int peer_seq;
spinlock_t peer_seq_lock;
unsigned int minor;
unsigned long comm_bm_set; /* communicated number of set bits. */
- cpumask_var_t cpu_mask;
struct bm_io_work bm_io_work;
u64 ed_uuid; /* UUID of the exposed data */
- struct mutex state_mutex;
+ struct mutex own_state_mutex;
+ struct mutex *state_mutex; /* either own_state_mutex or mdev->tconn->cstate_mutex */
char congestion_reason; /* Why we where congested... */
atomic_t rs_sect_in; /* for incoming resync data rate, SyncTarget */
atomic_t rs_sect_ev; /* for submitted resync data rate, both */
@@ -1138,46 +1027,16 @@
int rs_last_events; /* counter of read or write "events" (unit sectors)
* on the lower level device when we last looked. */
int c_sync_rate; /* current resync rate after syncer throttle magic */
- struct fifo_buffer rs_plan_s; /* correction values of resync planer */
+ struct fifo_buffer *rs_plan_s; /* correction values of resync planer (RCU, tconn->conn_update) */
int rs_in_flight; /* resync sectors in flight (to proxy, in proxy and from proxy) */
- int rs_planed; /* resync sectors already planned */
atomic_t ap_in_flight; /* App sectors in flight (waiting for ack) */
unsigned int peer_max_bio_size;
unsigned int local_max_bio_size;
};
-static inline void drbd_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- set_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline void drbd_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- clear_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_and_set_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_and_set_bit(f, &mdev->drbd_flags[0]);
-}
-
-static inline int drbd_test_and_clear_flag(struct drbd_conf *mdev, enum drbd_flag f)
-{
- return test_and_clear_bit(f, &mdev->drbd_flags[0]);
-}
-
static inline struct drbd_conf *minor_to_mdev(unsigned int minor)
{
- struct drbd_conf *mdev;
-
- mdev = minor < minor_count ? minor_table[minor] : NULL;
-
- return mdev;
+ return (struct drbd_conf *)idr_find(&minors, minor);
}
static inline unsigned int mdev_to_minor(struct drbd_conf *mdev)
@@ -1185,29 +1044,9 @@
return mdev->minor;
}
-/* returns 1 if it was successful,
- * returns 0 if there was no data socket.
- * so wherever you are going to use the data.socket, e.g. do
- * if (!drbd_get_data_sock(mdev))
- * return 0;
- * CODE();
- * drbd_put_data_sock(mdev);
- */
-static inline int drbd_get_data_sock(struct drbd_conf *mdev)
+static inline struct drbd_conf *vnr_to_mdev(struct drbd_tconn *tconn, int vnr)
{
- mutex_lock(&mdev->data.mutex);
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (unlikely(mdev->data.socket == NULL)) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
- return 1;
-}
-
-static inline void drbd_put_data_sock(struct drbd_conf *mdev)
-{
- mutex_unlock(&mdev->data.mutex);
+ return (struct drbd_conf *)idr_find(&tconn->volumes, vnr);
}
/*
@@ -1216,99 +1055,69 @@
/* drbd_main.c */
-enum chg_state_flags {
- CS_HARD = 1,
- CS_VERBOSE = 2,
- CS_WAIT_COMPLETE = 4,
- CS_SERIALIZE = 8,
- CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
-};
-
enum dds_flags {
DDSF_FORCED = 1,
DDSF_NO_RESYNC = 2, /* Do not run a resync for the new space */
};
extern void drbd_init_set_defaults(struct drbd_conf *mdev);
-extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
- enum chg_state_flags f,
- union drbd_state mask,
- union drbd_state val);
-extern void drbd_force_state(struct drbd_conf *, union drbd_state,
- union drbd_state);
-extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
- union drbd_state,
- union drbd_state,
- enum chg_state_flags);
-extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
- enum chg_state_flags,
- struct completion *done);
-extern void print_st_err(struct drbd_conf *, union drbd_state,
- union drbd_state, int);
extern int drbd_thread_start(struct drbd_thread *thi);
extern void _drbd_thread_stop(struct drbd_thread *thi, int restart, int wait);
+extern char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task);
#ifdef CONFIG_SMP
-extern void drbd_thread_current_set_cpu(struct drbd_conf *mdev);
-extern void drbd_calc_cpu_mask(struct drbd_conf *mdev);
+extern void drbd_thread_current_set_cpu(struct drbd_thread *thi);
+extern void drbd_calc_cpu_mask(struct drbd_tconn *tconn);
#else
#define drbd_thread_current_set_cpu(A) ({})
#define drbd_calc_cpu_mask(A) ({})
#endif
-extern void drbd_free_resources(struct drbd_conf *mdev);
-extern void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
+extern void tl_release(struct drbd_tconn *, unsigned int barrier_nr,
unsigned int set_size);
-extern void tl_clear(struct drbd_conf *mdev);
-extern void _tl_add_barrier(struct drbd_conf *, struct drbd_tl_epoch *);
-extern void drbd_free_sock(struct drbd_conf *mdev);
-extern int drbd_send(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, unsigned msg_flags);
-extern int drbd_send_protocol(struct drbd_conf *mdev);
+extern void tl_clear(struct drbd_tconn *);
+extern void drbd_free_sock(struct drbd_tconn *tconn);
+extern int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
+ void *buf, size_t size, unsigned msg_flags);
+extern int drbd_send_all(struct drbd_tconn *, struct socket *, void *, size_t,
+ unsigned);
+
+extern int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd);
+extern int drbd_send_protocol(struct drbd_tconn *tconn);
extern int drbd_send_uuids(struct drbd_conf *mdev);
extern int drbd_send_uuids_skip_initial_sync(struct drbd_conf *mdev);
-extern int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
+extern void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev);
extern int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags);
extern int drbd_send_state(struct drbd_conf *mdev, union drbd_state s);
extern int drbd_send_current_state(struct drbd_conf *mdev);
-extern int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags);
-#define USE_DATA_SOCKET 1
-#define USE_META_SOCKET 0
-extern int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size);
-extern int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd,
- char *data, size_t size);
-extern int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc);
-extern int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr,
- u32 set_size);
-extern int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
-extern int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp);
-extern int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size);
-extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+extern int drbd_send_sync_param(struct drbd_conf *mdev);
+extern void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr,
+ u32 set_size);
+extern int drbd_send_ack(struct drbd_conf *, enum drbd_packet,
+ struct drbd_peer_request *);
+extern void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_block_req *rp);
+extern void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_data *dp, int data_size);
+extern int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id);
-extern int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req);
-extern int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e);
+extern int drbd_send_out_of_sync(struct drbd_conf *, struct drbd_request *);
+extern int drbd_send_block(struct drbd_conf *, enum drbd_packet,
+ struct drbd_peer_request *);
extern int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id);
-extern int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector,int size,
- void *digest, int digest_size,
- enum drbd_packets cmd);
+extern int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector,
+ int size, void *digest, int digest_size,
+ enum drbd_packet cmd);
extern int drbd_send_ov_request(struct drbd_conf *mdev,sector_t sector,int size);
extern int drbd_send_bitmap(struct drbd_conf *mdev);
-extern int _drbd_send_bitmap(struct drbd_conf *mdev);
-extern int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+extern void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode);
+extern void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode);
extern void drbd_free_bc(struct drbd_backing_dev *ldev);
extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
+extern void conn_md_sync(struct drbd_tconn *tconn);
extern void drbd_md_sync(struct drbd_conf *mdev);
extern int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@ -1334,33 +1143,52 @@
extern int drbd_bitmap_io(struct drbd_conf *mdev,
int (*io_fn)(struct drbd_conf *),
char *why, enum bm_flag flags);
+extern int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ char *why, enum bm_flag flags);
extern int drbd_bmio_set_n_write(struct drbd_conf *mdev);
extern int drbd_bmio_clear_n_write(struct drbd_conf *mdev);
extern void drbd_go_diskless(struct drbd_conf *mdev);
extern void drbd_ldev_destroy(struct drbd_conf *mdev);
-
/* Meta data layout
We reserve a 128MB Block (4k aligned)
* either at the end of the backing device
* or on a separate meta data device. */
-#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
/* The following numbers are sectors */
-#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
-#define MD_AL_MAX_SIZE 64 /* = 32 kb LOG ~ 3776 extents ~ 14 GB Storage */
-/* Allows up to about 3.8TB */
-#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_MAX_SIZE)
+/* Allows up to about 3.8TB, so if you want more,
+ * you need to use the "flexible" meta data format. */
+#define MD_RESERVED_SECT (128LU << 11) /* 128 MB, unit sectors */
+#define MD_AL_OFFSET 8 /* 8 Sectors after start of meta area */
+#define MD_AL_SECTORS 64 /* = 32 kB on disk activity log ring buffer */
+#define MD_BM_OFFSET (MD_AL_OFFSET + MD_AL_SECTORS)
-/* Since the smalles IO unit is usually 512 byte */
-#define MD_SECTOR_SHIFT 9
-#define MD_SECTOR_SIZE (1<<MD_SECTOR_SHIFT)
+/* we do all meta data IO in 4k blocks */
+#define MD_BLOCK_SHIFT 12
+#define MD_BLOCK_SIZE (1<<MD_BLOCK_SHIFT)
-/* activity log */
-#define AL_EXTENTS_PT ((MD_SECTOR_SIZE-12)/8-1) /* 61 ; Extents per 512B sector */
-#define AL_EXTENT_SHIFT 22 /* One extent represents 4M Storage */
+/* One activity log extent represents 4M of storage */
+#define AL_EXTENT_SHIFT 22
#define AL_EXTENT_SIZE (1<<AL_EXTENT_SHIFT)
+/* We could make these currently hardcoded constants configurable
+ * variables at create-md time (or even re-configurable at runtime?).
+ * Which will require some more changes to the DRBD "super block"
+ * and attach code.
+ *
+ * updates per transaction:
+ * This many changes to the active set can be logged with one transaction.
+ * This number is arbitrary.
+ * context per transaction:
+ * This many context extent numbers are logged with each transaction.
+ * This number is resulting from the transaction block size (4k), the layout
+ * of the transaction header, and the number of updates per transaction.
+ * See drbd_actlog.c:struct al_transaction_on_disk
+ * */
+#define AL_UPDATES_PER_TRANSACTION 64 // arbitrary
+#define AL_CONTEXT_PER_TRANSACTION 919 // (4096 - 36 - 6*64)/4
+
#if BITS_PER_LONG == 32
#define LN2_BPL 5
#define cpu_to_lel(A) cpu_to_le32(A)
@@ -1396,11 +1224,14 @@
#define SLEEP_TIME (HZ/10)
-#define BM_BLOCK_SHIFT 12 /* 4k per bit */
+/* We do bitmap IO in units of 4k blocks.
+ * We also still have a hardcoded 4k per bit relation. */
+#define BM_BLOCK_SHIFT 12 /* 4k per bit */
#define BM_BLOCK_SIZE (1<<BM_BLOCK_SHIFT)
-/* (9+3) : 512 bytes @ 8 bits; representing 16M storage
- * per sector of on disk bitmap */
-#define BM_EXT_SHIFT (BM_BLOCK_SHIFT + MD_SECTOR_SHIFT + 3) /* = 24 */
+/* mostly arbitrarily set the represented size of one bitmap extent,
+ * aka resync extent, to 16 MiB (which is also 512 Byte worth of bitmap
+ * at 4k per bit resolution) */
+#define BM_EXT_SHIFT 24 /* 16 MiB per resync extent */
#define BM_EXT_SIZE (1<<BM_EXT_SHIFT)
#if (BM_EXT_SHIFT != 24) || (BM_BLOCK_SHIFT != 12)
@@ -1468,17 +1299,20 @@
#endif
#endif
-/* Sector shift value for the "hash" functions of tl_hash and ee_hash tables.
- * With a value of 8 all IO in one 128K block make it to the same slot of the
- * hash table. */
-#define HT_SHIFT 8
-#define DRBD_MAX_BIO_SIZE (1U<<(9+HT_SHIFT))
+/* BIO_MAX_SIZE is 256 * PAGE_CACHE_SIZE,
+ * so for typical PAGE_CACHE_SIZE of 4k, that is (1<<20) Byte.
+ * Since we may live in a mixed-platform cluster,
+ * we limit us to a platform agnostic constant here for now.
+ * A followup commit may allow even bigger BIO sizes,
+ * once we thought that through. */
+#define DRBD_MAX_BIO_SIZE (1U << 20)
+#if DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#error Architecture not supported: DRBD_MAX_BIO_SIZE > BIO_MAX_SIZE
+#endif
#define DRBD_MAX_BIO_SIZE_SAFE (1U << 12) /* Works always = 4k */
-#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* The old header only allows packets up to 32Kib data */
-
-/* Number of elements in the app_reads_hash */
-#define APP_R_HSIZE 15
+#define DRBD_MAX_SIZE_H80_PACKET (1U << 15) /* Header 80 only allows packets up to 32KiB data */
+#define DRBD_MAX_BIO_SIZE_P95 (1U << 17) /* Protocol 95 to 99 allows bios up to 128KiB */
extern int drbd_bm_init(struct drbd_conf *mdev);
extern int drbd_bm_resize(struct drbd_conf *mdev, sector_t sectors, int set_new_bits);
@@ -1500,11 +1334,11 @@
extern int drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
extern int drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
extern int drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
+extern void drbd_bm_mark_for_writeout(struct drbd_conf *mdev, int page_nr);
extern int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_hinted(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
extern int drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
-extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
- unsigned long al_enr);
extern size_t drbd_bm_words(struct drbd_conf *mdev);
extern unsigned long drbd_bm_bits(struct drbd_conf *mdev);
extern sector_t drbd_bm_capacity(struct drbd_conf *mdev);
@@ -1529,7 +1363,7 @@
/* drbd_main.c */
extern struct kmem_cache *drbd_request_cache;
-extern struct kmem_cache *drbd_ee_cache; /* epoch entries */
+extern struct kmem_cache *drbd_ee_cache; /* peer requests */
extern struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t *drbd_request_mempool;
@@ -1569,12 +1403,22 @@
extern rwlock_t global_state_lock;
-extern struct drbd_conf *drbd_new_device(unsigned int minor);
-extern void drbd_free_mdev(struct drbd_conf *mdev);
+extern int conn_lowest_minor(struct drbd_tconn *tconn);
+enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr);
+extern void drbd_minor_destroy(struct kref *kref);
+
+extern int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts);
+extern struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts);
+extern void conn_destroy(struct kref *kref);
+struct drbd_tconn *conn_get_by_name(const char *name);
+extern struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len);
+extern void conn_free_crypto(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
+extern void __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern void drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
@@ -1582,10 +1426,11 @@
/* drbd_nl.c */
+extern int drbd_msg_put_info(const char *info);
extern void drbd_suspend_io(struct drbd_conf *mdev);
extern void drbd_resume_io(struct drbd_conf *mdev);
extern char *ppsize(char *buf, unsigned long long size);
-extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, int);
+extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
extern void resync_after_online_grow(struct drbd_conf *);
@@ -1593,13 +1438,14 @@
extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
enum drbd_role new_role,
int force);
-extern enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev);
-extern void drbd_try_outdate_peer_async(struct drbd_conf *mdev);
+extern bool conn_try_outdate_peer(struct drbd_tconn *tconn);
+extern void conn_try_outdate_peer_async(struct drbd_tconn *tconn);
extern int drbd_khelper(struct drbd_conf *mdev, char *cmd);
/* drbd_worker.c */
extern int drbd_worker(struct drbd_thread *thi);
-extern int drbd_alter_sa(struct drbd_conf *mdev, int na);
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor);
+void drbd_resync_after_changed(struct drbd_conf *mdev);
extern void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side);
extern void resume_next_sg(struct drbd_conf *mdev);
extern void suspend_other_sg(struct drbd_conf *mdev);
@@ -1608,13 +1454,13 @@
extern void *drbd_md_get_buffer(struct drbd_conf *mdev);
extern void drbd_md_put_buffer(struct drbd_conf *mdev);
extern int drbd_md_sync_page_io(struct drbd_conf *mdev,
- struct drbd_backing_dev *bdev, sector_t sector, int rw);
+ struct drbd_backing_dev *bdev, sector_t sector, int rw);
+extern void drbd_ov_out_of_sync_found(struct drbd_conf *, sector_t, int);
extern void wait_until_done_or_force_detached(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev, unsigned int *done);
-extern void drbd_ov_oos_found(struct drbd_conf*, sector_t, int);
extern void drbd_rs_controller_reset(struct drbd_conf *mdev);
-static inline void ov_oos_print(struct drbd_conf *mdev)
+static inline void ov_out_of_sync_print(struct drbd_conf *mdev)
{
if (mdev->ov_last_oos_size) {
dev_err(DEV, "Out of sync: start=%llu, size=%lu (sectors)\n",
@@ -1626,97 +1472,102 @@
extern void drbd_csum_bio(struct drbd_conf *, struct crypto_hash *, struct bio *, void *);
-extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *, struct drbd_epoch_entry *, void *);
+extern void drbd_csum_ee(struct drbd_conf *, struct crypto_hash *,
+ struct drbd_peer_request *, void *);
/* worker callbacks */
-extern int w_req_cancel_conflict(struct drbd_conf *, struct drbd_work *, int);
-extern int w_read_retry_remote(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_data_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_rsdata_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_csum_rs_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_reply(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_end_ov_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_ov_finished(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resync_timer(struct drbd_conf *, struct drbd_work *, int);
-extern int w_resume_next_sg(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_write_hint(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_dblock(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_barrier(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_read_req(struct drbd_conf *, struct drbd_work *, int);
-extern int w_prev_work_done(struct drbd_conf *, struct drbd_work *, int);
-extern int w_e_reissue(struct drbd_conf *, struct drbd_work *, int);
-extern int w_restart_disk_io(struct drbd_conf *, struct drbd_work *, int);
-extern int w_send_oos(struct drbd_conf *, struct drbd_work *, int);
-extern int w_start_resync(struct drbd_conf *, struct drbd_work *, int);
+extern int w_e_end_data_req(struct drbd_work *, int);
+extern int w_e_end_rsdata_req(struct drbd_work *, int);
+extern int w_e_end_csum_rs_req(struct drbd_work *, int);
+extern int w_e_end_ov_reply(struct drbd_work *, int);
+extern int w_e_end_ov_req(struct drbd_work *, int);
+extern int w_ov_finished(struct drbd_work *, int);
+extern int w_resync_timer(struct drbd_work *, int);
+extern int w_send_write_hint(struct drbd_work *, int);
+extern int w_make_resync_request(struct drbd_work *, int);
+extern int w_send_dblock(struct drbd_work *, int);
+extern int w_send_read_req(struct drbd_work *, int);
+extern int w_prev_work_done(struct drbd_work *, int);
+extern int w_e_reissue(struct drbd_work *, int);
+extern int w_restart_disk_io(struct drbd_work *, int);
+extern int w_send_out_of_sync(struct drbd_work *, int);
+extern int w_start_resync(struct drbd_work *, int);
extern void resync_timer_fn(unsigned long data);
extern void start_resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
-extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type);
-extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
-extern struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local);
-extern void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- int is_net);
-#define drbd_free_ee(m,e) drbd_free_some_ee(m, e, 0)
-#define drbd_free_net_ee(m,e) drbd_free_some_ee(m, e, 1)
-extern void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
-extern void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
- struct list_head *head);
+extern int drbd_submit_peer_request(struct drbd_conf *,
+ struct drbd_peer_request *, const unsigned,
+ const int);
+extern int drbd_free_peer_reqs(struct drbd_conf *, struct list_head *);
+extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_conf *, u64,
+ sector_t, unsigned int,
+ gfp_t) __must_hold(local);
+extern void __drbd_free_peer_req(struct drbd_conf *, struct drbd_peer_request *,
+ int);
+#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
+#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
+extern struct page *drbd_alloc_pages(struct drbd_conf *, unsigned int, bool);
extern void drbd_set_recv_tcq(struct drbd_conf *mdev, int tcq_enabled);
extern void _drbd_clear_done_ee(struct drbd_conf *mdev, struct list_head *to_be_freed);
-extern void drbd_flush_workqueue(struct drbd_conf *mdev);
-extern void drbd_free_tl_hash(struct drbd_conf *mdev);
-
-/* yes, there is kernel_setsockopt, but only since 2.6.18. we don't need to
- * mess with get_fs/set_fs, we know we are KERNEL_DS always. */
-static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
- char __user *optval, int optlen)
+extern void conn_flush_workqueue(struct drbd_tconn *tconn);
+extern int drbd_connected(struct drbd_conf *mdev);
+static inline void drbd_flush_workqueue(struct drbd_conf *mdev)
{
+ conn_flush_workqueue(mdev->tconn);
+}
+
+/* Yes, there is kernel_setsockopt, but only since 2.6.18.
+ * So we have our own copy of it here. */
+static inline int drbd_setsockopt(struct socket *sock, int level, int optname,
+ char *optval, int optlen)
+{
+ mm_segment_t oldfs = get_fs();
+ char __user *uoptval;
int err;
+
+ uoptval = (char __user __force *)optval;
+
+ set_fs(KERNEL_DS);
if (level == SOL_SOCKET)
- err = sock_setsockopt(sock, level, optname, optval, optlen);
+ err = sock_setsockopt(sock, level, optname, uoptval, optlen);
else
- err = sock->ops->setsockopt(sock, level, optname, optval,
+ err = sock->ops->setsockopt(sock, level, optname, uoptval,
optlen);
+ set_fs(oldfs);
return err;
}
static inline void drbd_tcp_cork(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_uncork(struct socket *sock)
{
- int __user val = 0;
+ int val = 0;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_CORK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_nodelay(struct socket *sock)
{
- int __user val = 1;
+ int val = 1;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_NODELAY,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
static inline void drbd_tcp_quickack(struct socket *sock)
{
- int __user val = 2;
+ int val = 2;
(void) drbd_setsockopt(sock, SOL_TCP, TCP_QUICKACK,
- (char __user *)&val, sizeof(val));
+ (char*)&val, sizeof(val));
}
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo);
+void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo);
/* drbd_proc.c */
extern struct proc_dir_entry *drbd_proc;
@@ -1725,8 +1576,8 @@
extern const char *drbd_role_str(enum drbd_role s);
/* drbd_actlog.c */
-extern void drbd_al_begin_io(struct drbd_conf *mdev, sector_t sector);
-extern void drbd_al_complete_io(struct drbd_conf *mdev, sector_t sector);
+extern void drbd_al_begin_io(struct drbd_conf *mdev, struct drbd_interval *i);
+extern void drbd_al_complete_io(struct drbd_conf *mdev, struct drbd_interval *i);
extern void drbd_rs_complete_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
extern int drbd_try_rs_begin_io(struct drbd_conf *mdev, sector_t sector);
@@ -1734,7 +1585,6 @@
extern int drbd_rs_del_all(struct drbd_conf *mdev);
extern void drbd_rs_failed_io(struct drbd_conf *mdev,
sector_t sector, int size);
-extern int drbd_al_read_log(struct drbd_conf *mdev, struct drbd_backing_dev *);
extern void drbd_advance_rs_marks(struct drbd_conf *mdev, unsigned long still_to_go);
extern void __drbd_set_in_sync(struct drbd_conf *mdev, sector_t sector,
int size, const char *file, const unsigned int line);
@@ -1744,73 +1594,24 @@
int size, const char *file, const unsigned int line);
#define drbd_set_out_of_sync(mdev, sector, size) \
__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
-extern void drbd_al_apply_to_bm(struct drbd_conf *mdev);
extern void drbd_al_shrink(struct drbd_conf *mdev);
-
/* drbd_nl.c */
-
-void drbd_nl_cleanup(void);
-int __init drbd_nl_init(void);
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state);
-void drbd_bcast_sync_progress(struct drbd_conf *mdev);
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e);
-
-
-/**
- * DOC: DRBD State macros
- *
- * These macros are used to express state changes in easily readable form.
- *
- * The NS macros expand to a mask and a value, that can be bit ored onto the
- * current state as soon as the spinlock (req_lock) was taken.
- *
- * The _NS macros are used for state functions that get called with the
- * spinlock. These macros expand directly to the new state value.
- *
- * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
- * to express state changes that affect more than one aspect of the state.
- *
- * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
- * Means that the network connection was established and that the peer
- * is in secondary role.
- */
-#define role_MASK R_MASK
-#define peer_MASK R_MASK
-#define disk_MASK D_MASK
-#define pdsk_MASK D_MASK
-#define conn_MASK C_MASK
-#define susp_MASK 1
-#define user_isp_MASK 1
-#define aftr_isp_MASK 1
-#define susp_nod_MASK 1
-#define susp_fen_MASK 1
-
-#define NS(T, S) \
- ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T = (S); val; })
-#define NS2(T1, S1, T2, S2) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val; })
-#define NS3(T1, S1, T2, S2, T3, S3) \
- ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
- mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
- ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
- val.T2 = (S2); val.T3 = (S3); val; })
-
-#define _NS(D, T, S) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T = (S); __ns; })
-#define _NS2(D, T1, S1, T2, S2) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns; })
-#define _NS3(D, T1, S1, T2, S2, T3, S3) \
- D, ({ union drbd_state __ns; __ns.i = D->state.i; __ns.T1 = (S1); \
- __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+/* state info broadcast */
+struct sib_info {
+ enum drbd_state_info_bcast_reason sib_reason;
+ union {
+ struct {
+ char *helper_name;
+ unsigned helper_exit_code;
+ };
+ struct {
+ union drbd_state os;
+ union drbd_state ns;
+ };
+ };
+};
+void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib);
/*
* inline helper functions
@@ -1827,9 +1628,10 @@
#define page_chain_for_each_safe(page, n) \
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
-static inline int drbd_ee_has_active_page(struct drbd_epoch_entry *e)
+
+static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
page_chain_for_each(page) {
if (page_count(page) > 1)
return 1;
@@ -1837,18 +1639,6 @@
return 0;
}
-static inline void drbd_state_lock(struct drbd_conf *mdev)
-{
- wait_event(mdev->misc_wait,
- !drbd_test_and_set_flag(mdev, CLUSTER_ST_CHANGE));
-}
-
-static inline void drbd_state_unlock(struct drbd_conf *mdev)
-{
- drbd_clear_flag(mdev, CLUSTER_ST_CHANGE);
- wake_up(&mdev->misc_wait);
-}
-
static inline enum drbd_state_rv
_drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
enum chg_state_flags flags, struct completion *done)
@@ -1862,21 +1652,16 @@
return rv;
}
-/**
- * drbd_request_state() - Reqest a state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- *
- * This is the most graceful way of requesting a state change. It is verbose
- * quite verbose in case the state change is not possible, and all those
- * state changes are globally serialized.
- */
-static inline int drbd_request_state(struct drbd_conf *mdev,
- union drbd_state mask,
- union drbd_state val)
+static inline union drbd_state drbd_read_state(struct drbd_conf *mdev)
{
- return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+ union drbd_state rv;
+
+ rv.i = mdev->state.i;
+ rv.susp = mdev->tconn->susp;
+ rv.susp_nod = mdev->tconn->susp_nod;
+ rv.susp_fen = mdev->tconn->susp_fen;
+
+ return rv;
}
enum drbd_force_detach_flags {
@@ -1891,8 +1676,13 @@
enum drbd_force_detach_flags df,
const char *where)
{
- switch (mdev->ldev->dc.on_io_error) {
- case EP_PASS_ON:
+ enum drbd_io_error_p ep;
+
+ rcu_read_lock();
+ ep = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+ switch (ep) {
+ case EP_PASS_ON: /* FIXME would this be better named "Ignore"? */
if (df == DRBD_READ_ERROR || df == DRBD_WRITE_ERROR) {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Local IO failed in %s.\n", where);
@@ -1923,11 +1713,11 @@
* we read meta data only once during attach,
* which will fail in case of errors.
*/
- drbd_set_flag(mdev, WAS_IO_ERROR);
+ set_bit(WAS_IO_ERROR, &mdev->flags);
if (df == DRBD_READ_ERROR)
- drbd_set_flag(mdev, WAS_READ_ERROR);
+ set_bit(WAS_READ_ERROR, &mdev->flags);
if (df == DRBD_FORCE_DETACH)
- drbd_set_flag(mdev, FORCE_DETACH);
+ set_bit(FORCE_DETACH, &mdev->flags);
if (mdev->state.disk > D_FAILED) {
_drbd_set_state(_NS(mdev, disk, D_FAILED), CS_HARD, NULL);
dev_err(DEV,
@@ -1951,9 +1741,9 @@
{
if (error) {
unsigned long flags;
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__drbd_chk_io_error_(mdev, forcedetach, where);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
}
}
@@ -1965,9 +1755,9 @@
* BTW, for internal meta data, this happens to be the maximum capacity
* we could agree upon with our peer node.
*/
-static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+static inline sector_t _drbd_md_first_sector(int meta_dev_idx, struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + bdev->md.bm_offset;
@@ -1977,13 +1767,30 @@
}
}
+static inline sector_t drbd_md_first_sector(struct drbd_backing_dev *bdev)
+{
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ return _drbd_md_first_sector(meta_dev_idx, bdev);
+}
+
/**
* drbd_md_last_sector() - Return the last sector number of the meta data area
* @bdev: Meta data block device.
*/
static inline sector_t drbd_md_last_sector(struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
return bdev->md.md_offset + MD_AL_OFFSET - 1;
@@ -2011,12 +1818,18 @@
static inline sector_t drbd_get_max_capacity(struct drbd_backing_dev *bdev)
{
sector_t s;
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
case DRBD_MD_INDEX_INTERNAL:
case DRBD_MD_INDEX_FLEX_INT:
s = drbd_get_capacity(bdev->backing_bdev)
? min_t(sector_t, DRBD_MAX_SECTORS_FLEX,
- drbd_md_first_sector(bdev))
+ _drbd_md_first_sector(meta_dev_idx, bdev))
: 0;
break;
case DRBD_MD_INDEX_FLEX_EXT:
@@ -2042,9 +1855,15 @@
static inline sector_t drbd_md_ss__(struct drbd_conf *mdev,
struct drbd_backing_dev *bdev)
{
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+ rcu_read_unlock();
+
+ switch (meta_dev_idx) {
default: /* external, some index */
- return MD_RESERVED_SECT * bdev->dc.meta_dev_idx;
+ return MD_RESERVED_SECT * meta_dev_idx;
case DRBD_MD_INDEX_INTERNAL:
/* with drbd08, internal meta data is always "flexible" */
case DRBD_MD_INDEX_FLEX_INT:
@@ -2070,9 +1889,8 @@
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
static inline void
@@ -2081,41 +1899,35 @@
unsigned long flags;
spin_lock_irqsave(&q->q_lock, flags);
list_add_tail(&w->list, &q->q);
- up(&q->s); /* within the spinlock,
- see comment near end of drbd_worker() */
spin_unlock_irqrestore(&q->q_lock, flags);
+ wake_up(&q->q_wait);
}
-static inline void wake_asender(struct drbd_conf *mdev)
+static inline void wake_asender(struct drbd_tconn *tconn)
{
- if (drbd_test_flag(mdev, SIGNAL_ASENDER))
- force_sig(DRBD_SIG, mdev->asender.task);
+ if (test_bit(SIGNAL_ASENDER, &tconn->flags))
+ force_sig(DRBD_SIG, tconn->asender.task);
}
-static inline void request_ping(struct drbd_conf *mdev)
+static inline void request_ping(struct drbd_tconn *tconn)
{
- drbd_set_flag(mdev, SEND_PING);
- wake_asender(mdev);
+ set_bit(SEND_PING, &tconn->flags);
+ wake_asender(tconn);
}
-static inline int drbd_send_short_cmd(struct drbd_conf *mdev,
- enum drbd_packets cmd)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd, &h, sizeof(h));
-}
+extern void *conn_prepare_command(struct drbd_tconn *, struct drbd_socket *);
+extern void *drbd_prepare_command(struct drbd_conf *, struct drbd_socket *);
+extern int conn_send_command(struct drbd_tconn *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
+extern int drbd_send_command(struct drbd_conf *, struct drbd_socket *,
+ enum drbd_packet, unsigned int, void *,
+ unsigned int);
-static inline int drbd_send_ping(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING, &h, sizeof(h));
-}
-
-static inline int drbd_send_ping_ack(struct drbd_conf *mdev)
-{
- struct p_header80 h;
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_PING_ACK, &h, sizeof(h));
-}
+extern int drbd_send_ping(struct drbd_tconn *tconn);
+extern int drbd_send_ping_ack(struct drbd_tconn *tconn);
+extern int drbd_send_state_req(struct drbd_conf *, union drbd_state, union drbd_state);
+extern int conn_send_state_req(struct drbd_tconn *, union drbd_state, union drbd_state);
static inline void drbd_thread_stop(struct drbd_thread *thi)
{
@@ -2137,21 +1949,21 @@
* or implicit barrier packets as necessary.
* increased:
* w_send_barrier
- * _req_mod(req, queue_for_net_write or queue_for_net_read);
+ * _req_mod(req, QUEUE_FOR_NET_WRITE or QUEUE_FOR_NET_READ);
* it is much easier and equally valid to count what we queue for the
* worker, even before it actually was queued or send.
* (drbd_make_request_common; recovery path on read io-error)
* decreased:
* got_BarrierAck (respective tl_clear, tl_clear_barrier)
- * _req_mod(req, data_received)
+ * _req_mod(req, DATA_RECEIVED)
* [from receive_DataReply]
- * _req_mod(req, write_acked_by_peer or recv_acked_by_peer or neg_acked)
+ * _req_mod(req, WRITE_ACKED_BY_PEER or RECV_ACKED_BY_PEER or NEG_ACKED)
* [from got_BlockAck (P_WRITE_ACK, P_RECV_ACK)]
* for some reason it is NOT decreased in got_NegAck,
* but in the resulting cleanup code from report_params.
* we should try to remember the reason for that...
- * _req_mod(req, send_failed or send_canceled)
- * _req_mod(req, connection_lost_while_pending)
+ * _req_mod(req, SEND_FAILED or SEND_CANCELED)
+ * _req_mod(req, CONNECTION_LOST_WHILE_PENDING)
* [from tl_clear_barrier]
*/
static inline void inc_ap_pending(struct drbd_conf *mdev)
@@ -2159,17 +1971,19 @@
atomic_inc(&mdev->ap_pending_cnt);
}
-#define ERR_IF_CNT_IS_NEGATIVE(which) \
- if (atomic_read(&mdev->which) < 0) \
+#define ERR_IF_CNT_IS_NEGATIVE(which, func, line) \
+ if (atomic_read(&mdev->which) < 0) \
dev_err(DEV, "in %s:%d: " #which " = %d < 0 !\n", \
- __func__ , __LINE__ , \
- atomic_read(&mdev->which))
+ func, line, \
+ atomic_read(&mdev->which))
-#define dec_ap_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- if (atomic_dec_and_test(&mdev->ap_pending_cnt)) \
- wake_up(&mdev->misc_wait); \
- ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt); } while (0)
+#define dec_ap_pending(mdev) _dec_ap_pending(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_ap_pending(struct drbd_conf *mdev, const char *func, int line)
+{
+ if (atomic_dec_and_test(&mdev->ap_pending_cnt))
+ wake_up(&mdev->misc_wait);
+ ERR_IF_CNT_IS_NEGATIVE(ap_pending_cnt, func, line);
+}
/* counts how many resync-related answers we still expect from the peer
* increase decrease
@@ -2182,10 +1996,12 @@
atomic_inc(&mdev->rs_pending_cnt);
}
-#define dec_rs_pending(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->rs_pending_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt); } while (0)
+#define dec_rs_pending(mdev) _dec_rs_pending(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_rs_pending(struct drbd_conf *mdev, const char *func, int line)
+{
+ atomic_dec(&mdev->rs_pending_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(rs_pending_cnt, func, line);
+}
/* counts how many answers we still need to send to the peer.
* increased on
@@ -2201,38 +2017,18 @@
atomic_inc(&mdev->unacked_cnt);
}
-#define dec_unacked(mdev) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_dec(&mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-#define sub_unacked(mdev, n) do { \
- typecheck(struct drbd_conf *, mdev); \
- atomic_sub(n, &mdev->unacked_cnt); \
- ERR_IF_CNT_IS_NEGATIVE(unacked_cnt); } while (0)
-
-
-static inline void put_net_conf(struct drbd_conf *mdev)
+#define dec_unacked(mdev) _dec_unacked(mdev, __FUNCTION__, __LINE__)
+static inline void _dec_unacked(struct drbd_conf *mdev, const char *func, int line)
{
- if (atomic_dec_and_test(&mdev->net_cnt))
- wake_up(&mdev->net_cnt_wait);
+ atomic_dec(&mdev->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
-/**
- * get_net_conf() - Increase ref count on mdev->net_conf; Returns 0 if nothing there
- * @mdev: DRBD device.
- *
- * You have to call put_net_conf() when finished working with mdev->net_conf.
- */
-static inline int get_net_conf(struct drbd_conf *mdev)
+#define sub_unacked(mdev, n) _sub_unacked(mdev, n, __FUNCTION__, __LINE__)
+static inline void _sub_unacked(struct drbd_conf *mdev, int n, const char *func, int line)
{
- int have_net_conf;
-
- atomic_inc(&mdev->net_cnt);
- have_net_conf = mdev->state.conn >= C_UNCONNECTED;
- if (!have_net_conf)
- put_net_conf(mdev);
- return have_net_conf;
+ atomic_sub(n, &mdev->unacked_cnt);
+ ERR_IF_CNT_IS_NEGATIVE(unacked_cnt, func, line);
}
/**
@@ -2336,17 +2132,20 @@
* maybe re-implement using semaphores? */
static inline int drbd_get_max_buffers(struct drbd_conf *mdev)
{
- int mxb = 1000000; /* arbitrary limit on open requests */
- if (get_net_conf(mdev)) {
- mxb = mdev->net_conf->max_buffers;
- put_net_conf(mdev);
- }
+ struct net_conf *nc;
+ int mxb;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000; /* arbitrary limit on open requests */
+ rcu_read_unlock();
+
return mxb;
}
static inline int drbd_state_is_stable(struct drbd_conf *mdev)
{
- union drbd_state s = mdev->state;
+ union drbd_dev_state s = mdev->state;
/* DO NOT add a default clause, we want the compiler to warn us
* for any newly introduced state we may have forgotten to add here */
@@ -2380,7 +2179,7 @@
/* Allow IO in BM exchange states with new protocols */
case C_WF_BITMAP_S:
- if (mdev->agreed_pro_version < 96)
+ if (mdev->tconn->agreed_pro_version < 96)
return 0;
break;
@@ -2402,7 +2201,7 @@
/* disk state is stable as well. */
break;
- /* no new io accepted during tansitional states */
+ /* no new io accepted during transitional states */
case D_ATTACHING:
case D_NEGOTIATING:
case D_UNKNOWN:
@@ -2414,18 +2213,20 @@
return 1;
}
-static inline int is_susp(union drbd_state s)
+static inline int drbd_suspended(struct drbd_conf *mdev)
{
- return s.susp || s.susp_nod || s.susp_fen;
+ struct drbd_tconn *tconn = mdev->tconn;
+
+ return tconn->susp || tconn->susp_fen || tconn->susp_nod;
}
static inline bool may_inc_ap_bio(struct drbd_conf *mdev)
{
int mxb = drbd_get_max_buffers(mdev);
- if (is_susp(mdev->state))
+ if (drbd_suspended(mdev))
return false;
- if (drbd_test_flag(mdev, SUSPEND_IO))
+ if (test_bit(SUSPEND_IO, &mdev->flags))
return false;
/* to avoid potential deadlock or bitmap corruption,
@@ -2440,35 +2241,35 @@
* and we are within the spinlock anyways, we have this workaround. */
if (atomic_read(&mdev->ap_bio_cnt) > mxb)
return false;
- if (drbd_test_flag(mdev, BITMAP_IO))
+ if (test_bit(BITMAP_IO, &mdev->flags))
return false;
return true;
}
-static inline bool inc_ap_bio_cond(struct drbd_conf *mdev, int count)
+static inline bool inc_ap_bio_cond(struct drbd_conf *mdev)
{
bool rv = false;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
rv = may_inc_ap_bio(mdev);
if (rv)
- atomic_add(count, &mdev->ap_bio_cnt);
- spin_unlock_irq(&mdev->req_lock);
+ atomic_inc(&mdev->ap_bio_cnt);
+ spin_unlock_irq(&mdev->tconn->req_lock);
return rv;
}
-static inline void inc_ap_bio(struct drbd_conf *mdev, int count)
+static inline void inc_ap_bio(struct drbd_conf *mdev)
{
/* we wait here
* as long as the device is suspended
* until the bitmap is no longer on the fly during connection
- * handshake as long as we would exeed the max_buffer limit.
+ * handshake as long as we would exceed the max_buffer limit.
*
* to avoid races with the reconnect code,
* we need to atomic_inc within the spinlock. */
- wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev, count));
+ wait_event(mdev->misc_wait, inc_ap_bio_cond(mdev));
}
static inline void dec_ap_bio(struct drbd_conf *mdev)
@@ -2478,9 +2279,9 @@
D_ASSERT(ap_bio >= 0);
- if (ap_bio == 0 && drbd_test_flag(mdev, BITMAP_IO)) {
- if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ if (ap_bio == 0 && test_bit(BITMAP_IO, &mdev->flags)) {
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
}
/* this currently does wake_up for every dec_ap_bio!
@@ -2490,6 +2291,12 @@
wake_up(&mdev->misc_wait);
}
+static inline bool verify_can_do_stop_sector(struct drbd_conf *mdev)
+{
+ return mdev->tconn->agreed_pro_version >= 97 &&
+ mdev->tconn->agreed_pro_version != 100;
+}
+
static inline int drbd_set_ed_uuid(struct drbd_conf *mdev, u64 val)
{
int changed = mdev->ed_uuid != val;
@@ -2497,40 +2304,6 @@
return changed;
}
-static inline int seq_cmp(u32 a, u32 b)
-{
- /* we assume wrap around at 32bit.
- * for wrap around at 24bit (old atomic_t),
- * we'd have to
- * a <<= 8; b <<= 8;
- */
- return (s32)(a) - (s32)(b);
-}
-#define seq_lt(a, b) (seq_cmp((a), (b)) < 0)
-#define seq_gt(a, b) (seq_cmp((a), (b)) > 0)
-#define seq_ge(a, b) (seq_cmp((a), (b)) >= 0)
-#define seq_le(a, b) (seq_cmp((a), (b)) <= 0)
-/* CAUTION: please no side effects in arguments! */
-#define seq_max(a, b) ((u32)(seq_gt((a), (b)) ? (a) : (b)))
-
-static inline void update_peer_seq(struct drbd_conf *mdev, unsigned int new_seq)
-{
- unsigned int m;
- spin_lock(&mdev->peer_seq_lock);
- m = seq_max(mdev->peer_seq, new_seq);
- mdev->peer_seq = m;
- spin_unlock(&mdev->peer_seq_lock);
- if (m == new_seq)
- wake_up(&mdev->seq_wait);
-}
-
-static inline void drbd_update_congested(struct drbd_conf *mdev)
-{
- struct sock *sk = mdev->data.socket->sk;
- if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
- drbd_set_flag(mdev, NET_CONGESTED);
-}
-
static inline int drbd_queue_order_type(struct drbd_conf *mdev)
{
/* sorry, we currently have no working implementation
@@ -2545,15 +2318,46 @@
{
int r;
- if (drbd_test_flag(mdev, MD_NO_FUA))
+ if (mdev->ldev == NULL) {
+ dev_warn(DEV, "mdev->ldev == NULL in drbd_md_flush\n");
+ return;
+ }
+
+ if (test_bit(MD_NO_FUA, &mdev->flags))
return;
r = blkdev_issue_flush(mdev->ldev->md_bdev, GFP_NOIO, NULL);
if (r) {
- drbd_set_flag(mdev, MD_NO_FUA);
+ set_bit(MD_NO_FUA, &mdev->flags);
dev_err(DEV, "meta data flush failed with status %d, disabling md-flushes\n", r);
}
}
-
#endif
+
+/* This is defined in drivers/md/md.h as well. Should go into wait.h */
+#define __wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ wait_queue_t __wait; \
+ init_waitqueue_entry(&__wait, current); \
+ \
+ add_wait_queue(&wq, &__wait); \
+ for (;;) { \
+ set_current_state(TASK_UNINTERRUPTIBLE); \
+ if (condition) \
+ break; \
+ spin_unlock_irq(&lock); \
+ cmd; \
+ schedule(); \
+ spin_lock_irq(&lock); \
+ } \
+ current->state = TASK_RUNNING; \
+ remove_wait_queue(&wq, &__wait); \
+} while (0)
+
+#define wait_event_lock_irq(wq, condition, lock, cmd) \
+do { \
+ if (condition) \
+ break; \
+ __wait_event_lock_irq(wq, condition, lock, cmd); \
+} while (0)
diff --git a/drivers/block/drbd/drbd_interval.c b/drivers/block/drbd/drbd_interval.c
new file mode 100644
index 0000000..89c497c
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.c
@@ -0,0 +1,207 @@
+#include <asm/bug.h>
+#include <linux/rbtree_augmented.h>
+#include "drbd_interval.h"
+
+/**
+ * interval_end - return end of @node
+ */
+static inline
+sector_t interval_end(struct rb_node *node)
+{
+ struct drbd_interval *this = rb_entry(node, struct drbd_interval, rb);
+ return this->end;
+}
+
+/**
+ * compute_subtree_last - compute end of @node
+ *
+ * The end of an interval is the highest (start + (size >> 9)) value of this
+ * node and of its children. Called for @node and its parents whenever the end
+ * may have changed.
+ */
+static inline sector_t
+compute_subtree_last(struct drbd_interval *node)
+{
+ sector_t max = node->sector + (node->size >> 9);
+
+ if (node->rb.rb_left) {
+ sector_t left = interval_end(node->rb.rb_left);
+ if (left > max)
+ max = left;
+ }
+ if (node->rb.rb_right) {
+ sector_t right = interval_end(node->rb.rb_right);
+ if (right > max)
+ max = right;
+ }
+ return max;
+}
+
+static void augment_propagate(struct rb_node *rb, struct rb_node *stop)
+{
+ while (rb != stop) {
+ struct drbd_interval *node = rb_entry(rb, struct drbd_interval, rb);
+ sector_t subtree_last = compute_subtree_last(node);
+ if (node->end == subtree_last)
+ break;
+ node->end = subtree_last;
+ rb = rb_parent(&node->rb);
+ }
+}
+
+static void augment_copy(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+}
+
+static void augment_rotate(struct rb_node *rb_old, struct rb_node *rb_new)
+{
+ struct drbd_interval *old = rb_entry(rb_old, struct drbd_interval, rb);
+ struct drbd_interval *new = rb_entry(rb_new, struct drbd_interval, rb);
+
+ new->end = old->end;
+ old->end = compute_subtree_last(old);
+}
+
+static const struct rb_augment_callbacks augment_callbacks = {
+ augment_propagate,
+ augment_copy,
+ augment_rotate,
+};
+
+/**
+ * drbd_insert_interval - insert a new interval into a tree
+ */
+bool
+drbd_insert_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ struct rb_node **new = &root->rb_node, *parent = NULL;
+
+ BUG_ON(!IS_ALIGNED(this->size, 512));
+
+ while (*new) {
+ struct drbd_interval *here =
+ rb_entry(*new, struct drbd_interval, rb);
+
+ parent = *new;
+ if (this->sector < here->sector)
+ new = &(*new)->rb_left;
+ else if (this->sector > here->sector)
+ new = &(*new)->rb_right;
+ else if (this < here)
+ new = &(*new)->rb_left;
+ else if (this > here)
+ new = &(*new)->rb_right;
+ else
+ return false;
+ }
+
+ rb_link_node(&this->rb, parent, new);
+ rb_insert_augmented(&this->rb, root, &augment_callbacks);
+ return true;
+}
+
+/**
+ * drbd_contains_interval - check if a tree contains a given interval
+ * @sector: start sector of @interval
+ * @interval: may not be a valid pointer
+ *
+ * Returns if the tree contains the node @interval with start sector @start.
+ * Does not dereference @interval until @interval is known to be a valid object
+ * in @tree. Returns %false if @interval is in the tree but with a different
+ * sector number.
+ */
+bool
+drbd_contains_interval(struct rb_root *root, sector_t sector,
+ struct drbd_interval *interval)
+{
+ struct rb_node *node = root->rb_node;
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (sector < here->sector)
+ node = node->rb_left;
+ else if (sector > here->sector)
+ node = node->rb_right;
+ else if (interval < here)
+ node = node->rb_left;
+ else if (interval > here)
+ node = node->rb_right;
+ else
+ return true;
+ }
+ return false;
+}
+
+/**
+ * drbd_remove_interval - remove an interval from a tree
+ */
+void
+drbd_remove_interval(struct rb_root *root, struct drbd_interval *this)
+{
+ rb_erase_augmented(&this->rb, root, &augment_callbacks);
+}
+
+/**
+ * drbd_find_overlap - search for an interval overlapping with [sector, sector + size)
+ * @sector: start sector
+ * @size: size, aligned to 512 bytes
+ *
+ * Returns an interval overlapping with [sector, sector + size), or NULL if
+ * there is none. When there is more than one overlapping interval in the
+ * tree, the interval with the lowest start sector is returned, and all other
+ * overlapping intervals will be on the right side of the tree, reachable with
+ * rb_next().
+ */
+struct drbd_interval *
+drbd_find_overlap(struct rb_root *root, sector_t sector, unsigned int size)
+{
+ struct rb_node *node = root->rb_node;
+ struct drbd_interval *overlap = NULL;
+ sector_t end = sector + (size >> 9);
+
+ BUG_ON(!IS_ALIGNED(size, 512));
+
+ while (node) {
+ struct drbd_interval *here =
+ rb_entry(node, struct drbd_interval, rb);
+
+ if (node->rb_left &&
+ sector < interval_end(node->rb_left)) {
+ /* Overlap if any must be on left side */
+ node = node->rb_left;
+ } else if (here->sector < end &&
+ sector < here->sector + (here->size >> 9)) {
+ overlap = here;
+ break;
+ } else if (sector >= here->sector) {
+ /* Overlap if any must be on right side */
+ node = node->rb_right;
+ } else
+ break;
+ }
+ return overlap;
+}
+
+struct drbd_interval *
+drbd_next_overlap(struct drbd_interval *i, sector_t sector, unsigned int size)
+{
+ sector_t end = sector + (size >> 9);
+ struct rb_node *node;
+
+ for (;;) {
+ node = rb_next(&i->rb);
+ if (!node)
+ return NULL;
+ i = rb_entry(node, struct drbd_interval, rb);
+ if (i->sector >= end)
+ return NULL;
+ if (sector < i->sector + (i->size >> 9))
+ return i;
+ }
+}
diff --git a/drivers/block/drbd/drbd_interval.h b/drivers/block/drbd/drbd_interval.h
new file mode 100644
index 0000000..f38fcb0
--- /dev/null
+++ b/drivers/block/drbd/drbd_interval.h
@@ -0,0 +1,40 @@
+#ifndef __DRBD_INTERVAL_H
+#define __DRBD_INTERVAL_H
+
+#include <linux/types.h>
+#include <linux/rbtree.h>
+
+struct drbd_interval {
+ struct rb_node rb;
+ sector_t sector; /* start sector of the interval */
+ unsigned int size; /* size in bytes */
+ sector_t end; /* highest interval end in subtree */
+ int local:1 /* local or remote request? */;
+ int waiting:1;
+};
+
+static inline void drbd_clear_interval(struct drbd_interval *i)
+{
+ RB_CLEAR_NODE(&i->rb);
+}
+
+static inline bool drbd_interval_empty(struct drbd_interval *i)
+{
+ return RB_EMPTY_NODE(&i->rb);
+}
+
+extern bool drbd_insert_interval(struct rb_root *, struct drbd_interval *);
+extern bool drbd_contains_interval(struct rb_root *, sector_t,
+ struct drbd_interval *);
+extern void drbd_remove_interval(struct rb_root *, struct drbd_interval *);
+extern struct drbd_interval *drbd_find_overlap(struct rb_root *, sector_t,
+ unsigned int);
+extern struct drbd_interval *drbd_next_overlap(struct drbd_interval *, sector_t,
+ unsigned int);
+
+#define drbd_for_each_overlap(i, root, sector, size) \
+ for (i = drbd_find_overlap(root, sector, size); \
+ i; \
+ i = drbd_next_overlap(i, sector, size))
+
+#endif /* __DRBD_INTERVAL_H */
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 9b833e0..52de26d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -56,14 +56,6 @@
#include "drbd_vli.h"
-struct after_state_chg_work {
- struct drbd_work w;
- union drbd_state os;
- union drbd_state ns;
- enum chg_state_flags flags;
- struct completion *done;
-};
-
static DEFINE_MUTEX(drbd_main_mutex);
int drbdd_init(struct drbd_thread *);
int drbd_worker(struct drbd_thread *);
@@ -72,21 +64,17 @@
int drbd_init(void);
static int drbd_open(struct block_device *bdev, fmode_t mode);
static int drbd_release(struct gendisk *gd, fmode_t mode);
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags);
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static int w_md_sync(struct drbd_work *w, int unused);
static void md_sync_timer_fn(unsigned long data);
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
-static void _tl_clear(struct drbd_conf *mdev);
+static int w_bitmap_io(struct drbd_work *w, int unused);
+static int w_go_diskless(struct drbd_work *w, int unused);
MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
"Lars Ellenberg <lars@linbit.com>");
MODULE_DESCRIPTION("drbd - Distributed Replicated Block Device v" REL_VERSION);
MODULE_VERSION(REL_VERSION);
MODULE_LICENSE("GPL");
-MODULE_PARM_DESC(minor_count, "Maximum number of drbd devices ("
+MODULE_PARM_DESC(minor_count, "Approximate number of drbd devices ("
__stringify(DRBD_MINOR_COUNT_MIN) "-" __stringify(DRBD_MINOR_COUNT_MAX) ")");
MODULE_ALIAS_BLOCKDEV_MAJOR(DRBD_MAJOR);
@@ -98,7 +86,6 @@
module_param(minor_count, uint, 0444);
module_param(disable_sendpage, bool, 0644);
module_param(allow_oos, bool, 0);
-module_param(cn_idx, uint, 0444);
module_param(proc_details, int, 0644);
#ifdef CONFIG_DRBD_FAULT_INJECTION
@@ -120,7 +107,6 @@
unsigned int minor_count = DRBD_MINOR_COUNT_DEF;
bool disable_sendpage;
bool allow_oos;
-unsigned int cn_idx = CN_IDX_DRBD;
int proc_details; /* Detail level in proc drbd*/
/* Module parameter for setting the user mode helper program
@@ -132,10 +118,11 @@
/* in 2.6.x, our device mapping and config info contains our virtual gendisks
* as member "struct gendisk *vdisk;"
*/
-struct drbd_conf **minor_table;
+struct idr minors;
+struct list_head drbd_tconns; /* list of struct drbd_tconn */
struct kmem_cache *drbd_request_cache;
-struct kmem_cache *drbd_ee_cache; /* epoch entries */
+struct kmem_cache *drbd_ee_cache; /* peer requests */
struct kmem_cache *drbd_bm_ext_cache; /* bitmap extents */
struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t *drbd_request_mempool;
@@ -164,10 +151,15 @@
struct bio *bio_alloc_drbd(gfp_t gfp_mask)
{
+ struct bio *bio;
+
if (!drbd_md_io_bio_set)
return bio_alloc(gfp_mask, 1);
- return bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ bio = bio_alloc_bioset(gfp_mask, 1, drbd_md_io_bio_set);
+ if (!bio)
+ return NULL;
+ return bio;
}
#ifdef __CHECKER__
@@ -190,158 +182,87 @@
#endif
/**
- * DOC: The transfer log
- *
- * The transfer log is a single linked list of &struct drbd_tl_epoch objects.
- * mdev->newest_tle points to the head, mdev->oldest_tle points to the tail
- * of the list. There is always at least one &struct drbd_tl_epoch object.
- *
- * Each &struct drbd_tl_epoch has a circular double linked list of requests
- * attached.
- */
-static int tl_init(struct drbd_conf *mdev)
-{
- struct drbd_tl_epoch *b;
-
- /* during device minor initialization, we may well use GFP_KERNEL */
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_KERNEL);
- if (!b)
- return 0;
- INIT_LIST_HEAD(&b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->next = NULL;
- b->br_number = 4711;
- b->n_writes = 0;
- b->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
-
- mdev->oldest_tle = b;
- mdev->newest_tle = b;
- INIT_LIST_HEAD(&mdev->out_of_sequence_requests);
- INIT_LIST_HEAD(&mdev->barrier_acked_requests);
-
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-
- return 1;
-}
-
-static void tl_cleanup(struct drbd_conf *mdev)
-{
- D_ASSERT(mdev->oldest_tle == mdev->newest_tle);
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
- kfree(mdev->oldest_tle);
- mdev->oldest_tle = NULL;
- kfree(mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
-}
-
-/**
- * _tl_add_barrier() - Adds a barrier to the transfer log
- * @mdev: DRBD device.
- * @new: Barrier to be added before the current head of the TL.
- *
- * The caller must hold the req_lock.
- */
-void _tl_add_barrier(struct drbd_conf *mdev, struct drbd_tl_epoch *new)
-{
- struct drbd_tl_epoch *newest_before;
-
- INIT_LIST_HEAD(&new->requests);
- INIT_LIST_HEAD(&new->w.list);
- new->w.cb = NULL; /* if this is != NULL, we need to dec_ap_pending in tl_clear */
- new->next = NULL;
- new->n_writes = 0;
-
- newest_before = mdev->newest_tle;
- new->br_number = newest_before->br_number+1;
- if (mdev->newest_tle != new) {
- mdev->newest_tle->next = new;
- mdev->newest_tle = new;
- }
-}
-
-/**
- * tl_release() - Free or recycle the oldest &struct drbd_tl_epoch object of the TL
- * @mdev: DRBD device.
+ * tl_release() - mark as BARRIER_ACKED all requests in the corresponding transfer log epoch
+ * @tconn: DRBD connection.
* @barrier_nr: Expected identifier of the DRBD write barrier packet.
* @set_size: Expected number of requests before that barrier.
*
* In case the passed barrier_nr or set_size does not match the oldest
- * &struct drbd_tl_epoch objects this function will cause a termination
- * of the connection.
+ * epoch of not yet barrier-acked requests, this function will cause a
+ * termination of the connection.
*/
-void tl_release(struct drbd_conf *mdev, unsigned int barrier_nr,
- unsigned int set_size)
+void tl_release(struct drbd_tconn *tconn, unsigned int barrier_nr,
+ unsigned int set_size)
{
- struct drbd_tl_epoch *b, *nob; /* next old barrier */
- struct list_head *le, *tle;
struct drbd_request *r;
+ struct drbd_request *req = NULL;
+ int expect_epoch = 0;
+ int expect_size = 0;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&tconn->req_lock);
- b = mdev->oldest_tle;
+ /* find oldest not yet barrier-acked write request,
+ * count writes in its epoch. */
+ list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+ const unsigned s = r->rq_state;
+ if (!req) {
+ if (!(s & RQ_WRITE))
+ continue;
+ if (!(s & RQ_NET_MASK))
+ continue;
+ if (s & RQ_NET_DONE)
+ continue;
+ req = r;
+ expect_epoch = req->epoch;
+ expect_size ++;
+ } else {
+ if (r->epoch != expect_epoch)
+ break;
+ if (!(s & RQ_WRITE))
+ continue;
+ /* if (s & RQ_DONE): not expected */
+ /* if (!(s & RQ_NET_MASK)): not expected */
+ expect_size++;
+ }
+ }
/* first some paranoia code */
- if (b == NULL) {
- dev_err(DEV, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
- barrier_nr);
+ if (req == NULL) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, but no epoch in tl!?\n",
+ barrier_nr);
goto bail;
}
- if (b->br_number != barrier_nr) {
- dev_err(DEV, "BAD! BarrierAck #%u received, expected #%u!\n",
- barrier_nr, b->br_number);
- goto bail;
- }
- if (b->n_writes != set_size) {
- dev_err(DEV, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
- barrier_nr, set_size, b->n_writes);
+ if (expect_epoch != barrier_nr) {
+ conn_err(tconn, "BAD! BarrierAck #%u received, expected #%u!\n",
+ barrier_nr, expect_epoch);
goto bail;
}
- /* Clean up list of requests processed during current epoch */
- list_for_each_safe(le, tle, &b->requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- _req_mod(r, barrier_acked);
- }
- /* There could be requests on the list waiting for completion
- of the write to the local disk. To avoid corruptions of
- slab's data structures we have to remove the lists head.
-
- Also there could have been a barrier ack out of sequence, overtaking
- the write acks - which would be a bug and violating write ordering.
- To not deadlock in case we lose connection while such requests are
- still pending, we need some way to find them for the
- _req_mode(connection_lost_while_pending).
-
- These have been list_move'd to the out_of_sequence_requests list in
- _req_mod(, barrier_acked) above.
- */
- list_splice_init(&b->requests, &mdev->barrier_acked_requests);
-
- nob = b->next;
- if (drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
- _tl_add_barrier(mdev, b);
- if (nob)
- mdev->oldest_tle = nob;
- /* if nob == NULL b was the only barrier, and becomes the new
- barrier. Therefore mdev->oldest_tle points already to b */
- } else {
- D_ASSERT(nob != NULL);
- mdev->oldest_tle = nob;
- kfree(b);
+ if (expect_size != set_size) {
+ conn_err(tconn, "BAD! BarrierAck #%u received with n_writes=%u, expected n_writes=%u!\n",
+ barrier_nr, set_size, expect_size);
+ goto bail;
}
- spin_unlock_irq(&mdev->req_lock);
- dec_ap_pending(mdev);
+ /* Clean up list of requests processed during current epoch. */
+ /* this extra list walk restart is paranoia,
+ * to catch requests being barrier-acked "unexpectedly".
+ * It usually should find the same req again, or some READ preceding it. */
+ list_for_each_entry(req, &tconn->transfer_log, tl_requests)
+ if (req->epoch == expect_epoch)
+ break;
+ list_for_each_entry_safe_from(req, r, &tconn->transfer_log, tl_requests) {
+ if (req->epoch != expect_epoch)
+ break;
+ _req_mod(req, BARRIER_ACKED);
+ }
+ spin_unlock_irq(&tconn->req_lock);
return;
bail:
- spin_unlock_irq(&mdev->req_lock);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ spin_unlock_irq(&tconn->req_lock);
+ conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
@@ -350,85 +271,24 @@
* @mdev: DRBD device.
* @what: The action/event to perform with all request objects
*
- * @what might be one of connection_lost_while_pending, resend, fail_frozen_disk_io,
- * restart_frozen_disk_io.
+ * @what might be one of CONNECTION_LOST_WHILE_PENDING, RESEND, FAIL_FROZEN_DISK_IO,
+ * RESTART_FROZEN_DISK_IO.
*/
-static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
+/* must hold resource->req_lock */
+void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
{
- struct drbd_tl_epoch *b, *tmp, **pn;
- struct list_head *le, *tle, carry_reads;
- struct drbd_request *req;
- int rv, n_writes, n_reads;
+ struct drbd_request *req, *r;
- b = mdev->oldest_tle;
- pn = &mdev->oldest_tle;
- while (b) {
- n_writes = 0;
- n_reads = 0;
- INIT_LIST_HEAD(&carry_reads);
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- rv = _req_mod(req, what);
-
- n_writes += (rv & MR_WRITE) >> MR_WRITE_SHIFT;
- n_reads += (rv & MR_READ) >> MR_READ_SHIFT;
- }
- tmp = b->next;
-
- if (n_writes) {
- if (what == resend) {
- b->n_writes = n_writes;
- if (b->w.cb == NULL) {
- b->w.cb = w_send_barrier;
- inc_ap_pending(mdev);
- drbd_set_flag(mdev, CREATE_BARRIER);
- }
-
- drbd_queue_work(&mdev->data.work, &b->w);
- }
- pn = &b->next;
- } else {
- if (n_reads)
- list_add(&carry_reads, &b->requests);
- /* there could still be requests on that ring list,
- * in case local io is still pending */
- list_del(&b->requests);
-
- /* dec_ap_pending corresponding to queue_barrier.
- * the newest barrier may not have been queued yet,
- * in which case w.cb is still NULL. */
- if (b->w.cb != NULL)
- dec_ap_pending(mdev);
-
- if (b == mdev->newest_tle) {
- /* recycle, but reinit! */
- D_ASSERT(tmp == NULL);
- INIT_LIST_HEAD(&b->requests);
- list_splice(&carry_reads, &b->requests);
- INIT_LIST_HEAD(&b->w.list);
- b->w.cb = NULL;
- b->br_number = net_random();
- b->n_writes = 0;
-
- *pn = b;
- break;
- }
- *pn = tmp;
- kfree(b);
- }
- b = tmp;
- list_splice(&carry_reads, &b->requests);
- }
-
- /* Actions operating on the disk state, also want to work on
- requests that got barrier acked. */
-
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests)
_req_mod(req, what);
- }
}
+void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what)
+{
+ spin_lock_irq(&tconn->req_lock);
+ _tl_restart(tconn, what);
+ spin_unlock_irq(&tconn->req_lock);
+}
/**
* tl_clear() - Clears all requests and &struct drbd_tl_epoch objects out of the TL
@@ -438,43 +298,9 @@
* by the requests on the transfer gets marked as our of sync. Called from the
* receiver thread and the worker thread.
*/
-void tl_clear(struct drbd_conf *mdev)
+void tl_clear(struct drbd_tconn *tconn)
{
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- spin_unlock_irq(&mdev->req_lock);
-}
-
-static void _tl_clear(struct drbd_conf *mdev)
-{
- struct list_head *le, *tle;
- struct drbd_request *r;
-
- _tl_restart(mdev, connection_lost_while_pending);
-
- /* we expect this list to be empty. */
- D_ASSERT(list_empty(&mdev->out_of_sequence_requests));
-
- /* but just in case, clean it up anyways! */
- list_for_each_safe(le, tle, &mdev->out_of_sequence_requests) {
- r = list_entry(le, struct drbd_request, tl_requests);
- /* It would be nice to complete outside of spinlock.
- * But this is easier for now. */
- _req_mod(r, connection_lost_while_pending);
- }
-
- /* ensure bit indicating barrier is required is clear */
- drbd_clear_flag(mdev, CREATE_BARRIER);
-
- memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
-
-}
-
-void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
-{
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- spin_unlock_irq(&mdev->req_lock);
+ tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
}
/**
@@ -483,1392 +309,131 @@
*/
void tl_abort_disk_io(struct drbd_conf *mdev)
{
- struct drbd_tl_epoch *b;
- struct list_head *le, *tle;
- struct drbd_request *req;
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct drbd_request *req, *r;
- spin_lock_irq(&mdev->req_lock);
- b = mdev->oldest_tle;
- while (b) {
- list_for_each_safe(le, tle, &b->requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
- if (!(req->rq_state & RQ_LOCAL_PENDING))
- continue;
- _req_mod(req, abort_disk_io);
- }
- b = b->next;
- }
-
- list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
- req = list_entry(le, struct drbd_request, tl_requests);
+ spin_lock_irq(&tconn->req_lock);
+ list_for_each_entry_safe(req, r, &tconn->transfer_log, tl_requests) {
if (!(req->rq_state & RQ_LOCAL_PENDING))
continue;
- _req_mod(req, abort_disk_io);
+ if (req->w.mdev != mdev)
+ continue;
+ _req_mod(req, ABORT_DISK_IO);
}
-
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&tconn->req_lock);
}
-/**
- * cl_wide_st_chg() - true if the state change is a cluster wide one
- * @mdev: DRBD device.
- * @os: old (current) state.
- * @ns: new (wanted) state.
- */
-static int cl_wide_st_chg(struct drbd_conf *mdev,
- union drbd_state os, union drbd_state ns)
-{
- return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
- ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
- (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
- (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
- (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
- (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S);
-}
-
-enum drbd_state_rv
-drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
- union drbd_state mask, union drbd_state val)
-{
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, NULL);
- ns = mdev->state;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_force_state() - Impose a change which happens outside our control on our state
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- */
-void drbd_force_state(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
-{
- drbd_change_state(mdev, CS_HARD, mask, val);
-}
-
-static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
-static enum drbd_state_rv is_valid_state_transition(struct drbd_conf *,
- union drbd_state,
- union drbd_state);
-enum sanitize_state_warnings {
- NO_WARNING,
- ABORTED_ONLINE_VERIFY,
- ABORTED_RESYNC,
- CONNECTION_LOST_NEGOTIATING,
- IMPLICITLY_UPGRADED_DISK,
- IMPLICITLY_UPGRADED_PDSK,
-};
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn);
-int drbd_send_state_req(struct drbd_conf *,
- union drbd_state, union drbd_state);
-
-static enum drbd_state_rv
-_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val)
-{
- union drbd_state os, ns;
- unsigned long flags;
- enum drbd_state_rv rv;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_SUCCESS))
- return SS_CW_SUCCESS;
-
- if (drbd_test_and_clear_flag(mdev, CL_ST_CHG_FAIL))
- return SS_CW_FAILED_BY_PEER;
-
- rv = 0;
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (!cl_wide_st_chg(mdev, os, ns))
- rv = SS_CW_NO_NEED;
- if (!rv) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS) {
- rv = is_valid_state_transition(mdev, ns, os);
- if (rv == SS_SUCCESS)
- rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
- }
- }
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- return rv;
-}
-
-/**
- * drbd_req_state() - Perform an eventually cluster wide state change
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Should not be called directly, use drbd_request_state() or
- * _drbd_request_state().
- */
-static enum drbd_state_rv
-drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
-{
- struct completion done;
- unsigned long flags;
- union drbd_state os, ns;
- enum drbd_state_rv rv;
-
- init_completion(&done);
-
- if (f & CS_SERIALIZE)
- mutex_lock(&mdev->state_mutex);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- ns = sanitize_state(mdev, os, ns, NULL);
-
- if (cl_wide_st_chg(mdev, os, ns)) {
- rv = is_valid_state(mdev, ns);
- if (rv == SS_SUCCESS)
- rv = is_valid_state_transition(mdev, ns, os);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (rv < SS_SUCCESS) {
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- drbd_state_lock(mdev);
- if (!drbd_send_state_req(mdev, mask, val)) {
- drbd_state_unlock(mdev);
- rv = SS_CW_FAILED_BY_PEER;
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
-
- if (mask.conn == C_MASK && val.conn == C_DISCONNECTING)
- drbd_set_flag(mdev, DISCONNECT_SENT);
-
- wait_event(mdev->state_wait,
- (rv = _req_st_cond(mdev, mask, val)));
-
- if (rv < SS_SUCCESS) {
- drbd_state_unlock(mdev);
- if (f & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- goto abort;
- }
- spin_lock_irqsave(&mdev->req_lock, flags);
- os = mdev->state;
- ns.i = (os.i & ~mask.i) | val.i;
- rv = _drbd_set_state(mdev, ns, f, &done);
- drbd_state_unlock(mdev);
- } else {
- rv = _drbd_set_state(mdev, ns, f, &done);
- }
-
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
- D_ASSERT(current != mdev->worker.task);
- wait_for_completion(&done);
- }
-
-abort:
- if (f & CS_SERIALIZE)
- mutex_unlock(&mdev->state_mutex);
-
- return rv;
-}
-
-/**
- * _drbd_request_state() - Request a state change (with flags)
- * @mdev: DRBD device.
- * @mask: mask of state bits to change.
- * @val: value of new state bits.
- * @f: flags
- *
- * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
- * flag, or when logging of failed state change requests is not desired.
- */
-enum drbd_state_rv
-_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
- union drbd_state val, enum chg_state_flags f)
-{
- enum drbd_state_rv rv;
-
- wait_event(mdev->state_wait,
- (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
-
- return rv;
-}
-
-static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
-{
- dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c }\n",
- name,
- drbd_conn_str(ns.conn),
- drbd_role_str(ns.role),
- drbd_role_str(ns.peer),
- drbd_disk_str(ns.disk),
- drbd_disk_str(ns.pdsk),
- is_susp(ns) ? 's' : 'r',
- ns.aftr_isp ? 'a' : '-',
- ns.peer_isp ? 'p' : '-',
- ns.user_isp ? 'u' : '-'
- );
-}
-
-void print_st_err(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum drbd_state_rv err)
-{
- if (err == SS_IN_TRANSIENT_STATE)
- return;
- dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
- print_st(mdev, " state", os);
- print_st(mdev, "wanted", ns);
-}
-
-
-/**
- * is_valid_state() - Returns an SS_ error code if ns is not valid
- * @mdev: DRBD device.
- * @ns: State to consider.
- */
-static enum drbd_state_rv
-is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
-{
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- enum drbd_fencing_p fp;
- enum drbd_state_rv rv = SS_SUCCESS;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (get_net_conf(mdev)) {
- if (!mdev->net_conf->two_primaries &&
- ns.role == R_PRIMARY && ns.peer == R_PRIMARY)
- rv = SS_TWO_PRIMARIES;
- put_net_conf(mdev);
- }
-
- if (rv <= 0)
- /* already found a reason to abort */;
- else if (ns.role == R_SECONDARY && mdev->open_cnt)
- rv = SS_DEVICE_IN_USE;
-
- else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (fp >= FP_RESOURCE &&
- ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
- rv = SS_PRIMARY_NOP;
-
- else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
- rv = SS_NO_LOCAL_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
- rv = SS_NO_REMOTE_DISK;
-
- else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
- rv = SS_NO_UP_TO_DATE_DISK;
-
- else if ((ns.conn == C_CONNECTED ||
- ns.conn == C_WF_BITMAP_S ||
- ns.conn == C_SYNC_SOURCE ||
- ns.conn == C_PAUSED_SYNC_S) &&
- ns.disk == D_OUTDATED)
- rv = SS_CONNECTED_OUTDATES;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- (mdev->sync_conf.verify_alg[0] == 0))
- rv = SS_NO_VERIFY_ALG;
-
- else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- mdev->agreed_pro_version < 88)
- rv = SS_NOT_SUPPORTED;
-
- else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
- rv = SS_CONNECTED_OUTDATES;
-
- return rv;
-}
-
-/**
- * is_valid_state_transition() - Returns an SS_ error code if the state transition is not possible
- * @mdev: DRBD device.
- * @ns: new state.
- * @os: old state.
- */
-static enum drbd_state_rv
-is_valid_state_transition(struct drbd_conf *mdev, union drbd_state ns,
- union drbd_state os)
-{
- enum drbd_state_rv rv = SS_SUCCESS;
-
- if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
- os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
- rv = SS_ALREADY_STANDALONE;
-
- if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
- rv = SS_IS_DISKLESS;
-
- if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
- rv = SS_NO_NET_CONFIG;
-
- if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
- rv = SS_LOWER_THAN_OUTDATED;
-
- if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
- rv = SS_IN_TRANSIENT_STATE;
-
- if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
- rv = SS_IN_TRANSIENT_STATE;
-
- /* While establishing a connection only allow cstate to change.
- Delay/refuse role changes, detach attach etc... */
- if (drbd_test_flag(mdev, STATE_SENT) &&
- !(os.conn == C_WF_REPORT_PARAMS ||
- (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
- rv = SS_IN_TRANSIENT_STATE;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
- ns.conn != os.conn && os.conn > C_CONNECTED)
- rv = SS_RESYNC_RUNNING;
-
- if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
- os.conn < C_CONNECTED)
- rv = SS_NEED_CONNECTION;
-
- if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
- && os.conn < C_WF_REPORT_PARAMS)
- rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
-
- return rv;
-}
-
-static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
-{
- static const char *msg_table[] = {
- [NO_WARNING] = "",
- [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
- [ABORTED_RESYNC] = "Resync aborted.",
- [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
- [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
- [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
- };
-
- if (warn != NO_WARNING)
- dev_warn(DEV, "%s\n", msg_table[warn]);
-}
-
-/**
- * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @warn_sync_abort:
- *
- * When we loose connection, we have to set the state of the peers disk (pdsk)
- * to D_UNKNOWN. This rule and many more along those lines are in this function.
- */
-static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum sanitize_state_warnings *warn)
-{
- enum drbd_fencing_p fp;
- enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
-
- if (warn)
- *warn = NO_WARNING;
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Disallow Network errors to configure a device's network part */
- if ((ns.conn >= C_TIMEOUT && ns.conn <= C_TEAR_DOWN) &&
- os.conn <= C_DISCONNECTING)
- ns.conn = os.conn;
-
- /* After a network error (+C_TEAR_DOWN) only C_UNCONNECTED or C_DISCONNECTING can follow.
- * If you try to go into some Sync* state, that shall fail (elsewhere). */
- if (os.conn >= C_TIMEOUT && os.conn <= C_TEAR_DOWN &&
- ns.conn != C_UNCONNECTED && ns.conn != C_DISCONNECTING && ns.conn <= C_CONNECTED)
- ns.conn = os.conn;
-
- /* we cannot fail (again) if we already detached */
- if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
- ns.disk = D_DISKLESS;
-
- /* After C_DISCONNECTING only C_STANDALONE may follow */
- if (os.conn == C_DISCONNECTING && ns.conn != C_STANDALONE)
- ns.conn = os.conn;
-
- if (ns.conn < C_CONNECTED) {
- ns.peer_isp = 0;
- ns.peer = R_UNKNOWN;
- if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
- ns.pdsk = D_UNKNOWN;
- }
-
- /* Clear the aftr_isp when becoming unconfigured */
- if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
- ns.aftr_isp = 0;
-
- /* Abort resync if a disk fails/detaches */
- if (os.conn > C_CONNECTED && ns.conn > C_CONNECTED &&
- (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
- if (warn)
- *warn = os.conn == C_VERIFY_S || os.conn == C_VERIFY_T ?
- ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
- ns.conn = C_CONNECTED;
- }
-
- /* Connection breaks down before we finished "Negotiating" */
- if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
- get_ldev_if_state(mdev, D_NEGOTIATING)) {
- if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
- ns.disk = mdev->new_state_tmp.disk;
- ns.pdsk = mdev->new_state_tmp.pdsk;
- } else {
- if (warn)
- *warn = CONNECTION_LOST_NEGOTIATING;
- ns.disk = D_DISKLESS;
- ns.pdsk = D_UNKNOWN;
- }
- put_ldev(mdev);
- }
-
- /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
- if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
- if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
- ns.disk = D_UP_TO_DATE;
- if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
- ns.pdsk = D_UP_TO_DATE;
- }
-
- /* Implications of the connection stat on the disk states */
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_UNKNOWN;
- switch ((enum drbd_conns)ns.conn) {
- case C_WF_BITMAP_T:
- case C_PAUSED_SYNC_T:
- case C_STARTING_SYNC_T:
- case C_WF_SYNC_UUID:
- case C_BEHIND:
- disk_min = D_INCONSISTENT;
- disk_max = D_OUTDATED;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_VERIFY_S:
- case C_VERIFY_T:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_CONNECTED:
- disk_min = D_DISKLESS;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_DISKLESS;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_WF_BITMAP_S:
- case C_PAUSED_SYNC_S:
- case C_STARTING_SYNC_S:
- case C_AHEAD:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
- break;
- case C_SYNC_TARGET:
- disk_min = D_INCONSISTENT;
- disk_max = D_INCONSISTENT;
- pdsk_min = D_UP_TO_DATE;
- pdsk_max = D_UP_TO_DATE;
- break;
- case C_SYNC_SOURCE:
- disk_min = D_UP_TO_DATE;
- disk_max = D_UP_TO_DATE;
- pdsk_min = D_INCONSISTENT;
- pdsk_max = D_INCONSISTENT;
- break;
- case C_STANDALONE:
- case C_DISCONNECTING:
- case C_UNCONNECTED:
- case C_TIMEOUT:
- case C_BROKEN_PIPE:
- case C_NETWORK_FAILURE:
- case C_PROTOCOL_ERROR:
- case C_TEAR_DOWN:
- case C_WF_CONNECTION:
- case C_WF_REPORT_PARAMS:
- case C_MASK:
- break;
- }
- if (ns.disk > disk_max)
- ns.disk = disk_max;
-
- if (ns.disk < disk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_DISK;
- ns.disk = disk_min;
- }
- if (ns.pdsk > pdsk_max)
- ns.pdsk = pdsk_max;
-
- if (ns.pdsk < pdsk_min) {
- if (warn)
- *warn = IMPLICITLY_UPGRADED_PDSK;
- ns.pdsk = pdsk_min;
- }
-
- if (fp == FP_STONITH &&
- (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED) &&
- !(os.role == R_PRIMARY && os.conn < C_CONNECTED && os.pdsk > D_OUTDATED))
- ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
-
- if (mdev->sync_conf.on_no_data == OND_SUSPEND_IO &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE) &&
- !(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE))
- ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
-
- if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
- if (ns.conn == C_SYNC_SOURCE)
- ns.conn = C_PAUSED_SYNC_S;
- if (ns.conn == C_SYNC_TARGET)
- ns.conn = C_PAUSED_SYNC_T;
- } else {
- if (ns.conn == C_PAUSED_SYNC_S)
- ns.conn = C_SYNC_SOURCE;
- if (ns.conn == C_PAUSED_SYNC_T)
- ns.conn = C_SYNC_TARGET;
- }
-
- return ns;
-}
-
-/* helper for __drbd_set_state */
-static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
-{
- if (mdev->agreed_pro_version < 90)
- mdev->ov_start_sector = 0;
- mdev->rs_total = drbd_bm_bits(mdev);
- mdev->ov_position = 0;
- if (cs == C_VERIFY_T) {
- /* starting online verify from an arbitrary position
- * does not fit well into the existing protocol.
- * on C_VERIFY_T, we initialize ov_left and friends
- * implicitly in receive_DataRequest once the
- * first P_OV_REQUEST is received */
- mdev->ov_start_sector = ~(sector_t)0;
- } else {
- unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
- if (bit >= mdev->rs_total) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(mdev->rs_total - 1);
- mdev->rs_total = 1;
- } else
- mdev->rs_total -= bit;
- mdev->ov_position = mdev->ov_start_sector;
- }
- mdev->ov_left = mdev->rs_total;
-}
-
-static void drbd_resume_al(struct drbd_conf *mdev)
-{
- if (drbd_test_and_clear_flag(mdev, AL_SUSPENDED))
- dev_info(DEV, "Resumed AL updates\n");
-}
-
-/**
- * __drbd_set_state() - Set a new DRBD state
- * @mdev: DRBD device.
- * @ns: new state.
- * @flags: Flags
- * @done: Optional completion, that will get completed after the after_state_ch() finished
- *
- * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
- */
-enum drbd_state_rv
-__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
- enum chg_state_flags flags, struct completion *done)
-{
- union drbd_state os;
- enum drbd_state_rv rv = SS_SUCCESS;
- enum sanitize_state_warnings ssw;
- struct after_state_chg_work *ascw;
-
- os = mdev->state;
-
- ns = sanitize_state(mdev, os, ns, &ssw);
-
- if (ns.i == os.i)
- return SS_NOTHING_TO_DO;
-
- if (!(flags & CS_HARD)) {
- /* pre-state-change checks ; only look at ns */
- /* See drbd_state_sw_errors in drbd_strings.c */
-
- rv = is_valid_state(mdev, ns);
- if (rv < SS_SUCCESS) {
- /* If the old state was illegal as well, then let
- this happen...*/
-
- if (is_valid_state(mdev, os) == rv)
- rv = is_valid_state_transition(mdev, ns, os);
- } else
- rv = is_valid_state_transition(mdev, ns, os);
- }
-
- if (rv < SS_SUCCESS) {
- if (flags & CS_VERBOSE)
- print_st_err(mdev, os, ns, rv);
- return rv;
- }
-
- print_sanitize_warnings(mdev, ssw);
-
- {
- char *pbp, pb[300];
- pbp = pb;
- *pbp = 0;
- if (ns.role != os.role)
- pbp += sprintf(pbp, "role( %s -> %s ) ",
- drbd_role_str(os.role),
- drbd_role_str(ns.role));
- if (ns.peer != os.peer)
- pbp += sprintf(pbp, "peer( %s -> %s ) ",
- drbd_role_str(os.peer),
- drbd_role_str(ns.peer));
- if (ns.conn != os.conn)
- pbp += sprintf(pbp, "conn( %s -> %s ) ",
- drbd_conn_str(os.conn),
- drbd_conn_str(ns.conn));
- if (ns.disk != os.disk)
- pbp += sprintf(pbp, "disk( %s -> %s ) ",
- drbd_disk_str(os.disk),
- drbd_disk_str(ns.disk));
- if (ns.pdsk != os.pdsk)
- pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
- drbd_disk_str(os.pdsk),
- drbd_disk_str(ns.pdsk));
- if (is_susp(ns) != is_susp(os))
- pbp += sprintf(pbp, "susp( %d -> %d ) ",
- is_susp(os),
- is_susp(ns));
- if (ns.aftr_isp != os.aftr_isp)
- pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
- os.aftr_isp,
- ns.aftr_isp);
- if (ns.peer_isp != os.peer_isp)
- pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
- os.peer_isp,
- ns.peer_isp);
- if (ns.user_isp != os.user_isp)
- pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
- os.user_isp,
- ns.user_isp);
- dev_info(DEV, "%s\n", pb);
- }
-
- /* solve the race between becoming unconfigured,
- * worker doing the cleanup, and
- * admin reconfiguring us:
- * on (re)configure, first set CONFIG_PENDING,
- * then wait for a potentially exiting worker,
- * start the worker, and schedule one no_op.
- * then proceed with configuration.
- */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY &&
- !drbd_test_and_set_flag(mdev, CONFIG_PENDING))
- drbd_set_flag(mdev, DEVICE_DYING);
-
- /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
- * on the ldev here, to be sure the transition -> D_DISKLESS resp.
- * drbd_ldev_destroy() won't happen before our corresponding
- * after_state_ch works run, where we put_ldev again. */
- if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
- (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
- atomic_inc(&mdev->local_cnt);
-
- mdev->state = ns;
-
- if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
- drbd_print_uuids(mdev, "attached to UUIDs");
-
- wake_up(&mdev->misc_wait);
- wake_up(&mdev->state_wait);
-
- /* Aborted verify run, or we reached the stop sector.
- * Log the last position, unless end-of-device. */
- if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
- ns.conn <= C_CONNECTED) {
- mdev->ov_start_sector =
- BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
- if (mdev->ov_left)
- dev_info(DEV, "Online Verify reached sector %llu\n",
- (unsigned long long)mdev->ov_start_sector);
- }
-
- if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
- dev_info(DEV, "Syncer continues.\n");
- mdev->rs_paused += (long)jiffies
- -(long)mdev->rs_mark_time[mdev->rs_last_mark];
- if (ns.conn == C_SYNC_TARGET)
- mod_timer(&mdev->resync_timer, jiffies);
- }
-
- if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
- (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
- dev_info(DEV, "Resync suspended\n");
- mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
- }
-
- if (os.conn == C_CONNECTED &&
- (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
- unsigned long now = jiffies;
- int i;
-
- set_ov_position(mdev, ns.conn);
- mdev->rs_start = now;
- mdev->rs_last_events = 0;
- mdev->rs_last_sect_ev = 0;
- mdev->ov_last_oos_size = 0;
- mdev->ov_last_oos_start = 0;
-
- for (i = 0; i < DRBD_SYNC_MARKS; i++) {
- mdev->rs_mark_left[i] = mdev->ov_left;
- mdev->rs_mark_time[i] = now;
- }
-
- drbd_rs_controller_reset(mdev);
-
- if (ns.conn == C_VERIFY_S) {
- dev_info(DEV, "Starting Online Verify from sector %llu\n",
- (unsigned long long)mdev->ov_position);
- mod_timer(&mdev->resync_timer, jiffies);
- }
- }
-
- if (get_ldev(mdev)) {
- u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
- MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
- MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
-
- if (drbd_test_flag(mdev, CRASHED_PRIMARY))
- mdf |= MDF_CRASHED_PRIMARY;
- if (mdev->state.role == R_PRIMARY ||
- (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
- mdf |= MDF_PRIMARY_IND;
- if (mdev->state.conn > C_WF_REPORT_PARAMS)
- mdf |= MDF_CONNECTED_IND;
- if (mdev->state.disk > D_INCONSISTENT)
- mdf |= MDF_CONSISTENT;
- if (mdev->state.disk > D_OUTDATED)
- mdf |= MDF_WAS_UP_TO_DATE;
- if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
- mdf |= MDF_PEER_OUT_DATED;
- if (mdf != mdev->ldev->md.flags) {
- mdev->ldev->md.flags = mdf;
- drbd_md_mark_dirty(mdev);
- }
- if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
- drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
- put_ldev(mdev);
- }
-
- /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
- if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
- os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
- drbd_set_flag(mdev, CONSIDER_RESYNC);
-
- /* Receiver should clean up itself */
- if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Now the receiver finished cleaning up itself, it should die */
- if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
- drbd_thread_stop_nowait(&mdev->receiver);
-
- /* Upon network failure, we need to restart the receiver. */
- if (os.conn > C_WF_CONNECTION &&
- ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
- drbd_thread_restart_nowait(&mdev->receiver);
-
- /* Resume AL writing if we get a connection */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- drbd_resume_al(mdev);
-
- /* remember last connect and attach times so request_timer_fn() won't
- * kill newly established sessions while we are still trying to thaw
- * previously frozen IO */
- if (os.conn != C_WF_REPORT_PARAMS && ns.conn == C_WF_REPORT_PARAMS)
- mdev->last_reconnect_jif = jiffies;
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- mdev->last_reattach_jif = jiffies;
-
- ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
- if (ascw) {
- ascw->os = os;
- ascw->ns = ns;
- ascw->flags = flags;
- ascw->w.cb = w_after_state_ch;
- ascw->done = done;
- drbd_queue_work(&mdev->data.work, &ascw->w);
- } else {
- dev_warn(DEV, "Could not kmalloc an ascw\n");
- }
-
- return rv;
-}
-
-static int w_after_state_ch(struct drbd_conf *mdev, struct drbd_work *w, int unused)
-{
- struct after_state_chg_work *ascw =
- container_of(w, struct after_state_chg_work, w);
- after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
- if (ascw->flags & CS_WAIT_COMPLETE) {
- D_ASSERT(ascw->done != NULL);
- complete(ascw->done);
- }
- kfree(ascw);
-
- return 1;
-}
-
-static void abw_start_sync(struct drbd_conf *mdev, int rv)
-{
- if (rv) {
- dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
- _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
- return;
- }
-
- switch (mdev->state.conn) {
- case C_STARTING_SYNC_T:
- _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
- break;
- case C_STARTING_SYNC_S:
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- break;
- }
-}
-
-int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
- int (*io_fn)(struct drbd_conf *),
- char *why, enum bm_flag flags)
-{
- int rv;
-
- D_ASSERT(current == mdev->worker.task);
-
- /* open coded non-blocking drbd_suspend_io(mdev); */
- drbd_set_flag(mdev, SUSPEND_IO);
-
- drbd_bm_lock(mdev, why, flags);
- rv = io_fn(mdev);
- drbd_bm_unlock(mdev);
-
- drbd_resume_io(mdev);
-
- return rv;
-}
-
-/**
- * after_state_ch() - Perform after state change actions that may sleep
- * @mdev: DRBD device.
- * @os: old state.
- * @ns: new state.
- * @flags: Flags
- */
-static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
- union drbd_state ns, enum chg_state_flags flags)
-{
- enum drbd_fencing_p fp;
- enum drbd_req_event what = nothing;
- union drbd_state nsm = (union drbd_state){ .i = -1 };
-
- if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
- drbd_clear_flag(mdev, CRASHED_PRIMARY);
- if (mdev->p_uuid)
- mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
- }
-
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- /* Inform userspace about the change... */
- drbd_bcast_state(mdev, ns);
-
- if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
- (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
- drbd_khelper(mdev, "pri-on-incon-degr");
-
- /* Here we have the actions that are performed after a
- state change. This function might sleep */
-
- if (os.disk <= D_NEGOTIATING && ns.disk > D_NEGOTIATING)
- mod_timer(&mdev->request_timer, jiffies + HZ);
-
- nsm.i = -1;
- if (ns.susp_nod) {
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
- what = resend;
-
- if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
- ns.disk > D_NEGOTIATING)
- what = restart_frozen_disk_io;
-
- if (what != nothing)
- nsm.susp_nod = 0;
- }
-
- if (ns.susp_fen) {
- /* case1: The outdate peer handler is successful: */
- if (os.pdsk > D_OUTDATED && ns.pdsk <= D_OUTDATED) {
- if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
- drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- }
- spin_lock_irq(&mdev->req_lock);
- _tl_clear(mdev);
- _drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
- /* case2: The connection was established again: */
- if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- what = resend;
- nsm.susp_fen = 0;
- }
- }
-
- if (what != nothing) {
- spin_lock_irq(&mdev->req_lock);
- _tl_restart(mdev, what);
- nsm.i &= mdev->state.i;
- _drbd_set_state(mdev, nsm, CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
- }
-
- /* Became sync source. With protocol >= 96, we still need to send out
- * the sync uuid now. Need to do that before any drbd_send_state, or
- * the other side may go "paused sync" before receiving the sync uuids,
- * which is unexpected. */
- if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
- (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
- mdev->agreed_pro_version >= 96 && get_ldev(mdev)) {
- drbd_gen_and_send_sync_uuid(mdev);
- put_ldev(mdev);
- }
-
- /* Do not change the order of the if above and the two below... */
- if (os.pdsk == D_DISKLESS && ns.pdsk > D_DISKLESS) { /* attach on the peer */
- /* we probably will start a resync soon.
- * make sure those things are properly reset. */
- mdev->rs_total = 0;
- mdev->rs_failed = 0;
- atomic_set(&mdev->rs_pending_cnt, 0);
- drbd_rs_cancel_all(mdev);
-
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
- /* No point in queuing send_bitmap if we don't have a connection
- * anymore, so check also the _current_ state, not only the new state
- * at the time this work was queued. */
- if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
- mdev->state.conn == C_WF_BITMAP_S)
- drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
- "send_bitmap (WFBitMapS)",
- BM_LOCKED_TEST_ALLOWED);
-
- /* Lost contact to peer's copy of the data */
- if ((os.pdsk >= D_INCONSISTENT &&
- os.pdsk != D_UNKNOWN &&
- os.pdsk != D_OUTDATED)
- && (ns.pdsk < D_INCONSISTENT ||
- ns.pdsk == D_UNKNOWN ||
- ns.pdsk == D_OUTDATED)) {
- if (get_ldev(mdev)) {
- if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- if (is_susp(mdev->state)) {
- drbd_set_flag(mdev, NEW_CUR_UUID);
- } else {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- }
- put_ldev(mdev);
- }
- }
-
- if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
- if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
- mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
- drbd_uuid_new_current(mdev);
- drbd_send_uuids(mdev);
- }
- /* D_DISKLESS Peer becomes secondary */
- if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
- /* We may still be Primary ourselves.
- * No harm done if the bitmap still changes,
- * redirtied pages will follow later. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote diskless peer", BM_LOCKED_SET_ALLOWED);
- put_ldev(mdev);
- }
-
- /* Write out all changed bits on demote.
- * Though, no need to da that just yet
- * if there is a resync going on still */
- if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
- mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
- /* No changes to the bitmap expected this time, so assert that,
- * even though no harm was done if it did change. */
- drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
- "demote", BM_LOCKED_TEST_ALLOWED);
- put_ldev(mdev);
- }
-
- /* Last part of the attaching process ... */
- if (ns.conn >= C_CONNECTED &&
- os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
- drbd_send_sizes(mdev, 0, 0); /* to start sync... */
- drbd_send_uuids(mdev);
- drbd_send_state(mdev, ns);
- }
-
- /* We want to pause/continue resync, tell peer. */
- if (ns.conn >= C_CONNECTED &&
- ((os.aftr_isp != ns.aftr_isp) ||
- (os.user_isp != ns.user_isp)))
- drbd_send_state(mdev, ns);
-
- /* In case one of the isp bits got set, suspend other devices. */
- if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
- (ns.aftr_isp || ns.peer_isp || ns.user_isp))
- suspend_other_sg(mdev);
-
- /* Make sure the peer gets informed about eventual state
- changes (ISP bits) while we were in WFReportParams. */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
- drbd_send_state(mdev, ns);
-
- /* We are in the progress to start a full sync... */
- if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
- (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
- /* no other bitmap changes expected during this phase */
- drbd_queue_bitmap_io(mdev,
- &drbd_bmio_set_n_write, &abw_start_sync,
- "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
-
- /* We are invalidating our self... */
- if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
- os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
- /* other bitmap operation expected during this phase */
- drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
- "set_n_write from invalidate", BM_LOCKED_MASK);
-
- /* first half of local IO error, failure to attach,
- * or administrative detach */
- if (os.disk != D_FAILED && ns.disk == D_FAILED) {
- /* corresponding get_ldev was in __drbd_set_state, to serialize
- * our cleanup here with the transition to D_DISKLESS.
- * But it is still not safe to dreference ldev here, we may end
- * up here from a failed attach, before ldev was even set. */
- if (mdev->ldev) {
- enum drbd_io_error_p eh = mdev->ldev->dc.on_io_error;
-
- /* In some setups, this handler triggers a suicide,
- * basically mapping IO error to node failure, to
- * reduce the number of different failure scenarios.
- *
- * This handler intentionally runs before we abort IO,
- * notify the peer, or try to update our meta data. */
- if (eh == EP_CALL_HELPER && drbd_test_flag(mdev, WAS_IO_ERROR))
- drbd_khelper(mdev, "local-io-error");
-
- /* Immediately allow completion of all application IO,
- * that waits for completion from the local disk,
- * if this was a force-detach due to disk_timeout
- * or administrator request (drbdsetup detach --force).
- * Do NOT abort otherwise.
- * Aborting local requests may cause serious problems,
- * if requests are completed to upper layers already,
- * and then later the already submitted local bio completes.
- * This can cause DMA into former bio pages that meanwhile
- * have been re-used for other things.
- * So aborting local requests may cause crashes,
- * or even worse, silent data corruption.
- */
- if (drbd_test_flag(mdev, FORCE_DETACH))
- tl_abort_disk_io(mdev);
-
- /* current state still has to be D_FAILED,
- * there is only one way out: to D_DISKLESS,
- * and that may only happen after our put_ldev below. */
- if (mdev->state.disk != D_FAILED)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s during detach\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- drbd_rs_cancel_all(mdev);
-
- /* In case we want to get something to stable storage still,
- * this may be the last chance.
- * Following put_ldev may transition to D_DISKLESS. */
- drbd_md_sync(mdev);
- }
- put_ldev(mdev);
- }
-
- /* second half of local IO error, failure to attach,
- * or administrative detach,
- * after local_cnt references have reached zero again */
- if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
- /* We must still be diskless,
- * re-attach has to be serialized with this! */
- if (mdev->state.disk != D_DISKLESS)
- dev_err(DEV,
- "ASSERT FAILED: disk is %s while going diskless\n",
- drbd_disk_str(mdev->state.disk));
-
- if (ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* corresponding get_ldev in __drbd_set_state
- * this may finally trigger drbd_ldev_destroy. */
- put_ldev(mdev);
- }
-
- /* Notify peer that I had a local IO error, and did not detached.. */
- if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* Disks got bigger while they were detached */
- if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
- drbd_test_and_clear_flag(mdev, RESYNC_AFTER_NEG)) {
- if (ns.conn == C_CONNECTED)
- resync_after_online_grow(mdev);
- }
-
- /* A resync finished or aborted, wake paused devices... */
- if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
- (os.peer_isp && !ns.peer_isp) ||
- (os.user_isp && !ns.user_isp))
- resume_next_sg(mdev);
-
- /* sync target done with resync. Explicitly notify peer, even though
- * it should (at least for non-empty resyncs) already know itself. */
- if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
- drbd_send_state(mdev, ns);
-
- /* Verify finished, or reached stop sector. Peer did not know about
- * the stop sector, and we may even have changed the stop sector during
- * verify to interrupt/stop early. Send the new state. */
- if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
- && mdev->agreed_pro_version >= 97)
- drbd_send_state(mdev, ns);
-
- /* Wake up role changes, that were delayed because of connection establishing */
- if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS) {
- drbd_clear_flag(mdev, STATE_SENT);
- wake_up(&mdev->state_wait);
- }
-
- /* This triggers bitmap writeout of potentially still unwritten pages
- * if the resync finished cleanly, or aborted because of peer disk
- * failure, or because of connection loss.
- * For resync aborted because of local disk failure, we cannot do
- * any bitmap writeout anymore.
- * No harm done if some bits change during this phase.
- */
- if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
- drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
- "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
- put_ldev(mdev);
- }
-
- /* free tl_hash if we Got thawed and are C_STANDALONE */
- if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
- drbd_free_tl_hash(mdev);
-
- /* Upon network connection, we need to start the receiver */
- if (os.conn == C_STANDALONE && ns.conn == C_UNCONNECTED)
- drbd_thread_start(&mdev->receiver);
-
- /* Terminate worker thread if we are unconfigured - it will be
- restarted as needed... */
- if (ns.disk == D_DISKLESS &&
- ns.conn == C_STANDALONE &&
- ns.role == R_SECONDARY) {
- if (os.aftr_isp != ns.aftr_isp)
- resume_next_sg(mdev);
- /* set in __drbd_set_state, unless CONFIG_PENDING was set */
- if (drbd_test_flag(mdev, DEVICE_DYING))
- drbd_thread_stop_nowait(&mdev->worker);
- }
-
- drbd_md_sync(mdev);
-}
-
-
static int drbd_thread_setup(void *arg)
{
struct drbd_thread *thi = (struct drbd_thread *) arg;
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_tconn *tconn = thi->tconn;
unsigned long flags;
int retval;
+ snprintf(current->comm, sizeof(current->comm), "drbd_%c_%s",
+ thi->name[0], thi->tconn->name);
+
restart:
retval = thi->function(thi);
spin_lock_irqsave(&thi->t_lock, flags);
- /* if the receiver has been "Exiting", the last thing it did
+ /* if the receiver has been "EXITING", the last thing it did
* was set the conn state to "StandAlone",
* if now a re-connect request comes in, conn state goes C_UNCONNECTED,
* and receiver thread will be "started".
- * drbd_thread_start needs to set "Restarting" in that case.
+ * drbd_thread_start needs to set "RESTARTING" in that case.
* t_state check and assignment needs to be within the same spinlock,
- * so either thread_start sees Exiting, and can remap to Restarting,
- * or thread_start see None, and can proceed as normal.
+ * so either thread_start sees EXITING, and can remap to RESTARTING,
+ * or thread_start see NONE, and can proceed as normal.
*/
- if (thi->t_state == Restarting) {
- dev_info(DEV, "Restarting %s\n", current->comm);
- thi->t_state = Running;
+ if (thi->t_state == RESTARTING) {
+ conn_info(tconn, "Restarting %s thread\n", thi->name);
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
goto restart;
}
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
smp_mb();
- complete(&thi->stop);
+ complete_all(&thi->stop);
spin_unlock_irqrestore(&thi->t_lock, flags);
- dev_info(DEV, "Terminating %s\n", current->comm);
+ conn_info(tconn, "Terminating %s\n", current->comm);
/* Release mod reference taken when thread was started */
+
+ kref_put(&tconn->kref, &conn_destroy);
module_put(THIS_MODULE);
return retval;
}
-static void drbd_thread_init(struct drbd_conf *mdev, struct drbd_thread *thi,
- int (*func) (struct drbd_thread *))
+static void drbd_thread_init(struct drbd_tconn *tconn, struct drbd_thread *thi,
+ int (*func) (struct drbd_thread *), char *name)
{
spin_lock_init(&thi->t_lock);
thi->task = NULL;
- thi->t_state = None;
+ thi->t_state = NONE;
thi->function = func;
- thi->mdev = mdev;
+ thi->tconn = tconn;
+ strncpy(thi->name, name, ARRAY_SIZE(thi->name));
}
int drbd_thread_start(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_tconn *tconn = thi->tconn;
struct task_struct *nt;
unsigned long flags;
- const char *me =
- thi == &mdev->receiver ? "receiver" :
- thi == &mdev->asender ? "asender" :
- thi == &mdev->worker ? "worker" : "NONSENSE";
-
/* is used from state engine doing drbd_thread_stop_nowait,
* while holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
switch (thi->t_state) {
- case None:
- dev_info(DEV, "Starting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case NONE:
+ conn_info(tconn, "Starting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* Get ref on module for thread - this is released when thread exits */
if (!try_module_get(THIS_MODULE)) {
- dev_err(DEV, "Failed to get module reference in drbd_thread_start\n");
+ conn_err(tconn, "Failed to get module reference in drbd_thread_start\n");
spin_unlock_irqrestore(&thi->t_lock, flags);
return false;
}
+ kref_get(&thi->tconn->kref);
+
init_completion(&thi->stop);
- D_ASSERT(thi->task == NULL);
thi->reset_cpu_mask = 1;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
flush_signals(current); /* otherw. may get -ERESTARTNOINTR */
nt = kthread_create(drbd_thread_setup, (void *) thi,
- "drbd%d_%s", mdev_to_minor(mdev), me);
+ "drbd_%c_%s", thi->name[0], thi->tconn->name);
if (IS_ERR(nt)) {
- dev_err(DEV, "Couldn't start thread\n");
+ conn_err(tconn, "Couldn't start thread\n");
+ kref_put(&tconn->kref, &conn_destroy);
module_put(THIS_MODULE);
return false;
}
spin_lock_irqsave(&thi->t_lock, flags);
thi->task = nt;
- thi->t_state = Running;
+ thi->t_state = RUNNING;
spin_unlock_irqrestore(&thi->t_lock, flags);
wake_up_process(nt);
break;
- case Exiting:
- thi->t_state = Restarting;
- dev_info(DEV, "Restarting %s thread (from %s [%d])\n",
- me, current->comm, current->pid);
+ case EXITING:
+ thi->t_state = RESTARTING;
+ conn_info(tconn, "Restarting %s thread (from %s [%d])\n",
+ thi->name, current->comm, current->pid);
/* fall through */
- case Running:
- case Restarting:
+ case RUNNING:
+ case RESTARTING:
default:
spin_unlock_irqrestore(&thi->t_lock, flags);
break;
@@ -1882,12 +447,12 @@
{
unsigned long flags;
- enum drbd_thread_state ns = restart ? Restarting : Exiting;
+ enum drbd_thread_state ns = restart ? RESTARTING : EXITING;
/* may be called from state engine, holding the req lock irqsave */
spin_lock_irqsave(&thi->t_lock, flags);
- if (thi->t_state == None) {
+ if (thi->t_state == NONE) {
spin_unlock_irqrestore(&thi->t_lock, flags);
if (restart)
drbd_thread_start(thi);
@@ -1905,7 +470,6 @@
init_completion(&thi->stop);
if (thi->task != current)
force_sig(DRBD_SIGKILL, thi->task);
-
}
spin_unlock_irqrestore(&thi->t_lock, flags);
@@ -1914,6 +478,35 @@
wait_for_completion(&thi->stop);
}
+static struct drbd_thread *drbd_task_to_thread(struct drbd_tconn *tconn, struct task_struct *task)
+{
+ struct drbd_thread *thi =
+ task == tconn->receiver.task ? &tconn->receiver :
+ task == tconn->asender.task ? &tconn->asender :
+ task == tconn->worker.task ? &tconn->worker : NULL;
+
+ return thi;
+}
+
+char *drbd_task_to_thread_name(struct drbd_tconn *tconn, struct task_struct *task)
+{
+ struct drbd_thread *thi = drbd_task_to_thread(tconn, task);
+ return thi ? thi->name : task->comm;
+}
+
+int conn_lowest_minor(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr = 0, m;
+
+ rcu_read_lock();
+ mdev = idr_get_next(&tconn->volumes, &vnr);
+ m = mdev ? mdev_to_minor(mdev) : -1;
+ rcu_read_unlock();
+
+ return m;
+}
+
#ifdef CONFIG_SMP
/**
* drbd_calc_cpu_mask() - Generate CPU masks, spread over all CPUs
@@ -1922,240 +515,345 @@
* Forces all threads of a device onto the same CPU. This is beneficial for
* DRBD's performance. May be overwritten by user's configuration.
*/
-void drbd_calc_cpu_mask(struct drbd_conf *mdev)
+void drbd_calc_cpu_mask(struct drbd_tconn *tconn)
{
int ord, cpu;
/* user override. */
- if (cpumask_weight(mdev->cpu_mask))
+ if (cpumask_weight(tconn->cpu_mask))
return;
- ord = mdev_to_minor(mdev) % cpumask_weight(cpu_online_mask);
+ ord = conn_lowest_minor(tconn) % cpumask_weight(cpu_online_mask);
for_each_online_cpu(cpu) {
if (ord-- == 0) {
- cpumask_set_cpu(cpu, mdev->cpu_mask);
+ cpumask_set_cpu(cpu, tconn->cpu_mask);
return;
}
}
/* should not be reached */
- cpumask_setall(mdev->cpu_mask);
+ cpumask_setall(tconn->cpu_mask);
}
/**
* drbd_thread_current_set_cpu() - modifies the cpu mask of the _current_ thread
* @mdev: DRBD device.
+ * @thi: drbd_thread object
*
* call in the "main loop" of _all_ threads, no need for any mutex, current won't die
* prematurely.
*/
-void drbd_thread_current_set_cpu(struct drbd_conf *mdev)
+void drbd_thread_current_set_cpu(struct drbd_thread *thi)
{
struct task_struct *p = current;
- struct drbd_thread *thi =
- p == mdev->asender.task ? &mdev->asender :
- p == mdev->receiver.task ? &mdev->receiver :
- p == mdev->worker.task ? &mdev->worker :
- NULL;
- ERR_IF(thi == NULL)
- return;
+
if (!thi->reset_cpu_mask)
return;
thi->reset_cpu_mask = 0;
- set_cpus_allowed_ptr(p, mdev->cpu_mask);
+ set_cpus_allowed_ptr(p, thi->tconn->cpu_mask);
}
#endif
-/* the appropriate socket mutex must be held already */
-int _drbd_send_cmd(struct drbd_conf *mdev, struct socket *sock,
- enum drbd_packets cmd, struct p_header80 *h,
- size_t size, unsigned msg_flags)
-{
- int sent, ok;
-
- ERR_IF(!h) return false;
- ERR_IF(!size) return false;
-
- h->magic = BE_DRBD_MAGIC;
- h->command = cpu_to_be16(cmd);
- h->length = cpu_to_be16(size-sizeof(struct p_header80));
-
- sent = drbd_send(mdev, sock, h, size, msg_flags);
-
- ok = (sent == size);
- if (!ok && !signal_pending(current))
- dev_warn(DEV, "short sent %s size=%d sent=%d\n",
- cmdname(cmd), (int)size, sent);
- return ok;
-}
-
-/* don't pass the socket. we may only look at it
- * when we hold the appropriate socket mutex.
+/**
+ * drbd_header_size - size of a packet header
+ *
+ * The header size is a multiple of 8, so any payload following the header is
+ * word aligned on 64-bit architectures. (The bitmap send and receive code
+ * relies on this.)
*/
-int drbd_send_cmd(struct drbd_conf *mdev, int use_data_socket,
- enum drbd_packets cmd, struct p_header80 *h, size_t size)
+unsigned int drbd_header_size(struct drbd_tconn *tconn)
{
- int ok = 0;
- struct socket *sock;
-
- if (use_data_socket) {
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
+ if (tconn->agreed_pro_version >= 100) {
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header100), 8));
+ return sizeof(struct p_header100);
} else {
- mutex_lock(&mdev->meta.mutex);
- sock = mdev->meta.socket;
+ BUILD_BUG_ON(sizeof(struct p_header80) !=
+ sizeof(struct p_header95));
+ BUILD_BUG_ON(!IS_ALIGNED(sizeof(struct p_header80), 8));
+ return sizeof(struct p_header80);
}
+}
- /* drbd_disconnect() could have called drbd_free_sock()
- * while we were waiting in down()... */
- if (likely(sock != NULL))
- ok = _drbd_send_cmd(mdev, sock, cmd, h, size, 0);
+static unsigned int prepare_header80(struct p_header80 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be16(size);
+ return sizeof(struct p_header80);
+}
- if (use_data_socket)
- mutex_unlock(&mdev->data.mutex);
+static unsigned int prepare_header95(struct p_header95 *h, enum drbd_packet cmd, int size)
+{
+ h->magic = cpu_to_be16(DRBD_MAGIC_BIG);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ return sizeof(struct p_header95);
+}
+
+static unsigned int prepare_header100(struct p_header100 *h, enum drbd_packet cmd,
+ int size, int vnr)
+{
+ h->magic = cpu_to_be32(DRBD_MAGIC_100);
+ h->volume = cpu_to_be16(vnr);
+ h->command = cpu_to_be16(cmd);
+ h->length = cpu_to_be32(size);
+ h->pad = 0;
+ return sizeof(struct p_header100);
+}
+
+static unsigned int prepare_header(struct drbd_tconn *tconn, int vnr,
+ void *buffer, enum drbd_packet cmd, int size)
+{
+ if (tconn->agreed_pro_version >= 100)
+ return prepare_header100(buffer, cmd, size, vnr);
+ else if (tconn->agreed_pro_version >= 95 &&
+ size > DRBD_MAX_SIZE_H80_PACKET)
+ return prepare_header95(buffer, cmd, size);
else
- mutex_unlock(&mdev->meta.mutex);
- return ok;
+ return prepare_header80(buffer, cmd, size);
}
-int drbd_send_cmd2(struct drbd_conf *mdev, enum drbd_packets cmd, char *data,
- size_t size)
+static void *__conn_prepare_command(struct drbd_tconn *tconn,
+ struct drbd_socket *sock)
{
- struct p_header80 h;
- int ok;
-
- h.magic = BE_DRBD_MAGIC;
- h.command = cpu_to_be16(cmd);
- h.length = cpu_to_be16(size);
-
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- ok = (sizeof(h) ==
- drbd_send(mdev, mdev->data.socket, &h, sizeof(h), 0));
- ok = ok && (size ==
- drbd_send(mdev, mdev->data.socket, data, size, 0));
-
- drbd_put_data_sock(mdev);
-
- return ok;
+ if (!sock->socket)
+ return NULL;
+ return sock->sbuf + drbd_header_size(tconn);
}
-int drbd_send_sync_param(struct drbd_conf *mdev, struct syncer_conf *sc)
+void *conn_prepare_command(struct drbd_tconn *tconn, struct drbd_socket *sock)
{
+ void *p;
+
+ mutex_lock(&sock->mutex);
+ p = __conn_prepare_command(tconn, sock);
+ if (!p)
+ mutex_unlock(&sock->mutex);
+
+ return p;
+}
+
+void *drbd_prepare_command(struct drbd_conf *mdev, struct drbd_socket *sock)
+{
+ return conn_prepare_command(mdev->tconn, sock);
+}
+
+static int __send_command(struct drbd_tconn *tconn, int vnr,
+ struct drbd_socket *sock, enum drbd_packet cmd,
+ unsigned int header_size, void *data,
+ unsigned int size)
+{
+ int msg_flags;
+ int err;
+
+ /*
+ * Called with @data == NULL and the size of the data blocks in @size
+ * for commands that send data blocks. For those commands, omit the
+ * MSG_MORE flag: this will increase the likelihood that data blocks
+ * which are page aligned on the sender will end up page aligned on the
+ * receiver.
+ */
+ msg_flags = data ? MSG_MORE : 0;
+
+ header_size += prepare_header(tconn, vnr, sock->sbuf, cmd,
+ header_size + size);
+ err = drbd_send_all(tconn, sock->socket, sock->sbuf, header_size,
+ msg_flags);
+ if (data && !err)
+ err = drbd_send_all(tconn, sock->socket, data, size, 0);
+ return err;
+}
+
+static int __conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ return __send_command(tconn, 0, sock, cmd, header_size, data, size);
+}
+
+int conn_send_command(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __conn_send_command(tconn, sock, cmd, header_size, data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_command(struct drbd_conf *mdev, struct drbd_socket *sock,
+ enum drbd_packet cmd, unsigned int header_size,
+ void *data, unsigned int size)
+{
+ int err;
+
+ err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, header_size,
+ data, size);
+ mutex_unlock(&sock->mutex);
+ return err;
+}
+
+int drbd_send_ping(struct drbd_tconn *tconn)
+{
+ struct drbd_socket *sock;
+
+ sock = &tconn->meta;
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, P_PING, 0, NULL, 0);
+}
+
+int drbd_send_ping_ack(struct drbd_tconn *tconn)
+{
+ struct drbd_socket *sock;
+
+ sock = &tconn->meta;
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, P_PING_ACK, 0, NULL, 0);
+}
+
+int drbd_send_sync_param(struct drbd_conf *mdev)
+{
+ struct drbd_socket *sock;
struct p_rs_param_95 *p;
- struct socket *sock;
- int size, rv;
- const int apv = mdev->agreed_pro_version;
+ int size;
+ const int apv = mdev->tconn->agreed_pro_version;
+ enum drbd_packet cmd;
+ struct net_conf *nc;
+ struct disk_conf *dc;
+
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
size = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
- + strlen(mdev->sync_conf.verify_alg) + 1
+ + strlen(nc->verify_alg) + 1
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- /* used from admin command context and receiver/worker context.
- * to avoid kmalloc, grab the socket right here,
- * then use the pre-allocated sbuf there */
- mutex_lock(&mdev->data.mutex);
- sock = mdev->data.socket;
+ cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
- if (likely(sock != NULL)) {
- enum drbd_packets cmd = apv >= 89 ? P_SYNC_PARAM89 : P_SYNC_PARAM;
+ /* initialize verify_alg and csums_alg */
+ memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- p = &mdev->data.sbuf.rs_param_95;
+ if (get_ldev(mdev)) {
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+ p->resync_rate = cpu_to_be32(dc->resync_rate);
+ p->c_plan_ahead = cpu_to_be32(dc->c_plan_ahead);
+ p->c_delay_target = cpu_to_be32(dc->c_delay_target);
+ p->c_fill_target = cpu_to_be32(dc->c_fill_target);
+ p->c_max_rate = cpu_to_be32(dc->c_max_rate);
+ put_ldev(mdev);
+ } else {
+ p->resync_rate = cpu_to_be32(DRBD_RESYNC_RATE_DEF);
+ p->c_plan_ahead = cpu_to_be32(DRBD_C_PLAN_AHEAD_DEF);
+ p->c_delay_target = cpu_to_be32(DRBD_C_DELAY_TARGET_DEF);
+ p->c_fill_target = cpu_to_be32(DRBD_C_FILL_TARGET_DEF);
+ p->c_max_rate = cpu_to_be32(DRBD_C_MAX_RATE_DEF);
+ }
- /* initialize verify_alg and csums_alg */
- memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
+ if (apv >= 88)
+ strcpy(p->verify_alg, nc->verify_alg);
+ if (apv >= 89)
+ strcpy(p->csums_alg, nc->csums_alg);
+ rcu_read_unlock();
- p->rate = cpu_to_be32(sc->rate);
- p->c_plan_ahead = cpu_to_be32(sc->c_plan_ahead);
- p->c_delay_target = cpu_to_be32(sc->c_delay_target);
- p->c_fill_target = cpu_to_be32(sc->c_fill_target);
- p->c_max_rate = cpu_to_be32(sc->c_max_rate);
-
- if (apv >= 88)
- strcpy(p->verify_alg, mdev->sync_conf.verify_alg);
- if (apv >= 89)
- strcpy(p->csums_alg, mdev->sync_conf.csums_alg);
-
- rv = _drbd_send_cmd(mdev, sock, cmd, &p->head, size, 0);
- } else
- rv = 0; /* not ok */
-
- mutex_unlock(&mdev->data.mutex);
-
- return rv;
+ return drbd_send_command(mdev, sock, cmd, size, NULL, 0);
}
-int drbd_send_protocol(struct drbd_conf *mdev)
+int __drbd_send_protocol(struct drbd_tconn *tconn, enum drbd_packet cmd)
{
+ struct drbd_socket *sock;
struct p_protocol *p;
- int size, cf, rv;
+ struct net_conf *nc;
+ int size, cf;
- size = sizeof(struct p_protocol);
+ sock = &tconn->data;
+ p = __conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
- if (mdev->agreed_pro_version >= 87)
- size += strlen(mdev->net_conf->integrity_alg) + 1;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
- /* we must not recurse into our own queue,
- * as that is blocked during handshake */
- p = kmalloc(size, GFP_NOIO);
- if (p == NULL)
- return 0;
-
- p->protocol = cpu_to_be32(mdev->net_conf->wire_protocol);
- p->after_sb_0p = cpu_to_be32(mdev->net_conf->after_sb_0p);
- p->after_sb_1p = cpu_to_be32(mdev->net_conf->after_sb_1p);
- p->after_sb_2p = cpu_to_be32(mdev->net_conf->after_sb_2p);
- p->two_primaries = cpu_to_be32(mdev->net_conf->two_primaries);
-
- cf = 0;
- if (mdev->net_conf->want_lose)
- cf |= CF_WANT_LOSE;
- if (mdev->net_conf->dry_run) {
- if (mdev->agreed_pro_version >= 92)
- cf |= CF_DRY_RUN;
- else {
- dev_err(DEV, "--dry-run is not supported by peer");
- kfree(p);
- return -1;
- }
+ if (nc->tentative && tconn->agreed_pro_version < 92) {
+ rcu_read_unlock();
+ mutex_unlock(&sock->mutex);
+ conn_err(tconn, "--dry-run is not supported by peer");
+ return -EOPNOTSUPP;
}
+
+ size = sizeof(*p);
+ if (tconn->agreed_pro_version >= 87)
+ size += strlen(nc->integrity_alg) + 1;
+
+ p->protocol = cpu_to_be32(nc->wire_protocol);
+ p->after_sb_0p = cpu_to_be32(nc->after_sb_0p);
+ p->after_sb_1p = cpu_to_be32(nc->after_sb_1p);
+ p->after_sb_2p = cpu_to_be32(nc->after_sb_2p);
+ p->two_primaries = cpu_to_be32(nc->two_primaries);
+ cf = 0;
+ if (nc->discard_my_data)
+ cf |= CF_DISCARD_MY_DATA;
+ if (nc->tentative)
+ cf |= CF_DRY_RUN;
p->conn_flags = cpu_to_be32(cf);
- if (mdev->agreed_pro_version >= 87)
- strcpy(p->integrity_alg, mdev->net_conf->integrity_alg);
+ if (tconn->agreed_pro_version >= 87)
+ strcpy(p->integrity_alg, nc->integrity_alg);
+ rcu_read_unlock();
- rv = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_PROTOCOL,
- (struct p_header80 *)p, size);
- kfree(p);
- return rv;
+ return __conn_send_command(tconn, sock, cmd, size, NULL, 0);
+}
+
+int drbd_send_protocol(struct drbd_tconn *tconn)
+{
+ int err;
+
+ mutex_lock(&tconn->data.mutex);
+ err = __drbd_send_protocol(tconn, P_PROTOCOL);
+ mutex_unlock(&tconn->data.mutex);
+
+ return err;
}
int _drbd_send_uuids(struct drbd_conf *mdev, u64 uuid_flags)
{
- struct p_uuids p;
+ struct drbd_socket *sock;
+ struct p_uuids *p;
int i;
if (!get_ldev_if_state(mdev, D_NEGOTIATING))
- return 1;
+ return 0;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p) {
+ put_ldev(mdev);
+ return -EIO;
+ }
spin_lock_irq(&mdev->ldev->md.uuid_lock);
for (i = UI_CURRENT; i < UI_SIZE; i++)
- p.uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
+ p->uuid[i] = mdev->ldev ? cpu_to_be64(mdev->ldev->md.uuid[i]) : 0;
spin_unlock_irq(&mdev->ldev->md.uuid_lock);
mdev->comm_bm_set = drbd_bm_total_weight(mdev);
- p.uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
- uuid_flags |= mdev->net_conf->want_lose ? 1 : 0;
- uuid_flags |= drbd_test_flag(mdev, CRASHED_PRIMARY) ? 2 : 0;
+ p->uuid[UI_SIZE] = cpu_to_be64(mdev->comm_bm_set);
+ rcu_read_lock();
+ uuid_flags |= rcu_dereference(mdev->tconn->net_conf)->discard_my_data ? 1 : 0;
+ rcu_read_unlock();
+ uuid_flags |= test_bit(CRASHED_PRIMARY, &mdev->flags) ? 2 : 0;
uuid_flags |= mdev->new_state_tmp.disk == D_INCONSISTENT ? 4 : 0;
- p.uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
+ p->uuid[UI_FLAGS] = cpu_to_be64(uuid_flags);
put_ldev(mdev);
-
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_UUIDS,
- (struct p_header80 *)&p, sizeof(p));
+ return drbd_send_command(mdev, sock, P_UUIDS, sizeof(*p), NULL, 0);
}
int drbd_send_uuids(struct drbd_conf *mdev)
@@ -2186,9 +884,10 @@
}
}
-int drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
+void drbd_gen_and_send_sync_uuid(struct drbd_conf *mdev)
{
- struct p_rs_uuid p;
+ struct drbd_socket *sock;
+ struct p_rs_uuid *p;
u64 uuid;
D_ASSERT(mdev->state.disk == D_UP_TO_DATE);
@@ -2201,24 +900,29 @@
drbd_uuid_set(mdev, UI_BITMAP, uuid);
drbd_print_uuids(mdev, "updated sync UUID");
drbd_md_sync(mdev);
- p.uuid = cpu_to_be64(uuid);
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SYNC_UUID,
- (struct p_header80 *)&p, sizeof(p));
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (p) {
+ p->uuid = cpu_to_be64(uuid);
+ drbd_send_command(mdev, sock, P_SYNC_UUID, sizeof(*p), NULL, 0);
+ }
}
int drbd_send_sizes(struct drbd_conf *mdev, int trigger_reply, enum dds_flags flags)
{
- struct p_sizes p;
+ struct drbd_socket *sock;
+ struct p_sizes *p;
sector_t d_size, u_size;
int q_order_type;
unsigned int max_bio_size;
- int ok;
if (get_ldev_if_state(mdev, D_NEGOTIATING)) {
D_ASSERT(mdev->ldev->backing_bdev);
d_size = drbd_get_max_capacity(mdev->ldev);
- u_size = mdev->ldev->dc.disk_size;
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
q_order_type = drbd_queue_order_type(mdev);
max_bio_size = queue_max_hw_sectors(mdev->ldev->backing_bdev->bd_disk->queue) << 9;
max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE);
@@ -2230,20 +934,23 @@
max_bio_size = DRBD_MAX_BIO_SIZE; /* ... multiple BIOs per peer_request */
}
- /* Never allow old drbd (up to 8.3.7) to see more than 32KiB */
- if (mdev->agreed_pro_version <= 94)
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+
+ if (mdev->tconn->agreed_pro_version <= 94)
max_bio_size = min(max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ else if (mdev->tconn->agreed_pro_version < 100)
+ max_bio_size = min(max_bio_size, DRBD_MAX_BIO_SIZE_P95);
- p.d_size = cpu_to_be64(d_size);
- p.u_size = cpu_to_be64(u_size);
- p.c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
- p.max_bio_size = cpu_to_be32(max_bio_size);
- p.queue_order_type = cpu_to_be16(q_order_type);
- p.dds_flags = cpu_to_be16(flags);
-
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_SIZES,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ p->d_size = cpu_to_be64(d_size);
+ p->u_size = cpu_to_be64(u_size);
+ p->c_size = cpu_to_be64(trigger_reply ? 0 : drbd_get_capacity(mdev->this_bdev));
+ p->max_bio_size = cpu_to_be32(max_bio_size);
+ p->queue_order_type = cpu_to_be16(q_order_type);
+ p->dds_flags = cpu_to_be16(flags);
+ return drbd_send_command(mdev, sock, P_SIZES, sizeof(*p), NULL, 0);
}
/**
@@ -2252,34 +959,21 @@
*/
int drbd_send_current_state(struct drbd_conf *mdev)
{
- struct socket *sock;
- struct p_state p;
- int ok = 0;
+ struct drbd_socket *sock;
+ struct p_state *p;
- /* Grab state lock so we wont send state if we're in the middle
- * of a cluster wide state change on another thread */
- drbd_state_lock(mdev);
-
- mutex_lock(&mdev->data.mutex);
-
- p.state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
- sock = mdev->data.socket;
-
- if (likely(sock != NULL)) {
- ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header80 *)&p, sizeof(p), 0);
- }
-
- mutex_unlock(&mdev->data.mutex);
-
- drbd_state_unlock(mdev);
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(mdev->state.i); /* Within the send mutex */
+ return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
}
/**
* drbd_send_state() - After a state change, sends the new state to the peer
- * @mdev: DRBD device.
- * @state: the state to send, not necessarily the current state.
+ * @mdev: DRBD device.
+ * @state: the state to send, not necessarily the current state.
*
* Each state change queues an "after_state_ch" work, which will eventually
* send the resulting new state to the peer. If more state changes happen
@@ -2288,50 +982,95 @@
*/
int drbd_send_state(struct drbd_conf *mdev, union drbd_state state)
{
- struct socket *sock;
- struct p_state p;
- int ok = 0;
+ struct drbd_socket *sock;
+ struct p_state *p;
- mutex_lock(&mdev->data.mutex);
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->state = cpu_to_be32(state.i); /* Within the send mutex */
+ return drbd_send_command(mdev, sock, P_STATE, sizeof(*p), NULL, 0);
+}
- p.state = cpu_to_be32(state.i);
- sock = mdev->data.socket;
+int drbd_send_state_req(struct drbd_conf *mdev, union drbd_state mask, union drbd_state val)
+{
+ struct drbd_socket *sock;
+ struct p_req_state *p;
- if (likely(sock != NULL)) {
- ok = _drbd_send_cmd(mdev, sock, P_STATE,
- (struct p_header80 *)&p, sizeof(p), 0);
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return drbd_send_command(mdev, sock, P_STATE_CHG_REQ, sizeof(*p), NULL, 0);
+}
+
+int conn_send_state_req(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_packet cmd;
+ struct drbd_socket *sock;
+ struct p_req_state *p;
+
+ cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REQ : P_CONN_ST_CHG_REQ;
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
+ p->mask = cpu_to_be32(mask.i);
+ p->val = cpu_to_be32(val.i);
+ return conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
+}
+
+void drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
+{
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+
+ sock = &mdev->tconn->meta;
+ p = drbd_prepare_command(mdev, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ drbd_send_command(mdev, sock, P_STATE_CHG_REPLY, sizeof(*p), NULL, 0);
}
-
- mutex_unlock(&mdev->data.mutex);
-
- return ok;
}
-int drbd_send_state_req(struct drbd_conf *mdev,
- union drbd_state mask, union drbd_state val)
+void conn_send_sr_reply(struct drbd_tconn *tconn, enum drbd_state_rv retcode)
{
- struct p_req_state p;
+ struct drbd_socket *sock;
+ struct p_req_state_reply *p;
+ enum drbd_packet cmd = tconn->agreed_pro_version < 100 ? P_STATE_CHG_REPLY : P_CONN_ST_CHG_REPLY;
- p.mask = cpu_to_be32(mask.i);
- p.val = cpu_to_be32(val.i);
-
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_STATE_CHG_REQ,
- (struct p_header80 *)&p, sizeof(p));
+ sock = &tconn->meta;
+ p = conn_prepare_command(tconn, sock);
+ if (p) {
+ p->retcode = cpu_to_be32(retcode);
+ conn_send_command(tconn, sock, cmd, sizeof(*p), NULL, 0);
+ }
}
-int drbd_send_sr_reply(struct drbd_conf *mdev, enum drbd_state_rv retcode)
+static void dcbp_set_code(struct p_compressed_bm *p, enum drbd_bitmap_code code)
{
- struct p_req_state_reply p;
+ BUG_ON(code & ~0xf);
+ p->encoding = (p->encoding & ~0xf) | code;
+}
- p.retcode = cpu_to_be32(retcode);
+static void dcbp_set_start(struct p_compressed_bm *p, int set)
+{
+ p->encoding = (p->encoding & ~0x80) | (set ? 0x80 : 0);
+}
- return drbd_send_cmd(mdev, USE_META_SOCKET, P_STATE_CHG_REPLY,
- (struct p_header80 *)&p, sizeof(p));
+static void dcbp_set_pad_bits(struct p_compressed_bm *p, int n)
+{
+ BUG_ON(n & ~0x7);
+ p->encoding = (p->encoding & (~0x7 << 4)) | (n << 4);
}
int fill_bitmap_rle_bits(struct drbd_conf *mdev,
- struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct p_compressed_bm *p,
+ unsigned int size,
+ struct bm_xfer_ctx *c)
{
struct bitstream bs;
unsigned long plain_bits;
@@ -2339,19 +1078,21 @@
unsigned long rl;
unsigned len;
unsigned toggle;
- int bits;
+ int bits, use_rle;
/* may we use this feature? */
- if ((mdev->sync_conf.use_rle == 0) ||
- (mdev->agreed_pro_version < 90))
- return 0;
+ rcu_read_lock();
+ use_rle = rcu_dereference(mdev->tconn->net_conf)->use_rle;
+ rcu_read_unlock();
+ if (!use_rle || mdev->tconn->agreed_pro_version < 90)
+ return 0;
if (c->bit_offset >= c->bm_bits)
return 0; /* nothing to do. */
/* use at most thus many bytes */
- bitstream_init(&bs, p->code, BM_PACKET_VLI_BYTES_MAX, 0);
- memset(p->code, 0, BM_PACKET_VLI_BYTES_MAX);
+ bitstream_init(&bs, p->code, size, 0);
+ memset(p->code, 0, size);
/* plain bits covered in this code string */
plain_bits = 0;
@@ -2373,12 +1114,12 @@
if (rl == 0) {
/* the first checked bit was set,
* store start value, */
- DCBP_set_start(p, 1);
+ dcbp_set_start(p, 1);
/* but skip encoding of zero run length */
toggle = !toggle;
continue;
}
- DCBP_set_start(p, 0);
+ dcbp_set_start(p, 0);
}
/* paranoia: catch zero runlength.
@@ -2418,7 +1159,7 @@
bm_xfer_ctx_bit_to_word_offset(c);
/* store pad_bits */
- DCBP_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
+ dcbp_set_pad_bits(p, (8 - bs.cur.bit) & 0x7);
return len;
}
@@ -2430,48 +1171,52 @@
* code upon failure.
*/
static int
-send_bitmap_rle_or_plain(struct drbd_conf *mdev,
- struct p_header80 *h, struct bm_xfer_ctx *c)
+send_bitmap_rle_or_plain(struct drbd_conf *mdev, struct bm_xfer_ctx *c)
{
- struct p_compressed_bm *p = (void*)h;
- unsigned long num_words;
- int len;
- int ok;
+ struct drbd_socket *sock = &mdev->tconn->data;
+ unsigned int header_size = drbd_header_size(mdev->tconn);
+ struct p_compressed_bm *p = sock->sbuf + header_size;
+ int len, err;
- len = fill_bitmap_rle_bits(mdev, p, c);
-
+ len = fill_bitmap_rle_bits(mdev, p,
+ DRBD_SOCKET_BUFFER_SIZE - header_size - sizeof(*p), c);
if (len < 0)
return -EIO;
if (len) {
- DCBP_set_code(p, RLE_VLI_Bits);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_COMPRESSED_BITMAP, h,
- sizeof(*p) + len, 0);
-
+ dcbp_set_code(p, RLE_VLI_Bits);
+ err = __send_command(mdev->tconn, mdev->vnr, sock,
+ P_COMPRESSED_BITMAP, sizeof(*p) + len,
+ NULL, 0);
c->packets[0]++;
- c->bytes[0] += sizeof(*p) + len;
+ c->bytes[0] += header_size + sizeof(*p) + len;
if (c->bit_offset >= c->bm_bits)
len = 0; /* DONE */
} else {
/* was not compressible.
* send a buffer full of plain text bits instead. */
- num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- len = num_words * sizeof(long);
+ unsigned int data_size;
+ unsigned long num_words;
+ unsigned long *p = sock->sbuf + header_size;
+
+ data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ len = num_words * sizeof(*p);
if (len)
- drbd_bm_get_lel(mdev, c->word_offset, num_words, (unsigned long*)h->payload);
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BITMAP,
- h, sizeof(struct p_header80) + len, 0);
+ drbd_bm_get_lel(mdev, c->word_offset, num_words, p);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, P_BITMAP, len, NULL, 0);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
c->packets[1]++;
- c->bytes[1] += sizeof(struct p_header80) + len;
+ c->bytes[1] += header_size + len;
if (c->bit_offset > c->bm_bits)
c->bit_offset = c->bm_bits;
}
- if (ok) {
+ if (!err) {
if (len == 0) {
INFO_bm_xfer_stats(mdev, "send", c);
return 0;
@@ -2482,21 +1227,13 @@
}
/* See the comment at receive_bitmap() */
-int _drbd_send_bitmap(struct drbd_conf *mdev)
+static int _drbd_send_bitmap(struct drbd_conf *mdev)
{
struct bm_xfer_ctx c;
- struct p_header80 *p;
int err;
- ERR_IF(!mdev->bitmap) return false;
-
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- p = (struct p_header80 *) __get_free_page(GFP_NOIO);
- if (!p) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
+ if (!expect(mdev->bitmap))
return false;
- }
if (get_ldev(mdev)) {
if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
@@ -2521,37 +1258,39 @@
};
do {
- err = send_bitmap_rle_or_plain(mdev, p, &c);
+ err = send_bitmap_rle_or_plain(mdev, &c);
} while (err > 0);
- free_page((unsigned long) p);
return err == 0;
}
int drbd_send_bitmap(struct drbd_conf *mdev)
{
- int err;
+ struct drbd_socket *sock = &mdev->tconn->data;
+ int err = -1;
- if (!drbd_get_data_sock(mdev))
- return -1;
- err = !_drbd_send_bitmap(mdev);
- drbd_put_data_sock(mdev);
+ mutex_lock(&sock->mutex);
+ if (sock->socket)
+ err = !_drbd_send_bitmap(mdev);
+ mutex_unlock(&sock->mutex);
return err;
}
-int drbd_send_b_ack(struct drbd_conf *mdev, u32 barrier_nr, u32 set_size)
+void drbd_send_b_ack(struct drbd_tconn *tconn, u32 barrier_nr, u32 set_size)
{
- int ok;
- struct p_barrier_ack p;
+ struct drbd_socket *sock;
+ struct p_barrier_ack *p;
- p.barrier = barrier_nr;
- p.set_size = cpu_to_be32(set_size);
+ if (tconn->cstate < C_WF_REPORT_PARAMS)
+ return;
- if (mdev->state.conn < C_CONNECTED)
- return false;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, P_BARRIER_ACK,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &tconn->meta;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return;
+ p->barrier = barrier_nr;
+ p->set_size = cpu_to_be32(set_size);
+ conn_send_command(tconn, sock, P_BARRIER_ACK, sizeof(*p), NULL, 0);
}
/**
@@ -2562,62 +1301,62 @@
* @blksize: size in byte, needs to be in big endian byte order
* @block_id: Id, big endian byte order
*/
-static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packets cmd,
- u64 sector,
- u32 blksize,
- u64 block_id)
+static int _drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+ u64 sector, u32 blksize, u64 block_id)
{
- int ok;
- struct p_block_ack p;
+ struct drbd_socket *sock;
+ struct p_block_ack *p;
- p.sector = sector;
- p.block_id = block_id;
- p.blksize = blksize;
- p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
+ if (mdev->state.conn < C_CONNECTED)
+ return -EIO;
- if (!mdev->meta.socket || mdev->state.conn < C_CONNECTED)
- return false;
- ok = drbd_send_cmd(mdev, USE_META_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->meta;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = sector;
+ p->block_id = block_id;
+ p->blksize = blksize;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}
/* dp->sector and dp->block_id already/still in network byte order,
* data_size is payload size according to dp->head,
* and may need to be corrected for digest size. */
-int drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_data *dp, int data_size)
+void drbd_send_ack_dp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_data *dp, int data_size)
{
- data_size -= (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
- return _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
- dp->block_id);
+ if (mdev->tconn->peer_integrity_tfm)
+ data_size -= crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ _drbd_send_ack(mdev, cmd, dp->sector, cpu_to_be32(data_size),
+ dp->block_id);
}
-int drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct p_block_req *rp)
+void drbd_send_ack_rp(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct p_block_req *rp)
{
- return _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
+ _drbd_send_ack(mdev, cmd, rp->sector, rp->blksize, rp->block_id);
}
/**
* drbd_send_ack() - Sends an ack packet
- * @mdev: DRBD device.
- * @cmd: Packet command code.
- * @e: Epoch entry.
+ * @mdev: DRBD device
+ * @cmd: packet command code
+ * @peer_req: peer request
*/
-int drbd_send_ack(struct drbd_conf *mdev,
- enum drbd_packets cmd, struct drbd_epoch_entry *e)
+int drbd_send_ack(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
return _drbd_send_ack(mdev, cmd,
- cpu_to_be64(e->sector),
- cpu_to_be32(e->size),
- e->block_id);
+ cpu_to_be64(peer_req->i.sector),
+ cpu_to_be32(peer_req->i.size),
+ peer_req->block_id);
}
/* This function misuses the block_id field to signal if the blocks
* are is sync or not. */
-int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packets cmd,
+int drbd_send_ack_ex(struct drbd_conf *mdev, enum drbd_packet cmd,
sector_t sector, int blksize, u64 block_id)
{
return _drbd_send_ack(mdev, cmd,
@@ -2629,85 +1368,87 @@
int drbd_send_drequest(struct drbd_conf *mdev, int cmd,
sector_t sector, int size, u64 block_id)
{
- int ok;
- struct p_block_req p;
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- p.sector = cpu_to_be64(sector);
- p.block_id = block_id;
- p.blksize = cpu_to_be32(size);
-
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, cmd,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = block_id;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p), NULL, 0);
}
-int drbd_send_drequest_csum(struct drbd_conf *mdev,
- sector_t sector, int size,
- void *digest, int digest_size,
- enum drbd_packets cmd)
+int drbd_send_drequest_csum(struct drbd_conf *mdev, sector_t sector, int size,
+ void *digest, int digest_size, enum drbd_packet cmd)
{
- int ok;
- struct p_block_req p;
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbeef;
- p.blksize = cpu_to_be32(size);
+ /* FIXME: Put the digest into the preallocated socket buffer. */
- p.head.magic = BE_DRBD_MAGIC;
- p.head.command = cpu_to_be16(cmd);
- p.head.length = cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + digest_size);
-
- mutex_lock(&mdev->data.mutex);
-
- ok = (sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), 0));
- ok = ok && (digest_size == drbd_send(mdev, mdev->data.socket, digest, digest_size, 0));
-
- mutex_unlock(&mdev->data.mutex);
-
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, cmd, sizeof(*p),
+ digest, digest_size);
}
int drbd_send_ov_request(struct drbd_conf *mdev, sector_t sector, int size)
{
- int ok;
- struct p_block_req p;
+ struct drbd_socket *sock;
+ struct p_block_req *p;
- p.sector = cpu_to_be64(sector);
- p.block_id = BE_DRBD_MAGIC + 0xbabe;
- p.blksize = cpu_to_be32(size);
-
- ok = drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OV_REQUEST,
- (struct p_header80 *)&p, sizeof(p));
- return ok;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(sector);
+ p->block_id = ID_SYNCER /* unused */;
+ p->blksize = cpu_to_be32(size);
+ return drbd_send_command(mdev, sock, P_OV_REQUEST, sizeof(*p), NULL, 0);
}
/* called on sndtimeo
* returns false if we should retry,
* true if we think connection is dead
*/
-static int we_should_drop_the_connection(struct drbd_conf *mdev, struct socket *sock)
+static int we_should_drop_the_connection(struct drbd_tconn *tconn, struct socket *sock)
{
int drop_it;
/* long elapsed = (long)(jiffies - mdev->last_received); */
- drop_it = mdev->meta.socket == sock
- || !mdev->asender.task
- || get_t_state(&mdev->asender) != Running
- || mdev->state.conn < C_CONNECTED;
+ drop_it = tconn->meta.socket == sock
+ || !tconn->asender.task
+ || get_t_state(&tconn->asender) != RUNNING
+ || tconn->cstate < C_WF_REPORT_PARAMS;
if (drop_it)
return true;
- drop_it = !--mdev->ko_count;
+ drop_it = !--tconn->ko_count;
if (!drop_it) {
- dev_err(DEV, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
- current->comm, current->pid, mdev->ko_count);
- request_ping(mdev);
+ conn_err(tconn, "[%s/%d] sock_sendmsg time expired, ko = %u\n",
+ current->comm, current->pid, tconn->ko_count);
+ request_ping(tconn);
}
return drop_it; /* && (mdev->state == R_PRIMARY) */;
}
+static void drbd_update_congested(struct drbd_tconn *tconn)
+{
+ struct sock *sk = tconn->data.socket->sk;
+ if (sk->sk_wmem_queued > sk->sk_sndbuf * 4 / 5)
+ set_bit(NET_CONGESTED, &tconn->flags);
+}
+
/* The idea of sendpage seems to be to put some kind of reference
* to the page into the skb, and to hand it over to the NIC. In
* this process get_page() gets called.
@@ -2730,21 +1471,28 @@
* with page_count == 0 or PageSlab.
*/
static int _drbd_no_send_page(struct drbd_conf *mdev, struct page *page,
- int offset, size_t size, unsigned msg_flags)
+ int offset, size_t size, unsigned msg_flags)
{
- int sent = drbd_send(mdev, mdev->data.socket, kmap(page) + offset, size, msg_flags);
+ struct socket *socket;
+ void *addr;
+ int err;
+
+ socket = mdev->tconn->data.socket;
+ addr = kmap(page) + offset;
+ err = drbd_send_all(mdev->tconn, socket, addr, size, msg_flags);
kunmap(page);
- if (sent == size)
- mdev->send_cnt += size>>9;
- return sent == size;
+ if (!err)
+ mdev->send_cnt += size >> 9;
+ return err;
}
static int _drbd_send_page(struct drbd_conf *mdev, struct page *page,
int offset, size_t size, unsigned msg_flags)
{
+ struct socket *socket = mdev->tconn->data.socket;
mm_segment_t oldfs = get_fs();
- int sent, ok;
int len = size;
+ int err = -EIO;
/* e.g. XFS meta- & log-data is in slab pages, which have a
* page_count of 0 and/or have PageSlab() set.
@@ -2756,34 +1504,35 @@
return _drbd_no_send_page(mdev, page, offset, size, msg_flags);
msg_flags |= MSG_NOSIGNAL;
- drbd_update_congested(mdev);
+ drbd_update_congested(mdev->tconn);
set_fs(KERNEL_DS);
do {
- sent = mdev->data.socket->ops->sendpage(mdev->data.socket, page,
- offset, len,
- msg_flags);
- if (sent == -EAGAIN) {
- if (we_should_drop_the_connection(mdev,
- mdev->data.socket))
- break;
- else
- continue;
- }
+ int sent;
+
+ sent = socket->ops->sendpage(socket, page, offset, len, msg_flags);
if (sent <= 0) {
+ if (sent == -EAGAIN) {
+ if (we_should_drop_the_connection(mdev->tconn, socket))
+ break;
+ continue;
+ }
dev_warn(DEV, "%s: size=%d len=%d sent=%d\n",
__func__, (int)size, len, sent);
+ if (sent < 0)
+ err = sent;
break;
}
len -= sent;
offset += sent;
} while (len > 0 /* THINK && mdev->cstate >= C_CONNECTED*/);
set_fs(oldfs);
- drbd_clear_flag(mdev, NET_CONGESTED);
+ clear_bit(NET_CONGESTED, &mdev->tconn->flags);
- ok = (len == 0);
- if (likely(ok))
- mdev->send_cnt += size>>9;
- return ok;
+ if (len == 0) {
+ err = 0;
+ mdev->send_cnt += size >> 9;
+ }
+ return err;
}
static int _drbd_send_bio(struct drbd_conf *mdev, struct bio *bio)
@@ -2792,12 +1541,15 @@
int i;
/* hint all but last page with MSG_MORE */
bio_for_each_segment(bvec, bio, i) {
- if (!_drbd_no_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ int err;
+
+ err = _drbd_no_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
static int _drbd_send_zc_bio(struct drbd_conf *mdev, struct bio *bio)
@@ -2806,32 +1558,40 @@
int i;
/* hint all but last page with MSG_MORE */
bio_for_each_segment(bvec, bio, i) {
- if (!_drbd_send_page(mdev, bvec->bv_page,
- bvec->bv_offset, bvec->bv_len,
- i == bio->bi_vcnt -1 ? 0 : MSG_MORE))
- return 0;
+ int err;
+
+ err = _drbd_send_page(mdev, bvec->bv_page,
+ bvec->bv_offset, bvec->bv_len,
+ i == bio->bi_vcnt - 1 ? 0 : MSG_MORE);
+ if (err)
+ return err;
}
- return 1;
+ return 0;
}
-static int _drbd_send_zc_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static int _drbd_send_zc_ee(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
{
- struct page *page = e->pages;
- unsigned len = e->size;
+ struct page *page = peer_req->pages;
+ unsigned len = peer_req->i.size;
+ int err;
+
/* hint all but last page with MSG_MORE */
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
- if (!_drbd_send_page(mdev, page, 0, l,
- page_chain_next(page) ? MSG_MORE : 0))
- return 0;
+
+ err = _drbd_send_page(mdev, page, 0, l,
+ page_chain_next(page) ? MSG_MORE : 0);
+ if (err)
+ return err;
len -= l;
}
- return 1;
+ return 0;
}
static u32 bio_flags_to_wire(struct drbd_conf *mdev, unsigned long bi_rw)
{
- if (mdev->agreed_pro_version >= 95)
+ if (mdev->tconn->agreed_pro_version >= 95)
return (bi_rw & REQ_SYNC ? DP_RW_SYNC : 0) |
(bi_rw & REQ_FUA ? DP_FUA : 0) |
(bi_rw & REQ_FLUSH ? DP_FLUSH : 0) |
@@ -2845,50 +1605,36 @@
*/
int drbd_send_dblock(struct drbd_conf *mdev, struct drbd_request *req)
{
- int ok = 1;
- struct p_data p;
+ struct drbd_socket *sock;
+ struct p_data *p;
unsigned int dp_flags = 0;
- void *dgb;
int dgs;
+ int err;
- if (!drbd_get_data_sock(mdev))
- return 0;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
-
- if (req->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(P_DATA);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(P_DATA);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(union p_header) + dgs + req->size);
- }
-
- p.sector = cpu_to_be64(req->sector);
- p.block_id = (unsigned long)req;
- p.seq_num = cpu_to_be32(atomic_add_return(1, &mdev->packet_seq));
-
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->block_id = (unsigned long)req;
+ p->seq_num = cpu_to_be32(atomic_inc_return(&mdev->packet_seq));
dp_flags = bio_flags_to_wire(mdev, req->master_bio->bi_rw);
-
if (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T)
dp_flags |= DP_MAY_SET_IN_SYNC;
-
- p.dp_flags = cpu_to_be32(dp_flags);
- drbd_set_flag(mdev, UNPLUG_REMOTE);
- ok = (sizeof(p) ==
- drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0));
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, dgb);
- ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
+ if (mdev->tconn->agreed_pro_version >= 100) {
+ if (req->rq_state & RQ_EXP_RECEIVE_ACK)
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ if (req->rq_state & RQ_EXP_WRITE_ACK)
+ dp_flags |= DP_SEND_WRITE_ACK;
}
- if (ok) {
+ p->dp_flags = cpu_to_be32(dp_flags);
+ if (dgs)
+ drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, p + 1);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, P_DATA, sizeof(*p) + dgs, NULL, req->i.size);
+ if (!err) {
/* For protocol A, we have to memcpy the payload into
* socket buffers, as we may complete right away
* as soon as we handed it over to tcp, at which point the data
@@ -2900,92 +1646,76 @@
* out ok after sending on this side, but does not fit on the
* receiving side, we sure have detected corruption elsewhere.
*/
- if (mdev->net_conf->wire_protocol == DRBD_PROT_A || dgs)
- ok = _drbd_send_bio(mdev, req->master_bio);
+ if (!(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK)) || dgs)
+ err = _drbd_send_bio(mdev, req->master_bio);
else
- ok = _drbd_send_zc_bio(mdev, req->master_bio);
+ err = _drbd_send_zc_bio(mdev, req->master_bio);
/* double check digest, sometimes buffers have been modified in flight. */
if (dgs > 0 && dgs <= 64) {
/* 64 byte, 512 bit, is the largest digest size
* currently supported in kernel crypto. */
unsigned char digest[64];
- drbd_csum_bio(mdev, mdev->integrity_w_tfm, req->master_bio, digest);
- if (memcmp(mdev->int_dig_out, digest, dgs)) {
+ drbd_csum_bio(mdev, mdev->tconn->integrity_tfm, req->master_bio, digest);
+ if (memcmp(p + 1, digest, dgs)) {
dev_warn(DEV,
"Digest mismatch, buffer modified by upper layers during write: %llus +%u\n",
- (unsigned long long)req->sector, req->size);
+ (unsigned long long)req->i.sector, req->i.size);
}
} /* else if (dgs > 64) {
... Be noisy about digest too large ...
} */
}
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- drbd_put_data_sock(mdev);
-
- return ok;
+ return err;
}
/* answer packet, used to send data back for read requests:
* Peer -> (diskless) R_PRIMARY (P_DATA_REPLY)
* C_SYNC_SOURCE -> C_SYNC_TARGET (P_RS_DATA_REPLY)
*/
-int drbd_send_block(struct drbd_conf *mdev, enum drbd_packets cmd,
- struct drbd_epoch_entry *e)
+int drbd_send_block(struct drbd_conf *mdev, enum drbd_packet cmd,
+ struct drbd_peer_request *peer_req)
{
- int ok;
- struct p_data p;
- void *dgb;
+ struct drbd_socket *sock;
+ struct p_data *p;
+ int err;
int dgs;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_w_tfm) ?
- crypto_hash_digestsize(mdev->integrity_w_tfm) : 0;
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
- if (e->size <= DRBD_MAX_SIZE_H80_PACKET) {
- p.head.h80.magic = BE_DRBD_MAGIC;
- p.head.h80.command = cpu_to_be16(cmd);
- p.head.h80.length =
- cpu_to_be16(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- } else {
- p.head.h95.magic = BE_DRBD_MAGIC_BIG;
- p.head.h95.command = cpu_to_be16(cmd);
- p.head.h95.length =
- cpu_to_be32(sizeof(p) - sizeof(struct p_header80) + dgs + e->size);
- }
+ dgs = mdev->tconn->integrity_tfm ? crypto_hash_digestsize(mdev->tconn->integrity_tfm) : 0;
- p.sector = cpu_to_be64(e->sector);
- p.block_id = e->block_id;
- /* p.seq_num = 0; No sequence numbers here.. */
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(peer_req->i.sector);
+ p->block_id = peer_req->block_id;
+ p->seq_num = 0; /* unused */
+ p->dp_flags = 0;
+ if (dgs)
+ drbd_csum_ee(mdev, mdev->tconn->integrity_tfm, peer_req, p + 1);
+ err = __send_command(mdev->tconn, mdev->vnr, sock, cmd, sizeof(*p) + dgs, NULL, peer_req->i.size);
+ if (!err)
+ err = _drbd_send_zc_ee(mdev, peer_req);
+ mutex_unlock(&sock->mutex); /* locked by drbd_prepare_command() */
- /* Only called by our kernel thread.
- * This one may be interrupted by DRBD_SIG and/or DRBD_SIGKILL
- * in response to admin command or module unload.
- */
- if (!drbd_get_data_sock(mdev))
- return 0;
-
- ok = sizeof(p) == drbd_send(mdev, mdev->data.socket, &p, sizeof(p), dgs ? MSG_MORE : 0);
- if (ok && dgs) {
- dgb = mdev->int_dig_out;
- drbd_csum_ee(mdev, mdev->integrity_w_tfm, e, dgb);
- ok = dgs == drbd_send(mdev, mdev->data.socket, dgb, dgs, 0);
- }
- if (ok)
- ok = _drbd_send_zc_ee(mdev, e);
-
- drbd_put_data_sock(mdev);
-
- return ok;
+ return err;
}
-int drbd_send_oos(struct drbd_conf *mdev, struct drbd_request *req)
+int drbd_send_out_of_sync(struct drbd_conf *mdev, struct drbd_request *req)
{
- struct p_block_desc p;
+ struct drbd_socket *sock;
+ struct p_block_desc *p;
- p.sector = cpu_to_be64(req->sector);
- p.blksize = cpu_to_be32(req->size);
-
- return drbd_send_cmd(mdev, USE_DATA_SOCKET, P_OUT_OF_SYNC, &p.head, sizeof(p));
+ sock = &mdev->tconn->data;
+ p = drbd_prepare_command(mdev, sock);
+ if (!p)
+ return -EIO;
+ p->sector = cpu_to_be64(req->i.sector);
+ p->blksize = cpu_to_be32(req->i.size);
+ return drbd_send_command(mdev, sock, P_OUT_OF_SYNC, sizeof(*p), NULL, 0);
}
/*
@@ -3004,7 +1734,7 @@
/*
* you must have down()ed the appropriate [m]sock_mutex elsewhere!
*/
-int drbd_send(struct drbd_conf *mdev, struct socket *sock,
+int drbd_send(struct drbd_tconn *tconn, struct socket *sock,
void *buf, size_t size, unsigned msg_flags)
{
struct kvec iov;
@@ -3012,7 +1742,7 @@
int rv, sent = 0;
if (!sock)
- return -1000;
+ return -EBADR;
/* THINK if (signal_pending) return ... ? */
@@ -3025,9 +1755,11 @@
msg.msg_controllen = 0;
msg.msg_flags = msg_flags | MSG_NOSIGNAL;
- if (sock == mdev->data.socket) {
- mdev->ko_count = mdev->net_conf->ko_count;
- drbd_update_congested(mdev);
+ if (sock == tconn->data.socket) {
+ rcu_read_lock();
+ tconn->ko_count = rcu_dereference(tconn->net_conf)->ko_count;
+ rcu_read_unlock();
+ drbd_update_congested(tconn);
}
do {
/* STRANGE
@@ -3041,12 +1773,11 @@
*/
rv = kernel_sendmsg(sock, &msg, &iov, 1, size);
if (rv == -EAGAIN) {
- if (we_should_drop_the_connection(mdev, sock))
+ if (we_should_drop_the_connection(tconn, sock))
break;
else
continue;
}
- D_ASSERT(rv != 0);
if (rv == -EINTR) {
flush_signals(current);
rv = 0;
@@ -3058,22 +1789,40 @@
iov.iov_len -= rv;
} while (sent < size);
- if (sock == mdev->data.socket)
- drbd_clear_flag(mdev, NET_CONGESTED);
+ if (sock == tconn->data.socket)
+ clear_bit(NET_CONGESTED, &tconn->flags);
if (rv <= 0) {
if (rv != -EAGAIN) {
- dev_err(DEV, "%s_sendmsg returned %d\n",
- sock == mdev->meta.socket ? "msock" : "sock",
- rv);
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ conn_err(tconn, "%s_sendmsg returned %d\n",
+ sock == tconn->meta.socket ? "msock" : "sock",
+ rv);
+ conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
} else
- drbd_force_state(mdev, NS(conn, C_TIMEOUT));
+ conn_request_state(tconn, NS(conn, C_TIMEOUT), CS_HARD);
}
return sent;
}
+/**
+ * drbd_send_all - Send an entire buffer
+ *
+ * Returns 0 upon success and a negative error value otherwise.
+ */
+int drbd_send_all(struct drbd_tconn *tconn, struct socket *sock, void *buffer,
+ size_t size, unsigned msg_flags)
+{
+ int err;
+
+ err = drbd_send(tconn, sock, buffer, size, msg_flags);
+ if (err < 0)
+ return err;
+ if (err != size)
+ return -EIO;
+ return 0;
+}
+
static int drbd_open(struct block_device *bdev, fmode_t mode)
{
struct drbd_conf *mdev = bdev->bd_disk->private_data;
@@ -3081,7 +1830,7 @@
int rv = 0;
mutex_lock(&drbd_main_mutex);
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
/* to have a stable mdev->state.role
* and no race with updating open_cnt */
@@ -3094,7 +1843,7 @@
if (!rv)
mdev->open_cnt++;
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
mutex_unlock(&drbd_main_mutex);
return rv;
@@ -3111,35 +1860,14 @@
static void drbd_set_defaults(struct drbd_conf *mdev)
{
- /* This way we get a compile error when sync_conf grows,
- and we forgot to initialize it here */
- mdev->sync_conf = (struct syncer_conf) {
- /* .rate = */ DRBD_RATE_DEF,
- /* .after = */ DRBD_AFTER_DEF,
- /* .al_extents = */ DRBD_AL_EXTENTS_DEF,
- /* .verify_alg = */ {}, 0,
- /* .cpu_mask = */ {}, 0,
- /* .csums_alg = */ {}, 0,
- /* .use_rle = */ 0,
- /* .on_no_data = */ DRBD_ON_NO_DATA_DEF,
- /* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
- /* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
- /* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
- /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
- /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
- };
-
- /* Have to use that way, because the layout differs between
- big endian and little endian */
- mdev->state = (union drbd_state) {
+ /* Beware! The actual layout differs
+ * between big endian and little endian */
+ mdev->state = (union drbd_dev_state) {
{ .role = R_SECONDARY,
.peer = R_UNKNOWN,
.conn = C_STANDALONE,
.disk = D_DISKLESS,
.pdsk = D_UNKNOWN,
- .susp = 0,
- .susp_nod = 0,
- .susp_fen = 0
} };
}
@@ -3155,28 +1883,17 @@
atomic_set(&mdev->rs_pending_cnt, 0);
atomic_set(&mdev->unacked_cnt, 0);
atomic_set(&mdev->local_cnt, 0);
- atomic_set(&mdev->net_cnt, 0);
- atomic_set(&mdev->packet_seq, 0);
- atomic_set(&mdev->pp_in_use, 0);
atomic_set(&mdev->pp_in_use_by_net, 0);
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
atomic_set(&mdev->ap_in_flight, 0);
atomic_set(&mdev->md_io_in_use, 0);
- mutex_init(&mdev->data.mutex);
- mutex_init(&mdev->meta.mutex);
- sema_init(&mdev->data.work.s, 0);
- sema_init(&mdev->meta.work.s, 0);
- mutex_init(&mdev->state_mutex);
-
- spin_lock_init(&mdev->data.work.q_lock);
- spin_lock_init(&mdev->meta.work.q_lock);
+ mutex_init(&mdev->own_state_mutex);
+ mdev->state_mutex = &mdev->own_state_mutex;
spin_lock_init(&mdev->al_lock);
- spin_lock_init(&mdev->req_lock);
spin_lock_init(&mdev->peer_seq_lock);
- spin_lock_init(&mdev->epoch_lock);
INIT_LIST_HEAD(&mdev->active_ee);
INIT_LIST_HEAD(&mdev->sync_ee);
@@ -3184,8 +1901,6 @@
INIT_LIST_HEAD(&mdev->read_ee);
INIT_LIST_HEAD(&mdev->net_ee);
INIT_LIST_HEAD(&mdev->resync_reads);
- INIT_LIST_HEAD(&mdev->data.work.q);
- INIT_LIST_HEAD(&mdev->meta.work.q);
INIT_LIST_HEAD(&mdev->resync_work.list);
INIT_LIST_HEAD(&mdev->unplug_work.list);
INIT_LIST_HEAD(&mdev->go_diskless.list);
@@ -3199,6 +1914,14 @@
mdev->md_sync_work.cb = w_md_sync;
mdev->bm_io_work.w.cb = w_bitmap_io;
mdev->start_resync_work.cb = w_start_resync;
+
+ mdev->resync_work.mdev = mdev;
+ mdev->unplug_work.mdev = mdev;
+ mdev->go_diskless.mdev = mdev;
+ mdev->md_sync_work.mdev = mdev;
+ mdev->bm_io_work.w.mdev = mdev;
+ mdev->start_resync_work.mdev = mdev;
+
init_timer(&mdev->resync_timer);
init_timer(&mdev->md_sync_timer);
init_timer(&mdev->start_resync_timer);
@@ -3214,17 +1937,10 @@
init_waitqueue_head(&mdev->misc_wait);
init_waitqueue_head(&mdev->state_wait);
- init_waitqueue_head(&mdev->net_cnt_wait);
init_waitqueue_head(&mdev->ee_wait);
init_waitqueue_head(&mdev->al_wait);
init_waitqueue_head(&mdev->seq_wait);
- drbd_thread_init(mdev, &mdev->receiver, drbdd_init);
- drbd_thread_init(mdev, &mdev->worker, drbd_worker);
- drbd_thread_init(mdev, &mdev->asender, drbd_asender);
-
- mdev->agreed_pro_version = PRO_VERSION_MAX;
- mdev->write_ordering = WO_bdev_flush;
mdev->resync_wenr = LC_FREE;
mdev->peer_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
mdev->local_max_bio_size = DRBD_MAX_BIO_SIZE_SAFE;
@@ -3233,13 +1949,10 @@
void drbd_mdev_cleanup(struct drbd_conf *mdev)
{
int i;
- if (mdev->receiver.t_state != None)
+ if (mdev->tconn->receiver.t_state != NONE)
dev_err(DEV, "ASSERT FAILED: receiver t_state == %d expected 0.\n",
- mdev->receiver.t_state);
+ mdev->tconn->receiver.t_state);
- /* no need to lock it, I'm the only thread alive */
- if (atomic_read(&mdev->current_epoch->epoch_size) != 0)
- dev_err(DEV, "epoch_size:%d\n", atomic_read(&mdev->current_epoch->epoch_size));
mdev->al_writ_cnt =
mdev->bm_writ_cnt =
mdev->read_cnt =
@@ -3256,7 +1969,7 @@
mdev->rs_mark_left[i] = 0;
mdev->rs_mark_time[i] = 0;
}
- D_ASSERT(mdev->net_conf == NULL);
+ D_ASSERT(mdev->tconn->net_conf == NULL);
drbd_set_my_capacity(mdev, 0);
if (mdev->bitmap) {
@@ -3265,21 +1978,18 @@
drbd_bm_cleanup(mdev);
}
- drbd_free_resources(mdev);
- drbd_clear_flag(mdev, AL_SUSPENDED);
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;
- /*
- * currently we drbd_init_ee only on module load, so
- * we may do drbd_release_ee only on module unload!
- */
+ clear_bit(AL_SUSPENDED, &mdev->flags);
+
D_ASSERT(list_empty(&mdev->active_ee));
D_ASSERT(list_empty(&mdev->sync_ee));
D_ASSERT(list_empty(&mdev->done_ee));
D_ASSERT(list_empty(&mdev->read_ee));
D_ASSERT(list_empty(&mdev->net_ee));
D_ASSERT(list_empty(&mdev->resync_reads));
- D_ASSERT(list_empty(&mdev->data.work.q));
- D_ASSERT(list_empty(&mdev->meta.work.q));
+ D_ASSERT(list_empty(&mdev->tconn->sender_work.q));
D_ASSERT(list_empty(&mdev->resync_work.list));
D_ASSERT(list_empty(&mdev->unplug_work.list));
D_ASSERT(list_empty(&mdev->go_diskless.list));
@@ -3353,7 +2063,7 @@
goto Enomem;
drbd_ee_cache = kmem_cache_create(
- "drbd_ee", sizeof(struct drbd_epoch_entry), 0, 0, NULL);
+ "drbd_ee", sizeof(struct drbd_peer_request), 0, 0, NULL);
if (drbd_ee_cache == NULL)
goto Enomem;
@@ -3368,11 +2078,9 @@
goto Enomem;
/* mempools */
-#ifdef COMPAT_HAVE_BIOSET_CREATE
drbd_md_io_bio_set = bioset_create(DRBD_MIN_POOL_PAGES, 0);
if (drbd_md_io_bio_set == NULL)
goto Enomem;
-#endif
drbd_md_io_page_pool = mempool_create_page_pool(DRBD_MIN_POOL_PAGES, 0);
if (drbd_md_io_page_pool == NULL)
@@ -3421,73 +2129,53 @@
.notifier_call = drbd_notify_sys,
};
-static void drbd_release_ee_lists(struct drbd_conf *mdev)
+static void drbd_release_all_peer_reqs(struct drbd_conf *mdev)
{
int rr;
- rr = drbd_release_ee(mdev, &mdev->active_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->active_ee);
if (rr)
dev_err(DEV, "%d EEs in active list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->sync_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->sync_ee);
if (rr)
dev_err(DEV, "%d EEs in sync list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->read_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->read_ee);
if (rr)
dev_err(DEV, "%d EEs in read list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->done_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->done_ee);
if (rr)
dev_err(DEV, "%d EEs in done list found!\n", rr);
- rr = drbd_release_ee(mdev, &mdev->net_ee);
+ rr = drbd_free_peer_reqs(mdev, &mdev->net_ee);
if (rr)
dev_err(DEV, "%d EEs in net list found!\n", rr);
}
-/* caution. no locking.
- * currently only used from module cleanup code. */
-static void drbd_delete_device(unsigned int minor)
+/* caution. no locking. */
+void drbd_minor_destroy(struct kref *kref)
{
- struct drbd_conf *mdev = minor_to_mdev(minor);
-
- if (!mdev)
- return;
+ struct drbd_conf *mdev = container_of(kref, struct drbd_conf, kref);
+ struct drbd_tconn *tconn = mdev->tconn;
del_timer_sync(&mdev->request_timer);
/* paranoia asserts */
- if (mdev->open_cnt != 0)
- dev_err(DEV, "open_cnt = %d in %s:%u", mdev->open_cnt,
- __FILE__ , __LINE__);
-
- ERR_IF (!list_empty(&mdev->data.work.q)) {
- struct list_head *lp;
- list_for_each(lp, &mdev->data.work.q) {
- dev_err(DEV, "lp = %p\n", lp);
- }
- };
+ D_ASSERT(mdev->open_cnt == 0);
/* end paranoia asserts */
- del_gendisk(mdev->vdisk);
-
/* cleanup stuff that may have been allocated during
* device (re-)configuration or state changes */
if (mdev->this_bdev)
bdput(mdev->this_bdev);
- drbd_free_resources(mdev);
+ drbd_free_bc(mdev->ldev);
+ mdev->ldev = NULL;
- drbd_release_ee_lists(mdev);
-
- /* should be freed on disconnect? */
- kfree(mdev->ee_hash);
- /*
- mdev->ee_hash_s = 0;
- mdev->ee_hash = NULL;
- */
+ drbd_release_all_peer_reqs(mdev);
lc_destroy(mdev->act_log);
lc_destroy(mdev->resync);
@@ -3495,19 +2183,101 @@
kfree(mdev->p_uuid);
/* mdev->p_uuid = NULL; */
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
+ if (mdev->bitmap) /* should no longer be there. */
+ drbd_bm_cleanup(mdev);
+ __free_page(mdev->md_io_page);
+ put_disk(mdev->vdisk);
+ blk_cleanup_queue(mdev->rq_queue);
+ kfree(mdev->rs_plan_s);
+ kfree(mdev);
- /* cleanup the rest that has been
- * allocated from drbd_new_device
- * and actually free the mdev itself */
- drbd_free_mdev(mdev);
+ kref_put(&tconn->kref, &conn_destroy);
}
+/* One global retry thread, if we need to push back some bio and have it
+ * reinserted through our make request function.
+ */
+static struct retry_worker {
+ struct workqueue_struct *wq;
+ struct work_struct worker;
+
+ spinlock_t lock;
+ struct list_head writes;
+} retry;
+
+static void do_retry(struct work_struct *ws)
+{
+ struct retry_worker *retry = container_of(ws, struct retry_worker, worker);
+ LIST_HEAD(writes);
+ struct drbd_request *req, *tmp;
+
+ spin_lock_irq(&retry->lock);
+ list_splice_init(&retry->writes, &writes);
+ spin_unlock_irq(&retry->lock);
+
+ list_for_each_entry_safe(req, tmp, &writes, tl_requests) {
+ struct drbd_conf *mdev = req->w.mdev;
+ struct bio *bio = req->master_bio;
+ unsigned long start_time = req->start_time;
+ bool expected;
+
+ expected =
+ expect(atomic_read(&req->completion_ref) == 0) &&
+ expect(req->rq_state & RQ_POSTPONED) &&
+ expect((req->rq_state & RQ_LOCAL_PENDING) == 0 ||
+ (req->rq_state & RQ_LOCAL_ABORTED) != 0);
+
+ if (!expected)
+ dev_err(DEV, "req=%p completion_ref=%d rq_state=%x\n",
+ req, atomic_read(&req->completion_ref),
+ req->rq_state);
+
+ /* We still need to put one kref associated with the
+ * "completion_ref" going zero in the code path that queued it
+ * here. The request object may still be referenced by a
+ * frozen local req->private_bio, in case we force-detached.
+ */
+ kref_put(&req->kref, drbd_req_destroy);
+
+ /* A single suspended or otherwise blocking device may stall
+ * all others as well. Fortunately, this code path is to
+ * recover from a situation that "should not happen":
+ * concurrent writes in multi-primary setup.
+ * In a "normal" lifecycle, this workqueue is supposed to be
+ * destroyed without ever doing anything.
+ * If it turns out to be an issue anyways, we can do per
+ * resource (replication group) or per device (minor) retry
+ * workqueues instead.
+ */
+
+ /* We are not just doing generic_make_request(),
+ * as we want to keep the start_time information. */
+ inc_ap_bio(mdev);
+ __drbd_make_request(mdev, bio, start_time);
+ }
+}
+
+void drbd_restart_request(struct drbd_request *req)
+{
+ unsigned long flags;
+ spin_lock_irqsave(&retry.lock, flags);
+ list_move_tail(&req->tl_requests, &retry.writes);
+ spin_unlock_irqrestore(&retry.lock, flags);
+
+ /* Drop the extra reference that would otherwise
+ * have been dropped by complete_master_bio.
+ * do_retry() needs to grab a new one. */
+ dec_ap_bio(req->w.mdev);
+
+ queue_work(retry.wq, &retry.worker);
+}
+
+
static void drbd_cleanup(void)
{
unsigned int i;
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn, *tmp;
unregister_reboot_notifier(&drbd_notifier);
@@ -3522,19 +2292,31 @@
if (drbd_proc)
remove_proc_entry("drbd", NULL);
- drbd_nl_cleanup();
+ if (retry.wq)
+ destroy_workqueue(retry.wq);
- if (minor_table) {
- i = minor_count;
- while (i--)
- drbd_delete_device(i);
- drbd_destroy_mempools();
+ drbd_genl_unregister();
+
+ idr_for_each_entry(&minors, mdev, i) {
+ idr_remove(&minors, mdev_to_minor(mdev));
+ idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ del_gendisk(mdev->vdisk);
+ /* synchronize_rcu(); No other threads running at this point */
+ kref_put(&mdev->kref, &drbd_minor_destroy);
}
- kfree(minor_table);
+ /* not _rcu since, no other updater anymore. Genl already unregistered */
+ list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+ list_del(&tconn->all_tconn); /* not _rcu no proc, not other threads */
+ /* synchronize_rcu(); */
+ kref_put(&tconn->kref, &conn_destroy);
+ }
+ drbd_destroy_mempools();
unregister_blkdev(DRBD_MAJOR, "drbd");
+ idr_destroy(&minors);
+
printk(KERN_INFO "drbd: module cleanup done.\n");
}
@@ -3559,7 +2341,7 @@
goto out;
}
- if (drbd_test_flag(mdev, CALLBACK_PENDING)) {
+ if (test_bit(CALLBACK_PENDING, &mdev->tconn->flags)) {
r |= (1 << BDI_async_congested);
/* Without good local data, we would need to read from remote,
* and that would need the worker thread as well, which is
@@ -3583,7 +2365,7 @@
reason = 'b';
}
- if (bdi_bits & (1 << BDI_async_congested) && drbd_test_flag(mdev, NET_CONGESTED)) {
+ if (bdi_bits & (1 << BDI_async_congested) && test_bit(NET_CONGESTED, &mdev->tconn->flags)) {
r |= (1 << BDI_async_congested);
reason = reason == 'b' ? 'a' : 'n';
}
@@ -3593,20 +2375,243 @@
return r;
}
-struct drbd_conf *drbd_new_device(unsigned int minor)
+static void drbd_init_workqueue(struct drbd_work_queue* wq)
+{
+ spin_lock_init(&wq->q_lock);
+ INIT_LIST_HEAD(&wq->q);
+ init_waitqueue_head(&wq->q_wait);
+}
+
+struct drbd_tconn *conn_get_by_name(const char *name)
+{
+ struct drbd_tconn *tconn;
+
+ if (!name || !name[0])
+ return NULL;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+ if (!strcmp(tconn->name, name)) {
+ kref_get(&tconn->kref);
+ goto found;
+ }
+ }
+ tconn = NULL;
+found:
+ rcu_read_unlock();
+ return tconn;
+}
+
+struct drbd_tconn *conn_get_by_addrs(void *my_addr, int my_addr_len,
+ void *peer_addr, int peer_addr_len)
+{
+ struct drbd_tconn *tconn;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(tconn, &drbd_tconns, all_tconn) {
+ if (tconn->my_addr_len == my_addr_len &&
+ tconn->peer_addr_len == peer_addr_len &&
+ !memcmp(&tconn->my_addr, my_addr, my_addr_len) &&
+ !memcmp(&tconn->peer_addr, peer_addr, peer_addr_len)) {
+ kref_get(&tconn->kref);
+ goto found;
+ }
+ }
+ tconn = NULL;
+found:
+ rcu_read_unlock();
+ return tconn;
+}
+
+static int drbd_alloc_socket(struct drbd_socket *socket)
+{
+ socket->rbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->rbuf)
+ return -ENOMEM;
+ socket->sbuf = (void *) __get_free_page(GFP_KERNEL);
+ if (!socket->sbuf)
+ return -ENOMEM;
+ return 0;
+}
+
+static void drbd_free_socket(struct drbd_socket *socket)
+{
+ free_page((unsigned long) socket->sbuf);
+ free_page((unsigned long) socket->rbuf);
+}
+
+void conn_free_crypto(struct drbd_tconn *tconn)
+{
+ drbd_free_sock(tconn);
+
+ crypto_free_hash(tconn->csums_tfm);
+ crypto_free_hash(tconn->verify_tfm);
+ crypto_free_hash(tconn->cram_hmac_tfm);
+ crypto_free_hash(tconn->integrity_tfm);
+ crypto_free_hash(tconn->peer_integrity_tfm);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+
+ tconn->csums_tfm = NULL;
+ tconn->verify_tfm = NULL;
+ tconn->cram_hmac_tfm = NULL;
+ tconn->integrity_tfm = NULL;
+ tconn->peer_integrity_tfm = NULL;
+ tconn->int_dig_in = NULL;
+ tconn->int_dig_vv = NULL;
+}
+
+int set_resource_options(struct drbd_tconn *tconn, struct res_opts *res_opts)
+{
+ cpumask_var_t new_cpu_mask;
+ int err;
+
+ if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL))
+ return -ENOMEM;
+ /*
+ retcode = ERR_NOMEM;
+ drbd_msg_put_info("unable to allocate cpumask");
+ */
+
+ /* silently ignore cpu mask on UP kernel */
+ if (nr_cpu_ids > 1 && res_opts->cpu_mask[0] != 0) {
+ /* FIXME: Get rid of constant 32 here */
+ err = bitmap_parse(res_opts->cpu_mask, 32,
+ cpumask_bits(new_cpu_mask), nr_cpu_ids);
+ if (err) {
+ conn_warn(tconn, "bitmap_parse() failed with %d\n", err);
+ /* retcode = ERR_CPU_MASK_PARSE; */
+ goto fail;
+ }
+ }
+ tconn->res_opts = *res_opts;
+ if (!cpumask_equal(tconn->cpu_mask, new_cpu_mask)) {
+ cpumask_copy(tconn->cpu_mask, new_cpu_mask);
+ drbd_calc_cpu_mask(tconn);
+ tconn->receiver.reset_cpu_mask = 1;
+ tconn->asender.reset_cpu_mask = 1;
+ tconn->worker.reset_cpu_mask = 1;
+ }
+ err = 0;
+
+fail:
+ free_cpumask_var(new_cpu_mask);
+ return err;
+
+}
+
+/* caller must be under genl_lock() */
+struct drbd_tconn *conn_create(const char *name, struct res_opts *res_opts)
+{
+ struct drbd_tconn *tconn;
+
+ tconn = kzalloc(sizeof(struct drbd_tconn), GFP_KERNEL);
+ if (!tconn)
+ return NULL;
+
+ tconn->name = kstrdup(name, GFP_KERNEL);
+ if (!tconn->name)
+ goto fail;
+
+ if (drbd_alloc_socket(&tconn->data))
+ goto fail;
+ if (drbd_alloc_socket(&tconn->meta))
+ goto fail;
+
+ if (!zalloc_cpumask_var(&tconn->cpu_mask, GFP_KERNEL))
+ goto fail;
+
+ if (set_resource_options(tconn, res_opts))
+ goto fail;
+
+ tconn->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
+ if (!tconn->current_epoch)
+ goto fail;
+
+ INIT_LIST_HEAD(&tconn->transfer_log);
+
+ INIT_LIST_HEAD(&tconn->current_epoch->list);
+ tconn->epochs = 1;
+ spin_lock_init(&tconn->epoch_lock);
+ tconn->write_ordering = WO_bdev_flush;
+
+ tconn->send.seen_any_write_yet = false;
+ tconn->send.current_epoch_nr = 0;
+ tconn->send.current_epoch_writes = 0;
+
+ tconn->cstate = C_STANDALONE;
+ mutex_init(&tconn->cstate_mutex);
+ spin_lock_init(&tconn->req_lock);
+ mutex_init(&tconn->conf_update);
+ init_waitqueue_head(&tconn->ping_wait);
+ idr_init(&tconn->volumes);
+
+ drbd_init_workqueue(&tconn->sender_work);
+ mutex_init(&tconn->data.mutex);
+ mutex_init(&tconn->meta.mutex);
+
+ drbd_thread_init(tconn, &tconn->receiver, drbdd_init, "receiver");
+ drbd_thread_init(tconn, &tconn->worker, drbd_worker, "worker");
+ drbd_thread_init(tconn, &tconn->asender, drbd_asender, "asender");
+
+ kref_init(&tconn->kref);
+ list_add_tail_rcu(&tconn->all_tconn, &drbd_tconns);
+
+ return tconn;
+
+fail:
+ kfree(tconn->current_epoch);
+ free_cpumask_var(tconn->cpu_mask);
+ drbd_free_socket(&tconn->meta);
+ drbd_free_socket(&tconn->data);
+ kfree(tconn->name);
+ kfree(tconn);
+
+ return NULL;
+}
+
+void conn_destroy(struct kref *kref)
+{
+ struct drbd_tconn *tconn = container_of(kref, struct drbd_tconn, kref);
+
+ if (atomic_read(&tconn->current_epoch->epoch_size) != 0)
+ conn_err(tconn, "epoch_size:%d\n", atomic_read(&tconn->current_epoch->epoch_size));
+ kfree(tconn->current_epoch);
+
+ idr_destroy(&tconn->volumes);
+
+ free_cpumask_var(tconn->cpu_mask);
+ drbd_free_socket(&tconn->meta);
+ drbd_free_socket(&tconn->data);
+ kfree(tconn->name);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+ kfree(tconn);
+}
+
+enum drbd_ret_code conn_new_minor(struct drbd_tconn *tconn, unsigned int minor, int vnr)
{
struct drbd_conf *mdev;
struct gendisk *disk;
struct request_queue *q;
+ int vnr_got = vnr;
+ int minor_got = minor;
+ enum drbd_ret_code err = ERR_NOMEM;
+
+ mdev = minor_to_mdev(minor);
+ if (mdev)
+ return ERR_MINOR_EXISTS;
/* GFP_KERNEL, we are outside of all write-out paths */
mdev = kzalloc(sizeof(struct drbd_conf), GFP_KERNEL);
if (!mdev)
- return NULL;
- if (!zalloc_cpumask_var(&mdev->cpu_mask, GFP_KERNEL))
- goto out_no_cpumask;
+ return ERR_NOMEM;
+
+ kref_get(&tconn->kref);
+ mdev->tconn = tconn;
mdev->minor = minor;
+ mdev->vnr = vnr;
drbd_init_set_defaults(mdev);
@@ -3644,7 +2649,7 @@
blk_queue_max_hw_sectors(q, DRBD_MAX_BIO_SIZE_SAFE >> 8);
blk_queue_bounce_limit(q, BLK_BOUNCE_ANY);
blk_queue_merge_bvec(q, drbd_merge_bvec);
- q->queue_lock = &mdev->req_lock;
+ q->queue_lock = &mdev->tconn->req_lock; /* needed since we use */
mdev->md_io_page = alloc_page(GFP_KERNEL);
if (!mdev->md_io_page)
@@ -3652,30 +2657,44 @@
if (drbd_bm_init(mdev))
goto out_no_bitmap;
- /* no need to lock access, we are still initializing this minor device. */
- if (!tl_init(mdev))
- goto out_no_tl;
+ mdev->read_requests = RB_ROOT;
+ mdev->write_requests = RB_ROOT;
- mdev->app_reads_hash = kzalloc(APP_R_HSIZE*sizeof(void *), GFP_KERNEL);
- if (!mdev->app_reads_hash)
- goto out_no_app_reads;
+ if (!idr_pre_get(&minors, GFP_KERNEL))
+ goto out_no_minor_idr;
+ if (idr_get_new_above(&minors, mdev, minor, &minor_got))
+ goto out_no_minor_idr;
+ if (minor_got != minor) {
+ err = ERR_MINOR_EXISTS;
+ drbd_msg_put_info("requested minor exists already");
+ goto out_idr_remove_minor;
+ }
- mdev->current_epoch = kzalloc(sizeof(struct drbd_epoch), GFP_KERNEL);
- if (!mdev->current_epoch)
- goto out_no_epoch;
+ if (!idr_pre_get(&tconn->volumes, GFP_KERNEL))
+ goto out_idr_remove_minor;
+ if (idr_get_new_above(&tconn->volumes, mdev, vnr, &vnr_got))
+ goto out_idr_remove_minor;
+ if (vnr_got != vnr) {
+ err = ERR_INVALID_REQUEST;
+ drbd_msg_put_info("requested volume exists already");
+ goto out_idr_remove_vol;
+ }
+ add_disk(disk);
+ kref_init(&mdev->kref); /* one ref for both idrs and the the add_disk */
- INIT_LIST_HEAD(&mdev->current_epoch->list);
- mdev->epochs = 1;
+ /* inherit the connection state */
+ mdev->state.conn = tconn->cstate;
+ if (mdev->state.conn == C_WF_REPORT_PARAMS)
+ drbd_connected(mdev);
- return mdev;
+ return NO_ERROR;
-/* out_whatever_else:
- kfree(mdev->current_epoch); */
-out_no_epoch:
- kfree(mdev->app_reads_hash);
-out_no_app_reads:
- tl_cleanup(mdev);
-out_no_tl:
+out_idr_remove_vol:
+ idr_remove(&tconn->volumes, vnr_got);
+out_idr_remove_minor:
+ idr_remove(&minors, minor_got);
+ synchronize_rcu();
+out_no_minor_idr:
drbd_bm_cleanup(mdev);
out_no_bitmap:
__free_page(mdev->md_io_page);
@@ -3684,55 +2703,25 @@
out_no_disk:
blk_cleanup_queue(q);
out_no_q:
- free_cpumask_var(mdev->cpu_mask);
-out_no_cpumask:
kfree(mdev);
- return NULL;
+ kref_put(&tconn->kref, &conn_destroy);
+ return err;
}
-/* counterpart of drbd_new_device.
- * last part of drbd_delete_device. */
-void drbd_free_mdev(struct drbd_conf *mdev)
-{
- kfree(mdev->current_epoch);
- kfree(mdev->app_reads_hash);
- tl_cleanup(mdev);
- if (mdev->bitmap) /* should no longer be there. */
- drbd_bm_cleanup(mdev);
- __free_page(mdev->md_io_page);
- put_disk(mdev->vdisk);
- blk_cleanup_queue(mdev->rq_queue);
- free_cpumask_var(mdev->cpu_mask);
- drbd_free_tl_hash(mdev);
- kfree(mdev);
-}
-
-
int __init drbd_init(void)
{
int err;
- if (sizeof(struct p_handshake) != 80) {
- printk(KERN_ERR
- "drbd: never change the size or layout "
- "of the HandShake packet.\n");
- return -EINVAL;
- }
-
if (minor_count < DRBD_MINOR_COUNT_MIN || minor_count > DRBD_MINOR_COUNT_MAX) {
printk(KERN_ERR
- "drbd: invalid minor_count (%d)\n", minor_count);
+ "drbd: invalid minor_count (%d)\n", minor_count);
#ifdef MODULE
return -EINVAL;
#else
- minor_count = 8;
+ minor_count = DRBD_MINOR_COUNT_DEF;
#endif
}
- err = drbd_nl_init();
- if (err)
- return err;
-
err = register_blkdev(DRBD_MAJOR, "drbd");
if (err) {
printk(KERN_ERR
@@ -3741,6 +2730,13 @@
return err;
}
+ err = drbd_genl_register();
+ if (err) {
+ printk(KERN_ERR "drbd: unable to register generic netlink family\n");
+ goto fail;
+ }
+
+
register_reboot_notifier(&drbd_notifier);
/*
@@ -3751,22 +2747,29 @@
init_waitqueue_head(&drbd_pp_wait);
drbd_proc = NULL; /* play safe for drbd_cleanup */
- minor_table = kzalloc(sizeof(struct drbd_conf *)*minor_count,
- GFP_KERNEL);
- if (!minor_table)
- goto Enomem;
+ idr_init(&minors);
err = drbd_create_mempools();
if (err)
- goto Enomem;
+ goto fail;
drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
if (!drbd_proc) {
printk(KERN_ERR "drbd: unable to register proc file\n");
- goto Enomem;
+ goto fail;
}
rwlock_init(&global_state_lock);
+ INIT_LIST_HEAD(&drbd_tconns);
+
+ retry.wq = create_singlethread_workqueue("drbd-reissue");
+ if (!retry.wq) {
+ printk(KERN_ERR "drbd: unable to create retry workqueue\n");
+ goto fail;
+ }
+ INIT_WORK(&retry.worker, do_retry);
+ spin_lock_init(&retry.lock);
+ INIT_LIST_HEAD(&retry.writes);
printk(KERN_INFO "drbd: initialized. "
"Version: " REL_VERSION " (api:%d/proto:%d-%d)\n",
@@ -3774,11 +2777,10 @@
printk(KERN_INFO "drbd: %s\n", drbd_buildtag());
printk(KERN_INFO "drbd: registered as block device major %d\n",
DRBD_MAJOR);
- printk(KERN_INFO "drbd: minor_table @ 0x%p\n", minor_table);
return 0; /* Success! */
-Enomem:
+fail:
drbd_cleanup();
if (err == -ENOMEM)
/* currently always the case */
@@ -3799,47 +2801,42 @@
kfree(ldev);
}
-void drbd_free_sock(struct drbd_conf *mdev)
+void drbd_free_sock(struct drbd_tconn *tconn)
{
- if (mdev->data.socket) {
- mutex_lock(&mdev->data.mutex);
- kernel_sock_shutdown(mdev->data.socket, SHUT_RDWR);
- sock_release(mdev->data.socket);
- mdev->data.socket = NULL;
- mutex_unlock(&mdev->data.mutex);
+ if (tconn->data.socket) {
+ mutex_lock(&tconn->data.mutex);
+ kernel_sock_shutdown(tconn->data.socket, SHUT_RDWR);
+ sock_release(tconn->data.socket);
+ tconn->data.socket = NULL;
+ mutex_unlock(&tconn->data.mutex);
}
- if (mdev->meta.socket) {
- mutex_lock(&mdev->meta.mutex);
- kernel_sock_shutdown(mdev->meta.socket, SHUT_RDWR);
- sock_release(mdev->meta.socket);
- mdev->meta.socket = NULL;
- mutex_unlock(&mdev->meta.mutex);
+ if (tconn->meta.socket) {
+ mutex_lock(&tconn->meta.mutex);
+ kernel_sock_shutdown(tconn->meta.socket, SHUT_RDWR);
+ sock_release(tconn->meta.socket);
+ tconn->meta.socket = NULL;
+ mutex_unlock(&tconn->meta.mutex);
}
}
-
-void drbd_free_resources(struct drbd_conf *mdev)
-{
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = NULL;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = NULL;
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = NULL;
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = NULL;
-
- drbd_free_sock(mdev);
-
- __no_warn(local,
- drbd_free_bc(mdev->ldev);
- mdev->ldev = NULL;);
-}
-
/* meta data management */
+void conn_md_sync(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_md_sync(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
struct meta_data_on_disk {
u64 la_size; /* last agreed size. */
u64 uuid[UI_SIZE]; /* UUIDs. */
@@ -3850,7 +2847,7 @@
u32 md_size_sect;
u32 al_offset; /* offset to this block */
u32 al_nr_extents; /* important for restoring the AL */
- /* `-- act_log->nr_elements <-- sync_conf.al_extents */
+ /* `-- act_log->nr_elements <-- ldev->dc.al_extents */
u32 bm_offset; /* offset to the bitmap, from here */
u32 bm_bytes_per_bit; /* BM_BLOCK_SIZE */
u32 la_peer_max_bio_size; /* last peer max_bio_size */
@@ -3870,7 +2867,7 @@
del_timer(&mdev->md_sync_timer);
/* timer may be rearmed by drbd_md_mark_dirty() now. */
- if (!drbd_test_and_clear_flag(mdev, MD_DIRTY))
+ if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
return;
/* We use here D_FAILED and not D_ATTACHING because we try to write
@@ -3888,7 +2885,7 @@
for (i = UI_CURRENT; i < UI_SIZE; i++)
buffer->uuid[i] = cpu_to_be64(mdev->ldev->md.uuid[i]);
buffer->flags = cpu_to_be32(mdev->ldev->md.flags);
- buffer->magic = cpu_to_be32(DRBD_MD_MAGIC);
+ buffer->magic = cpu_to_be32(DRBD_MD_MAGIC_84_UNCLEAN);
buffer->md_size_sect = cpu_to_be32(mdev->ldev->md.md_size_sect);
buffer->al_offset = cpu_to_be32(mdev->ldev->md.al_offset);
@@ -3902,7 +2899,7 @@
D_ASSERT(drbd_md_ss__(mdev, mdev->ldev) == mdev->ldev->md.md_offset);
sector = mdev->ldev->md.md_offset;
- if (!drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
+ if (drbd_md_sync_page_io(mdev, mdev->ldev, sector, WRITE)) {
/* this was a try anyways ... */
dev_err(DEV, "meta data update failed!\n");
drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
@@ -3923,11 +2920,12 @@
* @bdev: Device from which the meta data should be read in.
*
* Return 0 (NO_ERROR) on success, and an enum drbd_ret_code in case
- * something goes wrong. Currently only: ERR_IO_MD_DISK, ERR_MD_INVALID.
+ * something goes wrong.
*/
int drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev)
{
struct meta_data_on_disk *buffer;
+ u32 magic, flags;
int i, rv = NO_ERROR;
if (!get_ldev_if_state(mdev, D_ATTACHING))
@@ -3937,7 +2935,7 @@
if (!buffer)
goto out;
- if (!drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
+ if (drbd_md_sync_page_io(mdev, bdev, bdev->md.md_offset, READ)) {
/* NOTE: can't do normal error processing here as this is
called BEFORE disk is attached */
dev_err(DEV, "Error while reading metadata.\n");
@@ -3945,8 +2943,20 @@
goto err;
}
- if (be32_to_cpu(buffer->magic) != DRBD_MD_MAGIC) {
- dev_err(DEV, "Error while reading metadata, magic not found.\n");
+ magic = be32_to_cpu(buffer->magic);
+ flags = be32_to_cpu(buffer->flags);
+ if (magic == DRBD_MD_MAGIC_84_UNCLEAN ||
+ (magic == DRBD_MD_MAGIC_08 && !(flags & MDF_AL_CLEAN))) {
+ /* btw: that's Activity Log clean, not "all" clean. */
+ dev_err(DEV, "Found unclean meta data. Did you \"drbdadm apply-al\"?\n");
+ rv = ERR_MD_UNCLEAN;
+ goto err;
+ }
+ if (magic != DRBD_MD_MAGIC_08) {
+ if (magic == DRBD_MD_MAGIC_07)
+ dev_err(DEV, "Found old (0.7) meta data magic. Did you \"drbdadm create-md\"?\n");
+ else
+ dev_err(DEV, "Meta data magic not found. Did you \"drbdadm create-md\"?\n");
rv = ERR_MD_INVALID;
goto err;
}
@@ -3980,20 +2990,16 @@
for (i = UI_CURRENT; i < UI_SIZE; i++)
bdev->md.uuid[i] = be64_to_cpu(buffer->uuid[i]);
bdev->md.flags = be32_to_cpu(buffer->flags);
- mdev->sync_conf.al_extents = be32_to_cpu(buffer->al_nr_extents);
bdev->md.device_uuid = be64_to_cpu(buffer->device_uuid);
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED) {
unsigned int peer;
peer = be32_to_cpu(buffer->la_peer_max_bio_size);
peer = max(peer, DRBD_MAX_BIO_SIZE_SAFE);
mdev->peer_max_bio_size = peer;
}
- spin_unlock_irq(&mdev->req_lock);
-
- if (mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
+ spin_unlock_irq(&mdev->tconn->req_lock);
err:
drbd_md_put_buffer(mdev);
@@ -4014,7 +3020,7 @@
#ifdef DEBUG
void drbd_md_mark_dirty_(struct drbd_conf *mdev, unsigned int line, const char *func)
{
- if (!drbd_test_and_set_flag(mdev, MD_DIRTY)) {
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags)) {
mod_timer(&mdev->md_sync_timer, jiffies + HZ);
mdev->last_md_mark_dirty.line = line;
mdev->last_md_mark_dirty.func = func;
@@ -4023,7 +3029,7 @@
#else
void drbd_md_mark_dirty(struct drbd_conf *mdev)
{
- if (!drbd_test_and_set_flag(mdev, MD_DIRTY))
+ if (!test_and_set_bit(MD_DIRTY, &mdev->flags))
mod_timer(&mdev->md_sync_timer, jiffies + 5*HZ);
}
#endif
@@ -4171,9 +3177,10 @@
return rv;
}
-static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_bitmap_io(struct drbd_work *w, int unused)
{
struct bm_io_work *work = container_of(w, struct bm_io_work, w);
+ struct drbd_conf *mdev = w->mdev;
int rv = -EIO;
D_ASSERT(atomic_read(&mdev->ap_bio_cnt) == 0);
@@ -4185,18 +3192,17 @@
put_ldev(mdev);
}
- drbd_clear_flag(mdev, BITMAP_IO);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(BITMAP_IO, &mdev->flags);
wake_up(&mdev->misc_wait);
if (work->done)
work->done(mdev, rv);
- drbd_clear_flag(mdev, BITMAP_IO_QUEUED);
+ clear_bit(BITMAP_IO_QUEUED, &mdev->flags);
work->why = NULL;
work->flags = 0;
- return 1;
+ return 0;
}
void drbd_ldev_destroy(struct drbd_conf *mdev)
@@ -4209,15 +3215,13 @@
drbd_free_bc(mdev->ldev);
mdev->ldev = NULL;);
- if (mdev->md_io_tmpp) {
- __free_page(mdev->md_io_tmpp);
- mdev->md_io_tmpp = NULL;
- }
- drbd_clear_flag(mdev, GO_DISKLESS);
+ clear_bit(GO_DISKLESS, &mdev->flags);
}
-static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_go_diskless(struct drbd_work *w, int unused)
{
+ struct drbd_conf *mdev = w->mdev;
+
D_ASSERT(mdev->state.disk == D_FAILED);
/* we cannot assert local_cnt == 0 here, as get_ldev_if_state will
* inc/dec it frequently. Once we are D_DISKLESS, no one will touch
@@ -4232,11 +3236,15 @@
* (Do we want a specific meta data flag for this?)
*
* If that does not make it to stable storage either,
- * we cannot do anything about that anymore. */
- if (mdev->bitmap) {
+ * we cannot do anything about that anymore.
+ *
+ * We still need to check if both bitmap and ldev are present, we may
+ * end up here after a failed attach, before ldev was even assigned.
+ */
+ if (mdev->bitmap && mdev->ldev) {
if (drbd_bitmap_io_from_worker(mdev, drbd_bm_write,
"detach", BM_LOCKED_MASK)) {
- if (drbd_test_flag(mdev, WAS_READ_ERROR)) {
+ if (test_bit(WAS_READ_ERROR, &mdev->flags)) {
drbd_md_set_flag(mdev, MDF_FULL_SYNC);
drbd_md_sync(mdev);
}
@@ -4244,14 +3252,14 @@
}
drbd_force_state(mdev, NS(disk, D_DISKLESS));
- return 1;
+ return 0;
}
void drbd_go_diskless(struct drbd_conf *mdev)
{
D_ASSERT(mdev->state.disk == D_FAILED);
- if (!drbd_test_and_set_flag(mdev, GO_DISKLESS))
- drbd_queue_work(&mdev->data.work, &mdev->go_diskless);
+ if (!test_and_set_bit(GO_DISKLESS, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->go_diskless);
}
/**
@@ -4271,10 +3279,10 @@
void (*done)(struct drbd_conf *, int),
char *why, enum bm_flag flags)
{
- D_ASSERT(current == mdev->worker.task);
+ D_ASSERT(current == mdev->tconn->worker.task);
- D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO_QUEUED));
- D_ASSERT(!drbd_test_flag(mdev, BITMAP_IO));
+ D_ASSERT(!test_bit(BITMAP_IO_QUEUED, &mdev->flags));
+ D_ASSERT(!test_bit(BITMAP_IO, &mdev->flags));
D_ASSERT(list_empty(&mdev->bm_io_work.w.list));
if (mdev->bm_io_work.why)
dev_err(DEV, "FIXME going to queue '%s' but '%s' still pending?\n",
@@ -4285,13 +3293,13 @@
mdev->bm_io_work.why = why;
mdev->bm_io_work.flags = flags;
- spin_lock_irq(&mdev->req_lock);
- drbd_set_flag(mdev, BITMAP_IO);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ set_bit(BITMAP_IO, &mdev->flags);
if (atomic_read(&mdev->ap_bio_cnt) == 0) {
- if (!drbd_test_and_set_flag(mdev, BITMAP_IO_QUEUED))
- drbd_queue_work(&mdev->data.work, &mdev->bm_io_work.w);
+ if (!test_and_set_bit(BITMAP_IO_QUEUED, &mdev->flags))
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->bm_io_work.w);
}
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
/**
@@ -4308,7 +3316,7 @@
{
int rv;
- D_ASSERT(current != mdev->worker.task);
+ D_ASSERT(current != mdev->tconn->worker.task);
if ((flags & BM_LOCKED_SET_ALLOWED) == 0)
drbd_suspend_io(mdev);
@@ -4347,18 +3355,127 @@
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
- drbd_queue_work_front(&mdev->data.work, &mdev->md_sync_work);
+ /* must not double-queue! */
+ if (list_empty(&mdev->md_sync_work.list))
+ drbd_queue_work_front(&mdev->tconn->sender_work, &mdev->md_sync_work);
}
-static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int w_md_sync(struct drbd_work *w, int unused)
{
+ struct drbd_conf *mdev = w->mdev;
+
dev_warn(DEV, "md_sync_timer expired! Worker calls drbd_md_sync().\n");
#ifdef DEBUG
dev_warn(DEV, "last md_mark_dirty: %s:%u\n",
mdev->last_md_mark_dirty.func, mdev->last_md_mark_dirty.line);
#endif
drbd_md_sync(mdev);
- return 1;
+ return 0;
+}
+
+const char *cmdname(enum drbd_packet cmd)
+{
+ /* THINK may need to become several global tables
+ * when we want to support more than
+ * one PRO_VERSION */
+ static const char *cmdnames[] = {
+ [P_DATA] = "Data",
+ [P_DATA_REPLY] = "DataReply",
+ [P_RS_DATA_REPLY] = "RSDataReply",
+ [P_BARRIER] = "Barrier",
+ [P_BITMAP] = "ReportBitMap",
+ [P_BECOME_SYNC_TARGET] = "BecomeSyncTarget",
+ [P_BECOME_SYNC_SOURCE] = "BecomeSyncSource",
+ [P_UNPLUG_REMOTE] = "UnplugRemote",
+ [P_DATA_REQUEST] = "DataRequest",
+ [P_RS_DATA_REQUEST] = "RSDataRequest",
+ [P_SYNC_PARAM] = "SyncParam",
+ [P_SYNC_PARAM89] = "SyncParam89",
+ [P_PROTOCOL] = "ReportProtocol",
+ [P_UUIDS] = "ReportUUIDs",
+ [P_SIZES] = "ReportSizes",
+ [P_STATE] = "ReportState",
+ [P_SYNC_UUID] = "ReportSyncUUID",
+ [P_AUTH_CHALLENGE] = "AuthChallenge",
+ [P_AUTH_RESPONSE] = "AuthResponse",
+ [P_PING] = "Ping",
+ [P_PING_ACK] = "PingAck",
+ [P_RECV_ACK] = "RecvAck",
+ [P_WRITE_ACK] = "WriteAck",
+ [P_RS_WRITE_ACK] = "RSWriteAck",
+ [P_SUPERSEDED] = "Superseded",
+ [P_NEG_ACK] = "NegAck",
+ [P_NEG_DREPLY] = "NegDReply",
+ [P_NEG_RS_DREPLY] = "NegRSDReply",
+ [P_BARRIER_ACK] = "BarrierAck",
+ [P_STATE_CHG_REQ] = "StateChgRequest",
+ [P_STATE_CHG_REPLY] = "StateChgReply",
+ [P_OV_REQUEST] = "OVRequest",
+ [P_OV_REPLY] = "OVReply",
+ [P_OV_RESULT] = "OVResult",
+ [P_CSUM_RS_REQUEST] = "CsumRSRequest",
+ [P_RS_IS_IN_SYNC] = "CsumRSIsInSync",
+ [P_COMPRESSED_BITMAP] = "CBitmap",
+ [P_DELAY_PROBE] = "DelayProbe",
+ [P_OUT_OF_SYNC] = "OutOfSync",
+ [P_RETRY_WRITE] = "RetryWrite",
+ [P_RS_CANCEL] = "RSCancel",
+ [P_CONN_ST_CHG_REQ] = "conn_st_chg_req",
+ [P_CONN_ST_CHG_REPLY] = "conn_st_chg_reply",
+ [P_RETRY_WRITE] = "retry_write",
+ [P_PROTOCOL_UPDATE] = "protocol_update",
+
+ /* enum drbd_packet, but not commands - obsoleted flags:
+ * P_MAY_IGNORE
+ * P_MAX_OPT_CMD
+ */
+ };
+
+ /* too big for the array: 0xfffX */
+ if (cmd == P_INITIAL_META)
+ return "InitialMeta";
+ if (cmd == P_INITIAL_DATA)
+ return "InitialData";
+ if (cmd == P_CONNECTION_FEATURES)
+ return "ConnectionFeatures";
+ if (cmd >= ARRAY_SIZE(cmdnames))
+ return "Unknown";
+ return cmdnames[cmd];
+}
+
+/**
+ * drbd_wait_misc - wait for a request to make progress
+ * @mdev: device associated with the request
+ * @i: the struct drbd_interval embedded in struct drbd_request or
+ * struct drbd_peer_request
+ */
+int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
+{
+ struct net_conf *nc;
+ DEFINE_WAIT(wait);
+ long timeout;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -ETIMEDOUT;
+ }
+ timeout = nc->ko_count ? nc->timeout * HZ / 10 * nc->ko_count : MAX_SCHEDULE_TIMEOUT;
+ rcu_read_unlock();
+
+ /* Indicate to wake up mdev->misc_wait on progress. */
+ i->waiting = true;
+ prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ timeout = schedule_timeout(timeout);
+ finish_wait(&mdev->misc_wait, &wait);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (!timeout || mdev->state.conn < C_CONNECTED)
+ return -ETIMEDOUT;
+ if (signal_pending(current))
+ return -ERESTARTSYS;
+ return 0;
}
#ifdef CONFIG_DRBD_FAULT_INJECTION
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index c8dda4e..76bb3a6 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -29,159 +29,317 @@
#include <linux/fs.h>
#include <linux/file.h>
#include <linux/slab.h>
-#include <linux/connector.h>
#include <linux/blkpg.h>
#include <linux/cpumask.h>
#include "drbd_int.h"
#include "drbd_req.h"
#include "drbd_wrappers.h"
#include <asm/unaligned.h>
-#include <linux/drbd_tag_magic.h>
#include <linux/drbd_limits.h>
-#include <linux/compiler.h>
#include <linux/kthread.h>
-static unsigned short *tl_add_blob(unsigned short *, enum drbd_tags, const void *, int);
-static unsigned short *tl_add_str(unsigned short *, enum drbd_tags, const char *);
-static unsigned short *tl_add_int(unsigned short *, enum drbd_tags, const void *);
+#include <net/genetlink.h>
-/* see get_sb_bdev and bd_claim */
+/* .doit */
+// int drbd_adm_create_resource(struct sk_buff *skb, struct genl_info *info);
+// int drbd_adm_delete_resource(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info);
+
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info);
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info);
+/* .dumpit */
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb);
+
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+#include <linux/genl_magic_func.h>
+
+/* used blkdev_get_by_path, to claim our meta data device(s) */
static char *drbd_m_holder = "Hands off! this is DRBD's meta data device.";
-/* Generate the tag_list to struct functions */
-#define NL_PACKET(name, number, fields) \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) __attribute__ ((unused)); \
-static int name ## _from_tags(struct drbd_conf *mdev, \
- unsigned short *tags, struct name *arg) \
-{ \
- int tag; \
- int dlen; \
- \
- while ((tag = get_unaligned(tags++)) != TT_END) { \
- dlen = get_unaligned(tags++); \
- switch (tag_number(tag)) { \
- fields \
- default: \
- if (tag & T_MANDATORY) { \
- dev_err(DEV, "Unknown tag: %d\n", tag_number(tag)); \
- return 0; \
- } \
- } \
- tags = (unsigned short *)((char *)tags + dlen); \
- } \
- return 1; \
-}
-#define NL_INTEGER(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INTEGER ); */ \
- arg->member = get_unaligned((int *)(tags)); \
- break;
-#define NL_INT64(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_INT64 ); */ \
- arg->member = get_unaligned((u64 *)(tags)); \
- break;
-#define NL_BIT(pn, pr, member) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_BIT ); */ \
- arg->member = *(char *)(tags) ? 1 : 0; \
- break;
-#define NL_STRING(pn, pr, member, len) \
- case pn: /* D_ASSERT( tag_type(tag) == TT_STRING ); */ \
- if (dlen > len) { \
- dev_err(DEV, "arg too long: %s (%u wanted, max len: %u bytes)\n", \
- #member, dlen, (unsigned int)len); \
- return 0; \
- } \
- arg->member ## _len = dlen; \
- memcpy(arg->member, tags, min_t(size_t, dlen, len)); \
- break;
-#include <linux/drbd_nl.h>
+/* Configuration is strictly serialized, because generic netlink message
+ * processing is strictly serialized by the genl_lock().
+ * Which means we can use one static global drbd_config_context struct.
+ */
+static struct drbd_config_context {
+ /* assigned from drbd_genlmsghdr */
+ unsigned int minor;
+ /* assigned from request attributes, if present */
+ unsigned int volume;
+#define VOLUME_UNSPECIFIED (-1U)
+ /* pointer into the request skb,
+ * limited lifetime! */
+ char *resource_name;
+ struct nlattr *my_addr;
+ struct nlattr *peer_addr;
-/* Generate the struct to tag_list functions */
-#define NL_PACKET(name, number, fields) \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) __attribute__ ((unused)); \
-static unsigned short* \
-name ## _to_tags(struct drbd_conf *mdev, \
- struct name *arg, unsigned short *tags) \
-{ \
- fields \
- return tags; \
+ /* reply buffer */
+ struct sk_buff *reply_skb;
+ /* pointer into reply buffer */
+ struct drbd_genlmsghdr *reply_dh;
+ /* resolved from attributes, if possible */
+ struct drbd_conf *mdev;
+ struct drbd_tconn *tconn;
+} adm_ctx;
+
+static void drbd_adm_send_reply(struct sk_buff *skb, struct genl_info *info)
+{
+ genlmsg_end(skb, genlmsg_data(nlmsg_data(nlmsg_hdr(skb))));
+ if (genlmsg_reply(skb, info))
+ printk(KERN_ERR "drbd: error sending genl reply\n");
}
-#define NL_INTEGER(pn, pr, member) \
- put_unaligned(pn | pr | TT_INTEGER, tags++); \
- put_unaligned(sizeof(int), tags++); \
- put_unaligned(arg->member, (int *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(int));
-#define NL_INT64(pn, pr, member) \
- put_unaligned(pn | pr | TT_INT64, tags++); \
- put_unaligned(sizeof(u64), tags++); \
- put_unaligned(arg->member, (u64 *)tags); \
- tags = (unsigned short *)((char *)tags+sizeof(u64));
-#define NL_BIT(pn, pr, member) \
- put_unaligned(pn | pr | TT_BIT, tags++); \
- put_unaligned(sizeof(char), tags++); \
- *(char *)tags = arg->member; \
- tags = (unsigned short *)((char *)tags+sizeof(char));
-#define NL_STRING(pn, pr, member, len) \
- put_unaligned(pn | pr | TT_STRING, tags++); \
- put_unaligned(arg->member ## _len, tags++); \
- memcpy(tags, arg->member, arg->member ## _len); \
- tags = (unsigned short *)((char *)tags + arg->member ## _len);
-#include <linux/drbd_nl.h>
+/* Used on a fresh "drbd_adm_prepare"d reply_skb, this cannot fail: The only
+ * reason it could fail was no space in skb, and there are 4k available. */
+int drbd_msg_put_info(const char *info)
+{
+ struct sk_buff *skb = adm_ctx.reply_skb;
+ struct nlattr *nla;
+ int err = -EMSGSIZE;
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name);
-void drbd_nl_send_reply(struct cn_msg *, int);
+ if (!info || !info[0])
+ return 0;
+
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_REPLY);
+ if (!nla)
+ return err;
+
+ err = nla_put_string(skb, T_info_text, info);
+ if (err) {
+ nla_nest_cancel(skb, nla);
+ return err;
+ } else
+ nla_nest_end(skb, nla);
+ return 0;
+}
+
+/* This would be a good candidate for a "pre_doit" hook,
+ * and per-family private info->pointers.
+ * But we need to stay compatible with older kernels.
+ * If it returns successfully, adm_ctx members are valid.
+ */
+#define DRBD_ADM_NEED_MINOR 1
+#define DRBD_ADM_NEED_RESOURCE 2
+#define DRBD_ADM_NEED_CONNECTION 4
+static int drbd_adm_prepare(struct sk_buff *skb, struct genl_info *info,
+ unsigned flags)
+{
+ struct drbd_genlmsghdr *d_in = info->userhdr;
+ const u8 cmd = info->genlhdr->cmd;
+ int err;
+
+ memset(&adm_ctx, 0, sizeof(adm_ctx));
+
+ /* genl_rcv_msg only checks for CAP_NET_ADMIN on "GENL_ADMIN_PERM" :( */
+ if (cmd != DRBD_ADM_GET_STATUS && !capable(CAP_NET_ADMIN))
+ return -EPERM;
+
+ adm_ctx.reply_skb = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!adm_ctx.reply_skb) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh = genlmsg_put_reply(adm_ctx.reply_skb,
+ info, &drbd_genl_family, 0, cmd);
+ /* put of a few bytes into a fresh skb of >= 4k will always succeed.
+ * but anyways */
+ if (!adm_ctx.reply_dh) {
+ err = -ENOMEM;
+ goto fail;
+ }
+
+ adm_ctx.reply_dh->minor = d_in->minor;
+ adm_ctx.reply_dh->ret_code = NO_ERROR;
+
+ adm_ctx.volume = VOLUME_UNSPECIFIED;
+ if (info->attrs[DRBD_NLA_CFG_CONTEXT]) {
+ struct nlattr *nla;
+ /* parse and validate only */
+ err = drbd_cfg_context_from_attrs(NULL, info);
+ if (err)
+ goto fail;
+
+ /* It was present, and valid,
+ * copy it over to the reply skb. */
+ err = nla_put_nohdr(adm_ctx.reply_skb,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]->nla_len,
+ info->attrs[DRBD_NLA_CFG_CONTEXT]);
+ if (err)
+ goto fail;
+
+ /* and assign stuff to the global adm_ctx */
+ nla = nested_attr_tb[__nla_type(T_ctx_volume)];
+ if (nla)
+ adm_ctx.volume = nla_get_u32(nla);
+ nla = nested_attr_tb[__nla_type(T_ctx_resource_name)];
+ if (nla)
+ adm_ctx.resource_name = nla_data(nla);
+ adm_ctx.my_addr = nested_attr_tb[__nla_type(T_ctx_my_addr)];
+ adm_ctx.peer_addr = nested_attr_tb[__nla_type(T_ctx_peer_addr)];
+ if ((adm_ctx.my_addr &&
+ nla_len(adm_ctx.my_addr) > sizeof(adm_ctx.tconn->my_addr)) ||
+ (adm_ctx.peer_addr &&
+ nla_len(adm_ctx.peer_addr) > sizeof(adm_ctx.tconn->peer_addr))) {
+ err = -EINVAL;
+ goto fail;
+ }
+ }
+
+ adm_ctx.minor = d_in->minor;
+ adm_ctx.mdev = minor_to_mdev(d_in->minor);
+ adm_ctx.tconn = conn_get_by_name(adm_ctx.resource_name);
+
+ if (!adm_ctx.mdev && (flags & DRBD_ADM_NEED_MINOR)) {
+ drbd_msg_put_info("unknown minor");
+ return ERR_MINOR_INVALID;
+ }
+ if (!adm_ctx.tconn && (flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("unknown resource");
+ return ERR_INVALID_REQUEST;
+ }
+
+ if (flags & DRBD_ADM_NEED_CONNECTION) {
+ if (adm_ctx.tconn && !(flags & DRBD_ADM_NEED_RESOURCE)) {
+ drbd_msg_put_info("no resource name expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev) {
+ drbd_msg_put_info("no minor number expected");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.my_addr && adm_ctx.peer_addr)
+ adm_ctx.tconn = conn_get_by_addrs(nla_data(adm_ctx.my_addr),
+ nla_len(adm_ctx.my_addr),
+ nla_data(adm_ctx.peer_addr),
+ nla_len(adm_ctx.peer_addr));
+ if (!adm_ctx.tconn) {
+ drbd_msg_put_info("unknown connection");
+ return ERR_INVALID_REQUEST;
+ }
+ }
+
+ /* some more paranoia, if the request was over-determined */
+ if (adm_ctx.mdev && adm_ctx.tconn &&
+ adm_ctx.mdev->tconn != adm_ctx.tconn) {
+ pr_warning("request: minor=%u, resource=%s; but that minor belongs to connection %s\n",
+ adm_ctx.minor, adm_ctx.resource_name,
+ adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists in different resource");
+ return ERR_INVALID_REQUEST;
+ }
+ if (adm_ctx.mdev &&
+ adm_ctx.volume != VOLUME_UNSPECIFIED &&
+ adm_ctx.volume != adm_ctx.mdev->vnr) {
+ pr_warning("request: minor=%u, volume=%u; but that minor is volume %u in %s\n",
+ adm_ctx.minor, adm_ctx.volume,
+ adm_ctx.mdev->vnr, adm_ctx.mdev->tconn->name);
+ drbd_msg_put_info("minor exists as different volume");
+ return ERR_INVALID_REQUEST;
+ }
+
+ return NO_ERROR;
+
+fail:
+ nlmsg_free(adm_ctx.reply_skb);
+ adm_ctx.reply_skb = NULL;
+ return err;
+}
+
+static int drbd_adm_finish(struct genl_info *info, int retcode)
+{
+ if (adm_ctx.tconn) {
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+ adm_ctx.tconn = NULL;
+ }
+
+ if (!adm_ctx.reply_skb)
+ return -ENOMEM;
+
+ adm_ctx.reply_dh->ret_code = retcode;
+ drbd_adm_send_reply(adm_ctx.reply_skb, info);
+ return 0;
+}
+
+static void setup_khelper_env(struct drbd_tconn *tconn, char **envp)
+{
+ char *afs;
+
+ /* FIXME: A future version will not allow this case. */
+ if (tconn->my_addr_len == 0 || tconn->peer_addr_len == 0)
+ return;
+
+ switch (((struct sockaddr *)&tconn->peer_addr)->sa_family) {
+ case AF_INET6:
+ afs = "ipv6";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI6",
+ &((struct sockaddr_in6 *)&tconn->peer_addr)->sin6_addr);
+ break;
+ case AF_INET:
+ afs = "ipv4";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+ break;
+ default:
+ afs = "ssocks";
+ snprintf(envp[4], 60, "DRBD_PEER_ADDRESS=%pI4",
+ &((struct sockaddr_in *)&tconn->peer_addr)->sin_addr);
+ }
+ snprintf(envp[3], 20, "DRBD_PEER_AF=%s", afs);
+}
int drbd_khelper(struct drbd_conf *mdev, char *cmd)
{
char *envp[] = { "HOME=/",
"TERM=linux",
"PATH=/sbin:/usr/sbin:/bin:/usr/bin",
- NULL, /* Will be set to address family */
- NULL, /* Will be set to address */
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
NULL };
-
- char mb[12], af[20], ad[60], *afs;
+ char mb[12];
char *argv[] = {usermode_helper, cmd, mb, NULL };
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct sib_info sib;
int ret;
- if (current == mdev->worker.task)
- drbd_set_flag(mdev, CALLBACK_PENDING);
+ if (current == tconn->worker.task)
+ set_bit(CALLBACK_PENDING, &tconn->flags);
snprintf(mb, 12, "minor-%d", mdev_to_minor(mdev));
-
- if (get_net_conf(mdev)) {
- switch (((struct sockaddr *)mdev->net_conf->peer_addr)->sa_family) {
- case AF_INET6:
- afs = "ipv6";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI6",
- &((struct sockaddr_in6 *)mdev->net_conf->peer_addr)->sin6_addr);
- break;
- case AF_INET:
- afs = "ipv4";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- break;
- default:
- afs = "ssocks";
- snprintf(ad, 60, "DRBD_PEER_ADDRESS=%pI4",
- &((struct sockaddr_in *)mdev->net_conf->peer_addr)->sin_addr);
- }
- snprintf(af, 20, "DRBD_PEER_AF=%s", afs);
- envp[3]=af;
- envp[4]=ad;
- put_net_conf(mdev);
- }
+ setup_khelper_env(tconn, envp);
/* The helper may take some time.
* write out any unsynced meta data changes now */
drbd_md_sync(mdev);
dev_info(DEV, "helper command: %s %s %s\n", usermode_helper, cmd, mb);
-
- drbd_bcast_ev_helper(mdev, cmd);
+ sib.sib_reason = SIB_HELPER_PRE;
+ sib.helper_name = cmd;
+ drbd_bcast_event(mdev, &sib);
ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
if (ret)
dev_warn(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
@@ -191,9 +349,12 @@
dev_info(DEV, "helper command: %s %s %s exit code %u (0x%x)\n",
usermode_helper, cmd, mb,
(ret >> 8) & 0xff, ret);
+ sib.sib_reason = SIB_HELPER_POST;
+ sib.helper_exit_code = ret;
+ drbd_bcast_event(mdev, &sib);
- if (current == mdev->worker.task)
- drbd_clear_flag(mdev, CALLBACK_PENDING);
+ if (current == tconn->worker.task)
+ clear_bit(CALLBACK_PENDING, &tconn->flags);
if (ret < 0) /* Ignore any ERRNOs we got. */
ret = 0;
@@ -201,116 +362,163 @@
return ret;
}
-enum drbd_disk_state drbd_try_outdate_peer(struct drbd_conf *mdev)
+int conn_khelper(struct drbd_tconn *tconn, char *cmd)
{
+ char *envp[] = { "HOME=/",
+ "TERM=linux",
+ "PATH=/sbin:/usr/sbin:/bin:/usr/bin",
+ (char[20]) { }, /* address family */
+ (char[60]) { }, /* address */
+ NULL };
+ char *argv[] = {usermode_helper, cmd, tconn->name, NULL };
+ int ret;
+
+ setup_khelper_env(tconn, envp);
+ conn_md_sync(tconn);
+
+ conn_info(tconn, "helper command: %s %s %s\n", usermode_helper, cmd, tconn->name);
+ /* TODO: conn_bcast_event() ?? */
+
+ ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC);
+ if (ret)
+ conn_warn(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ else
+ conn_info(tconn, "helper command: %s %s %s exit code %u (0x%x)\n",
+ usermode_helper, cmd, tconn->name,
+ (ret >> 8) & 0xff, ret);
+ /* TODO: conn_bcast_event() ?? */
+
+ if (ret < 0) /* Ignore any ERRNOs we got. */
+ ret = 0;
+
+ return ret;
+}
+
+static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
+{
+ enum drbd_fencing_p fp = FP_NOT_AVAIL;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (get_ldev_if_state(mdev, D_CONSISTENT)) {
+ fp = max_t(enum drbd_fencing_p, fp,
+ rcu_dereference(mdev->ldev->disk_conf)->fencing);
+ put_ldev(mdev);
+ }
+ }
+ rcu_read_unlock();
+
+ return fp;
+}
+
+bool conn_try_outdate_peer(struct drbd_tconn *tconn)
+{
+ union drbd_state mask = { };
+ union drbd_state val = { };
+ enum drbd_fencing_p fp;
char *ex_to_string;
int r;
- enum drbd_disk_state nps;
- enum drbd_fencing_p fp;
- D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
-
- if (get_ldev_if_state(mdev, D_CONSISTENT)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- } else {
- dev_warn(DEV, "Not fencing peer, I'm not even Consistent myself.\n");
- nps = mdev->state.pdsk;
- goto out;
+ if (tconn->cstate >= C_WF_REPORT_PARAMS) {
+ conn_err(tconn, "Expected cstate < C_WF_REPORT_PARAMS\n");
+ return false;
}
- r = drbd_khelper(mdev, "fence-peer");
+ fp = highest_fencing_policy(tconn);
+ switch (fp) {
+ case FP_NOT_AVAIL:
+ conn_warn(tconn, "Not fencing peer, I'm not even Consistent myself.\n");
+ goto out;
+ case FP_DONT_CARE:
+ return true;
+ default: ;
+ }
+
+ r = conn_khelper(tconn, "fence-peer");
switch ((r>>8) & 0xff) {
case 3: /* peer is inconsistent */
ex_to_string = "peer is inconsistent or worse";
- nps = D_INCONSISTENT;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_INCONSISTENT;
break;
case 4: /* peer got outdated, or was already outdated */
ex_to_string = "peer was fenced";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
case 5: /* peer was down */
- if (mdev->state.disk == D_UP_TO_DATE) {
+ if (conn_highest_disk(tconn) == D_UP_TO_DATE) {
/* we will(have) create(d) a new UUID anyways... */
ex_to_string = "peer is unreachable, assumed to be dead";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
} else {
ex_to_string = "peer unreachable, doing nothing since disk != UpToDate";
- nps = mdev->state.pdsk;
}
break;
case 6: /* Peer is primary, voluntarily outdate myself.
* This is useful when an unconnected R_SECONDARY is asked to
* become R_PRIMARY, but finds the other peer being active. */
ex_to_string = "peer is active";
- dev_warn(DEV, "Peer is primary, outdating myself.\n");
- nps = D_UNKNOWN;
- _drbd_request_state(mdev, NS(disk, D_OUTDATED), CS_WAIT_COMPLETE);
+ conn_warn(tconn, "Peer is primary, outdating myself.\n");
+ mask.disk = D_MASK;
+ val.disk = D_OUTDATED;
break;
case 7:
if (fp != FP_STONITH)
- dev_err(DEV, "fence-peer() = 7 && fencing != Stonith !!!\n");
+ conn_err(tconn, "fence-peer() = 7 && fencing != Stonith !!!\n");
ex_to_string = "peer was stonithed";
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
break;
default:
/* The script is broken ... */
- nps = D_UNKNOWN;
- dev_err(DEV, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
- return nps;
+ conn_err(tconn, "fence-peer helper broken, returned %d\n", (r>>8)&0xff);
+ return false; /* Eventually leave IO frozen */
}
- dev_info(DEV, "fence-peer helper returned %d (%s)\n",
- (r>>8) & 0xff, ex_to_string);
+ conn_info(tconn, "fence-peer helper returned %d (%s)\n",
+ (r>>8) & 0xff, ex_to_string);
-out:
- if (mdev->state.susp_fen && nps >= D_UNKNOWN) {
- /* The handler was not successful... unfreeze here, the
- state engine can not unfreeze... */
- _drbd_request_state(mdev, NS(susp_fen, 0), CS_VERBOSE);
- }
+ out:
- return nps;
+ /* Not using
+ conn_request_state(tconn, mask, val, CS_VERBOSE);
+ here, because we might were able to re-establish the connection in the
+ meantime. */
+ spin_lock_irq(&tconn->req_lock);
+ if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
+ _conn_request_state(tconn, mask, val, CS_VERBOSE);
+ spin_unlock_irq(&tconn->req_lock);
+
+ return conn_highest_pdsk(tconn) <= D_OUTDATED;
}
static int _try_outdate_peer_async(void *data)
{
- struct drbd_conf *mdev = (struct drbd_conf *)data;
- enum drbd_disk_state nps;
- union drbd_state ns;
+ struct drbd_tconn *tconn = (struct drbd_tconn *)data;
- nps = drbd_try_outdate_peer(mdev);
+ conn_try_outdate_peer(tconn);
- /* Not using
- drbd_request_state(mdev, NS(pdsk, nps));
- here, because we might were able to re-establish the connection
- in the meantime. This can only partially be solved in the state's
- engine is_valid_state() and is_valid_state_transition()
- functions.
-
- nps can be D_INCONSISTENT, D_OUTDATED or D_UNKNOWN.
- pdsk == D_INCONSISTENT while conn >= C_CONNECTED is valid,
- therefore we have to have the pre state change check here.
- */
- spin_lock_irq(&mdev->req_lock);
- ns = mdev->state;
- if (ns.conn < C_WF_REPORT_PARAMS && !drbd_test_flag(mdev, STATE_SENT)) {
- ns.pdsk = nps;
- _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- }
- spin_unlock_irq(&mdev->req_lock);
-
+ kref_put(&tconn->kref, &conn_destroy);
return 0;
}
-void drbd_try_outdate_peer_async(struct drbd_conf *mdev)
+void conn_try_outdate_peer_async(struct drbd_tconn *tconn)
{
struct task_struct *opa;
- opa = kthread_run(_try_outdate_peer_async, mdev, "drbd%d_a_helper", mdev_to_minor(mdev));
- if (IS_ERR(opa))
- dev_err(DEV, "out of mem, failed to invoke fence-peer helper\n");
+ kref_get(&tconn->kref);
+ opa = kthread_run(_try_outdate_peer_async, tconn, "drbd_async_h");
+ if (IS_ERR(opa)) {
+ conn_err(tconn, "out of mem, failed to invoke fence-peer helper\n");
+ kref_put(&tconn->kref, &conn_destroy);
+ }
}
enum drbd_state_rv
@@ -318,15 +526,15 @@
{
const int max_tries = 4;
enum drbd_state_rv rv = SS_UNKNOWN_ERROR;
+ struct net_conf *nc;
int try = 0;
int forced = 0;
union drbd_state mask, val;
- enum drbd_disk_state nps;
if (new_role == R_PRIMARY)
- request_ping(mdev); /* Detect a dead peer ASAP */
+ request_ping(mdev->tconn); /* Detect a dead peer ASAP */
- mutex_lock(&mdev->state_mutex);
+ mutex_lock(mdev->state_mutex);
mask.i = 0; mask.role = R_MASK;
val.i = 0; val.role = new_role;
@@ -354,38 +562,34 @@
if (rv == SS_NO_UP_TO_DATE_DISK &&
mdev->state.disk == D_CONSISTENT && mask.pdsk == 0) {
D_ASSERT(mdev->state.pdsk == D_UNKNOWN);
- nps = drbd_try_outdate_peer(mdev);
- if (nps == D_OUTDATED || nps == D_INCONSISTENT) {
+ if (conn_try_outdate_peer(mdev->tconn)) {
val.disk = D_UP_TO_DATE;
mask.disk = D_MASK;
}
-
- val.pdsk = nps;
- mask.pdsk = D_MASK;
-
continue;
}
if (rv == SS_NOTHING_TO_DO)
- goto fail;
+ goto out;
if (rv == SS_PRIMARY_NOP && mask.pdsk == 0) {
- nps = drbd_try_outdate_peer(mdev);
-
- if (force && nps > D_OUTDATED) {
+ if (!conn_try_outdate_peer(mdev->tconn) && force) {
dev_warn(DEV, "Forced into split brain situation!\n");
- nps = D_OUTDATED;
+ mask.pdsk = D_MASK;
+ val.pdsk = D_OUTDATED;
+
}
-
- mask.pdsk = D_MASK;
- val.pdsk = nps;
-
continue;
}
if (rv == SS_TWO_PRIMARIES) {
/* Maybe the peer is detected as dead very soon...
retry at most once more in this case. */
- schedule_timeout_interruptible((mdev->net_conf->ping_timeo+1)*HZ/10);
+ int timeo;
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ timeo = nc ? (nc->ping_timeo + 1) * HZ / 10 : 1;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
if (try < max_tries)
try = max_tries - 1;
continue;
@@ -394,13 +598,13 @@
rv = _drbd_request_state(mdev, mask, val,
CS_VERBOSE + CS_WAIT_COMPLETE);
if (rv < SS_SUCCESS)
- goto fail;
+ goto out;
}
break;
}
if (rv < SS_SUCCESS)
- goto fail;
+ goto out;
if (forced)
dev_warn(DEV, "Forced to consider local data as UpToDate!\n");
@@ -408,6 +612,8 @@
/* Wait until nothing is on the fly :) */
wait_event(mdev->misc_wait, atomic_read(&mdev->ap_pending_cnt) == 0);
+ /* FIXME also wait for all pending P_BARRIER_ACK? */
+
if (new_role == R_SECONDARY) {
set_disk_ro(mdev->vdisk, true);
if (get_ldev(mdev)) {
@@ -415,10 +621,12 @@
put_ldev(mdev);
}
} else {
- if (get_net_conf(mdev)) {
- mdev->net_conf->want_lose = 0;
- put_net_conf(mdev);
- }
+ mutex_lock(&mdev->tconn->conf_update);
+ nc = mdev->tconn->net_conf;
+ if (nc)
+ nc->discard_my_data = 0; /* without copy; single bit op is atomic */
+ mutex_unlock(&mdev->tconn->conf_update);
+
set_disk_ro(mdev->vdisk, false);
if (get_ldev(mdev)) {
if (((mdev->state.conn < C_CONNECTED ||
@@ -444,67 +652,47 @@
drbd_md_sync(mdev);
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- fail:
- mutex_unlock(&mdev->state_mutex);
+out:
+ mutex_unlock(mdev->state_mutex);
return rv;
}
-static struct drbd_conf *ensure_mdev(int minor, int create)
+static const char *from_attrs_err_to_txt(int err)
{
- struct drbd_conf *mdev;
-
- if (minor >= minor_count)
- return NULL;
-
- mdev = minor_to_mdev(minor);
-
- if (!mdev && create) {
- struct gendisk *disk = NULL;
- mdev = drbd_new_device(minor);
-
- spin_lock_irq(&drbd_pp_lock);
- if (minor_table[minor] == NULL) {
- minor_table[minor] = mdev;
- disk = mdev->vdisk;
- mdev = NULL;
- } /* else: we lost the race */
- spin_unlock_irq(&drbd_pp_lock);
-
- if (disk) /* we won the race above */
- /* in case we ever add a drbd_delete_device(),
- * don't forget the del_gendisk! */
- add_disk(disk);
- else /* we lost the race above */
- drbd_free_mdev(mdev);
-
- mdev = minor_to_mdev(minor);
- }
-
- return mdev;
+ return err == -ENOMSG ? "required attribute missing" :
+ err == -EOPNOTSUPP ? "unknown mandatory attribute" :
+ err == -EEXIST ? "can not change invariant setting" :
+ "invalid attribute value";
}
-static int drbd_nl_primary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_set_role(struct sk_buff *skb, struct genl_info *info)
{
- struct primary primary_args;
+ struct set_role_parms parms;
+ int err;
+ enum drbd_ret_code retcode;
- memset(&primary_args, 0, sizeof(struct primary));
- if (!primary_from_tags(mdev, nlp->tag_list, &primary_args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_SET_ROLE_PARMS]) {
+ err = set_role_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
}
- reply->ret_code =
- drbd_set_role(mdev, R_PRIMARY, primary_args.primary_force);
-
- return 0;
-}
-
-static int drbd_nl_secondary(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- reply->ret_code = drbd_set_role(mdev, R_SECONDARY, 0);
-
+ if (info->genlhdr->cmd == DRBD_ADM_PRIMARY)
+ retcode = drbd_set_role(adm_ctx.mdev, R_PRIMARY, parms.assume_uptodate);
+ else
+ retcode = drbd_set_role(adm_ctx.mdev, R_SECONDARY, 0);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -514,7 +702,12 @@
struct drbd_backing_dev *bdev)
{
sector_t md_size_sect = 0;
- switch (bdev->dc.meta_dev_idx) {
+ int meta_dev_idx;
+
+ rcu_read_lock();
+ meta_dev_idx = rcu_dereference(bdev->disk_conf)->meta_dev_idx;
+
+ switch (meta_dev_idx) {
default:
/* v07 style fixed size indexed meta data */
bdev->md.md_size_sect = MD_RESERVED_SECT;
@@ -533,7 +726,7 @@
case DRBD_MD_INDEX_FLEX_INT:
bdev->md.md_offset = drbd_md_ss__(mdev, bdev);
/* al size is still fixed */
- bdev->md.al_offset = -MD_AL_MAX_SIZE;
+ bdev->md.al_offset = -MD_AL_SECTORS;
/* we need (slightly less than) ~ this much bitmap sectors: */
md_size_sect = drbd_get_capacity(bdev->backing_bdev);
md_size_sect = ALIGN(md_size_sect, BM_SECT_PER_EXT);
@@ -549,6 +742,7 @@
bdev->md.bm_offset = -md_size_sect + MD_AL_OFFSET;
break;
}
+ rcu_read_unlock();
}
/* input size is expected to be in KB */
@@ -581,17 +775,23 @@
* R_PRIMARY D_INCONSISTENT, and C_SYNC_TARGET:
* peer may not initiate a resize.
*/
+/* Note these are not to be confused with
+ * drbd_adm_suspend_io/drbd_adm_resume_io,
+ * which are (sub) state changes triggered by admin (drbdsetup),
+ * and can be long lived.
+ * This changes an mdev->flag, is triggered by drbd internals,
+ * and should be short-lived. */
void drbd_suspend_io(struct drbd_conf *mdev)
{
- drbd_set_flag(mdev, SUSPEND_IO);
- if (is_susp(mdev->state))
+ set_bit(SUSPEND_IO, &mdev->flags);
+ if (drbd_suspended(mdev))
return;
wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_bio_cnt));
}
void drbd_resume_io(struct drbd_conf *mdev)
{
- drbd_clear_flag(mdev, SUSPEND_IO);
+ clear_bit(SUSPEND_IO, &mdev->flags);
wake_up(&mdev->misc_wait);
}
@@ -605,7 +805,7 @@
enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
{
sector_t prev_first_sect, prev_size; /* previous meta location */
- sector_t la_size;
+ sector_t la_size, u_size;
sector_t size;
char ppb[10];
@@ -633,7 +833,10 @@
/* TODO: should only be some assert here, not (re)init... */
drbd_md_set_sector_offsets(mdev, mdev->ldev);
- size = drbd_new_dev_size(mdev, mdev->ldev, flags & DDSF_FORCED);
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
if (drbd_get_capacity(mdev->this_bdev) != size ||
drbd_bm_capacity(mdev) != size) {
@@ -696,12 +899,12 @@
}
sector_t
-drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev, int assume_peer_has_space)
+drbd_new_dev_size(struct drbd_conf *mdev, struct drbd_backing_dev *bdev,
+ sector_t u_size, int assume_peer_has_space)
{
sector_t p_size = mdev->p_size; /* partner's disk size. */
sector_t la_size = bdev->md.la_size_sect; /* last agreed size. */
sector_t m_size; /* my size */
- sector_t u_size = bdev->dc.disk_size; /* size requested by user. */
sector_t size = 0;
m_size = drbd_get_max_capacity(bdev);
@@ -750,24 +953,21 @@
* failed, and 0 on success. You should call drbd_md_sync() after you called
* this function.
*/
-static int drbd_check_al_size(struct drbd_conf *mdev)
+static int drbd_check_al_size(struct drbd_conf *mdev, struct disk_conf *dc)
{
struct lru_cache *n, *t;
struct lc_element *e;
unsigned int in_use;
int i;
- ERR_IF(mdev->sync_conf.al_extents < 7)
- mdev->sync_conf.al_extents = 127;
-
if (mdev->act_log &&
- mdev->act_log->nr_elements == mdev->sync_conf.al_extents)
+ mdev->act_log->nr_elements == dc->al_extents)
return 0;
in_use = 0;
t = mdev->act_log;
- n = lc_create("act_log", drbd_al_ext_cache,
- mdev->sync_conf.al_extents, sizeof(struct lc_element), 0);
+ n = lc_create("act_log", drbd_al_ext_cache, AL_UPDATES_PER_TRANSACTION,
+ dc->al_extents, sizeof(struct lc_element), 0);
if (n == NULL) {
dev_err(DEV, "Cannot allocate act_log lru!\n");
@@ -808,7 +1008,9 @@
struct request_queue * const b = mdev->ldev->backing_bdev->bd_disk->queue;
max_hw_sectors = min(queue_max_hw_sectors(b), max_bio_size >> 9);
- max_segments = mdev->ldev->dc.max_bio_bvecs;
+ rcu_read_lock();
+ max_segments = rcu_dereference(mdev->ldev->disk_conf)->max_bio_bvecs;
+ rcu_read_unlock();
put_ldev(mdev);
}
@@ -852,12 +1054,14 @@
Because new from 8.3.8 onwards the peer can use multiple
BIOs for a single peer_request */
if (mdev->state.conn >= C_CONNECTED) {
- if (mdev->agreed_pro_version < 94) {
- peer = min(mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
+ if (mdev->tconn->agreed_pro_version < 94)
+ peer = min( mdev->peer_max_bio_size, DRBD_MAX_SIZE_H80_PACKET);
/* Correct old drbd (up to 8.3.7) if it believes it can do more than 32KiB */
- } else if (mdev->agreed_pro_version == 94)
+ else if (mdev->tconn->agreed_pro_version == 94)
peer = DRBD_MAX_SIZE_H80_PACKET;
- else /* drbd 8.3.8 onwards */
+ else if (mdev->tconn->agreed_pro_version < 100)
+ peer = DRBD_MAX_BIO_SIZE_P95; /* drbd 8.3.8 onwards, before 8.4.0 */
+ else
peer = DRBD_MAX_BIO_SIZE;
}
@@ -872,36 +1076,27 @@
drbd_setup_queue_param(mdev, new);
}
-/* serialize deconfig (worker exiting, doing cleanup)
- * and reconfig (drbdsetup disk, drbdsetup net)
- *
- * Wait for a potentially exiting worker, then restart it,
- * or start a new one. Flush any pending work, there may still be an
- * after_state_change queued.
- */
-static void drbd_reconfig_start(struct drbd_conf *mdev)
+/* Starts the worker thread */
+static void conn_reconfig_start(struct drbd_tconn *tconn)
{
- wait_event(mdev->state_wait, !drbd_test_and_set_flag(mdev, CONFIG_PENDING));
- wait_event(mdev->state_wait, !drbd_test_flag(mdev, DEVICE_DYING));
- drbd_thread_start(&mdev->worker);
- drbd_flush_workqueue(mdev);
+ drbd_thread_start(&tconn->worker);
+ conn_flush_workqueue(tconn);
}
-/* if still unconfigured, stops worker again.
- * if configured now, clears CONFIG_PENDING.
- * wakes potential waiters */
-static void drbd_reconfig_done(struct drbd_conf *mdev)
+/* if still unconfigured, stops worker again. */
+static void conn_reconfig_done(struct drbd_tconn *tconn)
{
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.disk == D_DISKLESS &&
- mdev->state.conn == C_STANDALONE &&
- mdev->state.role == R_SECONDARY) {
- drbd_set_flag(mdev, DEVICE_DYING);
- drbd_thread_stop_nowait(&mdev->worker);
- } else
- drbd_clear_flag(mdev, CONFIG_PENDING);
- spin_unlock_irq(&mdev->req_lock);
- wake_up(&mdev->state_wait);
+ bool stop_threads;
+ spin_lock_irq(&tconn->req_lock);
+ stop_threads = conn_all_vols_unconf(tconn) &&
+ tconn->cstate == C_STANDALONE;
+ spin_unlock_irq(&tconn->req_lock);
+ if (stop_threads) {
+ /* asender is implicitly stopped by receiver
+ * in conn_disconnect() */
+ drbd_thread_stop(&tconn->receiver);
+ drbd_thread_stop(&tconn->worker);
+ }
}
/* Make sure IO is suspended before calling this function(). */
@@ -909,42 +1104,182 @@
{
int s = 0;
- if (lc_try_lock(mdev->act_log)) {
- drbd_al_shrink(mdev);
- lc_unlock(mdev->act_log);
- } else {
+ if (!lc_try_lock(mdev->act_log)) {
dev_warn(DEV, "Failed to lock al in drbd_suspend_al()\n");
return;
}
- spin_lock_irq(&mdev->req_lock);
+ drbd_al_shrink(mdev);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED)
- s = !drbd_test_and_set_flag(mdev, AL_SUSPENDED);
-
- spin_unlock_irq(&mdev->req_lock);
+ s = !test_and_set_bit(AL_SUSPENDED, &mdev->flags);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ lc_unlock(mdev->act_log);
if (s)
dev_info(DEV, "Suspended AL updates\n");
}
-/* does always return 0;
- * interesting return code is in reply->ret_code */
-static int drbd_nl_disk_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+
+static bool should_set_defaults(struct genl_info *info)
{
+ unsigned flags = ((struct drbd_genlmsghdr*)info->userhdr)->flags;
+ return 0 != (flags & DRBD_GENL_F_SET_DEFAULTS);
+}
+
+static void enforce_disk_conf_limits(struct disk_conf *dc)
+{
+ if (dc->al_extents < DRBD_AL_EXTENTS_MIN)
+ dc->al_extents = DRBD_AL_EXTENTS_MIN;
+ if (dc->al_extents > DRBD_AL_EXTENTS_MAX)
+ dc->al_extents = DRBD_AL_EXTENTS_MAX;
+
+ if (dc->c_plan_ahead > DRBD_C_PLAN_AHEAD_MAX)
+ dc->c_plan_ahead = DRBD_C_PLAN_AHEAD_MAX;
+}
+
+int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct drbd_conf *mdev;
+ struct disk_conf *new_disk_conf, *old_disk_conf;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
+ int err, fifo_size;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+
+ /* we also need a disk
+ * to change the options on */
+ if (!get_ldev(mdev)) {
+ retcode = ERR_NO_DISK;
+ goto out;
+ }
+
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ if (should_set_defaults(info))
+ set_disk_conf_defaults(new_disk_conf);
+
+ err = disk_conf_from_attrs_for_change(new_disk_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ }
+
+ if (!expect(new_disk_conf->resync_rate >= 1))
+ new_disk_conf->resync_rate = 1;
+
+ enforce_disk_conf_limits(new_disk_conf);
+
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
+ dev_err(DEV, "kmalloc of fifo_buffer failed");
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+ }
+
+ drbd_suspend_io(mdev);
+ wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
+ drbd_al_shrink(mdev);
+ err = drbd_check_al_size(mdev, new_disk_conf);
+ lc_unlock(mdev->act_log);
+ wake_up(&mdev->al_wait);
+ drbd_resume_io(mdev);
+
+ if (err) {
+ retcode = ERR_NOMEM;
+ goto fail_unlock;
+ }
+
+ write_lock_irq(&global_state_lock);
+ retcode = drbd_resync_after_valid(mdev, new_disk_conf->resync_after);
+ if (retcode == NO_ERROR) {
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ drbd_resync_after_changed(mdev);
+ }
+ write_unlock_irq(&global_state_lock);
+
+ if (retcode != NO_ERROR)
+ goto fail_unlock;
+
+ if (new_plan) {
+ old_plan = mdev->rs_plan_s;
+ rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&mdev->tconn->conf_update);
+
+ if (new_disk_conf->al_updates)
+ mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ mdev->ldev->md.flags |= MDF_AL_DISABLED;
+
+ drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
+
+ drbd_md_sync(mdev);
+
+ if (mdev->state.conn >= C_CONNECTED)
+ drbd_send_sync_param(mdev);
+
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ kfree(old_plan);
+ mod_timer(&mdev->request_timer, jiffies + HZ);
+ goto success;
+
+fail_unlock:
+ mutex_unlock(&mdev->tconn->conf_update);
+ fail:
+ kfree(new_disk_conf);
+ kfree(new_plan);
+success:
+ put_ldev(mdev);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_conf *mdev;
+ int err;
enum drbd_ret_code retcode;
enum determine_dev_size dd;
sector_t max_possible_sectors;
sector_t min_md_device_sectors;
struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */
+ struct disk_conf *new_disk_conf = NULL;
struct block_device *bdev;
struct lru_cache *resync_lru = NULL;
+ struct fifo_buffer *new_plan = NULL;
union drbd_state ns, os;
enum drbd_state_rv rv;
- int cp_discovered = 0;
- int logical_block_size;
+ struct net_conf *nc;
- drbd_reconfig_start(mdev);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto finish;
+
+ mdev = adm_ctx.mdev;
+ conn_reconfig_start(mdev->tconn);
/* if you want to reconfigure, please tear down first */
if (mdev->state.disk > D_DISKLESS) {
@@ -958,52 +1293,66 @@
wait_event(mdev->misc_wait, !atomic_read(&mdev->local_cnt));
/* make sure there is no leftover from previous force-detach attempts */
- drbd_clear_flag(mdev, FORCE_DETACH);
- drbd_clear_flag(mdev, WAS_IO_ERROR);
- drbd_clear_flag(mdev, WAS_READ_ERROR);
+ clear_bit(FORCE_DETACH, &mdev->flags);
+ clear_bit(WAS_IO_ERROR, &mdev->flags);
+ clear_bit(WAS_READ_ERROR, &mdev->flags);
/* and no leftover from previously aborted resync or verify, either */
mdev->rs_total = 0;
mdev->rs_failed = 0;
atomic_set(&mdev->rs_pending_cnt, 0);
- /* allocation not in the IO path, cqueue thread context */
+ /* allocation not in the IO path, drbdsetup context */
nbc = kzalloc(sizeof(struct drbd_backing_dev), GFP_KERNEL);
if (!nbc) {
retcode = ERR_NOMEM;
goto fail;
}
-
- nbc->dc.disk_size = DRBD_DISK_SIZE_SECT_DEF;
- nbc->dc.on_io_error = DRBD_ON_IO_ERROR_DEF;
- nbc->dc.fencing = DRBD_FENCING_DEF;
- nbc->dc.max_bio_bvecs = DRBD_MAX_BIO_BVECS_DEF;
-
spin_lock_init(&nbc->md.uuid_lock);
- if (!disk_conf_from_tags(mdev, nlp->tag_list, &nbc->dc)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+ nbc->disk_conf = new_disk_conf;
+
+ set_disk_conf_defaults(new_disk_conf);
+ err = disk_conf_from_attrs(new_disk_conf, info);
+ if (err) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
goto fail;
}
- if (nbc->dc.meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
+ enforce_disk_conf_limits(new_disk_conf);
+
+ new_plan = fifo_alloc((new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ);
+ if (!new_plan) {
+ retcode = ERR_NOMEM;
+ goto fail;
+ }
+
+ if (new_disk_conf->meta_dev_idx < DRBD_MD_INDEX_FLEX_INT) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
- if (get_net_conf(mdev)) {
- int prot = mdev->net_conf->wire_protocol;
- put_net_conf(mdev);
- if (nbc->dc.fencing == FP_STONITH && prot == DRBD_PROT_A) {
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc) {
+ if (new_disk_conf->fencing == FP_STONITH && nc->wire_protocol == DRBD_PROT_A) {
+ rcu_read_unlock();
retcode = ERR_STONITH_AND_PROT_A;
goto fail;
}
}
+ rcu_read_unlock();
- bdev = blkdev_get_by_path(nbc->dc.backing_dev,
+ bdev = blkdev_get_by_path(new_disk_conf->backing_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL, mdev);
if (IS_ERR(bdev)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.backing_dev,
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_DISK;
goto fail;
@@ -1018,12 +1367,12 @@
* should check it for you already; but if you don't, or
* someone fooled it, we need to double check here)
*/
- bdev = blkdev_get_by_path(nbc->dc.meta_dev,
+ bdev = blkdev_get_by_path(new_disk_conf->meta_dev,
FMODE_READ | FMODE_WRITE | FMODE_EXCL,
- (nbc->dc.meta_dev_idx < 0) ?
+ (new_disk_conf->meta_dev_idx < 0) ?
(void *)mdev : (void *)drbd_m_holder);
if (IS_ERR(bdev)) {
- dev_err(DEV, "open(\"%s\") failed with %ld\n", nbc->dc.meta_dev,
+ dev_err(DEV, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev,
PTR_ERR(bdev));
retcode = ERR_OPEN_MD_DISK;
goto fail;
@@ -1031,14 +1380,14 @@
nbc->md_bdev = bdev;
if ((nbc->backing_bdev == nbc->md_bdev) !=
- (nbc->dc.meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
- nbc->dc.meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
+ (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL ||
+ new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_FLEX_INT)) {
retcode = ERR_MD_IDX_INVALID;
goto fail;
}
resync_lru = lc_create("resync", drbd_bm_ext_cache,
- 61, sizeof(struct bm_extent),
+ 1, 61, sizeof(struct bm_extent),
offsetof(struct bm_extent, lce));
if (!resync_lru) {
retcode = ERR_NOMEM;
@@ -1048,21 +1397,21 @@
/* RT - for drbd_get_max_capacity() DRBD_MD_INDEX_FLEX_INT */
drbd_md_set_sector_offsets(mdev, nbc);
- if (drbd_get_max_capacity(nbc) < nbc->dc.disk_size) {
+ if (drbd_get_max_capacity(nbc) < new_disk_conf->disk_size) {
dev_err(DEV, "max capacity %llu smaller than disk size %llu\n",
(unsigned long long) drbd_get_max_capacity(nbc),
- (unsigned long long) nbc->dc.disk_size);
+ (unsigned long long) new_disk_conf->disk_size);
retcode = ERR_DISK_TOO_SMALL;
goto fail;
}
- if (nbc->dc.meta_dev_idx < 0) {
+ if (new_disk_conf->meta_dev_idx < 0) {
max_possible_sectors = DRBD_MAX_SECTORS_FLEX;
/* at least one MB, otherwise it does not make sense */
min_md_device_sectors = (2<<10);
} else {
max_possible_sectors = DRBD_MAX_SECTORS;
- min_md_device_sectors = MD_RESERVED_SECT * (nbc->dc.meta_dev_idx + 1);
+ min_md_device_sectors = MD_RESERVED_SECT * (new_disk_conf->meta_dev_idx + 1);
}
if (drbd_get_capacity(nbc->md_bdev) < min_md_device_sectors) {
@@ -1087,14 +1436,20 @@
dev_warn(DEV, "==> truncating very big lower level device "
"to currently maximum possible %llu sectors <==\n",
(unsigned long long) max_possible_sectors);
- if (nbc->dc.meta_dev_idx >= 0)
+ if (new_disk_conf->meta_dev_idx >= 0)
dev_warn(DEV, "==>> using internal or flexible "
"meta data may help <<==\n");
}
drbd_suspend_io(mdev);
/* also wait for the last barrier ack. */
- wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || is_susp(mdev->state));
+ /* FIXME see also https://daiquiri.linbit/cgi-bin/bugzilla/show_bug.cgi?id=171
+ * We need a way to either ignore barrier acks for barriers sent before a device
+ * was attached, or a way to wait for all pending barrier acks to come in.
+ * As barriers are counted per resource,
+ * we'd need to suspend io on all devices of a resource.
+ */
+ wait_event(mdev->misc_wait, !atomic_read(&mdev->ap_pending_cnt) || drbd_suspended(mdev));
/* and for any other previously queued work */
drbd_flush_workqueue(mdev);
@@ -1109,25 +1464,6 @@
drbd_md_set_sector_offsets(mdev, nbc);
- /* allocate a second IO page if logical_block_size != 512 */
- logical_block_size = bdev_logical_block_size(nbc->md_bdev);
- if (logical_block_size == 0)
- logical_block_size = MD_SECTOR_SIZE;
-
- if (logical_block_size != MD_SECTOR_SIZE) {
- if (!mdev->md_io_tmpp) {
- struct page *page = alloc_page(GFP_NOIO);
- if (!page)
- goto force_diskless_dec;
-
- dev_warn(DEV, "Meta data's bdev logical_block_size = %d != %d\n",
- logical_block_size, MD_SECTOR_SIZE);
- dev_warn(DEV, "Workaround engaged (has performance impact).\n");
-
- mdev->md_io_tmpp = page;
- }
- }
-
if (!mdev->bitmap) {
if (drbd_bm_init(mdev)) {
retcode = ERR_NOMEM;
@@ -1149,30 +1485,25 @@
}
/* Since we are diskless, fix the activity log first... */
- if (drbd_check_al_size(mdev)) {
+ if (drbd_check_al_size(mdev, new_disk_conf)) {
retcode = ERR_NOMEM;
goto force_diskless_dec;
}
/* Prevent shrinking of consistent devices ! */
if (drbd_md_test_flag(nbc, MDF_CONSISTENT) &&
- drbd_new_dev_size(mdev, nbc, 0) < nbc->md.la_size_sect) {
+ drbd_new_dev_size(mdev, nbc, nbc->disk_conf->disk_size, 0) < nbc->md.la_size_sect) {
dev_warn(DEV, "refusing to truncate a consistent device\n");
retcode = ERR_DISK_TOO_SMALL;
goto force_diskless_dec;
}
- if (!drbd_al_read_log(mdev, nbc)) {
- retcode = ERR_IO_MD_DISK;
- goto force_diskless_dec;
- }
-
/* Reset the "barriers don't work" bits here, then force meta data to
* be written, to ensure we determine if barriers are supported. */
- if (nbc->dc.no_md_flush)
- drbd_set_flag(mdev, MD_NO_FUA);
+ if (new_disk_conf->md_flushes)
+ clear_bit(MD_NO_FUA, &mdev->flags);
else
- drbd_clear_flag(mdev, MD_NO_FUA);
+ set_bit(MD_NO_FUA, &mdev->flags);
/* Point of no return reached.
* Devices and memory are no longer released by error cleanup below.
@@ -1181,22 +1512,22 @@
D_ASSERT(mdev->ldev == NULL);
mdev->ldev = nbc;
mdev->resync = resync_lru;
+ mdev->rs_plan_s = new_plan;
nbc = NULL;
resync_lru = NULL;
+ new_disk_conf = NULL;
+ new_plan = NULL;
- mdev->write_ordering = WO_bdev_flush;
- drbd_bump_write_ordering(mdev, WO_bdev_flush);
+ drbd_bump_write_ordering(mdev->tconn, WO_bdev_flush);
if (drbd_md_test_flag(mdev->ldev, MDF_CRASHED_PRIMARY))
- drbd_set_flag(mdev, CRASHED_PRIMARY);
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
else
- drbd_clear_flag(mdev, CRASHED_PRIMARY);
+ clear_bit(CRASHED_PRIMARY, &mdev->flags);
if (drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
- !(mdev->state.role == R_PRIMARY && mdev->state.susp_nod)) {
- drbd_set_flag(mdev, CRASHED_PRIMARY);
- cp_discovered = 1;
- }
+ !(mdev->state.role == R_PRIMARY && mdev->tconn->susp_nod))
+ set_bit(CRASHED_PRIMARY, &mdev->flags);
mdev->send_cnt = 0;
mdev->recv_cnt = 0;
@@ -1219,20 +1550,22 @@
* so we can automatically recover from a crash of a
* degraded but active "cluster" after a certain timeout.
*/
- drbd_clear_flag(mdev, USE_DEGR_WFC_T);
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
if (mdev->state.role != R_PRIMARY &&
drbd_md_test_flag(mdev->ldev, MDF_PRIMARY_IND) &&
!drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
- drbd_set_flag(mdev, USE_DEGR_WFC_T);
+ set_bit(USE_DEGR_WFC_T, &mdev->flags);
dd = drbd_determine_dev_size(mdev, 0);
if (dd == dev_size_error) {
retcode = ERR_NOMEM_BITMAP;
goto force_diskless_dec;
} else if (dd == grew)
- drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
- if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC)) {
+ if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
+ (test_bit(CRASHED_PRIMARY, &mdev->flags) &&
+ drbd_md_test_flag(mdev->ldev, MDF_AL_DISABLED))) {
dev_info(DEV, "Assuming that all blocks are out of sync "
"(aka FullSync)\n");
if (drbd_bitmap_io(mdev, &drbd_bmio_set_n_write,
@@ -1242,16 +1575,7 @@
}
} else {
if (drbd_bitmap_io(mdev, &drbd_bm_read,
- "read from attaching", BM_LOCKED_MASK) < 0) {
- retcode = ERR_IO_MD_DISK;
- goto force_diskless_dec;
- }
- }
-
- if (cp_discovered) {
- drbd_al_apply_to_bm(mdev);
- if (drbd_bitmap_io(mdev, &drbd_bm_write,
- "crashed primary apply AL", BM_LOCKED_MASK)) {
+ "read from attaching", BM_LOCKED_MASK)) {
retcode = ERR_IO_MD_DISK;
goto force_diskless_dec;
}
@@ -1260,9 +1584,9 @@
if (_drbd_bm_total_weight(mdev) == drbd_bm_bits(mdev))
drbd_suspend_al(mdev); /* IO is still suspended here... */
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- ns.i = os.i;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ os = drbd_read_state(mdev);
+ ns = os;
/* If MDF_CONSISTENT is not set go into inconsistent state,
otherwise investigate MDF_WasUpToDate...
If MDF_WAS_UP_TO_DATE is not set go into D_OUTDATED disk state,
@@ -1280,8 +1604,9 @@
if (drbd_md_test_flag(mdev->ldev, MDF_PEER_OUT_DATED))
ns.pdsk = D_OUTDATED;
- if ( ns.disk == D_CONSISTENT &&
- (ns.pdsk == D_OUTDATED || mdev->ldev->dc.fencing == FP_DONT_CARE))
+ rcu_read_lock();
+ if (ns.disk == D_CONSISTENT &&
+ (ns.pdsk == D_OUTDATED || rcu_dereference(mdev->ldev->disk_conf)->fencing == FP_DONT_CARE))
ns.disk = D_UP_TO_DATE;
/* All tests on MDF_PRIMARY_IND, MDF_CONNECTED_IND,
@@ -1289,6 +1614,13 @@
this point, because drbd_request_state() modifies these
flags. */
+ if (rcu_dereference(mdev->ldev->disk_conf)->al_updates)
+ mdev->ldev->md.flags &= ~MDF_AL_DISABLED;
+ else
+ mdev->ldev->md.flags |= MDF_AL_DISABLED;
+
+ rcu_read_unlock();
+
/* In case we are C_CONNECTED postpone any decision on the new disk
state after the negotiation phase. */
if (mdev->state.conn == C_CONNECTED) {
@@ -1304,12 +1636,13 @@
}
rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (rv < SS_SUCCESS)
goto force_diskless_dec;
+ mod_timer(&mdev->request_timer, jiffies + HZ);
+
if (mdev->state.role == R_PRIMARY)
mdev->ldev->md.uuid[UI_CURRENT] |= (u64)1;
else
@@ -1320,16 +1653,17 @@
kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
put_ldev(mdev);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(mdev->tconn);
+ drbd_adm_finish(info, retcode);
return 0;
force_diskless_dec:
put_ldev(mdev);
force_diskless:
- drbd_force_state(mdev, NS(disk, D_FAILED));
+ drbd_force_state(mdev, NS(disk, D_DISKLESS));
drbd_md_sync(mdev);
fail:
+ conn_reconfig_done(mdev->tconn);
if (nbc) {
if (nbc->backing_bdev)
blkdev_put(nbc->backing_bdev,
@@ -1339,34 +1673,24 @@
FMODE_READ | FMODE_WRITE | FMODE_EXCL);
kfree(nbc);
}
+ kfree(new_disk_conf);
lc_destroy(resync_lru);
+ kfree(new_plan);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ finish:
+ drbd_adm_finish(info, retcode);
return 0;
}
-/* Detaching the disk is a process in multiple stages. First we need to lock
- * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
- * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
- * internal references as well.
- * Only then we have finally detached. */
-static int drbd_nl_detach(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static int adm_detach(struct drbd_conf *mdev, int force)
{
- enum drbd_ret_code retcode;
+ enum drbd_state_rv retcode;
int ret;
- struct detach dt = {};
- if (!detach_from_tags(mdev, nlp->tag_list, &dt)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- goto out;
- }
-
- if (dt.detach_force) {
- drbd_set_flag(mdev, FORCE_DETACH);
+ if (force) {
+ set_bit(FORCE_DETACH, &mdev->flags);
drbd_force_state(mdev, NS(disk, D_FAILED));
- reply->ret_code = SS_SUCCESS;
+ retcode = SS_SUCCESS;
goto out;
}
@@ -1378,326 +1702,529 @@
ret = wait_event_interruptible(mdev->misc_wait,
mdev->state.disk != D_FAILED);
drbd_resume_io(mdev);
-
if ((int)retcode == (int)SS_IS_DISKLESS)
retcode = SS_NOTHING_TO_DO;
if (ret)
retcode = ERR_INTR;
- reply->ret_code = retcode;
out:
+ return retcode;
+}
+
+/* Detaching the disk is a process in multiple stages. First we need to lock
+ * out application IO, in-flight IO, IO stuck in drbd_al_begin_io.
+ * Then we transition to D_DISKLESS, and wait for put_ldev() to return all
+ * internal references as well.
+ * Only then we have finally detached. */
+int drbd_adm_detach(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct detach_parms parms = { };
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (info->attrs[DRBD_NLA_DETACH_PARMS]) {
+ err = detach_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+
+ retcode = adm_detach(adm_ctx.mdev, parms.force_detach);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_net_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static bool conn_resync_running(struct drbd_tconn *tconn)
{
- int i, ns;
- enum drbd_ret_code retcode;
- struct net_conf *new_conf = NULL;
- struct crypto_hash *tfm = NULL;
- struct crypto_hash *integrity_w_tfm = NULL;
- struct crypto_hash *integrity_r_tfm = NULL;
- struct hlist_head *new_tl_hash = NULL;
- struct hlist_head *new_ee_hash = NULL;
- struct drbd_conf *odev;
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_SYNC_SOURCE ||
+ mdev->state.conn == C_SYNC_TARGET ||
+ mdev->state.conn == C_PAUSED_SYNC_S ||
+ mdev->state.conn == C_PAUSED_SYNC_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static bool conn_ov_running(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ bool rv = false;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_VERIFY_S ||
+ mdev->state.conn == C_VERIFY_T) {
+ rv = true;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+static enum drbd_ret_code
+_check_net_options(struct drbd_tconn *tconn, struct net_conf *old_conf, struct net_conf *new_conf)
+{
+ struct drbd_conf *mdev;
+ int i;
+
+ if (old_conf && tconn->cstate == C_WF_REPORT_PARAMS && tconn->agreed_pro_version < 100) {
+ if (new_conf->wire_protocol != old_conf->wire_protocol)
+ return ERR_NEED_APV_100;
+
+ if (new_conf->two_primaries != old_conf->two_primaries)
+ return ERR_NEED_APV_100;
+
+ if (strcmp(new_conf->integrity_alg, old_conf->integrity_alg))
+ return ERR_NEED_APV_100;
+ }
+
+ if (!new_conf->two_primaries &&
+ conn_highest_role(tconn) == R_PRIMARY &&
+ conn_highest_peer(tconn) == R_PRIMARY)
+ return ERR_NEED_ALLOW_TWO_PRI;
+
+ if (new_conf->two_primaries &&
+ (new_conf->wire_protocol != DRBD_PROT_C))
+ return ERR_NOT_PROTO_C;
+
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (get_ldev(mdev)) {
+ enum drbd_fencing_p fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+ put_ldev(mdev);
+ if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH)
+ return ERR_STONITH_AND_PROT_A;
+ }
+ if (mdev->state.role == R_PRIMARY && new_conf->discard_my_data)
+ return ERR_DISCARD_IMPOSSIBLE;
+ }
+
+ if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A)
+ return ERR_CONG_NOT_PROTO_A;
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+check_net_options(struct drbd_tconn *tconn, struct net_conf *new_conf)
+{
+ static enum drbd_ret_code rv;
+ struct drbd_conf *mdev;
+ int i;
+
+ rcu_read_lock();
+ rv = _check_net_options(tconn, rcu_dereference(tconn->net_conf), new_conf);
+ rcu_read_unlock();
+
+ /* tconn->volumes protected by genl_lock() here */
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ if (!mdev->bitmap) {
+ if(drbd_bm_init(mdev))
+ return ERR_NOMEM;
+ }
+ }
+
+ return rv;
+}
+
+struct crypto {
+ struct crypto_hash *verify_tfm;
+ struct crypto_hash *csums_tfm;
+ struct crypto_hash *cram_hmac_tfm;
+ struct crypto_hash *integrity_tfm;
+};
+
+static int
+alloc_hash(struct crypto_hash **tfm, char *tfm_name, int err_alg)
+{
+ if (!tfm_name[0])
+ return NO_ERROR;
+
+ *tfm = crypto_alloc_hash(tfm_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(*tfm)) {
+ *tfm = NULL;
+ return err_alg;
+ }
+
+ return NO_ERROR;
+}
+
+static enum drbd_ret_code
+alloc_crypto(struct crypto *crypto, struct net_conf *new_conf)
+{
char hmac_name[CRYPTO_MAX_ALG_NAME];
- void *int_dig_out = NULL;
- void *int_dig_in = NULL;
- void *int_dig_vv = NULL;
- struct sockaddr *new_my_addr, *new_peer_addr, *taken_addr;
+ enum drbd_ret_code rv;
- drbd_reconfig_start(mdev);
+ rv = alloc_hash(&crypto->csums_tfm, new_conf->csums_alg,
+ ERR_CSUMS_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->verify_tfm, new_conf->verify_alg,
+ ERR_VERIFY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ rv = alloc_hash(&crypto->integrity_tfm, new_conf->integrity_alg,
+ ERR_INTEGRITY_ALG);
+ if (rv != NO_ERROR)
+ return rv;
+ if (new_conf->cram_hmac_alg[0] != 0) {
+ snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
+ new_conf->cram_hmac_alg);
- if (mdev->state.conn > C_STANDALONE) {
+ rv = alloc_hash(&crypto->cram_hmac_tfm, hmac_name,
+ ERR_AUTH_ALG);
+ }
+
+ return rv;
+}
+
+static void free_crypto(struct crypto *crypto)
+{
+ crypto_free_hash(crypto->cram_hmac_tfm);
+ crypto_free_hash(crypto->integrity_tfm);
+ crypto_free_hash(crypto->csums_tfm);
+ crypto_free_hash(crypto->verify_tfm);
+}
+
+int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct drbd_tconn *tconn;
+ struct net_conf *old_conf, *new_conf = NULL;
+ int err;
+ int ovr; /* online verify running */
+ int rsr; /* re-sync running */
+ struct crypto crypto = { };
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ tconn = adm_ctx.tconn;
+
+ new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_conf) {
+ retcode = ERR_NOMEM;
+ goto out;
+ }
+
+ conn_reconfig_start(tconn);
+
+ mutex_lock(&tconn->data.mutex);
+ mutex_lock(&tconn->conf_update);
+ old_conf = tconn->net_conf;
+
+ if (!old_conf) {
+ drbd_msg_put_info("net conf missing, try connect");
+ retcode = ERR_INVALID_REQUEST;
+ goto fail;
+ }
+
+ *new_conf = *old_conf;
+ if (should_set_defaults(info))
+ set_net_conf_defaults(new_conf);
+
+ err = net_conf_from_attrs_for_change(new_conf, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ retcode = check_net_options(tconn, new_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ /* re-sync running */
+ rsr = conn_resync_running(tconn);
+ if (rsr && strcmp(new_conf->csums_alg, old_conf->csums_alg)) {
+ retcode = ERR_CSUMS_RESYNC_RUNNING;
+ goto fail;
+ }
+
+ /* online verify running */
+ ovr = conn_ov_running(tconn);
+ if (ovr && strcmp(new_conf->verify_alg, old_conf->verify_alg)) {
+ retcode = ERR_VERIFY_RUNNING;
+ goto fail;
+ }
+
+ retcode = alloc_crypto(&crypto, new_conf);
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ rcu_assign_pointer(tconn->net_conf, new_conf);
+
+ if (!rsr) {
+ crypto_free_hash(tconn->csums_tfm);
+ tconn->csums_tfm = crypto.csums_tfm;
+ crypto.csums_tfm = NULL;
+ }
+ if (!ovr) {
+ crypto_free_hash(tconn->verify_tfm);
+ tconn->verify_tfm = crypto.verify_tfm;
+ crypto.verify_tfm = NULL;
+ }
+
+ crypto_free_hash(tconn->integrity_tfm);
+ tconn->integrity_tfm = crypto.integrity_tfm;
+ if (tconn->cstate >= C_WF_REPORT_PARAMS && tconn->agreed_pro_version >= 100)
+ /* Do this without trying to take tconn->data.mutex again. */
+ __drbd_send_protocol(tconn, P_PROTOCOL_UPDATE);
+
+ crypto_free_hash(tconn->cram_hmac_tfm);
+ tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
+
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+ synchronize_rcu();
+ kfree(old_conf);
+
+ if (tconn->cstate >= C_WF_REPORT_PARAMS)
+ drbd_send_sync_param(minor_to_mdev(conn_lowest_minor(tconn)));
+
+ goto done;
+
+ fail:
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+ free_crypto(&crypto);
+ kfree(new_conf);
+ done:
+ conn_reconfig_done(tconn);
+ out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_conf *mdev;
+ struct net_conf *old_conf, *new_conf = NULL;
+ struct crypto crypto = { };
+ struct drbd_tconn *tconn;
+ enum drbd_ret_code retcode;
+ int i;
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+ if (!(adm_ctx.my_addr && adm_ctx.peer_addr)) {
+ drbd_msg_put_info("connection endpoint(s) missing");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* No need for _rcu here. All reconfiguration is
+ * strictly serialized on genl_lock(). We are protected against
+ * concurrent reconfiguration/addition/deletion */
+ list_for_each_entry(tconn, &drbd_tconns, all_tconn) {
+ if (nla_len(adm_ctx.my_addr) == tconn->my_addr_len &&
+ !memcmp(nla_data(adm_ctx.my_addr), &tconn->my_addr, tconn->my_addr_len)) {
+ retcode = ERR_LOCAL_ADDR;
+ goto out;
+ }
+
+ if (nla_len(adm_ctx.peer_addr) == tconn->peer_addr_len &&
+ !memcmp(nla_data(adm_ctx.peer_addr), &tconn->peer_addr, tconn->peer_addr_len)) {
+ retcode = ERR_PEER_ADDR;
+ goto out;
+ }
+ }
+
+ tconn = adm_ctx.tconn;
+ conn_reconfig_start(tconn);
+
+ if (tconn->cstate > C_STANDALONE) {
retcode = ERR_NET_CONFIGURED;
goto fail;
}
- /* allocation not in the IO path, cqueue thread context */
- new_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ /* allocation not in the IO path, drbdsetup / netlink process context */
+ new_conf = kzalloc(sizeof(*new_conf), GFP_KERNEL);
if (!new_conf) {
retcode = ERR_NOMEM;
goto fail;
}
- new_conf->timeout = DRBD_TIMEOUT_DEF;
- new_conf->try_connect_int = DRBD_CONNECT_INT_DEF;
- new_conf->ping_int = DRBD_PING_INT_DEF;
- new_conf->max_epoch_size = DRBD_MAX_EPOCH_SIZE_DEF;
- new_conf->max_buffers = DRBD_MAX_BUFFERS_DEF;
- new_conf->unplug_watermark = DRBD_UNPLUG_WATERMARK_DEF;
- new_conf->sndbuf_size = DRBD_SNDBUF_SIZE_DEF;
- new_conf->rcvbuf_size = DRBD_RCVBUF_SIZE_DEF;
- new_conf->ko_count = DRBD_KO_COUNT_DEF;
- new_conf->after_sb_0p = DRBD_AFTER_SB_0P_DEF;
- new_conf->after_sb_1p = DRBD_AFTER_SB_1P_DEF;
- new_conf->after_sb_2p = DRBD_AFTER_SB_2P_DEF;
- new_conf->want_lose = 0;
- new_conf->two_primaries = 0;
- new_conf->wire_protocol = DRBD_PROT_C;
- new_conf->ping_timeo = DRBD_PING_TIMEO_DEF;
- new_conf->rr_conflict = DRBD_RR_CONFLICT_DEF;
- new_conf->on_congestion = DRBD_ON_CONGESTION_DEF;
- new_conf->cong_extents = DRBD_CONG_EXTENTS_DEF;
+ set_net_conf_defaults(new_conf);
- if (!net_conf_from_tags(mdev, nlp->tag_list, new_conf)) {
+ err = net_conf_from_attrs(new_conf, info);
+ if (err && err != -ENOMSG) {
retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
goto fail;
}
- if (new_conf->two_primaries
- && (new_conf->wire_protocol != DRBD_PROT_C)) {
- retcode = ERR_NOT_PROTO_C;
+ retcode = check_net_options(tconn, new_conf);
+ if (retcode != NO_ERROR)
goto fail;
- }
- if (get_ldev(mdev)) {
- enum drbd_fencing_p fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- if (new_conf->wire_protocol == DRBD_PROT_A && fp == FP_STONITH) {
- retcode = ERR_STONITH_AND_PROT_A;
- goto fail;
- }
- }
-
- if (new_conf->on_congestion != OC_BLOCK && new_conf->wire_protocol != DRBD_PROT_A) {
- retcode = ERR_CONG_NOT_PROTO_A;
+ retcode = alloc_crypto(&crypto, new_conf);
+ if (retcode != NO_ERROR)
goto fail;
- }
-
- if (mdev->state.role == R_PRIMARY && new_conf->want_lose) {
- retcode = ERR_DISCARD;
- goto fail;
- }
-
- retcode = NO_ERROR;
-
- new_my_addr = (struct sockaddr *)&new_conf->my_addr;
- new_peer_addr = (struct sockaddr *)&new_conf->peer_addr;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev || odev == mdev)
- continue;
- if (get_net_conf(odev)) {
- taken_addr = (struct sockaddr *)&odev->net_conf->my_addr;
- if (new_conf->my_addr_len == odev->net_conf->my_addr_len &&
- !memcmp(new_my_addr, taken_addr, new_conf->my_addr_len))
- retcode = ERR_LOCAL_ADDR;
-
- taken_addr = (struct sockaddr *)&odev->net_conf->peer_addr;
- if (new_conf->peer_addr_len == odev->net_conf->peer_addr_len &&
- !memcmp(new_peer_addr, taken_addr, new_conf->peer_addr_len))
- retcode = ERR_PEER_ADDR;
-
- put_net_conf(odev);
- if (retcode != NO_ERROR)
- goto fail;
- }
- }
-
- if (new_conf->cram_hmac_alg[0] != 0) {
- snprintf(hmac_name, CRYPTO_MAX_ALG_NAME, "hmac(%s)",
- new_conf->cram_hmac_alg);
- tfm = crypto_alloc_hash(hmac_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(tfm)) {
- tfm = NULL;
- retcode = ERR_AUTH_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- retcode = ERR_AUTH_ALG_ND;
- goto fail;
- }
- }
-
- if (new_conf->integrity_alg[0]) {
- integrity_w_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_w_tfm)) {
- integrity_w_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(integrity_w_tfm))) {
- retcode=ERR_INTEGRITY_ALG_ND;
- goto fail;
- }
-
- integrity_r_tfm = crypto_alloc_hash(new_conf->integrity_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(integrity_r_tfm)) {
- integrity_r_tfm = NULL;
- retcode=ERR_INTEGRITY_ALG;
- goto fail;
- }
- }
-
- ns = new_conf->max_epoch_size/8;
- if (mdev->tl_hash_s != ns) {
- new_tl_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_tl_hash) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
-
- ns = new_conf->max_buffers/8;
- if (new_conf->two_primaries && (mdev->ee_hash_s != ns)) {
- new_ee_hash = kzalloc(ns*sizeof(void *), GFP_KERNEL);
- if (!new_ee_hash) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
((char *)new_conf->shared_secret)[SHARED_SECRET_MAX-1] = 0;
- if (integrity_w_tfm) {
- i = crypto_hash_digestsize(integrity_w_tfm);
- int_dig_out = kmalloc(i, GFP_KERNEL);
- if (!int_dig_out) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_in = kmalloc(i, GFP_KERNEL);
- if (!int_dig_in) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- int_dig_vv = kmalloc(i, GFP_KERNEL);
- if (!int_dig_vv) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
+ conn_flush_workqueue(tconn);
- if (!mdev->bitmap) {
- if(drbd_bm_init(mdev)) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
-
- drbd_flush_workqueue(mdev);
- spin_lock_irq(&mdev->req_lock);
- if (mdev->net_conf != NULL) {
+ mutex_lock(&tconn->conf_update);
+ old_conf = tconn->net_conf;
+ if (old_conf) {
retcode = ERR_NET_CONFIGURED;
- spin_unlock_irq(&mdev->req_lock);
+ mutex_unlock(&tconn->conf_update);
goto fail;
}
- mdev->net_conf = new_conf;
+ rcu_assign_pointer(tconn->net_conf, new_conf);
- mdev->send_cnt = 0;
- mdev->recv_cnt = 0;
+ conn_free_crypto(tconn);
+ tconn->cram_hmac_tfm = crypto.cram_hmac_tfm;
+ tconn->integrity_tfm = crypto.integrity_tfm;
+ tconn->csums_tfm = crypto.csums_tfm;
+ tconn->verify_tfm = crypto.verify_tfm;
- if (new_tl_hash) {
- kfree(mdev->tl_hash);
- mdev->tl_hash_s = mdev->net_conf->max_epoch_size/8;
- mdev->tl_hash = new_tl_hash;
+ tconn->my_addr_len = nla_len(adm_ctx.my_addr);
+ memcpy(&tconn->my_addr, nla_data(adm_ctx.my_addr), tconn->my_addr_len);
+ tconn->peer_addr_len = nla_len(adm_ctx.peer_addr);
+ memcpy(&tconn->peer_addr, nla_data(adm_ctx.peer_addr), tconn->peer_addr_len);
+
+ mutex_unlock(&tconn->conf_update);
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, i) {
+ mdev->send_cnt = 0;
+ mdev->recv_cnt = 0;
}
+ rcu_read_unlock();
- if (new_ee_hash) {
- kfree(mdev->ee_hash);
- mdev->ee_hash_s = mdev->net_conf->max_buffers/8;
- mdev->ee_hash = new_ee_hash;
- }
+ retcode = conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = tfm;
-
- crypto_free_hash(mdev->integrity_w_tfm);
- mdev->integrity_w_tfm = integrity_w_tfm;
-
- crypto_free_hash(mdev->integrity_r_tfm);
- mdev->integrity_r_tfm = integrity_r_tfm;
-
- kfree(mdev->int_dig_out);
- kfree(mdev->int_dig_in);
- kfree(mdev->int_dig_vv);
- mdev->int_dig_out=int_dig_out;
- mdev->int_dig_in=int_dig_in;
- mdev->int_dig_vv=int_dig_vv;
- retcode = _drbd_set_state(_NS(mdev, conn, C_UNCONNECTED), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
-
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(tconn);
+ drbd_adm_finish(info, retcode);
return 0;
fail:
- kfree(int_dig_out);
- kfree(int_dig_in);
- kfree(int_dig_vv);
- crypto_free_hash(tfm);
- crypto_free_hash(integrity_w_tfm);
- crypto_free_hash(integrity_r_tfm);
- kfree(new_tl_hash);
- kfree(new_ee_hash);
+ free_crypto(&crypto);
kfree(new_conf);
- reply->ret_code = retcode;
- drbd_reconfig_done(mdev);
+ conn_reconfig_done(tconn);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_disconnect(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+static enum drbd_state_rv conn_try_disconnect(struct drbd_tconn *tconn, bool force)
{
- int retcode;
- struct disconnect dc;
+ enum drbd_state_rv rv;
- memset(&dc, 0, sizeof(struct disconnect));
- if (!disconnect_from_tags(mdev, nlp->tag_list, &dc)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
- }
+ rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+ force ? CS_HARD : 0);
- if (dc.force) {
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.conn >= C_WF_CONNECTION)
- _drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), CS_HARD, NULL);
- spin_unlock_irq(&mdev->req_lock);
- goto done;
- }
-
- retcode = _drbd_request_state(mdev, NS(conn, C_DISCONNECTING), CS_ORDERED);
-
- if (retcode == SS_NOTHING_TO_DO)
- goto done;
- else if (retcode == SS_ALREADY_STANDALONE)
- goto done;
- else if (retcode == SS_PRIMARY_NOP) {
- /* Our statche checking code wants to see the peer outdated. */
- retcode = drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- pdsk, D_OUTDATED));
- } else if (retcode == SS_CW_FAILED_BY_PEER) {
+ switch (rv) {
+ case SS_NOTHING_TO_DO:
+ break;
+ case SS_ALREADY_STANDALONE:
+ return SS_SUCCESS;
+ case SS_PRIMARY_NOP:
+ /* Our state checking code wants to see the peer outdated. */
+ rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+ pdsk, D_OUTDATED), CS_VERBOSE);
+ break;
+ case SS_CW_FAILED_BY_PEER:
/* The peer probably wants to see us outdated. */
- retcode = _drbd_request_state(mdev, NS2(conn, C_DISCONNECTING,
- disk, D_OUTDATED),
- CS_ORDERED);
- if (retcode == SS_IS_DISKLESS || retcode == SS_LOWER_THAN_OUTDATED) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- retcode = SS_SUCCESS;
+ rv = conn_request_state(tconn, NS2(conn, C_DISCONNECTING,
+ disk, D_OUTDATED), 0);
+ if (rv == SS_IS_DISKLESS || rv == SS_LOWER_THAN_OUTDATED) {
+ rv = conn_request_state(tconn, NS(conn, C_DISCONNECTING),
+ CS_HARD);
+ }
+ break;
+ default:;
+ /* no special handling necessary */
+ }
+
+ if (rv >= SS_SUCCESS) {
+ enum drbd_state_rv rv2;
+ /* No one else can reconfigure the network while I am here.
+ * The state handling only uses drbd_thread_stop_nowait(),
+ * we want to really wait here until the receiver is no more.
+ */
+ drbd_thread_stop(&adm_ctx.tconn->receiver);
+
+ /* Race breaker. This additional state change request may be
+ * necessary, if this was a forced disconnect during a receiver
+ * restart. We may have "killed" the receiver thread just
+ * after drbdd_init() returned. Typically, we should be
+ * C_STANDALONE already, now, and this becomes a no-op.
+ */
+ rv2 = conn_request_state(tconn, NS(conn, C_STANDALONE),
+ CS_VERBOSE | CS_HARD);
+ if (rv2 < SS_SUCCESS)
+ conn_err(tconn,
+ "unexpected rv2=%d in conn_try_disconnect()\n",
+ rv2);
+ }
+ return rv;
+}
+
+int drbd_adm_disconnect(struct sk_buff *skb, struct genl_info *info)
+{
+ struct disconnect_parms parms;
+ struct drbd_tconn *tconn;
+ enum drbd_state_rv rv;
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_CONNECTION);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto fail;
+
+ tconn = adm_ctx.tconn;
+ memset(&parms, 0, sizeof(parms));
+ if (info->attrs[DRBD_NLA_DISCONNECT_PARMS]) {
+ err = disconnect_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
}
}
- if (retcode < SS_SUCCESS)
- goto fail;
-
- if (wait_event_interruptible(mdev->state_wait,
- mdev->state.conn != C_DISCONNECTING)) {
- /* Do not test for mdev->state.conn == C_STANDALONE, since
- someone else might connect us in the mean time! */
- retcode = ERR_INTR;
- goto fail;
- }
-
- done:
- retcode = NO_ERROR;
+ rv = conn_try_disconnect(tconn, parms.force_disconnect);
+ if (rv < SS_SUCCESS)
+ retcode = rv; /* FIXME: Type mismatch. */
+ else
+ retcode = NO_ERROR;
fail:
- drbd_md_sync(mdev);
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -1709,7 +2236,7 @@
if (mdev->state.role != mdev->state.peer)
iass = (mdev->state.role == R_PRIMARY);
else
- iass = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+ iass = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
if (iass)
drbd_start_resync(mdev, C_SYNC_SOURCE);
@@ -1717,20 +2244,34 @@
_drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE + CS_SERIALIZE);
}
-static int drbd_nl_resize(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
{
- struct resize rs;
- int retcode = NO_ERROR;
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+ struct resize_parms rs;
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
enum determine_dev_size dd;
enum dds_flags ddsf;
+ sector_t u_size;
+ int err;
- memset(&rs, 0, sizeof(struct resize));
- if (!resize_from_tags(mdev, nlp->tag_list, &rs)) {
- retcode = ERR_MANDATORY_TAG;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
goto fail;
+
+ memset(&rs, 0, sizeof(struct resize_parms));
+ if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
+ err = resize_parms_from_attrs(&rs, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
+ }
}
+ mdev = adm_ctx.mdev;
if (mdev->state.conn > C_CONNECTED) {
retcode = ERR_RESIZE_RESYNC;
goto fail;
@@ -1747,15 +2288,36 @@
goto fail;
}
- if (rs.no_resync && mdev->agreed_pro_version < 93) {
+ if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
retcode = ERR_NEED_APV_93;
goto fail_ldev;
}
+ rcu_read_lock();
+ u_size = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+ if (u_size != (sector_t)rs.resize_size) {
+ new_disk_conf = kmalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ retcode = ERR_NOMEM;
+ goto fail_ldev;
+ }
+ }
+
if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
- mdev->ldev->dc.disk_size = (sector_t)rs.resize_size;
+ if (new_disk_conf) {
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = (sector_t)rs.resize_size;
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+ }
+
ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
dd = drbd_determine_dev_size(mdev, ddsf);
drbd_md_sync(mdev);
@@ -1767,14 +2329,14 @@
if (mdev->state.conn == C_CONNECTED) {
if (dd == grew)
- drbd_set_flag(mdev, RESIZE_PENDING);
+ set_bit(RESIZE_PENDING, &mdev->flags);
drbd_send_uuids(mdev);
drbd_send_sizes(mdev, 1, ddsf);
}
fail:
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
fail_ldev:
@@ -1782,210 +2344,61 @@
goto fail;
}
-static int drbd_nl_syncer_conf(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resource_opts(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ enum drbd_ret_code retcode;
+ struct drbd_tconn *tconn;
+ struct res_opts res_opts;
int err;
- int ovr; /* online verify running */
- int rsr; /* re-sync running */
- struct crypto_hash *verify_tfm = NULL;
- struct crypto_hash *csums_tfm = NULL;
- struct syncer_conf sc;
- cpumask_var_t new_cpu_mask;
- int *rs_plan_s = NULL;
- int fifo_size;
- if (!zalloc_cpumask_var(&new_cpu_mask, GFP_KERNEL)) {
- retcode = ERR_NOMEM;
- goto fail;
- }
-
- if (nlp->flags & DRBD_NL_SET_DEFAULTS) {
- memset(&sc, 0, sizeof(struct syncer_conf));
- sc.rate = DRBD_RATE_DEF;
- sc.after = DRBD_AFTER_DEF;
- sc.al_extents = DRBD_AL_EXTENTS_DEF;
- sc.on_no_data = DRBD_ON_NO_DATA_DEF;
- sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
- sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
- sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
- sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
- sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
- } else
- memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
-
- if (!syncer_conf_from_tags(mdev, nlp->tag_list, &sc)) {
- retcode = ERR_MANDATORY_TAG;
- goto fail;
- }
-
- /* re-sync running */
- rsr = ( mdev->state.conn == C_SYNC_SOURCE ||
- mdev->state.conn == C_SYNC_TARGET ||
- mdev->state.conn == C_PAUSED_SYNC_S ||
- mdev->state.conn == C_PAUSED_SYNC_T );
-
- if (rsr && strcmp(sc.csums_alg, mdev->sync_conf.csums_alg)) {
- retcode = ERR_CSUMS_RESYNC_RUNNING;
- goto fail;
- }
-
- if (!rsr && sc.csums_alg[0]) {
- csums_tfm = crypto_alloc_hash(sc.csums_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(csums_tfm)) {
- csums_tfm = NULL;
- retcode = ERR_CSUMS_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(csums_tfm))) {
- retcode = ERR_CSUMS_ALG_ND;
- goto fail;
- }
- }
-
- /* online verify running */
- ovr = (mdev->state.conn == C_VERIFY_S || mdev->state.conn == C_VERIFY_T);
-
- if (ovr) {
- if (strcmp(sc.verify_alg, mdev->sync_conf.verify_alg)) {
- retcode = ERR_VERIFY_RUNNING;
- goto fail;
- }
- }
-
- if (!ovr && sc.verify_alg[0]) {
- verify_tfm = crypto_alloc_hash(sc.verify_alg, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(verify_tfm)) {
- verify_tfm = NULL;
- retcode = ERR_VERIFY_ALG;
- goto fail;
- }
-
- if (!drbd_crypto_is_hash(crypto_hash_tfm(verify_tfm))) {
- retcode = ERR_VERIFY_ALG_ND;
- goto fail;
- }
- }
-
- /* silently ignore cpu mask on UP kernel */
- if (nr_cpu_ids > 1 && sc.cpu_mask[0] != 0) {
- err = bitmap_parse(sc.cpu_mask, 32,
- cpumask_bits(new_cpu_mask), nr_cpu_ids);
- if (err) {
- dev_warn(DEV, "bitmap_parse() failed with %d\n", err);
- retcode = ERR_CPU_MASK_PARSE;
- goto fail;
- }
- }
-
- ERR_IF (sc.rate < 1) sc.rate = 1;
- ERR_IF (sc.al_extents < 7) sc.al_extents = 127; /* arbitrary minimum */
-#define AL_MAX ((MD_AL_MAX_SIZE-1) * AL_EXTENTS_PT)
- if (sc.al_extents > AL_MAX) {
- dev_err(DEV, "sc.al_extents > %d\n", AL_MAX);
- sc.al_extents = AL_MAX;
- }
-#undef AL_MAX
-
- /* to avoid spurious errors when configuring minors before configuring
- * the minors they depend on: if necessary, first create the minor we
- * depend on */
- if (sc.after >= 0)
- ensure_mdev(sc.after, 1);
-
- /* most sanity checks done, try to assign the new sync-after
- * dependency. need to hold the global lock in there,
- * to avoid a race in the dependency loop check. */
- retcode = drbd_alter_sa(mdev, sc.after);
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
if (retcode != NO_ERROR)
goto fail;
+ tconn = adm_ctx.tconn;
- fifo_size = (sc.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_KERNEL);
- if (!rs_plan_s) {
- dev_err(DEV, "kmalloc of fifo_buffer failed");
+ res_opts = tconn->res_opts;
+ if (should_set_defaults(info))
+ set_res_opts_defaults(&res_opts);
+
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto fail;
+ }
+
+ err = set_resource_options(tconn, &res_opts);
+ if (err) {
+ retcode = ERR_INVALID_REQUEST;
+ if (err == -ENOMEM)
retcode = ERR_NOMEM;
- goto fail;
- }
}
- /* ok, assign the rest of it as well.
- * lock against receive_SyncParam() */
- spin_lock(&mdev->peer_seq_lock);
- mdev->sync_conf = sc;
-
- if (!rsr) {
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- csums_tfm = NULL;
- }
-
- if (!ovr) {
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- verify_tfm = NULL;
- }
-
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
- rs_plan_s = NULL;
- }
-
- spin_unlock(&mdev->peer_seq_lock);
-
- if (get_ldev(mdev)) {
- wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
- drbd_al_shrink(mdev);
- err = drbd_check_al_size(mdev);
- lc_unlock(mdev->act_log);
- wake_up(&mdev->al_wait);
-
- put_ldev(mdev);
- drbd_md_sync(mdev);
-
- if (err) {
- retcode = ERR_NOMEM;
- goto fail;
- }
- }
-
- if (mdev->state.conn >= C_CONNECTED)
- drbd_send_sync_param(mdev, &sc);
-
- if (!cpumask_equal(mdev->cpu_mask, new_cpu_mask)) {
- cpumask_copy(mdev->cpu_mask, new_cpu_mask);
- drbd_calc_cpu_mask(mdev);
- mdev->receiver.reset_cpu_mask = 1;
- mdev->asender.reset_cpu_mask = 1;
- mdev->worker.reset_cpu_mask = 1;
- }
-
- kobject_uevent(&disk_to_dev(mdev->vdisk)->kobj, KOBJ_CHANGE);
fail:
- kfree(rs_plan_s);
- free_cpumask_var(new_cpu_mask);
- crypto_free_hash(csums_tfm);
- crypto_free_hash(verify_tfm);
- reply->ret_code = retcode;
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_invalidate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ struct drbd_conf *mdev;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
/* If there is still bitmap IO pending, probably because of a previous
* resync just being finished, wait for it before requesting a new resync.
* Also wait for it's after_state_ch(). */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T), CS_ORDERED);
@@ -1994,10 +2407,10 @@
retcode = drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_T));
while (retcode == SS_NEED_CONNECTION) {
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn < C_CONNECTED)
retcode = _drbd_set_state(_NS(mdev, disk, D_INCONSISTENT), CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (retcode != SS_NEED_CONNECTION)
break;
@@ -2006,7 +2419,25 @@
}
drbd_resume_io(mdev);
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+static int drbd_adm_simple_request_state(struct sk_buff *skb, struct genl_info *info,
+ union drbd_state mask, union drbd_state val)
+{
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ retcode = drbd_request_state(adm_ctx.mdev, mask, val);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
@@ -2019,29 +2450,36 @@
return rv;
}
-static int drbd_nl_invalidate_peer(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_invalidate_peer(struct sk_buff *skb, struct genl_info *info)
{
- int retcode;
+ int retcode; /* drbd_ret_code, drbd_state_rv */
+ struct drbd_conf *mdev;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
/* If there is still bitmap IO pending, probably because of a previous
* resync just being finished, wait for it before requesting a new resync.
* Also wait for it's after_state_ch(). */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
drbd_flush_workqueue(mdev);
retcode = _drbd_request_state(mdev, NS(conn, C_STARTING_SYNC_S), CS_ORDERED);
-
if (retcode < SS_SUCCESS) {
if (retcode == SS_NEED_CONNECTION && mdev->state.role == R_PRIMARY) {
- /* The peer will get a resync upon connect anyways. Just make that
- into a full resync. */
+ /* The peer will get a resync upon connect anyways.
+ * Just make that into a full resync. */
retcode = drbd_request_state(mdev, NS(pdsk, D_INCONSISTENT));
if (retcode >= SS_SUCCESS) {
if (drbd_bitmap_io(mdev, &drbd_bmio_set_susp_al,
- "set_n_write from invalidate_peer",
- BM_LOCKED_SET_ALLOWED))
+ "set_n_write from invalidate_peer",
+ BM_LOCKED_SET_ALLOWED))
retcode = ERR_IO_MD_DISK;
}
} else
@@ -2049,30 +2487,41 @@
}
drbd_resume_io(mdev);
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_pause_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_pause_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ enum drbd_ret_code retcode;
- if (drbd_request_state(mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 1)) == SS_NOTHING_TO_DO)
retcode = ERR_PAUSE_IS_SET;
-
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_resume_sync(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_sync(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
- union drbd_state s;
+ union drbd_dev_state s;
+ enum drbd_ret_code retcode;
- if (drbd_request_state(mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
- s = mdev->state;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (drbd_request_state(adm_ctx.mdev, NS(user_isp, 0)) == SS_NOTHING_TO_DO) {
+ s = adm_ctx.mdev->state;
if (s.conn == C_PAUSED_SYNC_S || s.conn == C_PAUSED_SYNC_T) {
retcode = s.aftr_isp ? ERR_PIC_AFTER_DEP :
s.peer_isp ? ERR_PIC_PEER_DEP : ERR_PAUSE_IS_CLEAR;
@@ -2081,178 +2530,482 @@
}
}
- reply->ret_code = retcode;
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_suspend_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_suspend_io(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(susp, 1));
-
- return 0;
+ return drbd_adm_simple_request_state(skb, info, NS(susp, 1));
}
-static int drbd_nl_resume_io(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info)
{
- if (drbd_test_flag(mdev, NEW_CUR_UUID)) {
+ struct drbd_conf *mdev;
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
}
drbd_suspend_io(mdev);
- reply->ret_code = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
- if (reply->ret_code == SS_SUCCESS) {
+ retcode = drbd_request_state(mdev, NS3(susp, 0, susp_nod, 0, susp_fen, 0));
+ if (retcode == SS_SUCCESS) {
if (mdev->state.conn < C_CONNECTED)
- tl_clear(mdev);
+ tl_clear(mdev->tconn);
if (mdev->state.disk == D_DISKLESS || mdev->state.disk == D_FAILED)
- tl_restart(mdev, fail_frozen_disk_io);
+ tl_restart(mdev->tconn, FAIL_FROZEN_DISK_IO);
}
drbd_resume_io(mdev);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_outdate(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_outdate(struct sk_buff *skb, struct genl_info *info)
{
- reply->ret_code = drbd_request_state(mdev, NS(disk, D_OUTDATED));
+ return drbd_adm_simple_request_state(skb, info, NS(disk, D_OUTDATED));
+}
+
+int nla_put_drbd_cfg_context(struct sk_buff *skb, struct drbd_tconn *tconn, unsigned vnr)
+{
+ struct nlattr *nla;
+ nla = nla_nest_start(skb, DRBD_NLA_CFG_CONTEXT);
+ if (!nla)
+ goto nla_put_failure;
+ if (vnr != VOLUME_UNSPECIFIED &&
+ nla_put_u32(skb, T_ctx_volume, vnr))
+ goto nla_put_failure;
+ if (nla_put_string(skb, T_ctx_resource_name, tconn->name))
+ goto nla_put_failure;
+ if (tconn->my_addr_len &&
+ nla_put(skb, T_ctx_my_addr, tconn->my_addr_len, &tconn->my_addr))
+ goto nla_put_failure;
+ if (tconn->peer_addr_len &&
+ nla_put(skb, T_ctx_peer_addr, tconn->peer_addr_len, &tconn->peer_addr))
+ goto nla_put_failure;
+ nla_nest_end(skb, nla);
return 0;
+
+nla_put_failure:
+ if (nla)
+ nla_nest_cancel(skb, nla);
+ return -EMSGSIZE;
}
-static int drbd_nl_get_config(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
+ const struct sib_info *sib)
{
- unsigned short *tl;
+ struct state_info *si = NULL; /* for sizeof(si->member); */
+ struct net_conf *nc;
+ struct nlattr *nla;
+ int got_ldev;
+ int err = 0;
+ int exclude_sensitive;
- tl = reply->tag_list;
+ /* If sib != NULL, this is drbd_bcast_event, which anyone can listen
+ * to. So we better exclude_sensitive information.
+ *
+ * If sib == NULL, this is drbd_adm_get_status, executed synchronously
+ * in the context of the requesting user process. Exclude sensitive
+ * information, unless current has superuser.
+ *
+ * NOTE: for drbd_adm_get_status_all(), this is a netlink dump, and
+ * relies on the current implementation of netlink_dump(), which
+ * executes the dump callback successively from netlink_recvmsg(),
+ * always in the context of the receiving process */
+ exclude_sensitive = sib || !capable(CAP_SYS_ADMIN);
- if (get_ldev(mdev)) {
- tl = disk_conf_to_tags(mdev, &mdev->ldev->dc, tl);
- put_ldev(mdev);
- }
+ got_ldev = get_ldev(mdev);
- if (get_net_conf(mdev)) {
- tl = net_conf_to_tags(mdev, mdev->net_conf, tl);
- put_net_conf(mdev);
- }
- tl = syncer_conf_to_tags(mdev, &mdev->sync_conf, tl);
+ /* We need to add connection name and volume number information still.
+ * Minor number is in drbd_genlmsghdr. */
+ if (nla_put_drbd_cfg_context(skb, mdev->tconn, mdev->vnr))
+ goto nla_put_failure;
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ if (res_opts_to_skb(skb, &mdev->tconn->res_opts, exclude_sensitive))
+ goto nla_put_failure;
- return (int)((char *)tl - (char *)reply->tag_list);
-}
+ rcu_read_lock();
+ if (got_ldev)
+ if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
+ goto nla_put_failure;
-static int drbd_nl_get_state(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- unsigned short *tl = reply->tag_list;
- union drbd_state s = mdev->state;
- unsigned long rs_left;
- unsigned int res;
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc)
+ err = net_conf_to_skb(skb, nc, exclude_sensitive);
+ rcu_read_unlock();
+ if (err)
+ goto nla_put_failure;
- tl = get_state_to_tags(mdev, (struct get_state *)&s, tl);
+ nla = nla_nest_start(skb, DRBD_NLA_STATE_INFO);
+ if (!nla)
+ goto nla_put_failure;
+ if (nla_put_u32(skb, T_sib_reason, sib ? sib->sib_reason : SIB_GET_STATUS_REPLY) ||
+ nla_put_u32(skb, T_current_state, mdev->state.i) ||
+ nla_put_u64(skb, T_ed_uuid, mdev->ed_uuid) ||
+ nla_put_u64(skb, T_capacity, drbd_get_capacity(mdev->this_bdev)) ||
+ nla_put_u64(skb, T_send_cnt, mdev->send_cnt) ||
+ nla_put_u64(skb, T_recv_cnt, mdev->recv_cnt) ||
+ nla_put_u64(skb, T_read_cnt, mdev->read_cnt) ||
+ nla_put_u64(skb, T_writ_cnt, mdev->writ_cnt) ||
+ nla_put_u64(skb, T_al_writ_cnt, mdev->al_writ_cnt) ||
+ nla_put_u64(skb, T_bm_writ_cnt, mdev->bm_writ_cnt) ||
+ nla_put_u32(skb, T_ap_bio_cnt, atomic_read(&mdev->ap_bio_cnt)) ||
+ nla_put_u32(skb, T_ap_pending_cnt, atomic_read(&mdev->ap_pending_cnt)) ||
+ nla_put_u32(skb, T_rs_pending_cnt, atomic_read(&mdev->rs_pending_cnt)))
+ goto nla_put_failure;
- /* no local ref, no bitmap, no syncer progress. */
- if (s.conn >= C_SYNC_SOURCE && s.conn <= C_PAUSED_SYNC_T) {
- if (get_ldev(mdev)) {
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_ldev(mdev);
+ if (got_ldev) {
+ int err;
+
+ spin_lock_irq(&mdev->ldev->md.uuid_lock);
+ err = nla_put(skb, T_uuids, sizeof(si->uuids), mdev->ldev->md.uuid);
+ spin_unlock_irq(&mdev->ldev->md.uuid_lock);
+
+ if (err)
+ goto nla_put_failure;
+
+ if (nla_put_u32(skb, T_disk_flags, mdev->ldev->md.flags) ||
+ nla_put_u64(skb, T_bits_total, drbd_bm_bits(mdev)) ||
+ nla_put_u64(skb, T_bits_oos, drbd_bm_total_weight(mdev)))
+ goto nla_put_failure;
+ if (C_SYNC_SOURCE <= mdev->state.conn &&
+ C_PAUSED_SYNC_T >= mdev->state.conn) {
+ if (nla_put_u64(skb, T_bits_rs_total, mdev->rs_total) ||
+ nla_put_u64(skb, T_bits_rs_failed, mdev->rs_failed))
+ goto nla_put_failure;
}
}
- put_unaligned(TT_END, tl++); /* Close the tag list */
- return (int)((char *)tl - (char *)reply->tag_list);
-}
+ if (sib) {
+ switch(sib->sib_reason) {
+ case SIB_SYNC_PROGRESS:
+ case SIB_GET_STATUS_REPLY:
+ break;
+ case SIB_STATE_CHANGE:
+ if (nla_put_u32(skb, T_prev_state, sib->os.i) ||
+ nla_put_u32(skb, T_new_state, sib->ns.i))
+ goto nla_put_failure;
+ break;
+ case SIB_HELPER_POST:
+ if (nla_put_u32(skb, T_helper_exit_code,
+ sib->helper_exit_code))
+ goto nla_put_failure;
+ /* fall through */
+ case SIB_HELPER_PRE:
+ if (nla_put_string(skb, T_helper, sib->helper_name))
+ goto nla_put_failure;
+ break;
+ }
+ }
+ nla_nest_end(skb, nla);
-static int drbd_nl_get_uuids(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
-{
- unsigned short *tl;
-
- tl = reply->tag_list;
-
- if (get_ldev(mdev)) {
- unsigned long flags;
- spin_lock_irqsave(&mdev->ldev->md.uuid_lock, flags);
- tl = tl_add_blob(tl, T_uuids, mdev->ldev->md.uuid, UI_SIZE*sizeof(u64));
- tl = tl_add_int(tl, T_uuids_flags, &mdev->ldev->md.flags);
- spin_unlock_irqrestore(&mdev->ldev->md.uuid_lock, flags);
+ if (0)
+nla_put_failure:
+ err = -EMSGSIZE;
+ if (got_ldev)
put_ldev(mdev);
- }
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- return (int)((char *)tl - (char *)reply->tag_list);
+ return err;
}
-/**
- * drbd_nl_get_timeout_flag() - Used by drbdsetup to find out which timeout value to use
- * @mdev: DRBD device.
- * @nlp: Netlink/connector packet from drbdsetup
- * @reply: Reply packet for drbdsetup
+int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ err = nla_put_status_info(adm_ctx.reply_skb, adm_ctx.mdev, NULL);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
+ }
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int get_one_status(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct drbd_conf *mdev;
+ struct drbd_genlmsghdr *dh;
+ struct drbd_tconn *pos = (struct drbd_tconn*)cb->args[0];
+ struct drbd_tconn *tconn = NULL;
+ struct drbd_tconn *tmp;
+ unsigned volume = cb->args[1];
+
+ /* Open coded, deferred, iteration:
+ * list_for_each_entry_safe(tconn, tmp, &drbd_tconns, all_tconn) {
+ * idr_for_each_entry(&tconn->volumes, mdev, i) {
+ * ...
+ * }
+ * }
+ * where tconn is cb->args[0];
+ * and i is cb->args[1];
+ *
+ * cb->args[2] indicates if we shall loop over all resources,
+ * or just dump all volumes of a single resource.
+ *
+ * This may miss entries inserted after this dump started,
+ * or entries deleted before they are reached.
+ *
+ * We need to make sure the mdev won't disappear while
+ * we are looking at it, and revalidate our iterators
+ * on each iteration.
+ */
+
+ /* synchronize with conn_create()/conn_destroy() */
+ rcu_read_lock();
+ /* revalidate iterator position */
+ list_for_each_entry_rcu(tmp, &drbd_tconns, all_tconn) {
+ if (pos == NULL) {
+ /* first iteration */
+ pos = tmp;
+ tconn = pos;
+ break;
+ }
+ if (tmp == pos) {
+ tconn = pos;
+ break;
+ }
+ }
+ if (tconn) {
+next_tconn:
+ mdev = idr_get_next(&tconn->volumes, &volume);
+ if (!mdev) {
+ /* No more volumes to dump on this tconn.
+ * Advance tconn iterator. */
+ pos = list_entry_rcu(tconn->all_tconn.next,
+ struct drbd_tconn, all_tconn);
+ /* Did we dump any volume on this tconn yet? */
+ if (volume != 0) {
+ /* If we reached the end of the list,
+ * or only a single resource dump was requested,
+ * we are done. */
+ if (&pos->all_tconn == &drbd_tconns || cb->args[2])
+ goto out;
+ volume = 0;
+ tconn = pos;
+ goto next_tconn;
+ }
+ }
+
+ dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, &drbd_genl_family,
+ NLM_F_MULTI, DRBD_ADM_GET_STATUS);
+ if (!dh)
+ goto out;
+
+ if (!mdev) {
+ /* This is a tconn without a single volume.
+ * Suprisingly enough, it may have a network
+ * configuration. */
+ struct net_conf *nc;
+ dh->minor = -1U;
+ dh->ret_code = NO_ERROR;
+ if (nla_put_drbd_cfg_context(skb, tconn, VOLUME_UNSPECIFIED))
+ goto cancel;
+ nc = rcu_dereference(tconn->net_conf);
+ if (nc && net_conf_to_skb(skb, nc, 1) != 0)
+ goto cancel;
+ goto done;
+ }
+
+ D_ASSERT(mdev->vnr == volume);
+ D_ASSERT(mdev->tconn == tconn);
+
+ dh->minor = mdev_to_minor(mdev);
+ dh->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(skb, mdev, NULL)) {
+cancel:
+ genlmsg_cancel(skb, dh);
+ goto out;
+ }
+done:
+ genlmsg_end(skb, dh);
+ }
+
+out:
+ rcu_read_unlock();
+ /* where to start the next iteration */
+ cb->args[0] = (long)pos;
+ cb->args[1] = (pos == tconn) ? volume + 1 : 0;
+
+ /* No more tconns/volumes/minors found results in an empty skb.
+ * Which will terminate the dump. */
+ return skb->len;
+}
+
+/*
+ * Request status of all resources, or of all volumes within a single resource.
+ *
+ * This is a dump, as the answer may not fit in a single reply skb otherwise.
+ * Which means we cannot use the family->attrbuf or other such members, because
+ * dump is NOT protected by the genl_lock(). During dump, we only have access
+ * to the incoming skb, and need to opencode "parsing" of the nlattr payload.
+ *
+ * Once things are setup properly, we call into get_one_status().
*/
-static int drbd_nl_get_timeout_flag(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb)
{
- unsigned short *tl;
- char rv;
+ const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ;
+ struct nlattr *nla;
+ const char *resource_name;
+ struct drbd_tconn *tconn;
+ int maxtype;
- tl = reply->tag_list;
+ /* Is this a followup call? */
+ if (cb->args[0]) {
+ /* ... of a single resource dump,
+ * and the resource iterator has been advanced already? */
+ if (cb->args[2] && cb->args[2] != cb->args[0])
+ return 0; /* DONE. */
+ goto dump;
+ }
- rv = mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
- drbd_test_flag(mdev, USE_DEGR_WFC_T) ? UT_DEGRADED : UT_DEFAULT;
+ /* First call (from netlink_dump_start). We need to figure out
+ * which resource(s) the user wants us to dump. */
+ nla = nla_find(nlmsg_attrdata(cb->nlh, hdrlen),
+ nlmsg_attrlen(cb->nlh, hdrlen),
+ DRBD_NLA_CFG_CONTEXT);
- tl = tl_add_blob(tl, T_use_degraded, &rv, sizeof(rv));
- put_unaligned(TT_END, tl++); /* Close the tag list */
+ /* No explicit context given. Dump all. */
+ if (!nla)
+ goto dump;
+ maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1;
+ nla = drbd_nla_find_nested(maxtype, nla, __nla_type(T_ctx_resource_name));
+ if (IS_ERR(nla))
+ return PTR_ERR(nla);
+ /* context given, but no name present? */
+ if (!nla)
+ return -EINVAL;
+ resource_name = nla_data(nla);
+ tconn = conn_get_by_name(resource_name);
- return (int)((char *)tl - (char *)reply->tag_list);
+ if (!tconn)
+ return -ENODEV;
+
+ kref_put(&tconn->kref, &conn_destroy); /* get_one_status() (re)validates tconn by itself */
+
+ /* prime iterators, and set "filter" mode mark:
+ * only dump this tconn. */
+ cb->args[0] = (long)tconn;
+ /* cb->args[1] = 0; passed in this way. */
+ cb->args[2] = (long)tconn;
+
+dump:
+ return get_one_status(skb, cb);
}
-static int drbd_nl_start_ov(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info)
{
- /* default to resume from last known position, if possible */
- struct start_ov args = {
- .start_sector = mdev->ov_start_sector,
- .stop_sector = ULLONG_MAX,
- };
+ enum drbd_ret_code retcode;
+ struct timeout_parms tp;
+ int err;
- if (!start_ov_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ tp.timeout_type =
+ adm_ctx.mdev->state.pdsk == D_OUTDATED ? UT_PEER_OUTDATED :
+ test_bit(USE_DEGR_WFC_T, &adm_ctx.mdev->flags) ? UT_DEGRADED :
+ UT_DEFAULT;
+
+ err = timeout_parms_to_priv_skb(adm_ctx.reply_skb, &tp);
+ if (err) {
+ nlmsg_free(adm_ctx.reply_skb);
+ return err;
}
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_start_ov(struct sk_buff *skb, struct genl_info *info)
+{
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
+ struct start_ov_parms parms;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ mdev = adm_ctx.mdev;
+
+ /* resume from last known position, if possible */
+ parms.ov_start_sector = mdev->ov_start_sector;
+ parms.ov_stop_sector = ULLONG_MAX;
+ if (info->attrs[DRBD_NLA_START_OV_PARMS]) {
+ int err = start_ov_parms_from_attrs(&parms, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
+ }
+ }
+ /* w_make_ov_request expects position to be aligned */
+ mdev->ov_start_sector = parms.ov_start_sector & ~(BM_SECT_PER_BIT-1);
+ mdev->ov_stop_sector = parms.ov_stop_sector;
/* If there is still bitmap IO pending, e.g. previous resync or verify
* just being finished, wait for it before requesting a new resync. */
drbd_suspend_io(mdev);
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
-
- /* w_make_ov_request expects start position to be aligned */
- mdev->ov_start_sector = args.start_sector & ~(BM_SECT_PER_BIT-1);
- mdev->ov_stop_sector = args.stop_sector;
- reply->ret_code = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
+ retcode = drbd_request_state(mdev,NS(conn,C_VERIFY_S));
drbd_resume_io(mdev);
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-static int drbd_nl_new_c_uuid(struct drbd_conf *mdev, struct drbd_nl_cfg_req *nlp,
- struct drbd_nl_cfg_reply *reply)
+int drbd_adm_new_c_uuid(struct sk_buff *skb, struct genl_info *info)
{
- int retcode = NO_ERROR;
+ struct drbd_conf *mdev;
+ enum drbd_ret_code retcode;
int skip_initial_sync = 0;
int err;
+ struct new_c_uuid_parms args;
- struct new_c_uuid args;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out_nolock;
- memset(&args, 0, sizeof(struct new_c_uuid));
- if (!new_c_uuid_from_tags(mdev, nlp->tag_list, &args)) {
- reply->ret_code = ERR_MANDATORY_TAG;
- return 0;
+ mdev = adm_ctx.mdev;
+ memset(&args, 0, sizeof(args));
+ if (info->attrs[DRBD_NLA_NEW_C_UUID_PARMS]) {
+ err = new_c_uuid_parms_from_attrs(&args, info);
+ if (err) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out_nolock;
+ }
}
- mutex_lock(&mdev->state_mutex); /* Protects us against serialized state changes. */
+ mutex_lock(mdev->state_mutex); /* Protects us against serialized state changes. */
if (!get_ldev(mdev)) {
retcode = ERR_NO_DISK;
@@ -2260,7 +3013,7 @@
}
/* this is "skip initial sync", assume to be clean */
- if (mdev->state.conn == C_CONNECTED && mdev->agreed_pro_version >= 90 &&
+ if (mdev->state.conn == C_CONNECTED && mdev->tconn->agreed_pro_version >= 90 &&
mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED && args.clear_bm) {
dev_info(DEV, "Preparing to skip initial sync\n");
skip_initial_sync = 1;
@@ -2283,10 +3036,10 @@
drbd_send_uuids_skip_initial_sync(mdev);
_drbd_uuid_set(mdev, UI_BITMAP, 0);
drbd_print_uuids(mdev, "cleared bitmap UUID");
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_set_state(_NS2(mdev, disk, D_UP_TO_DATE, pdsk, D_UP_TO_DATE),
CS_VERBOSE, NULL);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
}
@@ -2294,416 +3047,283 @@
out_dec:
put_ldev(mdev);
out:
- mutex_unlock(&mdev->state_mutex);
-
- reply->ret_code = retcode;
+ mutex_unlock(mdev->state_mutex);
+out_nolock:
+ drbd_adm_finish(info, retcode);
return 0;
}
-struct cn_handler_struct {
- int (*function)(struct drbd_conf *,
- struct drbd_nl_cfg_req *,
- struct drbd_nl_cfg_reply *);
- int reply_body_size;
-};
-
-static struct cn_handler_struct cnd_table[] = {
- [ P_primary ] = { &drbd_nl_primary, 0 },
- [ P_secondary ] = { &drbd_nl_secondary, 0 },
- [ P_disk_conf ] = { &drbd_nl_disk_conf, 0 },
- [ P_detach ] = { &drbd_nl_detach, 0 },
- [ P_net_conf ] = { &drbd_nl_net_conf, 0 },
- [ P_disconnect ] = { &drbd_nl_disconnect, 0 },
- [ P_resize ] = { &drbd_nl_resize, 0 },
- [ P_syncer_conf ] = { &drbd_nl_syncer_conf, 0 },
- [ P_invalidate ] = { &drbd_nl_invalidate, 0 },
- [ P_invalidate_peer ] = { &drbd_nl_invalidate_peer, 0 },
- [ P_pause_sync ] = { &drbd_nl_pause_sync, 0 },
- [ P_resume_sync ] = { &drbd_nl_resume_sync, 0 },
- [ P_suspend_io ] = { &drbd_nl_suspend_io, 0 },
- [ P_resume_io ] = { &drbd_nl_resume_io, 0 },
- [ P_outdate ] = { &drbd_nl_outdate, 0 },
- [ P_get_config ] = { &drbd_nl_get_config,
- sizeof(struct syncer_conf_tag_len_struct) +
- sizeof(struct disk_conf_tag_len_struct) +
- sizeof(struct net_conf_tag_len_struct) },
- [ P_get_state ] = { &drbd_nl_get_state,
- sizeof(struct get_state_tag_len_struct) +
- sizeof(struct sync_progress_tag_len_struct) },
- [ P_get_uuids ] = { &drbd_nl_get_uuids,
- sizeof(struct get_uuids_tag_len_struct) },
- [ P_get_timeout_flag ] = { &drbd_nl_get_timeout_flag,
- sizeof(struct get_timeout_flag_tag_len_struct)},
- [ P_start_ov ] = { &drbd_nl_start_ov, 0 },
- [ P_new_c_uuid ] = { &drbd_nl_new_c_uuid, 0 },
-};
-
-static void drbd_connector_callback(struct cn_msg *req, struct netlink_skb_parms *nsp)
+static enum drbd_ret_code
+drbd_check_resource_name(const char *name)
{
- struct drbd_nl_cfg_req *nlp = (struct drbd_nl_cfg_req *)req->data;
- struct cn_handler_struct *cm;
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- struct drbd_conf *mdev;
- int retcode, rr;
- int reply_size = sizeof(struct cn_msg)
- + sizeof(struct drbd_nl_cfg_reply)
- + sizeof(short int);
+ if (!name || !name[0]) {
+ drbd_msg_put_info("resource name missing");
+ return ERR_MANDATORY_TAG;
+ }
+ /* if we want to use these in sysfs/configfs/debugfs some day,
+ * we must not allow slashes */
+ if (strchr(name, '/')) {
+ drbd_msg_put_info("invalid resource name");
+ return ERR_INVALID_REQUEST;
+ }
+ return NO_ERROR;
+}
- if (!try_module_get(THIS_MODULE)) {
- printk(KERN_ERR "drbd: try_module_get() failed!\n");
- return;
+int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+ struct res_opts res_opts;
+ int err;
+
+ retcode = drbd_adm_prepare(skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ set_res_opts_defaults(&res_opts);
+ err = res_opts_from_attrs(&res_opts, info);
+ if (err && err != -ENOMSG) {
+ retcode = ERR_MANDATORY_TAG;
+ drbd_msg_put_info(from_attrs_err_to_txt(err));
+ goto out;
}
- if (!capable(CAP_SYS_ADMIN)) {
- retcode = ERR_PERM;
- goto fail;
+ retcode = drbd_check_resource_name(adm_ctx.resource_name);
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (adm_ctx.tconn) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL) {
+ retcode = ERR_INVALID_REQUEST;
+ drbd_msg_put_info("resource exists");
+ }
+ /* else: still NO_ERROR */
+ goto out;
}
- mdev = ensure_mdev(nlp->drbd_minor,
- (nlp->flags & DRBD_NL_CREATE_DEVICE));
- if (!mdev) {
- retcode = ERR_MINOR_INVALID;
- goto fail;
- }
-
- if (nlp->packet_type >= P_nl_after_last_packet ||
- nlp->packet_type == P_return_code_only) {
- retcode = ERR_PACKET_NR;
- goto fail;
- }
-
- cm = cnd_table + nlp->packet_type;
-
- /* This may happen if packet number is 0: */
- if (cm->function == NULL) {
- retcode = ERR_PACKET_NR;
- goto fail;
- }
-
- reply_size += cm->reply_body_size;
-
- /* allocation not in the IO path, cqueue thread context */
- cn_reply = kzalloc(reply_size, GFP_KERNEL);
- if (!cn_reply) {
+ if (!conn_create(adm_ctx.resource_name, &res_opts))
retcode = ERR_NOMEM;
- goto fail;
- }
- reply = (struct drbd_nl_cfg_reply *) cn_reply->data;
-
- reply->packet_type =
- cm->reply_body_size ? nlp->packet_type : P_return_code_only;
- reply->minor = nlp->drbd_minor;
- reply->ret_code = NO_ERROR; /* Might by modified by cm->function. */
- /* reply->tag_list; might be modified by cm->function. */
-
- rr = cm->function(mdev, nlp, reply);
-
- cn_reply->id = req->id;
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) + rr;
- cn_reply->flags = 0;
-
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_KERNEL);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
-
- kfree(cn_reply);
- module_put(THIS_MODULE);
- return;
- fail:
- drbd_nl_send_reply(req, retcode);
- module_put(THIS_MODULE);
-}
-
-static atomic_t drbd_nl_seq = ATOMIC_INIT(2); /* two. */
-
-static unsigned short *
-__tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data,
- unsigned short len, int nul_terminated)
-{
- unsigned short l = tag_descriptions[tag_number(tag)].max_len;
- len = (len < l) ? len : l;
- put_unaligned(tag, tl++);
- put_unaligned(len, tl++);
- memcpy(tl, data, len);
- tl = (unsigned short*)((char*)tl + len);
- if (nul_terminated)
- *((char*)tl - 1) = 0;
- return tl;
-}
-
-static unsigned short *
-tl_add_blob(unsigned short *tl, enum drbd_tags tag, const void *data, int len)
-{
- return __tl_add_blob(tl, tag, data, len, 0);
-}
-
-static unsigned short *
-tl_add_str(unsigned short *tl, enum drbd_tags tag, const char *str)
-{
- return __tl_add_blob(tl, tag, str, strlen(str)+1, 0);
-}
-
-static unsigned short *
-tl_add_int(unsigned short *tl, enum drbd_tags tag, const void *val)
-{
- put_unaligned(tag, tl++);
- switch(tag_type(tag)) {
- case TT_INTEGER:
- put_unaligned(sizeof(int), tl++);
- put_unaligned(*(int *)val, (int *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(int));
- break;
- case TT_INT64:
- put_unaligned(sizeof(u64), tl++);
- put_unaligned(*(u64 *)val, (u64 *)tl);
- tl = (unsigned short*)((char*)tl+sizeof(u64));
- break;
- default:
- /* someone did something stupid. */
- ;
- }
- return tl;
-}
-
-void drbd_bcast_state(struct drbd_conf *mdev, union drbd_state state)
-{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct get_state_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
-
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = get_state_to_tags(mdev, (struct get_state *)&state, tl);
-
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_get_state;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
-}
-
-void drbd_bcast_ev_helper(struct drbd_conf *mdev, char *helper_name)
-{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct call_helper_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
-
- /* dev_warn(DEV, "drbd_bcast_state() got called\n"); */
-
- tl = tl_add_str(tl, T_helper, helper_name);
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_call_helper;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
-}
-
-void drbd_bcast_ee(struct drbd_conf *mdev,
- const char *reason, const int dgs,
- const char* seen_hash, const char* calc_hash,
- const struct drbd_epoch_entry* e)
-{
- struct cn_msg *cn_reply;
- struct drbd_nl_cfg_reply *reply;
- unsigned short *tl;
- struct page *page;
- unsigned len;
-
- if (!e)
- return;
- if (!reason || !reason[0])
- return;
-
- /* apparently we have to memcpy twice, first to prepare the data for the
- * struct cn_msg, then within cn_netlink_send from the cn_msg to the
- * netlink skb. */
- /* receiver thread context, which is not in the writeout path (of this node),
- * but may be in the writeout path of the _other_ node.
- * GFP_NOIO to avoid potential "distributed deadlock". */
- cn_reply = kzalloc(
- sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct dump_ee_tag_len_struct)+
- sizeof(short int),
- GFP_NOIO);
-
- if (!cn_reply) {
- dev_err(DEV, "could not kmalloc buffer for drbd_bcast_ee, sector %llu, size %u\n",
- (unsigned long long)e->sector, e->size);
- return;
- }
-
- reply = (struct drbd_nl_cfg_reply*)cn_reply->data;
- tl = reply->tag_list;
-
- tl = tl_add_str(tl, T_dump_ee_reason, reason);
- tl = tl_add_blob(tl, T_seen_digest, seen_hash, dgs);
- tl = tl_add_blob(tl, T_calc_digest, calc_hash, dgs);
- tl = tl_add_int(tl, T_ee_sector, &e->sector);
- tl = tl_add_int(tl, T_ee_block_id, &e->block_id);
-
- /* dump the first 32k */
- len = min_t(unsigned, e->size, 32 << 10);
- put_unaligned(T_ee_data, tl++);
- put_unaligned(len, tl++);
-
- page = e->pages;
- page_chain_for_each(page) {
- void *d = kmap_atomic(page);
- unsigned l = min_t(unsigned, len, PAGE_SIZE);
- memcpy(tl, d, l);
- kunmap_atomic(d);
- tl = (unsigned short*)((char*)tl + l);
- len -= l;
- if (len == 0)
- break;
- }
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1,&drbd_nl_seq);
- cn_reply->ack = 0; // not used here.
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char*)tl - (char*)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_dump_ee;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- kfree(cn_reply);
-}
-
-void drbd_bcast_sync_progress(struct drbd_conf *mdev)
-{
- char buffer[sizeof(struct cn_msg)+
- sizeof(struct drbd_nl_cfg_reply)+
- sizeof(struct sync_progress_tag_len_struct)+
- sizeof(short int)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- unsigned short *tl = reply->tag_list;
- unsigned long rs_left;
- unsigned int res;
-
- /* no local ref, no bitmap, no syncer progress, no broadcast. */
- if (!get_ldev(mdev))
- return;
- drbd_get_syncer_progress(mdev, &rs_left, &res);
- put_ldev(mdev);
-
- tl = tl_add_int(tl, T_sync_progress, &res);
- put_unaligned(TT_END, tl++); /* Close the tag list */
-
- cn_reply->id.idx = CN_IDX_DRBD;
- cn_reply->id.val = CN_VAL_DRBD;
-
- cn_reply->seq = atomic_add_return(1, &drbd_nl_seq);
- cn_reply->ack = 0; /* not used here. */
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply) +
- (int)((char *)tl - (char *)reply->tag_list);
- cn_reply->flags = 0;
-
- reply->packet_type = P_sync_progress;
- reply->minor = mdev_to_minor(mdev);
- reply->ret_code = NO_ERROR;
-
- cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
-}
-
-int __init drbd_nl_init(void)
-{
- static struct cb_id cn_id_drbd;
- int err, try=10;
-
- cn_id_drbd.val = CN_VAL_DRBD;
- do {
- cn_id_drbd.idx = cn_idx;
- err = cn_add_callback(&cn_id_drbd, "cn_drbd", &drbd_connector_callback);
- if (!err)
- break;
- cn_idx = (cn_idx + CN_IDX_STEP);
- } while (try--);
-
- if (err) {
- printk(KERN_ERR "drbd: cn_drbd failed to register\n");
- return err;
- }
-
+out:
+ drbd_adm_finish(info, retcode);
return 0;
}
-void drbd_nl_cleanup(void)
+int drbd_adm_add_minor(struct sk_buff *skb, struct genl_info *info)
{
- static struct cb_id cn_id_drbd;
+ struct drbd_genlmsghdr *dh = info->userhdr;
+ enum drbd_ret_code retcode;
- cn_id_drbd.idx = cn_idx;
- cn_id_drbd.val = CN_VAL_DRBD;
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
- cn_del_callback(&cn_id_drbd);
+ if (dh->minor > MINORMASK) {
+ drbd_msg_put_info("requested minor out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+ if (adm_ctx.volume > DRBD_VOLUME_MAX) {
+ drbd_msg_put_info("requested volume id out of range");
+ retcode = ERR_INVALID_REQUEST;
+ goto out;
+ }
+
+ /* drbd_adm_prepare made sure already
+ * that mdev->tconn and mdev->vnr match the request. */
+ if (adm_ctx.mdev) {
+ if (info->nlhdr->nlmsg_flags & NLM_F_EXCL)
+ retcode = ERR_MINOR_EXISTS;
+ /* else: still NO_ERROR */
+ goto out;
+ }
+
+ retcode = conn_new_minor(adm_ctx.tconn, dh->minor, adm_ctx.volume);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
}
-void drbd_nl_send_reply(struct cn_msg *req, int ret_code)
+static enum drbd_ret_code adm_delete_minor(struct drbd_conf *mdev)
{
- char buffer[sizeof(struct cn_msg)+sizeof(struct drbd_nl_cfg_reply)];
- struct cn_msg *cn_reply = (struct cn_msg *) buffer;
- struct drbd_nl_cfg_reply *reply =
- (struct drbd_nl_cfg_reply *)cn_reply->data;
- int rr;
-
- memset(buffer, 0, sizeof(buffer));
- cn_reply->id = req->id;
-
- cn_reply->seq = req->seq;
- cn_reply->ack = req->ack + 1;
- cn_reply->len = sizeof(struct drbd_nl_cfg_reply);
- cn_reply->flags = 0;
-
- reply->packet_type = P_return_code_only;
- reply->minor = ((struct drbd_nl_cfg_req *)req->data)->drbd_minor;
- reply->ret_code = ret_code;
-
- rr = cn_netlink_send(cn_reply, CN_IDX_DRBD, GFP_NOIO);
- if (rr && rr != -ESRCH)
- printk(KERN_INFO "drbd: cn_netlink_send()=%d\n", rr);
+ if (mdev->state.disk == D_DISKLESS &&
+ /* no need to be mdev->state.conn == C_STANDALONE &&
+ * we may want to delete a minor from a live replication group.
+ */
+ mdev->state.role == R_SECONDARY) {
+ _drbd_request_state(mdev, NS(conn, C_WF_REPORT_PARAMS),
+ CS_VERBOSE + CS_WAIT_COMPLETE);
+ idr_remove(&mdev->tconn->volumes, mdev->vnr);
+ idr_remove(&minors, mdev_to_minor(mdev));
+ del_gendisk(mdev->vdisk);
+ synchronize_rcu();
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ return NO_ERROR;
+ } else
+ return ERR_MINOR_CONFIGURED;
}
+int drbd_adm_delete_minor(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_MINOR);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ retcode = adm_delete_minor(adm_ctx.mdev);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_down(struct sk_buff *skb, struct genl_info *info)
+{
+ int retcode; /* enum drbd_ret_code rsp. enum drbd_state_rv */
+ struct drbd_conf *mdev;
+ unsigned i;
+
+ retcode = drbd_adm_prepare(skb, info, 0);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (!adm_ctx.tconn) {
+ retcode = ERR_RES_NOT_KNOWN;
+ goto out;
+ }
+
+ /* demote */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = drbd_set_role(mdev, R_SECONDARY, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info("failed to demote");
+ goto out;
+ }
+ }
+
+ retcode = conn_try_disconnect(adm_ctx.tconn, 0);
+ if (retcode < SS_SUCCESS) {
+ drbd_msg_put_info("failed to disconnect");
+ goto out;
+ }
+
+ /* detach */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = adm_detach(mdev, 0);
+ if (retcode < SS_SUCCESS || retcode > NO_ERROR) {
+ drbd_msg_put_info("failed to detach");
+ goto out;
+ }
+ }
+
+ /* If we reach this, all volumes (of this tconn) are Secondary,
+ * Disconnected, Diskless, aka Unconfigured. Make sure all threads have
+ * actually stopped, state handling only does drbd_thread_stop_nowait(). */
+ drbd_thread_stop(&adm_ctx.tconn->worker);
+
+ /* Now, nothing can fail anymore */
+
+ /* delete volumes */
+ idr_for_each_entry(&adm_ctx.tconn->volumes, mdev, i) {
+ retcode = adm_delete_minor(mdev);
+ if (retcode != NO_ERROR) {
+ /* "can not happen" */
+ drbd_msg_put_info("failed to delete volume");
+ goto out;
+ }
+ }
+
+ /* delete connection */
+ if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+ list_del_rcu(&adm_ctx.tconn->all_tconn);
+ synchronize_rcu();
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+
+ retcode = NO_ERROR;
+ } else {
+ /* "can not happen" */
+ retcode = ERR_RES_IN_USE;
+ drbd_msg_put_info("failed to delete connection");
+ }
+ goto out;
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+int drbd_adm_del_resource(struct sk_buff *skb, struct genl_info *info)
+{
+ enum drbd_ret_code retcode;
+
+ retcode = drbd_adm_prepare(skb, info, DRBD_ADM_NEED_RESOURCE);
+ if (!adm_ctx.reply_skb)
+ return retcode;
+ if (retcode != NO_ERROR)
+ goto out;
+
+ if (conn_lowest_minor(adm_ctx.tconn) < 0) {
+ list_del_rcu(&adm_ctx.tconn->all_tconn);
+ synchronize_rcu();
+ kref_put(&adm_ctx.tconn->kref, &conn_destroy);
+
+ retcode = NO_ERROR;
+ } else {
+ retcode = ERR_RES_IN_USE;
+ }
+
+ if (retcode == NO_ERROR)
+ drbd_thread_stop(&adm_ctx.tconn->worker);
+out:
+ drbd_adm_finish(info, retcode);
+ return 0;
+}
+
+void drbd_bcast_event(struct drbd_conf *mdev, const struct sib_info *sib)
+{
+ static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */
+ struct sk_buff *msg;
+ struct drbd_genlmsghdr *d_out;
+ unsigned seq;
+ int err = -ENOMEM;
+
+ if (sib->sib_reason == SIB_SYNC_PROGRESS &&
+ time_after(jiffies, mdev->rs_last_bcast + HZ))
+ mdev->rs_last_bcast = jiffies;
+ else
+ return;
+
+ seq = atomic_inc_return(&drbd_genl_seq);
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO);
+ if (!msg)
+ goto failed;
+
+ err = -EMSGSIZE;
+ d_out = genlmsg_put(msg, 0, seq, &drbd_genl_family, 0, DRBD_EVENT);
+ if (!d_out) /* cannot happen, but anyways. */
+ goto nla_put_failure;
+ d_out->minor = mdev_to_minor(mdev);
+ d_out->ret_code = NO_ERROR;
+
+ if (nla_put_status_info(msg, mdev, sib))
+ goto nla_put_failure;
+ genlmsg_end(msg, d_out);
+ err = drbd_genl_multicast_events(msg, 0);
+ /* msg has been consumed or freed in netlink_broadcast() */
+ if (err && err != -ESRCH)
+ goto failed;
+
+ return;
+
+nla_put_failure:
+ nlmsg_free(msg);
+failed:
+ dev_err(DEV, "Error %d while broadcasting event. "
+ "Event seq:%u sib_reason:%u\n",
+ err, seq, sib->sib_reason);
+}
diff --git a/drivers/block/drbd/drbd_nla.c b/drivers/block/drbd/drbd_nla.c
new file mode 100644
index 0000000..fa672b6d
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.c
@@ -0,0 +1,55 @@
+#include "drbd_wrappers.h"
+#include <linux/kernel.h>
+#include <net/netlink.h>
+#include <linux/drbd_genl_api.h>
+#include "drbd_nla.h"
+
+static int drbd_nla_check_mandatory(int maxtype, struct nlattr *nla)
+{
+ struct nlattr *head = nla_data(nla);
+ int len = nla_len(nla);
+ int rem;
+
+ /*
+ * validate_nla (called from nla_parse_nested) ignores attributes
+ * beyond maxtype, and does not understand the DRBD_GENLA_F_MANDATORY flag.
+ * In order to have it validate attributes with the DRBD_GENLA_F_MANDATORY
+ * flag set also, check and remove that flag before calling
+ * nla_parse_nested.
+ */
+
+ nla_for_each_attr(nla, head, len, rem) {
+ if (nla->nla_type & DRBD_GENLA_F_MANDATORY) {
+ nla->nla_type &= ~DRBD_GENLA_F_MANDATORY;
+ if (nla_type(nla) > maxtype)
+ return -EOPNOTSUPP;
+ }
+ }
+ return 0;
+}
+
+int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy)
+{
+ int err;
+
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (!err)
+ err = nla_parse_nested(tb, maxtype, nla, policy);
+
+ return err;
+}
+
+struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype)
+{
+ int err;
+ /*
+ * If any nested attribute has the DRBD_GENLA_F_MANDATORY flag set and
+ * we don't know about that attribute, reject all the nested
+ * attributes.
+ */
+ err = drbd_nla_check_mandatory(maxtype, nla);
+ if (err)
+ return ERR_PTR(err);
+ return nla_find_nested(nla, attrtype);
+}
diff --git a/drivers/block/drbd/drbd_nla.h b/drivers/block/drbd/drbd_nla.h
new file mode 100644
index 0000000..679c2d5
--- /dev/null
+++ b/drivers/block/drbd/drbd_nla.h
@@ -0,0 +1,8 @@
+#ifndef __DRBD_NLA_H
+#define __DRBD_NLA_H
+
+extern int drbd_nla_parse_nested(struct nlattr *tb[], int maxtype, struct nlattr *nla,
+ const struct nla_policy *policy);
+extern struct nlattr *drbd_nla_find_nested(int maxtype, struct nlattr *nla, int attrtype);
+
+#endif /* __DRBD_NLA_H */
diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c
index 662bc8e..56672a6 100644
--- a/drivers/block/drbd/drbd_proc.c
+++ b/drivers/block/drbd/drbd_proc.c
@@ -171,7 +171,7 @@
if (mdev->state.conn == C_VERIFY_S ||
mdev->state.conn == C_VERIFY_T) {
bit_pos = bm_bits - mdev->ov_left;
- if (mdev->agreed_pro_version >= 97)
+ if (verify_can_do_stop_sector(mdev))
stop_sector = mdev->ov_stop_sector;
} else
bit_pos = mdev->bm_resync_fo;
@@ -200,9 +200,11 @@
static int drbd_seq_show(struct seq_file *seq, void *v)
{
- int i, hole = 0;
+ int i, prev_i = -1;
const char *sn;
struct drbd_conf *mdev;
+ struct net_conf *nc;
+ char wp;
static char write_ordering_chars[] = {
[WO_none] = 'n',
@@ -233,16 +235,11 @@
oos .. known out-of-sync kB
*/
- for (i = 0; i < minor_count; i++) {
- mdev = minor_to_mdev(i);
- if (!mdev) {
- hole = 1;
- continue;
- }
- if (hole) {
- hole = 0;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, mdev, i) {
+ if (prev_i != i - 1)
seq_printf(seq, "\n");
- }
+ prev_i = i;
sn = drbd_conn_str(mdev->state.conn);
@@ -254,6 +251,8 @@
/* reset mdev->congestion_reason */
bdi_rw_congested(&mdev->rq_queue->backing_dev_info);
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ wp = nc ? nc->wire_protocol - DRBD_PROT_A + 'A' : ' ';
seq_printf(seq,
"%2d: cs:%s ro:%s/%s ds:%s/%s %c %c%c%c%c%c%c\n"
" ns:%u nr:%u dw:%u dr:%u al:%u bm:%u "
@@ -263,14 +262,13 @@
drbd_role_str(mdev->state.peer),
drbd_disk_str(mdev->state.disk),
drbd_disk_str(mdev->state.pdsk),
- (mdev->net_conf == NULL ? ' ' :
- (mdev->net_conf->wire_protocol - DRBD_PROT_A+'A')),
- is_susp(mdev->state) ? 's' : 'r',
+ wp,
+ drbd_suspended(mdev) ? 's' : 'r',
mdev->state.aftr_isp ? 'a' : '-',
mdev->state.peer_isp ? 'p' : '-',
mdev->state.user_isp ? 'u' : '-',
mdev->congestion_reason ?: '-',
- drbd_test_flag(mdev, AL_SUSPENDED) ? 's' : '-',
+ test_bit(AL_SUSPENDED, &mdev->flags) ? 's' : '-',
mdev->send_cnt/2,
mdev->recv_cnt/2,
mdev->writ_cnt/2,
@@ -282,8 +280,8 @@
atomic_read(&mdev->rs_pending_cnt),
atomic_read(&mdev->unacked_cnt),
atomic_read(&mdev->ap_bio_cnt),
- mdev->epochs,
- write_ordering_chars[mdev->write_ordering]
+ mdev->tconn->epochs,
+ write_ordering_chars[mdev->tconn->write_ordering]
);
seq_printf(seq, " oos:%llu\n",
Bit2KB((unsigned long long)
@@ -308,6 +306,7 @@
}
}
}
+ rcu_read_unlock();
return 0;
}
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index eb0cafe..0331ad0 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -48,17 +48,25 @@
#include "drbd_vli.h"
+struct packet_info {
+ enum drbd_packet cmd;
+ unsigned int size;
+ unsigned int vnr;
+ void *data;
+};
+
enum finish_epoch {
FE_STILL_LIVE,
FE_DESTROYED,
FE_RECYCLED,
};
-static int drbd_do_handshake(struct drbd_conf *mdev);
-static int drbd_do_auth(struct drbd_conf *mdev);
+static int drbd_do_features(struct drbd_tconn *tconn);
+static int drbd_do_auth(struct drbd_tconn *tconn);
+static int drbd_disconnected(struct drbd_conf *mdev);
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *, struct drbd_epoch *, enum epoch_event);
-static int e_end_block(struct drbd_conf *, struct drbd_work *, int);
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *, struct drbd_epoch *, enum epoch_event);
+static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
@@ -142,11 +150,12 @@
*head = chain_first;
}
-static struct page *drbd_pp_first_pages_or_try_alloc(struct drbd_conf *mdev, int number)
+static struct page *__drbd_alloc_pages(struct drbd_conf *mdev,
+ unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
- int i = 0;
+ unsigned int i = 0;
/* Yes, testing drbd_pp_vacant outside the lock is racy.
* So what. It saves a spin_lock. */
@@ -175,7 +184,7 @@
return page;
/* Not enough pages immediately available this time.
- * No need to jump around here, drbd_pp_alloc will retry this
+ * No need to jump around here, drbd_alloc_pages will retry this
* function "soon". */
if (page) {
tmp = page_chain_tail(page, NULL);
@@ -187,9 +196,10 @@
return NULL;
}
-static void reclaim_net_ee(struct drbd_conf *mdev, struct list_head *to_be_freed)
+static void reclaim_finished_net_peer_reqs(struct drbd_conf *mdev,
+ struct list_head *to_be_freed)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct list_head *le, *tle;
/* The EEs are always appended to the end of the list. Since
@@ -198,8 +208,8 @@
stop to examine the list... */
list_for_each_safe(le, tle, &mdev->net_ee) {
- e = list_entry(le, struct drbd_epoch_entry, w.list);
- if (drbd_ee_has_active_page(e))
+ peer_req = list_entry(le, struct drbd_peer_request, w.list);
+ if (drbd_peer_req_has_active_page(peer_req))
break;
list_move(le, to_be_freed);
}
@@ -208,18 +218,18 @@
static void drbd_kick_lo_and_reclaim_net(struct drbd_conf *mdev)
{
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ reclaim_finished_net_peer_reqs(mdev, &reclaimed);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(mdev, peer_req);
}
/**
- * drbd_pp_alloc() - Returns @number pages, retries forever (or until signalled)
+ * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @mdev: DRBD device.
* @number: number of pages requested
* @retry: whether to retry, if not enough pages are available right now
@@ -230,23 +240,31 @@
*
* Returns a page chain linked via page->private.
*/
-static struct page *drbd_pp_alloc(struct drbd_conf *mdev, unsigned number, bool retry)
+struct page *drbd_alloc_pages(struct drbd_conf *mdev, unsigned int number,
+ bool retry)
{
struct page *page = NULL;
+ struct net_conf *nc;
DEFINE_WAIT(wait);
+ int mxb;
/* Yes, we may run up to @number over max_buffers. If we
* follow it strictly, the admin will get it wrong anyways. */
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers)
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ mxb = nc ? nc->max_buffers : 1000000;
+ rcu_read_unlock();
+
+ if (atomic_read(&mdev->pp_in_use) < mxb)
+ page = __drbd_alloc_pages(mdev, number);
while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
drbd_kick_lo_and_reclaim_net(mdev);
- if (atomic_read(&mdev->pp_in_use) < mdev->net_conf->max_buffers) {
- page = drbd_pp_first_pages_or_try_alloc(mdev, number);
+ if (atomic_read(&mdev->pp_in_use) < mxb) {
+ page = __drbd_alloc_pages(mdev, number);
if (page)
break;
}
@@ -255,7 +273,7 @@
break;
if (signal_pending(current)) {
- dev_warn(DEV, "drbd_pp_alloc interrupted!\n");
+ dev_warn(DEV, "drbd_alloc_pages interrupted!\n");
break;
}
@@ -268,11 +286,11 @@
return page;
}
-/* Must not be used from irq, as that may deadlock: see drbd_pp_alloc.
- * Is also used from inside an other spin_lock_irq(&mdev->req_lock);
+/* Must not be used from irq, as that may deadlock: see drbd_alloc_pages.
+ * Is also used from inside an other spin_lock_irq(&mdev->tconn->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
-static void drbd_pp_free(struct drbd_conf *mdev, struct page *page, int is_net)
+static void drbd_free_pages(struct drbd_conf *mdev, struct page *page, int is_net)
{
atomic_t *a = is_net ? &mdev->pp_in_use_by_net : &mdev->pp_in_use;
int i;
@@ -280,7 +298,7 @@
if (page == NULL)
return;
- if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE)*minor_count)
+ if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * minor_count)
i = page_chain_free(page);
else {
struct page *tmp;
@@ -302,127 +320,130 @@
_drbd_wait_ee_list_empty()
You must not have the req_lock:
- drbd_free_ee()
- drbd_alloc_ee()
- drbd_init_ee()
- drbd_release_ee()
+ drbd_free_peer_req()
+ drbd_alloc_peer_req()
+ drbd_free_peer_reqs()
drbd_ee_fix_bhs()
- drbd_process_done_ee()
+ drbd_finish_peer_reqs()
drbd_clear_done_ee()
drbd_wait_ee_list_empty()
*/
-struct drbd_epoch_entry *drbd_alloc_ee(struct drbd_conf *mdev,
- u64 id,
- sector_t sector,
- unsigned int data_size,
- gfp_t gfp_mask) __must_hold(local)
+struct drbd_peer_request *
+drbd_alloc_peer_req(struct drbd_conf *mdev, u64 id, sector_t sector,
+ unsigned int data_size, gfp_t gfp_mask) __must_hold(local)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct page *page = NULL;
unsigned nr_pages = (data_size + PAGE_SIZE -1) >> PAGE_SHIFT;
if (drbd_insert_fault(mdev, DRBD_FAULT_AL_EE))
return NULL;
- e = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
- if (!e) {
+ peer_req = mempool_alloc(drbd_ee_mempool, gfp_mask & ~__GFP_HIGHMEM);
+ if (!peer_req) {
if (!(gfp_mask & __GFP_NOWARN))
- dev_err(DEV, "alloc_ee: Allocation of an EE failed\n");
+ dev_err(DEV, "%s: allocation failed\n", __func__);
return NULL;
}
if (data_size) {
- page = drbd_pp_alloc(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
+ page = drbd_alloc_pages(mdev, nr_pages, (gfp_mask & __GFP_WAIT));
if (!page)
goto fail;
}
- INIT_HLIST_NODE(&e->collision);
- e->epoch = NULL;
- e->mdev = mdev;
- e->pages = page;
- atomic_set(&e->pending_bios, 0);
- e->size = data_size;
- e->flags = 0;
- e->sector = sector;
- e->block_id = id;
+ drbd_clear_interval(&peer_req->i);
+ peer_req->i.size = data_size;
+ peer_req->i.sector = sector;
+ peer_req->i.local = false;
+ peer_req->i.waiting = false;
- return e;
+ peer_req->epoch = NULL;
+ peer_req->w.mdev = mdev;
+ peer_req->pages = page;
+ atomic_set(&peer_req->pending_bios, 0);
+ peer_req->flags = 0;
+ /*
+ * The block_id is opaque to the receiver. It is not endianness
+ * converted, and sent back to the sender unchanged.
+ */
+ peer_req->block_id = id;
+
+ return peer_req;
fail:
- mempool_free(e, drbd_ee_mempool);
+ mempool_free(peer_req, drbd_ee_mempool);
return NULL;
}
-void drbd_free_some_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e, int is_net)
+void __drbd_free_peer_req(struct drbd_conf *mdev, struct drbd_peer_request *peer_req,
+ int is_net)
{
- if (e->flags & EE_HAS_DIGEST)
- kfree(e->digest);
- drbd_pp_free(mdev, e->pages, is_net);
- D_ASSERT(atomic_read(&e->pending_bios) == 0);
- D_ASSERT(hlist_unhashed(&e->collision));
- mempool_free(e, drbd_ee_mempool);
+ if (peer_req->flags & EE_HAS_DIGEST)
+ kfree(peer_req->digest);
+ drbd_free_pages(mdev, peer_req->pages, is_net);
+ D_ASSERT(atomic_read(&peer_req->pending_bios) == 0);
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
+ mempool_free(peer_req, drbd_ee_mempool);
}
-int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list)
+int drbd_free_peer_reqs(struct drbd_conf *mdev, struct list_head *list)
{
LIST_HEAD(work_list);
- struct drbd_epoch_entry *e, *t;
+ struct drbd_peer_request *peer_req, *t;
int count = 0;
int is_net = list == &mdev->net_ee;
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
list_splice_init(list, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &work_list, w.list) {
- drbd_free_some_ee(mdev, e, is_net);
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ __drbd_free_peer_req(mdev, peer_req, is_net);
count++;
}
return count;
}
-
/*
- * This function is called from _asender only_
- * but see also comments in _req_mod(,barrier_acked)
- * and receive_Barrier.
- *
- * Move entries from net_ee to done_ee, if ready.
- * Grab done_ee, call all callbacks, free the entries.
- * The callbacks typically send out ACKs.
+ * See also comments in _req_mod(,BARRIER_ACKED) and receive_Barrier.
*/
-static int drbd_process_done_ee(struct drbd_conf *mdev)
+static int drbd_finish_peer_reqs(struct drbd_conf *mdev)
{
LIST_HEAD(work_list);
LIST_HEAD(reclaimed);
- struct drbd_epoch_entry *e, *t;
- int ok = (mdev->state.conn >= C_WF_REPORT_PARAMS);
+ struct drbd_peer_request *peer_req, *t;
+ int err = 0;
- spin_lock_irq(&mdev->req_lock);
- reclaim_net_ee(mdev, &reclaimed);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ reclaim_finished_net_peer_reqs(mdev, &reclaimed);
list_splice_init(&mdev->done_ee, &work_list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- list_for_each_entry_safe(e, t, &reclaimed, w.list)
- drbd_free_net_ee(mdev, e);
+ list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
+ drbd_free_net_peer_req(mdev, peer_req);
/* possible callbacks here:
- * e_end_block, and e_end_resync_block, e_send_discard_ack.
+ * e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
*/
- list_for_each_entry_safe(e, t, &work_list, w.list) {
+ list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
+ int err2;
+
/* list_del not necessary, next/prev members not touched */
- ok = e->w.cb(mdev, &e->w, !ok) && ok;
- drbd_free_ee(mdev, e);
+ err2 = peer_req->w.cb(&peer_req->w, !!err);
+ if (!err)
+ err = err2;
+ drbd_free_peer_req(mdev, peer_req);
}
wake_up(&mdev->ee_wait);
- return ok;
+ return err;
}
-void _drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void _drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head)
{
DEFINE_WAIT(wait);
@@ -430,55 +451,22 @@
* and calling prepare_to_wait in the fast path */
while (!list_empty(head)) {
prepare_to_wait(&mdev->ee_wait, &wait, TASK_UNINTERRUPTIBLE);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
io_schedule();
finish_wait(&mdev->ee_wait, &wait);
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
}
}
-void drbd_wait_ee_list_empty(struct drbd_conf *mdev, struct list_head *head)
+static void drbd_wait_ee_list_empty(struct drbd_conf *mdev,
+ struct list_head *head)
{
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_wait_ee_list_empty(mdev, head);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
}
-/* see also kernel_accept; which is only present since 2.6.18.
- * also we want to log which part of it failed, exactly */
-static int drbd_accept(struct drbd_conf *mdev, const char **what,
- struct socket *sock, struct socket **newsock)
-{
- struct sock *sk = sock->sk;
- int err = 0;
-
- *what = "listen";
- err = sock->ops->listen(sock, 5);
- if (err < 0)
- goto out;
-
- *what = "sock_create_lite";
- err = sock_create_lite(sk->sk_family, sk->sk_type, sk->sk_protocol,
- newsock);
- if (err < 0)
- goto out;
-
- *what = "accept";
- err = sock->ops->accept(sock, *newsock, 0);
- if (err < 0) {
- sock_release(*newsock);
- *newsock = NULL;
- goto out;
- }
- (*newsock)->ops = sock->ops;
- __module_get((*newsock)->ops->owner);
-
-out:
- return err;
-}
-
-static int drbd_recv_short(struct drbd_conf *mdev, struct socket *sock,
- void *buf, size_t size, int flags)
+static int drbd_recv_short(struct socket *sock, void *buf, size_t size, int flags)
{
mm_segment_t oldfs;
struct kvec iov = {
@@ -500,48 +488,62 @@
return rv;
}
-static int drbd_recv(struct drbd_conf *mdev, void *buf, size_t size)
+static int drbd_recv(struct drbd_tconn *tconn, void *buf, size_t size)
{
- mm_segment_t oldfs;
- struct kvec iov = {
- .iov_base = buf,
- .iov_len = size,
- };
- struct msghdr msg = {
- .msg_iovlen = 1,
- .msg_iov = (struct iovec *)&iov,
- .msg_flags = MSG_WAITALL | MSG_NOSIGNAL
- };
int rv;
- oldfs = get_fs();
- set_fs(KERNEL_DS);
- rv = sock_recvmsg(mdev->data.socket, &msg, size, msg.msg_flags);
- set_fs(oldfs);
+ rv = drbd_recv_short(tconn->data.socket, buf, size, 0);
if (rv < 0) {
if (rv == -ECONNRESET)
- dev_info(DEV, "sock was reset by peer\n");
+ conn_info(tconn, "sock was reset by peer\n");
else if (rv != -ERESTARTSYS)
- dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ conn_err(tconn, "sock_recvmsg returned %d\n", rv);
} else if (rv == 0) {
- if (drbd_test_flag(mdev, DISCONNECT_SENT)) {
- long t; /* time_left */
- t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED,
- mdev->net_conf->ping_timeo * HZ/10);
+ if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(tconn->ping_wait, tconn->cstate < C_WF_REPORT_PARAMS, t);
+
if (t)
goto out;
}
- dev_info(DEV, "sock was shut down by peer\n");
+ conn_info(tconn, "sock was shut down by peer\n");
}
if (rv != size)
- drbd_force_state(mdev, NS(conn, C_BROKEN_PIPE));
+ conn_request_state(tconn, NS(conn, C_BROKEN_PIPE), CS_HARD);
out:
return rv;
}
+static int drbd_recv_all(struct drbd_tconn *tconn, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv(tconn, buf, size);
+ if (err != size) {
+ if (err >= 0)
+ err = -EIO;
+ } else
+ err = 0;
+ return err;
+}
+
+static int drbd_recv_all_warn(struct drbd_tconn *tconn, void *buf, size_t size)
+{
+ int err;
+
+ err = drbd_recv_all(tconn, buf, size);
+ if (err && !signal_pending(current))
+ conn_warn(tconn, "short read (expected size %d)\n", (int)size);
+ return err;
+}
+
/* quoting tcp(7):
* On individual connections, the socket buffer size must be set prior to the
* listen(2) or connect(2) calls in order to have it take effect.
@@ -561,29 +563,50 @@
}
}
-static struct socket *drbd_try_connect(struct drbd_conf *mdev)
+static struct socket *drbd_try_connect(struct drbd_tconn *tconn)
{
const char *what;
struct socket *sock;
struct sockaddr_in6 src_in6;
- int err;
+ struct sockaddr_in6 peer_in6;
+ struct net_conf *nc;
+ int err, peer_addr_len, my_addr_len;
+ int sndbuf_size, rcvbuf_size, connect_int;
int disconnect_on_error = 1;
- if (!get_net_conf(mdev))
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
return NULL;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, tconn->my_addr_len, sizeof(src_in6));
+ memcpy(&src_in6, &tconn->my_addr, my_addr_len);
+
+ if (((struct sockaddr *)&tconn->my_addr)->sa_family == AF_INET6)
+ src_in6.sin6_port = 0;
+ else
+ ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
+
+ peer_addr_len = min_t(int, tconn->peer_addr_len, sizeof(src_in6));
+ memcpy(&peer_in6, &tconn->peer_addr, peer_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &sock);
+ err = sock_create_kern(((struct sockaddr *)&src_in6)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &sock);
if (err < 0) {
sock = NULL;
goto out;
}
sock->sk->sk_rcvtimeo =
- sock->sk->sk_sndtimeo = mdev->net_conf->try_connect_int*HZ;
- drbd_setbufsize(sock, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ sock->sk->sk_sndtimeo = connect_int * HZ;
+ drbd_setbufsize(sock, sndbuf_size, rcvbuf_size);
/* explicitly bind to the configured IP as source IP
* for the outgoing connections.
@@ -592,17 +615,8 @@
* Make sure to use 0 as port number, so linux selects
* a free one dynamically.
*/
- memcpy(&src_in6, mdev->net_conf->my_addr,
- min_t(int, mdev->net_conf->my_addr_len, sizeof(src_in6)));
- if (((struct sockaddr *)mdev->net_conf->my_addr)->sa_family == AF_INET6)
- src_in6.sin6_port = 0;
- else
- ((struct sockaddr_in *)&src_in6)->sin_port = 0; /* AF_INET & AF_SCI */
-
what = "bind before connect";
- err = sock->ops->bind(sock,
- (struct sockaddr *) &src_in6,
- mdev->net_conf->my_addr_len);
+ err = sock->ops->bind(sock, (struct sockaddr *) &src_in6, my_addr_len);
if (err < 0)
goto out;
@@ -610,9 +624,7 @@
* stay C_WF_CONNECTION, don't go Disconnecting! */
disconnect_on_error = 0;
what = "connect";
- err = sock->ops->connect(sock,
- (struct sockaddr *)mdev->net_conf->peer_addr,
- mdev->net_conf->peer_addr_len, 0);
+ err = sock->ops->connect(sock, (struct sockaddr *) &peer_in6, peer_addr_len, 0);
out:
if (err < 0) {
@@ -630,91 +642,174 @@
disconnect_on_error = 0;
break;
default:
- dev_err(DEV, "%s failed, err = %d\n", what, err);
+ conn_err(tconn, "%s failed, err = %d\n", what, err);
}
if (disconnect_on_error)
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
- put_net_conf(mdev);
+
return sock;
}
-static struct socket *drbd_wait_for_connect(struct drbd_conf *mdev)
+struct accept_wait_data {
+ struct drbd_tconn *tconn;
+ struct socket *s_listen;
+ struct completion door_bell;
+ void (*original_sk_state_change)(struct sock *sk);
+
+};
+
+static void drbd_incoming_connection(struct sock *sk)
{
- int timeo, err;
- struct socket *s_estab = NULL, *s_listen;
+ struct accept_wait_data *ad = sk->sk_user_data;
+ void (*state_change)(struct sock *sk);
+
+ state_change = ad->original_sk_state_change;
+ if (sk->sk_state == TCP_ESTABLISHED)
+ complete(&ad->door_bell);
+ state_change(sk);
+}
+
+static int prepare_listen_socket(struct drbd_tconn *tconn, struct accept_wait_data *ad)
+{
+ int err, sndbuf_size, rcvbuf_size, my_addr_len;
+ struct sockaddr_in6 my_addr;
+ struct socket *s_listen;
+ struct net_conf *nc;
const char *what;
- if (!get_net_conf(mdev))
- return NULL;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return -EIO;
+ }
+ sndbuf_size = nc->sndbuf_size;
+ rcvbuf_size = nc->rcvbuf_size;
+ rcu_read_unlock();
+
+ my_addr_len = min_t(int, tconn->my_addr_len, sizeof(struct sockaddr_in6));
+ memcpy(&my_addr, &tconn->my_addr, my_addr_len);
what = "sock_create_kern";
- err = sock_create_kern(((struct sockaddr *)mdev->net_conf->my_addr)->sa_family,
- SOCK_STREAM, IPPROTO_TCP, &s_listen);
+ err = sock_create_kern(((struct sockaddr *)&my_addr)->sa_family,
+ SOCK_STREAM, IPPROTO_TCP, &s_listen);
if (err) {
s_listen = NULL;
goto out;
}
- timeo = mdev->net_conf->try_connect_int * HZ;
- timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
-
- s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- s_listen->sk->sk_rcvtimeo = timeo;
- s_listen->sk->sk_sndtimeo = timeo;
- drbd_setbufsize(s_listen, mdev->net_conf->sndbuf_size,
- mdev->net_conf->rcvbuf_size);
+ s_listen->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ drbd_setbufsize(s_listen, sndbuf_size, rcvbuf_size);
what = "bind before listen";
- err = s_listen->ops->bind(s_listen,
- (struct sockaddr *) mdev->net_conf->my_addr,
- mdev->net_conf->my_addr_len);
+ err = s_listen->ops->bind(s_listen, (struct sockaddr *)&my_addr, my_addr_len);
if (err < 0)
goto out;
- err = drbd_accept(mdev, &what, s_listen, &s_estab);
+ ad->s_listen = s_listen;
+ write_lock_bh(&s_listen->sk->sk_callback_lock);
+ ad->original_sk_state_change = s_listen->sk->sk_state_change;
+ s_listen->sk->sk_state_change = drbd_incoming_connection;
+ s_listen->sk->sk_user_data = ad;
+ write_unlock_bh(&s_listen->sk->sk_callback_lock);
+ what = "listen";
+ err = s_listen->ops->listen(s_listen, 5);
+ if (err < 0)
+ goto out;
+
+ return 0;
out:
if (s_listen)
sock_release(s_listen);
if (err < 0) {
if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
- dev_err(DEV, "%s failed, err = %d\n", what, err);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_err(tconn, "%s failed, err = %d\n", what, err);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
}
- put_net_conf(mdev);
+
+ return -EIO;
+}
+
+static void unregister_state_change(struct sock *sk, struct accept_wait_data *ad)
+{
+ write_lock_bh(&sk->sk_callback_lock);
+ sk->sk_state_change = ad->original_sk_state_change;
+ sk->sk_user_data = NULL;
+ write_unlock_bh(&sk->sk_callback_lock);
+}
+
+static struct socket *drbd_wait_for_connect(struct drbd_tconn *tconn, struct accept_wait_data *ad)
+{
+ int timeo, connect_int, err = 0;
+ struct socket *s_estab = NULL;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (!nc) {
+ rcu_read_unlock();
+ return NULL;
+ }
+ connect_int = nc->connect_int;
+ rcu_read_unlock();
+
+ timeo = connect_int * HZ;
+ timeo += (random32() & 1) ? timeo / 7 : -timeo / 7; /* 28.5% random jitter */
+
+ err = wait_for_completion_interruptible_timeout(&ad->door_bell, timeo);
+ if (err <= 0)
+ return NULL;
+
+ err = kernel_accept(ad->s_listen, &s_estab, 0);
+ if (err < 0) {
+ if (err != -EAGAIN && err != -EINTR && err != -ERESTARTSYS) {
+ conn_err(tconn, "accept failed, err = %d\n", err);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ }
+ }
+
+ if (s_estab)
+ unregister_state_change(s_estab->sk, ad);
return s_estab;
}
-static int drbd_send_fp(struct drbd_conf *mdev,
- struct socket *sock, enum drbd_packets cmd)
-{
- struct p_header80 *h = &mdev->data.sbuf.header.h80;
+static int decode_header(struct drbd_tconn *, void *, struct packet_info *);
- return _drbd_send_cmd(mdev, sock, cmd, h, sizeof(*h), 0);
+static int send_first_packet(struct drbd_tconn *tconn, struct drbd_socket *sock,
+ enum drbd_packet cmd)
+{
+ if (!conn_prepare_command(tconn, sock))
+ return -EIO;
+ return conn_send_command(tconn, sock, cmd, 0, NULL, 0);
}
-static enum drbd_packets drbd_recv_fp(struct drbd_conf *mdev, struct socket *sock)
+static int receive_first_packet(struct drbd_tconn *tconn, struct socket *sock)
{
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
- int rr;
+ unsigned int header_size = drbd_header_size(tconn);
+ struct packet_info pi;
+ int err;
- rr = drbd_recv_short(mdev, sock, h, sizeof(*h), 0);
-
- if (rr == sizeof(*h) && h->magic == BE_DRBD_MAGIC)
- return be16_to_cpu(h->command);
-
- return 0xffff;
+ err = drbd_recv_short(sock, tconn->data.rbuf, header_size, 0);
+ if (err != header_size) {
+ if (err >= 0)
+ err = -EIO;
+ return err;
+ }
+ err = decode_header(tconn, tconn->data.rbuf, &pi);
+ if (err)
+ return err;
+ return pi.cmd;
}
/**
* drbd_socket_okay() - Free the socket if its connection is not okay
- * @mdev: DRBD device.
* @sock: pointer to the pointer to the socket.
*/
-static int drbd_socket_okay(struct drbd_conf *mdev, struct socket **sock)
+static int drbd_socket_okay(struct socket **sock)
{
int rr;
char tb[4];
@@ -722,7 +817,7 @@
if (!*sock)
return false;
- rr = drbd_recv_short(mdev, *sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
+ rr = drbd_recv_short(*sock, tb, 4, MSG_DONTWAIT | MSG_PEEK);
if (rr > 0 || rr == -EAGAIN) {
return true;
@@ -732,6 +827,31 @@
return false;
}
}
+/* Gets called if a connection is established, or if a new minor gets created
+ in a connection */
+int drbd_connected(struct drbd_conf *mdev)
+{
+ int err;
+
+ atomic_set(&mdev->packet_seq, 0);
+ mdev->peer_seq = 0;
+
+ mdev->state_mutex = mdev->tconn->agreed_pro_version < 100 ?
+ &mdev->tconn->cstate_mutex :
+ &mdev->own_state_mutex;
+
+ err = drbd_send_sync_param(mdev);
+ if (!err)
+ err = drbd_send_sizes(mdev, 0, 0);
+ if (!err)
+ err = drbd_send_uuids(mdev);
+ if (!err)
+ err = drbd_send_current_state(mdev);
+ clear_bit(USE_DEGR_WFC_T, &mdev->flags);
+ clear_bit(RESIZE_PENDING, &mdev->flags);
+ mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+ return err;
+}
/*
* return values:
@@ -741,232 +861,305 @@
* no point in trying again, please go standalone.
* -2 We do not have a network config...
*/
-static int drbd_connect(struct drbd_conf *mdev)
+static int conn_connect(struct drbd_tconn *tconn)
{
- struct socket *s, *sock, *msock;
- int try, h, ok;
+ struct drbd_socket sock, msock;
+ struct drbd_conf *mdev;
+ struct net_conf *nc;
+ int vnr, timeout, h, ok;
+ bool discard_my_data;
enum drbd_state_rv rv;
+ struct accept_wait_data ad = {
+ .tconn = tconn,
+ .door_bell = COMPLETION_INITIALIZER_ONSTACK(ad.door_bell),
+ };
- D_ASSERT(!mdev->data.socket);
-
- drbd_clear_flag(mdev, DISCONNECT_SENT);
- if (drbd_request_state(mdev, NS(conn, C_WF_CONNECTION)) < SS_SUCCESS)
+ clear_bit(DISCONNECT_SENT, &tconn->flags);
+ if (conn_request_state(tconn, NS(conn, C_WF_CONNECTION), CS_VERBOSE) < SS_SUCCESS)
return -2;
- sock = NULL;
- msock = NULL;
+ mutex_init(&sock.mutex);
+ sock.sbuf = tconn->data.sbuf;
+ sock.rbuf = tconn->data.rbuf;
+ sock.socket = NULL;
+ mutex_init(&msock.mutex);
+ msock.sbuf = tconn->meta.sbuf;
+ msock.rbuf = tconn->meta.rbuf;
+ msock.socket = NULL;
+
+ /* Assume that the peer only understands protocol 80 until we know better. */
+ tconn->agreed_pro_version = 80;
+
+ if (prepare_listen_socket(tconn, &ad))
+ return 0;
do {
- for (try = 0;;) {
- /* 3 tries, this should take less than a second! */
- s = drbd_try_connect(mdev);
- if (s || ++try >= 3)
- break;
- /* give the other side time to call bind() & listen() */
- schedule_timeout_interruptible(HZ / 10);
- }
+ struct socket *s;
+ s = drbd_try_connect(tconn);
if (s) {
- if (!sock) {
- drbd_send_fp(mdev, s, P_HAND_SHAKE_S);
- sock = s;
- s = NULL;
- } else if (!msock) {
- drbd_clear_flag(mdev, DISCARD_CONCURRENT);
- drbd_send_fp(mdev, s, P_HAND_SHAKE_M);
- msock = s;
- s = NULL;
+ if (!sock.socket) {
+ sock.socket = s;
+ send_first_packet(tconn, &sock, P_INITIAL_DATA);
+ } else if (!msock.socket) {
+ clear_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ msock.socket = s;
+ send_first_packet(tconn, &msock, P_INITIAL_META);
} else {
- dev_err(DEV, "Logic error in drbd_connect()\n");
+ conn_err(tconn, "Logic error in conn_connect()\n");
goto out_release_sockets;
}
}
- if (sock && msock) {
- schedule_timeout_interruptible(mdev->net_conf->ping_timeo*HZ/10);
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
+ if (sock.socket && msock.socket) {
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ timeout = nc->ping_timeo * HZ / 10;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeout);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
if (ok)
break;
}
retry:
- s = drbd_wait_for_connect(mdev);
+ s = drbd_wait_for_connect(tconn, &ad);
if (s) {
- try = drbd_recv_fp(mdev, s);
- drbd_socket_okay(mdev, &sock);
- drbd_socket_okay(mdev, &msock);
- switch (try) {
- case P_HAND_SHAKE_S:
- if (sock) {
- dev_warn(DEV, "initial packet S crossed\n");
- sock_release(sock);
+ int fp = receive_first_packet(tconn, s);
+ drbd_socket_okay(&sock.socket);
+ drbd_socket_okay(&msock.socket);
+ switch (fp) {
+ case P_INITIAL_DATA:
+ if (sock.socket) {
+ conn_warn(tconn, "initial packet S crossed\n");
+ sock_release(sock.socket);
+ sock.socket = s;
+ goto randomize;
}
- sock = s;
+ sock.socket = s;
break;
- case P_HAND_SHAKE_M:
- if (msock) {
- dev_warn(DEV, "initial packet M crossed\n");
- sock_release(msock);
+ case P_INITIAL_META:
+ set_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ if (msock.socket) {
+ conn_warn(tconn, "initial packet M crossed\n");
+ sock_release(msock.socket);
+ msock.socket = s;
+ goto randomize;
}
- msock = s;
- drbd_set_flag(mdev, DISCARD_CONCURRENT);
+ msock.socket = s;
break;
default:
- dev_warn(DEV, "Error receiving initial packet\n");
+ conn_warn(tconn, "Error receiving initial packet\n");
sock_release(s);
+randomize:
if (random32() & 1)
goto retry;
}
}
- if (mdev->state.conn <= C_DISCONNECTING)
+ if (tconn->cstate <= C_DISCONNECTING)
goto out_release_sockets;
if (signal_pending(current)) {
flush_signals(current);
smp_rmb();
- if (get_t_state(&mdev->receiver) == Exiting)
+ if (get_t_state(&tconn->receiver) == EXITING)
goto out_release_sockets;
}
- if (sock && msock) {
- ok = drbd_socket_okay(mdev, &sock);
- ok = drbd_socket_okay(mdev, &msock) && ok;
- if (ok)
- break;
- }
- } while (1);
+ ok = drbd_socket_okay(&sock.socket);
+ ok = drbd_socket_okay(&msock.socket) && ok;
+ } while (!ok);
- msock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
- sock->sk->sk_allocation = GFP_NOIO;
- msock->sk->sk_allocation = GFP_NOIO;
+ sock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
+ msock.socket->sk->sk_reuse = SK_CAN_REUSE; /* SO_REUSEADDR */
- sock->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
- msock->sk->sk_priority = TC_PRIO_INTERACTIVE;
+ sock.socket->sk->sk_allocation = GFP_NOIO;
+ msock.socket->sk->sk_allocation = GFP_NOIO;
+
+ sock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE_BULK;
+ msock.socket->sk->sk_priority = TC_PRIO_INTERACTIVE;
/* NOT YET ...
- * sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- * sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- * first set it to the P_HAND_SHAKE timeout,
+ * sock.socket->sk->sk_sndtimeo = tconn->net_conf->timeout*HZ/10;
+ * sock.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ * first set it to the P_CONNECTION_FEATURES timeout,
* which we set to 4x the configured ping_timeout. */
- sock->sk->sk_sndtimeo =
- sock->sk->sk_rcvtimeo = mdev->net_conf->ping_timeo*4*HZ/10;
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
- msock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- msock->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
+ sock.socket->sk->sk_sndtimeo =
+ sock.socket->sk->sk_rcvtimeo = nc->ping_timeo*4*HZ/10;
+
+ msock.socket->sk->sk_rcvtimeo = nc->ping_int*HZ;
+ timeout = nc->timeout * HZ / 10;
+ discard_my_data = nc->discard_my_data;
+ rcu_read_unlock();
+
+ msock.socket->sk->sk_sndtimeo = timeout;
/* we don't want delays.
* we use TCP_CORK where appropriate, though */
- drbd_tcp_nodelay(sock);
- drbd_tcp_nodelay(msock);
+ drbd_tcp_nodelay(sock.socket);
+ drbd_tcp_nodelay(msock.socket);
- mdev->data.socket = sock;
- mdev->meta.socket = msock;
- mdev->last_received = jiffies;
+ tconn->data.socket = sock.socket;
+ tconn->meta.socket = msock.socket;
+ tconn->last_received = jiffies;
- D_ASSERT(mdev->asender.task == NULL);
-
- h = drbd_do_handshake(mdev);
+ h = drbd_do_features(tconn);
if (h <= 0)
return h;
- if (mdev->cram_hmac_tfm) {
+ if (tconn->cram_hmac_tfm) {
/* drbd_request_state(mdev, NS(conn, WFAuth)); */
- switch (drbd_do_auth(mdev)) {
+ switch (drbd_do_auth(tconn)) {
case -1:
- dev_err(DEV, "Authentication of peer failed\n");
+ conn_err(tconn, "Authentication of peer failed\n");
return -1;
case 0:
- dev_err(DEV, "Authentication of peer failed, trying again.\n");
+ conn_err(tconn, "Authentication of peer failed, trying again.\n");
return 0;
}
}
- sock->sk->sk_sndtimeo = mdev->net_conf->timeout*HZ/10;
- sock->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
+ tconn->data.socket->sk->sk_sndtimeo = timeout;
+ tconn->data.socket->sk->sk_rcvtimeo = MAX_SCHEDULE_TIMEOUT;
- atomic_set(&mdev->packet_seq, 0);
- mdev->peer_seq = 0;
-
- if (drbd_send_protocol(mdev) == -1)
+ if (drbd_send_protocol(tconn) == -EOPNOTSUPP)
return -1;
- drbd_set_flag(mdev, STATE_SENT);
- drbd_send_sync_param(mdev, &mdev->sync_conf);
- drbd_send_sizes(mdev, 0, 0);
- drbd_send_uuids(mdev);
- drbd_send_current_state(mdev);
- drbd_clear_flag(mdev, USE_DEGR_WFC_T);
- drbd_clear_flag(mdev, RESIZE_PENDING);
- spin_lock_irq(&mdev->req_lock);
- rv = _drbd_set_state(_NS(mdev, conn, C_WF_REPORT_PARAMS), CS_VERBOSE, NULL);
- if (mdev->state.conn != C_WF_REPORT_PARAMS)
- drbd_clear_flag(mdev, STATE_SENT);
- spin_unlock_irq(&mdev->req_lock);
+ set_bit(STATE_SENT, &tconn->flags);
- if (rv < SS_SUCCESS)
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+
+ if (discard_my_data)
+ set_bit(DISCARD_MY_DATA, &mdev->flags);
+ else
+ clear_bit(DISCARD_MY_DATA, &mdev->flags);
+
+ drbd_connected(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ rv = conn_request_state(tconn, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE);
+ if (rv < SS_SUCCESS) {
+ clear_bit(STATE_SENT, &tconn->flags);
return 0;
+ }
- drbd_thread_start(&mdev->asender);
- mod_timer(&mdev->request_timer, jiffies + HZ); /* just start it here. */
+ drbd_thread_start(&tconn->asender);
- return 1;
+ mutex_lock(&tconn->conf_update);
+ /* The discard_my_data flag is a single-shot modifier to the next
+ * connection attempt, the handshake of which is now well underway.
+ * No need for rcu style copying of the whole struct
+ * just to clear a single value. */
+ tconn->net_conf->discard_my_data = 0;
+ mutex_unlock(&tconn->conf_update);
+
+ return h;
out_release_sockets:
- if (sock)
- sock_release(sock);
- if (msock)
- sock_release(msock);
+ if (ad.s_listen)
+ sock_release(ad.s_listen);
+ if (sock.socket)
+ sock_release(sock.socket);
+ if (msock.socket)
+ sock_release(msock.socket);
return -1;
}
-static int drbd_recv_header(struct drbd_conf *mdev, enum drbd_packets *cmd, unsigned int *packet_size)
+static int decode_header(struct drbd_tconn *tconn, void *header, struct packet_info *pi)
{
- union p_header *h = &mdev->data.rbuf.header;
- int r;
+ unsigned int header_size = drbd_header_size(tconn);
- r = drbd_recv(mdev, h, sizeof(*h));
- if (unlikely(r != sizeof(*h))) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read expecting header on sock: r=%d\n", r);
- return false;
- }
-
- if (likely(h->h80.magic == BE_DRBD_MAGIC)) {
- *cmd = be16_to_cpu(h->h80.command);
- *packet_size = be16_to_cpu(h->h80.length);
- } else if (h->h95.magic == BE_DRBD_MAGIC_BIG) {
- *cmd = be16_to_cpu(h->h95.command);
- *packet_size = be32_to_cpu(h->h95.length);
+ if (header_size == sizeof(struct p_header100) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC_100)) {
+ struct p_header100 *h = header;
+ if (h->pad != 0) {
+ conn_err(tconn, "Header padding is not zero\n");
+ return -EINVAL;
+ }
+ pi->vnr = be16_to_cpu(h->volume);
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ } else if (header_size == sizeof(struct p_header95) &&
+ *(__be16 *)header == cpu_to_be16(DRBD_MAGIC_BIG)) {
+ struct p_header95 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be32_to_cpu(h->length);
+ pi->vnr = 0;
+ } else if (header_size == sizeof(struct p_header80) &&
+ *(__be32 *)header == cpu_to_be32(DRBD_MAGIC)) {
+ struct p_header80 *h = header;
+ pi->cmd = be16_to_cpu(h->command);
+ pi->size = be16_to_cpu(h->length);
+ pi->vnr = 0;
} else {
- dev_err(DEV, "magic?? on data m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->h80.magic),
- be16_to_cpu(h->h80.command),
- be16_to_cpu(h->h80.length));
- return false;
+ conn_err(tconn, "Wrong magic value 0x%08x in protocol version %d\n",
+ be32_to_cpu(*(__be32 *)header),
+ tconn->agreed_pro_version);
+ return -EINVAL;
}
- mdev->last_received = jiffies;
-
- return true;
+ pi->data = header + header_size;
+ return 0;
}
-static void drbd_flush(struct drbd_conf *mdev)
+static int drbd_recv_header(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ void *buffer = tconn->data.rbuf;
+ int err;
+
+ err = drbd_recv_all_warn(tconn, buffer, drbd_header_size(tconn));
+ if (err)
+ return err;
+
+ err = decode_header(tconn, buffer, pi);
+ tconn->last_received = jiffies;
+
+ return err;
+}
+
+static void drbd_flush(struct drbd_tconn *tconn)
{
int rv;
+ struct drbd_conf *mdev;
+ int vnr;
- if (mdev->write_ordering >= WO_bdev_flush && get_ldev(mdev)) {
- rv = blkdev_issue_flush(mdev->ldev->backing_bdev, GFP_NOIO,
- NULL);
- if (rv) {
- dev_info(DEV, "local disk flush failed with status %d\n", rv);
- /* would rather check on EOPNOTSUPP, but that is not reliable.
- * don't try again for ANY return value != 0
- * if (rv == -EOPNOTSUPP) */
- drbd_bump_write_ordering(mdev, WO_drain_io);
+ if (tconn->write_ordering >= WO_bdev_flush) {
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (!get_ldev(mdev))
+ continue;
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+
+ rv = blkdev_issue_flush(mdev->ldev->backing_bdev,
+ GFP_NOIO, NULL);
+ if (rv) {
+ dev_info(DEV, "local disk flush failed with status %d\n", rv);
+ /* would rather check on EOPNOTSUPP, but that is not reliable.
+ * don't try again for ANY return value != 0
+ * if (rv == -EOPNOTSUPP) */
+ drbd_bump_write_ordering(tconn, WO_drain_io);
+ }
+ put_ldev(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+
+ rcu_read_lock();
+ if (rv)
+ break;
}
- put_ldev(mdev);
+ rcu_read_unlock();
}
}
@@ -976,7 +1169,7 @@
* @epoch: Epoch object.
* @ev: Epoch event.
*/
-static enum finish_epoch drbd_may_finish_epoch(struct drbd_conf *mdev,
+static enum finish_epoch drbd_may_finish_epoch(struct drbd_tconn *tconn,
struct drbd_epoch *epoch,
enum epoch_event ev)
{
@@ -984,7 +1177,7 @@
struct drbd_epoch *next_epoch;
enum finish_epoch rv = FE_STILL_LIVE;
- spin_lock(&mdev->epoch_lock);
+ spin_lock(&tconn->epoch_lock);
do {
next_epoch = NULL;
@@ -1006,18 +1199,22 @@
atomic_read(&epoch->active) == 0 &&
(test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags) || ev & EV_CLEANUP)) {
if (!(ev & EV_CLEANUP)) {
- spin_unlock(&mdev->epoch_lock);
- drbd_send_b_ack(mdev, epoch->barrier_nr, epoch_size);
- spin_lock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
+ drbd_send_b_ack(epoch->tconn, epoch->barrier_nr, epoch_size);
+ spin_lock(&tconn->epoch_lock);
}
+#if 0
+ /* FIXME: dec unacked on connection, once we have
+ * something to count pending connection packets in. */
if (test_bit(DE_HAVE_BARRIER_NUMBER, &epoch->flags))
- dec_unacked(mdev);
+ dec_unacked(epoch->tconn);
+#endif
- if (mdev->current_epoch != epoch) {
+ if (tconn->current_epoch != epoch) {
next_epoch = list_entry(epoch->list.next, struct drbd_epoch, list);
list_del(&epoch->list);
ev = EV_BECAME_LAST | (ev & EV_CLEANUP);
- mdev->epochs--;
+ tconn->epochs--;
kfree(epoch);
if (rv == FE_STILL_LIVE)
@@ -1028,7 +1225,6 @@
/* atomic_set(&epoch->active, 0); is already zero */
if (rv == FE_STILL_LIVE)
rv = FE_RECYCLED;
- wake_up(&mdev->ee_wait);
}
}
@@ -1038,40 +1234,52 @@
epoch = next_epoch;
} while (1);
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
return rv;
}
/**
* drbd_bump_write_ordering() - Fall back to an other write ordering method
- * @mdev: DRBD device.
+ * @tconn: DRBD connection.
* @wo: Write ordering method to try.
*/
-void drbd_bump_write_ordering(struct drbd_conf *mdev, enum write_ordering_e wo) __must_hold(local)
+void drbd_bump_write_ordering(struct drbd_tconn *tconn, enum write_ordering_e wo)
{
+ struct disk_conf *dc;
+ struct drbd_conf *mdev;
enum write_ordering_e pwo;
+ int vnr;
static char *write_ordering_str[] = {
[WO_none] = "none",
[WO_drain_io] = "drain",
[WO_bdev_flush] = "flush",
};
- pwo = mdev->write_ordering;
+ pwo = tconn->write_ordering;
wo = min(pwo, wo);
- if (wo == WO_bdev_flush && mdev->ldev->dc.no_disk_flush)
- wo = WO_drain_io;
- if (wo == WO_drain_io && mdev->ldev->dc.no_disk_drain)
- wo = WO_none;
- mdev->write_ordering = wo;
- if (pwo != mdev->write_ordering || wo == WO_bdev_flush)
- dev_info(DEV, "Method to ensure write ordering: %s\n", write_ordering_str[mdev->write_ordering]);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (!get_ldev_if_state(mdev, D_ATTACHING))
+ continue;
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+
+ if (wo == WO_bdev_flush && !dc->disk_flushes)
+ wo = WO_drain_io;
+ if (wo == WO_drain_io && !dc->disk_drain)
+ wo = WO_none;
+ put_ldev(mdev);
+ }
+ rcu_read_unlock();
+ tconn->write_ordering = wo;
+ if (pwo != tconn->write_ordering || wo == WO_bdev_flush)
+ conn_info(tconn, "Method to ensure write ordering: %s\n", write_ordering_str[tconn->write_ordering]);
}
/**
- * drbd_submit_ee()
+ * drbd_submit_peer_request()
* @mdev: DRBD device.
- * @e: epoch entry
+ * @peer_req: peer request
* @rw: flag field, see bio->bi_rw
*
* May spread the pages to multiple bios,
@@ -1085,14 +1293,15 @@
* on certain Xen deployments.
*/
/* TODO allocate from our own bio_set. */
-int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
- const unsigned rw, const int fault_type)
+int drbd_submit_peer_request(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req,
+ const unsigned rw, const int fault_type)
{
struct bio *bios = NULL;
struct bio *bio;
- struct page *page = e->pages;
- sector_t sector = e->sector;
- unsigned ds = e->size;
+ struct page *page = peer_req->pages;
+ sector_t sector = peer_req->i.sector;
+ unsigned ds = peer_req->i.size;
unsigned n_bios = 0;
unsigned nr_pages = (ds + PAGE_SIZE -1) >> PAGE_SHIFT;
int err = -ENOMEM;
@@ -1111,12 +1320,12 @@
dev_err(DEV, "submit_ee: Allocation of a bio failed\n");
goto fail;
}
- /* > e->sector, unless this is the first bio */
+ /* > peer_req->i.sector, unless this is the first bio */
bio->bi_sector = sector;
bio->bi_bdev = mdev->ldev->backing_bdev;
bio->bi_rw = rw;
- bio->bi_private = e;
- bio->bi_end_io = drbd_endio_sec;
+ bio->bi_private = peer_req;
+ bio->bi_end_io = drbd_peer_request_endio;
bio->bi_next = bios;
bios = bio;
@@ -1145,7 +1354,7 @@
D_ASSERT(page == NULL);
D_ASSERT(ds == 0);
- atomic_set(&e->pending_bios, n_bios);
+ atomic_set(&peer_req->pending_bios, n_bios);
do {
bio = bios;
bios = bios->bi_next;
@@ -1164,26 +1373,57 @@
return err;
}
-static int receive_Barrier(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static void drbd_remove_epoch_entry_interval(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_interval *i = &peer_req->i;
+
+ drbd_remove_interval(&mdev->write_requests, i);
+ drbd_clear_interval(i);
+
+ /* Wake up any processes waiting for this peer request to complete. */
+ if (i->waiting)
+ wake_up(&mdev->misc_wait);
+}
+
+void conn_wait_active_ee_empty(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+}
+
+static int receive_Barrier(struct drbd_tconn *tconn, struct packet_info *pi)
{
int rv;
- struct p_barrier *p = &mdev->data.rbuf.barrier;
+ struct p_barrier *p = pi->data;
struct drbd_epoch *epoch;
- inc_unacked(mdev);
-
- mdev->current_epoch->barrier_nr = p->barrier;
- rv = drbd_may_finish_epoch(mdev, mdev->current_epoch, EV_GOT_BARRIER_NR);
+ /* FIXME these are unacked on connection,
+ * not a specific (peer)device.
+ */
+ tconn->current_epoch->barrier_nr = p->barrier;
+ tconn->current_epoch->tconn = tconn;
+ rv = drbd_may_finish_epoch(tconn, tconn->current_epoch, EV_GOT_BARRIER_NR);
/* P_BARRIER_ACK may imply that the corresponding extent is dropped from
* the activity log, which means it would not be resynced in case the
* R_PRIMARY crashes now.
* Therefore we must send the barrier_ack after the barrier request was
* completed. */
- switch (mdev->write_ordering) {
+ switch (tconn->write_ordering) {
case WO_none:
if (rv == FE_RECYCLED)
- return true;
+ return 0;
/* receiver context, in the writeout path of the other node.
* avoid potential distributed deadlock */
@@ -1191,81 +1431,75 @@
if (epoch)
break;
else
- dev_warn(DEV, "Allocation of an epoch failed, slowing down\n");
+ conn_warn(tconn, "Allocation of an epoch failed, slowing down\n");
/* Fall through */
case WO_bdev_flush:
case WO_drain_io:
- drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
- drbd_flush(mdev);
+ conn_wait_active_ee_empty(tconn);
+ drbd_flush(tconn);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
+ if (atomic_read(&tconn->current_epoch->epoch_size)) {
epoch = kmalloc(sizeof(struct drbd_epoch), GFP_NOIO);
if (epoch)
break;
}
- epoch = mdev->current_epoch;
- wait_event(mdev->ee_wait, atomic_read(&epoch->epoch_size) == 0);
-
- D_ASSERT(atomic_read(&epoch->active) == 0);
- D_ASSERT(epoch->flags == 0);
-
- return true;
+ return 0;
default:
- dev_err(DEV, "Strangeness in mdev->write_ordering %d\n", mdev->write_ordering);
- return false;
+ conn_err(tconn, "Strangeness in tconn->write_ordering %d\n", tconn->write_ordering);
+ return -EIO;
}
epoch->flags = 0;
atomic_set(&epoch->epoch_size, 0);
atomic_set(&epoch->active, 0);
- spin_lock(&mdev->epoch_lock);
- if (atomic_read(&mdev->current_epoch->epoch_size)) {
- list_add(&epoch->list, &mdev->current_epoch->list);
- mdev->current_epoch = epoch;
- mdev->epochs++;
+ spin_lock(&tconn->epoch_lock);
+ if (atomic_read(&tconn->current_epoch->epoch_size)) {
+ list_add(&epoch->list, &tconn->current_epoch->list);
+ tconn->current_epoch = epoch;
+ tconn->epochs++;
} else {
/* The current_epoch got recycled while we allocated this one... */
kfree(epoch);
}
- spin_unlock(&mdev->epoch_lock);
+ spin_unlock(&tconn->epoch_lock);
- return true;
+ return 0;
}
/* used from receive_RSDataReply (recv_resync_read)
* and from receive_Data */
-static struct drbd_epoch_entry *
-read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector, int data_size) __must_hold(local)
+static struct drbd_peer_request *
+read_in_block(struct drbd_conf *mdev, u64 id, sector_t sector,
+ int data_size) __must_hold(local)
{
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
struct page *page;
- int dgs, ds, rr;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
+ int dgs, ds, err;
+ void *dig_in = mdev->tconn->int_dig_in;
+ void *dig_vv = mdev->tconn->int_dig_vv;
unsigned long *data;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data digest: read %d expected %d\n",
- rr, dgs);
+ dgs = 0;
+ if (mdev->tconn->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ /*
+ * FIXME: Receive the incoming digest into the receive buffer
+ * here, together with its struct p_data?
+ */
+ err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+ if (err)
return NULL;
- }
+ data_size -= dgs;
}
- data_size -= dgs;
-
- ERR_IF(data_size & 0x1ff) return NULL;
- ERR_IF(data_size > DRBD_MAX_BIO_SIZE) return NULL;
+ if (!expect(IS_ALIGNED(data_size, 512)))
+ return NULL;
+ if (!expect(data_size <= DRBD_MAX_BIO_SIZE))
+ return NULL;
/* even though we trust out peer,
* we sometimes have to double check. */
@@ -1280,47 +1514,42 @@
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, id, sector, data_size, GFP_NOIO);
- if (!e)
+ peer_req = drbd_alloc_peer_req(mdev, id, sector, data_size, GFP_NOIO);
+ if (!peer_req)
return NULL;
if (!data_size)
- return e;
+ return peer_req;
ds = data_size;
- page = e->pages;
+ page = peer_req->pages;
page_chain_for_each(page) {
unsigned len = min_t(int, ds, PAGE_SIZE);
data = kmap(page);
- rr = drbd_recv(mdev, data, len);
+ err = drbd_recv_all_warn(mdev->tconn, data, len);
if (drbd_insert_fault(mdev, DRBD_FAULT_RECEIVE)) {
dev_err(DEV, "Fault injection: Corrupting data on receive\n");
data[0] = data[0] ^ (unsigned long)-1;
}
kunmap(page);
- if (rr != len) {
- drbd_free_ee(mdev, e);
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving data: read %d expected %d\n",
- rr, len);
+ if (err) {
+ drbd_free_peer_req(mdev, peer_req);
return NULL;
}
- ds -= rr;
+ ds -= len;
}
if (dgs) {
- drbd_csum_ee(mdev, mdev->integrity_r_tfm, e, dig_vv);
+ drbd_csum_ee(mdev, mdev->tconn->peer_integrity_tfm, peer_req, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED: %llus +%u\n",
(unsigned long long)sector, data_size);
- drbd_bcast_ee(mdev, "digest failed",
- dgs, dig_in, dig_vv, e);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
return NULL;
}
}
mdev->recv_cnt += data_size>>9;
- return e;
+ return peer_req;
}
/* drbd_drain_block() just takes a data block
@@ -1329,30 +1558,26 @@
static int drbd_drain_block(struct drbd_conf *mdev, int data_size)
{
struct page *page;
- int rr, rv = 1;
+ int err = 0;
void *data;
if (!data_size)
- return true;
+ return 0;
- page = drbd_pp_alloc(mdev, 1, 1);
+ page = drbd_alloc_pages(mdev, 1, 1);
data = kmap(page);
while (data_size) {
- rr = drbd_recv(mdev, data, min_t(int, data_size, PAGE_SIZE));
- if (rr != min_t(int, data_size, PAGE_SIZE)) {
- rv = 0;
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data: read %d expected %d\n",
- rr, min_t(int, data_size, PAGE_SIZE));
+ unsigned int len = min_t(int, data_size, PAGE_SIZE);
+
+ err = drbd_recv_all_warn(mdev->tconn, data, len);
+ if (err)
break;
- }
- data_size -= rr;
+ data_size -= len;
}
kunmap(page);
- drbd_pp_free(mdev, page, 0);
- return rv;
+ drbd_free_pages(mdev, page, 0);
+ return err;
}
static int recv_dless_read(struct drbd_conf *mdev, struct drbd_request *req,
@@ -1360,26 +1585,19 @@
{
struct bio_vec *bvec;
struct bio *bio;
- int dgs, rr, i, expect;
- void *dig_in = mdev->int_dig_in;
- void *dig_vv = mdev->int_dig_vv;
+ int dgs, err, i, expect;
+ void *dig_in = mdev->tconn->int_dig_in;
+ void *dig_vv = mdev->tconn->int_dig_vv;
- dgs = (mdev->agreed_pro_version >= 87 && mdev->integrity_r_tfm) ?
- crypto_hash_digestsize(mdev->integrity_r_tfm) : 0;
-
- if (dgs) {
- rr = drbd_recv(mdev, dig_in, dgs);
- if (rr != dgs) {
- if (!signal_pending(current))
- dev_warn(DEV,
- "short read receiving data reply digest: read %d expected %d\n",
- rr, dgs);
- return 0;
- }
+ dgs = 0;
+ if (mdev->tconn->peer_integrity_tfm) {
+ dgs = crypto_hash_digestsize(mdev->tconn->peer_integrity_tfm);
+ err = drbd_recv_all_warn(mdev->tconn, dig_in, dgs);
+ if (err)
+ return err;
+ data_size -= dgs;
}
- data_size -= dgs;
-
/* optimistically update recv_cnt. if receiving fails below,
* we disconnect anyways, and counters will be reset. */
mdev->recv_cnt += data_size>>9;
@@ -1388,63 +1606,61 @@
D_ASSERT(sector == bio->bi_sector);
bio_for_each_segment(bvec, bio, i) {
+ void *mapped = kmap(bvec->bv_page) + bvec->bv_offset;
expect = min_t(int, data_size, bvec->bv_len);
- rr = drbd_recv(mdev,
- kmap(bvec->bv_page)+bvec->bv_offset,
- expect);
+ err = drbd_recv_all_warn(mdev->tconn, mapped, expect);
kunmap(bvec->bv_page);
- if (rr != expect) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving data reply: "
- "read %d expected %d\n",
- rr, expect);
- return 0;
- }
- data_size -= rr;
+ if (err)
+ return err;
+ data_size -= expect;
}
if (dgs) {
- drbd_csum_bio(mdev, mdev->integrity_r_tfm, bio, dig_vv);
+ drbd_csum_bio(mdev, mdev->tconn->peer_integrity_tfm, bio, dig_vv);
if (memcmp(dig_in, dig_vv, dgs)) {
dev_err(DEV, "Digest integrity check FAILED. Broken NICs?\n");
- return 0;
+ return -EINVAL;
}
}
D_ASSERT(data_size == 0);
- return 1;
+ return 0;
}
-/* e_end_resync_block() is called via
- * drbd_process_done_ee() by asender only */
-static int e_end_resync_block(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+/*
+ * e_end_resync_block() is called in asender context via
+ * drbd_finish_peer_reqs().
+ */
+static int e_end_resync_block(struct drbd_work *w, int unused)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok;
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ int err;
- D_ASSERT(hlist_unhashed(&e->collision));
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- drbd_set_in_sync(mdev, sector, e->size);
- ok = drbd_send_ack(mdev, P_RS_WRITE_ACK, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ drbd_set_in_sync(mdev, sector, peer_req->i.size);
+ err = drbd_send_ack(mdev, P_RS_WRITE_ACK, peer_req);
} else {
/* Record failure to sync */
- drbd_rs_failed_io(mdev, sector, e->size);
+ drbd_rs_failed_io(mdev, sector, peer_req->i.size);
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
}
dec_unacked(mdev);
- return ok;
+ return err;
}
static int recv_resync_read(struct drbd_conf *mdev, sector_t sector, int data_size) __releases(local)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
- e = read_in_block(mdev, ID_SYNCER, sector, data_size);
- if (!e)
+ peer_req = read_in_block(mdev, ID_SYNCER, sector, data_size);
+ if (!peer_req)
goto fail;
dec_rs_pending(mdev);
@@ -1453,64 +1669,88 @@
/* corresponding dec_unacked() in e_end_resync_block()
* respective _drbd_clear_done_ee */
- e->w.cb = e_end_resync_block;
+ peer_req->w.cb = e_end_resync_block;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->sync_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->sync_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
atomic_add(data_size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
- return true;
+ if (drbd_submit_peer_request(mdev, peer_req, WRITE, DRBD_FAULT_RS_WR) == 0)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
fail:
put_ldev(mdev);
- return false;
+ return -EIO;
}
-static int receive_DataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static struct drbd_request *
+find_request(struct drbd_conf *mdev, struct rb_root *root, u64 id,
+ sector_t sector, bool missing_ok, const char *func)
{
struct drbd_request *req;
+
+ /* Request object according to our peer */
+ req = (struct drbd_request *)(unsigned long)id;
+ if (drbd_contains_interval(root, sector, &req->i) && req->i.local)
+ return req;
+ if (!missing_ok) {
+ dev_err(DEV, "%s: failed to find request 0x%lx, sector %llus\n", func,
+ (unsigned long)id, (unsigned long long)sector);
+ }
+ return NULL;
+}
+
+static int receive_DataReply(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct drbd_request *req;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
sector = be64_to_cpu(p->sector);
- spin_lock_irq(&mdev->req_lock);
- req = _ar_id_to_req(mdev, p->block_id, sector);
- spin_unlock_irq(&mdev->req_lock);
- if (unlikely(!req)) {
- dev_err(DEV, "Got a corrupt block_id/sector pair(1).\n");
- return false;
- }
+ spin_lock_irq(&mdev->tconn->req_lock);
+ req = find_request(mdev, &mdev->read_requests, p->block_id, sector, false, __func__);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (unlikely(!req))
+ return -EIO;
/* hlist_del(&req->collision) is done in _req_may_be_done, to avoid
* special casing it there for the various failure cases.
* still no race with drbd_fail_pending_reads */
- ok = recv_dless_read(mdev, req, sector, data_size);
-
- if (ok)
- req_mod(req, data_received);
+ err = recv_dless_read(mdev, req, sector, pi->size);
+ if (!err)
+ req_mod(req, DATA_RECEIVED);
/* else: nothing. handled from drbd_disconnect...
* I don't think we may complete this just yet
* in case we are "on-disconnect: freeze" */
- return ok;
+ return err;
}
-static int receive_RSDataReply(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_RSDataReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
- int ok;
- struct p_data *p = &mdev->data.rbuf.data;
+ int err;
+ struct p_data *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
sector = be64_to_cpu(p->sector);
D_ASSERT(p->block_id == ID_SYNCER);
@@ -1518,42 +1758,63 @@
if (get_ldev(mdev)) {
/* data is submitted to disk within recv_resync_read.
* corresponding put_ldev done below on error,
- * or in drbd_endio_write_sec. */
- ok = recv_resync_read(mdev, sector, data_size);
+ * or in drbd_peer_request_endio. */
+ err = recv_resync_read(mdev, sector, pi->size);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not write resync data to local disk.\n");
- ok = drbd_drain_block(mdev, data_size);
+ err = drbd_drain_block(mdev, pi->size);
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
}
- atomic_add(data_size >> 9, &mdev->rs_sect_in);
+ atomic_add(pi->size >> 9, &mdev->rs_sect_in);
- return ok;
+ return err;
}
-/* e_end_block() is called via drbd_process_done_ee().
- * this means this function only runs in the asender thread
- */
-static int e_end_block(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void restart_conflicting_writes(struct drbd_conf *mdev,
+ sector_t sector, int size)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- sector_t sector = e->sector;
- int ok = 1, pcmd;
+ struct drbd_interval *i;
+ struct drbd_request *req;
- if (mdev->net_conf->wire_protocol == DRBD_PROT_C) {
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED))
+ continue;
+ /* as it is RQ_POSTPONED, this will cause it to
+ * be queued on the retry workqueue. */
+ __req_mod(req, CONFLICT_RESOLVED, NULL);
+ }
+}
+
+/*
+ * e_end_block() is called in asender context via drbd_finish_peer_reqs().
+ */
+static int e_end_block(struct drbd_work *w, int cancel)
+{
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ int err = 0, pcmd;
+
+ if (peer_req->flags & EE_SEND_WRITE_ACK) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
pcmd = (mdev->state.conn >= C_SYNC_SOURCE &&
mdev->state.conn <= C_PAUSED_SYNC_T &&
- e->flags & EE_MAY_SET_IN_SYNC) ?
+ peer_req->flags & EE_MAY_SET_IN_SYNC) ?
P_RS_WRITE_ACK : P_WRITE_ACK;
- ok &= drbd_send_ack(mdev, pcmd, e);
+ err = drbd_send_ack(mdev, pcmd, peer_req);
if (pcmd == P_RS_WRITE_ACK)
- drbd_set_in_sync(mdev, sector, e->size);
+ drbd_set_in_sync(mdev, sector, peer_req->i.size);
} else {
- ok = drbd_send_ack(mdev, P_NEG_ACK, e);
+ err = drbd_send_ack(mdev, P_NEG_ACK, peer_req);
/* we expect it to be marked out of sync anyways...
* maybe assert this? */
}
@@ -1561,52 +1822,115 @@
}
/* we delete from the conflict detection hash _after_ we sent out the
* P_WRITE_ACK / P_NEG_ACK, to get the sequence number right. */
- if (mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->collision));
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
- } else {
- D_ASSERT(hlist_unhashed(&e->collision));
- }
+ if (peer_req->flags & EE_IN_INTERVAL_TREE) {
+ spin_lock_irq(&mdev->tconn->req_lock);
+ D_ASSERT(!drbd_interval_empty(&peer_req->i));
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ if (peer_req->flags & EE_RESTART_REQUESTS)
+ restart_conflicting_writes(mdev, sector, peer_req->i.size);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ } else
+ D_ASSERT(drbd_interval_empty(&peer_req->i));
- drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
+ drbd_may_finish_epoch(mdev->tconn, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0));
- return ok;
+ return err;
}
-static int e_send_discard_ack(struct drbd_conf *mdev, struct drbd_work *w, int unused)
+static int e_send_ack(struct drbd_work *w, enum drbd_packet ack)
{
- struct drbd_epoch_entry *e = (struct drbd_epoch_entry *)w;
- int ok = 1;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_peer_request *peer_req =
+ container_of(w, struct drbd_peer_request, w);
+ int err;
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- ok = drbd_send_ack(mdev, P_DISCARD_ACK, e);
-
- spin_lock_irq(&mdev->req_lock);
- D_ASSERT(!hlist_unhashed(&e->collision));
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
-
+ err = drbd_send_ack(mdev, ack, peer_req);
dec_unacked(mdev);
- return ok;
+ return err;
}
-static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_epoch_entry *data_e)
+static int e_send_superseded(struct drbd_work *w, int unused)
{
+ return e_send_ack(w, P_SUPERSEDED);
+}
- struct drbd_epoch_entry *rs_e;
+static int e_send_retry_write(struct drbd_work *w, int unused)
+{
+ struct drbd_tconn *tconn = w->mdev->tconn;
+
+ return e_send_ack(w, tconn->agreed_pro_version >= 100 ?
+ P_RETRY_WRITE : P_SUPERSEDED);
+}
+
+static bool seq_greater(u32 a, u32 b)
+{
+ /*
+ * We assume 32-bit wrap-around here.
+ * For 24-bit wrap-around, we would have to shift:
+ * a <<= 8; b <<= 8;
+ */
+ return (s32)a - (s32)b > 0;
+}
+
+static u32 seq_max(u32 a, u32 b)
+{
+ return seq_greater(a, b) ? a : b;
+}
+
+static bool need_peer_seq(struct drbd_conf *mdev)
+{
+ struct drbd_tconn *tconn = mdev->tconn;
+ int tp;
+
+ /*
+ * We only need to keep track of the last packet_seq number of our peer
+ * if we are in dual-primary mode and we have the resolve-conflicts flag set; see
+ * handle_write_conflicts().
+ */
+
+ rcu_read_lock();
+ tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+ rcu_read_unlock();
+
+ return tp && test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+}
+
+static void update_peer_seq(struct drbd_conf *mdev, unsigned int peer_seq)
+{
+ unsigned int newest_peer_seq;
+
+ if (need_peer_seq(mdev)) {
+ spin_lock(&mdev->peer_seq_lock);
+ newest_peer_seq = seq_max(mdev->peer_seq, peer_seq);
+ mdev->peer_seq = newest_peer_seq;
+ spin_unlock(&mdev->peer_seq_lock);
+ /* wake up only if we actually changed mdev->peer_seq */
+ if (peer_seq == newest_peer_seq)
+ wake_up(&mdev->seq_wait);
+ }
+}
+
+static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
+{
+ return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
+}
+
+/* maybe change sync_ee into interval trees as well? */
+static bool overlapping_resync_write(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
+{
+ struct drbd_peer_request *rs_req;
bool rv = 0;
- spin_lock_irq(&mdev->req_lock);
- list_for_each_entry(rs_e, &mdev->sync_ee, w.list) {
- if (overlaps(data_e->sector, data_e->size, rs_e->sector, rs_e->size)) {
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_for_each_entry(rs_req, &mdev->sync_ee, w.list) {
+ if (overlaps(peer_req->i.sector, peer_req->i.size,
+ rs_req->i.sector, rs_req->i.size)) {
rv = 1;
break;
}
}
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
return rv;
}
@@ -1632,35 +1956,41 @@
*
* returns 0 if we may process the packet,
* -ERESTARTSYS if we were interrupted (by disconnect signal). */
-static int drbd_wait_peer_seq(struct drbd_conf *mdev, const u32 packet_seq)
+static int wait_for_and_update_peer_seq(struct drbd_conf *mdev, const u32 peer_seq)
{
DEFINE_WAIT(wait);
- unsigned int p_seq;
long timeout;
- int ret = 0;
+ int ret;
+
+ if (!need_peer_seq(mdev))
+ return 0;
+
spin_lock(&mdev->peer_seq_lock);
for (;;) {
- prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
- if (seq_le(packet_seq, mdev->peer_seq+1))
+ if (!seq_greater(peer_seq - 1, mdev->peer_seq)) {
+ mdev->peer_seq = seq_max(mdev->peer_seq, peer_seq);
+ ret = 0;
break;
+ }
if (signal_pending(current)) {
ret = -ERESTARTSYS;
break;
}
- p_seq = mdev->peer_seq;
+ prepare_to_wait(&mdev->seq_wait, &wait, TASK_INTERRUPTIBLE);
spin_unlock(&mdev->peer_seq_lock);
- timeout = schedule_timeout(30*HZ);
+ rcu_read_lock();
+ timeout = rcu_dereference(mdev->tconn->net_conf)->ping_timeo*HZ/10;
+ rcu_read_unlock();
+ timeout = schedule_timeout(timeout);
spin_lock(&mdev->peer_seq_lock);
- if (timeout == 0 && p_seq == mdev->peer_seq) {
+ if (!timeout) {
ret = -ETIMEDOUT;
- dev_err(DEV, "ASSERT FAILED waited 30 seconds for sequence update, forcing reconnect\n");
+ dev_err(DEV, "Timed out waiting for missing ack packets; disconnecting\n");
break;
}
}
- finish_wait(&mdev->seq_wait, &wait);
- if (mdev->peer_seq+1 == packet_seq)
- mdev->peer_seq++;
spin_unlock(&mdev->peer_seq_lock);
+ finish_wait(&mdev->seq_wait, &wait);
return ret;
}
@@ -1675,233 +2005,277 @@
(dpf & DP_DISCARD ? REQ_DISCARD : 0);
}
-/* mirrored write */
-static int receive_Data(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static void fail_postponed_requests(struct drbd_conf *mdev, sector_t sector,
+ unsigned int size)
{
+ struct drbd_interval *i;
+
+ repeat:
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ struct drbd_request *req;
+ struct bio_and_error m;
+
+ if (!i->local)
+ continue;
+ req = container_of(i, struct drbd_request, i);
+ if (!(req->rq_state & RQ_POSTPONED))
+ continue;
+ req->rq_state &= ~RQ_POSTPONED;
+ __req_mod(req, NEG_ACKED, &m);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ goto repeat;
+ }
+}
+
+static int handle_write_conflicts(struct drbd_conf *mdev,
+ struct drbd_peer_request *peer_req)
+{
+ struct drbd_tconn *tconn = mdev->tconn;
+ bool resolve_conflicts = test_bit(RESOLVE_CONFLICTS, &tconn->flags);
+ sector_t sector = peer_req->i.sector;
+ const unsigned int size = peer_req->i.size;
+ struct drbd_interval *i;
+ bool equal;
+ int err;
+
+ /*
+ * Inserting the peer request into the write_requests tree will prevent
+ * new conflicting local requests from being added.
+ */
+ drbd_insert_interval(&mdev->write_requests, &peer_req->i);
+
+ repeat:
+ drbd_for_each_overlap(i, &mdev->write_requests, sector, size) {
+ if (i == &peer_req->i)
+ continue;
+
+ if (!i->local) {
+ /*
+ * Our peer has sent a conflicting remote request; this
+ * should not happen in a two-node setup. Wait for the
+ * earlier peer request to complete.
+ */
+ err = drbd_wait_misc(mdev, i);
+ if (err)
+ goto out;
+ goto repeat;
+ }
+
+ equal = i->sector == sector && i->size == size;
+ if (resolve_conflicts) {
+ /*
+ * If the peer request is fully contained within the
+ * overlapping request, it can be considered overwritten
+ * and thus superseded; otherwise, it will be retried
+ * once all overlapping requests have completed.
+ */
+ bool superseded = i->sector <= sector && i->sector +
+ (i->size >> 9) >= sector + (size >> 9);
+
+ if (!equal)
+ dev_alert(DEV, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u, "
+ "assuming %s came first\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size,
+ superseded ? "local" : "remote");
+
+ inc_unacked(mdev);
+ peer_req->w.cb = superseded ? e_send_superseded :
+ e_send_retry_write;
+ list_add_tail(&peer_req->w.list, &mdev->done_ee);
+ wake_asender(mdev->tconn);
+
+ err = -ENOENT;
+ goto out;
+ } else {
+ struct drbd_request *req =
+ container_of(i, struct drbd_request, i);
+
+ if (!equal)
+ dev_alert(DEV, "Concurrent writes detected: "
+ "local=%llus +%u, remote=%llus +%u\n",
+ (unsigned long long)i->sector, i->size,
+ (unsigned long long)sector, size);
+
+ if (req->rq_state & RQ_LOCAL_PENDING ||
+ !(req->rq_state & RQ_POSTPONED)) {
+ /*
+ * Wait for the node with the discard flag to
+ * decide if this request has been superseded
+ * or needs to be retried.
+ * Requests that have been superseded will
+ * disappear from the write_requests tree.
+ *
+ * In addition, wait for the conflicting
+ * request to finish locally before submitting
+ * the conflicting peer request.
+ */
+ err = drbd_wait_misc(mdev, &req->i);
+ if (err) {
+ _conn_request_state(mdev->tconn,
+ NS(conn, C_TIMEOUT),
+ CS_HARD);
+ fail_postponed_requests(mdev, sector, size);
+ goto out;
+ }
+ goto repeat;
+ }
+ /*
+ * Remember to restart the conflicting requests after
+ * the new peer request has completed.
+ */
+ peer_req->flags |= EE_RESTART_REQUESTS;
+ }
+ }
+ err = 0;
+
+ out:
+ if (err)
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ return err;
+}
+
+/* mirrored write */
+static int receive_Data(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
sector_t sector;
- struct drbd_epoch_entry *e;
- struct p_data *p = &mdev->data.rbuf.data;
+ struct drbd_peer_request *peer_req;
+ struct p_data *p = pi->data;
+ u32 peer_seq = be32_to_cpu(p->seq_num);
int rw = WRITE;
u32 dp_flags;
+ int err, tp;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
if (!get_ldev(mdev)) {
- spin_lock(&mdev->peer_seq_lock);
- if (mdev->peer_seq+1 == be32_to_cpu(p->seq_num))
- mdev->peer_seq++;
- spin_unlock(&mdev->peer_seq_lock);
+ int err2;
- drbd_send_ack_dp(mdev, P_NEG_ACK, p, data_size);
- atomic_inc(&mdev->current_epoch->epoch_size);
- return drbd_drain_block(mdev, data_size);
+ err = wait_for_and_update_peer_seq(mdev, peer_seq);
+ drbd_send_ack_dp(mdev, P_NEG_ACK, p, pi->size);
+ atomic_inc(&tconn->current_epoch->epoch_size);
+ err2 = drbd_drain_block(mdev, pi->size);
+ if (!err)
+ err = err2;
+ return err;
}
- /* get_ldev(mdev) successful.
- * Corresponding put_ldev done either below (on various errors),
- * or in drbd_endio_write_sec, if we successfully submit the data at
- * the end of this function. */
+ /*
+ * Corresponding put_ldev done either below (on various errors), or in
+ * drbd_peer_request_endio, if we successfully submit the data at the
+ * end of this function.
+ */
sector = be64_to_cpu(p->sector);
- e = read_in_block(mdev, p->block_id, sector, data_size);
- if (!e) {
+ peer_req = read_in_block(mdev, p->block_id, sector, pi->size);
+ if (!peer_req) {
put_ldev(mdev);
- return false;
+ return -EIO;
}
- e->w.cb = e_end_block;
+ peer_req->w.cb = e_end_block;
dp_flags = be32_to_cpu(p->dp_flags);
rw |= wire_flags_to_bio(mdev, dp_flags);
- if (e->pages == NULL) {
- D_ASSERT(e->size == 0);
+ if (peer_req->pages == NULL) {
+ D_ASSERT(peer_req->i.size == 0);
D_ASSERT(dp_flags & DP_FLUSH);
}
if (dp_flags & DP_MAY_SET_IN_SYNC)
- e->flags |= EE_MAY_SET_IN_SYNC;
+ peer_req->flags |= EE_MAY_SET_IN_SYNC;
- spin_lock(&mdev->epoch_lock);
- e->epoch = mdev->current_epoch;
- atomic_inc(&e->epoch->epoch_size);
- atomic_inc(&e->epoch->active);
- spin_unlock(&mdev->epoch_lock);
+ spin_lock(&tconn->epoch_lock);
+ peer_req->epoch = tconn->current_epoch;
+ atomic_inc(&peer_req->epoch->epoch_size);
+ atomic_inc(&peer_req->epoch->active);
+ spin_unlock(&tconn->epoch_lock);
- /* I'm the receiver, I do hold a net_cnt reference. */
- if (!mdev->net_conf->two_primaries) {
- spin_lock_irq(&mdev->req_lock);
- } else {
- /* don't get the req_lock yet,
- * we may sleep in drbd_wait_peer_seq */
- const int size = e->size;
- const int discard = drbd_test_flag(mdev, DISCARD_CONCURRENT);
- DEFINE_WAIT(wait);
- struct drbd_request *i;
- struct hlist_node *n;
- struct hlist_head *slot;
- int first;
-
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- BUG_ON(mdev->ee_hash == NULL);
- BUG_ON(mdev->tl_hash == NULL);
-
- /* conflict detection and handling:
- * 1. wait on the sequence number,
- * in case this data packet overtook ACK packets.
- * 2. check our hash tables for conflicting requests.
- * we only need to walk the tl_hash, since an ee can not
- * have a conflict with an other ee: on the submitting
- * node, the corresponding req had already been conflicting,
- * and a conflicting req is never sent.
- *
- * Note: for two_primaries, we are protocol C,
- * so there cannot be any request that is DONE
- * but still on the transfer log.
- *
- * unconditionally add to the ee_hash.
- *
- * if no conflicting request is found:
- * submit.
- *
- * if any conflicting request is found
- * that has not yet been acked,
- * AND I have the "discard concurrent writes" flag:
- * queue (via done_ee) the P_DISCARD_ACK; OUT.
- *
- * if any conflicting request is found:
- * block the receiver, waiting on misc_wait
- * until no more conflicting requests are there,
- * or we get interrupted (disconnect).
- *
- * we do not just write after local io completion of those
- * requests, but only after req is done completely, i.e.
- * we wait for the P_DISCARD_ACK to arrive!
- *
- * then proceed normally, i.e. submit.
- */
- if (drbd_wait_peer_seq(mdev, be32_to_cpu(p->seq_num)))
+ rcu_read_lock();
+ tp = rcu_dereference(mdev->tconn->net_conf)->two_primaries;
+ rcu_read_unlock();
+ if (tp) {
+ peer_req->flags |= EE_IN_INTERVAL_TREE;
+ err = wait_for_and_update_peer_seq(mdev, peer_seq);
+ if (err)
goto out_interrupted;
-
- spin_lock_irq(&mdev->req_lock);
-
- hlist_add_head(&e->collision, ee_hash_slot(mdev, sector));
-
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev, sector);
- first = 1;
- for (;;) {
- int have_unacked = 0;
- int have_conflict = 0;
- prepare_to_wait(&mdev->misc_wait, &wait,
- TASK_INTERRUPTIBLE);
- hlist_for_each_entry(i, n, slot, collision) {
- if (OVERLAPS) {
- /* only ALERT on first iteration,
- * we may be woken up early... */
- if (first)
- dev_alert(DEV, "%s[%u] Concurrent local write detected!"
- " new: %llus +%u; pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- if (i->rq_state & RQ_NET_PENDING)
- ++have_unacked;
- ++have_conflict;
- }
- }
-#undef OVERLAPS
- if (!have_conflict)
- break;
-
- /* Discard Ack only for the _first_ iteration */
- if (first && discard && have_unacked) {
- dev_alert(DEV, "Concurrent write! [DISCARD BY FLAG] sec=%llus\n",
- (unsigned long long)sector);
- inc_unacked(mdev);
- e->w.cb = e_send_discard_ack;
- list_add_tail(&e->w.list, &mdev->done_ee);
-
- spin_unlock_irq(&mdev->req_lock);
-
- /* we could probably send that P_DISCARD_ACK ourselves,
- * but I don't like the receiver using the msock */
-
+ spin_lock_irq(&mdev->tconn->req_lock);
+ err = handle_write_conflicts(mdev, peer_req);
+ if (err) {
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (err == -ENOENT) {
put_ldev(mdev);
- wake_asender(mdev);
- finish_wait(&mdev->misc_wait, &wait);
- return true;
+ return 0;
}
-
- if (signal_pending(current)) {
- hlist_del_init(&e->collision);
-
- spin_unlock_irq(&mdev->req_lock);
-
- finish_wait(&mdev->misc_wait, &wait);
- goto out_interrupted;
- }
-
- spin_unlock_irq(&mdev->req_lock);
- if (first) {
- first = 0;
- dev_alert(DEV, "Concurrent write! [W AFTERWARDS] "
- "sec=%llus\n", (unsigned long long)sector);
- } else if (discard) {
- /* we had none on the first iteration.
- * there must be none now. */
- D_ASSERT(have_unacked == 0);
- }
- schedule();
- spin_lock_irq(&mdev->req_lock);
+ goto out_interrupted;
}
- finish_wait(&mdev->misc_wait, &wait);
- }
-
- list_add(&e->w.list, &mdev->active_ee);
- spin_unlock_irq(&mdev->req_lock);
+ } else
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->active_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (mdev->state.conn == C_SYNC_TARGET)
- wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, e));
+ wait_event(mdev->ee_wait, !overlapping_resync_write(mdev, peer_req));
- switch (mdev->net_conf->wire_protocol) {
- case DRBD_PROT_C:
+ if (mdev->tconn->agreed_pro_version < 100) {
+ rcu_read_lock();
+ switch (rcu_dereference(mdev->tconn->net_conf)->wire_protocol) {
+ case DRBD_PROT_C:
+ dp_flags |= DP_SEND_WRITE_ACK;
+ break;
+ case DRBD_PROT_B:
+ dp_flags |= DP_SEND_RECEIVE_ACK;
+ break;
+ }
+ rcu_read_unlock();
+ }
+
+ if (dp_flags & DP_SEND_WRITE_ACK) {
+ peer_req->flags |= EE_SEND_WRITE_ACK;
inc_unacked(mdev);
/* corresponding dec_unacked() in e_end_block()
* respective _drbd_clear_done_ee */
- break;
- case DRBD_PROT_B:
+ }
+
+ if (dp_flags & DP_SEND_RECEIVE_ACK) {
/* I really don't like it that the receiver thread
* sends on the msock, but anyways */
- drbd_send_ack(mdev, P_RECV_ACK, e);
- break;
- case DRBD_PROT_A:
- /* nothing to do */
- break;
+ drbd_send_ack(mdev, P_RECV_ACK, peer_req);
}
if (mdev->state.pdsk < D_INCONSISTENT) {
/* In case we have the only disk of the cluster, */
- drbd_set_out_of_sync(mdev, e->sector, e->size);
- e->flags |= EE_CALL_AL_COMPLETE_IO;
- e->flags &= ~EE_MAY_SET_IN_SYNC;
- drbd_al_begin_io(mdev, e->sector);
+ drbd_set_out_of_sync(mdev, peer_req->i.sector, peer_req->i.size);
+ peer_req->flags |= EE_CALL_AL_COMPLETE_IO;
+ peer_req->flags &= ~EE_MAY_SET_IN_SYNC;
+ drbd_al_begin_io(mdev, &peer_req->i);
}
- if (drbd_submit_ee(mdev, e, rw, DRBD_FAULT_DT_WR) == 0)
- return true;
+ err = drbd_submit_peer_request(mdev, peer_req, rw, DRBD_FAULT_DT_WR);
+ if (!err)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- hlist_del_init(&e->collision);
- spin_unlock_irq(&mdev->req_lock);
- if (e->flags & EE_CALL_AL_COMPLETE_IO)
- drbd_al_complete_io(mdev, e->sector);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ drbd_remove_epoch_entry_interval(mdev, peer_req);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ if (peer_req->flags & EE_CALL_AL_COMPLETE_IO)
+ drbd_al_complete_io(mdev, &peer_req->i);
out_interrupted:
- drbd_may_finish_epoch(mdev, e->epoch, EV_PUT + EV_CLEANUP);
+ drbd_may_finish_epoch(tconn, peer_req->epoch, EV_PUT + EV_CLEANUP);
put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return false;
+ drbd_free_peer_req(mdev, peer_req);
+ return err;
}
/* We may throttle resync, if the lower device seems to be busy,
@@ -1922,9 +2296,14 @@
struct lc_element *tmp;
int curr_events;
int throttle = 0;
+ unsigned int c_min_rate;
+
+ rcu_read_lock();
+ c_min_rate = rcu_dereference(mdev->ldev->disk_conf)->c_min_rate;
+ rcu_read_unlock();
/* feature disabled? */
- if (mdev->sync_conf.c_min_rate == 0)
+ if (c_min_rate == 0)
return 0;
spin_lock_irq(&mdev->al_lock);
@@ -1964,40 +2343,46 @@
db = mdev->rs_mark_left[i] - rs_left;
dbdt = Bit2KB(db/dt);
- if (dbdt > mdev->sync_conf.c_min_rate)
+ if (dbdt > c_min_rate)
throttle = 1;
}
return throttle;
}
-static int receive_DataRequest(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int digest_size)
+static int receive_DataRequest(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
- const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
- struct drbd_epoch_entry *e;
+ sector_t capacity;
+ struct drbd_peer_request *peer_req;
struct digest_info *di = NULL;
int size, verb;
unsigned int fault_type;
- struct p_block_req *p = &mdev->data.rbuf.block_req;
+ struct p_block_req *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+ capacity = drbd_get_capacity(mdev->this_bdev);
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
- if (size <= 0 || (size & 0x1ff) != 0 || size > DRBD_MAX_BIO_SIZE) {
+ if (size <= 0 || !IS_ALIGNED(size, 512) || size > DRBD_MAX_BIO_SIZE) {
dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return false;
+ return -EINVAL;
}
if (sector + (size>>9) > capacity) {
dev_err(DEV, "%s:%d: sector: %llus, size: %u\n", __FILE__, __LINE__,
(unsigned long long)sector, size);
- return false;
+ return -EINVAL;
}
if (!get_ldev_if_state(mdev, D_UP_TO_DATE)) {
verb = 1;
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
drbd_send_ack_rp(mdev, P_NEG_DREPLY, p);
break;
@@ -2012,35 +2397,34 @@
drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size, ID_IN_SYNC);
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
+ BUG();
}
if (verb && __ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Can not satisfy peer's read request, "
"no local data.\n");
/* drain possibly payload */
- return drbd_drain_block(mdev, digest_size);
+ return drbd_drain_block(mdev, pi->size);
}
/* GFP_NOIO, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
- e = drbd_alloc_ee(mdev, p->block_id, sector, size, GFP_NOIO);
- if (!e) {
+ peer_req = drbd_alloc_peer_req(mdev, p->block_id, sector, size, GFP_NOIO);
+ if (!peer_req) {
put_ldev(mdev);
- return false;
+ return -ENOMEM;
}
- switch (cmd) {
+ switch (pi->cmd) {
case P_DATA_REQUEST:
- e->w.cb = w_e_end_data_req;
+ peer_req->w.cb = w_e_end_data_req;
fault_type = DRBD_FAULT_DT_RD;
/* application IO, don't drbd_rs_begin_io */
goto submit;
case P_RS_DATA_REQUEST:
- e->w.cb = w_e_end_rsdata_req;
+ peer_req->w.cb = w_e_end_rsdata_req;
fault_type = DRBD_FAULT_RS_RD;
/* used in the sector offset progress display */
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
@@ -2049,28 +2433,28 @@
case P_OV_REPLY:
case P_CSUM_RS_REQUEST:
fault_type = DRBD_FAULT_RS_RD;
- di = kmalloc(sizeof(*di) + digest_size, GFP_NOIO);
+ di = kmalloc(sizeof(*di) + pi->size, GFP_NOIO);
if (!di)
goto out_free_e;
- di->digest_size = digest_size;
+ di->digest_size = pi->size;
di->digest = (((char *)di)+sizeof(struct digest_info));
- e->digest = di;
- e->flags |= EE_HAS_DIGEST;
+ peer_req->digest = di;
+ peer_req->flags |= EE_HAS_DIGEST;
- if (drbd_recv(mdev, di->digest, digest_size) != digest_size)
+ if (drbd_recv_all(mdev->tconn, di->digest, pi->size))
goto out_free_e;
- if (cmd == P_CSUM_RS_REQUEST) {
- D_ASSERT(mdev->agreed_pro_version >= 89);
- e->w.cb = w_e_end_csum_rs_req;
+ if (pi->cmd == P_CSUM_RS_REQUEST) {
+ D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
+ peer_req->w.cb = w_e_end_csum_rs_req;
/* used in the sector offset progress display */
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
- } else if (cmd == P_OV_REPLY) {
+ } else if (pi->cmd == P_OV_REPLY) {
/* track progress, we may need to throttle */
atomic_add(size >> 9, &mdev->rs_sect_in);
- e->w.cb = w_e_end_ov_reply;
+ peer_req->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
/* drbd_rs_begin_io done when we sent this request,
* but accounting still needs to be done. */
@@ -2080,7 +2464,7 @@
case P_OV_REQUEST:
if (mdev->ov_start_sector == ~(sector_t)0 &&
- mdev->agreed_pro_version >= 90) {
+ mdev->tconn->agreed_pro_version >= 90) {
unsigned long now = jiffies;
int i;
mdev->ov_start_sector = sector;
@@ -2094,15 +2478,12 @@
dev_info(DEV, "Online Verify start sector: %llu\n",
(unsigned long long)sector);
}
- e->w.cb = w_e_end_ov_req;
+ peer_req->w.cb = w_e_end_ov_req;
fault_type = DRBD_FAULT_RS_RD;
break;
default:
- dev_err(DEV, "unexpected command (%s) in receive_DataRequest\n",
- cmdname(cmd));
- fault_type = DRBD_FAULT_MAX;
- goto out_free_e;
+ BUG();
}
/* Throttle, drbd_rs_begin_io and submit should become asynchronous
@@ -2137,30 +2518,31 @@
submit:
inc_unacked(mdev);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add_tail(&peer_req->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- if (drbd_submit_ee(mdev, e, READ, fault_type) == 0)
- return true;
+ if (drbd_submit_peer_request(mdev, peer_req, READ, fault_type) == 0)
+ return 0;
/* don't care for the reason here */
dev_err(DEV, "submit failed, triggering re-connect\n");
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* no drbd_rs_complete_io(), we are dropping the connection anyways */
out_free_e:
put_ldev(mdev);
- drbd_free_ee(mdev, e);
- return false;
+ drbd_free_peer_req(mdev, peer_req);
+ return -EIO;
}
static int drbd_asb_recover_0p(struct drbd_conf *mdev) __must_hold(local)
{
int self, peer, rv = -100;
unsigned long ch_self, ch_peer;
+ enum drbd_after_sb_p after_sb_0p;
self = mdev->ldev->md.uuid[UI_BITMAP] & 1;
peer = mdev->p_uuid[UI_BITMAP] & 1;
@@ -2168,10 +2550,14 @@
ch_peer = mdev->p_uuid[UI_SIZE];
ch_self = mdev->comm_bm_set;
- switch (mdev->net_conf->after_sb_0p) {
+ rcu_read_lock();
+ after_sb_0p = rcu_dereference(mdev->tconn->net_conf)->after_sb_0p;
+ rcu_read_unlock();
+ switch (after_sb_0p) {
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
case ASB_CALL_HELPER:
+ case ASB_VIOLENTLY:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_DISCONNECT:
@@ -2200,14 +2586,14 @@
"Using discard-least-changes instead\n");
case ASB_DISCARD_ZERO_CHG:
if (ch_peer == 0 && ch_self == 0) {
- rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+ rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
? -1 : 1;
break;
} else {
if (ch_peer == 0) { rv = 1; break; }
if (ch_self == 0) { rv = -1; break; }
}
- if (mdev->net_conf->after_sb_0p == ASB_DISCARD_ZERO_CHG)
+ if (after_sb_0p == ASB_DISCARD_ZERO_CHG)
break;
case ASB_DISCARD_LEAST_CHG:
if (ch_self < ch_peer)
@@ -2216,7 +2602,7 @@
rv = 1;
else /* ( ch_self == ch_peer ) */
/* Well, then use something else. */
- rv = drbd_test_flag(mdev, DISCARD_CONCURRENT)
+ rv = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags)
? -1 : 1;
break;
case ASB_DISCARD_LOCAL:
@@ -2232,13 +2618,18 @@
static int drbd_asb_recover_1p(struct drbd_conf *mdev) __must_hold(local)
{
int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_1p;
- switch (mdev->net_conf->after_sb_1p) {
+ rcu_read_lock();
+ after_sb_1p = rcu_dereference(mdev->tconn->net_conf)->after_sb_1p;
+ rcu_read_unlock();
+ switch (after_sb_1p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
case ASB_DISCARD_LOCAL:
case ASB_DISCARD_REMOTE:
+ case ASB_DISCARD_ZERO_CHG:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_DISCONNECT:
@@ -2281,8 +2672,12 @@
static int drbd_asb_recover_2p(struct drbd_conf *mdev) __must_hold(local)
{
int hg, rv = -100;
+ enum drbd_after_sb_p after_sb_2p;
- switch (mdev->net_conf->after_sb_2p) {
+ rcu_read_lock();
+ after_sb_2p = rcu_dereference(mdev->tconn->net_conf)->after_sb_2p;
+ rcu_read_unlock();
+ switch (after_sb_2p) {
case ASB_DISCARD_YOUNGER_PRI:
case ASB_DISCARD_OLDER_PRI:
case ASB_DISCARD_LEAST_CHG:
@@ -2290,6 +2685,7 @@
case ASB_DISCARD_REMOTE:
case ASB_CONSENSUS:
case ASB_DISCARD_SECONDARY:
+ case ASB_DISCARD_ZERO_CHG:
dev_err(DEV, "Configuration error.\n");
break;
case ASB_VIOLENTLY:
@@ -2375,7 +2771,7 @@
if (mdev->p_uuid[UI_BITMAP] == (u64)0 && mdev->ldev->md.uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
if ((mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1)) == (mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) &&
@@ -2398,7 +2794,7 @@
if (mdev->ldev->md.uuid[UI_BITMAP] == (u64)0 && mdev->p_uuid[UI_BITMAP] != (u64)0) {
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
if ((mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) == (mdev->p_uuid[UI_BITMAP] & ~((u64)1)) &&
@@ -2420,7 +2816,7 @@
}
/* Common power [off|failure] */
- rct = (drbd_test_flag(mdev, CRASHED_PRIMARY) ? 1 : 0) +
+ rct = (test_bit(CRASHED_PRIMARY, &mdev->flags) ? 1 : 0) +
(mdev->p_uuid[UI_FLAGS] & 2);
/* lowest bit is set when we were primary,
* next bit (weight 2) is set when peer was primary */
@@ -2431,7 +2827,7 @@
case 1: /* self_pri && !peer_pri */ return 1;
case 2: /* !self_pri && peer_pri */ return -1;
case 3: /* self_pri && peer_pri */
- dc = drbd_test_flag(mdev, DISCARD_CONCURRENT);
+ dc = test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags);
return dc ? -1 : 1;
}
}
@@ -2444,14 +2840,14 @@
*rule_nr = 51;
peer = mdev->p_uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- if (mdev->agreed_pro_version < 96 ?
+ if (mdev->tconn->agreed_pro_version < 96 ?
(mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1)) ==
(mdev->p_uuid[UI_HISTORY_START + 1] & ~((u64)1)) :
peer + UUID_NEW_BM_OFFSET == (mdev->p_uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of the peer's UUIDs. */
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
mdev->p_uuid[UI_BITMAP] = mdev->p_uuid[UI_HISTORY_START];
@@ -2481,14 +2877,14 @@
*rule_nr = 71;
self = mdev->ldev->md.uuid[UI_HISTORY_START] & ~((u64)1);
if (self == peer) {
- if (mdev->agreed_pro_version < 96 ?
+ if (mdev->tconn->agreed_pro_version < 96 ?
(mdev->ldev->md.uuid[UI_HISTORY_START + 1] & ~((u64)1)) ==
(mdev->p_uuid[UI_HISTORY_START] & ~((u64)1)) :
self + UUID_NEW_BM_OFFSET == (mdev->ldev->md.uuid[UI_BITMAP] & ~((u64)1))) {
/* The last P_SYNC_UUID did not get though. Undo the last start of
resync as sync source modifications of our UUIDs. */
- if (mdev->agreed_pro_version < 91)
+ if (mdev->tconn->agreed_pro_version < 91)
return -1091;
__drbd_uuid_set(mdev, UI_BITMAP, mdev->ldev->md.uuid[UI_HISTORY_START]);
@@ -2536,9 +2932,10 @@
static enum drbd_conns drbd_sync_handshake(struct drbd_conf *mdev, enum drbd_role peer_role,
enum drbd_disk_state peer_disk) __must_hold(local)
{
- int hg, rule_nr;
enum drbd_conns rv = C_MASK;
enum drbd_disk_state mydisk;
+ struct net_conf *nc;
+ int hg, rule_nr, rr_conflict, tentative;
mydisk = mdev->state.disk;
if (mydisk == D_NEGOTIATING)
@@ -2578,7 +2975,10 @@
if (abs(hg) == 100)
drbd_khelper(mdev, "initial-split-brain");
- if (hg == 100 || (hg == -100 && mdev->net_conf->always_asbp)) {
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+
+ if (hg == 100 || (hg == -100 && nc->always_asbp)) {
int pcount = (mdev->state.role == R_PRIMARY)
+ (peer_role == R_PRIMARY);
int forced = (hg == -100);
@@ -2607,9 +3007,9 @@
}
if (hg == -100) {
- if (mdev->net_conf->want_lose && !(mdev->p_uuid[UI_FLAGS]&1))
+ if (test_bit(DISCARD_MY_DATA, &mdev->flags) && !(mdev->p_uuid[UI_FLAGS]&1))
hg = -1;
- if (!mdev->net_conf->want_lose && (mdev->p_uuid[UI_FLAGS]&1))
+ if (!test_bit(DISCARD_MY_DATA, &mdev->flags) && (mdev->p_uuid[UI_FLAGS]&1))
hg = 1;
if (abs(hg) < 100)
@@ -2617,6 +3017,9 @@
"Sync from %s node\n",
(hg < 0) ? "peer" : "this");
}
+ rr_conflict = nc->rr_conflict;
+ tentative = nc->tentative;
+ rcu_read_unlock();
if (hg == -100) {
/* FIXME this log message is not correct if we end up here
@@ -2635,7 +3038,7 @@
if (hg < 0 && /* by intention we do not use mydisk here. */
mdev->state.role == R_PRIMARY && mdev->state.disk >= D_CONSISTENT) {
- switch (mdev->net_conf->rr_conflict) {
+ switch (rr_conflict) {
case ASB_CALL_HELPER:
drbd_khelper(mdev, "pri-lost");
/* fall through */
@@ -2648,7 +3051,7 @@
}
}
- if (mdev->net_conf->dry_run || drbd_test_flag(mdev, CONN_DRY_RUN)) {
+ if (tentative || test_bit(CONN_DRY_RUN, &mdev->tconn->flags)) {
if (hg == 0)
dev_info(DEV, "dry-run connect: No resync, would become Connected immediately.\n");
else
@@ -2680,33 +3083,29 @@
return rv;
}
-/* returns 1 if invalid */
-static int cmp_after_sb(enum drbd_after_sb_p peer, enum drbd_after_sb_p self)
+static enum drbd_after_sb_p convert_after_sb(enum drbd_after_sb_p peer)
{
/* ASB_DISCARD_REMOTE - ASB_DISCARD_LOCAL is valid */
- if ((peer == ASB_DISCARD_REMOTE && self == ASB_DISCARD_LOCAL) ||
- (self == ASB_DISCARD_REMOTE && peer == ASB_DISCARD_LOCAL))
- return 0;
+ if (peer == ASB_DISCARD_REMOTE)
+ return ASB_DISCARD_LOCAL;
/* any other things with ASB_DISCARD_REMOTE or ASB_DISCARD_LOCAL are invalid */
- if (peer == ASB_DISCARD_REMOTE || peer == ASB_DISCARD_LOCAL ||
- self == ASB_DISCARD_REMOTE || self == ASB_DISCARD_LOCAL)
- return 1;
+ if (peer == ASB_DISCARD_LOCAL)
+ return ASB_DISCARD_REMOTE;
/* everything else is valid if they are equal on both sides. */
- if (peer == self)
- return 0;
-
- /* everything es is invalid. */
- return 1;
+ return peer;
}
-static int receive_protocol(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_protocol(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_protocol *p = &mdev->data.rbuf.protocol;
- int p_proto, p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
- int p_want_lose, p_two_primaries, cf;
- char p_integrity_alg[SHARED_SECRET_MAX] = "";
+ struct p_protocol *p = pi->data;
+ enum drbd_after_sb_p p_after_sb_0p, p_after_sb_1p, p_after_sb_2p;
+ int p_proto, p_discard_my_data, p_two_primaries, cf;
+ struct net_conf *nc, *old_net_conf, *new_net_conf = NULL;
+ char integrity_alg[SHARED_SECRET_MAX] = "";
+ struct crypto_hash *peer_integrity_tfm = NULL;
+ void *int_dig_in = NULL, *int_dig_vv = NULL;
p_proto = be32_to_cpu(p->protocol);
p_after_sb_0p = be32_to_cpu(p->after_sb_0p);
@@ -2714,63 +3113,138 @@
p_after_sb_2p = be32_to_cpu(p->after_sb_2p);
p_two_primaries = be32_to_cpu(p->two_primaries);
cf = be32_to_cpu(p->conn_flags);
- p_want_lose = cf & CF_WANT_LOSE;
+ p_discard_my_data = cf & CF_DISCARD_MY_DATA;
- drbd_clear_flag(mdev, CONN_DRY_RUN);
+ if (tconn->agreed_pro_version >= 87) {
+ int err;
- if (cf & CF_DRY_RUN)
- drbd_set_flag(mdev, CONN_DRY_RUN);
-
- if (p_proto != mdev->net_conf->wire_protocol) {
- dev_err(DEV, "incompatible communication protocols\n");
- goto disconnect;
+ if (pi->size > sizeof(integrity_alg))
+ return -EIO;
+ err = drbd_recv_all(tconn, integrity_alg, pi->size);
+ if (err)
+ return err;
+ integrity_alg[SHARED_SECRET_MAX - 1] = 0;
}
- if (cmp_after_sb(p_after_sb_0p, mdev->net_conf->after_sb_0p)) {
- dev_err(DEV, "incompatible after-sb-0pri settings\n");
- goto disconnect;
+ if (pi->cmd != P_PROTOCOL_UPDATE) {
+ clear_bit(CONN_DRY_RUN, &tconn->flags);
+
+ if (cf & CF_DRY_RUN)
+ set_bit(CONN_DRY_RUN, &tconn->flags);
+
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+
+ if (p_proto != nc->wire_protocol) {
+ conn_err(tconn, "incompatible %s settings\n", "protocol");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_0p) != nc->after_sb_0p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-0pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_1p) != nc->after_sb_1p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-1pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (convert_after_sb(p_after_sb_2p) != nc->after_sb_2p) {
+ conn_err(tconn, "incompatible %s settings\n", "after-sb-2pri");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_discard_my_data && nc->discard_my_data) {
+ conn_err(tconn, "incompatible %s settings\n", "discard-my-data");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (p_two_primaries != nc->two_primaries) {
+ conn_err(tconn, "incompatible %s settings\n", "allow-two-primaries");
+ goto disconnect_rcu_unlock;
+ }
+
+ if (strcmp(integrity_alg, nc->integrity_alg)) {
+ conn_err(tconn, "incompatible %s settings\n", "data-integrity-alg");
+ goto disconnect_rcu_unlock;
+ }
+
+ rcu_read_unlock();
}
- if (cmp_after_sb(p_after_sb_1p, mdev->net_conf->after_sb_1p)) {
- dev_err(DEV, "incompatible after-sb-1pri settings\n");
- goto disconnect;
- }
+ if (integrity_alg[0]) {
+ int hash_size;
- if (cmp_after_sb(p_after_sb_2p, mdev->net_conf->after_sb_2p)) {
- dev_err(DEV, "incompatible after-sb-2pri settings\n");
- goto disconnect;
- }
+ /*
+ * We can only change the peer data integrity algorithm
+ * here. Changing our own data integrity algorithm
+ * requires that we send a P_PROTOCOL_UPDATE packet at
+ * the same time; otherwise, the peer has no way to
+ * tell between which packets the algorithm should
+ * change.
+ */
- if (p_want_lose && mdev->net_conf->want_lose) {
- dev_err(DEV, "both sides have the 'want_lose' flag set\n");
- goto disconnect;
- }
-
- if (p_two_primaries != mdev->net_conf->two_primaries) {
- dev_err(DEV, "incompatible setting of the two-primaries options\n");
- goto disconnect;
- }
-
- if (mdev->agreed_pro_version >= 87) {
- unsigned char *my_alg = mdev->net_conf->integrity_alg;
-
- if (drbd_recv(mdev, p_integrity_alg, data_size) != data_size)
- return false;
-
- p_integrity_alg[SHARED_SECRET_MAX-1] = 0;
- if (strcmp(p_integrity_alg, my_alg)) {
- dev_err(DEV, "incompatible setting of the data-integrity-alg\n");
+ peer_integrity_tfm = crypto_alloc_hash(integrity_alg, 0, CRYPTO_ALG_ASYNC);
+ if (!peer_integrity_tfm) {
+ conn_err(tconn, "peer data-integrity-alg %s not supported\n",
+ integrity_alg);
goto disconnect;
}
- dev_info(DEV, "data-integrity-alg: %s\n",
- my_alg[0] ? my_alg : (unsigned char *)"<not-used>");
+
+ hash_size = crypto_hash_digestsize(peer_integrity_tfm);
+ int_dig_in = kmalloc(hash_size, GFP_KERNEL);
+ int_dig_vv = kmalloc(hash_size, GFP_KERNEL);
+ if (!(int_dig_in && int_dig_vv)) {
+ conn_err(tconn, "Allocation of buffers for data integrity checking failed\n");
+ goto disconnect;
+ }
}
- return true;
+ new_net_conf = kmalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ conn_err(tconn, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+ mutex_lock(&tconn->data.mutex);
+ mutex_lock(&tconn->conf_update);
+ old_net_conf = tconn->net_conf;
+ *new_net_conf = *old_net_conf;
+
+ new_net_conf->wire_protocol = p_proto;
+ new_net_conf->after_sb_0p = convert_after_sb(p_after_sb_0p);
+ new_net_conf->after_sb_1p = convert_after_sb(p_after_sb_1p);
+ new_net_conf->after_sb_2p = convert_after_sb(p_after_sb_2p);
+ new_net_conf->two_primaries = p_two_primaries;
+
+ rcu_assign_pointer(tconn->net_conf, new_net_conf);
+ mutex_unlock(&tconn->conf_update);
+ mutex_unlock(&tconn->data.mutex);
+
+ crypto_free_hash(tconn->peer_integrity_tfm);
+ kfree(tconn->int_dig_in);
+ kfree(tconn->int_dig_vv);
+ tconn->peer_integrity_tfm = peer_integrity_tfm;
+ tconn->int_dig_in = int_dig_in;
+ tconn->int_dig_vv = int_dig_vv;
+
+ if (strcmp(old_net_conf->integrity_alg, integrity_alg))
+ conn_info(tconn, "peer data-integrity-alg: %s\n",
+ integrity_alg[0] ? integrity_alg : "(none)");
+
+ synchronize_rcu();
+ kfree(old_net_conf);
+ return 0;
+
+disconnect_rcu_unlock:
+ rcu_read_unlock();
disconnect:
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ crypto_free_hash(peer_integrity_tfm);
+ kfree(int_dig_in);
+ kfree(int_dig_vv);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* helper function
@@ -2792,24 +3266,64 @@
alg, name, PTR_ERR(tfm));
return tfm;
}
- if (!drbd_crypto_is_hash(crypto_hash_tfm(tfm))) {
- crypto_free_hash(tfm);
- dev_err(DEV, "\"%s\" is not a digest (%s)\n", alg, name);
- return ERR_PTR(-EINVAL);
- }
return tfm;
}
-static int receive_SyncParam(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int packet_size)
+static int ignore_remaining_packet(struct drbd_tconn *tconn, struct packet_info *pi)
{
- int ok = true;
- struct p_rs_param_95 *p = &mdev->data.rbuf.rs_param_95;
+ void *buffer = tconn->data.rbuf;
+ int size = pi->size;
+
+ while (size) {
+ int s = min_t(int, size, DRBD_SOCKET_BUFFER_SIZE);
+ s = drbd_recv(tconn, buffer, s);
+ if (s <= 0) {
+ if (s < 0)
+ return s;
+ break;
+ }
+ size -= s;
+ }
+ if (size)
+ return -EIO;
+ return 0;
+}
+
+/*
+ * config_unknown_volume - device configuration command for unknown volume
+ *
+ * When a device is added to an existing connection, the node on which the
+ * device is added first will send configuration commands to its peer but the
+ * peer will not know about the device yet. It will warn and ignore these
+ * commands. Once the device is added on the second node, the second node will
+ * send the same device configuration commands, but in the other direction.
+ *
+ * (We can also end up here if drbd is misconfigured.)
+ */
+static int config_unknown_volume(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ conn_warn(tconn, "%s packet received for volume %u, which is not configured locally\n",
+ cmdname(pi->cmd), pi->vnr);
+ return ignore_remaining_packet(tconn, pi);
+}
+
+static int receive_SyncParam(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct p_rs_param_95 *p;
unsigned int header_size, data_size, exp_max_sz;
struct crypto_hash *verify_tfm = NULL;
struct crypto_hash *csums_tfm = NULL;
- const int apv = mdev->agreed_pro_version;
- int *rs_plan_s = NULL;
+ struct net_conf *old_net_conf, *new_net_conf = NULL;
+ struct disk_conf *old_disk_conf = NULL, *new_disk_conf = NULL;
+ const int apv = tconn->agreed_pro_version;
+ struct fifo_buffer *old_plan = NULL, *new_plan = NULL;
int fifo_size = 0;
+ int err;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
exp_max_sz = apv <= 87 ? sizeof(struct p_rs_param)
: apv == 88 ? sizeof(struct p_rs_param)
@@ -2817,32 +3331,49 @@
: apv <= 94 ? sizeof(struct p_rs_param_89)
: /* apv >= 95 */ sizeof(struct p_rs_param_95);
- if (packet_size > exp_max_sz) {
+ if (pi->size > exp_max_sz) {
dev_err(DEV, "SyncParam packet too long: received %u, expected <= %u bytes\n",
- packet_size, exp_max_sz);
- return false;
+ pi->size, exp_max_sz);
+ return -EIO;
}
if (apv <= 88) {
- header_size = sizeof(struct p_rs_param) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param);
+ data_size = pi->size - header_size;
} else if (apv <= 94) {
- header_size = sizeof(struct p_rs_param_89) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param_89);
+ data_size = pi->size - header_size;
D_ASSERT(data_size == 0);
} else {
- header_size = sizeof(struct p_rs_param_95) - sizeof(struct p_header80);
- data_size = packet_size - header_size;
+ header_size = sizeof(struct p_rs_param_95);
+ data_size = pi->size - header_size;
D_ASSERT(data_size == 0);
}
/* initialize verify_alg and csums_alg */
+ p = pi->data;
memset(p->verify_alg, 0, 2 * SHARED_SECRET_MAX);
- if (drbd_recv(mdev, &p->head.payload, header_size) != header_size)
- return false;
+ err = drbd_recv_all(mdev->tconn, p, header_size);
+ if (err)
+ return err;
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
+ mutex_lock(&mdev->tconn->conf_update);
+ old_net_conf = mdev->tconn->net_conf;
+ if (get_ldev(mdev)) {
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ put_ldev(mdev);
+ mutex_unlock(&mdev->tconn->conf_update);
+ dev_err(DEV, "Allocation of new disk_conf failed\n");
+ return -ENOMEM;
+ }
+
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+
+ new_disk_conf->resync_rate = be32_to_cpu(p->resync_rate);
+ }
if (apv >= 88) {
if (apv == 88) {
@@ -2850,12 +3381,13 @@
dev_err(DEV, "verify-alg of wrong size, "
"peer wants %u, accepting only up to %u byte\n",
data_size, SHARED_SECRET_MAX);
- return false;
+ err = -EIO;
+ goto reconnect;
}
- if (drbd_recv(mdev, p->verify_alg, data_size) != data_size)
- return false;
-
+ err = drbd_recv_all(mdev->tconn, p->verify_alg, data_size);
+ if (err)
+ goto reconnect;
/* we expect NUL terminated string */
/* but just in case someone tries to be evil */
D_ASSERT(p->verify_alg[data_size-1] == 0);
@@ -2870,10 +3402,10 @@
p->csums_alg[SHARED_SECRET_MAX-1] = 0;
}
- if (strcmp(mdev->sync_conf.verify_alg, p->verify_alg)) {
+ if (strcmp(old_net_conf->verify_alg, p->verify_alg)) {
if (mdev->state.conn == C_WF_REPORT_PARAMS) {
dev_err(DEV, "Different verify-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.verify_alg, p->verify_alg);
+ old_net_conf->verify_alg, p->verify_alg);
goto disconnect;
}
verify_tfm = drbd_crypto_alloc_digest_safe(mdev,
@@ -2884,10 +3416,10 @@
}
}
- if (apv >= 89 && strcmp(mdev->sync_conf.csums_alg, p->csums_alg)) {
+ if (apv >= 89 && strcmp(old_net_conf->csums_alg, p->csums_alg)) {
if (mdev->state.conn == C_WF_REPORT_PARAMS) {
dev_err(DEV, "Different csums-alg settings. me=\"%s\" peer=\"%s\"\n",
- mdev->sync_conf.csums_alg, p->csums_alg);
+ old_net_conf->csums_alg, p->csums_alg);
goto disconnect;
}
csums_tfm = drbd_crypto_alloc_digest_safe(mdev,
@@ -2898,57 +3430,91 @@
}
}
- if (apv > 94) {
- mdev->sync_conf.rate = be32_to_cpu(p->rate);
- mdev->sync_conf.c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
- mdev->sync_conf.c_delay_target = be32_to_cpu(p->c_delay_target);
- mdev->sync_conf.c_fill_target = be32_to_cpu(p->c_fill_target);
- mdev->sync_conf.c_max_rate = be32_to_cpu(p->c_max_rate);
+ if (apv > 94 && new_disk_conf) {
+ new_disk_conf->c_plan_ahead = be32_to_cpu(p->c_plan_ahead);
+ new_disk_conf->c_delay_target = be32_to_cpu(p->c_delay_target);
+ new_disk_conf->c_fill_target = be32_to_cpu(p->c_fill_target);
+ new_disk_conf->c_max_rate = be32_to_cpu(p->c_max_rate);
- fifo_size = (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ;
- if (fifo_size != mdev->rs_plan_s.size && fifo_size > 0) {
- rs_plan_s = kzalloc(sizeof(int) * fifo_size, GFP_NOIO);
- if (!rs_plan_s) {
+ fifo_size = (new_disk_conf->c_plan_ahead * 10 * SLEEP_TIME) / HZ;
+ if (fifo_size != mdev->rs_plan_s->size) {
+ new_plan = fifo_alloc(fifo_size);
+ if (!new_plan) {
dev_err(DEV, "kmalloc of fifo_buffer failed");
+ put_ldev(mdev);
goto disconnect;
}
}
}
- spin_lock(&mdev->peer_seq_lock);
- /* lock against drbd_nl_syncer_conf() */
- if (verify_tfm) {
- strcpy(mdev->sync_conf.verify_alg, p->verify_alg);
- mdev->sync_conf.verify_alg_len = strlen(p->verify_alg) + 1;
- crypto_free_hash(mdev->verify_tfm);
- mdev->verify_tfm = verify_tfm;
- dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+ if (verify_tfm || csums_tfm) {
+ new_net_conf = kzalloc(sizeof(struct net_conf), GFP_KERNEL);
+ if (!new_net_conf) {
+ dev_err(DEV, "Allocation of new net_conf failed\n");
+ goto disconnect;
+ }
+
+ *new_net_conf = *old_net_conf;
+
+ if (verify_tfm) {
+ strcpy(new_net_conf->verify_alg, p->verify_alg);
+ new_net_conf->verify_alg_len = strlen(p->verify_alg) + 1;
+ crypto_free_hash(mdev->tconn->verify_tfm);
+ mdev->tconn->verify_tfm = verify_tfm;
+ dev_info(DEV, "using verify-alg: \"%s\"\n", p->verify_alg);
+ }
+ if (csums_tfm) {
+ strcpy(new_net_conf->csums_alg, p->csums_alg);
+ new_net_conf->csums_alg_len = strlen(p->csums_alg) + 1;
+ crypto_free_hash(mdev->tconn->csums_tfm);
+ mdev->tconn->csums_tfm = csums_tfm;
+ dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
+ }
+ rcu_assign_pointer(tconn->net_conf, new_net_conf);
}
- if (csums_tfm) {
- strcpy(mdev->sync_conf.csums_alg, p->csums_alg);
- mdev->sync_conf.csums_alg_len = strlen(p->csums_alg) + 1;
- crypto_free_hash(mdev->csums_tfm);
- mdev->csums_tfm = csums_tfm;
- dev_info(DEV, "using csums-alg: \"%s\"\n", p->csums_alg);
- }
- if (fifo_size != mdev->rs_plan_s.size) {
- kfree(mdev->rs_plan_s.values);
- mdev->rs_plan_s.values = rs_plan_s;
- mdev->rs_plan_s.size = fifo_size;
- mdev->rs_planed = 0;
- }
- spin_unlock(&mdev->peer_seq_lock);
}
- return ok;
+ if (new_disk_conf) {
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ put_ldev(mdev);
+ }
+
+ if (new_plan) {
+ old_plan = mdev->rs_plan_s;
+ rcu_assign_pointer(mdev->rs_plan_s, new_plan);
+ }
+
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ if (new_net_conf)
+ kfree(old_net_conf);
+ kfree(old_disk_conf);
+ kfree(old_plan);
+
+ return 0;
+
+reconnect:
+ if (new_disk_conf) {
+ put_ldev(mdev);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&mdev->tconn->conf_update);
+ return -EIO;
+
disconnect:
+ kfree(new_plan);
+ if (new_disk_conf) {
+ put_ldev(mdev);
+ kfree(new_disk_conf);
+ }
+ mutex_unlock(&mdev->tconn->conf_update);
/* just for completeness: actually not needed,
* as this is not reached if csums_tfm was ok. */
crypto_free_hash(csums_tfm);
/* but free the verify_tfm again, if csums_tfm did not work out */
crypto_free_hash(verify_tfm);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
/* warn if the arguments differ by more than 12.5% */
@@ -2964,59 +3530,77 @@
(unsigned long long)a, (unsigned long long)b);
}
-static int receive_sizes(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_sizes *p = &mdev->data.rbuf.sizes;
+ struct drbd_conf *mdev;
+ struct p_sizes *p = pi->data;
enum determine_dev_size dd = unchanged;
sector_t p_size, p_usize, my_usize;
int ldsc = 0; /* local disk size changed */
enum dds_flags ddsf;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
p_size = be64_to_cpu(p->d_size);
p_usize = be64_to_cpu(p->u_size);
- if (p_size == 0 && mdev->state.disk == D_DISKLESS) {
- dev_err(DEV, "some backing storage is needed\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
- }
-
/* just store the peer's disk size for now.
* we still need to figure out whether we accept that. */
mdev->p_size = p_size;
if (get_ldev(mdev)) {
+ rcu_read_lock();
+ my_usize = rcu_dereference(mdev->ldev->disk_conf)->disk_size;
+ rcu_read_unlock();
+
warn_if_differ_considerably(mdev, "lower level device sizes",
p_size, drbd_get_max_capacity(mdev->ldev));
warn_if_differ_considerably(mdev, "user requested size",
- p_usize, mdev->ldev->dc.disk_size);
+ p_usize, my_usize);
/* if this is the first connect, or an otherwise expected
* param exchange, choose the minimum */
if (mdev->state.conn == C_WF_REPORT_PARAMS)
- p_usize = min_not_zero((sector_t)mdev->ldev->dc.disk_size,
- p_usize);
-
- my_usize = mdev->ldev->dc.disk_size;
-
- if (mdev->ldev->dc.disk_size != p_usize) {
- mdev->ldev->dc.disk_size = p_usize;
- dev_info(DEV, "Peer sets u_size to %lu sectors\n",
- (unsigned long)mdev->ldev->dc.disk_size);
- }
+ p_usize = min_not_zero(my_usize, p_usize);
/* Never shrink a device with usable data during connect.
But allow online shrinking if we are connected. */
- if (drbd_new_dev_size(mdev, mdev->ldev, 0) <
- drbd_get_capacity(mdev->this_bdev) &&
- mdev->state.disk >= D_OUTDATED &&
- mdev->state.conn < C_CONNECTED) {
+ if (drbd_new_dev_size(mdev, mdev->ldev, p_usize, 0) <
+ drbd_get_capacity(mdev->this_bdev) &&
+ mdev->state.disk >= D_OUTDATED &&
+ mdev->state.conn < C_CONNECTED) {
dev_err(DEV, "The peer's disk size is too small!\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- mdev->ldev->dc.disk_size = my_usize;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
put_ldev(mdev);
- return false;
+ return -EIO;
}
+
+ if (my_usize != p_usize) {
+ struct disk_conf *old_disk_conf, *new_disk_conf = NULL;
+
+ new_disk_conf = kzalloc(sizeof(struct disk_conf), GFP_KERNEL);
+ if (!new_disk_conf) {
+ dev_err(DEV, "Allocation of new disk_conf failed\n");
+ put_ldev(mdev);
+ return -ENOMEM;
+ }
+
+ mutex_lock(&mdev->tconn->conf_update);
+ old_disk_conf = mdev->ldev->disk_conf;
+ *new_disk_conf = *old_disk_conf;
+ new_disk_conf->disk_size = p_usize;
+
+ rcu_assign_pointer(mdev->ldev->disk_conf, new_disk_conf);
+ mutex_unlock(&mdev->tconn->conf_update);
+ synchronize_rcu();
+ kfree(old_disk_conf);
+
+ dev_info(DEV, "Peer sets u_size to %lu sectors\n",
+ (unsigned long)my_usize);
+ }
+
put_ldev(mdev);
}
@@ -3025,7 +3609,7 @@
dd = drbd_determine_dev_size(mdev, ddsf);
put_ldev(mdev);
if (dd == dev_size_error)
- return false;
+ return -EIO;
drbd_md_sync(mdev);
} else {
/* I am diskless, need to accept the peer's size. */
@@ -3051,7 +3635,7 @@
* needs to know my new size... */
drbd_send_sizes(mdev, 0, ddsf);
}
- if (drbd_test_and_clear_flag(mdev, RESIZE_PENDING) ||
+ if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
(dd == grew && mdev->state.conn == C_CONNECTED)) {
if (mdev->state.pdsk >= D_INCONSISTENT &&
mdev->state.disk >= D_INCONSISTENT) {
@@ -3060,19 +3644,24 @@
else
resync_after_online_grow(mdev);
} else
- drbd_set_flag(mdev, RESYNC_AFTER_NEG);
+ set_bit(RESYNC_AFTER_NEG, &mdev->flags);
}
}
- return true;
+ return 0;
}
-static int receive_uuids(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_uuids(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_uuids *p = &mdev->data.rbuf.uuids;
+ struct drbd_conf *mdev;
+ struct p_uuids *p = pi->data;
u64 *p_uuid;
int i, updated_uuids = 0;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
p_uuid = kmalloc(sizeof(u64)*UI_EXTENDED_SIZE, GFP_NOIO);
for (i = UI_CURRENT; i < UI_EXTENDED_SIZE; i++)
@@ -3087,14 +3676,14 @@
(mdev->ed_uuid & ~((u64)1)) != (p_uuid[UI_CURRENT] & ~((u64)1))) {
dev_err(DEV, "Can only connect to data with current UUID=%016llX\n",
(unsigned long long)mdev->ed_uuid);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
if (get_ldev(mdev)) {
int skip_initial_sync =
mdev->state.conn == C_CONNECTED &&
- mdev->agreed_pro_version >= 90 &&
+ mdev->tconn->agreed_pro_version >= 90 &&
mdev->ldev->md.uuid[UI_CURRENT] == UUID_JUST_CREATED &&
(p_uuid[UI_FLAGS] & 8);
if (skip_initial_sync) {
@@ -3121,14 +3710,15 @@
ongoing cluster wide state change is finished. That is important if
we are primary and are detaching from our disk. We need to see the
new disk state... */
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, CLUSTER_ST_CHANGE));
+ mutex_lock(mdev->state_mutex);
+ mutex_unlock(mdev->state_mutex);
if (mdev->state.conn >= C_CONNECTED && mdev->state.disk < D_INCONSISTENT)
updated_uuids |= drbd_set_ed_uuid(mdev, p_uuid[UI_CURRENT]);
if (updated_uuids)
drbd_print_uuids(mdev, "receiver updated UUIDs to");
- return true;
+ return 0;
}
/**
@@ -3140,6 +3730,7 @@
union drbd_state ms;
static enum drbd_conns c_tab[] = {
+ [C_WF_REPORT_PARAMS] = C_WF_REPORT_PARAMS,
[C_CONNECTED] = C_CONNECTED,
[C_STARTING_SYNC_S] = C_STARTING_SYNC_T,
@@ -3161,40 +3752,74 @@
return ms;
}
-static int receive_req_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_req_state(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_req_state *p = &mdev->data.rbuf.req_state;
+ struct drbd_conf *mdev;
+ struct p_req_state *p = pi->data;
union drbd_state mask, val;
enum drbd_state_rv rv;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
mask.i = be32_to_cpu(p->mask);
val.i = be32_to_cpu(p->val);
- if (drbd_test_flag(mdev, DISCARD_CONCURRENT) &&
- drbd_test_flag(mdev, CLUSTER_ST_CHANGE)) {
+ if (test_bit(RESOLVE_CONFLICTS, &mdev->tconn->flags) &&
+ mutex_is_locked(mdev->state_mutex)) {
drbd_send_sr_reply(mdev, SS_CONCURRENT_ST_CHG);
- return true;
+ return 0;
}
mask = convert_state(mask);
val = convert_state(val);
rv = drbd_change_state(mdev, CS_VERBOSE, mask, val);
-
drbd_send_sr_reply(mdev, rv);
+
drbd_md_sync(mdev);
- return true;
+ return 0;
}
-static int receive_state(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_req_conn_state(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_state *p = &mdev->data.rbuf.state;
+ struct p_req_state *p = pi->data;
+ union drbd_state mask, val;
+ enum drbd_state_rv rv;
+
+ mask.i = be32_to_cpu(p->mask);
+ val.i = be32_to_cpu(p->val);
+
+ if (test_bit(RESOLVE_CONFLICTS, &tconn->flags) &&
+ mutex_is_locked(&tconn->cstate_mutex)) {
+ conn_send_sr_reply(tconn, SS_CONCURRENT_ST_CHG);
+ return 0;
+ }
+
+ mask = convert_state(mask);
+ val = convert_state(val);
+
+ rv = conn_request_state(tconn, mask, val, CS_VERBOSE | CS_LOCAL_ONLY | CS_IGN_OUTD_FAIL);
+ conn_send_sr_reply(tconn, rv);
+
+ return 0;
+}
+
+static int receive_state(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct p_state *p = pi->data;
union drbd_state os, ns, peer_state;
enum drbd_disk_state real_peer_disk;
enum chg_state_flags cs_flags;
int rv;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return config_unknown_volume(tconn, pi);
+
peer_state.i = be32_to_cpu(p->state);
real_peer_disk = peer_state.disk;
@@ -3203,16 +3828,16 @@
dev_info(DEV, "real peer disk state = %s\n", drbd_disk_str(real_peer_disk));
}
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
retry:
- os = ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ os = ns = drbd_read_state(mdev);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* If some other part of the code (asender thread, timeout)
* already decided to close the connection again,
* we must not "re-establish" it here. */
if (os.conn <= C_TEAR_DOWN)
- return false;
+ return -ECONNRESET;
/* If this is the "end of sync" confirmation, usually the peer disk
* transitions from D_INCONSISTENT to D_UP_TO_DATE. For empty (0 bits
@@ -3240,16 +3865,16 @@
peer_state.conn == C_CONNECTED) {
if (drbd_bm_total_weight(mdev) <= mdev->rs_failed)
drbd_resync_finished(mdev);
- return true;
+ return 0;
}
}
/* explicit verify finished notification, stop sector reached. */
if (os.conn == C_VERIFY_T && os.disk == D_UP_TO_DATE &&
peer_state.conn == C_CONNECTED && real_peer_disk == D_UP_TO_DATE) {
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
- return true;
+ return 0;
}
/* peer says his disk is inconsistent, while we think it is uptodate,
@@ -3280,7 +3905,7 @@
os.disk == D_NEGOTIATING));
/* if we have both been inconsistent, and the peer has been
* forced to be UpToDate with --overwrite-data */
- cr |= drbd_test_flag(mdev, CONSIDER_RESYNC);
+ cr |= test_bit(CONSIDER_RESYNC, &mdev->flags);
/* if we had been plain connected, and the admin requested to
* start a sync by "invalidate" or "invalidate-remote" */
cr |= (os.conn == C_CONNECTED &&
@@ -3300,44 +3925,44 @@
peer_state.disk = D_DISKLESS;
real_peer_disk = D_DISKLESS;
} else {
- if (drbd_test_and_clear_flag(mdev, CONN_DRY_RUN))
- return false;
+ if (test_and_clear_bit(CONN_DRY_RUN, &mdev->tconn->flags))
+ return -EIO;
D_ASSERT(os.conn == C_WF_REPORT_PARAMS);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
}
}
- spin_lock_irq(&mdev->req_lock);
- if (mdev->state.i != os.i)
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (os.i != drbd_read_state(mdev).i)
goto retry;
- drbd_clear_flag(mdev, CONSIDER_RESYNC);
+ clear_bit(CONSIDER_RESYNC, &mdev->flags);
ns.peer = peer_state.role;
ns.pdsk = real_peer_disk;
ns.peer_isp = (peer_state.aftr_isp | peer_state.user_isp);
if ((ns.conn == C_CONNECTED || ns.conn == C_WF_BITMAP_S) && ns.disk == D_NEGOTIATING)
ns.disk = mdev->new_state_tmp.disk;
cs_flags = CS_VERBOSE + (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED ? 0 : CS_HARD);
- if (ns.pdsk == D_CONSISTENT && is_susp(ns) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
- drbd_test_flag(mdev, NEW_CUR_UUID)) {
- /* Do not allow tl_restart(resend) for a rebooted peer. We can only allow this
+ if (ns.pdsk == D_CONSISTENT && drbd_suspended(mdev) && ns.conn == C_CONNECTED && os.conn < C_CONNECTED &&
+ test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ /* Do not allow tl_restart(RESEND) for a rebooted peer. We can only allow this
for temporal network outages! */
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
dev_err(DEV, "Aborting Connect, can not thaw IO with an only Consistent peer\n");
- tl_clear(mdev);
+ tl_clear(mdev->tconn);
drbd_uuid_new_current(mdev);
- drbd_clear_flag(mdev, NEW_CUR_UUID);
- drbd_force_state(mdev, NS2(conn, C_PROTOCOL_ERROR, susp, 0));
- return false;
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ conn_request_state(mdev->tconn, NS2(conn, C_PROTOCOL_ERROR, susp, 0), CS_HARD);
+ return -EIO;
}
rv = _drbd_set_state(mdev, ns, cs_flags, NULL);
- ns = mdev->state;
- spin_unlock_irq(&mdev->req_lock);
+ ns = drbd_read_state(mdev);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (rv < SS_SUCCESS) {
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return false;
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return -EIO;
}
if (os.conn > C_WF_REPORT_PARAMS) {
@@ -3351,16 +3976,21 @@
}
}
- mdev->net_conf->want_lose = 0;
+ clear_bit(DISCARD_MY_DATA, &mdev->flags);
drbd_md_sync(mdev); /* update connected indicator, la_size, ... */
- return true;
+ return 0;
}
-static int receive_sync_uuid(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_sync_uuid(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_rs_uuid *p = &mdev->data.rbuf.rs_uuid;
+ struct drbd_conf *mdev;
+ struct p_rs_uuid *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
wait_event(mdev->misc_wait,
mdev->state.conn == C_WF_SYNC_UUID ||
@@ -3383,7 +4013,7 @@
} else
dev_err(DEV, "Ignoring SyncUUID packet!\n");
- return true;
+ return 0;
}
/**
@@ -3393,27 +4023,27 @@
* code upon failure.
*/
static int
-receive_bitmap_plain(struct drbd_conf *mdev, unsigned int data_size,
- unsigned long *buffer, struct bm_xfer_ctx *c)
+receive_bitmap_plain(struct drbd_conf *mdev, unsigned int size,
+ unsigned long *p, struct bm_xfer_ctx *c)
{
- unsigned num_words = min_t(size_t, BM_PACKET_WORDS, c->bm_words - c->word_offset);
- unsigned want = num_words * sizeof(long);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE -
+ drbd_header_size(mdev->tconn);
+ unsigned int num_words = min_t(size_t, data_size / sizeof(*p),
+ c->bm_words - c->word_offset);
+ unsigned int want = num_words * sizeof(*p);
int err;
- if (want != data_size) {
- dev_err(DEV, "%s:want (%u) != data_size (%u)\n", __func__, want, data_size);
+ if (want != size) {
+ dev_err(DEV, "%s:want (%u) != size (%u)\n", __func__, want, size);
return -EIO;
}
if (want == 0)
return 0;
- err = drbd_recv(mdev, buffer, want);
- if (err != want) {
- if (err >= 0)
- err = -EIO;
+ err = drbd_recv_all(mdev->tconn, p, want);
+ if (err)
return err;
- }
- drbd_bm_merge_lel(mdev, c->word_offset, num_words, buffer);
+ drbd_bm_merge_lel(mdev, c->word_offset, num_words, p);
c->word_offset += num_words;
c->bit_offset = c->word_offset * BITS_PER_LONG;
@@ -3423,6 +4053,21 @@
return 1;
}
+static enum drbd_bitmap_code dcbp_get_code(struct p_compressed_bm *p)
+{
+ return (enum drbd_bitmap_code)(p->encoding & 0x0f);
+}
+
+static int dcbp_get_start(struct p_compressed_bm *p)
+{
+ return (p->encoding & 0x80) != 0;
+}
+
+static int dcbp_get_pad_bits(struct p_compressed_bm *p)
+{
+ return (p->encoding >> 4) & 0x7;
+}
+
/**
* recv_bm_rle_bits
*
@@ -3432,7 +4077,8 @@
static int
recv_bm_rle_bits(struct drbd_conf *mdev,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
struct bitstream bs;
u64 look_ahead;
@@ -3440,12 +4086,11 @@
u64 tmp;
unsigned long s = c->bit_offset;
unsigned long e;
- int len = be16_to_cpu(p->head.length) - (sizeof(*p) - sizeof(p->head));
- int toggle = DCBP_get_start(p);
+ int toggle = dcbp_get_start(p);
int have;
int bits;
- bitstream_init(&bs, p->code, len, DCBP_get_pad_bits(p));
+ bitstream_init(&bs, p->code, len, dcbp_get_pad_bits(p));
bits = bitstream_get_bits(&bs, &look_ahead, 64);
if (bits < 0)
@@ -3497,17 +4142,18 @@
static int
decode_bitmap_c(struct drbd_conf *mdev,
struct p_compressed_bm *p,
- struct bm_xfer_ctx *c)
+ struct bm_xfer_ctx *c,
+ unsigned int len)
{
- if (DCBP_get_code(p) == RLE_VLI_Bits)
- return recv_bm_rle_bits(mdev, p, c);
+ if (dcbp_get_code(p) == RLE_VLI_Bits)
+ return recv_bm_rle_bits(mdev, p, c, len - sizeof(*p));
/* other variants had been implemented for evaluation,
* but have been dropped as this one turned out to be "best"
* during all our tests. */
dev_err(DEV, "receive_bitmap_c: unknown encoding %u\n", p->encoding);
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
+ conn_request_state(mdev->tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
return -EIO;
}
@@ -3515,11 +4161,13 @@
const char *direction, struct bm_xfer_ctx *c)
{
/* what would it take to transfer it "plaintext" */
- unsigned plain = sizeof(struct p_header80) *
- ((c->bm_words+BM_PACKET_WORDS-1)/BM_PACKET_WORDS+1)
- + c->bm_words * sizeof(long);
- unsigned total = c->bytes[0] + c->bytes[1];
- unsigned r;
+ unsigned int header_size = drbd_header_size(mdev->tconn);
+ unsigned int data_size = DRBD_SOCKET_BUFFER_SIZE - header_size;
+ unsigned int plain =
+ header_size * (DIV_ROUND_UP(c->bm_words, data_size) + 1) +
+ c->bm_words * sizeof(unsigned long);
+ unsigned int total = c->bytes[0] + c->bytes[1];
+ unsigned int r;
/* total can not be zero. but just in case: */
if (total == 0)
@@ -3553,67 +4201,63 @@
in order to be agnostic to the 32 vs 64 bits issue.
returns 0 on failure, 1 if we successfully received it. */
-static int receive_bitmap(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_bitmap(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
struct bm_xfer_ctx c;
- void *buffer;
int err;
- int ok = false;
- struct p_header80 *h = &mdev->data.rbuf.header.h80;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
drbd_bm_lock(mdev, "receive bitmap", BM_LOCKED_SET_ALLOWED);
/* you are supposed to send additional out-of-sync information
* if you actually set bits during this phase */
- /* maybe we should use some per thread scratch page,
- * and allocate that during initial device creation? */
- buffer = (unsigned long *) __get_free_page(GFP_NOIO);
- if (!buffer) {
- dev_err(DEV, "failed to allocate one page buffer in %s\n", __func__);
- goto out;
- }
-
c = (struct bm_xfer_ctx) {
.bm_bits = drbd_bm_bits(mdev),
.bm_words = drbd_bm_words(mdev),
};
for(;;) {
- if (cmd == P_BITMAP) {
- err = receive_bitmap_plain(mdev, data_size, buffer, &c);
- } else if (cmd == P_COMPRESSED_BITMAP) {
+ if (pi->cmd == P_BITMAP)
+ err = receive_bitmap_plain(mdev, pi->size, pi->data, &c);
+ else if (pi->cmd == P_COMPRESSED_BITMAP) {
/* MAYBE: sanity check that we speak proto >= 90,
* and the feature is enabled! */
- struct p_compressed_bm *p;
+ struct p_compressed_bm *p = pi->data;
- if (data_size > BM_PACKET_PAYLOAD_BYTES) {
+ if (pi->size > DRBD_SOCKET_BUFFER_SIZE - drbd_header_size(tconn)) {
dev_err(DEV, "ReportCBitmap packet too large\n");
+ err = -EIO;
goto out;
}
- /* use the page buff */
- p = buffer;
- memcpy(p, h, sizeof(*h));
- if (drbd_recv(mdev, p->head.payload, data_size) != data_size)
- goto out;
- if (data_size <= (sizeof(*p) - sizeof(p->head))) {
- dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", data_size);
+ if (pi->size <= sizeof(*p)) {
+ dev_err(DEV, "ReportCBitmap packet too small (l:%u)\n", pi->size);
+ err = -EIO;
goto out;
}
- err = decode_bitmap_c(mdev, p, &c);
+ err = drbd_recv_all(mdev->tconn, p, pi->size);
+ if (err)
+ goto out;
+ err = decode_bitmap_c(mdev, p, &c, pi->size);
} else {
- dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", cmd);
+ dev_warn(DEV, "receive_bitmap: cmd neither ReportBitMap nor ReportCBitMap (is 0x%x)", pi->cmd);
+ err = -EIO;
goto out;
}
- c.packets[cmd == P_BITMAP]++;
- c.bytes[cmd == P_BITMAP] += sizeof(struct p_header80) + data_size;
+ c.packets[pi->cmd == P_BITMAP]++;
+ c.bytes[pi->cmd == P_BITMAP] += drbd_header_size(tconn) + pi->size;
if (err <= 0) {
if (err < 0)
goto out;
break;
}
- if (!drbd_recv_header(mdev, &cmd, &data_size))
+ err = drbd_recv_header(mdev->tconn, pi);
+ if (err)
goto out;
}
@@ -3622,8 +4266,8 @@
if (mdev->state.conn == C_WF_BITMAP_T) {
enum drbd_state_rv rv;
- ok = !drbd_send_bitmap(mdev);
- if (!ok)
+ err = drbd_send_bitmap(mdev);
+ if (err)
goto out;
/* Omit CS_ORDERED with this state transition to avoid deadlocks. */
rv = _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
@@ -3634,47 +4278,40 @@
dev_info(DEV, "unexpected cstate (%s) in receive_bitmap\n",
drbd_conn_str(mdev->state.conn));
}
+ err = 0;
- ok = true;
out:
drbd_bm_unlock(mdev);
- if (ok && mdev->state.conn == C_WF_BITMAP_S)
+ if (!err && mdev->state.conn == C_WF_BITMAP_S)
drbd_start_resync(mdev, C_SYNC_SOURCE);
- free_page((unsigned long) buffer);
- return ok;
+ return err;
}
-static int receive_skip(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_skip(struct drbd_tconn *tconn, struct packet_info *pi)
{
- /* TODO zero copy sink :) */
- static char sink[128];
- int size, want, r;
+ conn_warn(tconn, "skipping unknown optional packet type %d, l: %d!\n",
+ pi->cmd, pi->size);
- dev_warn(DEV, "skipping unknown optional packet type %d, l: %d!\n",
- cmd, data_size);
-
- size = data_size;
- while (size > 0) {
- want = min_t(int, size, sizeof(sink));
- r = drbd_recv(mdev, sink, want);
- ERR_IF(r <= 0) break;
- size -= r;
- }
- return size == 0;
+ return ignore_remaining_packet(tconn, pi);
}
-static int receive_UnplugRemote(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_UnplugRemote(struct drbd_tconn *tconn, struct packet_info *pi)
{
/* Make sure we've acked all the TCP data associated
* with the data requests being unplugged */
- drbd_tcp_quickack(mdev->data.socket);
+ drbd_tcp_quickack(tconn->data.socket);
- return true;
+ return 0;
}
-static int receive_out_of_sync(struct drbd_conf *mdev, enum drbd_packets cmd, unsigned int data_size)
+static int receive_out_of_sync(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_desc *p = &mdev->data.rbuf.block_desc;
+ struct drbd_conf *mdev;
+ struct p_block_desc *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
switch (mdev->state.conn) {
case C_WF_SYNC_UUID:
@@ -3688,15 +4325,13 @@
drbd_set_out_of_sync(mdev, be64_to_cpu(p->sector), be32_to_cpu(p->blksize));
- return true;
+ return 0;
}
-typedef int (*drbd_cmd_handler_f)(struct drbd_conf *, enum drbd_packets cmd, unsigned int to_receive);
-
struct data_cmd {
int expect_payload;
size_t pkt_size;
- drbd_cmd_handler_f function;
+ int (*fn)(struct drbd_tconn *, struct packet_info *);
};
static struct data_cmd drbd_cmd_handler[] = {
@@ -3704,13 +4339,13 @@
[P_DATA_REPLY] = { 1, sizeof(struct p_data), receive_DataReply },
[P_RS_DATA_REPLY] = { 1, sizeof(struct p_data), receive_RSDataReply } ,
[P_BARRIER] = { 0, sizeof(struct p_barrier), receive_Barrier } ,
- [P_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_COMPRESSED_BITMAP] = { 1, sizeof(struct p_header80), receive_bitmap } ,
- [P_UNPLUG_REMOTE] = { 0, sizeof(struct p_header80), receive_UnplugRemote },
+ [P_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_COMPRESSED_BITMAP] = { 1, 0, receive_bitmap } ,
+ [P_UNPLUG_REMOTE] = { 0, 0, receive_UnplugRemote },
[P_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
[P_RS_DATA_REQUEST] = { 0, sizeof(struct p_block_req), receive_DataRequest },
- [P_SYNC_PARAM] = { 1, sizeof(struct p_header80), receive_SyncParam },
- [P_SYNC_PARAM89] = { 1, sizeof(struct p_header80), receive_SyncParam },
+ [P_SYNC_PARAM] = { 1, 0, receive_SyncParam },
+ [P_SYNC_PARAM89] = { 1, 0, receive_SyncParam },
[P_PROTOCOL] = { 1, sizeof(struct p_protocol), receive_protocol },
[P_UUIDS] = { 0, sizeof(struct p_uuids), receive_uuids },
[P_SIZES] = { 0, sizeof(struct p_sizes), receive_sizes },
@@ -3722,124 +4357,75 @@
[P_CSUM_RS_REQUEST] = { 1, sizeof(struct p_block_req), receive_DataRequest },
[P_DELAY_PROBE] = { 0, sizeof(struct p_delay_probe93), receive_skip },
[P_OUT_OF_SYNC] = { 0, sizeof(struct p_block_desc), receive_out_of_sync },
- /* anything missing from this table is in
- * the asender_tbl, see get_asender_cmd */
- [P_MAX_CMD] = { 0, 0, NULL },
+ [P_CONN_ST_CHG_REQ] = { 0, sizeof(struct p_req_state), receive_req_conn_state },
+ [P_PROTOCOL_UPDATE] = { 1, sizeof(struct p_protocol), receive_protocol },
};
-/* All handler functions that expect a sub-header get that sub-heder in
- mdev->data.rbuf.header.head.payload.
-
- Usually in mdev->data.rbuf.header.head the callback can find the usual
- p_header, but they may not rely on that. Since there is also p_header95 !
- */
-
-static void drbdd(struct drbd_conf *mdev)
+static void drbdd(struct drbd_tconn *tconn)
{
- union p_header *header = &mdev->data.rbuf.header;
- unsigned int packet_size;
- enum drbd_packets cmd;
+ struct packet_info pi;
size_t shs; /* sub header size */
- int rv;
+ int err;
- while (get_t_state(&mdev->receiver) == Running) {
- drbd_thread_current_set_cpu(mdev);
- if (!drbd_recv_header(mdev, &cmd, &packet_size))
+ while (get_t_state(&tconn->receiver) == RUNNING) {
+ struct data_cmd *cmd;
+
+ drbd_thread_current_set_cpu(&tconn->receiver);
+ if (drbd_recv_header(tconn, &pi))
goto err_out;
- if (unlikely(cmd >= P_MAX_CMD || !drbd_cmd_handler[cmd].function)) {
- dev_err(DEV, "unknown packet type %d, l: %d!\n", cmd, packet_size);
+ cmd = &drbd_cmd_handler[pi.cmd];
+ if (unlikely(pi.cmd >= ARRAY_SIZE(drbd_cmd_handler) || !cmd->fn)) {
+ conn_err(tconn, "Unexpected data packet %s (0x%04x)",
+ cmdname(pi.cmd), pi.cmd);
goto err_out;
}
- shs = drbd_cmd_handler[cmd].pkt_size - sizeof(union p_header);
- if (packet_size - shs > 0 && !drbd_cmd_handler[cmd].expect_payload) {
- dev_err(DEV, "No payload expected %s l:%d\n", cmdname(cmd), packet_size);
+ shs = cmd->pkt_size;
+ if (pi.size > shs && !cmd->expect_payload) {
+ conn_err(tconn, "No payload expected %s l:%d\n",
+ cmdname(pi.cmd), pi.size);
goto err_out;
}
if (shs) {
- rv = drbd_recv(mdev, &header->h80.payload, shs);
- if (unlikely(rv != shs)) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read while reading sub header: rv=%d\n", rv);
+ err = drbd_recv_all_warn(tconn, pi.data, shs);
+ if (err)
goto err_out;
- }
+ pi.size -= shs;
}
- rv = drbd_cmd_handler[cmd].function(mdev, cmd, packet_size - shs);
-
- if (unlikely(!rv)) {
- dev_err(DEV, "error receiving %s, l: %d!\n",
- cmdname(cmd), packet_size);
+ err = cmd->fn(tconn, &pi);
+ if (err) {
+ conn_err(tconn, "error receiving %s, e: %d l: %d!\n",
+ cmdname(pi.cmd), err, pi.size);
goto err_out;
}
}
+ return;
- if (0) {
- err_out:
- drbd_force_state(mdev, NS(conn, C_PROTOCOL_ERROR));
- }
- /* If we leave here, we probably want to update at least the
- * "Connected" indicator on stable storage. Do so explicitly here. */
- drbd_md_sync(mdev);
+ err_out:
+ conn_request_state(tconn, NS(conn, C_PROTOCOL_ERROR), CS_HARD);
}
-void drbd_flush_workqueue(struct drbd_conf *mdev)
+void conn_flush_workqueue(struct drbd_tconn *tconn)
{
struct drbd_wq_barrier barr;
barr.w.cb = w_prev_work_done;
+ barr.w.tconn = tconn;
init_completion(&barr.done);
- drbd_queue_work(&mdev->data.work, &barr.w);
+ drbd_queue_work(&tconn->sender_work, &barr.w);
wait_for_completion(&barr.done);
}
-void drbd_free_tl_hash(struct drbd_conf *mdev)
+static void conn_disconnect(struct drbd_tconn *tconn)
{
- struct hlist_head *h;
+ struct drbd_conf *mdev;
+ enum drbd_conns oc;
+ int vnr;
- spin_lock_irq(&mdev->req_lock);
-
- if (!mdev->tl_hash || mdev->state.conn != C_STANDALONE) {
- spin_unlock_irq(&mdev->req_lock);
- return;
- }
- /* paranoia code */
- for (h = mdev->ee_hash; h < mdev->ee_hash + mdev->ee_hash_s; h++)
- if (h->first)
- dev_err(DEV, "ASSERT FAILED ee_hash[%u].first == %p, expected NULL\n",
- (int)(h - mdev->ee_hash), h->first);
- kfree(mdev->ee_hash);
- mdev->ee_hash = NULL;
- mdev->ee_hash_s = 0;
-
- /* We may not have had the chance to wait for all locally pending
- * application requests. The hlist_add_fake() prevents access after
- * free on master bio completion. */
- for (h = mdev->tl_hash; h < mdev->tl_hash + mdev->tl_hash_s; h++) {
- struct drbd_request *req;
- struct hlist_node *pos, *n;
- hlist_for_each_entry_safe(req, pos, n, h, collision) {
- hlist_del_init(&req->collision);
- hlist_add_fake(&req->collision);
- }
- }
-
- kfree(mdev->tl_hash);
- mdev->tl_hash = NULL;
- mdev->tl_hash_s = 0;
- spin_unlock_irq(&mdev->req_lock);
-}
-
-static void drbd_disconnect(struct drbd_conf *mdev)
-{
- enum drbd_fencing_p fp;
- union drbd_state os, ns;
- int rv = SS_UNKNOWN_ERROR;
- unsigned int i;
-
- if (mdev->state.conn == C_STANDALONE)
+ if (tconn->cstate == C_STANDALONE)
return;
/* We are about to start the cleanup after connection loss.
@@ -3847,18 +4433,54 @@
* Usually we should be in some network failure state already,
* but just in case we are not, we fix it up here.
*/
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
+ conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
/* asender does not clean up anything. it must not interfere, either */
- drbd_thread_stop(&mdev->asender);
- drbd_free_sock(mdev);
+ drbd_thread_stop(&tconn->asender);
+ drbd_free_sock(tconn);
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_disconnected(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+
+ if (!list_empty(&tconn->current_epoch->list))
+ conn_err(tconn, "ASSERTION FAILED: tconn->current_epoch->list not empty\n");
+ /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
+ atomic_set(&tconn->current_epoch->epoch_size, 0);
+ tconn->send.seen_any_write_yet = false;
+
+ conn_info(tconn, "Connection closed\n");
+
+ if (conn_highest_role(tconn) == R_PRIMARY && conn_highest_pdsk(tconn) >= D_UNKNOWN)
+ conn_try_outdate_peer_async(tconn);
+
+ spin_lock_irq(&tconn->req_lock);
+ oc = tconn->cstate;
+ if (oc >= C_UNCONNECTED)
+ _conn_request_state(tconn, NS(conn, C_UNCONNECTED), CS_VERBOSE);
+
+ spin_unlock_irq(&tconn->req_lock);
+
+ if (oc == C_DISCONNECTING)
+ conn_request_state(tconn, NS(conn, C_STANDALONE), CS_VERBOSE | CS_HARD);
+}
+
+static int drbd_disconnected(struct drbd_conf *mdev)
+{
+ unsigned int i;
/* wait for current activity to cease. */
- spin_lock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
_drbd_wait_ee_list_empty(mdev, &mdev->active_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->sync_ee);
_drbd_wait_ee_list_empty(mdev, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
/* We do not have data structures that would allow us to
* get the rs_pending_cnt down to 0 again.
@@ -3876,7 +4498,6 @@
atomic_set(&mdev->rs_pending_cnt, 0);
wake_up(&mdev->misc_wait);
- /* make sure syncer is stopped and w_resume_next_sg queued */
del_timer_sync(&mdev->resync_timer);
resync_timer_fn((unsigned long)mdev);
@@ -3885,53 +4506,28 @@
* to be "canceled" */
drbd_flush_workqueue(mdev);
- /* This also does reclaim_net_ee(). If we do this too early, we might
- * miss some resync ee and pages.*/
- drbd_process_done_ee(mdev);
+ drbd_finish_peer_reqs(mdev);
+
+ /* This second workqueue flush is necessary, since drbd_finish_peer_reqs()
+ might have issued a work again. The one before drbd_finish_peer_reqs() is
+ necessary to reclain net_ee in drbd_finish_peer_reqs(). */
+ drbd_flush_workqueue(mdev);
+
+ /* need to do it again, drbd_finish_peer_reqs() may have populated it
+ * again via drbd_try_clear_on_disk_bm(). */
+ drbd_rs_cancel_all(mdev);
kfree(mdev->p_uuid);
mdev->p_uuid = NULL;
- if (!is_susp(mdev->state))
- tl_clear(mdev);
-
- dev_info(DEV, "Connection closed\n");
+ if (!drbd_suspended(mdev))
+ tl_clear(mdev->tconn);
drbd_md_sync(mdev);
- fp = FP_DONT_CARE;
- if (get_ldev(mdev)) {
- fp = mdev->ldev->dc.fencing;
- put_ldev(mdev);
- }
-
- if (mdev->state.role == R_PRIMARY && fp >= FP_RESOURCE && mdev->state.pdsk >= D_UNKNOWN)
- drbd_try_outdate_peer_async(mdev);
-
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
- if (os.conn >= C_UNCONNECTED) {
- /* Do not restart in case we are C_DISCONNECTING */
- ns = os;
- ns.conn = C_UNCONNECTED;
- rv = _drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- }
- spin_unlock_irq(&mdev->req_lock);
-
- if (os.conn == C_DISCONNECTING) {
- wait_event(mdev->net_cnt_wait, atomic_read(&mdev->net_cnt) == 0);
-
- crypto_free_hash(mdev->cram_hmac_tfm);
- mdev->cram_hmac_tfm = NULL;
-
- kfree(mdev->net_conf);
- mdev->net_conf = NULL;
- drbd_request_state(mdev, NS(conn, C_STANDALONE));
- }
-
/* serialize with bitmap writeout triggered by the state change,
* if any. */
- wait_event(mdev->misc_wait, !drbd_test_flag(mdev, BITMAP_IO));
+ wait_event(mdev->misc_wait, !test_bit(BITMAP_IO, &mdev->flags));
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
@@ -3940,7 +4536,7 @@
* Actually we don't care for exactly when the network stack does its
* put_page(), but release our reference on these pages right here.
*/
- i = drbd_release_ee(mdev, &mdev->net_ee);
+ i = drbd_free_peer_reqs(mdev, &mdev->net_ee);
if (i)
dev_info(DEV, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&mdev->pp_in_use_by_net);
@@ -3955,9 +4551,7 @@
D_ASSERT(list_empty(&mdev->sync_ee));
D_ASSERT(list_empty(&mdev->done_ee));
- /* ok, no more ee's on the fly, it is safe to reset the epoch_size */
- atomic_set(&mdev->current_epoch->epoch_size, 0);
- D_ASSERT(list_empty(&mdev->current_epoch->list));
+ return 0;
}
/*
@@ -3969,29 +4563,19 @@
*
* for now, they are expected to be zero, but ignored.
*/
-static int drbd_send_handshake(struct drbd_conf *mdev)
+static int drbd_send_features(struct drbd_tconn *tconn)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.sbuf.handshake;
- int ok;
+ struct drbd_socket *sock;
+ struct p_connection_features *p;
- if (mutex_lock_interruptible(&mdev->data.mutex)) {
- dev_err(DEV, "interrupted during initial handshake\n");
- return 0; /* interrupted. not ok. */
- }
-
- if (mdev->data.socket == NULL) {
- mutex_unlock(&mdev->data.mutex);
- return 0;
- }
-
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
memset(p, 0, sizeof(*p));
p->protocol_min = cpu_to_be32(PRO_VERSION_MIN);
p->protocol_max = cpu_to_be32(PRO_VERSION_MAX);
- ok = _drbd_send_cmd( mdev, mdev->data.socket, P_HAND_SHAKE,
- (struct p_header80 *)p, sizeof(*p), 0 );
- mutex_unlock(&mdev->data.mutex);
- return ok;
+ return conn_send_command(tconn, sock, P_CONNECTION_FEATURES, sizeof(*p), NULL, 0);
}
/*
@@ -4001,42 +4585,38 @@
* -1 peer talks different language,
* no point in trying again, please go standalone.
*/
-static int drbd_do_handshake(struct drbd_conf *mdev)
+static int drbd_do_features(struct drbd_tconn *tconn)
{
- /* ASSERT current == mdev->receiver ... */
- struct p_handshake *p = &mdev->data.rbuf.handshake;
- const int expect = sizeof(struct p_handshake) - sizeof(struct p_header80);
- unsigned int length;
- enum drbd_packets cmd;
- int rv;
+ /* ASSERT current == tconn->receiver ... */
+ struct p_connection_features *p;
+ const int expect = sizeof(struct p_connection_features);
+ struct packet_info pi;
+ int err;
- rv = drbd_send_handshake(mdev);
- if (!rv)
+ err = drbd_send_features(tconn);
+ if (err)
return 0;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
+ err = drbd_recv_header(tconn, &pi);
+ if (err)
return 0;
- if (cmd != P_HAND_SHAKE) {
- dev_err(DEV, "expected HandShake packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ if (pi.cmd != P_CONNECTION_FEATURES) {
+ conn_err(tconn, "expected ConnectionFeatures packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
return -1;
}
- if (length != expect) {
- dev_err(DEV, "expected HandShake length: %u, received: %u\n",
- expect, length);
+ if (pi.size != expect) {
+ conn_err(tconn, "expected ConnectionFeatures length: %u, received: %u\n",
+ expect, pi.size);
return -1;
}
- rv = drbd_recv(mdev, &p->head.payload, expect);
-
- if (rv != expect) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving handshake packet: l=%u\n", rv);
+ p = pi.data;
+ err = drbd_recv_all_warn(tconn, p, expect);
+ if (err)
return 0;
- }
p->protocol_min = be32_to_cpu(p->protocol_min);
p->protocol_max = be32_to_cpu(p->protocol_max);
@@ -4047,15 +4627,15 @@
PRO_VERSION_MIN > p->protocol_max)
goto incompat;
- mdev->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
+ tconn->agreed_pro_version = min_t(int, PRO_VERSION_MAX, p->protocol_max);
- dev_info(DEV, "Handshake successful: "
- "Agreed network protocol version %d\n", mdev->agreed_pro_version);
+ conn_info(tconn, "Handshake successful: "
+ "Agreed network protocol version %d\n", tconn->agreed_pro_version);
return 1;
incompat:
- dev_err(DEV, "incompatible DRBD dialects: "
+ conn_err(tconn, "incompatible DRBD dialects: "
"I support %d-%d, peer supports %d-%d\n",
PRO_VERSION_MIN, PRO_VERSION_MAX,
p->protocol_min, p->protocol_max);
@@ -4063,7 +4643,7 @@
}
#if !defined(CONFIG_CRYPTO_HMAC) && !defined(CONFIG_CRYPTO_HMAC_MODULE)
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_tconn *tconn)
{
dev_err(DEV, "This kernel was build without CONFIG_CRYPTO_HMAC.\n");
dev_err(DEV, "You need to disable 'cram-hmac-alg' in drbd.conf.\n");
@@ -4078,121 +4658,139 @@
-1 - auth failed, don't try again.
*/
-static int drbd_do_auth(struct drbd_conf *mdev)
+static int drbd_do_auth(struct drbd_tconn *tconn)
{
+ struct drbd_socket *sock;
char my_challenge[CHALLENGE_LEN]; /* 64 Bytes... */
struct scatterlist sg;
char *response = NULL;
char *right_response = NULL;
char *peers_ch = NULL;
- unsigned int key_len = strlen(mdev->net_conf->shared_secret);
+ unsigned int key_len;
+ char secret[SHARED_SECRET_MAX]; /* 64 byte */
unsigned int resp_size;
struct hash_desc desc;
- enum drbd_packets cmd;
- unsigned int length;
- int rv;
+ struct packet_info pi;
+ struct net_conf *nc;
+ int err, rv;
- desc.tfm = mdev->cram_hmac_tfm;
+ /* FIXME: Put the challenge/response into the preallocated socket buffer. */
+
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ key_len = strlen(nc->shared_secret);
+ memcpy(secret, nc->shared_secret, key_len);
+ rcu_read_unlock();
+
+ desc.tfm = tconn->cram_hmac_tfm;
desc.flags = 0;
- rv = crypto_hash_setkey(mdev->cram_hmac_tfm,
- (u8 *)mdev->net_conf->shared_secret, key_len);
+ rv = crypto_hash_setkey(tconn->cram_hmac_tfm, (u8 *)secret, key_len);
if (rv) {
- dev_err(DEV, "crypto_hash_setkey() failed with %d\n", rv);
+ conn_err(tconn, "crypto_hash_setkey() failed with %d\n", rv);
rv = -1;
goto fail;
}
get_random_bytes(my_challenge, CHALLENGE_LEN);
- rv = drbd_send_cmd2(mdev, P_AUTH_CHALLENGE, my_challenge, CHALLENGE_LEN);
+ sock = &tconn->data;
+ if (!conn_prepare_command(tconn, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(tconn, sock, P_AUTH_CHALLENGE, 0,
+ my_challenge, CHALLENGE_LEN);
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
- goto fail;
-
- if (cmd != P_AUTH_CHALLENGE) {
- dev_err(DEV, "expected AuthChallenge packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ err = drbd_recv_header(tconn, &pi);
+ if (err) {
rv = 0;
goto fail;
}
- if (length > CHALLENGE_LEN * 2) {
- dev_err(DEV, "expected AuthChallenge payload too big.\n");
+ if (pi.cmd != P_AUTH_CHALLENGE) {
+ conn_err(tconn, "expected AuthChallenge packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
+ rv = 0;
+ goto fail;
+ }
+
+ if (pi.size > CHALLENGE_LEN * 2) {
+ conn_err(tconn, "expected AuthChallenge payload too big.\n");
rv = -1;
goto fail;
}
- peers_ch = kmalloc(length, GFP_NOIO);
+ peers_ch = kmalloc(pi.size, GFP_NOIO);
if (peers_ch == NULL) {
- dev_err(DEV, "kmalloc of peers_ch failed\n");
+ conn_err(tconn, "kmalloc of peers_ch failed\n");
rv = -1;
goto fail;
}
- rv = drbd_recv(mdev, peers_ch, length);
-
- if (rv != length) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read AuthChallenge: l=%u\n", rv);
+ err = drbd_recv_all_warn(tconn, peers_ch, pi.size);
+ if (err) {
rv = 0;
goto fail;
}
- resp_size = crypto_hash_digestsize(mdev->cram_hmac_tfm);
+ resp_size = crypto_hash_digestsize(tconn->cram_hmac_tfm);
response = kmalloc(resp_size, GFP_NOIO);
if (response == NULL) {
- dev_err(DEV, "kmalloc of response failed\n");
+ conn_err(tconn, "kmalloc of response failed\n");
rv = -1;
goto fail;
}
sg_init_table(&sg, 1);
- sg_set_buf(&sg, peers_ch, length);
+ sg_set_buf(&sg, peers_ch, pi.size);
rv = crypto_hash_digest(&desc, &sg, sg.length, response);
if (rv) {
- dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
rv = -1;
goto fail;
}
- rv = drbd_send_cmd2(mdev, P_AUTH_RESPONSE, response, resp_size);
+ if (!conn_prepare_command(tconn, sock)) {
+ rv = 0;
+ goto fail;
+ }
+ rv = !conn_send_command(tconn, sock, P_AUTH_RESPONSE, 0,
+ response, resp_size);
if (!rv)
goto fail;
- rv = drbd_recv_header(mdev, &cmd, &length);
- if (!rv)
- goto fail;
-
- if (cmd != P_AUTH_RESPONSE) {
- dev_err(DEV, "expected AuthResponse packet, received: %s (0x%04x)\n",
- cmdname(cmd), cmd);
+ err = drbd_recv_header(tconn, &pi);
+ if (err) {
rv = 0;
goto fail;
}
- if (length != resp_size) {
- dev_err(DEV, "expected AuthResponse payload of wrong size\n");
+ if (pi.cmd != P_AUTH_RESPONSE) {
+ conn_err(tconn, "expected AuthResponse packet, received: %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
rv = 0;
goto fail;
}
- rv = drbd_recv(mdev, response , resp_size);
+ if (pi.size != resp_size) {
+ conn_err(tconn, "expected AuthResponse payload of wrong size\n");
+ rv = 0;
+ goto fail;
+ }
- if (rv != resp_size) {
- if (!signal_pending(current))
- dev_warn(DEV, "short read receiving AuthResponse: l=%u\n", rv);
+ err = drbd_recv_all_warn(tconn, response , resp_size);
+ if (err) {
rv = 0;
goto fail;
}
right_response = kmalloc(resp_size, GFP_NOIO);
if (right_response == NULL) {
- dev_err(DEV, "kmalloc of right_response failed\n");
+ conn_err(tconn, "kmalloc of right_response failed\n");
rv = -1;
goto fail;
}
@@ -4201,7 +4799,7 @@
rv = crypto_hash_digest(&desc, &sg, sg.length, right_response);
if (rv) {
- dev_err(DEV, "crypto_hash_digest() failed with %d\n", rv);
+ conn_err(tconn, "crypto_hash_digest() failed with %d\n", rv);
rv = -1;
goto fail;
}
@@ -4209,8 +4807,8 @@
rv = !memcmp(response, right_response, resp_size);
if (rv)
- dev_info(DEV, "Peer authenticated using %d bytes of '%s' HMAC\n",
- resp_size, mdev->net_conf->cram_hmac_alg);
+ conn_info(tconn, "Peer authenticated using %d bytes HMAC\n",
+ resp_size);
else
rv = -1;
@@ -4225,82 +4823,106 @@
int drbdd_init(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
- unsigned int minor = mdev_to_minor(mdev);
+ struct drbd_tconn *tconn = thi->tconn;
int h;
- sprintf(current->comm, "drbd%d_receiver", minor);
-
- dev_info(DEV, "receiver (re)started\n");
+ conn_info(tconn, "receiver (re)started\n");
do {
- h = drbd_connect(mdev);
+ h = conn_connect(tconn);
if (h == 0) {
- drbd_disconnect(mdev);
+ conn_disconnect(tconn);
schedule_timeout_interruptible(HZ);
}
if (h == -1) {
- dev_warn(DEV, "Discarding network configuration.\n");
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_warn(tconn, "Discarding network configuration.\n");
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
} while (h == 0);
- if (h > 0) {
- if (get_net_conf(mdev)) {
- drbdd(mdev);
- put_net_conf(mdev);
- }
- }
+ if (h > 0)
+ drbdd(tconn);
- drbd_disconnect(mdev);
+ conn_disconnect(tconn);
- dev_info(DEV, "receiver terminated\n");
+ conn_info(tconn, "receiver terminated\n");
return 0;
}
/* ********* acknowledge sender ******** */
-static int got_RqSReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_conn_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_req_state_reply *p = (struct p_req_state_reply *)h;
-
+ struct p_req_state_reply *p = pi->data;
int retcode = be32_to_cpu(p->retcode);
if (retcode >= SS_SUCCESS) {
- drbd_set_flag(mdev, CL_ST_CHG_SUCCESS);
+ set_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags);
} else {
- drbd_set_flag(mdev, CL_ST_CHG_FAIL);
+ set_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags);
+ conn_err(tconn, "Requested state change failed by peer: %s (%d)\n",
+ drbd_set_st_err_str(retcode), retcode);
+ }
+ wake_up(&tconn->ping_wait);
+
+ return 0;
+}
+
+static int got_RqSReply(struct drbd_tconn *tconn, struct packet_info *pi)
+{
+ struct drbd_conf *mdev;
+ struct p_req_state_reply *p = pi->data;
+ int retcode = be32_to_cpu(p->retcode);
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
+ if (test_bit(CONN_WD_ST_CHG_REQ, &tconn->flags)) {
+ D_ASSERT(tconn->agreed_pro_version < 100);
+ return got_conn_RqSReply(tconn, pi);
+ }
+
+ if (retcode >= SS_SUCCESS) {
+ set_bit(CL_ST_CHG_SUCCESS, &mdev->flags);
+ } else {
+ set_bit(CL_ST_CHG_FAIL, &mdev->flags);
dev_err(DEV, "Requested state change failed by peer: %s (%d)\n",
- drbd_set_st_err_str(retcode), retcode);
+ drbd_set_st_err_str(retcode), retcode);
}
wake_up(&mdev->state_wait);
- return true;
+ return 0;
}
-static int got_Ping(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_Ping(struct drbd_tconn *tconn, struct packet_info *pi)
{
- return drbd_send_ping_ack(mdev);
+ return drbd_send_ping_ack(tconn);
}
-static int got_PingAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_PingAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
/* restore idle timeout */
- mdev->meta.socket->sk->sk_rcvtimeo = mdev->net_conf->ping_int*HZ;
- if (!drbd_test_and_set_flag(mdev, GOT_PING_ACK))
- wake_up(&mdev->misc_wait);
+ tconn->meta.socket->sk->sk_rcvtimeo = tconn->net_conf->ping_int*HZ;
+ if (!test_and_set_bit(GOT_PING_ACK, &tconn->flags))
+ wake_up(&tconn->ping_wait);
- return true;
+ return 0;
}
-static int got_IsInSync(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_IsInSync(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_conf *mdev;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
int blksize = be32_to_cpu(p->blksize);
- D_ASSERT(mdev->agreed_pro_version >= 89);
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
+ D_ASSERT(mdev->tconn->agreed_pro_version >= 89);
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
@@ -4314,162 +4936,139 @@
dec_rs_pending(mdev);
atomic_add(blksize >> 9, &mdev->rs_sect_in);
- return true;
+ return 0;
}
-/* when we receive the ACK for a write request,
- * verify that we actually know about it */
-static struct drbd_request *_ack_id_to_req(struct drbd_conf *mdev,
- u64 id, sector_t sector)
-{
- struct hlist_head *slot = tl_hash_slot(mdev, sector);
- struct hlist_node *n;
- struct drbd_request *req;
-
- hlist_for_each_entry(req, n, slot, collision) {
- if ((unsigned long)req == (unsigned long)id) {
- if (req->sector != sector) {
- dev_err(DEV, "_ack_id_to_req: found req %p but it has "
- "wrong sector (%llus versus %llus)\n", req,
- (unsigned long long)req->sector,
- (unsigned long long)sector);
- break;
- }
- return req;
- }
- }
- return NULL;
-}
-
-typedef struct drbd_request *(req_validator_fn)
- (struct drbd_conf *mdev, u64 id, sector_t sector);
-
-static int validate_req_change_req_state(struct drbd_conf *mdev,
- u64 id, sector_t sector, req_validator_fn validator,
- const char *func, enum drbd_req_event what)
+static int
+validate_req_change_req_state(struct drbd_conf *mdev, u64 id, sector_t sector,
+ struct rb_root *root, const char *func,
+ enum drbd_req_event what, bool missing_ok)
{
struct drbd_request *req;
struct bio_and_error m;
- spin_lock_irq(&mdev->req_lock);
- req = validator(mdev, id, sector);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ req = find_request(mdev, root, id, sector, missing_ok, func);
if (unlikely(!req)) {
- spin_unlock_irq(&mdev->req_lock);
-
- dev_err(DEV, "%s: failed to find req %p, sector %llus\n", func,
- (void *)(unsigned long)id, (unsigned long long)sector);
- return false;
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ return -EIO;
}
__req_mod(req, what, &m);
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
if (m.bio)
complete_master_bio(mdev, &m);
- return true;
+ return 0;
}
-static int got_BlockAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_BlockAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_conf *mdev;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
int blksize = be32_to_cpu(p->blksize);
enum drbd_req_event what;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- if (is_syncer_block_id(p->block_id)) {
+ if (p->block_id == ID_SYNCER) {
drbd_set_in_sync(mdev, sector, blksize);
dec_rs_pending(mdev);
- return true;
+ return 0;
}
- switch (be16_to_cpu(h->command)) {
+ switch (pi->cmd) {
case P_RS_WRITE_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = write_acked_by_peer_and_sis;
+ what = WRITE_ACKED_BY_PEER_AND_SIS;
break;
case P_WRITE_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = write_acked_by_peer;
+ what = WRITE_ACKED_BY_PEER;
break;
case P_RECV_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_B);
- what = recv_acked_by_peer;
+ what = RECV_ACKED_BY_PEER;
break;
- case P_DISCARD_ACK:
- D_ASSERT(mdev->net_conf->wire_protocol == DRBD_PROT_C);
- what = conflict_discarded_by_peer;
+ case P_SUPERSEDED:
+ what = CONFLICT_RESOLVED;
+ break;
+ case P_RETRY_WRITE:
+ what = POSTPONE_WRITE;
break;
default:
- D_ASSERT(0);
- return false;
+ BUG();
}
return validate_req_change_req_state(mdev, p->block_id, sector,
- _ack_id_to_req, __func__ , what);
+ &mdev->write_requests, __func__,
+ what, false);
}
-static int got_NegAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_conf *mdev;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
int size = be32_to_cpu(p->blksize);
- struct drbd_request *req;
- struct bio_and_error m;
+ int err;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- if (is_syncer_block_id(p->block_id)) {
+ if (p->block_id == ID_SYNCER) {
dec_rs_pending(mdev);
drbd_rs_failed_io(mdev, sector, size);
- return true;
+ return 0;
}
- spin_lock_irq(&mdev->req_lock);
- req = _ack_id_to_req(mdev, p->block_id, sector);
- if (!req) {
- spin_unlock_irq(&mdev->req_lock);
- if (mdev->net_conf->wire_protocol == DRBD_PROT_A ||
- mdev->net_conf->wire_protocol == DRBD_PROT_B) {
- /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
- The master bio might already be completed, therefore the
- request is no longer in the collision hash.
- => Do not try to validate block_id as request. */
- /* In Protocol B we might already have got a P_RECV_ACK
- but then get a P_NEG_ACK after wards. */
- drbd_set_out_of_sync(mdev, sector, size);
- return true;
- } else {
- dev_err(DEV, "%s: failed to find req %p, sector %llus\n", __func__,
- (void *)(unsigned long)p->block_id, (unsigned long long)sector);
- return false;
- }
+ err = validate_req_change_req_state(mdev, p->block_id, sector,
+ &mdev->write_requests, __func__,
+ NEG_ACKED, true);
+ if (err) {
+ /* Protocol A has no P_WRITE_ACKs, but has P_NEG_ACKs.
+ The master bio might already be completed, therefore the
+ request is no longer in the collision hash. */
+ /* In Protocol B we might already have got a P_RECV_ACK
+ but then get a P_NEG_ACK afterwards. */
+ drbd_set_out_of_sync(mdev, sector, size);
}
- __req_mod(req, neg_acked, &m);
- spin_unlock_irq(&mdev->req_lock);
-
- if (m.bio)
- complete_master_bio(mdev, &m);
- return true;
+ return 0;
}
-static int got_NegDReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegDReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_conf *mdev;
+ struct p_block_ack *p = pi->data;
sector_t sector = be64_to_cpu(p->sector);
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
- dev_err(DEV, "Got NegDReply; Sector %llus, len %u; Fail original request.\n",
+
+ dev_err(DEV, "Got NegDReply; Sector %llus, len %u.\n",
(unsigned long long)sector, be32_to_cpu(p->blksize));
return validate_req_change_req_state(mdev, p->block_id, sector,
- _ar_id_to_req, __func__ , neg_acked);
+ &mdev->read_requests, __func__,
+ NEG_ACKED, false);
}
-static int got_NegRSDReply(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_NegRSDReply(struct drbd_tconn *tconn, struct packet_info *pi)
{
+ struct drbd_conf *mdev;
sector_t sector;
int size;
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct p_block_ack *p = pi->data;
+
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
@@ -4480,57 +5079,66 @@
if (get_ldev_if_state(mdev, D_FAILED)) {
drbd_rs_complete_io(mdev, sector);
- switch (be16_to_cpu(h->command)) {
+ switch (pi->cmd) {
case P_NEG_RS_DREPLY:
drbd_rs_failed_io(mdev, sector, size);
case P_RS_CANCEL:
break;
default:
- D_ASSERT(0);
- put_ldev(mdev);
- return false;
+ BUG();
}
put_ldev(mdev);
}
- return true;
+ return 0;
}
-static int got_BarrierAck(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_BarrierAck(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_barrier_ack *p = (struct p_barrier_ack *)h;
+ struct p_barrier_ack *p = pi->data;
+ struct drbd_conf *mdev;
+ int vnr;
- tl_release(mdev, p->barrier, be32_to_cpu(p->set_size));
+ tl_release(tconn, p->barrier, be32_to_cpu(p->set_size));
- if (mdev->state.conn == C_AHEAD &&
- atomic_read(&mdev->ap_in_flight) == 0 &&
- !drbd_test_and_set_flag(mdev, AHEAD_TO_SYNC_SOURCE)) {
- mdev->start_resync_timer.expires = jiffies + HZ;
- add_timer(&mdev->start_resync_timer);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.conn == C_AHEAD &&
+ atomic_read(&mdev->ap_in_flight) == 0 &&
+ !test_and_set_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags)) {
+ mdev->start_resync_timer.expires = jiffies + HZ;
+ add_timer(&mdev->start_resync_timer);
+ }
}
+ rcu_read_unlock();
- return true;
+ return 0;
}
-static int got_OVResult(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_OVResult(struct drbd_tconn *tconn, struct packet_info *pi)
{
- struct p_block_ack *p = (struct p_block_ack *)h;
+ struct drbd_conf *mdev;
+ struct p_block_ack *p = pi->data;
struct drbd_work *w;
sector_t sector;
int size;
+ mdev = vnr_to_mdev(tconn, pi->vnr);
+ if (!mdev)
+ return -EIO;
+
sector = be64_to_cpu(p->sector);
size = be32_to_cpu(p->blksize);
update_peer_seq(mdev, be32_to_cpu(p->seq_num));
if (be64_to_cpu(p->block_id) == ID_OUT_OF_SYNC)
- drbd_ov_oos_found(mdev, sector, size);
+ drbd_ov_out_of_sync_found(mdev, sector, size);
else
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
if (!get_ldev(mdev))
- return true;
+ return 0;
drbd_rs_complete_io(mdev, sector);
dec_rs_pending(mdev);
@@ -4545,114 +5153,137 @@
w = kmalloc(sizeof(*w), GFP_NOIO);
if (w) {
w->cb = w_ov_finished;
- drbd_queue_work_front(&mdev->data.work, w);
+ w->mdev = mdev;
+ drbd_queue_work(&mdev->tconn->sender_work, w);
} else {
dev_err(DEV, "kmalloc(w) failed.");
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
}
}
put_ldev(mdev);
- return true;
+ return 0;
}
-static int got_skip(struct drbd_conf *mdev, struct p_header80 *h)
+static int got_skip(struct drbd_tconn *tconn, struct packet_info *pi)
{
- return true;
+ return 0;
+}
+
+static int tconn_finish_peer_reqs(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr, not_empty = 0;
+
+ do {
+ clear_bit(SIGNAL_ASENDER, &tconn->flags);
+ flush_signals(current);
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ if (drbd_finish_peer_reqs(mdev)) {
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ return 1;
+ }
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ set_bit(SIGNAL_ASENDER, &tconn->flags);
+
+ spin_lock_irq(&tconn->req_lock);
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ not_empty = !list_empty(&mdev->done_ee);
+ if (not_empty)
+ break;
+ }
+ spin_unlock_irq(&tconn->req_lock);
+ rcu_read_unlock();
+ } while (not_empty);
+
+ return 0;
}
struct asender_cmd {
size_t pkt_size;
- int (*process)(struct drbd_conf *mdev, struct p_header80 *h);
+ int (*fn)(struct drbd_tconn *tconn, struct packet_info *);
};
-static struct asender_cmd *get_asender_cmd(int cmd)
-{
- static struct asender_cmd asender_tbl[] = {
- /* anything missing from this table is in
- * the drbd_cmd_handler (drbd_default_handler) table,
- * see the beginning of drbdd() */
- [P_PING] = { sizeof(struct p_header80), got_Ping },
- [P_PING_ACK] = { sizeof(struct p_header80), got_PingAck },
+static struct asender_cmd asender_tbl[] = {
+ [P_PING] = { 0, got_Ping },
+ [P_PING_ACK] = { 0, got_PingAck },
[P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
[P_RS_WRITE_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
- [P_DISCARD_ACK] = { sizeof(struct p_block_ack), got_BlockAck },
+ [P_SUPERSEDED] = { sizeof(struct p_block_ack), got_BlockAck },
[P_NEG_ACK] = { sizeof(struct p_block_ack), got_NegAck },
[P_NEG_DREPLY] = { sizeof(struct p_block_ack), got_NegDReply },
- [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply},
+ [P_NEG_RS_DREPLY] = { sizeof(struct p_block_ack), got_NegRSDReply },
[P_OV_RESULT] = { sizeof(struct p_block_ack), got_OVResult },
[P_BARRIER_ACK] = { sizeof(struct p_barrier_ack), got_BarrierAck },
[P_STATE_CHG_REPLY] = { sizeof(struct p_req_state_reply), got_RqSReply },
[P_RS_IS_IN_SYNC] = { sizeof(struct p_block_ack), got_IsInSync },
[P_DELAY_PROBE] = { sizeof(struct p_delay_probe93), got_skip },
- [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply},
- [P_MAX_CMD] = { 0, NULL },
- };
- if (cmd > P_MAX_CMD || asender_tbl[cmd].process == NULL)
- return NULL;
- return &asender_tbl[cmd];
-}
+ [P_RS_CANCEL] = { sizeof(struct p_block_ack), got_NegRSDReply },
+ [P_CONN_ST_CHG_REPLY]={ sizeof(struct p_req_state_reply), got_conn_RqSReply },
+ [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck },
+};
int drbd_asender(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
- struct p_header80 *h = &mdev->meta.rbuf.header.h80;
+ struct drbd_tconn *tconn = thi->tconn;
struct asender_cmd *cmd = NULL;
-
- int rv, len;
- void *buf = h;
+ struct packet_info pi;
+ int rv;
+ void *buf = tconn->meta.rbuf;
int received = 0;
- int expect = sizeof(struct p_header80);
- int empty;
- int ping_timeout_active = 0;
-
- sprintf(current->comm, "drbd%d_asender", mdev_to_minor(mdev));
+ unsigned int header_size = drbd_header_size(tconn);
+ int expect = header_size;
+ bool ping_timeout_active = false;
+ struct net_conf *nc;
+ int ping_timeo, tcp_cork, ping_int;
current->policy = SCHED_RR; /* Make this a realtime task! */
current->rt_priority = 2; /* more important than all other tasks */
- while (get_t_state(thi) == Running) {
- drbd_thread_current_set_cpu(mdev);
- if (drbd_test_and_clear_flag(mdev, SEND_PING)) {
- ERR_IF(!drbd_send_ping(mdev)) goto reconnect;
- mdev->meta.socket->sk->sk_rcvtimeo =
- mdev->net_conf->ping_timeo*HZ/10;
- ping_timeout_active = 1;
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
+
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ ping_timeo = nc->ping_timeo;
+ tcp_cork = nc->tcp_cork;
+ ping_int = nc->ping_int;
+ rcu_read_unlock();
+
+ if (test_and_clear_bit(SEND_PING, &tconn->flags)) {
+ if (drbd_send_ping(tconn)) {
+ conn_err(tconn, "drbd_send_ping has failed\n");
+ goto reconnect;
+ }
+ tconn->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10;
+ ping_timeout_active = true;
}
- /* conditionally cork;
- * it may hurt latency if we cork without much to send */
- if (!mdev->net_conf->no_cork &&
- 3 < atomic_read(&mdev->unacked_cnt))
- drbd_tcp_cork(mdev->meta.socket);
- while (1) {
- drbd_clear_flag(mdev, SIGNAL_ASENDER);
- flush_signals(current);
- if (!drbd_process_done_ee(mdev))
- goto reconnect;
- /* to avoid race with newly queued ACKs */
- drbd_set_flag(mdev, SIGNAL_ASENDER);
- spin_lock_irq(&mdev->req_lock);
- empty = list_empty(&mdev->done_ee);
- spin_unlock_irq(&mdev->req_lock);
- /* new ack may have been queued right here,
- * but then there is also a signal pending,
- * and we start over... */
- if (empty)
- break;
+ /* TODO: conditionally cork; it may hurt latency if we cork without
+ much to send */
+ if (tcp_cork)
+ drbd_tcp_cork(tconn->meta.socket);
+ if (tconn_finish_peer_reqs(tconn)) {
+ conn_err(tconn, "tconn_finish_peer_reqs() failed\n");
+ goto reconnect;
}
/* but unconditionally uncork unless disabled */
- if (!mdev->net_conf->no_cork)
- drbd_tcp_uncork(mdev->meta.socket);
+ if (tcp_cork)
+ drbd_tcp_uncork(tconn->meta.socket);
/* short circuit, recv_msg would return EINTR anyways. */
if (signal_pending(current))
continue;
- rv = drbd_recv_short(mdev, mdev->meta.socket,
- buf, expect-received, 0);
- drbd_clear_flag(mdev, SIGNAL_ASENDER);
+ rv = drbd_recv_short(tconn->meta.socket, buf, expect-received, 0);
+ clear_bit(SIGNAL_ASENDER, &tconn->flags);
flush_signals(current);
@@ -4670,87 +5301,91 @@
received += rv;
buf += rv;
} else if (rv == 0) {
- if (drbd_test_flag(mdev, DISCONNECT_SENT)) {
- long t; /* time_left */
- t = wait_event_timeout(mdev->state_wait, mdev->state.conn < C_CONNECTED,
- mdev->net_conf->ping_timeo * HZ/10);
+ if (test_bit(DISCONNECT_SENT, &tconn->flags)) {
+ long t;
+ rcu_read_lock();
+ t = rcu_dereference(tconn->net_conf)->ping_timeo * HZ/10;
+ rcu_read_unlock();
+
+ t = wait_event_timeout(tconn->ping_wait,
+ tconn->cstate < C_WF_REPORT_PARAMS,
+ t);
if (t)
break;
}
- dev_err(DEV, "meta connection shut down by peer.\n");
+ conn_err(tconn, "meta connection shut down by peer.\n");
goto reconnect;
} else if (rv == -EAGAIN) {
/* If the data socket received something meanwhile,
* that is good enough: peer is still alive. */
- if (time_after(mdev->last_received,
- jiffies - mdev->meta.socket->sk->sk_rcvtimeo))
+ if (time_after(tconn->last_received,
+ jiffies - tconn->meta.socket->sk->sk_rcvtimeo))
continue;
if (ping_timeout_active) {
- dev_err(DEV, "PingAck did not arrive in time.\n");
+ conn_err(tconn, "PingAck did not arrive in time.\n");
goto reconnect;
}
- drbd_set_flag(mdev, SEND_PING);
+ set_bit(SEND_PING, &tconn->flags);
continue;
} else if (rv == -EINTR) {
continue;
} else {
- dev_err(DEV, "sock_recvmsg returned %d\n", rv);
+ conn_err(tconn, "sock_recvmsg returned %d\n", rv);
goto reconnect;
}
if (received == expect && cmd == NULL) {
- if (unlikely(h->magic != BE_DRBD_MAGIC)) {
- dev_err(DEV, "magic?? on meta m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->magic),
- be16_to_cpu(h->command),
- be16_to_cpu(h->length));
+ if (decode_header(tconn, tconn->meta.rbuf, &pi))
goto reconnect;
- }
- cmd = get_asender_cmd(be16_to_cpu(h->command));
- len = be16_to_cpu(h->length);
- if (unlikely(cmd == NULL)) {
- dev_err(DEV, "unknown command?? on meta m: 0x%08x c: %d l: %d\n",
- be32_to_cpu(h->magic),
- be16_to_cpu(h->command),
- be16_to_cpu(h->length));
+ cmd = &asender_tbl[pi.cmd];
+ if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) {
+ conn_err(tconn, "Unexpected meta packet %s (0x%04x)\n",
+ cmdname(pi.cmd), pi.cmd);
goto disconnect;
}
- expect = cmd->pkt_size;
- ERR_IF(len != expect-sizeof(struct p_header80))
+ expect = header_size + cmd->pkt_size;
+ if (pi.size != expect - header_size) {
+ conn_err(tconn, "Wrong packet size on meta (c: %d, l: %d)\n",
+ pi.cmd, pi.size);
goto reconnect;
+ }
}
if (received == expect) {
- mdev->last_received = jiffies;
- D_ASSERT(cmd != NULL);
- if (!cmd->process(mdev, h))
+ bool err;
+
+ err = cmd->fn(tconn, &pi);
+ if (err) {
+ conn_err(tconn, "%pf failed\n", cmd->fn);
goto reconnect;
+ }
- /* the idle_timeout (ping-int)
- * has been restored in got_PingAck() */
- if (cmd == get_asender_cmd(P_PING_ACK))
- ping_timeout_active = 0;
+ tconn->last_received = jiffies;
- buf = h;
+ if (cmd == &asender_tbl[P_PING_ACK]) {
+ /* restore idle timeout */
+ tconn->meta.socket->sk->sk_rcvtimeo = ping_int * HZ;
+ ping_timeout_active = false;
+ }
+
+ buf = tconn->meta.rbuf;
received = 0;
- expect = sizeof(struct p_header80);
+ expect = header_size;
cmd = NULL;
}
}
if (0) {
reconnect:
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
- drbd_md_sync(mdev);
+ conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
+ conn_md_sync(tconn);
}
if (0) {
disconnect:
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- drbd_md_sync(mdev);
+ conn_request_state(tconn, NS(conn, C_DISCONNECTING), CS_HARD);
}
- drbd_clear_flag(mdev, SIGNAL_ASENDER);
+ clear_bit(SIGNAL_ASENDER, &tconn->flags);
- D_ASSERT(mdev->state.conn < C_CONNECTED);
- dev_info(DEV, "asender terminated\n");
+ conn_info(tconn, "asender terminated\n");
return 0;
}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 135ea76..f58a4a4 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -31,6 +31,8 @@
#include "drbd_req.h"
+static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size);
+
/* Update disk stats at start of I/O request */
static void _drbd_start_io_acct(struct drbd_conf *mdev, struct drbd_request *req, struct bio *bio)
{
@@ -40,6 +42,8 @@
part_round_stats(cpu, &mdev->vdisk->part0);
part_stat_inc(cpu, &mdev->vdisk->part0, ios[rw]);
part_stat_add(cpu, &mdev->vdisk->part0, sectors[rw], bio_sectors(bio));
+ (void) cpu; /* The macro invocations above want the cpu argument, I do not like
+ the compiler warning about cpu only assigned but never used... */
part_inc_in_flight(&mdev->vdisk->part0, rw);
part_stat_unlock();
}
@@ -57,9 +61,51 @@
part_stat_unlock();
}
-static void _req_is_done(struct drbd_conf *mdev, struct drbd_request *req, const int rw)
+static struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
+ struct bio *bio_src)
{
- const unsigned long s = req->rq_state;
+ struct drbd_request *req;
+
+ req = mempool_alloc(drbd_request_mempool, GFP_NOIO);
+ if (!req)
+ return NULL;
+
+ drbd_req_make_private_bio(req, bio_src);
+ req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
+ req->w.mdev = mdev;
+ req->master_bio = bio_src;
+ req->epoch = 0;
+
+ drbd_clear_interval(&req->i);
+ req->i.sector = bio_src->bi_sector;
+ req->i.size = bio_src->bi_size;
+ req->i.local = true;
+ req->i.waiting = false;
+
+ INIT_LIST_HEAD(&req->tl_requests);
+ INIT_LIST_HEAD(&req->w.list);
+
+ /* one reference to be put by __drbd_make_request */
+ atomic_set(&req->completion_ref, 1);
+ /* one kref as long as completion_ref > 0 */
+ kref_init(&req->kref);
+ return req;
+}
+
+void drbd_req_destroy(struct kref *kref)
+{
+ struct drbd_request *req = container_of(kref, struct drbd_request, kref);
+ struct drbd_conf *mdev = req->w.mdev;
+ const unsigned s = req->rq_state;
+
+ if ((req->master_bio && !(s & RQ_POSTPONED)) ||
+ atomic_read(&req->completion_ref) ||
+ (s & RQ_LOCAL_PENDING) ||
+ ((s & RQ_NET_MASK) && !(s & RQ_NET_DONE))) {
+ dev_err(DEV, "drbd_req_destroy: Logic BUG rq_state = 0x%x, completion_ref = %d\n",
+ s, atomic_read(&req->completion_ref));
+ return;
+ }
/* remove it from the transfer log.
* well, only if it had been there in the first
@@ -67,24 +113,33 @@
* and never sent), it should still be "empty" as
* initialized in drbd_req_new(), so we can list_del() it
* here unconditionally */
- list_del(&req->tl_requests);
+ list_del_init(&req->tl_requests);
/* if it was a write, we may have to set the corresponding
* bit(s) out-of-sync first. If it had a local part, we need to
* release the reference to the activity log. */
- if (rw == WRITE) {
+ if (s & RQ_WRITE) {
/* Set out-of-sync unless both OK flags are set
* (local only or remote failed).
* Other places where we set out-of-sync:
* READ with local io-error */
- if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
- drbd_set_out_of_sync(mdev, req->sector, req->size);
- if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
- drbd_set_in_sync(mdev, req->sector, req->size);
+ /* There is a special case:
+ * we may notice late that IO was suspended,
+ * and postpone, or schedule for retry, a write,
+ * before it even was submitted or sent.
+ * In that case we do not want to touch the bitmap at all.
+ */
+ if ((s & (RQ_POSTPONED|RQ_LOCAL_MASK|RQ_NET_MASK)) != RQ_POSTPONED) {
+ if (!(s & RQ_NET_OK) || !(s & RQ_LOCAL_OK))
+ drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
+
+ if ((s & RQ_NET_OK) && (s & RQ_LOCAL_OK) && (s & RQ_NET_SIS))
+ drbd_set_in_sync(mdev, req->i.sector, req->i.size);
+ }
/* one might be tempted to move the drbd_al_complete_io
- * to the local io completion callback drbd_endio_pri.
+ * to the local io completion callback drbd_request_endio.
* but, if this was a mirror write, we may only
* drbd_al_complete_io after this is RQ_NET_DONE,
* otherwise the extent could be dropped from the al
@@ -93,109 +148,35 @@
* but after the extent has been dropped from the al,
* we would forget to resync the corresponding extent.
*/
- if (s & RQ_LOCAL_MASK) {
+ if (s & RQ_IN_ACT_LOG) {
if (get_ldev_if_state(mdev, D_FAILED)) {
- if (s & RQ_IN_ACT_LOG)
- drbd_al_complete_io(mdev, req->sector);
+ drbd_al_complete_io(mdev, &req->i);
put_ldev(mdev);
} else if (__ratelimit(&drbd_ratelimit_state)) {
- dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu), "
- "but my Disk seems to have failed :(\n",
- (unsigned long long) req->sector);
+ dev_warn(DEV, "Should have called drbd_al_complete_io(, %llu, %u), "
+ "but my Disk seems to have failed :(\n",
+ (unsigned long long) req->i.sector, req->i.size);
}
}
}
- drbd_req_free(req);
+ mempool_free(req, drbd_request_mempool);
}
-static void queue_barrier(struct drbd_conf *mdev)
-{
- struct drbd_tl_epoch *b;
+static void wake_all_senders(struct drbd_tconn *tconn) {
+ wake_up(&tconn->sender_work.q_wait);
+}
- /* We are within the req_lock. Once we queued the barrier for sending,
- * we set the CREATE_BARRIER bit. It is cleared as soon as a new
- * barrier/epoch object is added. This is the only place this bit is
- * set. It indicates that the barrier for this epoch is already queued,
- * and no new epoch has been created yet. */
- if (drbd_test_flag(mdev, CREATE_BARRIER))
+/* must hold resource->req_lock */
+static void start_new_tl_epoch(struct drbd_tconn *tconn)
+{
+ /* no point closing an epoch, if it is empty, anyways. */
+ if (tconn->current_tle_writes == 0)
return;
- b = mdev->newest_tle;
- b->w.cb = w_send_barrier;
- /* inc_ap_pending done here, so we won't
- * get imbalanced on connection loss.
- * dec_ap_pending will be done in got_BarrierAck
- * or (on connection loss) in tl_clear. */
- inc_ap_pending(mdev);
- drbd_queue_work(&mdev->data.work, &b->w);
- drbd_set_flag(mdev, CREATE_BARRIER);
-}
-
-static void _about_to_complete_local_write(struct drbd_conf *mdev,
- struct drbd_request *req)
-{
- const unsigned long s = req->rq_state;
- struct drbd_request *i;
- struct drbd_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
-
- /* Before we can signal completion to the upper layers,
- * we may need to close the current epoch.
- * We can skip this, if this request has not even been sent, because we
- * did not have a fully established connection yet/anymore, during
- * bitmap exchange, or while we are C_AHEAD due to congestion policy.
- */
- if (mdev->state.conn >= C_CONNECTED &&
- (s & RQ_NET_SENT) != 0 &&
- req->epoch == mdev->newest_tle->br_number)
- queue_barrier(mdev);
-
- /* we need to do the conflict detection stuff,
- * if we have the ee_hash (two_primaries) and
- * this has been on the network */
- if ((s & RQ_NET_DONE) && mdev->ee_hash != NULL) {
- const sector_t sector = req->sector;
- const int size = req->size;
-
- /* ASSERT:
- * there must be no conflicting requests, since
- * they must have been failed on the spot */
-#define OVERLAPS overlaps(sector, size, i->sector, i->size)
- slot = tl_hash_slot(mdev, sector);
- hlist_for_each_entry(i, n, slot, collision) {
- if (OVERLAPS) {
- dev_alert(DEV, "LOGIC BUG: completed: %p %llus +%u; "
- "other: %p %llus +%u\n",
- req, (unsigned long long)sector, size,
- i, (unsigned long long)i->sector, i->size);
- }
- }
-
- /* maybe "wake" those conflicting epoch entries
- * that wait for this request to finish.
- *
- * currently, there can be only _one_ such ee
- * (well, or some more, which would be pending
- * P_DISCARD_ACK not yet sent by the asender...),
- * since we block the receiver thread upon the
- * first conflict detection, which will wait on
- * misc_wait. maybe we want to assert that?
- *
- * anyways, if we found one,
- * we just have to do a wake_up. */
-#undef OVERLAPS
-#define OVERLAPS overlaps(sector, size, e->sector, e->size)
- slot = ee_hash_slot(mdev, req->sector);
- hlist_for_each_entry(e, n, slot, collision) {
- if (OVERLAPS) {
- wake_up(&mdev->misc_wait);
- break;
- }
- }
- }
-#undef OVERLAPS
+ tconn->current_tle_writes = 0;
+ atomic_inc(&tconn->current_tle_nr);
+ wake_all_senders(tconn);
}
void complete_master_bio(struct drbd_conf *mdev,
@@ -205,17 +186,33 @@
dec_ap_bio(mdev);
}
+
+static void drbd_remove_request_interval(struct rb_root *root,
+ struct drbd_request *req)
+{
+ struct drbd_conf *mdev = req->w.mdev;
+ struct drbd_interval *i = &req->i;
+
+ drbd_remove_interval(root, i);
+
+ /* Wake up any processes waiting for this request to complete. */
+ if (i->waiting)
+ wake_up(&mdev->misc_wait);
+}
+
/* Helper for __req_mod().
* Set m->bio to the master bio, if it is fit to be completed,
* or leave it alone (it is initialized to NULL in __req_mod),
* if it has already been completed, or cannot be completed yet.
* If m->bio is set, the error status to be returned is placed in m->error.
*/
-void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
+static
+void drbd_req_complete(struct drbd_request *req, struct bio_and_error *m)
{
- const unsigned long s = req->rq_state;
- struct drbd_conf *mdev = req->mdev;
- int rw = req->rq_state & RQ_WRITE ? WRITE : READ;
+ const unsigned s = req->rq_state;
+ struct drbd_conf *mdev = req->w.mdev;
+ int rw;
+ int error, ok;
/* we must not complete the master bio, while it is
* still being processed by _drbd_send_zc_bio (drbd_send_dblock)
@@ -226,178 +223,219 @@
* the receiver,
* the bio_endio completion callbacks.
*/
- if (s & RQ_NET_QUEUED)
+ if ((s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED)) ||
+ (s & RQ_NET_QUEUED) || (s & RQ_NET_PENDING) ||
+ (s & RQ_COMPLETION_SUSP)) {
+ dev_err(DEV, "drbd_req_complete: Logic BUG rq_state = 0x%x\n", s);
return;
- if (s & RQ_NET_PENDING)
+ }
+
+ if (!req->master_bio) {
+ dev_err(DEV, "drbd_req_complete: Logic BUG, master_bio == NULL!\n");
return;
- if (s & RQ_LOCAL_PENDING && !(s & RQ_LOCAL_ABORTED))
- return;
+ }
- if (req->master_bio) {
- /* this is data_received (remote read)
- * or protocol C P_WRITE_ACK
- * or protocol B P_RECV_ACK
- * or protocol A "handed_over_to_network" (SendAck)
- * or canceled or failed,
- * or killed from the transfer log due to connection loss.
- */
+ rw = bio_rw(req->master_bio);
- /*
- * figure out whether to report success or failure.
- *
- * report success when at least one of the operations succeeded.
- * or, to put the other way,
- * only report failure, when both operations failed.
- *
- * what to do about the failures is handled elsewhere.
- * what we need to do here is just: complete the master_bio.
- *
- * local completion error, if any, has been stored as ERR_PTR
- * in private_bio within drbd_endio_pri.
- */
- int ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
- int error = PTR_ERR(req->private_bio);
+ /*
+ * figure out whether to report success or failure.
+ *
+ * report success when at least one of the operations succeeded.
+ * or, to put the other way,
+ * only report failure, when both operations failed.
+ *
+ * what to do about the failures is handled elsewhere.
+ * what we need to do here is just: complete the master_bio.
+ *
+ * local completion error, if any, has been stored as ERR_PTR
+ * in private_bio within drbd_request_endio.
+ */
+ ok = (s & RQ_LOCAL_OK) || (s & RQ_NET_OK);
+ error = PTR_ERR(req->private_bio);
- /* remove the request from the conflict detection
- * respective block_id verification hash */
- if (!hlist_unhashed(&req->collision))
- hlist_del(&req->collision);
- else
- D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
+ /* remove the request from the conflict detection
+ * respective block_id verification hash */
+ if (!drbd_interval_empty(&req->i)) {
+ struct rb_root *root;
- /* for writes we need to do some extra housekeeping */
if (rw == WRITE)
- _about_to_complete_local_write(mdev, req);
+ root = &mdev->write_requests;
+ else
+ root = &mdev->read_requests;
+ drbd_remove_request_interval(root, req);
+ } else if (!(s & RQ_POSTPONED))
+ D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
- /* Update disk stats */
- _drbd_end_io_acct(mdev, req);
+ /* Before we can signal completion to the upper layers,
+ * we may need to close the current transfer log epoch.
+ * We are within the request lock, so we can simply compare
+ * the request epoch number with the current transfer log
+ * epoch number. If they match, increase the current_tle_nr,
+ * and reset the transfer log epoch write_cnt.
+ */
+ if (rw == WRITE &&
+ req->epoch == atomic_read(&mdev->tconn->current_tle_nr))
+ start_new_tl_epoch(mdev->tconn);
+ /* Update disk stats */
+ _drbd_end_io_acct(mdev, req);
+
+ /* If READ failed,
+ * have it be pushed back to the retry work queue,
+ * so it will re-enter __drbd_make_request(),
+ * and be re-assigned to a suitable local or remote path,
+ * or failed if we do not have access to good data anymore.
+ *
+ * Unless it was failed early by __drbd_make_request(),
+ * because no path was available, in which case
+ * it was not even added to the transfer_log.
+ *
+ * READA may fail, and will not be retried.
+ *
+ * WRITE should have used all available paths already.
+ */
+ if (!ok && rw == READ && !list_empty(&req->tl_requests))
+ req->rq_state |= RQ_POSTPONED;
+
+ if (!(req->rq_state & RQ_POSTPONED)) {
m->error = ok ? 0 : (error ?: -EIO);
m->bio = req->master_bio;
req->master_bio = NULL;
}
-
- if (s & RQ_LOCAL_PENDING)
- return;
-
- if ((s & RQ_NET_MASK) == 0 || (s & RQ_NET_DONE)) {
- /* this is disconnected (local only) operation,
- * or protocol C P_WRITE_ACK,
- * or protocol A or B P_BARRIER_ACK,
- * or killed from the transfer log due to connection loss. */
- _req_is_done(mdev, req, rw);
- }
- /* else: network part and not DONE yet. that is
- * protocol A or B, barrier ack still pending... */
}
-static void _req_may_be_done_not_susp(struct drbd_request *req, struct bio_and_error *m)
+static int drbd_req_put_completion_ref(struct drbd_request *req, struct bio_and_error *m, int put)
{
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_conf *mdev = req->w.mdev;
+ D_ASSERT(m || (req->rq_state & RQ_POSTPONED));
- if (!is_susp(mdev->state))
- _req_may_be_done(req, m);
-}
-
-/*
- * checks whether there was an overlapping request
- * or ee already registered.
- *
- * if so, return 1, in which case this request is completed on the spot,
- * without ever being submitted or send.
- *
- * return 0 if it is ok to submit this request.
- *
- * NOTE:
- * paranoia: assume something above us is broken, and issues different write
- * requests for the same block simultaneously...
- *
- * To ensure these won't be reordered differently on both nodes, resulting in
- * diverging data sets, we discard the later one(s). Not that this is supposed
- * to happen, but this is the rationale why we also have to check for
- * conflicting requests with local origin, and why we have to do so regardless
- * of whether we allowed multiple primaries.
- *
- * BTW, in case we only have one primary, the ee_hash is empty anyways, and the
- * second hlist_for_each_entry becomes a noop. This is even simpler than to
- * grab a reference on the net_conf, and check for the two_primaries flag...
- */
-static int _req_conflicts(struct drbd_request *req)
-{
- struct drbd_conf *mdev = req->mdev;
- const sector_t sector = req->sector;
- const int size = req->size;
- struct drbd_request *i;
- struct drbd_epoch_entry *e;
- struct hlist_node *n;
- struct hlist_head *slot;
-
- D_ASSERT(hlist_unhashed(&req->collision));
-
- if (!get_net_conf(mdev))
+ if (!atomic_sub_and_test(put, &req->completion_ref))
return 0;
- /* BUG_ON */
- ERR_IF (mdev->tl_hash_s == 0)
- goto out_no_conflict;
- BUG_ON(mdev->tl_hash == NULL);
+ drbd_req_complete(req, m);
-#define OVERLAPS overlaps(i->sector, i->size, sector, size)
- slot = tl_hash_slot(mdev, sector);
- hlist_for_each_entry(i, n, slot, collision) {
- if (OVERLAPS) {
- dev_alert(DEV, "%s[%u] Concurrent local write detected! "
- "[DISCARD L] new: %llus +%u; "
- "pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)i->sector, i->size);
- goto out_conflict;
- }
+ if (req->rq_state & RQ_POSTPONED) {
+ /* don't destroy the req object just yet,
+ * but queue it for retry */
+ drbd_restart_request(req);
+ return 0;
}
- if (mdev->ee_hash_s) {
- /* now, check for overlapping requests with remote origin */
- BUG_ON(mdev->ee_hash == NULL);
-#undef OVERLAPS
-#define OVERLAPS overlaps(e->sector, e->size, sector, size)
- slot = ee_hash_slot(mdev, sector);
- hlist_for_each_entry(e, n, slot, collision) {
- if (OVERLAPS) {
- dev_alert(DEV, "%s[%u] Concurrent remote write detected!"
- " [DISCARD L] new: %llus +%u; "
- "pending: %llus +%u\n",
- current->comm, current->pid,
- (unsigned long long)sector, size,
- (unsigned long long)e->sector, e->size);
- goto out_conflict;
- }
- }
- }
-#undef OVERLAPS
-
-out_no_conflict:
- /* this is like it should be, and what we expected.
- * our users do behave after all... */
- put_net_conf(mdev);
- return 0;
-
-out_conflict:
- put_net_conf(mdev);
return 1;
}
+/* I'd like this to be the only place that manipulates
+ * req->completion_ref and req->kref. */
+static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m,
+ int clear, int set)
+{
+ struct drbd_conf *mdev = req->w.mdev;
+ unsigned s = req->rq_state;
+ int c_put = 0;
+ int k_put = 0;
+
+ if (drbd_suspended(mdev) && !((s | clear) & RQ_COMPLETION_SUSP))
+ set |= RQ_COMPLETION_SUSP;
+
+ /* apply */
+
+ req->rq_state &= ~clear;
+ req->rq_state |= set;
+
+ /* no change? */
+ if (req->rq_state == s)
+ return;
+
+ /* intent: get references */
+
+ if (!(s & RQ_LOCAL_PENDING) && (set & RQ_LOCAL_PENDING))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_NET_PENDING) && (set & RQ_NET_PENDING)) {
+ inc_ap_pending(mdev);
+ atomic_inc(&req->completion_ref);
+ }
+
+ if (!(s & RQ_NET_QUEUED) && (set & RQ_NET_QUEUED))
+ atomic_inc(&req->completion_ref);
+
+ if (!(s & RQ_EXP_BARR_ACK) && (set & RQ_EXP_BARR_ACK))
+ kref_get(&req->kref); /* wait for the DONE */
+
+ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT))
+ atomic_add(req->i.size >> 9, &mdev->ap_in_flight);
+
+ if (!(s & RQ_COMPLETION_SUSP) && (set & RQ_COMPLETION_SUSP))
+ atomic_inc(&req->completion_ref);
+
+ /* progress: put references */
+
+ if ((s & RQ_COMPLETION_SUSP) && (clear & RQ_COMPLETION_SUSP))
+ ++c_put;
+
+ if (!(s & RQ_LOCAL_ABORTED) && (set & RQ_LOCAL_ABORTED)) {
+ D_ASSERT(req->rq_state & RQ_LOCAL_PENDING);
+ /* local completion may still come in later,
+ * we need to keep the req object around. */
+ kref_get(&req->kref);
+ ++c_put;
+ }
+
+ if ((s & RQ_LOCAL_PENDING) && (clear & RQ_LOCAL_PENDING)) {
+ if (req->rq_state & RQ_LOCAL_ABORTED)
+ ++k_put;
+ else
+ ++c_put;
+ }
+
+ if ((s & RQ_NET_PENDING) && (clear & RQ_NET_PENDING)) {
+ dec_ap_pending(mdev);
+ ++c_put;
+ }
+
+ if ((s & RQ_NET_QUEUED) && (clear & RQ_NET_QUEUED))
+ ++c_put;
+
+ if ((s & RQ_EXP_BARR_ACK) && !(s & RQ_NET_DONE) && (set & RQ_NET_DONE)) {
+ if (req->rq_state & RQ_NET_SENT)
+ atomic_sub(req->i.size >> 9, &mdev->ap_in_flight);
+ ++k_put;
+ }
+
+ /* potentially complete and destroy */
+
+ if (k_put || c_put) {
+ /* Completion does it's own kref_put. If we are going to
+ * kref_sub below, we need req to be still around then. */
+ int at_least = k_put + !!c_put;
+ int refcount = atomic_read(&req->kref.refcount);
+ if (refcount < at_least)
+ dev_err(DEV,
+ "mod_rq_state: Logic BUG: %x -> %x: refcount = %d, should be >= %d\n",
+ s, req->rq_state, refcount, at_least);
+ }
+
+ /* If we made progress, retry conflicting peer requests, if any. */
+ if (req->i.waiting)
+ wake_up(&mdev->misc_wait);
+
+ if (c_put)
+ k_put += drbd_req_put_completion_ref(req, m, c_put);
+ if (k_put)
+ kref_sub(&req->kref, k_put, drbd_req_destroy);
+}
+
static void drbd_report_io_error(struct drbd_conf *mdev, struct drbd_request *req)
{
char b[BDEVNAME_SIZE];
- if (__ratelimit(&drbd_ratelimit_state))
+ if (!__ratelimit(&drbd_ratelimit_state))
return;
dev_warn(DEV, "local %s IO error sector %llu+%u on %s\n",
(req->rq_state & RQ_WRITE) ? "WRITE" : "READ",
- (unsigned long long)req->sector,
- req->size >> 9,
+ (unsigned long long)req->i.sector,
+ req->i.size >> 9,
bdevname(mdev->ldev->backing_bdev, b));
}
@@ -416,9 +454,12 @@
int __req_mod(struct drbd_request *req, enum drbd_req_event what,
struct bio_and_error *m)
{
- struct drbd_conf *mdev = req->mdev;
- int rv = 0;
- m->bio = NULL;
+ struct drbd_conf *mdev = req->w.mdev;
+ struct net_conf *nc;
+ int p, rv = 0;
+
+ if (m)
+ m->bio = NULL;
switch (what) {
default:
@@ -427,118 +468,91 @@
/* does not happen...
* initialization done in drbd_req_new
- case created:
+ case CREATED:
break;
*/
- case to_be_send: /* via network */
- /* reached via drbd_make_request_common
+ case TO_BE_SENT: /* via network */
+ /* reached via __drbd_make_request
* and from w_read_retry_remote */
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ p = nc->wire_protocol;
+ rcu_read_unlock();
+ req->rq_state |=
+ p == DRBD_PROT_C ? RQ_EXP_WRITE_ACK :
+ p == DRBD_PROT_B ? RQ_EXP_RECEIVE_ACK : 0;
+ mod_rq_state(req, m, 0, RQ_NET_PENDING);
break;
- case to_be_submitted: /* locally */
- /* reached via drbd_make_request_common */
+ case TO_BE_SUBMITTED: /* locally */
+ /* reached via __drbd_make_request */
D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
- req->rq_state |= RQ_LOCAL_PENDING;
+ mod_rq_state(req, m, 0, RQ_LOCAL_PENDING);
break;
- case completed_ok:
+ case COMPLETED_OK:
if (req->rq_state & RQ_WRITE)
- mdev->writ_cnt += req->size>>9;
+ mdev->writ_cnt += req->i.size >> 9;
else
- mdev->read_cnt += req->size>>9;
+ mdev->read_cnt += req->i.size >> 9;
- req->rq_state |= (RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING,
+ RQ_LOCAL_COMPLETED|RQ_LOCAL_OK);
break;
- case abort_disk_io:
- req->rq_state |= RQ_LOCAL_ABORTED;
- if (req->rq_state & RQ_WRITE)
- _req_may_be_done_not_susp(req, m);
- else
- goto goto_queue_for_net_read;
+ case ABORT_DISK_IO:
+ mod_rq_state(req, m, 0, RQ_LOCAL_ABORTED);
break;
- case write_completed_with_error:
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
+ case WRITE_COMPLETED_WITH_ERROR:
drbd_report_io_error(mdev, req);
__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
break;
- case read_ahead_completed_with_error:
- /* it is legal to fail READA */
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
- _req_may_be_done_not_susp(req, m);
- break;
-
- case read_completed_with_error:
- drbd_set_out_of_sync(mdev, req->sector, req->size);
-
- req->rq_state |= RQ_LOCAL_COMPLETED;
- req->rq_state &= ~RQ_LOCAL_PENDING;
-
- if (req->rq_state & RQ_LOCAL_ABORTED) {
- _req_may_be_done(req, m);
- break;
- }
-
+ case READ_COMPLETED_WITH_ERROR:
+ drbd_set_out_of_sync(mdev, req->i.sector, req->i.size);
drbd_report_io_error(mdev, req);
__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
+ /* fall through. */
+ case READ_AHEAD_COMPLETED_WITH_ERROR:
+ /* it is legal to fail READA, no __drbd_chk_io_error in that case. */
+ mod_rq_state(req, m, RQ_LOCAL_PENDING, RQ_LOCAL_COMPLETED);
+ break;
- goto_queue_for_net_read:
-
- D_ASSERT(!(req->rq_state & RQ_NET_MASK));
-
- /* no point in retrying if there is no good remote data,
- * or we have no connection. */
- if (mdev->state.pdsk != D_UP_TO_DATE) {
- _req_may_be_done_not_susp(req, m);
- break;
- }
-
- /* _req_mod(req,to_be_send); oops, recursion... */
- req->rq_state |= RQ_NET_PENDING;
- inc_ap_pending(mdev);
- /* fall through: _req_mod(req,queue_for_net_read); */
-
- case queue_for_net_read:
+ case QUEUE_FOR_NET_READ:
/* READ or READA, and
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
- /* from drbd_make_request_common
+ /* from __drbd_make_request
* or from bio_endio during read io-error recovery */
- /* so we can verify the handle in the answer packet
- * corresponding hlist_del is in _req_may_be_done() */
- hlist_add_head(&req->collision, ar_hash_slot(mdev, req->sector));
+ /* So we can verify the handle in the answer packet.
+ * Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(drbd_interval_empty(&req->i));
+ drbd_insert_interval(&mdev->read_requests, &req->i);
- drbd_set_flag(mdev, UNPLUG_REMOTE);
+ set_bit(UNPLUG_REMOTE, &mdev->flags);
D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
- req->w.cb = (req->rq_state & RQ_LOCAL_MASK)
- ? w_read_retry_remote
- : w_send_read_req;
- drbd_queue_work(&mdev->data.work, &req->w);
+ D_ASSERT((req->rq_state & RQ_LOCAL_MASK) == 0);
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_read_req;
+ drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
- case queue_for_net_write:
+ case QUEUE_FOR_NET_WRITE:
/* assert something? */
- /* from drbd_make_request_common only */
+ /* from __drbd_make_request only */
- hlist_add_head(&req->collision, tl_hash_slot(mdev, req->sector));
- /* corresponding hlist_del is in _req_may_be_done() */
+ /* Corresponding drbd_remove_request_interval is in
+ * drbd_req_complete() */
+ D_ASSERT(drbd_interval_empty(&req->i));
+ drbd_insert_interval(&mdev->write_requests, &req->i);
/* NOTE
* In case the req ended up on the transfer log before being
@@ -549,7 +563,7 @@
*
* _req_add_to_epoch(req); this has to be after the
* _maybe_start_new_epoch(req); which happened in
- * drbd_make_request_common, because we now may set the bit
+ * __drbd_make_request, because we now may set the bit
* again ourselves to close the current epoch.
*
* Add req to the (now) current epoch (barrier). */
@@ -557,204 +571,189 @@
/* otherwise we may lose an unplug, which may cause some remote
* io-scheduler timeout to expire, increasing maximum latency,
* hurting performance. */
- drbd_set_flag(mdev, UNPLUG_REMOTE);
-
- /* see drbd_make_request_common,
- * just after it grabs the req_lock */
- D_ASSERT(drbd_test_flag(mdev, CREATE_BARRIER) == 0);
-
- req->epoch = mdev->newest_tle->br_number;
-
- /* increment size of current epoch */
- mdev->newest_tle->n_writes++;
+ set_bit(UNPLUG_REMOTE, &mdev->flags);
/* queue work item to send data */
D_ASSERT(req->rq_state & RQ_NET_PENDING);
- req->rq_state |= RQ_NET_QUEUED;
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED|RQ_EXP_BARR_ACK);
req->w.cb = w_send_dblock;
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&mdev->tconn->sender_work, &req->w);
/* close the epoch, in case it outgrew the limit */
- if (mdev->newest_tle->n_writes >= mdev->net_conf->max_epoch_size)
- queue_barrier(mdev);
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ p = nc->max_epoch_size;
+ rcu_read_unlock();
+ if (mdev->tconn->current_tle_writes >= p)
+ start_new_tl_epoch(mdev->tconn);
break;
- case queue_for_send_oos:
- req->rq_state |= RQ_NET_QUEUED;
- req->w.cb = w_send_oos;
- drbd_queue_work(&mdev->data.work, &req->w);
+ case QUEUE_FOR_SEND_OOS:
+ mod_rq_state(req, m, 0, RQ_NET_QUEUED);
+ req->w.cb = w_send_out_of_sync;
+ drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
- case read_retry_remote_canceled:
- case send_canceled:
- case send_failed:
+ case READ_RETRY_REMOTE_CANCELED:
+ case SEND_CANCELED:
+ case SEND_FAILED:
/* real cleanup will be done from tl_clear. just update flags
* so it is no longer marked as on the worker queue */
- req->rq_state &= ~RQ_NET_QUEUED;
- /* if we did it right, tl_clear should be scheduled only after
- * this, so this should not be necessary! */
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_QUEUED, 0);
break;
- case handed_over_to_network:
+ case HANDED_OVER_TO_NETWORK:
/* assert something? */
- if (bio_data_dir(req->master_bio) == WRITE)
- atomic_add(req->size>>9, &mdev->ap_in_flight);
-
if (bio_data_dir(req->master_bio) == WRITE &&
- mdev->net_conf->wire_protocol == DRBD_PROT_A) {
+ !(req->rq_state & (RQ_EXP_RECEIVE_ACK | RQ_EXP_WRITE_ACK))) {
/* this is what is dangerous about protocol A:
* pretend it was successfully written on the peer. */
- if (req->rq_state & RQ_NET_PENDING) {
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= RQ_NET_OK;
- } /* else: neg-ack was faster... */
+ if (req->rq_state & RQ_NET_PENDING)
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
+ /* else: neg-ack was faster... */
/* it is still not yet RQ_NET_DONE until the
* corresponding epoch barrier got acked as well,
* so we know what to dirty on connection loss */
}
- req->rq_state &= ~RQ_NET_QUEUED;
- req->rq_state |= RQ_NET_SENT;
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_SENT);
break;
- case oos_handed_to_network:
+ case OOS_HANDED_TO_NETWORK:
/* Was not set PENDING, no longer QUEUED, so is now DONE
* as far as this connection is concerned. */
- req->rq_state &= ~RQ_NET_QUEUED;
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_QUEUED, RQ_NET_DONE);
break;
- case connection_lost_while_pending:
+ case CONNECTION_LOST_WHILE_PENDING:
/* transfer log cleanup after connection loss */
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING)
- dec_ap_pending(mdev);
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
- req->rq_state |= RQ_NET_DONE;
- if (req->rq_state & RQ_NET_SENT && req->rq_state & RQ_WRITE)
- atomic_sub(req->size>>9, &mdev->ap_in_flight);
-
- /* if it is still queued, we may not complete it here.
- * it will be canceled soon. */
- if (!(req->rq_state & RQ_NET_QUEUED))
- _req_may_be_done(req, m); /* Allowed while state.susp */
+ mod_rq_state(req, m,
+ RQ_NET_OK|RQ_NET_PENDING|RQ_COMPLETION_SUSP,
+ RQ_NET_DONE);
break;
- case conflict_discarded_by_peer:
- /* for discarded conflicting writes of multiple primaries,
+ case CONFLICT_RESOLVED:
+ /* for superseded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
- * node crashes are covered by the activity log. */
- if (what == conflict_discarded_by_peer)
- dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
- " DRBD is not a random data generator!\n",
- (unsigned long long)req->sector, req->size);
- req->rq_state |= RQ_NET_DONE;
- /* fall through */
- case write_acked_by_peer_and_sis:
- case write_acked_by_peer:
- if (what == write_acked_by_peer_and_sis)
- req->rq_state |= RQ_NET_SIS;
+ * node crashes are covered by the activity log.
+ *
+ * If this request had been marked as RQ_POSTPONED before,
+ * it will actually not be completed, but "restarted",
+ * resubmitted from the retry worker context. */
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_DONE|RQ_NET_OK);
+ break;
+
+ case WRITE_ACKED_BY_PEER_AND_SIS:
+ req->rq_state |= RQ_NET_SIS;
+ case WRITE_ACKED_BY_PEER:
+ D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
/* protocol C; successfully written on peer.
* Nothing more to do here.
* We want to keep the tl in place for all protocols, to cater
* for volatile write-back caches on lower level devices. */
- case recv_acked_by_peer:
+ goto ack_common;
+ case RECV_ACKED_BY_PEER:
+ D_ASSERT(req->rq_state & RQ_EXP_RECEIVE_ACK);
/* protocol B; pretends to be successfully written on peer.
- * see also notes above in handed_over_to_network about
+ * see also notes above in HANDED_OVER_TO_NETWORK about
* protocol != C */
- req->rq_state |= RQ_NET_OK;
+ ack_common:
D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- atomic_sub(req->size>>9, &mdev->ap_in_flight);
- req->rq_state &= ~RQ_NET_PENDING;
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK);
break;
- case neg_acked:
- /* assert something? */
- if (req->rq_state & RQ_NET_PENDING) {
- dec_ap_pending(mdev);
- atomic_sub(req->size>>9, &mdev->ap_in_flight);
- }
- req->rq_state &= ~(RQ_NET_OK|RQ_NET_PENDING);
-
- req->rq_state |= RQ_NET_DONE;
- _req_may_be_done_not_susp(req, m);
- /* else: done by handed_over_to_network */
+ case POSTPONE_WRITE:
+ D_ASSERT(req->rq_state & RQ_EXP_WRITE_ACK);
+ /* If this node has already detected the write conflict, the
+ * worker will be waiting on misc_wait. Wake it up once this
+ * request has completed locally.
+ */
+ D_ASSERT(req->rq_state & RQ_NET_PENDING);
+ req->rq_state |= RQ_POSTPONED;
+ if (req->i.waiting)
+ wake_up(&mdev->misc_wait);
+ /* Do not clear RQ_NET_PENDING. This request will make further
+ * progress via restart_conflicting_writes() or
+ * fail_postponed_requests(). Hopefully. */
break;
- case fail_frozen_disk_io:
+ case NEG_ACKED:
+ mod_rq_state(req, m, RQ_NET_OK|RQ_NET_PENDING, 0);
+ break;
+
+ case FAIL_FROZEN_DISK_IO:
+ if (!(req->rq_state & RQ_LOCAL_COMPLETED))
+ break;
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
+ break;
+
+ case RESTART_FROZEN_DISK_IO:
if (!(req->rq_state & RQ_LOCAL_COMPLETED))
break;
- _req_may_be_done(req, m); /* Allowed while state.susp */
- break;
-
- case restart_frozen_disk_io:
- if (!(req->rq_state & RQ_LOCAL_COMPLETED))
- break;
-
- req->rq_state &= ~RQ_LOCAL_COMPLETED;
+ mod_rq_state(req, m,
+ RQ_COMPLETION_SUSP|RQ_LOCAL_COMPLETED,
+ RQ_LOCAL_PENDING);
rv = MR_READ;
if (bio_data_dir(req->master_bio) == WRITE)
rv = MR_WRITE;
- get_ldev(mdev);
+ get_ldev(mdev); /* always succeeds in this call path */
req->w.cb = w_restart_disk_io;
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&mdev->tconn->sender_work, &req->w);
break;
- case resend:
+ case RESEND:
/* Simply complete (local only) READs. */
if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
- _req_may_be_done(req, m);
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, 0);
break;
}
/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
- before the connection loss (B&C only); only P_BARRIER_ACK was missing.
- Trowing them out of the TL here by pretending we got a BARRIER_ACK
- We ensure that the peer was not rebooted */
+ before the connection loss (B&C only); only P_BARRIER_ACK
+ (or the local completion?) was missing when we suspended.
+ Throwing them out of the TL here by pretending we got a BARRIER_ACK.
+ During connection handshake, we ensure that the peer was not rebooted. */
if (!(req->rq_state & RQ_NET_OK)) {
+ /* FIXME could this possibly be a req->w.cb == w_send_out_of_sync?
+ * in that case we must not set RQ_NET_PENDING. */
+
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP, RQ_NET_QUEUED|RQ_NET_PENDING);
if (req->w.cb) {
- drbd_queue_work(&mdev->data.work, &req->w);
+ drbd_queue_work(&mdev->tconn->sender_work, &req->w);
rv = req->rq_state & RQ_WRITE ? MR_WRITE : MR_READ;
- }
+ } /* else: FIXME can this happen? */
break;
}
- /* else, fall through to barrier_acked */
+ /* else, fall through to BARRIER_ACKED */
- case barrier_acked:
+ case BARRIER_ACKED:
+ /* barrier ack for READ requests does not make sense */
if (!(req->rq_state & RQ_WRITE))
break;
if (req->rq_state & RQ_NET_PENDING) {
- /* barrier came in before all requests have been acked.
+ /* barrier came in before all requests were acked.
* this is bad, because if the connection is lost now,
* we won't be able to clean them up... */
- dev_err(DEV, "FIXME (barrier_acked but pending)\n");
- list_move(&req->tl_requests, &mdev->out_of_sequence_requests);
+ dev_err(DEV, "FIXME (BARRIER_ACKED but pending)\n");
}
- if ((req->rq_state & RQ_NET_MASK) != 0) {
- req->rq_state |= RQ_NET_DONE;
- if (mdev->net_conf->wire_protocol == DRBD_PROT_A)
- atomic_sub(req->size>>9, &mdev->ap_in_flight);
- }
- _req_may_be_done(req, m); /* Allowed while state.susp */
+ /* Allowed to complete requests, even while suspended.
+ * As this is called for all requests within a matching epoch,
+ * we need to filter, and only set RQ_NET_DONE for those that
+ * have actually been on the wire. */
+ mod_rq_state(req, m, RQ_COMPLETION_SUSP,
+ (req->rq_state & RQ_NET_MASK) ? RQ_NET_DONE : 0);
break;
- case data_received:
+ case DATA_RECEIVED:
D_ASSERT(req->rq_state & RQ_NET_PENDING);
- dec_ap_pending(mdev);
- req->rq_state &= ~RQ_NET_PENDING;
- req->rq_state |= (RQ_NET_OK|RQ_NET_DONE);
- _req_may_be_done_not_susp(req, m);
+ mod_rq_state(req, m, RQ_NET_PENDING, RQ_NET_OK|RQ_NET_DONE);
break;
};
@@ -768,75 +767,265 @@
* since size may be bigger than BM_BLOCK_SIZE,
* we may need to check several bits.
*/
-static int drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
+static bool drbd_may_do_local_read(struct drbd_conf *mdev, sector_t sector, int size)
{
unsigned long sbnr, ebnr;
sector_t esector, nr_sectors;
if (mdev->state.disk == D_UP_TO_DATE)
- return 1;
- if (mdev->state.disk >= D_OUTDATED)
- return 0;
- if (mdev->state.disk < D_INCONSISTENT)
- return 0;
- /* state.disk == D_INCONSISTENT We will have a look at the BitMap */
- nr_sectors = drbd_get_capacity(mdev->this_bdev);
+ return true;
+ if (mdev->state.disk != D_INCONSISTENT)
+ return false;
esector = sector + (size >> 9) - 1;
-
+ nr_sectors = drbd_get_capacity(mdev->this_bdev);
D_ASSERT(sector < nr_sectors);
D_ASSERT(esector < nr_sectors);
sbnr = BM_SECT_TO_BIT(sector);
ebnr = BM_SECT_TO_BIT(esector);
- return 0 == drbd_bm_count_bits(mdev, sbnr, ebnr);
+ return drbd_bm_count_bits(mdev, sbnr, ebnr) == 0;
}
+static bool remote_due_to_read_balancing(struct drbd_conf *mdev, sector_t sector,
+ enum drbd_read_balancing rbm)
+{
+ struct backing_dev_info *bdi;
+ int stripe_shift;
+
+ switch (rbm) {
+ case RB_CONGESTED_REMOTE:
+ bdi = &mdev->ldev->backing_bdev->bd_disk->queue->backing_dev_info;
+ return bdi_read_congested(bdi);
+ case RB_LEAST_PENDING:
+ return atomic_read(&mdev->local_cnt) >
+ atomic_read(&mdev->ap_pending_cnt) + atomic_read(&mdev->rs_pending_cnt);
+ case RB_32K_STRIPING: /* stripe_shift = 15 */
+ case RB_64K_STRIPING:
+ case RB_128K_STRIPING:
+ case RB_256K_STRIPING:
+ case RB_512K_STRIPING:
+ case RB_1M_STRIPING: /* stripe_shift = 20 */
+ stripe_shift = (rbm - RB_32K_STRIPING + 15);
+ return (sector >> (stripe_shift - 9)) & 1;
+ case RB_ROUND_ROBIN:
+ return test_and_change_bit(READ_BALANCE_RR, &mdev->flags);
+ case RB_PREFER_REMOTE:
+ return true;
+ case RB_PREFER_LOCAL:
+ default:
+ return false;
+ }
+}
+
+/*
+ * complete_conflicting_writes - wait for any conflicting write requests
+ *
+ * The write_requests tree contains all active write requests which we
+ * currently know about. Wait for any requests to complete which conflict with
+ * the new one.
+ *
+ * Only way out: remove the conflicting intervals from the tree.
+ */
+static void complete_conflicting_writes(struct drbd_request *req)
+{
+ DEFINE_WAIT(wait);
+ struct drbd_conf *mdev = req->w.mdev;
+ struct drbd_interval *i;
+ sector_t sector = req->i.sector;
+ int size = req->i.size;
+
+ i = drbd_find_overlap(&mdev->write_requests, sector, size);
+ if (!i)
+ return;
+
+ for (;;) {
+ prepare_to_wait(&mdev->misc_wait, &wait, TASK_UNINTERRUPTIBLE);
+ i = drbd_find_overlap(&mdev->write_requests, sector, size);
+ if (!i)
+ break;
+ /* Indicate to wake up device->misc_wait on progress. */
+ i->waiting = true;
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ schedule();
+ spin_lock_irq(&mdev->tconn->req_lock);
+ }
+ finish_wait(&mdev->misc_wait, &wait);
+}
+
+/* called within req_lock and rcu_read_lock() */
static void maybe_pull_ahead(struct drbd_conf *mdev)
{
- int congested = 0;
+ struct drbd_tconn *tconn = mdev->tconn;
+ struct net_conf *nc;
+ bool congested = false;
+ enum drbd_on_congestion on_congestion;
+
+ nc = rcu_dereference(tconn->net_conf);
+ on_congestion = nc ? nc->on_congestion : OC_BLOCK;
+ if (on_congestion == OC_BLOCK ||
+ tconn->agreed_pro_version < 96)
+ return;
/* If I don't even have good local storage, we can not reasonably try
* to pull ahead of the peer. We also need the local reference to make
* sure mdev->act_log is there.
- * Note: caller has to make sure that net_conf is there.
*/
if (!get_ldev_if_state(mdev, D_UP_TO_DATE))
return;
- if (mdev->net_conf->cong_fill &&
- atomic_read(&mdev->ap_in_flight) >= mdev->net_conf->cong_fill) {
+ if (nc->cong_fill &&
+ atomic_read(&mdev->ap_in_flight) >= nc->cong_fill) {
dev_info(DEV, "Congestion-fill threshold reached\n");
- congested = 1;
+ congested = true;
}
- if (mdev->act_log->used >= mdev->net_conf->cong_extents) {
+ if (mdev->act_log->used >= nc->cong_extents) {
dev_info(DEV, "Congestion-extents threshold reached\n");
- congested = 1;
+ congested = true;
}
if (congested) {
- queue_barrier(mdev); /* last barrier, after mirrored writes */
+ /* start a new epoch for non-mirrored writes */
+ start_new_tl_epoch(mdev->tconn);
- if (mdev->net_conf->on_congestion == OC_PULL_AHEAD)
+ if (on_congestion == OC_PULL_AHEAD)
_drbd_set_state(_NS(mdev, conn, C_AHEAD), 0, NULL);
- else /*mdev->net_conf->on_congestion == OC_DISCONNECT */
+ else /*nc->on_congestion == OC_DISCONNECT */
_drbd_set_state(_NS(mdev, conn, C_DISCONNECTING), 0, NULL);
}
put_ldev(mdev);
}
-static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
+/* If this returns false, and req->private_bio is still set,
+ * this should be submitted locally.
+ *
+ * If it returns false, but req->private_bio is not set,
+ * we do not have access to good data :(
+ *
+ * Otherwise, this destroys req->private_bio, if any,
+ * and returns true.
+ */
+static bool do_remote_read(struct drbd_request *req)
+{
+ struct drbd_conf *mdev = req->w.mdev;
+ enum drbd_read_balancing rbm;
+
+ if (req->private_bio) {
+ if (!drbd_may_do_local_read(mdev,
+ req->i.sector, req->i.size)) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(mdev);
+ }
+ }
+
+ if (mdev->state.pdsk != D_UP_TO_DATE)
+ return false;
+
+ if (req->private_bio == NULL)
+ return true;
+
+ /* TODO: improve read balancing decisions, take into account drbd
+ * protocol, pending requests etc. */
+
+ rcu_read_lock();
+ rbm = rcu_dereference(mdev->ldev->disk_conf)->read_balancing;
+ rcu_read_unlock();
+
+ if (rbm == RB_PREFER_LOCAL && req->private_bio)
+ return false; /* submit locally */
+
+ if (remote_due_to_read_balancing(mdev, req->i.sector, rbm)) {
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(mdev);
+ }
+ return true;
+ }
+
+ return false;
+}
+
+/* returns number of connections (== 1, for drbd 8.4)
+ * expected to actually write this data,
+ * which does NOT include those that we are L_AHEAD for. */
+static int drbd_process_write_request(struct drbd_request *req)
+{
+ struct drbd_conf *mdev = req->w.mdev;
+ int remote, send_oos;
+
+ rcu_read_lock();
+ remote = drbd_should_do_remote(mdev->state);
+ if (remote) {
+ maybe_pull_ahead(mdev);
+ remote = drbd_should_do_remote(mdev->state);
+ }
+ send_oos = drbd_should_send_out_of_sync(mdev->state);
+ rcu_read_unlock();
+
+ /* Need to replicate writes. Unless it is an empty flush,
+ * which is better mapped to a DRBD P_BARRIER packet,
+ * also for drbd wire protocol compatibility reasons.
+ * If this was a flush, just start a new epoch.
+ * Unless the current epoch was empty anyways, or we are not currently
+ * replicating, in which case there is no point. */
+ if (unlikely(req->i.size == 0)) {
+ /* The only size==0 bios we expect are empty flushes. */
+ D_ASSERT(req->master_bio->bi_rw & REQ_FLUSH);
+ if (remote)
+ start_new_tl_epoch(mdev->tconn);
+ return 0;
+ }
+
+ if (!remote && !send_oos)
+ return 0;
+
+ D_ASSERT(!(remote && send_oos));
+
+ if (remote) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_WRITE);
+ } else if (drbd_set_out_of_sync(mdev, req->i.sector, req->i.size))
+ _req_mod(req, QUEUE_FOR_SEND_OOS);
+
+ return remote;
+}
+
+static void
+drbd_submit_req_private_bio(struct drbd_request *req)
+{
+ struct drbd_conf *mdev = req->w.mdev;
+ struct bio *bio = req->private_bio;
+ const int rw = bio_rw(bio);
+
+ bio->bi_bdev = mdev->ldev->backing_bdev;
+
+ /* State may have changed since we grabbed our reference on the
+ * ->ldev member. Double check, and short-circuit to endio.
+ * In case the last activity log transaction failed to get on
+ * stable storage, and this is a WRITE, we may not even submit
+ * this bio. */
+ if (get_ldev(mdev)) {
+ if (drbd_insert_fault(mdev,
+ rw == WRITE ? DRBD_FAULT_DT_WR
+ : rw == READ ? DRBD_FAULT_DT_RD
+ : DRBD_FAULT_DT_RA))
+ bio_endio(bio, -EIO);
+ else
+ generic_make_request(bio);
+ put_ldev(mdev);
+ } else
+ bio_endio(bio, -EIO);
+}
+
+void __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
- const int size = bio->bi_size;
- const sector_t sector = bio->bi_sector;
- struct drbd_tl_epoch *b = NULL;
+ struct bio_and_error m = { NULL, };
struct drbd_request *req;
- int local, remote, send_oos = 0;
- int err = -EIO;
- int ret = 0;
- union drbd_state s;
+ bool no_remote = false;
/* allocate outside of all locks; */
req = drbd_req_new(mdev, bio);
@@ -846,55 +1035,14 @@
* if user cannot handle io errors, that's not our business. */
dev_err(DEV, "could not kmalloc() req\n");
bio_endio(bio, -ENOMEM);
- return 0;
+ return;
}
req->start_time = start_time;
- local = get_ldev(mdev);
- if (!local) {
- bio_put(req->private_bio); /* or we get a bio leak */
+ if (!get_ldev(mdev)) {
+ bio_put(req->private_bio);
req->private_bio = NULL;
}
- if (rw == WRITE) {
- /* Need to replicate writes. Unless it is an empty flush,
- * which is better mapped to a DRBD P_BARRIER packet,
- * also for drbd wire protocol compatibility reasons. */
- if (unlikely(size == 0)) {
- /* The only size==0 bios we expect are empty flushes. */
- D_ASSERT(bio->bi_rw & REQ_FLUSH);
- remote = 0;
- } else
- remote = 1;
- } else {
- /* READ || READA */
- if (local) {
- if (!drbd_may_do_local_read(mdev, sector, size)) {
- /* we could kick the syncer to
- * sync this extent asap, wait for
- * it, then continue locally.
- * Or just issue the request remotely.
- */
- local = 0;
- bio_put(req->private_bio);
- req->private_bio = NULL;
- put_ldev(mdev);
- }
- }
- remote = !local && mdev->state.pdsk >= D_UP_TO_DATE;
- }
-
- /* If we have a disk, but a READA request is mapped to remote,
- * we are R_PRIMARY, D_INCONSISTENT, SyncTarget.
- * Just fail that READA request right here.
- *
- * THINK: maybe fail all READA when not local?
- * or make this configurable...
- * if network is slow, READA won't do any good.
- */
- if (rw == READA && mdev->state.disk >= D_INCONSISTENT && !local) {
- err = -EWOULDBLOCK;
- goto fail_and_free_req;
- }
/* For WRITES going to the local disk, grab a reference on the target
* extent. This waits for any resync activity in the corresponding
@@ -903,349 +1051,131 @@
* of transactional on-disk meta data updates.
* Empty flushes don't need to go into the activity log, they can only
* flush data for pending writes which are already in there. */
- if (rw == WRITE && local && size
- && !drbd_test_flag(mdev, AL_SUSPENDED)) {
+ if (rw == WRITE && req->private_bio && req->i.size
+ && !test_bit(AL_SUSPENDED, &mdev->flags)) {
req->rq_state |= RQ_IN_ACT_LOG;
- drbd_al_begin_io(mdev, sector);
+ drbd_al_begin_io(mdev, &req->i);
}
- s = mdev->state;
- remote = remote && drbd_should_do_remote(s);
- send_oos = rw == WRITE && drbd_should_send_oos(s);
- D_ASSERT(!(remote && send_oos));
-
- if (!(local || remote) && !is_susp(mdev->state)) {
- if (__ratelimit(&drbd_ratelimit_state))
- dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
- (unsigned long long)req->sector, req->size >> 9);
- goto fail_free_complete;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ if (rw == WRITE) {
+ /* This may temporarily give up the req_lock,
+ * but will re-aquire it before it returns here.
+ * Needs to be before the check on drbd_suspended() */
+ complete_conflicting_writes(req);
}
- /* For WRITE request, we have to make sure that we have an
- * unused_spare_tle, in case we need to start a new epoch.
- * I try to be smart and avoid to pre-allocate always "just in case",
- * but there is a race between testing the bit and pointer outside the
- * spinlock, and grabbing the spinlock.
- * if we lost that race, we retry. */
- if (rw == WRITE && (remote || send_oos) &&
- mdev->unused_spare_tle == NULL &&
- drbd_test_flag(mdev, CREATE_BARRIER)) {
-allocate_barrier:
- b = kmalloc(sizeof(struct drbd_tl_epoch), GFP_NOIO);
- if (!b) {
- dev_err(DEV, "Failed to alloc barrier.\n");
- err = -ENOMEM;
- goto fail_free_complete;
+ /* no more giving up req_lock from now on! */
+
+ if (drbd_suspended(mdev)) {
+ /* push back and retry: */
+ req->rq_state |= RQ_POSTPONED;
+ if (req->private_bio) {
+ bio_put(req->private_bio);
+ req->private_bio = NULL;
+ put_ldev(mdev);
}
+ goto out;
}
- /* GOOD, everything prepared, grab the spin_lock */
- spin_lock_irq(&mdev->req_lock);
-
- if (is_susp(mdev->state)) {
- /* If we got suspended, use the retry mechanism of
- drbd_make_request() to restart processing of this
- bio. In the next call to drbd_make_request
- we sleep in inc_ap_bio() */
- ret = 1;
- spin_unlock_irq(&mdev->req_lock);
- goto fail_free_complete;
- }
-
- if (remote || send_oos) {
- remote = drbd_should_do_remote(mdev->state);
- send_oos = rw == WRITE && drbd_should_send_oos(mdev->state);
- D_ASSERT(!(remote && send_oos));
-
- if (!(remote || send_oos))
- dev_warn(DEV, "lost connection while grabbing the req_lock!\n");
- if (!(local || remote)) {
- dev_err(DEV, "IO ERROR: neither local nor remote disk\n");
- spin_unlock_irq(&mdev->req_lock);
- goto fail_free_complete;
- }
- }
-
- if (b && mdev->unused_spare_tle == NULL) {
- mdev->unused_spare_tle = b;
- b = NULL;
- }
- if (rw == WRITE && (remote || send_oos) &&
- mdev->unused_spare_tle == NULL &&
- drbd_test_flag(mdev, CREATE_BARRIER)) {
- /* someone closed the current epoch
- * while we were grabbing the spinlock */
- spin_unlock_irq(&mdev->req_lock);
- goto allocate_barrier;
- }
-
-
/* Update disk stats */
_drbd_start_io_acct(mdev, req, bio);
- /* _maybe_start_new_epoch(mdev);
- * If we need to generate a write barrier packet, we have to add the
- * new epoch (barrier) object, and queue the barrier packet for sending,
- * and queue the req's data after it _within the same lock_, otherwise
- * we have race conditions were the reorder domains could be mixed up.
- *
- * Even read requests may start a new epoch and queue the corresponding
- * barrier packet. To get the write ordering right, we only have to
- * make sure that, if this is a write request and it triggered a
- * barrier packet, this request is queued within the same spinlock. */
- if ((remote || send_oos) && mdev->unused_spare_tle &&
- drbd_test_and_clear_flag(mdev, CREATE_BARRIER)) {
- _tl_add_barrier(mdev, mdev->unused_spare_tle);
- mdev->unused_spare_tle = NULL;
- } else {
- D_ASSERT(!(remote && rw == WRITE &&
- drbd_test_flag(mdev, CREATE_BARRIER)));
+ /* We fail READ/READA early, if we can not serve it.
+ * We must do this before req is registered on any lists.
+ * Otherwise, drbd_req_complete() will queue failed READ for retry. */
+ if (rw != WRITE) {
+ if (!do_remote_read(req) && !req->private_bio)
+ goto nodata;
}
- /* NOTE
- * Actually, 'local' may be wrong here already, since we may have failed
- * to write to the meta data, and may become wrong anytime because of
- * local io-error for some other request, which would lead to us
- * "detaching" the local disk.
- *
- * 'remote' may become wrong any time because the network could fail.
- *
- * This is a harmless race condition, though, since it is handled
- * correctly at the appropriate places; so it just defers the failure
- * of the respective operation.
- */
-
- /* mark them early for readability.
- * this just sets some state flags. */
- if (remote)
- _req_mod(req, to_be_send);
- if (local)
- _req_mod(req, to_be_submitted);
-
- /* check this request on the collision detection hash tables.
- * if we have a conflict, just complete it here.
- * THINK do we want to check reads, too? (I don't think so...) */
- if (rw == WRITE && _req_conflicts(req))
- goto fail_conflicting;
+ /* which transfer log epoch does this belong to? */
+ req->epoch = atomic_read(&mdev->tconn->current_tle_nr);
/* no point in adding empty flushes to the transfer log,
* they are mapped to drbd barriers already. */
- if (likely(size!=0))
- list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+ if (likely(req->i.size!=0)) {
+ if (rw == WRITE)
+ mdev->tconn->current_tle_writes++;
- /* NOTE remote first: to get the concurrent write detection right,
- * we must register the request before start of local IO. */
- if (remote) {
- /* either WRITE and C_CONNECTED,
- * or READ, and no local disk,
- * or READ, but not in sync.
- */
- _req_mod(req, (rw == WRITE)
- ? queue_for_net_write
- : queue_for_net_read);
+ list_add_tail(&req->tl_requests, &mdev->tconn->transfer_log);
}
- if (send_oos && drbd_set_out_of_sync(mdev, sector, size))
- _req_mod(req, queue_for_send_oos);
- if (remote &&
- mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
- maybe_pull_ahead(mdev);
-
- /* If this was a flush, queue a drbd barrier/start a new epoch.
- * Unless the current epoch was empty anyways, or we are not currently
- * replicating, in which case there is no point. */
- if (unlikely(bio->bi_rw & REQ_FLUSH)
- && mdev->newest_tle->n_writes
- && drbd_should_do_remote(mdev->state))
- queue_barrier(mdev);
-
- spin_unlock_irq(&mdev->req_lock);
- kfree(b); /* if someone else has beaten us to it... */
-
- if (local) {
- req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
-
- /* State may have changed since we grabbed our reference on the
- * mdev->ldev member. Double check, and short-circuit to endio.
- * In case the last activity log transaction failed to get on
- * stable storage, and this is a WRITE, we may not even submit
- * this bio. */
- if (get_ldev(mdev)) {
- if (drbd_insert_fault(mdev, rw == WRITE ? DRBD_FAULT_DT_WR
- : rw == READ ? DRBD_FAULT_DT_RD
- : DRBD_FAULT_DT_RA))
- bio_endio(req->private_bio, -EIO);
- else
- generic_make_request(req->private_bio);
- put_ldev(mdev);
+ if (rw == WRITE) {
+ if (!drbd_process_write_request(req))
+ no_remote = true;
+ } else {
+ /* We either have a private_bio, or we can read from remote.
+ * Otherwise we had done the goto nodata above. */
+ if (req->private_bio == NULL) {
+ _req_mod(req, TO_BE_SENT);
+ _req_mod(req, QUEUE_FOR_NET_READ);
} else
- bio_endio(req->private_bio, -EIO);
+ no_remote = true;
}
- return 0;
-
-fail_conflicting:
- /* this is a conflicting request.
- * even though it may have been only _partially_
- * overlapping with one of the currently pending requests,
- * without even submitting or sending it, we will
- * pretend that it was successfully served right now.
- */
- _drbd_end_io_acct(mdev, req);
- spin_unlock_irq(&mdev->req_lock);
- if (remote)
- dec_ap_pending(mdev);
- /* THINK: do we want to fail it (-EIO), or pretend success?
- * this pretends success. */
- err = 0;
-
-fail_free_complete:
- if (req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_complete_io(mdev, sector);
-fail_and_free_req:
- if (local) {
- bio_put(req->private_bio);
- req->private_bio = NULL;
- put_ldev(mdev);
- }
- if (!ret)
- bio_endio(bio, err);
-
- drbd_req_free(req);
- dec_ap_bio(mdev);
- kfree(b);
-
- return ret;
-}
-
-/* helper function for drbd_make_request
- * if we can determine just by the mdev (state) that this request will fail,
- * return 1
- * otherwise return 0
- */
-static int drbd_fail_request_early(struct drbd_conf *mdev, int is_write)
-{
- if (mdev->state.role != R_PRIMARY &&
- (!allow_oos || is_write)) {
- if (__ratelimit(&drbd_ratelimit_state)) {
- dev_err(DEV, "Process %s[%u] tried to %s; "
- "since we are not in Primary state, "
- "we cannot allow this\n",
- current->comm, current->pid,
- is_write ? "WRITE" : "READ");
- }
- return 1;
+ if (req->private_bio) {
+ /* needs to be marked within the same spinlock */
+ _req_mod(req, TO_BE_SUBMITTED);
+ /* but we need to give up the spinlock to submit */
+ spin_unlock_irq(&mdev->tconn->req_lock);
+ drbd_submit_req_private_bio(req);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ } else if (no_remote) {
+nodata:
+ if (__ratelimit(&drbd_ratelimit_state))
+ dev_err(DEV, "IO ERROR: neither local nor remote data, sector %llu+%u\n",
+ (unsigned long long)req->i.sector, req->i.size >> 9);
+ /* A write may have been queued for send_oos, however.
+ * So we can not simply free it, we must go through drbd_req_put_completion_ref() */
}
- return 0;
+out:
+ if (drbd_req_put_completion_ref(req, &m, 1))
+ kref_put(&req->kref, drbd_req_destroy);
+ spin_unlock_irq(&mdev->tconn->req_lock);
+
+ if (m.bio)
+ complete_master_bio(mdev, &m);
+ return;
}
void drbd_make_request(struct request_queue *q, struct bio *bio)
{
- unsigned int s_enr, e_enr;
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
unsigned long start_time;
- if (drbd_fail_request_early(mdev, bio_data_dir(bio) & WRITE)) {
- bio_endio(bio, -EPERM);
- return;
- }
-
start_time = jiffies;
/*
* what we "blindly" assume:
*/
- D_ASSERT((bio->bi_size & 0x1ff) == 0);
+ D_ASSERT(IS_ALIGNED(bio->bi_size, 512));
- /* to make some things easier, force alignment of requests within the
- * granularity of our hash tables */
- s_enr = bio->bi_sector >> HT_SHIFT;
- e_enr = bio->bi_size ? (bio->bi_sector+(bio->bi_size>>9)-1) >> HT_SHIFT : s_enr;
-
- if (likely(s_enr == e_enr)) {
- do {
- inc_ap_bio(mdev, 1);
- } while (drbd_make_request_common(mdev, bio, start_time));
- return;
- }
-
- /* can this bio be split generically?
- * Maybe add our own split-arbitrary-bios function. */
- if (bio->bi_vcnt != 1 || bio->bi_idx != 0 || bio->bi_size > DRBD_MAX_BIO_SIZE) {
- /* rather error out here than BUG in bio_split */
- dev_err(DEV, "bio would need to, but cannot, be split: "
- "(vcnt=%u,idx=%u,size=%u,sector=%llu)\n",
- bio->bi_vcnt, bio->bi_idx, bio->bi_size,
- (unsigned long long)bio->bi_sector);
- bio_endio(bio, -EINVAL);
- } else {
- /* This bio crosses some boundary, so we have to split it. */
- struct bio_pair *bp;
- /* works for the "do not cross hash slot boundaries" case
- * e.g. sector 262269, size 4096
- * s_enr = 262269 >> 6 = 4097
- * e_enr = (262269+8-1) >> 6 = 4098
- * HT_SHIFT = 6
- * sps = 64, mask = 63
- * first_sectors = 64 - (262269 & 63) = 3
- */
- const sector_t sect = bio->bi_sector;
- const int sps = 1 << HT_SHIFT; /* sectors per slot */
- const int mask = sps - 1;
- const sector_t first_sectors = sps - (sect & mask);
- bp = bio_split(bio, first_sectors);
-
- /* we need to get a "reference count" (ap_bio_cnt)
- * to avoid races with the disconnect/reconnect/suspend code.
- * In case we need to split the bio here, we need to get three references
- * atomically, otherwise we might deadlock when trying to submit the
- * second one! */
- inc_ap_bio(mdev, 3);
-
- D_ASSERT(e_enr == s_enr + 1);
-
- while (drbd_make_request_common(mdev, &bp->bio1, start_time))
- inc_ap_bio(mdev, 1);
-
- while (drbd_make_request_common(mdev, &bp->bio2, start_time))
- inc_ap_bio(mdev, 1);
-
- dec_ap_bio(mdev);
-
- bio_pair_release(bp);
- }
+ inc_ap_bio(mdev);
+ __drbd_make_request(mdev, bio, start_time);
}
-/* This is called by bio_add_page(). With this function we reduce
- * the number of BIOs that span over multiple DRBD_MAX_BIO_SIZEs
- * units (was AL_EXTENTs).
+/* This is called by bio_add_page().
*
- * we do the calculation within the lower 32bit of the byte offsets,
- * since we don't care for actual offset, but only check whether it
- * would cross "activity log extent" boundaries.
+ * q->max_hw_sectors and other global limits are already enforced there.
+ *
+ * We need to call down to our lower level device,
+ * in case it has special restrictions.
+ *
+ * We also may need to enforce configured max-bio-bvecs limits.
*
* As long as the BIO is empty we have to allow at least one bvec,
- * regardless of size and offset. so the resulting bio may still
- * cross extent boundaries. those are dealt with (bio_split) in
- * drbd_make_request.
+ * regardless of size and offset, so no need to ask lower levels.
*/
int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec)
{
struct drbd_conf *mdev = (struct drbd_conf *) q->queuedata;
- unsigned int bio_offset =
- (unsigned int)bvm->bi_sector << 9; /* 32 bit */
unsigned int bio_size = bvm->bi_size;
- int limit, backing_limit;
+ int limit = DRBD_MAX_BIO_SIZE;
+ int backing_limit;
- limit = DRBD_MAX_BIO_SIZE
- - ((bio_offset & (DRBD_MAX_BIO_SIZE-1)) + bio_size);
- if (limit < 0)
- limit = 0;
- if (bio_size == 0) {
- if (limit <= bvec->bv_len)
- limit = bvec->bv_len;
- } else if (limit && get_ldev(mdev)) {
+ if (bio_size && get_ldev(mdev)) {
struct request_queue * const b =
mdev->ldev->backing_bdev->bd_disk->queue;
if (b->merge_bvec_fn) {
@@ -1257,24 +1187,38 @@
return limit;
}
+struct drbd_request *find_oldest_request(struct drbd_tconn *tconn)
+{
+ /* Walk the transfer log,
+ * and find the oldest not yet completed request */
+ struct drbd_request *r;
+ list_for_each_entry(r, &tconn->transfer_log, tl_requests) {
+ if (atomic_read(&r->completion_ref))
+ return r;
+ }
+ return NULL;
+}
+
void request_timer_fn(unsigned long data)
{
struct drbd_conf *mdev = (struct drbd_conf *) data;
+ struct drbd_tconn *tconn = mdev->tconn;
struct drbd_request *req; /* oldest request */
- struct list_head *le;
+ struct net_conf *nc;
unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */
unsigned long now;
- if (get_net_conf(mdev)) {
- if (mdev->state.conn >= C_WF_REPORT_PARAMS)
- ent = mdev->net_conf->timeout*HZ/10
- * mdev->net_conf->ko_count;
- put_net_conf(mdev);
- }
+ rcu_read_lock();
+ nc = rcu_dereference(tconn->net_conf);
+ if (nc && mdev->state.conn >= C_WF_REPORT_PARAMS)
+ ent = nc->timeout * HZ/10 * nc->ko_count;
+
if (get_ldev(mdev)) { /* implicit state.disk >= D_INCONSISTENT */
- dt = mdev->ldev->dc.disk_timeout * HZ / 10;
+ dt = rcu_dereference(mdev->ldev->disk_conf)->disk_timeout * HZ / 10;
put_ldev(mdev);
}
+ rcu_read_unlock();
+
et = min_not_zero(dt, ent);
if (!et)
@@ -1282,17 +1226,14 @@
now = jiffies;
- spin_lock_irq(&mdev->req_lock);
- le = &mdev->oldest_tle->requests;
- if (list_empty(le)) {
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&tconn->req_lock);
+ req = find_oldest_request(tconn);
+ if (!req) {
+ spin_unlock_irq(&tconn->req_lock);
mod_timer(&mdev->request_timer, now + et);
return;
}
- le = le->prev;
- req = list_entry(le, struct drbd_request, tl_requests);
-
/* The request is considered timed out, if
* - we have some effective timeout from the configuration,
* with above state restrictions applied,
@@ -1311,17 +1252,17 @@
*/
if (ent && req->rq_state & RQ_NET_PENDING &&
time_after(now, req->start_time + ent) &&
- !time_in_range(now, mdev->last_reconnect_jif, mdev->last_reconnect_jif + ent)) {
+ !time_in_range(now, tconn->last_reconnect_jif, tconn->last_reconnect_jif + ent)) {
dev_warn(DEV, "Remote failed to finish a request within ko-count * timeout\n");
_drbd_set_state(_NS(mdev, conn, C_TIMEOUT), CS_VERBOSE | CS_HARD, NULL);
}
- if (dt && req->rq_state & RQ_LOCAL_PENDING &&
+ if (dt && req->rq_state & RQ_LOCAL_PENDING && req->w.mdev == mdev &&
time_after(now, req->start_time + dt) &&
!time_in_range(now, mdev->last_reattach_jif, mdev->last_reattach_jif + dt)) {
dev_warn(DEV, "Local backing device failed to meet the disk-timeout\n");
__drbd_chk_io_error(mdev, DRBD_FORCE_DETACH);
}
nt = (time_after(now, req->start_time + et) ? now : req->start_time) + et;
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&tconn->req_lock);
mod_timer(&mdev->request_timer, nt);
}
diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h
index 3d21119..016de6b 100644
--- a/drivers/block/drbd/drbd_req.h
+++ b/drivers/block/drbd/drbd_req.h
@@ -77,40 +77,41 @@
*/
enum drbd_req_event {
- created,
- to_be_send,
- to_be_submitted,
+ CREATED,
+ TO_BE_SENT,
+ TO_BE_SUBMITTED,
/* XXX yes, now I am inconsistent...
* these are not "events" but "actions"
* oh, well... */
- queue_for_net_write,
- queue_for_net_read,
- queue_for_send_oos,
+ QUEUE_FOR_NET_WRITE,
+ QUEUE_FOR_NET_READ,
+ QUEUE_FOR_SEND_OOS,
- send_canceled,
- send_failed,
- handed_over_to_network,
- oos_handed_to_network,
- connection_lost_while_pending,
- read_retry_remote_canceled,
- recv_acked_by_peer,
- write_acked_by_peer,
- write_acked_by_peer_and_sis, /* and set_in_sync */
- conflict_discarded_by_peer,
- neg_acked,
- barrier_acked, /* in protocol A and B */
- data_received, /* (remote read) */
+ SEND_CANCELED,
+ SEND_FAILED,
+ HANDED_OVER_TO_NETWORK,
+ OOS_HANDED_TO_NETWORK,
+ CONNECTION_LOST_WHILE_PENDING,
+ READ_RETRY_REMOTE_CANCELED,
+ RECV_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER,
+ WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
+ CONFLICT_RESOLVED,
+ POSTPONE_WRITE,
+ NEG_ACKED,
+ BARRIER_ACKED, /* in protocol A and B */
+ DATA_RECEIVED, /* (remote read) */
- read_completed_with_error,
- read_ahead_completed_with_error,
- write_completed_with_error,
- abort_disk_io,
- completed_ok,
- resend,
- fail_frozen_disk_io,
- restart_frozen_disk_io,
- nothing, /* for tracing only */
+ READ_COMPLETED_WITH_ERROR,
+ READ_AHEAD_COMPLETED_WITH_ERROR,
+ WRITE_COMPLETED_WITH_ERROR,
+ ABORT_DISK_IO,
+ COMPLETED_OK,
+ RESEND,
+ FAIL_FROZEN_DISK_IO,
+ RESTART_FROZEN_DISK_IO,
+ NOTHING,
};
/* encoding of request states for now. we don't actually need that many bits.
@@ -142,8 +143,8 @@
* recv_ack (B) or implicit "ack" (A),
* still waiting for the barrier ack.
* master_bio may already be completed and invalidated.
- * 11100: write_acked (C),
- * data_received (for remote read, any protocol)
+ * 11100: write acked (C),
+ * data received (for remote read, any protocol)
* or finally the barrier ack has arrived (B,A)...
* request can be freed
* 01100: neg-acked (write, protocol C)
@@ -198,6 +199,22 @@
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
+
+ /* The peer has sent a retry ACK */
+ __RQ_POSTPONED,
+
+ /* would have been completed,
+ * but was not, because of drbd_suspended() */
+ __RQ_COMPLETION_SUSP,
+
+ /* We expect a receive ACK (wire proto B) */
+ __RQ_EXP_RECEIVE_ACK,
+
+ /* We expect a write ACK (wite proto C) */
+ __RQ_EXP_WRITE_ACK,
+
+ /* waiting for a barrier ack, did an extra kref_get */
+ __RQ_EXP_BARR_ACK,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
@@ -219,56 +236,16 @@
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
+#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
+#define RQ_COMPLETION_SUSP (1UL << __RQ_COMPLETION_SUSP)
+#define RQ_EXP_RECEIVE_ACK (1UL << __RQ_EXP_RECEIVE_ACK)
+#define RQ_EXP_WRITE_ACK (1UL << __RQ_EXP_WRITE_ACK)
+#define RQ_EXP_BARR_ACK (1UL << __RQ_EXP_BARR_ACK)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
-#define MR_WRITE_SHIFT 0
-#define MR_WRITE (1 << MR_WRITE_SHIFT)
-#define MR_READ_SHIFT 1
-#define MR_READ (1 << MR_READ_SHIFT)
-
-/* epoch entries */
-static inline
-struct hlist_head *ee_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- BUG_ON(mdev->ee_hash_s == 0);
- return mdev->ee_hash +
- ((unsigned int)(sector>>HT_SHIFT) % mdev->ee_hash_s);
-}
-
-/* transfer log (drbd_request objects) */
-static inline
-struct hlist_head *tl_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- BUG_ON(mdev->tl_hash_s == 0);
- return mdev->tl_hash +
- ((unsigned int)(sector>>HT_SHIFT) % mdev->tl_hash_s);
-}
-
-/* application reads (drbd_request objects) */
-static struct hlist_head *ar_hash_slot(struct drbd_conf *mdev, sector_t sector)
-{
- return mdev->app_reads_hash
- + ((unsigned int)(sector) % APP_R_HSIZE);
-}
-
-/* when we receive the answer for a read request,
- * verify that we actually know about it */
-static inline struct drbd_request *_ar_id_to_req(struct drbd_conf *mdev,
- u64 id, sector_t sector)
-{
- struct hlist_head *slot = ar_hash_slot(mdev, sector);
- struct hlist_node *n;
- struct drbd_request *req;
-
- hlist_for_each_entry(req, n, slot, collision) {
- if ((unsigned long)req == (unsigned long)id) {
- D_ASSERT(req->sector == sector);
- return req;
- }
- }
- return NULL;
-}
+#define MR_WRITE 1
+#define MR_READ 2
static inline void drbd_req_make_private_bio(struct drbd_request *req, struct bio *bio_src)
{
@@ -278,41 +255,10 @@
req->private_bio = bio;
bio->bi_private = req;
- bio->bi_end_io = drbd_endio_pri;
+ bio->bi_end_io = drbd_request_endio;
bio->bi_next = NULL;
}
-static inline struct drbd_request *drbd_req_new(struct drbd_conf *mdev,
- struct bio *bio_src)
-{
- struct drbd_request *req =
- mempool_alloc(drbd_request_mempool, GFP_NOIO);
- if (likely(req)) {
- drbd_req_make_private_bio(req, bio_src);
-
- req->rq_state = bio_data_dir(bio_src) == WRITE ? RQ_WRITE : 0;
- req->mdev = mdev;
- req->master_bio = bio_src;
- req->epoch = 0;
- req->sector = bio_src->bi_sector;
- req->size = bio_src->bi_size;
- INIT_HLIST_NODE(&req->collision);
- INIT_LIST_HEAD(&req->tl_requests);
- INIT_LIST_HEAD(&req->w.list);
- }
- return req;
-}
-
-static inline void drbd_req_free(struct drbd_request *req)
-{
- mempool_free(req, drbd_request_mempool);
-}
-
-static inline int overlaps(sector_t s1, int l1, sector_t s2, int l2)
-{
- return !((s1 + (l1>>9) <= s2) || (s1 >= s2 + (l2>>9)));
-}
-
/* Short lived temporary struct on the stack.
* We could squirrel the error to be returned into
* bio->bi_size, or similar. But that would be too ugly. */
@@ -321,6 +267,7 @@
int error;
};
+extern void drbd_req_destroy(struct kref *kref);
extern void _req_may_be_done(struct drbd_request *req,
struct bio_and_error *m);
extern int __req_mod(struct drbd_request *req, enum drbd_req_event what,
@@ -328,13 +275,17 @@
extern void complete_master_bio(struct drbd_conf *mdev,
struct bio_and_error *m);
extern void request_timer_fn(unsigned long data);
-extern void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what);
+extern void tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
+extern void _tl_restart(struct drbd_tconn *tconn, enum drbd_req_event what);
+
+/* this is in drbd_main.c */
+extern void drbd_restart_request(struct drbd_request *req);
/* use this if you don't want to deal with calling complete_master_bio()
* outside the spinlock, e.g. when walking some list on cleanup. */
static inline int _req_mod(struct drbd_request *req, enum drbd_req_event what)
{
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
@@ -354,13 +305,13 @@
enum drbd_req_event what)
{
unsigned long flags;
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
int rv;
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
rv = __req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
if (m.bio)
complete_master_bio(mdev, &m);
@@ -368,7 +319,7 @@
return rv;
}
-static inline bool drbd_should_do_remote(union drbd_state s)
+static inline bool drbd_should_do_remote(union drbd_dev_state s)
{
return s.pdsk == D_UP_TO_DATE ||
(s.pdsk >= D_INCONSISTENT &&
@@ -378,7 +329,7 @@
That is equivalent since before 96 IO was frozen in the C_WF_BITMAP*
states. */
}
-static inline bool drbd_should_send_oos(union drbd_state s)
+static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s)
{
return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S;
/* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
new file mode 100644
index 0000000..69ef352
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.c
@@ -0,0 +1,1857 @@
+/*
+ drbd_state.c
+
+ This file is part of DRBD by Philipp Reisner and Lars Ellenberg.
+
+ Copyright (C) 2001-2008, LINBIT Information Technologies GmbH.
+ Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>.
+ Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>.
+
+ Thanks to Carter Burden, Bart Grantham and Gennadiy Nerubayev
+ from Logicworks, Inc. for making SDP replication support possible.
+
+ drbd is free software; you can redistribute it and/or modify
+ it under the terms of the GNU General Public License as published by
+ the Free Software Foundation; either version 2, or (at your option)
+ any later version.
+
+ drbd is distributed in the hope that it will be useful,
+ but WITHOUT ANY WARRANTY; without even the implied warranty of
+ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ GNU General Public License for more details.
+
+ You should have received a copy of the GNU General Public License
+ along with drbd; see the file COPYING. If not, write to
+ the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include <linux/drbd_limits.h>
+#include "drbd_int.h"
+#include "drbd_req.h"
+
+/* in drbd_main.c */
+extern void tl_abort_disk_io(struct drbd_conf *mdev);
+
+struct after_state_chg_work {
+ struct drbd_work w;
+ union drbd_state os;
+ union drbd_state ns;
+ enum chg_state_flags flags;
+ struct completion *done;
+};
+
+enum sanitize_state_warnings {
+ NO_WARNING,
+ ABORTED_ONLINE_VERIFY,
+ ABORTED_RESYNC,
+ CONNECTION_LOST_NEGOTIATING,
+ IMPLICITLY_UPGRADED_DISK,
+ IMPLICITLY_UPGRADED_PDSK,
+};
+
+static int w_after_state_ch(struct drbd_work *w, int unused);
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags);
+static enum drbd_state_rv is_valid_state(struct drbd_conf *, union drbd_state);
+static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_tconn *);
+static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns);
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns,
+ enum sanitize_state_warnings *warn);
+
+static inline bool is_susp(union drbd_state s)
+{
+ return s.susp || s.susp_nod || s.susp_fen;
+}
+
+bool conn_all_vols_unconf(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ bool rv = true;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (mdev->state.disk != D_DISKLESS ||
+ mdev->state.conn != C_STANDALONE ||
+ mdev->state.role != R_SECONDARY) {
+ rv = false;
+ break;
+ }
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/* Unfortunately the states where not correctly ordered, when
+ they where defined. therefore can not use max_t() here. */
+static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_PRIMARY || role2 == R_PRIMARY)
+ return R_PRIMARY;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_UNKNOWN;
+}
+static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2)
+{
+ if (role1 == R_UNKNOWN || role2 == R_UNKNOWN)
+ return R_UNKNOWN;
+ if (role1 == R_SECONDARY || role2 == R_SECONDARY)
+ return R_SECONDARY;
+ return R_PRIMARY;
+}
+
+enum drbd_role conn_highest_role(struct drbd_tconn *tconn)
+{
+ enum drbd_role role = R_UNKNOWN;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ role = max_role(role, mdev->state.role);
+ rcu_read_unlock();
+
+ return role;
+}
+
+enum drbd_role conn_highest_peer(struct drbd_tconn *tconn)
+{
+ enum drbd_role peer = R_UNKNOWN;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ peer = max_role(peer, mdev->state.peer);
+ rcu_read_unlock();
+
+ return peer;
+}
+
+enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn)
+{
+ enum drbd_disk_state ds = D_DISKLESS;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ ds = max_t(enum drbd_disk_state, ds, mdev->state.disk);
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn)
+{
+ enum drbd_disk_state ds = D_MASK;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ ds = min_t(enum drbd_disk_state, ds, mdev->state.disk);
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn)
+{
+ enum drbd_disk_state ds = D_DISKLESS;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ ds = max_t(enum drbd_disk_state, ds, mdev->state.pdsk);
+ rcu_read_unlock();
+
+ return ds;
+}
+
+enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn)
+{
+ enum drbd_conns conn = C_MASK;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ conn = min_t(enum drbd_conns, conn, mdev->state.conn);
+ rcu_read_unlock();
+
+ return conn;
+}
+
+static bool no_peer_wf_report_params(struct drbd_tconn *tconn)
+{
+ struct drbd_conf *mdev;
+ int vnr;
+ bool rv = true;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr)
+ if (mdev->state.conn == C_WF_REPORT_PARAMS) {
+ rv = false;
+ break;
+ }
+ rcu_read_unlock();
+
+ return rv;
+}
+
+
+/**
+ * cl_wide_st_chg() - true if the state change is a cluster wide one
+ * @mdev: DRBD device.
+ * @os: old (current) state.
+ * @ns: new (wanted) state.
+ */
+static int cl_wide_st_chg(struct drbd_conf *mdev,
+ union drbd_state os, union drbd_state ns)
+{
+ return (os.conn >= C_CONNECTED && ns.conn >= C_CONNECTED &&
+ ((os.role != R_PRIMARY && ns.role == R_PRIMARY) ||
+ (os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S) ||
+ (os.disk != D_FAILED && ns.disk == D_FAILED))) ||
+ (os.conn >= C_CONNECTED && ns.conn == C_DISCONNECTING) ||
+ (os.conn == C_CONNECTED && ns.conn == C_VERIFY_S) ||
+ (os.conn == C_CONNECTED && ns.conn == C_WF_REPORT_PARAMS);
+}
+
+static union drbd_state
+apply_mask_val(union drbd_state os, union drbd_state mask, union drbd_state val)
+{
+ union drbd_state ns;
+ ns.i = (os.i & ~mask.i) | val.i;
+ return ns;
+}
+
+enum drbd_state_rv
+drbd_change_state(struct drbd_conf *mdev, enum chg_state_flags f,
+ union drbd_state mask, union drbd_state val)
+{
+ unsigned long flags;
+ union drbd_state ns;
+ enum drbd_state_rv rv;
+
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(mdev), mask, val);
+ rv = _drbd_set_state(mdev, ns, f, NULL);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_force_state() - Impose a change which happens outside our control on our state
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ */
+void drbd_force_state(struct drbd_conf *mdev,
+ union drbd_state mask, union drbd_state val)
+{
+ drbd_change_state(mdev, CS_HARD, mask, val);
+}
+
+static enum drbd_state_rv
+_req_st_cond(struct drbd_conf *mdev, union drbd_state mask,
+ union drbd_state val)
+{
+ union drbd_state os, ns;
+ unsigned long flags;
+ enum drbd_state_rv rv;
+
+ if (test_and_clear_bit(CL_ST_CHG_SUCCESS, &mdev->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CL_ST_CHG_FAIL, &mdev->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ os = drbd_read_state(mdev);
+ ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+
+ if (!cl_wide_st_chg(mdev, os, ns))
+ rv = SS_CW_NO_NEED;
+ if (rv == SS_UNKNOWN_ERROR) {
+ rv = is_valid_state(mdev, ns);
+ if (rv >= SS_SUCCESS) {
+ rv = is_valid_soft_transition(os, ns, mdev->tconn);
+ if (rv >= SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+ }
+ }
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
+
+ return rv;
+}
+
+/**
+ * drbd_req_state() - Perform an eventually cluster wide state change
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Should not be called directly, use drbd_request_state() or
+ * _drbd_request_state().
+ */
+static enum drbd_state_rv
+drbd_req_state(struct drbd_conf *mdev, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ struct completion done;
+ unsigned long flags;
+ union drbd_state os, ns;
+ enum drbd_state_rv rv;
+
+ init_completion(&done);
+
+ if (f & CS_SERIALIZE)
+ mutex_lock(mdev->state_mutex);
+
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ os = drbd_read_state(mdev);
+ ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS) {
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
+ goto abort;
+ }
+
+ if (cl_wide_st_chg(mdev, os, ns)) {
+ rv = is_valid_state(mdev, ns);
+ if (rv == SS_SUCCESS)
+ rv = is_valid_soft_transition(os, ns, mdev->tconn);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+
+ if (drbd_send_state_req(mdev, mask, val)) {
+ rv = SS_CW_FAILED_BY_PEER;
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+
+ wait_event(mdev->state_wait,
+ (rv = _req_st_cond(mdev, mask, val)));
+
+ if (rv < SS_SUCCESS) {
+ if (f & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ goto abort;
+ }
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ ns = apply_mask_val(drbd_read_state(mdev), mask, val);
+ rv = _drbd_set_state(mdev, ns, f, &done);
+ } else {
+ rv = _drbd_set_state(mdev, ns, f, &done);
+ }
+
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
+
+ if (f & CS_WAIT_COMPLETE && rv == SS_SUCCESS) {
+ D_ASSERT(current != mdev->tconn->worker.task);
+ wait_for_completion(&done);
+ }
+
+abort:
+ if (f & CS_SERIALIZE)
+ mutex_unlock(mdev->state_mutex);
+
+ return rv;
+}
+
+/**
+ * _drbd_request_state() - Request a state change (with flags)
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ * @f: flags
+ *
+ * Cousin of drbd_request_state(), useful with the CS_WAIT_COMPLETE
+ * flag, or when logging of failed state change requests is not desired.
+ */
+enum drbd_state_rv
+_drbd_request_state(struct drbd_conf *mdev, union drbd_state mask,
+ union drbd_state val, enum chg_state_flags f)
+{
+ enum drbd_state_rv rv;
+
+ wait_event(mdev->state_wait,
+ (rv = drbd_req_state(mdev, mask, val, f)) != SS_IN_TRANSIENT_STATE);
+
+ return rv;
+}
+
+static void print_st(struct drbd_conf *mdev, char *name, union drbd_state ns)
+{
+ dev_err(DEV, " %s = { cs:%s ro:%s/%s ds:%s/%s %c%c%c%c%c%c }\n",
+ name,
+ drbd_conn_str(ns.conn),
+ drbd_role_str(ns.role),
+ drbd_role_str(ns.peer),
+ drbd_disk_str(ns.disk),
+ drbd_disk_str(ns.pdsk),
+ is_susp(ns) ? 's' : 'r',
+ ns.aftr_isp ? 'a' : '-',
+ ns.peer_isp ? 'p' : '-',
+ ns.user_isp ? 'u' : '-',
+ ns.susp_fen ? 'F' : '-',
+ ns.susp_nod ? 'N' : '-'
+ );
+}
+
+void print_st_err(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, enum drbd_state_rv err)
+{
+ if (err == SS_IN_TRANSIENT_STATE)
+ return;
+ dev_err(DEV, "State change failed: %s\n", drbd_set_st_err_str(err));
+ print_st(mdev, " state", os);
+ print_st(mdev, "wanted", ns);
+}
+
+static long print_state_change(char *pb, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char *pbp;
+ pbp = pb;
+ *pbp = 0;
+
+ if (ns.role != os.role && flags & CS_DC_ROLE)
+ pbp += sprintf(pbp, "role( %s -> %s ) ",
+ drbd_role_str(os.role),
+ drbd_role_str(ns.role));
+ if (ns.peer != os.peer && flags & CS_DC_PEER)
+ pbp += sprintf(pbp, "peer( %s -> %s ) ",
+ drbd_role_str(os.peer),
+ drbd_role_str(ns.peer));
+ if (ns.conn != os.conn && flags & CS_DC_CONN)
+ pbp += sprintf(pbp, "conn( %s -> %s ) ",
+ drbd_conn_str(os.conn),
+ drbd_conn_str(ns.conn));
+ if (ns.disk != os.disk && flags & CS_DC_DISK)
+ pbp += sprintf(pbp, "disk( %s -> %s ) ",
+ drbd_disk_str(os.disk),
+ drbd_disk_str(ns.disk));
+ if (ns.pdsk != os.pdsk && flags & CS_DC_PDSK)
+ pbp += sprintf(pbp, "pdsk( %s -> %s ) ",
+ drbd_disk_str(os.pdsk),
+ drbd_disk_str(ns.pdsk));
+
+ return pbp - pb;
+}
+
+static void drbd_pr_state_change(struct drbd_conf *mdev, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags ^ CS_DC_MASK);
+
+ if (ns.aftr_isp != os.aftr_isp)
+ pbp += sprintf(pbp, "aftr_isp( %d -> %d ) ",
+ os.aftr_isp,
+ ns.aftr_isp);
+ if (ns.peer_isp != os.peer_isp)
+ pbp += sprintf(pbp, "peer_isp( %d -> %d ) ",
+ os.peer_isp,
+ ns.peer_isp);
+ if (ns.user_isp != os.user_isp)
+ pbp += sprintf(pbp, "user_isp( %d -> %d ) ",
+ os.user_isp,
+ ns.user_isp);
+
+ if (pbp != pb)
+ dev_info(DEV, "%s\n", pb);
+}
+
+static void conn_pr_state_change(struct drbd_tconn *tconn, union drbd_state os, union drbd_state ns,
+ enum chg_state_flags flags)
+{
+ char pb[300];
+ char *pbp = pb;
+
+ pbp += print_state_change(pbp, os, ns, flags);
+
+ if (is_susp(ns) != is_susp(os) && flags & CS_DC_SUSP)
+ pbp += sprintf(pbp, "susp( %d -> %d ) ",
+ is_susp(os),
+ is_susp(ns));
+
+ if (pbp != pb)
+ conn_info(tconn, "%s\n", pb);
+}
+
+
+/**
+ * is_valid_state() - Returns an SS_ error code if ns is not valid
+ * @mdev: DRBD device.
+ * @ns: State to consider.
+ */
+static enum drbd_state_rv
+is_valid_state(struct drbd_conf *mdev, union drbd_state ns)
+{
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ enum drbd_fencing_p fp;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct net_conf *nc;
+
+ rcu_read_lock();
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+ put_ldev(mdev);
+ }
+
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ if (nc) {
+ if (!nc->two_primaries && ns.role == R_PRIMARY) {
+ if (ns.peer == R_PRIMARY)
+ rv = SS_TWO_PRIMARIES;
+ else if (conn_highest_peer(mdev->tconn) == R_PRIMARY)
+ rv = SS_O_VOL_PEER_PRI;
+ }
+ }
+
+ if (rv <= 0)
+ /* already found a reason to abort */;
+ else if (ns.role == R_SECONDARY && mdev->open_cnt)
+ rv = SS_DEVICE_IN_USE;
+
+ else if (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.disk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (fp >= FP_RESOURCE &&
+ ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk >= D_UNKNOWN)
+ rv = SS_PRIMARY_NOP;
+
+ else if (ns.role == R_PRIMARY && ns.disk <= D_INCONSISTENT && ns.pdsk <= D_INCONSISTENT)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_INCONSISTENT)
+ rv = SS_NO_LOCAL_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.pdsk < D_INCONSISTENT)
+ rv = SS_NO_REMOTE_DISK;
+
+ else if (ns.conn > C_CONNECTED && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE)
+ rv = SS_NO_UP_TO_DATE_DISK;
+
+ else if ((ns.conn == C_CONNECTED ||
+ ns.conn == C_WF_BITMAP_S ||
+ ns.conn == C_SYNC_SOURCE ||
+ ns.conn == C_PAUSED_SYNC_S) &&
+ ns.disk == D_OUTDATED)
+ rv = SS_CONNECTED_OUTDATES;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ (nc->verify_alg[0] == 0))
+ rv = SS_NO_VERIFY_ALG;
+
+ else if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ mdev->tconn->agreed_pro_version < 88)
+ rv = SS_NOT_SUPPORTED;
+
+ else if (ns.conn >= C_CONNECTED && ns.pdsk == D_UNKNOWN)
+ rv = SS_CONNECTED_OUTDATES;
+
+ rcu_read_unlock();
+
+ return rv;
+}
+
+/**
+ * is_valid_soft_transition() - Returns an SS_ error code if the state transition is not possible
+ * This function limits state transitions that may be declined by DRBD. I.e.
+ * user requests (aka soft transitions).
+ * @mdev: DRBD device.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_soft_transition(union drbd_state os, union drbd_state ns, struct drbd_tconn *tconn)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+
+ if ((ns.conn == C_STARTING_SYNC_T || ns.conn == C_STARTING_SYNC_S) &&
+ os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_STANDALONE)
+ rv = SS_ALREADY_STANDALONE;
+
+ if (ns.disk > D_ATTACHING && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ if (ns.conn == C_WF_CONNECTION && os.conn < C_UNCONNECTED)
+ rv = SS_NO_NET_CONFIG;
+
+ if (ns.disk == D_OUTDATED && os.disk < D_OUTDATED && os.disk != D_ATTACHING)
+ rv = SS_LOWER_THAN_OUTDATED;
+
+ if (ns.conn == C_DISCONNECTING && os.conn == C_UNCONNECTED)
+ rv = SS_IN_TRANSIENT_STATE;
+
+ /* if (ns.conn == os.conn && ns.conn == C_WF_REPORT_PARAMS)
+ rv = SS_IN_TRANSIENT_STATE; */
+
+ /* While establishing a connection only allow cstate to change.
+ Delay/refuse role changes, detach attach etc... */
+ if (test_bit(STATE_SENT, &tconn->flags) &&
+ !(os.conn == C_WF_REPORT_PARAMS ||
+ (ns.conn == C_WF_REPORT_PARAMS && os.conn == C_WF_CONNECTION)))
+ rv = SS_IN_TRANSIENT_STATE;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) && os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T) &&
+ ns.conn != os.conn && os.conn > C_CONNECTED)
+ rv = SS_RESYNC_RUNNING;
+
+ if ((ns.conn == C_STARTING_SYNC_S || ns.conn == C_STARTING_SYNC_T) &&
+ os.conn < C_CONNECTED)
+ rv = SS_NEED_CONNECTION;
+
+ if ((ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)
+ && os.conn < C_WF_REPORT_PARAMS)
+ rv = SS_NEED_CONNECTION; /* No NetworkFailure -> SyncTarget etc... */
+
+ return rv;
+}
+
+static enum drbd_state_rv
+is_valid_conn_transition(enum drbd_conns oc, enum drbd_conns nc)
+{
+ /* no change -> nothing to do, at least for the connection part */
+ if (oc == nc)
+ return SS_NOTHING_TO_DO;
+
+ /* disconnect of an unconfigured connection does not make sense */
+ if (oc == C_STANDALONE && nc == C_DISCONNECTING)
+ return SS_ALREADY_STANDALONE;
+
+ /* from C_STANDALONE, we start with C_UNCONNECTED */
+ if (oc == C_STANDALONE && nc != C_UNCONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* When establishing a connection we need to go through WF_REPORT_PARAMS!
+ Necessary to do the right thing upon invalidate-remote on a disconnected resource */
+ if (oc < C_WF_REPORT_PARAMS && nc >= C_CONNECTED)
+ return SS_NEED_CONNECTION;
+
+ /* After a network error only C_UNCONNECTED or C_DISCONNECTING may follow. */
+ if (oc >= C_TIMEOUT && oc <= C_TEAR_DOWN && nc != C_UNCONNECTED && nc != C_DISCONNECTING)
+ return SS_IN_TRANSIENT_STATE;
+
+ /* After C_DISCONNECTING only C_STANDALONE may follow */
+ if (oc == C_DISCONNECTING && nc != C_STANDALONE)
+ return SS_IN_TRANSIENT_STATE;
+
+ return SS_SUCCESS;
+}
+
+
+/**
+ * is_valid_transition() - Returns an SS_ error code if the state transition is not possible
+ * This limits hard state transitions. Hard state transitions are facts there are
+ * imposed on DRBD by the environment. E.g. disk broke or network broke down.
+ * But those hard state transitions are still not allowed to do everything.
+ * @ns: new state.
+ * @os: old state.
+ */
+static enum drbd_state_rv
+is_valid_transition(union drbd_state os, union drbd_state ns)
+{
+ enum drbd_state_rv rv;
+
+ rv = is_valid_conn_transition(os.conn, ns.conn);
+
+ /* we cannot fail (again) if we already detached */
+ if (ns.disk == D_FAILED && os.disk == D_DISKLESS)
+ rv = SS_IS_DISKLESS;
+
+ return rv;
+}
+
+static void print_sanitize_warnings(struct drbd_conf *mdev, enum sanitize_state_warnings warn)
+{
+ static const char *msg_table[] = {
+ [NO_WARNING] = "",
+ [ABORTED_ONLINE_VERIFY] = "Online-verify aborted.",
+ [ABORTED_RESYNC] = "Resync aborted.",
+ [CONNECTION_LOST_NEGOTIATING] = "Connection lost while negotiating, no data!",
+ [IMPLICITLY_UPGRADED_DISK] = "Implicitly upgraded disk",
+ [IMPLICITLY_UPGRADED_PDSK] = "Implicitly upgraded pdsk",
+ };
+
+ if (warn != NO_WARNING)
+ dev_warn(DEV, "%s\n", msg_table[warn]);
+}
+
+/**
+ * sanitize_state() - Resolves implicitly necessary additional changes to a state transition
+ * @mdev: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @warn_sync_abort:
+ *
+ * When we loose connection, we have to set the state of the peers disk (pdsk)
+ * to D_UNKNOWN. This rule and many more along those lines are in this function.
+ */
+static union drbd_state sanitize_state(struct drbd_conf *mdev, union drbd_state ns,
+ enum sanitize_state_warnings *warn)
+{
+ enum drbd_fencing_p fp;
+ enum drbd_disk_state disk_min, disk_max, pdsk_min, pdsk_max;
+
+ if (warn)
+ *warn = NO_WARNING;
+
+ fp = FP_DONT_CARE;
+ if (get_ldev(mdev)) {
+ rcu_read_lock();
+ fp = rcu_dereference(mdev->ldev->disk_conf)->fencing;
+ rcu_read_unlock();
+ put_ldev(mdev);
+ }
+
+ /* Implications from connection to peer and peer_isp */
+ if (ns.conn < C_CONNECTED) {
+ ns.peer_isp = 0;
+ ns.peer = R_UNKNOWN;
+ if (ns.pdsk > D_UNKNOWN || ns.pdsk < D_INCONSISTENT)
+ ns.pdsk = D_UNKNOWN;
+ }
+
+ /* Clear the aftr_isp when becoming unconfigured */
+ if (ns.conn == C_STANDALONE && ns.disk == D_DISKLESS && ns.role == R_SECONDARY)
+ ns.aftr_isp = 0;
+
+ /* An implication of the disk states onto the connection state */
+ /* Abort resync if a disk fails/detaches */
+ if (ns.conn > C_CONNECTED && (ns.disk <= D_FAILED || ns.pdsk <= D_FAILED)) {
+ if (warn)
+ *warn = ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T ?
+ ABORTED_ONLINE_VERIFY : ABORTED_RESYNC;
+ ns.conn = C_CONNECTED;
+ }
+
+ /* Connection breaks down before we finished "Negotiating" */
+ if (ns.conn < C_CONNECTED && ns.disk == D_NEGOTIATING &&
+ get_ldev_if_state(mdev, D_NEGOTIATING)) {
+ if (mdev->ed_uuid == mdev->ldev->md.uuid[UI_CURRENT]) {
+ ns.disk = mdev->new_state_tmp.disk;
+ ns.pdsk = mdev->new_state_tmp.pdsk;
+ } else {
+ if (warn)
+ *warn = CONNECTION_LOST_NEGOTIATING;
+ ns.disk = D_DISKLESS;
+ ns.pdsk = D_UNKNOWN;
+ }
+ put_ldev(mdev);
+ }
+
+ /* D_CONSISTENT and D_OUTDATED vanish when we get connected */
+ if (ns.conn >= C_CONNECTED && ns.conn < C_AHEAD) {
+ if (ns.disk == D_CONSISTENT || ns.disk == D_OUTDATED)
+ ns.disk = D_UP_TO_DATE;
+ if (ns.pdsk == D_CONSISTENT || ns.pdsk == D_OUTDATED)
+ ns.pdsk = D_UP_TO_DATE;
+ }
+
+ /* Implications of the connection stat on the disk states */
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_UNKNOWN;
+ switch ((enum drbd_conns)ns.conn) {
+ case C_WF_BITMAP_T:
+ case C_PAUSED_SYNC_T:
+ case C_STARTING_SYNC_T:
+ case C_WF_SYNC_UUID:
+ case C_BEHIND:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_OUTDATED;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_VERIFY_S:
+ case C_VERIFY_T:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_CONNECTED:
+ disk_min = D_DISKLESS;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_DISKLESS;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_WF_BITMAP_S:
+ case C_PAUSED_SYNC_S:
+ case C_STARTING_SYNC_S:
+ case C_AHEAD:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_CONSISTENT; /* D_OUTDATED would be nice. But explicit outdate necessary*/
+ break;
+ case C_SYNC_TARGET:
+ disk_min = D_INCONSISTENT;
+ disk_max = D_INCONSISTENT;
+ pdsk_min = D_UP_TO_DATE;
+ pdsk_max = D_UP_TO_DATE;
+ break;
+ case C_SYNC_SOURCE:
+ disk_min = D_UP_TO_DATE;
+ disk_max = D_UP_TO_DATE;
+ pdsk_min = D_INCONSISTENT;
+ pdsk_max = D_INCONSISTENT;
+ break;
+ case C_STANDALONE:
+ case C_DISCONNECTING:
+ case C_UNCONNECTED:
+ case C_TIMEOUT:
+ case C_BROKEN_PIPE:
+ case C_NETWORK_FAILURE:
+ case C_PROTOCOL_ERROR:
+ case C_TEAR_DOWN:
+ case C_WF_CONNECTION:
+ case C_WF_REPORT_PARAMS:
+ case C_MASK:
+ break;
+ }
+ if (ns.disk > disk_max)
+ ns.disk = disk_max;
+
+ if (ns.disk < disk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_DISK;
+ ns.disk = disk_min;
+ }
+ if (ns.pdsk > pdsk_max)
+ ns.pdsk = pdsk_max;
+
+ if (ns.pdsk < pdsk_min) {
+ if (warn)
+ *warn = IMPLICITLY_UPGRADED_PDSK;
+ ns.pdsk = pdsk_min;
+ }
+
+ if (fp == FP_STONITH &&
+ (ns.role == R_PRIMARY && ns.conn < C_CONNECTED && ns.pdsk > D_OUTDATED))
+ ns.susp_fen = 1; /* Suspend IO while fence-peer handler runs (peer lost) */
+
+ if (mdev->tconn->res_opts.on_no_data == OND_SUSPEND_IO &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ ns.susp_nod = 1; /* Suspend IO while no data available (no accessible data available) */
+
+ if (ns.aftr_isp || ns.peer_isp || ns.user_isp) {
+ if (ns.conn == C_SYNC_SOURCE)
+ ns.conn = C_PAUSED_SYNC_S;
+ if (ns.conn == C_SYNC_TARGET)
+ ns.conn = C_PAUSED_SYNC_T;
+ } else {
+ if (ns.conn == C_PAUSED_SYNC_S)
+ ns.conn = C_SYNC_SOURCE;
+ if (ns.conn == C_PAUSED_SYNC_T)
+ ns.conn = C_SYNC_TARGET;
+ }
+
+ return ns;
+}
+
+void drbd_resume_al(struct drbd_conf *mdev)
+{
+ if (test_and_clear_bit(AL_SUSPENDED, &mdev->flags))
+ dev_info(DEV, "Resumed AL updates\n");
+}
+
+/* helper for __drbd_set_state */
+static void set_ov_position(struct drbd_conf *mdev, enum drbd_conns cs)
+{
+ if (mdev->tconn->agreed_pro_version < 90)
+ mdev->ov_start_sector = 0;
+ mdev->rs_total = drbd_bm_bits(mdev);
+ mdev->ov_position = 0;
+ if (cs == C_VERIFY_T) {
+ /* starting online verify from an arbitrary position
+ * does not fit well into the existing protocol.
+ * on C_VERIFY_T, we initialize ov_left and friends
+ * implicitly in receive_DataRequest once the
+ * first P_OV_REQUEST is received */
+ mdev->ov_start_sector = ~(sector_t)0;
+ } else {
+ unsigned long bit = BM_SECT_TO_BIT(mdev->ov_start_sector);
+ if (bit >= mdev->rs_total) {
+ mdev->ov_start_sector =
+ BM_BIT_TO_SECT(mdev->rs_total - 1);
+ mdev->rs_total = 1;
+ } else
+ mdev->rs_total -= bit;
+ mdev->ov_position = mdev->ov_start_sector;
+ }
+ mdev->ov_left = mdev->rs_total;
+}
+
+/**
+ * __drbd_set_state() - Set a new DRBD state
+ * @mdev: DRBD device.
+ * @ns: new state.
+ * @flags: Flags
+ * @done: Optional completion, that will get completed after the after_state_ch() finished
+ *
+ * Caller needs to hold req_lock, and global_state_lock. Do not call directly.
+ */
+enum drbd_state_rv
+__drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
+ enum chg_state_flags flags, struct completion *done)
+{
+ union drbd_state os;
+ enum drbd_state_rv rv = SS_SUCCESS;
+ enum sanitize_state_warnings ssw;
+ struct after_state_chg_work *ascw;
+
+ os = drbd_read_state(mdev);
+
+ ns = sanitize_state(mdev, ns, &ssw);
+ if (ns.i == os.i)
+ return SS_NOTHING_TO_DO;
+
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS)
+ return rv;
+
+ if (!(flags & CS_HARD)) {
+ /* pre-state-change checks ; only look at ns */
+ /* See drbd_state_sw_errors in drbd_strings.c */
+
+ rv = is_valid_state(mdev, ns);
+ if (rv < SS_SUCCESS) {
+ /* If the old state was illegal as well, then let
+ this happen...*/
+
+ if (is_valid_state(mdev, os) == rv)
+ rv = is_valid_soft_transition(os, ns, mdev->tconn);
+ } else
+ rv = is_valid_soft_transition(os, ns, mdev->tconn);
+ }
+
+ if (rv < SS_SUCCESS) {
+ if (flags & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+ return rv;
+ }
+
+ print_sanitize_warnings(mdev, ssw);
+
+ drbd_pr_state_change(mdev, os, ns, flags);
+
+ /* Display changes to the susp* flags that where caused by the call to
+ sanitize_state(). Only display it here if we where not called from
+ _conn_request_state() */
+ if (!(flags & CS_DC_SUSP))
+ conn_pr_state_change(mdev->tconn, os, ns, (flags & ~CS_DC_MASK) | CS_DC_SUSP);
+
+ /* if we are going -> D_FAILED or D_DISKLESS, grab one extra reference
+ * on the ldev here, to be sure the transition -> D_DISKLESS resp.
+ * drbd_ldev_destroy() won't happen before our corresponding
+ * after_state_ch works run, where we put_ldev again. */
+ if ((os.disk != D_FAILED && ns.disk == D_FAILED) ||
+ (os.disk != D_DISKLESS && ns.disk == D_DISKLESS))
+ atomic_inc(&mdev->local_cnt);
+
+ mdev->state.i = ns.i;
+ mdev->tconn->susp = ns.susp;
+ mdev->tconn->susp_nod = ns.susp_nod;
+ mdev->tconn->susp_fen = ns.susp_fen;
+
+ if (os.disk == D_ATTACHING && ns.disk >= D_NEGOTIATING)
+ drbd_print_uuids(mdev, "attached to UUIDs");
+
+ /* Wake up role changes, that were delayed because of connection establishing */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn != C_WF_REPORT_PARAMS &&
+ no_peer_wf_report_params(mdev->tconn))
+ clear_bit(STATE_SENT, &mdev->tconn->flags);
+
+ wake_up(&mdev->misc_wait);
+ wake_up(&mdev->state_wait);
+ wake_up(&mdev->tconn->ping_wait);
+
+ /* Aborted verify run, or we reached the stop sector.
+ * Log the last position, unless end-of-device. */
+ if ((os.conn == C_VERIFY_S || os.conn == C_VERIFY_T) &&
+ ns.conn <= C_CONNECTED) {
+ mdev->ov_start_sector =
+ BM_BIT_TO_SECT(drbd_bm_bits(mdev) - mdev->ov_left);
+ if (mdev->ov_left)
+ dev_info(DEV, "Online Verify reached sector %llu\n",
+ (unsigned long long)mdev->ov_start_sector);
+ }
+
+ if ((os.conn == C_PAUSED_SYNC_T || os.conn == C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_TARGET || ns.conn == C_SYNC_SOURCE)) {
+ dev_info(DEV, "Syncer continues.\n");
+ mdev->rs_paused += (long)jiffies
+ -(long)mdev->rs_mark_time[mdev->rs_last_mark];
+ if (ns.conn == C_SYNC_TARGET)
+ mod_timer(&mdev->resync_timer, jiffies);
+ }
+
+ if ((os.conn == C_SYNC_TARGET || os.conn == C_SYNC_SOURCE) &&
+ (ns.conn == C_PAUSED_SYNC_T || ns.conn == C_PAUSED_SYNC_S)) {
+ dev_info(DEV, "Resync suspended\n");
+ mdev->rs_mark_time[mdev->rs_last_mark] = jiffies;
+ }
+
+ if (os.conn == C_CONNECTED &&
+ (ns.conn == C_VERIFY_S || ns.conn == C_VERIFY_T)) {
+ unsigned long now = jiffies;
+ int i;
+
+ set_ov_position(mdev, ns.conn);
+ mdev->rs_start = now;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
+ mdev->ov_last_oos_size = 0;
+ mdev->ov_last_oos_start = 0;
+
+ for (i = 0; i < DRBD_SYNC_MARKS; i++) {
+ mdev->rs_mark_left[i] = mdev->ov_left;
+ mdev->rs_mark_time[i] = now;
+ }
+
+ drbd_rs_controller_reset(mdev);
+
+ if (ns.conn == C_VERIFY_S) {
+ dev_info(DEV, "Starting Online Verify from sector %llu\n",
+ (unsigned long long)mdev->ov_position);
+ mod_timer(&mdev->resync_timer, jiffies);
+ }
+ }
+
+ if (get_ldev(mdev)) {
+ u32 mdf = mdev->ldev->md.flags & ~(MDF_CONSISTENT|MDF_PRIMARY_IND|
+ MDF_CONNECTED_IND|MDF_WAS_UP_TO_DATE|
+ MDF_PEER_OUT_DATED|MDF_CRASHED_PRIMARY);
+
+ mdf &= ~MDF_AL_CLEAN;
+ if (test_bit(CRASHED_PRIMARY, &mdev->flags))
+ mdf |= MDF_CRASHED_PRIMARY;
+ if (mdev->state.role == R_PRIMARY ||
+ (mdev->state.pdsk < D_INCONSISTENT && mdev->state.peer == R_PRIMARY))
+ mdf |= MDF_PRIMARY_IND;
+ if (mdev->state.conn > C_WF_REPORT_PARAMS)
+ mdf |= MDF_CONNECTED_IND;
+ if (mdev->state.disk > D_INCONSISTENT)
+ mdf |= MDF_CONSISTENT;
+ if (mdev->state.disk > D_OUTDATED)
+ mdf |= MDF_WAS_UP_TO_DATE;
+ if (mdev->state.pdsk <= D_OUTDATED && mdev->state.pdsk >= D_INCONSISTENT)
+ mdf |= MDF_PEER_OUT_DATED;
+ if (mdf != mdev->ldev->md.flags) {
+ mdev->ldev->md.flags = mdf;
+ drbd_md_mark_dirty(mdev);
+ }
+ if (os.disk < D_CONSISTENT && ns.disk >= D_CONSISTENT)
+ drbd_set_ed_uuid(mdev, mdev->ldev->md.uuid[UI_CURRENT]);
+ put_ldev(mdev);
+ }
+
+ /* Peer was forced D_UP_TO_DATE & R_PRIMARY, consider to resync */
+ if (os.disk == D_INCONSISTENT && os.pdsk == D_INCONSISTENT &&
+ os.peer == R_SECONDARY && ns.peer == R_PRIMARY)
+ set_bit(CONSIDER_RESYNC, &mdev->flags);
+
+ /* Receiver should clean up itself */
+ if (os.conn != C_DISCONNECTING && ns.conn == C_DISCONNECTING)
+ drbd_thread_stop_nowait(&mdev->tconn->receiver);
+
+ /* Now the receiver finished cleaning up itself, it should die */
+ if (os.conn != C_STANDALONE && ns.conn == C_STANDALONE)
+ drbd_thread_stop_nowait(&mdev->tconn->receiver);
+
+ /* Upon network failure, we need to restart the receiver. */
+ if (os.conn > C_WF_CONNECTION &&
+ ns.conn <= C_TEAR_DOWN && ns.conn >= C_TIMEOUT)
+ drbd_thread_restart_nowait(&mdev->tconn->receiver);
+
+ /* Resume AL writing if we get a connection */
+ if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+ drbd_resume_al(mdev);
+
+ /* remember last attach time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ ns.disk > D_NEGOTIATING)
+ mdev->last_reattach_jif = jiffies;
+
+ ascw = kmalloc(sizeof(*ascw), GFP_ATOMIC);
+ if (ascw) {
+ ascw->os = os;
+ ascw->ns = ns;
+ ascw->flags = flags;
+ ascw->w.cb = w_after_state_ch;
+ ascw->w.mdev = mdev;
+ ascw->done = done;
+ drbd_queue_work(&mdev->tconn->sender_work, &ascw->w);
+ } else {
+ dev_err(DEV, "Could not kmalloc an ascw\n");
+ }
+
+ return rv;
+}
+
+static int w_after_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_state_chg_work *ascw =
+ container_of(w, struct after_state_chg_work, w);
+ struct drbd_conf *mdev = w->mdev;
+
+ after_state_ch(mdev, ascw->os, ascw->ns, ascw->flags);
+ if (ascw->flags & CS_WAIT_COMPLETE) {
+ D_ASSERT(ascw->done != NULL);
+ complete(ascw->done);
+ }
+ kfree(ascw);
+
+ return 0;
+}
+
+static void abw_start_sync(struct drbd_conf *mdev, int rv)
+{
+ if (rv) {
+ dev_err(DEV, "Writing the bitmap failed not starting resync.\n");
+ _drbd_request_state(mdev, NS(conn, C_CONNECTED), CS_VERBOSE);
+ return;
+ }
+
+ switch (mdev->state.conn) {
+ case C_STARTING_SYNC_T:
+ _drbd_request_state(mdev, NS(conn, C_WF_SYNC_UUID), CS_VERBOSE);
+ break;
+ case C_STARTING_SYNC_S:
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ break;
+ }
+}
+
+int drbd_bitmap_io_from_worker(struct drbd_conf *mdev,
+ int (*io_fn)(struct drbd_conf *),
+ char *why, enum bm_flag flags)
+{
+ int rv;
+
+ D_ASSERT(current == mdev->tconn->worker.task);
+
+ /* open coded non-blocking drbd_suspend_io(mdev); */
+ set_bit(SUSPEND_IO, &mdev->flags);
+
+ drbd_bm_lock(mdev, why, flags);
+ rv = io_fn(mdev);
+ drbd_bm_unlock(mdev);
+
+ drbd_resume_io(mdev);
+
+ return rv;
+}
+
+/**
+ * after_state_ch() - Perform after state change actions that may sleep
+ * @mdev: DRBD device.
+ * @os: old state.
+ * @ns: new state.
+ * @flags: Flags
+ */
+static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
+ union drbd_state ns, enum chg_state_flags flags)
+{
+ struct sib_info sib;
+
+ sib.sib_reason = SIB_STATE_CHANGE;
+ sib.os = os;
+ sib.ns = ns;
+
+ if (os.conn != C_CONNECTED && ns.conn == C_CONNECTED) {
+ clear_bit(CRASHED_PRIMARY, &mdev->flags);
+ if (mdev->p_uuid)
+ mdev->p_uuid[UI_FLAGS] &= ~((u64)2);
+ }
+
+ /* Inform userspace about the change... */
+ drbd_bcast_event(mdev, &sib);
+
+ if (!(os.role == R_PRIMARY && os.disk < D_UP_TO_DATE && os.pdsk < D_UP_TO_DATE) &&
+ (ns.role == R_PRIMARY && ns.disk < D_UP_TO_DATE && ns.pdsk < D_UP_TO_DATE))
+ drbd_khelper(mdev, "pri-on-incon-degr");
+
+ /* Here we have the actions that are performed after a
+ state change. This function might sleep */
+
+ if (ns.susp_nod) {
+ struct drbd_tconn *tconn = mdev->tconn;
+ enum drbd_req_event what = NOTHING;
+
+ spin_lock_irq(&tconn->req_lock);
+ if (os.conn < C_CONNECTED && conn_lowest_conn(tconn) >= C_CONNECTED)
+ what = RESEND;
+
+ if ((os.disk == D_ATTACHING || os.disk == D_NEGOTIATING) &&
+ conn_lowest_disk(tconn) > D_NEGOTIATING)
+ what = RESTART_FROZEN_DISK_IO;
+
+ if (tconn->susp_nod && what != NOTHING) {
+ _tl_restart(tconn, what);
+ _conn_request_state(tconn,
+ (union drbd_state) { { .susp_nod = 1 } },
+ (union drbd_state) { { .susp_nod = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&tconn->req_lock);
+ }
+
+ if (ns.susp_fen) {
+ struct drbd_tconn *tconn = mdev->tconn;
+
+ spin_lock_irq(&tconn->req_lock);
+ if (tconn->susp_fen && conn_lowest_conn(tconn) >= C_CONNECTED) {
+ /* case2: The connection was established again: */
+ struct drbd_conf *odev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, odev, vnr)
+ clear_bit(NEW_CUR_UUID, &odev->flags);
+ rcu_read_unlock();
+ _tl_restart(tconn, RESEND);
+ _conn_request_state(tconn,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ }
+ spin_unlock_irq(&tconn->req_lock);
+ }
+
+ /* Became sync source. With protocol >= 96, we still need to send out
+ * the sync uuid now. Need to do that before any drbd_send_state, or
+ * the other side may go "paused sync" before receiving the sync uuids,
+ * which is unexpected. */
+ if ((os.conn != C_SYNC_SOURCE && os.conn != C_PAUSED_SYNC_S) &&
+ (ns.conn == C_SYNC_SOURCE || ns.conn == C_PAUSED_SYNC_S) &&
+ mdev->tconn->agreed_pro_version >= 96 && get_ldev(mdev)) {
+ drbd_gen_and_send_sync_uuid(mdev);
+ put_ldev(mdev);
+ }
+
+ /* Do not change the order of the if above and the two below... */
+ if (os.pdsk == D_DISKLESS &&
+ ns.pdsk > D_DISKLESS && ns.pdsk != D_UNKNOWN) { /* attach on the peer */
+ /* we probably will start a resync soon.
+ * make sure those things are properly reset. */
+ mdev->rs_total = 0;
+ mdev->rs_failed = 0;
+ atomic_set(&mdev->rs_pending_cnt, 0);
+ drbd_rs_cancel_all(mdev);
+
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev, ns);
+ }
+ /* No point in queuing send_bitmap if we don't have a connection
+ * anymore, so check also the _current_ state, not only the new state
+ * at the time this work was queued. */
+ if (os.conn != C_WF_BITMAP_S && ns.conn == C_WF_BITMAP_S &&
+ mdev->state.conn == C_WF_BITMAP_S)
+ drbd_queue_bitmap_io(mdev, &drbd_send_bitmap, NULL,
+ "send_bitmap (WFBitMapS)",
+ BM_LOCKED_TEST_ALLOWED);
+
+ /* Lost contact to peer's copy of the data */
+ if ((os.pdsk >= D_INCONSISTENT &&
+ os.pdsk != D_UNKNOWN &&
+ os.pdsk != D_OUTDATED)
+ && (ns.pdsk < D_INCONSISTENT ||
+ ns.pdsk == D_UNKNOWN ||
+ ns.pdsk == D_OUTDATED)) {
+ if (get_ldev(mdev)) {
+ if ((ns.role == R_PRIMARY || ns.peer == R_PRIMARY) &&
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ if (drbd_suspended(mdev)) {
+ set_bit(NEW_CUR_UUID, &mdev->flags);
+ } else {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
+ }
+ put_ldev(mdev);
+ }
+ }
+
+ if (ns.pdsk < D_INCONSISTENT && get_ldev(mdev)) {
+ if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY &&
+ mdev->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) {
+ drbd_uuid_new_current(mdev);
+ drbd_send_uuids(mdev);
+ }
+ /* D_DISKLESS Peer becomes secondary */
+ if (os.peer == R_PRIMARY && ns.peer == R_SECONDARY)
+ /* We may still be Primary ourselves.
+ * No harm done if the bitmap still changes,
+ * redirtied pages will follow later. */
+ drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+ "demote diskless peer", BM_LOCKED_SET_ALLOWED);
+ put_ldev(mdev);
+ }
+
+ /* Write out all changed bits on demote.
+ * Though, no need to da that just yet
+ * if there is a resync going on still */
+ if (os.role == R_PRIMARY && ns.role == R_SECONDARY &&
+ mdev->state.conn <= C_CONNECTED && get_ldev(mdev)) {
+ /* No changes to the bitmap expected this time, so assert that,
+ * even though no harm was done if it did change. */
+ drbd_bitmap_io_from_worker(mdev, &drbd_bm_write,
+ "demote", BM_LOCKED_TEST_ALLOWED);
+ put_ldev(mdev);
+ }
+
+ /* Last part of the attaching process ... */
+ if (ns.conn >= C_CONNECTED &&
+ os.disk == D_ATTACHING && ns.disk == D_NEGOTIATING) {
+ drbd_send_sizes(mdev, 0, 0); /* to start sync... */
+ drbd_send_uuids(mdev);
+ drbd_send_state(mdev, ns);
+ }
+
+ /* We want to pause/continue resync, tell peer. */
+ if (ns.conn >= C_CONNECTED &&
+ ((os.aftr_isp != ns.aftr_isp) ||
+ (os.user_isp != ns.user_isp)))
+ drbd_send_state(mdev, ns);
+
+ /* In case one of the isp bits got set, suspend other devices. */
+ if ((!os.aftr_isp && !os.peer_isp && !os.user_isp) &&
+ (ns.aftr_isp || ns.peer_isp || ns.user_isp))
+ suspend_other_sg(mdev);
+
+ /* Make sure the peer gets informed about eventual state
+ changes (ISP bits) while we were in WFReportParams. */
+ if (os.conn == C_WF_REPORT_PARAMS && ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ if (os.conn != C_AHEAD && ns.conn == C_AHEAD)
+ drbd_send_state(mdev, ns);
+
+ /* We are in the progress to start a full sync... */
+ if ((os.conn != C_STARTING_SYNC_T && ns.conn == C_STARTING_SYNC_T) ||
+ (os.conn != C_STARTING_SYNC_S && ns.conn == C_STARTING_SYNC_S))
+ /* no other bitmap changes expected during this phase */
+ drbd_queue_bitmap_io(mdev,
+ &drbd_bmio_set_n_write, &abw_start_sync,
+ "set_n_write from StartingSync", BM_LOCKED_TEST_ALLOWED);
+
+ /* We are invalidating our self... */
+ if (os.conn < C_CONNECTED && ns.conn < C_CONNECTED &&
+ os.disk > D_INCONSISTENT && ns.disk == D_INCONSISTENT)
+ /* other bitmap operation expected during this phase */
+ drbd_queue_bitmap_io(mdev, &drbd_bmio_set_n_write, NULL,
+ "set_n_write from invalidate", BM_LOCKED_MASK);
+
+ /* first half of local IO error, failure to attach,
+ * or administrative detach */
+ if (os.disk != D_FAILED && ns.disk == D_FAILED) {
+ enum drbd_io_error_p eh = EP_PASS_ON;
+ int was_io_error = 0;
+ /* corresponding get_ldev was in __drbd_set_state, to serialize
+ * our cleanup here with the transition to D_DISKLESS.
+ * But is is still not save to dreference ldev here, since
+ * we might come from an failed Attach before ldev was set. */
+ if (mdev->ldev) {
+ rcu_read_lock();
+ eh = rcu_dereference(mdev->ldev->disk_conf)->on_io_error;
+ rcu_read_unlock();
+
+ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &mdev->flags);
+
+ if (was_io_error && eh == EP_CALL_HELPER)
+ drbd_khelper(mdev, "local-io-error");
+
+ /* Immediately allow completion of all application IO,
+ * that waits for completion from the local disk,
+ * if this was a force-detach due to disk_timeout
+ * or administrator request (drbdsetup detach --force).
+ * Do NOT abort otherwise.
+ * Aborting local requests may cause serious problems,
+ * if requests are completed to upper layers already,
+ * and then later the already submitted local bio completes.
+ * This can cause DMA into former bio pages that meanwhile
+ * have been re-used for other things.
+ * So aborting local requests may cause crashes,
+ * or even worse, silent data corruption.
+ */
+ if (test_and_clear_bit(FORCE_DETACH, &mdev->flags))
+ tl_abort_disk_io(mdev);
+
+ /* current state still has to be D_FAILED,
+ * there is only one way out: to D_DISKLESS,
+ * and that may only happen after our put_ldev below. */
+ if (mdev->state.disk != D_FAILED)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s during detach\n",
+ drbd_disk_str(mdev->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ drbd_rs_cancel_all(mdev);
+
+ /* In case we want to get something to stable storage still,
+ * this may be the last chance.
+ * Following put_ldev may transition to D_DISKLESS. */
+ drbd_md_sync(mdev);
+ }
+ put_ldev(mdev);
+ }
+
+ /* second half of local IO error, failure to attach,
+ * or administrative detach,
+ * after local_cnt references have reached zero again */
+ if (os.disk != D_DISKLESS && ns.disk == D_DISKLESS) {
+ /* We must still be diskless,
+ * re-attach has to be serialized with this! */
+ if (mdev->state.disk != D_DISKLESS)
+ dev_err(DEV,
+ "ASSERT FAILED: disk is %s while going diskless\n",
+ drbd_disk_str(mdev->state.disk));
+
+ if (ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+ /* corresponding get_ldev in __drbd_set_state
+ * this may finally trigger drbd_ldev_destroy. */
+ put_ldev(mdev);
+ }
+
+ /* Notify peer that I had a local IO error, and did not detached.. */
+ if (os.disk == D_UP_TO_DATE && ns.disk == D_INCONSISTENT && ns.conn >= C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ /* Disks got bigger while they were detached */
+ if (ns.disk > D_NEGOTIATING && ns.pdsk > D_NEGOTIATING &&
+ test_and_clear_bit(RESYNC_AFTER_NEG, &mdev->flags)) {
+ if (ns.conn == C_CONNECTED)
+ resync_after_online_grow(mdev);
+ }
+
+ /* A resync finished or aborted, wake paused devices... */
+ if ((os.conn > C_CONNECTED && ns.conn <= C_CONNECTED) ||
+ (os.peer_isp && !ns.peer_isp) ||
+ (os.user_isp && !ns.user_isp))
+ resume_next_sg(mdev);
+
+ /* sync target done with resync. Explicitly notify peer, even though
+ * it should (at least for non-empty resyncs) already know itself. */
+ if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
+ drbd_send_state(mdev, ns);
+
+ /* Verify finished, or reached stop sector. Peer did not know about
+ * the stop sector, and we may even have changed the stop sector during
+ * verify to interrupt/stop early. Send the new state. */
+ if (os.conn == C_VERIFY_S && ns.conn == C_CONNECTED
+ && verify_can_do_stop_sector(mdev))
+ drbd_send_state(mdev, ns);
+
+ /* This triggers bitmap writeout of potentially still unwritten pages
+ * if the resync finished cleanly, or aborted because of peer disk
+ * failure, or because of connection loss.
+ * For resync aborted because of local disk failure, we cannot do
+ * any bitmap writeout anymore.
+ * No harm done if some bits change during this phase.
+ */
+ if (os.conn > C_CONNECTED && ns.conn <= C_CONNECTED && get_ldev(mdev)) {
+ drbd_queue_bitmap_io(mdev, &drbd_bm_write_copy_pages, NULL,
+ "write from resync_finished", BM_LOCKED_CHANGE_ALLOWED);
+ put_ldev(mdev);
+ }
+
+ if (ns.disk == D_DISKLESS &&
+ ns.conn == C_STANDALONE &&
+ ns.role == R_SECONDARY) {
+ if (os.aftr_isp != ns.aftr_isp)
+ resume_next_sg(mdev);
+ }
+
+ drbd_md_sync(mdev);
+}
+
+struct after_conn_state_chg_work {
+ struct drbd_work w;
+ enum drbd_conns oc;
+ union drbd_state ns_min;
+ union drbd_state ns_max; /* new, max state, over all mdevs */
+ enum chg_state_flags flags;
+};
+
+static int w_after_conn_state_ch(struct drbd_work *w, int unused)
+{
+ struct after_conn_state_chg_work *acscw =
+ container_of(w, struct after_conn_state_chg_work, w);
+ struct drbd_tconn *tconn = w->tconn;
+ enum drbd_conns oc = acscw->oc;
+ union drbd_state ns_max = acscw->ns_max;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ kfree(acscw);
+
+ /* Upon network configuration, we need to start the receiver */
+ if (oc == C_STANDALONE && ns_max.conn == C_UNCONNECTED)
+ drbd_thread_start(&tconn->receiver);
+
+ if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) {
+ struct net_conf *old_conf;
+
+ mutex_lock(&tconn->conf_update);
+ old_conf = tconn->net_conf;
+ tconn->my_addr_len = 0;
+ tconn->peer_addr_len = 0;
+ rcu_assign_pointer(tconn->net_conf, NULL);
+ conn_free_crypto(tconn);
+ mutex_unlock(&tconn->conf_update);
+
+ synchronize_rcu();
+ kfree(old_conf);
+ }
+
+ if (ns_max.susp_fen) {
+ /* case1: The outdate peer handler is successful: */
+ if (ns_max.pdsk <= D_OUTDATED) {
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
+ drbd_uuid_new_current(mdev);
+ clear_bit(NEW_CUR_UUID, &mdev->flags);
+ }
+ }
+ rcu_read_unlock();
+ spin_lock_irq(&tconn->req_lock);
+ _tl_restart(tconn, CONNECTION_LOST_WHILE_PENDING);
+ _conn_request_state(tconn,
+ (union drbd_state) { { .susp_fen = 1 } },
+ (union drbd_state) { { .susp_fen = 0 } },
+ CS_VERBOSE);
+ spin_unlock_irq(&tconn->req_lock);
+ }
+ }
+ kref_put(&tconn->kref, &conn_destroy);
+
+ conn_md_sync(tconn);
+
+ return 0;
+}
+
+void conn_old_common_state(struct drbd_tconn *tconn, union drbd_state *pcs, enum chg_state_flags *pf)
+{
+ enum chg_state_flags flags = ~0;
+ struct drbd_conf *mdev;
+ int vnr, first_vol = 1;
+ union drbd_dev_state os, cs = {
+ { .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = tconn->cstate,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN,
+ } };
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ os = mdev->state;
+
+ if (first_vol) {
+ cs = os;
+ first_vol = 0;
+ continue;
+ }
+
+ if (cs.role != os.role)
+ flags &= ~CS_DC_ROLE;
+
+ if (cs.peer != os.peer)
+ flags &= ~CS_DC_PEER;
+
+ if (cs.conn != os.conn)
+ flags &= ~CS_DC_CONN;
+
+ if (cs.disk != os.disk)
+ flags &= ~CS_DC_DISK;
+
+ if (cs.pdsk != os.pdsk)
+ flags &= ~CS_DC_PDSK;
+ }
+ rcu_read_unlock();
+
+ *pf |= CS_DC_MASK;
+ *pf &= flags;
+ (*pcs).i = cs.i;
+}
+
+static enum drbd_state_rv
+conn_is_valid_transition(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ union drbd_state ns, os;
+ struct drbd_conf *mdev;
+ int vnr;
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ os = drbd_read_state(mdev);
+ ns = sanitize_state(mdev, apply_mask_val(os, mask, val), NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ if (ns.i == os.i)
+ continue;
+
+ rv = is_valid_transition(os, ns);
+ if (rv < SS_SUCCESS)
+ break;
+
+ if (!(flags & CS_HARD)) {
+ rv = is_valid_state(mdev, ns);
+ if (rv < SS_SUCCESS) {
+ if (is_valid_state(mdev, os) == rv)
+ rv = is_valid_soft_transition(os, ns, tconn);
+ } else
+ rv = is_valid_soft_transition(os, ns, tconn);
+ }
+ if (rv < SS_SUCCESS)
+ break;
+ }
+ rcu_read_unlock();
+
+ if (rv < SS_SUCCESS && flags & CS_VERBOSE)
+ print_st_err(mdev, os, ns, rv);
+
+ return rv;
+}
+
+void
+conn_set_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ union drbd_state *pns_min, union drbd_state *pns_max, enum chg_state_flags flags)
+{
+ union drbd_state ns, os, ns_max = { };
+ union drbd_state ns_min = {
+ { .role = R_MASK,
+ .peer = R_MASK,
+ .conn = val.conn,
+ .disk = D_MASK,
+ .pdsk = D_MASK
+ } };
+ struct drbd_conf *mdev;
+ enum drbd_state_rv rv;
+ int vnr, number_of_volumes = 0;
+
+ if (mask.conn == C_MASK) {
+ /* remember last connect time so request_timer_fn() won't
+ * kill newly established sessions while we are still trying to thaw
+ * previously frozen IO */
+ if (tconn->cstate != C_WF_REPORT_PARAMS && val.conn == C_WF_REPORT_PARAMS)
+ tconn->last_reconnect_jif = jiffies;
+
+ tconn->cstate = val.conn;
+ }
+
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ number_of_volumes++;
+ os = drbd_read_state(mdev);
+ ns = apply_mask_val(os, mask, val);
+ ns = sanitize_state(mdev, ns, NULL);
+
+ if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED)
+ ns.disk = os.disk;
+
+ rv = __drbd_set_state(mdev, ns, flags, NULL);
+ if (rv < SS_SUCCESS)
+ BUG();
+
+ ns.i = mdev->state.i;
+ ns_max.role = max_role(ns.role, ns_max.role);
+ ns_max.peer = max_role(ns.peer, ns_max.peer);
+ ns_max.conn = max_t(enum drbd_conns, ns.conn, ns_max.conn);
+ ns_max.disk = max_t(enum drbd_disk_state, ns.disk, ns_max.disk);
+ ns_max.pdsk = max_t(enum drbd_disk_state, ns.pdsk, ns_max.pdsk);
+
+ ns_min.role = min_role(ns.role, ns_min.role);
+ ns_min.peer = min_role(ns.peer, ns_min.peer);
+ ns_min.conn = min_t(enum drbd_conns, ns.conn, ns_min.conn);
+ ns_min.disk = min_t(enum drbd_disk_state, ns.disk, ns_min.disk);
+ ns_min.pdsk = min_t(enum drbd_disk_state, ns.pdsk, ns_min.pdsk);
+ }
+ rcu_read_unlock();
+
+ if (number_of_volumes == 0) {
+ ns_min = ns_max = (union drbd_state) { {
+ .role = R_SECONDARY,
+ .peer = R_UNKNOWN,
+ .conn = val.conn,
+ .disk = D_DISKLESS,
+ .pdsk = D_UNKNOWN
+ } };
+ }
+
+ ns_min.susp = ns_max.susp = tconn->susp;
+ ns_min.susp_nod = ns_max.susp_nod = tconn->susp_nod;
+ ns_min.susp_fen = ns_max.susp_fen = tconn->susp_fen;
+
+ *pns_min = ns_min;
+ *pns_max = ns_max;
+}
+
+static enum drbd_state_rv
+_conn_rq_cond(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val)
+{
+ enum drbd_state_rv rv;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_OKAY, &tconn->flags))
+ return SS_CW_SUCCESS;
+
+ if (test_and_clear_bit(CONN_WD_ST_CHG_FAIL, &tconn->flags))
+ return SS_CW_FAILED_BY_PEER;
+
+ rv = tconn->cstate != C_WF_REPORT_PARAMS ? SS_CW_NO_NEED : SS_UNKNOWN_ERROR;
+
+ if (rv == SS_UNKNOWN_ERROR)
+ rv = conn_is_valid_transition(tconn, mask, val, 0);
+
+ if (rv == SS_SUCCESS)
+ rv = SS_UNKNOWN_ERROR; /* cont waiting, otherwise fail. */
+
+ return rv;
+}
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv = SS_SUCCESS;
+ struct after_conn_state_chg_work *acscw;
+ enum drbd_conns oc = tconn->cstate;
+ union drbd_state ns_max, ns_min, os;
+ bool have_mutex = false;
+
+ if (mask.conn) {
+ rv = is_valid_conn_transition(oc, val.conn);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ rv = conn_is_valid_transition(tconn, mask, val, flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+
+ if (oc == C_WF_REPORT_PARAMS && val.conn == C_DISCONNECTING &&
+ !(flags & (CS_LOCAL_ONLY | CS_HARD))) {
+
+ /* This will be a cluster-wide state change.
+ * Need to give up the spinlock, grab the mutex,
+ * then send the state change request, ... */
+ spin_unlock_irq(&tconn->req_lock);
+ mutex_lock(&tconn->cstate_mutex);
+ have_mutex = true;
+
+ set_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
+ if (conn_send_state_req(tconn, mask, val)) {
+ /* sending failed. */
+ clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
+ rv = SS_CW_FAILED_BY_PEER;
+ /* need to re-aquire the spin lock, though */
+ goto abort_unlocked;
+ }
+
+ if (val.conn == C_DISCONNECTING)
+ set_bit(DISCONNECT_SENT, &tconn->flags);
+
+ /* ... and re-aquire the spinlock.
+ * If _conn_rq_cond() returned >= SS_SUCCESS, we must call
+ * conn_set_state() within the same spinlock. */
+ spin_lock_irq(&tconn->req_lock);
+ wait_event_lock_irq(tconn->ping_wait,
+ (rv = _conn_rq_cond(tconn, mask, val)),
+ tconn->req_lock,
+ );
+ clear_bit(CONN_WD_ST_CHG_REQ, &tconn->flags);
+ if (rv < SS_SUCCESS)
+ goto abort;
+ }
+
+ conn_old_common_state(tconn, &os, &flags);
+ flags |= CS_DC_SUSP;
+ conn_set_state(tconn, mask, val, &ns_min, &ns_max, flags);
+ conn_pr_state_change(tconn, os, ns_max, flags);
+
+ acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC);
+ if (acscw) {
+ acscw->oc = os.conn;
+ acscw->ns_min = ns_min;
+ acscw->ns_max = ns_max;
+ acscw->flags = flags;
+ acscw->w.cb = w_after_conn_state_ch;
+ kref_get(&tconn->kref);
+ acscw->w.tconn = tconn;
+ drbd_queue_work(&tconn->sender_work, &acscw->w);
+ } else {
+ conn_err(tconn, "Could not kmalloc an acscw\n");
+ }
+
+ abort:
+ if (have_mutex) {
+ /* mutex_unlock() "... must not be used in interrupt context.",
+ * so give up the spinlock, then re-aquire it */
+ spin_unlock_irq(&tconn->req_lock);
+ abort_unlocked:
+ mutex_unlock(&tconn->cstate_mutex);
+ spin_lock_irq(&tconn->req_lock);
+ }
+ if (rv < SS_SUCCESS && flags & CS_VERBOSE) {
+ conn_err(tconn, "State change failed: %s\n", drbd_set_st_err_str(rv));
+ conn_err(tconn, " mask = 0x%x val = 0x%x\n", mask.i, val.i);
+ conn_err(tconn, " old_conn:%s wanted_conn:%s\n", drbd_conn_str(oc), drbd_conn_str(val.conn));
+ }
+ return rv;
+}
+
+enum drbd_state_rv
+conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags)
+{
+ enum drbd_state_rv rv;
+
+ spin_lock_irq(&tconn->req_lock);
+ rv = _conn_request_state(tconn, mask, val, flags);
+ spin_unlock_irq(&tconn->req_lock);
+
+ return rv;
+}
diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h
new file mode 100644
index 0000000..a3c361b
--- /dev/null
+++ b/drivers/block/drbd/drbd_state.h
@@ -0,0 +1,161 @@
+#ifndef DRBD_STATE_H
+#define DRBD_STATE_H
+
+struct drbd_conf;
+struct drbd_tconn;
+
+/**
+ * DOC: DRBD State macros
+ *
+ * These macros are used to express state changes in easily readable form.
+ *
+ * The NS macros expand to a mask and a value, that can be bit ored onto the
+ * current state as soon as the spinlock (req_lock) was taken.
+ *
+ * The _NS macros are used for state functions that get called with the
+ * spinlock. These macros expand directly to the new state value.
+ *
+ * Besides the basic forms NS() and _NS() additional _?NS[23] are defined
+ * to express state changes that affect more than one aspect of the state.
+ *
+ * E.g. NS2(conn, C_CONNECTED, peer, R_SECONDARY)
+ * Means that the network connection was established and that the peer
+ * is in secondary role.
+ */
+#define role_MASK R_MASK
+#define peer_MASK R_MASK
+#define disk_MASK D_MASK
+#define pdsk_MASK D_MASK
+#define conn_MASK C_MASK
+#define susp_MASK 1
+#define user_isp_MASK 1
+#define aftr_isp_MASK 1
+#define susp_nod_MASK 1
+#define susp_fen_MASK 1
+
+#define NS(T, S) \
+ ({ union drbd_state mask; mask.i = 0; mask.T = T##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T = (S); val; })
+#define NS2(T1, S1, T2, S2) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val; })
+#define NS3(T1, S1, T2, S2, T3, S3) \
+ ({ union drbd_state mask; mask.i = 0; mask.T1 = T1##_MASK; \
+ mask.T2 = T2##_MASK; mask.T3 = T3##_MASK; mask; }), \
+ ({ union drbd_state val; val.i = 0; val.T1 = (S1); \
+ val.T2 = (S2); val.T3 = (S3); val; })
+
+#define _NS(D, T, S) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T = (S); __ns; })
+#define _NS2(D, T1, S1, T2, S2) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns; })
+#define _NS3(D, T1, S1, T2, S2, T3, S3) \
+ D, ({ union drbd_state __ns; __ns = drbd_read_state(D); __ns.T1 = (S1); \
+ __ns.T2 = (S2); __ns.T3 = (S3); __ns; })
+
+enum chg_state_flags {
+ CS_HARD = 1 << 0,
+ CS_VERBOSE = 1 << 1,
+ CS_WAIT_COMPLETE = 1 << 2,
+ CS_SERIALIZE = 1 << 3,
+ CS_ORDERED = CS_WAIT_COMPLETE + CS_SERIALIZE,
+ CS_LOCAL_ONLY = 1 << 4, /* Do not consider a device pair wide state change */
+ CS_DC_ROLE = 1 << 5, /* DC = display as connection state change */
+ CS_DC_PEER = 1 << 6,
+ CS_DC_CONN = 1 << 7,
+ CS_DC_DISK = 1 << 8,
+ CS_DC_PDSK = 1 << 9,
+ CS_DC_SUSP = 1 << 10,
+ CS_DC_MASK = CS_DC_ROLE + CS_DC_PEER + CS_DC_CONN + CS_DC_DISK + CS_DC_PDSK,
+ CS_IGN_OUTD_FAIL = 1 << 11,
+};
+
+/* drbd_dev_state and drbd_state are different types. This is to stress the
+ small difference. There is no suspended flag (.susp), and no suspended
+ while fence handler runs flas (susp_fen). */
+union drbd_dev_state {
+ struct {
+#if defined(__LITTLE_ENDIAN_BITFIELD)
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned _unused:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned peer_isp:1 ;
+ unsigned user_isp:1 ;
+ unsigned _pad:11; /* 0 unused */
+#elif defined(__BIG_ENDIAN_BITFIELD)
+ unsigned _pad:11;
+ unsigned user_isp:1 ;
+ unsigned peer_isp:1 ;
+ unsigned aftr_isp:1 ; /* isp .. imposed sync pause */
+ unsigned _unused:1 ;
+ unsigned pdsk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned disk:4 ; /* 8/16 from D_DISKLESS to D_UP_TO_DATE */
+ unsigned conn:5 ; /* 17/32 cstates */
+ unsigned peer:2 ; /* 3/4 primary/secondary/unknown */
+ unsigned role:2 ; /* 3/4 primary/secondary/unknown */
+#else
+# error "this endianess is not supported"
+#endif
+ };
+ unsigned int i;
+};
+
+extern enum drbd_state_rv drbd_change_state(struct drbd_conf *mdev,
+ enum chg_state_flags f,
+ union drbd_state mask,
+ union drbd_state val);
+extern void drbd_force_state(struct drbd_conf *, union drbd_state,
+ union drbd_state);
+extern enum drbd_state_rv _drbd_request_state(struct drbd_conf *,
+ union drbd_state,
+ union drbd_state,
+ enum chg_state_flags);
+extern enum drbd_state_rv __drbd_set_state(struct drbd_conf *, union drbd_state,
+ enum chg_state_flags,
+ struct completion *done);
+extern void print_st_err(struct drbd_conf *, union drbd_state,
+ union drbd_state, int);
+
+enum drbd_state_rv
+_conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+enum drbd_state_rv
+conn_request_state(struct drbd_tconn *tconn, union drbd_state mask, union drbd_state val,
+ enum chg_state_flags flags);
+
+extern void drbd_resume_al(struct drbd_conf *mdev);
+extern bool conn_all_vols_unconf(struct drbd_tconn *tconn);
+
+/**
+ * drbd_request_state() - Reqest a state change
+ * @mdev: DRBD device.
+ * @mask: mask of state bits to change.
+ * @val: value of new state bits.
+ *
+ * This is the most graceful way of requesting a state change. It is verbose
+ * quite verbose in case the state change is not possible, and all those
+ * state changes are globally serialized.
+ */
+static inline int drbd_request_state(struct drbd_conf *mdev,
+ union drbd_state mask,
+ union drbd_state val)
+{
+ return _drbd_request_state(mdev, mask, val, CS_VERBOSE + CS_ORDERED);
+}
+
+enum drbd_role conn_highest_role(struct drbd_tconn *tconn);
+enum drbd_role conn_highest_peer(struct drbd_tconn *tconn);
+enum drbd_disk_state conn_highest_disk(struct drbd_tconn *tconn);
+enum drbd_disk_state conn_lowest_disk(struct drbd_tconn *tconn);
+enum drbd_disk_state conn_highest_pdsk(struct drbd_tconn *tconn);
+enum drbd_conns conn_lowest_conn(struct drbd_tconn *tconn);
+
+#endif
diff --git a/drivers/block/drbd/drbd_strings.c b/drivers/block/drbd/drbd_strings.c
index c44a2a6..9a664bd 100644
--- a/drivers/block/drbd/drbd_strings.c
+++ b/drivers/block/drbd/drbd_strings.c
@@ -89,6 +89,7 @@
[-SS_LOWER_THAN_OUTDATED] = "Disk state is lower than outdated",
[-SS_IN_TRANSIENT_STATE] = "In transient state, retry after next state change",
[-SS_CONCURRENT_ST_CHG] = "Concurrent state changes detected and aborted",
+ [-SS_O_VOL_PEER_PRI] = "Other vol primary on peer not allowed by config",
};
const char *drbd_conn_str(enum drbd_conns s)
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 7cd32e7..424dc7b 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -38,16 +38,13 @@
#include "drbd_int.h"
#include "drbd_req.h"
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel);
-static int w_make_resync_request(struct drbd_conf *mdev,
- struct drbd_work *w, int cancel);
-
+static int w_make_ov_request(struct drbd_work *w, int cancel);
/* endio handlers:
* drbd_md_io_complete (defined here)
- * drbd_endio_pri (defined here)
- * drbd_endio_sec (defined here)
+ * drbd_request_endio (defined here)
+ * drbd_peer_request_endio (defined here)
* bm_async_io_complete (defined in drbd_bitmap.c)
*
* For all these callbacks, note the following:
@@ -60,7 +57,7 @@
/* About the global_state_lock
Each state transition on an device holds a read lock. In case we have
- to evaluate the sync after dependencies, we grab a write lock, because
+ to evaluate the resync after dependencies, we grab a write lock, because
we need stable states on all devices for that. */
rwlock_t global_state_lock;
@@ -98,97 +95,93 @@
/* reads on behalf of the partner,
* "submitted" by the receiver
*/
-void drbd_endio_read_sec_final(struct drbd_epoch_entry *e) __releases(local)
+void drbd_endio_read_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
+ struct drbd_conf *mdev = peer_req->w.mdev;
- D_ASSERT(e->block_id != ID_VACANT);
-
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->read_cnt += e->size >> 9;
- list_del(&e->w.list);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ mdev->read_cnt += peer_req->i.size >> 9;
+ list_del(&peer_req->w.list);
if (list_empty(&mdev->read_ee))
wake_up(&mdev->ee_wait);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_READ_ERROR);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
- drbd_queue_work(&mdev->data.work, &e->w);
+ drbd_queue_work(&mdev->tconn->sender_work, &peer_req->w);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver, final stage. */
-static void drbd_endio_write_sec_final(struct drbd_epoch_entry *e) __releases(local)
+static void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(local)
{
unsigned long flags = 0;
- struct drbd_conf *mdev = e->mdev;
- sector_t e_sector;
+ struct drbd_conf *mdev = peer_req->w.mdev;
+ struct drbd_interval i;
int do_wake;
- int is_syncer_req;
+ u64 block_id;
int do_al_complete_io;
- D_ASSERT(e->block_id != ID_VACANT);
-
- /* after we moved e to done_ee,
+ /* after we moved peer_req to done_ee,
* we may no longer access it,
* it may be freed/reused already!
* (as soon as we release the req_lock) */
- e_sector = e->sector;
- do_al_complete_io = e->flags & EE_CALL_AL_COMPLETE_IO;
- is_syncer_req = is_syncer_block_id(e->block_id);
+ i = peer_req->i;
+ do_al_complete_io = peer_req->flags & EE_CALL_AL_COMPLETE_IO;
+ block_id = peer_req->block_id;
- spin_lock_irqsave(&mdev->req_lock, flags);
- mdev->writ_cnt += e->size >> 9;
- list_del(&e->w.list); /* has been on active_ee or sync_ee */
- list_add_tail(&e->w.list, &mdev->done_ee);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
+ mdev->writ_cnt += peer_req->i.size >> 9;
+ list_move_tail(&peer_req->w.list, &mdev->done_ee);
- /* No hlist_del_init(&e->collision) here, we did not send the Ack yet,
- * neither did we wake possibly waiting conflicting requests.
- * done from "drbd_process_done_ee" within the appropriate w.cb
- * (e_end_block/e_end_resync_block) or from _drbd_clear_done_ee */
+ /*
+ * Do not remove from the write_requests tree here: we did not send the
+ * Ack yet and did not wake possibly waiting conflicting requests.
+ * Removed from the tree from "drbd_process_done_ee" within the
+ * appropriate w.cb (e_end_block/e_end_resync_block) or from
+ * _drbd_clear_done_ee.
+ */
- do_wake = is_syncer_req
- ? list_empty(&mdev->sync_ee)
- : list_empty(&mdev->active_ee);
+ do_wake = list_empty(block_id == ID_SYNCER ? &mdev->sync_ee : &mdev->active_ee);
- if (test_bit(__EE_WAS_ERROR, &e->flags))
+ if (test_bit(__EE_WAS_ERROR, &peer_req->flags))
__drbd_chk_io_error(mdev, DRBD_WRITE_ERROR);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
- if (is_syncer_req)
- drbd_rs_complete_io(mdev, e_sector);
+ if (block_id == ID_SYNCER)
+ drbd_rs_complete_io(mdev, i.sector);
if (do_wake)
wake_up(&mdev->ee_wait);
if (do_al_complete_io)
- drbd_al_complete_io(mdev, e_sector);
+ drbd_al_complete_io(mdev, &i);
- wake_asender(mdev);
+ wake_asender(mdev->tconn);
put_ldev(mdev);
}
/* writes on behalf of the partner, or resync writes,
* "submitted" by the receiver.
*/
-void drbd_endio_sec(struct bio *bio, int error)
+void drbd_peer_request_endio(struct bio *bio, int error)
{
- struct drbd_epoch_entry *e = bio->bi_private;
- struct drbd_conf *mdev = e->mdev;
+ struct drbd_peer_request *peer_req = bio->bi_private;
+ struct drbd_conf *mdev = peer_req->w.mdev;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
int is_write = bio_data_dir(bio) == WRITE;
if (error && __ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: error=%d s=%llus\n",
is_write ? "write" : "read", error,
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
if (!error && !uptodate) {
if (__ratelimit(&drbd_ratelimit_state))
dev_warn(DEV, "%s: setting error to -EIO s=%llus\n",
is_write ? "write" : "read",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
/* strange behavior of some lower level drivers...
* fail the request by clearing the uptodate flag,
* but do not return any error?! */
@@ -196,24 +189,24 @@
}
if (error)
- set_bit(__EE_WAS_ERROR, &e->flags);
+ set_bit(__EE_WAS_ERROR, &peer_req->flags);
bio_put(bio); /* no need for the bio anymore */
- if (atomic_dec_and_test(&e->pending_bios)) {
+ if (atomic_dec_and_test(&peer_req->pending_bios)) {
if (is_write)
- drbd_endio_write_sec_final(e);
+ drbd_endio_write_sec_final(peer_req);
else
- drbd_endio_read_sec_final(e);
+ drbd_endio_read_sec_final(peer_req);
}
}
/* read, readA or write requests on R_PRIMARY coming from drbd_make_request
*/
-void drbd_endio_pri(struct bio *bio, int error)
+void drbd_request_endio(struct bio *bio, int error)
{
unsigned long flags;
struct drbd_request *req = bio->bi_private;
- struct drbd_conf *mdev = req->mdev;
+ struct drbd_conf *mdev = req->w.mdev;
struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -227,6 +220,7 @@
error = -EIO;
}
+
/* If this request was aborted locally before,
* but now was completed "successfully",
* chances are that this caused arbitrary data corruption.
@@ -266,50 +260,32 @@
/* to avoid recursion in __req_mod */
if (unlikely(error)) {
what = (bio_data_dir(bio) == WRITE)
- ? write_completed_with_error
+ ? WRITE_COMPLETED_WITH_ERROR
: (bio_rw(bio) == READ)
- ? read_completed_with_error
- : read_ahead_completed_with_error;
+ ? READ_COMPLETED_WITH_ERROR
+ : READ_AHEAD_COMPLETED_WITH_ERROR;
} else
- what = completed_ok;
+ what = COMPLETED_OK;
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
/* not req_mod(), we need irqsave here! */
- spin_lock_irqsave(&mdev->req_lock, flags);
+ spin_lock_irqsave(&mdev->tconn->req_lock, flags);
__req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
+ spin_unlock_irqrestore(&mdev->tconn->req_lock, flags);
put_ldev(mdev);
if (m.bio)
complete_master_bio(mdev, &m);
}
-int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- struct drbd_request *req = container_of(w, struct drbd_request, w);
-
- /* We should not detach for read io-error,
- * but try to WRITE the P_DATA_REPLY to the failed location,
- * to give the disk the chance to relocate that block */
-
- spin_lock_irq(&mdev->req_lock);
- if (cancel || mdev->state.pdsk != D_UP_TO_DATE) {
- _req_mod(req, read_retry_remote_canceled);
- spin_unlock_irq(&mdev->req_lock);
- return 1;
- }
- spin_unlock_irq(&mdev->req_lock);
-
- return w_send_read_req(mdev, w, 0);
-}
-
-void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm, struct drbd_epoch_entry *e, void *digest)
+void drbd_csum_ee(struct drbd_conf *mdev, struct crypto_hash *tfm,
+ struct drbd_peer_request *peer_req, void *digest)
{
struct hash_desc desc;
struct scatterlist sg;
- struct page *page = e->pages;
+ struct page *page = peer_req->pages;
struct page *tmp;
unsigned len;
@@ -326,7 +302,7 @@
page = tmp;
}
/* and now the last, possibly only partially used page */
- len = e->size & (PAGE_SIZE - 1);
+ len = peer_req->i.size & (PAGE_SIZE - 1);
sg_set_page(&sg, page, len ?: PAGE_SIZE, 0);
crypto_hash_update(&desc, &sg, sg.length);
crypto_hash_final(&desc, digest);
@@ -352,59 +328,58 @@
crypto_hash_final(&desc, digest);
}
-/* TODO merge common code with w_e_end_ov_req */
-int w_e_send_csum(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* MAYBE merge common code with w_e_end_ov_req */
+static int w_e_send_csum(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
int digest_size;
void *digest;
- int ok = 1;
-
- D_ASSERT(e->block_id == DRBD_MAGIC + 0xbeef);
+ int err = 0;
if (unlikely(cancel))
goto out;
- if (likely((e->flags & EE_WAS_ERROR) != 0))
+ if (unlikely((peer_req->flags & EE_WAS_ERROR) != 0))
goto out;
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+ digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- sector_t sector = e->sector;
- unsigned int size = e->size;
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
- /* Free e and pages before send.
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
+ drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
+ /* Free peer_req and pages before send.
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
- e = NULL;
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
+ peer_req = NULL;
inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev, sector, size,
- digest, digest_size,
- P_CSUM_RS_REQUEST);
+ err = drbd_send_drequest_csum(mdev, sector, size,
+ digest, digest_size,
+ P_CSUM_RS_REQUEST);
kfree(digest);
} else {
dev_err(DEV, "kmalloc() of digest failed.\n");
- ok = 0;
+ err = -ENOMEM;
}
out:
- if (e)
- drbd_free_ee(mdev, e);
+ if (peer_req)
+ drbd_free_peer_req(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_drequest(..., csum) failed\n");
- return ok;
+ return err;
}
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
static int read_for_csum(struct drbd_conf *mdev, sector_t sector, int size)
{
- struct drbd_epoch_entry *e;
+ struct drbd_peer_request *peer_req;
if (!get_ldev(mdev))
return -EIO;
@@ -414,45 +389,47 @@
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
- e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
- if (!e)
+ peer_req = drbd_alloc_peer_req(mdev, ID_SYNCER /* unused */, sector,
+ size, GFP_TRY);
+ if (!peer_req)
goto defer;
- e->w.cb = w_e_send_csum;
- spin_lock_irq(&mdev->req_lock);
- list_add(&e->w.list, &mdev->read_ee);
- spin_unlock_irq(&mdev->req_lock);
+ peer_req->w.cb = w_e_send_csum;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add(&peer_req->w.list, &mdev->read_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
atomic_add(size >> 9, &mdev->rs_sect_ev);
- if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
+ if (drbd_submit_peer_request(mdev, peer_req, READ, DRBD_FAULT_RS_RD) == 0)
return 0;
/* If it failed because of ENOMEM, retry should help. If it failed
* because bio_add_page failed (probably broken lower level driver),
* retry may or may not help.
* If it does not, you may need to force disconnect. */
- spin_lock_irq(&mdev->req_lock);
- list_del(&e->w.list);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_del(&peer_req->w.list);
+ spin_unlock_irq(&mdev->tconn->req_lock);
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
defer:
put_ldev(mdev);
return -EAGAIN;
}
-int w_resync_timer(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_resync_timer(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
switch (mdev->state.conn) {
case C_VERIFY_S:
- w_make_ov_request(mdev, w, cancel);
+ w_make_ov_request(w, cancel);
break;
case C_SYNC_TARGET:
- w_make_resync_request(mdev, w, cancel);
+ w_make_resync_request(w, cancel);
break;
}
- return 1;
+ return 0;
}
void resync_timer_fn(unsigned long data)
@@ -460,7 +437,7 @@
struct drbd_conf *mdev = (struct drbd_conf *) data;
if (list_empty(&mdev->resync_work.list))
- drbd_queue_work(&mdev->data.work, &mdev->resync_work);
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->resync_work);
}
static void fifo_set(struct fifo_buffer *fb, int value)
@@ -492,8 +469,24 @@
fb->values[i] += value;
}
+struct fifo_buffer *fifo_alloc(int fifo_size)
+{
+ struct fifo_buffer *fb;
+
+ fb = kzalloc(sizeof(struct fifo_buffer) + sizeof(int) * fifo_size, GFP_NOIO);
+ if (!fb)
+ return NULL;
+
+ fb->head_index = 0;
+ fb->size = fifo_size;
+ fb->total = 0;
+
+ return fb;
+}
+
static int drbd_rs_controller(struct drbd_conf *mdev)
{
+ struct disk_conf *dc;
unsigned int sect_in; /* Number of sectors that came in since the last turn */
unsigned int want; /* The number of sectors we want in the proxy */
int req_sect; /* Number of sectors to request in this turn */
@@ -502,38 +495,39 @@
int steps; /* Number of time steps to plan ahead */
int curr_corr;
int max_sect;
+ struct fifo_buffer *plan;
sect_in = atomic_xchg(&mdev->rs_sect_in, 0); /* Number of sectors that came in */
mdev->rs_in_flight -= sect_in;
- spin_lock(&mdev->peer_seq_lock); /* get an atomic view on mdev->rs_plan_s */
+ dc = rcu_dereference(mdev->ldev->disk_conf);
+ plan = rcu_dereference(mdev->rs_plan_s);
- steps = mdev->rs_plan_s.size; /* (mdev->sync_conf.c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
+ steps = plan->size; /* (dc->c_plan_ahead * 10 * SLEEP_TIME) / HZ; */
if (mdev->rs_in_flight + sect_in == 0) { /* At start of resync */
- want = ((mdev->sync_conf.rate * 2 * SLEEP_TIME) / HZ) * steps;
+ want = ((dc->resync_rate * 2 * SLEEP_TIME) / HZ) * steps;
} else { /* normal path */
- want = mdev->sync_conf.c_fill_target ? mdev->sync_conf.c_fill_target :
- sect_in * mdev->sync_conf.c_delay_target * HZ / (SLEEP_TIME * 10);
+ want = dc->c_fill_target ? dc->c_fill_target :
+ sect_in * dc->c_delay_target * HZ / (SLEEP_TIME * 10);
}
- correction = want - mdev->rs_in_flight - mdev->rs_planed;
+ correction = want - mdev->rs_in_flight - plan->total;
/* Plan ahead */
cps = correction / steps;
- fifo_add_val(&mdev->rs_plan_s, cps);
- mdev->rs_planed += cps * steps;
+ fifo_add_val(plan, cps);
+ plan->total += cps * steps;
/* What we do in this step */
- curr_corr = fifo_push(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
- mdev->rs_planed -= curr_corr;
+ curr_corr = fifo_push(plan, 0);
+ plan->total -= curr_corr;
req_sect = sect_in + curr_corr;
if (req_sect < 0)
req_sect = 0;
- max_sect = (mdev->sync_conf.c_max_rate * 2 * SLEEP_TIME) / HZ;
+ max_sect = (dc->c_max_rate * 2 * SLEEP_TIME) / HZ;
if (req_sect > max_sect)
req_sect = max_sect;
@@ -549,22 +543,25 @@
static int drbd_rs_number_requests(struct drbd_conf *mdev)
{
int number;
- if (mdev->rs_plan_s.size) { /* mdev->sync_conf.c_plan_ahead */
+
+ rcu_read_lock();
+ if (rcu_dereference(mdev->rs_plan_s)->size) {
number = drbd_rs_controller(mdev) >> (BM_BLOCK_SHIFT - 9);
mdev->c_sync_rate = number * HZ * (BM_BLOCK_SIZE / 1024) / SLEEP_TIME;
} else {
- mdev->c_sync_rate = mdev->sync_conf.rate;
+ mdev->c_sync_rate = rcu_dereference(mdev->ldev->disk_conf)->resync_rate;
number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
}
+ rcu_read_unlock();
/* ignore the amount of pending requests, the resync controller should
* throttle down to incoming reply rate soon enough anyways. */
return number;
}
-static int w_make_resync_request(struct drbd_conf *mdev,
- struct drbd_work *w, int cancel)
+int w_make_resync_request(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
unsigned long bit;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
@@ -574,12 +571,12 @@
int i = 0;
if (unlikely(cancel))
- return 1;
+ return 0;
if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
if (!get_ldev(mdev)) {
@@ -588,7 +585,7 @@
to continue resync with a broken disk makes no sense at
all */
dev_err(DEV, "Disk broke down during resync!\n");
- return 1;
+ return 0;
}
max_bio_size = queue_max_hw_sectors(mdev->rq_queue) << 9;
@@ -598,15 +595,15 @@
for (i = 0; i < number; i++) {
/* Stop generating RS requests, when half of the send buffer is filled */
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket) {
- queued = mdev->data.socket->sk->sk_wmem_queued;
- sndbuf = mdev->data.socket->sk->sk_sndbuf;
+ mutex_lock(&mdev->tconn->data.mutex);
+ if (mdev->tconn->data.socket) {
+ queued = mdev->tconn->data.socket->sk->sk_wmem_queued;
+ sndbuf = mdev->tconn->data.socket->sk->sk_sndbuf;
} else {
queued = 1;
sndbuf = 0;
}
- mutex_unlock(&mdev->data.mutex);
+ mutex_unlock(&mdev->tconn->data.mutex);
if (queued > sndbuf / 2)
goto requeue;
@@ -617,7 +614,7 @@
if (bit == DRBD_END_OF_BITMAP) {
mdev->bm_resync_fo = drbd_bm_bits(mdev);
put_ldev(mdev);
- return 1;
+ return 0;
}
sector = BM_BIT_TO_SECT(bit);
@@ -676,11 +673,11 @@
/* adjust very last sectors, in case we are oddly sized */
if (sector + (size>>9) > capacity)
size = (capacity-sector)<<9;
- if (mdev->agreed_pro_version >= 89 && mdev->csums_tfm) {
+ if (mdev->tconn->agreed_pro_version >= 89 && mdev->tconn->csums_tfm) {
switch (read_for_csum(mdev, sector, size)) {
case -EIO: /* Disk failure */
put_ldev(mdev);
- return 0;
+ return -EIO;
case -EAGAIN: /* allocation failed, or ldev busy */
drbd_rs_complete_io(mdev, sector);
mdev->bm_resync_fo = BM_SECT_TO_BIT(sector);
@@ -693,13 +690,16 @@
BUG();
}
} else {
+ int err;
+
inc_rs_pending(mdev);
- if (!drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
- sector, size, ID_SYNCER)) {
+ err = drbd_send_drequest(mdev, P_RS_DATA_REQUEST,
+ sector, size, ID_SYNCER);
+ if (err) {
dev_err(DEV, "drbd_send_drequest() failed, aborting...\n");
dec_rs_pending(mdev);
put_ldev(mdev);
- return 0;
+ return err;
}
}
}
@@ -712,18 +712,19 @@
* until then resync "work" is "inactive" ...
*/
put_ldev(mdev);
- return 1;
+ return 0;
}
requeue:
mdev->rs_in_flight += (i << (BM_BLOCK_SHIFT - 9));
mod_timer(&mdev->resync_timer, jiffies + SLEEP_TIME);
put_ldev(mdev);
- return 1;
+ return 0;
}
-static int w_make_ov_request(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int w_make_ov_request(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
int number, i, size;
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
@@ -743,7 +744,7 @@
* w_e_end_ov_reply().
* We need to send at least one request out. */
stop_sector_reached = i > 0
- && mdev->agreed_pro_version >= 97
+ && verify_can_do_stop_sector(mdev)
&& sector >= mdev->ov_stop_sector;
if (stop_sector_reached)
break;
@@ -760,7 +761,7 @@
size = (capacity-sector)<<9;
inc_rs_pending(mdev);
- if (!drbd_send_ov_request(mdev, sector, size)) {
+ if (drbd_send_ov_request(mdev, sector, size)) {
dec_rs_pending(mdev);
return 0;
}
@@ -775,52 +776,34 @@
return 1;
}
-
-void start_resync_timer_fn(unsigned long data)
+int w_ov_finished(struct drbd_work *w, int cancel)
{
- struct drbd_conf *mdev = (struct drbd_conf *) data;
-
- drbd_queue_work(&mdev->data.work, &mdev->start_resync_work);
-}
-
-int w_start_resync(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
- if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
- dev_warn(DEV, "w_start_resync later...\n");
- mdev->start_resync_timer.expires = jiffies + HZ/10;
- add_timer(&mdev->start_resync_timer);
- return 1;
- }
-
- drbd_start_resync(mdev, C_SYNC_SOURCE);
- drbd_clear_flag(mdev, AHEAD_TO_SYNC_SOURCE);
- return 1;
-}
-
-int w_ov_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
-{
+ struct drbd_conf *mdev = w->mdev;
kfree(w);
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
-static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static int w_resync_finished(struct drbd_work *w, int cancel)
{
+ struct drbd_conf *mdev = w->mdev;
kfree(w);
drbd_resync_finished(mdev);
- return 1;
+ return 0;
}
static void ping_peer(struct drbd_conf *mdev)
{
- drbd_clear_flag(mdev, GOT_PING_ACK);
- request_ping(mdev);
- wait_event(mdev->misc_wait,
- drbd_test_flag(mdev, GOT_PING_ACK) || mdev->state.conn < C_CONNECTED);
+ struct drbd_tconn *tconn = mdev->tconn;
+
+ clear_bit(GOT_PING_ACK, &tconn->flags);
+ request_ping(tconn);
+ wait_event(tconn->ping_wait,
+ test_bit(GOT_PING_ACK, &tconn->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
@@ -845,7 +828,8 @@
w = kmalloc(sizeof(struct drbd_work), GFP_ATOMIC);
if (w) {
w->cb = w_resync_finished;
- drbd_queue_work(&mdev->data.work, w);
+ w->mdev = mdev;
+ drbd_queue_work(&mdev->tconn->sender_work, w);
return 1;
}
dev_err(DEV, "Warn failed to drbd_rs_del_all() and to kmalloc(w).\n");
@@ -868,8 +852,8 @@
ping_peer(mdev);
- spin_lock_irq(&mdev->req_lock);
- os = mdev->state;
+ spin_lock_irq(&mdev->tconn->req_lock);
+ os = drbd_read_state(mdev);
verify_done = (os.conn == C_VERIFY_S || os.conn == C_VERIFY_T);
@@ -899,7 +883,7 @@
if (os.conn == C_SYNC_TARGET || os.conn == C_PAUSED_SYNC_T)
khelper_cmd = "after-resync-target";
- if (mdev->csums_tfm && mdev->rs_total) {
+ if (mdev->tconn->csums_tfm && mdev->rs_total) {
const unsigned long s = mdev->rs_same_csum;
const unsigned long t = mdev->rs_total;
const int ratio =
@@ -957,7 +941,7 @@
_drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
out_unlock:
- spin_unlock_irq(&mdev->req_lock);
+ spin_unlock_irq(&mdev->tconn->req_lock);
put_ldev(mdev);
out:
mdev->rs_total = 0;
@@ -977,19 +961,19 @@
}
/* helper */
-static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_epoch_entry *e)
+static void move_to_net_ee_or_free(struct drbd_conf *mdev, struct drbd_peer_request *peer_req)
{
- if (drbd_ee_has_active_page(e)) {
+ if (drbd_peer_req_has_active_page(peer_req)) {
/* This might happen if sendpage() has not finished */
- int i = (e->size + PAGE_SIZE -1) >> PAGE_SHIFT;
+ int i = (peer_req->i.size + PAGE_SIZE -1) >> PAGE_SHIFT;
atomic_add(i, &mdev->pp_in_use_by_net);
atomic_sub(i, &mdev->pp_in_use);
- spin_lock_irq(&mdev->req_lock);
- list_add_tail(&e->w.list, &mdev->net_ee);
- spin_unlock_irq(&mdev->req_lock);
+ spin_lock_irq(&mdev->tconn->req_lock);
+ list_add_tail(&peer_req->w.list, &mdev->net_ee);
+ spin_unlock_irq(&mdev->tconn->req_lock);
wake_up(&drbd_pp_wait);
} else
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
}
/**
@@ -998,174 +982,177 @@
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_data_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_data_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- ok = drbd_send_block(mdev, P_DATA_REPLY, e);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_block(mdev, P_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. sector=%llus.\n",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_DREPLY, peer_req);
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ return err;
}
/**
- * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUESTRS
+ * w_e_end_rsdata_req() - Worker callback to send a P_RS_DATA_REPLY packet in response to a P_RS_DATA_REQUEST
* @mdev: DRBD device.
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_e_end_rsdata_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- int ok;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ int err;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
if (get_ldev_if_state(mdev, D_FAILED)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
if (mdev->state.conn == C_AHEAD) {
- ok = drbd_send_ack(mdev, P_RS_CANCEL, e);
- } else if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ err = drbd_send_ack(mdev, P_RS_CANCEL, peer_req);
+ } else if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
if (likely(mdev->state.pdsk >= D_INCONSISTENT)) {
inc_rs_pending(mdev);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Not sending RSDataReply, "
"partner DISKLESS!\n");
- ok = 1;
+ err = 0;
}
} else {
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegRSDReply. sector %llus.\n",
- (unsigned long long)e->sector);
+ (unsigned long long)peer_req->i.sector);
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
/* update resync data with failure */
- drbd_rs_failed_io(mdev, e->sector, e->size);
+ drbd_rs_failed_io(mdev, peer_req->i.sector, peer_req->i.size);
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block() failed\n");
- return ok;
+ return err;
}
-int w_e_end_csum_rs_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
int digest_size;
void *digest = NULL;
- int ok, eq = 0;
+ int err, eq = 0;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
/* quick hack to try to avoid a race against reconfiguration.
* a real fix would be much more involved,
* introducing more locking mechanisms */
- if (mdev->csums_tfm) {
- digest_size = crypto_hash_digestsize(mdev->csums_tfm);
+ if (mdev->tconn->csums_tfm) {
+ digest_size = crypto_hash_digestsize(mdev->tconn->csums_tfm);
D_ASSERT(digest_size == di->digest_size);
digest = kmalloc(digest_size, GFP_NOIO);
}
if (digest) {
- drbd_csum_ee(mdev, mdev->csums_tfm, e, digest);
+ drbd_csum_ee(mdev, mdev->tconn->csums_tfm, peer_req, digest);
eq = !memcmp(digest, di->digest, digest_size);
kfree(digest);
}
if (eq) {
- drbd_set_in_sync(mdev, e->sector, e->size);
+ drbd_set_in_sync(mdev, peer_req->i.sector, peer_req->i.size);
/* rs_same_csums unit is BM_BLOCK_SIZE */
- mdev->rs_same_csum += e->size >> BM_BLOCK_SHIFT;
- ok = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, e);
+ mdev->rs_same_csum += peer_req->i.size >> BM_BLOCK_SHIFT;
+ err = drbd_send_ack(mdev, P_RS_IS_IN_SYNC, peer_req);
} else {
inc_rs_pending(mdev);
- e->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
- e->flags &= ~EE_HAS_DIGEST; /* This e no longer has a digest pointer */
+ peer_req->block_id = ID_SYNCER; /* By setting block_id, digest pointer becomes invalid! */
+ peer_req->flags &= ~EE_HAS_DIGEST; /* This peer request no longer has a digest pointer */
kfree(di);
- ok = drbd_send_block(mdev, P_RS_DATA_REPLY, e);
+ err = drbd_send_block(mdev, P_RS_DATA_REPLY, peer_req);
}
} else {
- ok = drbd_send_ack(mdev, P_NEG_RS_DREPLY, e);
+ err = drbd_send_ack(mdev, P_NEG_RS_DREPLY, peer_req);
if (__ratelimit(&drbd_ratelimit_state))
dev_err(DEV, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(mdev);
- move_to_net_ee_or_free(mdev, e);
+ move_to_net_ee_or_free(mdev, peer_req);
- if (unlikely(!ok))
+ if (unlikely(err))
dev_err(DEV, "drbd_send_block/ack() failed\n");
- return ok;
+ return err;
}
-/* TODO merge common code with w_e_send_csum */
-int w_e_end_ov_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_req(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
- sector_t sector = e->sector;
- unsigned int size = e->size;
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
int digest_size;
void *digest;
- int ok = 1;
+ int err = 0;
if (unlikely(cancel))
goto out;
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+ digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (!digest) {
- ok = 0; /* terminate the connection in case the allocation failed */
+ err = 1; /* terminate the connection in case the allocation failed */
goto out;
}
- if (likely(!(e->flags & EE_WAS_ERROR)))
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+ if (likely(!(peer_req->flags & EE_WAS_ERROR)))
+ drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
else
memset(digest, 0, digest_size);
@@ -1173,25 +1160,23 @@
* In case we block on congestion, we could otherwise run into
* some distributed deadlock, if the other side blocks on
* congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
- e = NULL;
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
+ peer_req = NULL;
inc_rs_pending(mdev);
- ok = drbd_send_drequest_csum(mdev, sector, size,
- digest, digest_size,
- P_OV_REPLY);
- if (!ok)
+ err = drbd_send_drequest_csum(mdev, sector, size, digest, digest_size, P_OV_REPLY);
+ if (err)
dec_rs_pending(mdev);
kfree(digest);
out:
- if (e)
- drbd_free_ee(mdev, e);
+ if (peer_req)
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return ok;
+ return err;
}
-void drbd_ov_oos_found(struct drbd_conf *mdev, sector_t sector, int size)
+void drbd_ov_out_of_sync_found(struct drbd_conf *mdev, sector_t sector, int size)
{
if (mdev->ov_last_oos_start + mdev->ov_last_oos_size == sector) {
mdev->ov_last_oos_size += size>>9;
@@ -1202,37 +1187,38 @@
drbd_set_out_of_sync(mdev, sector, size);
}
-int w_e_end_ov_reply(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_e_end_ov_reply(struct drbd_work *w, int cancel)
{
- struct drbd_epoch_entry *e = container_of(w, struct drbd_epoch_entry, w);
+ struct drbd_peer_request *peer_req = container_of(w, struct drbd_peer_request, w);
+ struct drbd_conf *mdev = w->mdev;
struct digest_info *di;
void *digest;
- sector_t sector = e->sector;
- unsigned int size = e->size;
+ sector_t sector = peer_req->i.sector;
+ unsigned int size = peer_req->i.size;
int digest_size;
- int ok, eq = 0;
+ int err, eq = 0;
bool stop_sector_reached = false;
if (unlikely(cancel)) {
- drbd_free_ee(mdev, e);
+ drbd_free_peer_req(mdev, peer_req);
dec_unacked(mdev);
- return 1;
+ return 0;
}
/* after "cancel", because after drbd_disconnect/drbd_rs_cancel_all
* the resync lru has been cleaned up already */
if (get_ldev(mdev)) {
- drbd_rs_complete_io(mdev, e->sector);
+ drbd_rs_complete_io(mdev, peer_req->i.sector);
put_ldev(mdev);
}
- di = e->digest;
+ di = peer_req->digest;
- if (likely((e->flags & EE_WAS_ERROR) == 0)) {
- digest_size = crypto_hash_digestsize(mdev->verify_tfm);
+ if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
+ digest_size = crypto_hash_digestsize(mdev->tconn->verify_tfm);
digest = kmalloc(digest_size, GFP_NOIO);
if (digest) {
- drbd_csum_ee(mdev, mdev->verify_tfm, e, digest);
+ drbd_csum_ee(mdev, mdev->tconn->verify_tfm, peer_req, digest);
D_ASSERT(digest_size == di->digest_size);
eq = !memcmp(digest, di->digest, digest_size);
@@ -1240,19 +1226,19 @@
}
}
- /* Free e and pages before send.
- * In case we block on congestion, we could otherwise run into
- * some distributed deadlock, if the other side blocks on
- * congestion as well, because our receiver blocks in
- * drbd_pp_alloc due to pp_in_use > max_buffers. */
- drbd_free_ee(mdev, e);
+ /* Free peer_req and pages before send.
+ * In case we block on congestion, we could otherwise run into
+ * some distributed deadlock, if the other side blocks on
+ * congestion as well, because our receiver blocks in
+ * drbd_alloc_pages due to pp_in_use > max_buffers. */
+ drbd_free_peer_req(mdev, peer_req);
if (!eq)
- drbd_ov_oos_found(mdev, sector, size);
+ drbd_ov_out_of_sync_found(mdev, sector, size);
else
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
- ok = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
- eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
+ err = drbd_send_ack_ex(mdev, P_OV_RESULT, sector, size,
+ eq ? ID_IN_SYNC : ID_OUT_OF_SYNC);
dec_unacked(mdev);
@@ -1262,76 +1248,102 @@
if ((mdev->ov_left & 0x200) == 0x200)
drbd_advance_rs_marks(mdev, mdev->ov_left);
- stop_sector_reached = mdev->agreed_pro_version >= 97 &&
+ stop_sector_reached = verify_can_do_stop_sector(mdev) &&
(sector + (size>>9)) >= mdev->ov_stop_sector;
if (mdev->ov_left == 0 || stop_sector_reached) {
- ov_oos_print(mdev);
+ ov_out_of_sync_print(mdev);
drbd_resync_finished(mdev);
}
- return ok;
+ return err;
}
-int w_prev_work_done(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_prev_work_done(struct drbd_work *w, int cancel)
{
struct drbd_wq_barrier *b = container_of(w, struct drbd_wq_barrier, w);
+
complete(&b->done);
- return 1;
+ return 0;
}
-int w_send_barrier(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+/* FIXME
+ * We need to track the number of pending barrier acks,
+ * and to be able to wait for them.
+ * See also comment in drbd_adm_attach before drbd_suspend_io.
+ */
+int drbd_send_barrier(struct drbd_tconn *tconn)
{
- struct drbd_tl_epoch *b = container_of(w, struct drbd_tl_epoch, w);
- struct p_barrier *p = &mdev->data.sbuf.barrier;
- int ok = 1;
+ struct p_barrier *p;
+ struct drbd_socket *sock;
- /* really avoid racing with tl_clear. w.cb may have been referenced
- * just before it was reassigned and re-queued, so double check that.
- * actually, this race was harmless, since we only try to send the
- * barrier packet here, and otherwise do nothing with the object.
- * but compare with the head of w_clear_epoch */
- spin_lock_irq(&mdev->req_lock);
- if (w->cb != w_send_barrier || mdev->state.conn < C_CONNECTED)
- cancel = 1;
- spin_unlock_irq(&mdev->req_lock);
+ sock = &tconn->data;
+ p = conn_prepare_command(tconn, sock);
+ if (!p)
+ return -EIO;
+ p->barrier = tconn->send.current_epoch_nr;
+ p->pad = 0;
+ tconn->send.current_epoch_writes = 0;
+
+ return conn_send_command(tconn, sock, P_BARRIER, sizeof(*p), NULL, 0);
+}
+
+int w_send_write_hint(struct drbd_work *w, int cancel)
+{
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_socket *sock;
+
if (cancel)
- return 1;
-
- if (!drbd_get_data_sock(mdev))
return 0;
- p->barrier = b->br_number;
- /* inc_ap_pending was done where this was queued.
- * dec_ap_pending will be done in got_BarrierAck
- * or (on connection loss) in w_clear_epoch. */
- ok = _drbd_send_cmd(mdev, mdev->data.socket, P_BARRIER,
- (struct p_header80 *)p, sizeof(*p), 0);
- drbd_put_data_sock(mdev);
-
- return ok;
+ sock = &mdev->tconn->data;
+ if (!drbd_prepare_command(mdev, sock))
+ return -EIO;
+ return drbd_send_command(mdev, sock, P_UNPLUG_REMOTE, 0, NULL, 0);
}
-int w_send_write_hint(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void re_init_if_first_write(struct drbd_tconn *tconn, unsigned int epoch)
{
- if (cancel)
- return 1;
- return drbd_send_short_cmd(mdev, P_UNPLUG_REMOTE);
+ if (!tconn->send.seen_any_write_yet) {
+ tconn->send.seen_any_write_yet = true;
+ tconn->send.current_epoch_nr = epoch;
+ tconn->send.current_epoch_writes = 0;
+ }
}
-int w_send_oos(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+static void maybe_send_barrier(struct drbd_tconn *tconn, unsigned int epoch)
+{
+ /* re-init if first write on this connection */
+ if (!tconn->send.seen_any_write_yet)
+ return;
+ if (tconn->send.current_epoch_nr != epoch) {
+ if (tconn->send.current_epoch_writes)
+ drbd_send_barrier(tconn);
+ tconn->send.current_epoch_nr = epoch;
+ }
+}
+
+int w_send_out_of_sync(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_oos(mdev, req);
- req_mod(req, oos_handed_to_network);
+ /* this time, no tconn->send.current_epoch_writes++;
+ * If it was sent, it was the closing barrier for the last
+ * replicated epoch, before we went into AHEAD mode.
+ * No more barriers will be sent, until we leave AHEAD mode again. */
+ maybe_send_barrier(tconn, req->epoch);
- return ok;
+ err = drbd_send_out_of_sync(mdev, req);
+ req_mod(req, OOS_HANDED_TO_NETWORK);
+
+ return err;
}
/**
@@ -1340,20 +1352,26 @@
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_dblock(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_dblock(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_dblock(mdev, req);
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ re_init_if_first_write(tconn, req->epoch);
+ maybe_send_barrier(tconn, req->epoch);
+ tconn->send.current_epoch_writes++;
- return ok;
+ err = drbd_send_dblock(mdev, req);
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
}
/**
@@ -1362,57 +1380,61 @@
* @w: work object.
* @cancel: The connection will be closed anyways
*/
-int w_send_read_req(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_send_read_req(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
- int ok;
+ struct drbd_conf *mdev = w->mdev;
+ struct drbd_tconn *tconn = mdev->tconn;
+ int err;
if (unlikely(cancel)) {
- req_mod(req, send_canceled);
- return 1;
+ req_mod(req, SEND_CANCELED);
+ return 0;
}
- ok = drbd_send_drequest(mdev, P_DATA_REQUEST, req->sector, req->size,
- (unsigned long)req);
+ /* Even read requests may close a write epoch,
+ * if there was any yet. */
+ maybe_send_barrier(tconn, req->epoch);
- if (!ok) {
- /* ?? we set C_TIMEOUT or C_BROKEN_PIPE in drbd_send();
- * so this is probably redundant */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev, NS(conn, C_NETWORK_FAILURE));
- }
- req_mod(req, ok ? handed_over_to_network : send_failed);
+ err = drbd_send_drequest(mdev, P_DATA_REQUEST, req->i.sector, req->i.size,
+ (unsigned long)req);
- return ok;
+ req_mod(req, err ? SEND_FAILED : HANDED_OVER_TO_NETWORK);
+
+ return err;
}
-int w_restart_disk_io(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
+int w_restart_disk_io(struct drbd_work *w, int cancel)
{
struct drbd_request *req = container_of(w, struct drbd_request, w);
+ struct drbd_conf *mdev = w->mdev;
if (bio_data_dir(req->master_bio) == WRITE && req->rq_state & RQ_IN_ACT_LOG)
- drbd_al_begin_io(mdev, req->sector);
- /* Calling drbd_al_begin_io() out of the worker might deadlocks
- theoretically. Practically it can not deadlock, since this is
- only used when unfreezing IOs. All the extents of the requests
- that made it into the TL are already active */
+ drbd_al_begin_io(mdev, &req->i);
drbd_req_make_private_bio(req, req->master_bio);
req->private_bio->bi_bdev = mdev->ldev->backing_bdev;
generic_make_request(req->private_bio);
- return 1;
+ return 0;
}
static int _drbd_may_sync_now(struct drbd_conf *mdev)
{
struct drbd_conf *odev = mdev;
+ int resync_after;
while (1) {
- if (odev->sync_conf.after == -1)
+ if (!odev->ldev)
return 1;
- odev = minor_to_mdev(odev->sync_conf.after);
- ERR_IF(!odev) return 1;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
+ if (resync_after == -1)
+ return 1;
+ odev = minor_to_mdev(resync_after);
+ if (!expect(odev))
+ return 1;
if ((odev->state.conn >= C_SYNC_SOURCE &&
odev->state.conn <= C_PAUSED_SYNC_T) ||
odev->state.aftr_isp || odev->state.peer_isp ||
@@ -1432,16 +1454,15 @@
struct drbd_conf *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (!_drbd_may_sync_now(odev))
rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL)
!= SS_NOTHING_TO_DO);
}
+ rcu_read_unlock();
return rv;
}
@@ -1457,10 +1478,8 @@
struct drbd_conf *odev;
int i, rv = 0;
- for (i = 0; i < minor_count; i++) {
- odev = minor_to_mdev(i);
- if (!odev)
- continue;
+ rcu_read_lock();
+ idr_for_each_entry(&minors, odev, i) {
if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS)
continue;
if (odev->state.aftr_isp) {
@@ -1470,6 +1489,7 @@
!= SS_NOTHING_TO_DO) ;
}
}
+ rcu_read_unlock();
return rv;
}
@@ -1487,57 +1507,86 @@
write_unlock_irq(&global_state_lock);
}
-static int sync_after_error(struct drbd_conf *mdev, int o_minor)
+/* caller must hold global_state_lock */
+enum drbd_ret_code drbd_resync_after_valid(struct drbd_conf *mdev, int o_minor)
{
struct drbd_conf *odev;
+ int resync_after;
if (o_minor == -1)
return NO_ERROR;
if (o_minor < -1 || minor_to_mdev(o_minor) == NULL)
- return ERR_SYNC_AFTER;
+ return ERR_RESYNC_AFTER;
/* check for loops */
odev = minor_to_mdev(o_minor);
while (1) {
if (odev == mdev)
- return ERR_SYNC_AFTER_CYCLE;
+ return ERR_RESYNC_AFTER_CYCLE;
+ rcu_read_lock();
+ resync_after = rcu_dereference(odev->ldev->disk_conf)->resync_after;
+ rcu_read_unlock();
/* dependency chain ends here, no cycles. */
- if (odev->sync_conf.after == -1)
+ if (resync_after == -1)
return NO_ERROR;
/* follow the dependency chain */
- odev = minor_to_mdev(odev->sync_conf.after);
+ odev = minor_to_mdev(resync_after);
}
}
-int drbd_alter_sa(struct drbd_conf *mdev, int na)
+/* caller must hold global_state_lock */
+void drbd_resync_after_changed(struct drbd_conf *mdev)
{
int changes;
- int retcode;
- write_lock_irq(&global_state_lock);
- retcode = sync_after_error(mdev, na);
- if (retcode == NO_ERROR) {
- mdev->sync_conf.after = na;
- do {
- changes = _drbd_pause_after(mdev);
- changes |= _drbd_resume_next(mdev);
- } while (changes);
- }
- write_unlock_irq(&global_state_lock);
- return retcode;
+ do {
+ changes = _drbd_pause_after(mdev);
+ changes |= _drbd_resume_next(mdev);
+ } while (changes);
}
void drbd_rs_controller_reset(struct drbd_conf *mdev)
{
+ struct fifo_buffer *plan;
+
atomic_set(&mdev->rs_sect_in, 0);
atomic_set(&mdev->rs_sect_ev, 0);
mdev->rs_in_flight = 0;
- mdev->rs_planed = 0;
- spin_lock(&mdev->peer_seq_lock);
- fifo_set(&mdev->rs_plan_s, 0);
- spin_unlock(&mdev->peer_seq_lock);
+
+ /* Updating the RCU protected object in place is necessary since
+ this function gets called from atomic context.
+ It is valid since all other updates also lead to an completely
+ empty fifo */
+ rcu_read_lock();
+ plan = rcu_dereference(mdev->rs_plan_s);
+ plan->total = 0;
+ fifo_set(plan, 0);
+ rcu_read_unlock();
+}
+
+void start_resync_timer_fn(unsigned long data)
+{
+ struct drbd_conf *mdev = (struct drbd_conf *) data;
+
+ drbd_queue_work(&mdev->tconn->sender_work, &mdev->start_resync_work);
+}
+
+int w_start_resync(struct drbd_work *w, int cancel)
+{
+ struct drbd_conf *mdev = w->mdev;
+
+ if (atomic_read(&mdev->unacked_cnt) || atomic_read(&mdev->rs_pending_cnt)) {
+ dev_warn(DEV, "w_start_resync later...\n");
+ mdev->start_resync_timer.expires = jiffies + HZ/10;
+ add_timer(&mdev->start_resync_timer);
+ return 0;
+ }
+
+ drbd_start_resync(mdev, C_SYNC_SOURCE);
+ clear_bit(AHEAD_TO_SYNC_SOURCE, &mdev->flags);
+ return 0;
}
/**
@@ -1558,43 +1607,58 @@
return;
}
- if (side == C_SYNC_TARGET) {
- /* Since application IO was locked out during C_WF_BITMAP_T and
- C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
- we check that we might make the data inconsistent. */
- r = drbd_khelper(mdev, "before-resync-target");
- r = (r >> 8) & 0xff;
- if (r > 0) {
- dev_info(DEV, "before-resync-target handler returned %d, "
- "dropping connection.\n", r);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
- return;
- }
- } else /* C_SYNC_SOURCE */ {
- r = drbd_khelper(mdev, "before-resync-source");
- r = (r >> 8) & 0xff;
- if (r > 0) {
- if (r == 3) {
- dev_info(DEV, "before-resync-source handler returned %d, "
- "ignoring. Old userland tools?", r);
- } else {
- dev_info(DEV, "before-resync-source handler returned %d, "
+ if (!test_bit(B_RS_H_DONE, &mdev->flags)) {
+ if (side == C_SYNC_TARGET) {
+ /* Since application IO was locked out during C_WF_BITMAP_T and
+ C_WF_SYNC_UUID we are still unmodified. Before going to C_SYNC_TARGET
+ we check that we might make the data inconsistent. */
+ r = drbd_khelper(mdev, "before-resync-target");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ dev_info(DEV, "before-resync-target handler returned %d, "
"dropping connection.\n", r);
- drbd_force_state(mdev, NS(conn, C_DISCONNECTING));
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
return;
}
+ } else /* C_SYNC_SOURCE */ {
+ r = drbd_khelper(mdev, "before-resync-source");
+ r = (r >> 8) & 0xff;
+ if (r > 0) {
+ if (r == 3) {
+ dev_info(DEV, "before-resync-source handler returned %d, "
+ "ignoring. Old userland tools?", r);
+ } else {
+ dev_info(DEV, "before-resync-source handler returned %d, "
+ "dropping connection.\n", r);
+ conn_request_state(mdev->tconn, NS(conn, C_DISCONNECTING), CS_HARD);
+ return;
+ }
+ }
}
}
- drbd_state_lock(mdev);
+ if (current == mdev->tconn->worker.task) {
+ /* The worker should not sleep waiting for state_mutex,
+ that can take long */
+ if (!mutex_trylock(mdev->state_mutex)) {
+ set_bit(B_RS_H_DONE, &mdev->flags);
+ mdev->start_resync_timer.expires = jiffies + HZ/5;
+ add_timer(&mdev->start_resync_timer);
+ return;
+ }
+ } else {
+ mutex_lock(mdev->state_mutex);
+ }
+ clear_bit(B_RS_H_DONE, &mdev->flags);
+
write_lock_irq(&global_state_lock);
if (!get_ldev_if_state(mdev, D_NEGOTIATING)) {
write_unlock_irq(&global_state_lock);
- drbd_state_unlock(mdev);
+ mutex_unlock(mdev->state_mutex);
return;
}
- ns.i = mdev->state.i;
+ ns = drbd_read_state(mdev);
ns.aftr_isp = !_drbd_may_sync_now(mdev);
@@ -1606,7 +1670,7 @@
ns.pdsk = D_INCONSISTENT;
r = __drbd_set_state(mdev, ns, CS_VERBOSE, NULL);
- ns = mdev->state;
+ ns = drbd_read_state(mdev);
if (ns.conn < C_CONNECTED)
r = SS_UNKNOWN_ERROR;
@@ -1632,6 +1696,10 @@
write_unlock_irq(&global_state_lock);
if (r == SS_SUCCESS) {
+ /* reset rs_last_bcast when a resync or verify is started,
+ * to deal with potential jiffies wrap. */
+ mdev->rs_last_bcast = jiffies - HZ;
+
dev_info(DEV, "Began resync as %s (will sync %lu KB [%lu bits set]).\n",
drbd_conn_str(ns.conn),
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
@@ -1646,10 +1714,10 @@
* drbd_resync_finished from here in that case.
* We drbd_gen_and_send_sync_uuid here for protocol < 96,
* and from after_state_ch otherwise. */
- if (side == C_SYNC_SOURCE && mdev->agreed_pro_version < 96)
+ if (side == C_SYNC_SOURCE && mdev->tconn->agreed_pro_version < 96)
drbd_gen_and_send_sync_uuid(mdev);
- if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
+ if (mdev->tconn->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
@@ -1660,10 +1728,16 @@
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
- if (side == C_SYNC_SOURCE)
- schedule_timeout_interruptible(
- mdev->net_conf->ping_int * HZ +
- mdev->net_conf->ping_timeo*HZ/9);
+ if (side == C_SYNC_SOURCE) {
+ struct net_conf *nc;
+ int timeo;
+
+ rcu_read_lock();
+ nc = rcu_dereference(mdev->tconn->net_conf);
+ timeo = nc->ping_int * HZ + nc->ping_timeo * HZ / 9;
+ rcu_read_unlock();
+ schedule_timeout_interruptible(timeo);
+ }
drbd_resync_finished(mdev);
}
@@ -1678,114 +1752,180 @@
drbd_md_sync(mdev);
}
put_ldev(mdev);
- drbd_state_unlock(mdev);
+ mutex_unlock(mdev->state_mutex);
+}
+
+/* If the resource already closed the current epoch, but we did not
+ * (because we have not yet seen new requests), we should send the
+ * corresponding barrier now. Must be checked within the same spinlock
+ * that is used to check for new requests. */
+bool need_to_send_barrier(struct drbd_tconn *connection)
+{
+ if (!connection->send.seen_any_write_yet)
+ return false;
+
+ /* Skip barriers that do not contain any writes.
+ * This may happen during AHEAD mode. */
+ if (!connection->send.current_epoch_writes)
+ return false;
+
+ /* ->req_lock is held when requests are queued on
+ * connection->sender_work, and put into ->transfer_log.
+ * It is also held when ->current_tle_nr is increased.
+ * So either there are already new requests queued,
+ * and corresponding barriers will be send there.
+ * Or nothing new is queued yet, so the difference will be 1.
+ */
+ if (atomic_read(&connection->current_tle_nr) !=
+ connection->send.current_epoch_nr + 1)
+ return false;
+
+ return true;
+}
+
+bool dequeue_work_batch(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ list_splice_init(&queue->q, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
+
+bool dequeue_work_item(struct drbd_work_queue *queue, struct list_head *work_list)
+{
+ spin_lock_irq(&queue->q_lock);
+ if (!list_empty(&queue->q))
+ list_move(queue->q.next, work_list);
+ spin_unlock_irq(&queue->q_lock);
+ return !list_empty(work_list);
+}
+
+void wait_for_work(struct drbd_tconn *connection, struct list_head *work_list)
+{
+ DEFINE_WAIT(wait);
+ struct net_conf *nc;
+ int uncork, cork;
+
+ dequeue_work_item(&connection->sender_work, work_list);
+ if (!list_empty(work_list))
+ return;
+
+ /* Still nothing to do?
+ * Maybe we still need to close the current epoch,
+ * even if no new requests are queued yet.
+ *
+ * Also, poke TCP, just in case.
+ * Then wait for new work (or signal). */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ uncork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ if (uncork) {
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket)
+ drbd_tcp_uncork(connection->data.socket);
+ mutex_unlock(&connection->data.mutex);
+ }
+
+ for (;;) {
+ int send_barrier;
+ prepare_to_wait(&connection->sender_work.q_wait, &wait, TASK_INTERRUPTIBLE);
+ spin_lock_irq(&connection->req_lock);
+ spin_lock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ /* dequeue single item only,
+ * we still use drbd_queue_work_front() in some places */
+ if (!list_empty(&connection->sender_work.q))
+ list_move(connection->sender_work.q.next, work_list);
+ spin_unlock(&connection->sender_work.q_lock); /* FIXME get rid of this one? */
+ if (!list_empty(work_list) || signal_pending(current)) {
+ spin_unlock_irq(&connection->req_lock);
+ break;
+ }
+ send_barrier = need_to_send_barrier(connection);
+ spin_unlock_irq(&connection->req_lock);
+ if (send_barrier) {
+ drbd_send_barrier(connection);
+ connection->send.current_epoch_nr++;
+ }
+ schedule();
+ /* may be woken up for other things but new work, too,
+ * e.g. if the current epoch got closed.
+ * In which case we send the barrier above. */
+ }
+ finish_wait(&connection->sender_work.q_wait, &wait);
+
+ /* someone may have changed the config while we have been waiting above. */
+ rcu_read_lock();
+ nc = rcu_dereference(connection->net_conf);
+ cork = nc ? nc->tcp_cork : 0;
+ rcu_read_unlock();
+ mutex_lock(&connection->data.mutex);
+ if (connection->data.socket) {
+ if (cork)
+ drbd_tcp_cork(connection->data.socket);
+ else if (!uncork)
+ drbd_tcp_uncork(connection->data.socket);
+ }
+ mutex_unlock(&connection->data.mutex);
}
int drbd_worker(struct drbd_thread *thi)
{
- struct drbd_conf *mdev = thi->mdev;
+ struct drbd_tconn *tconn = thi->tconn;
struct drbd_work *w = NULL;
+ struct drbd_conf *mdev;
LIST_HEAD(work_list);
- int intr = 0, i;
+ int vnr;
- sprintf(current->comm, "drbd%d_worker", mdev_to_minor(mdev));
+ while (get_t_state(thi) == RUNNING) {
+ drbd_thread_current_set_cpu(thi);
- while (get_t_state(thi) == Running) {
- drbd_thread_current_set_cpu(mdev);
+ /* as long as we use drbd_queue_work_front(),
+ * we may only dequeue single work items here, not batches. */
+ if (list_empty(&work_list))
+ wait_for_work(tconn, &work_list);
- if (down_trylock(&mdev->data.work.s)) {
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_uncork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
-
- intr = down_interruptible(&mdev->data.work.s);
-
- mutex_lock(&mdev->data.mutex);
- if (mdev->data.socket && !mdev->net_conf->no_cork)
- drbd_tcp_cork(mdev->data.socket);
- mutex_unlock(&mdev->data.mutex);
- }
-
- if (intr) {
- D_ASSERT(intr == -EINTR);
+ if (signal_pending(current)) {
flush_signals(current);
- ERR_IF (get_t_state(thi) == Running)
+ if (get_t_state(thi) == RUNNING) {
+ conn_warn(tconn, "Worker got an unexpected signal\n");
continue;
+ }
break;
}
- if (get_t_state(thi) != Running)
+ if (get_t_state(thi) != RUNNING)
break;
- /* With this break, we have done a down() but not consumed
- the entry from the list. The cleanup code takes care of
- this... */
-
- w = NULL;
- spin_lock_irq(&mdev->data.work.q_lock);
- ERR_IF(list_empty(&mdev->data.work.q)) {
- /* something terribly wrong in our logic.
- * we were able to down() the semaphore,
- * but the list is empty... doh.
- *
- * what is the best thing to do now?
- * try again from scratch, restarting the receiver,
- * asender, whatnot? could break even more ugly,
- * e.g. when we are primary, but no good local data.
- *
- * I'll try to get away just starting over this loop.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
- continue;
- }
- w = list_entry(mdev->data.work.q.next, struct drbd_work, list);
- list_del_init(&w->list);
- spin_unlock_irq(&mdev->data.work.q_lock);
-
- if (!w->cb(mdev, w, mdev->state.conn < C_CONNECTED)) {
- /* dev_warn(DEV, "worker: a callback failed! \n"); */
- if (mdev->state.conn >= C_CONNECTED)
- drbd_force_state(mdev,
- NS(conn, C_NETWORK_FAILURE));
- }
- }
- D_ASSERT(drbd_test_flag(mdev, DEVICE_DYING));
- D_ASSERT(drbd_test_flag(mdev, CONFIG_PENDING));
-
- spin_lock_irq(&mdev->data.work.q_lock);
- i = 0;
- while (!list_empty(&mdev->data.work.q)) {
- list_splice_init(&mdev->data.work.q, &work_list);
- spin_unlock_irq(&mdev->data.work.q_lock);
while (!list_empty(&work_list)) {
- w = list_entry(work_list.next, struct drbd_work, list);
+ w = list_first_entry(&work_list, struct drbd_work, list);
list_del_init(&w->list);
- w->cb(mdev, w, 1);
- i++; /* dead debugging code */
+ if (w->cb(w, tconn->cstate < C_WF_REPORT_PARAMS) == 0)
+ continue;
+ if (tconn->cstate >= C_WF_REPORT_PARAMS)
+ conn_request_state(tconn, NS(conn, C_NETWORK_FAILURE), CS_HARD);
}
-
- spin_lock_irq(&mdev->data.work.q_lock);
}
- sema_init(&mdev->data.work.s, 0);
- /* DANGEROUS race: if someone did queue his work within the spinlock,
- * but up() ed outside the spinlock, we could get an up() on the
- * semaphore without corresponding list entry.
- * So don't do that.
- */
- spin_unlock_irq(&mdev->data.work.q_lock);
- D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
- /* _drbd_set_state only uses stop_nowait.
- * wait here for the Exiting receiver. */
- drbd_thread_stop(&mdev->receiver);
- drbd_mdev_cleanup(mdev);
+ do {
+ while (!list_empty(&work_list)) {
+ w = list_first_entry(&work_list, struct drbd_work, list);
+ list_del_init(&w->list);
+ w->cb(w, 1);
+ }
+ dequeue_work_batch(&tconn->sender_work, &work_list);
+ } while (!list_empty(&work_list));
- dev_info(DEV, "worker terminated\n");
-
- drbd_clear_flag(mdev, DEVICE_DYING);
- drbd_clear_flag(mdev, CONFIG_PENDING);
- wake_up(&mdev->state_wait);
+ rcu_read_lock();
+ idr_for_each_entry(&tconn->volumes, mdev, vnr) {
+ D_ASSERT(mdev->state.disk == D_DISKLESS && mdev->state.conn == C_STANDALONE);
+ kref_get(&mdev->kref);
+ rcu_read_unlock();
+ drbd_mdev_cleanup(mdev);
+ kref_put(&mdev->kref, &drbd_minor_destroy);
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
return 0;
}
diff --git a/drivers/block/drbd/drbd_wrappers.h b/drivers/block/drbd/drbd_wrappers.h
index 151f1a3..328f18e 100644
--- a/drivers/block/drbd/drbd_wrappers.h
+++ b/drivers/block/drbd/drbd_wrappers.h
@@ -3,6 +3,7 @@
#include <linux/ctype.h>
#include <linux/mm.h>
+#include "drbd_int.h"
/* see get_sb_bdev and bd_claim */
extern char *drbd_sec_holder;
@@ -20,8 +21,8 @@
/* bi_end_io handlers */
extern void drbd_md_io_complete(struct bio *bio, int error);
-extern void drbd_endio_sec(struct bio *bio, int error);
-extern void drbd_endio_pri(struct bio *bio, int error);
+extern void drbd_peer_request_endio(struct bio *bio, int error);
+extern void drbd_request_endio(struct bio *bio, int error);
/*
* used to submit our private bio
@@ -45,12 +46,6 @@
generic_make_request(bio);
}
-static inline int drbd_crypto_is_hash(struct crypto_tfm *tfm)
-{
- return (crypto_tfm_alg_type(tfm) & CRYPTO_ALG_TYPE_HASH_MASK)
- == CRYPTO_ALG_TYPE_HASH;
-}
-
#ifndef __CHECKER__
# undef __cond_lock
# define __cond_lock(x,c) (c)
diff --git a/include/linux/drbd.h b/include/linux/drbd.h
index 94f58a1..0c5a18e 100644
--- a/include/linux/drbd.h
+++ b/include/linux/drbd.h
@@ -51,12 +51,11 @@
#endif
-
extern const char *drbd_buildtag(void);
-#define REL_VERSION "8.3.14"
-#define API_VERSION 88
+#define REL_VERSION "8.4.2"
+#define API_VERSION 1
#define PRO_VERSION_MIN 86
-#define PRO_VERSION_MAX 97
+#define PRO_VERSION_MAX 101
enum drbd_io_error_p {
@@ -66,7 +65,8 @@
};
enum drbd_fencing_p {
- FP_DONT_CARE,
+ FP_NOT_AVAIL = -1, /* Not a policy */
+ FP_DONT_CARE = 0,
FP_RESOURCE,
FP_STONITH
};
@@ -102,6 +102,20 @@
OC_DISCONNECT,
};
+enum drbd_read_balancing {
+ RB_PREFER_LOCAL,
+ RB_PREFER_REMOTE,
+ RB_ROUND_ROBIN,
+ RB_LEAST_PENDING,
+ RB_CONGESTED_REMOTE,
+ RB_32K_STRIPING,
+ RB_64K_STRIPING,
+ RB_128K_STRIPING,
+ RB_256K_STRIPING,
+ RB_512K_STRIPING,
+ RB_1M_STRIPING,
+};
+
/* KEEP the order, do not delete or insert. Only append. */
enum drbd_ret_code {
ERR_CODE_BASE = 100,
@@ -122,7 +136,7 @@
ERR_AUTH_ALG = 120,
ERR_AUTH_ALG_ND = 121,
ERR_NOMEM = 122,
- ERR_DISCARD = 123,
+ ERR_DISCARD_IMPOSSIBLE = 123,
ERR_DISK_CONFIGURED = 124,
ERR_NET_CONFIGURED = 125,
ERR_MANDATORY_TAG = 126,
@@ -130,8 +144,8 @@
ERR_INTR = 129, /* EINTR */
ERR_RESIZE_RESYNC = 130,
ERR_NO_PRIMARY = 131,
- ERR_SYNC_AFTER = 132,
- ERR_SYNC_AFTER_CYCLE = 133,
+ ERR_RESYNC_AFTER = 132,
+ ERR_RESYNC_AFTER_CYCLE = 133,
ERR_PAUSE_IS_SET = 134,
ERR_PAUSE_IS_CLEAR = 135,
ERR_PACKET_NR = 137,
@@ -155,6 +169,14 @@
ERR_CONG_NOT_PROTO_A = 155,
ERR_PIC_AFTER_DEP = 156,
ERR_PIC_PEER_DEP = 157,
+ ERR_RES_NOT_KNOWN = 158,
+ ERR_RES_IN_USE = 159,
+ ERR_MINOR_CONFIGURED = 160,
+ ERR_MINOR_EXISTS = 161,
+ ERR_INVALID_REQUEST = 162,
+ ERR_NEED_APV_100 = 163,
+ ERR_NEED_ALLOW_TWO_PRI = 164,
+ ERR_MD_UNCLEAN = 165,
/* insert new ones above this line */
AFTER_LAST_ERR_CODE
@@ -296,7 +318,8 @@
SS_NOT_SUPPORTED = -17, /* drbd-8.2 only */
SS_IN_TRANSIENT_STATE = -18, /* Retry after the next state change */
SS_CONCURRENT_ST_CHG = -19, /* Concurrent cluster side state change! */
- SS_AFTER_LAST_ERROR = -20, /* Keep this at bottom */
+ SS_O_VOL_PEER_PRI = -20,
+ SS_AFTER_LAST_ERROR = -21, /* Keep this at bottom */
};
/* from drbd_strings.c */
@@ -313,7 +336,9 @@
#define MDF_FULL_SYNC (1 << 3)
#define MDF_WAS_UP_TO_DATE (1 << 4)
#define MDF_PEER_OUT_DATED (1 << 5)
-#define MDF_CRASHED_PRIMARY (1 << 6)
+#define MDF_CRASHED_PRIMARY (1 << 6)
+#define MDF_AL_CLEAN (1 << 7)
+#define MDF_AL_DISABLED (1 << 8)
enum drbd_uuid_index {
UI_CURRENT,
@@ -333,37 +358,23 @@
#define UUID_JUST_CREATED ((__u64)4)
+/* magic numbers used in meta data and network packets */
#define DRBD_MAGIC 0x83740267
-#define BE_DRBD_MAGIC __constant_cpu_to_be32(DRBD_MAGIC)
#define DRBD_MAGIC_BIG 0x835a
-#define BE_DRBD_MAGIC_BIG __constant_cpu_to_be16(DRBD_MAGIC_BIG)
+#define DRBD_MAGIC_100 0x8620ec20
+
+#define DRBD_MD_MAGIC_07 (DRBD_MAGIC+3)
+#define DRBD_MD_MAGIC_08 (DRBD_MAGIC+4)
+#define DRBD_MD_MAGIC_84_UNCLEAN (DRBD_MAGIC+5)
+
+
+/* how I came up with this magic?
+ * base64 decode "actlog==" ;) */
+#define DRBD_AL_MAGIC 0x69cb65a2
/* these are of type "int" */
#define DRBD_MD_INDEX_INTERNAL -1
#define DRBD_MD_INDEX_FLEX_EXT -2
#define DRBD_MD_INDEX_FLEX_INT -3
-/* Start of the new netlink/connector stuff */
-
-#define DRBD_NL_CREATE_DEVICE 0x01
-#define DRBD_NL_SET_DEFAULTS 0x02
-
-
-/* For searching a vacant cn_idx value */
-#define CN_IDX_STEP 6977
-
-struct drbd_nl_cfg_req {
- int packet_type;
- unsigned int drbd_minor;
- int flags;
- unsigned short tag_list[];
-};
-
-struct drbd_nl_cfg_reply {
- int packet_type;
- unsigned int minor;
- int ret_code; /* enum ret_code or set_st_err_t */
- unsigned short tag_list[]; /* only used with get_* calls */
-};
-
#endif
diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h
new file mode 100644
index 0000000..d0d8fac
--- /dev/null
+++ b/include/linux/drbd_genl.h
@@ -0,0 +1,378 @@
+/*
+ * General overview:
+ * full generic netlink message:
+ * |nlmsghdr|genlmsghdr|<payload>
+ *
+ * payload:
+ * |optional fixed size family header|<sequence of netlink attributes>
+ *
+ * sequence of netlink attributes:
+ * I chose to have all "top level" attributes NLA_NESTED,
+ * corresponding to some real struct.
+ * So we have a sequence of |tla, len|<nested nla sequence>
+ *
+ * nested nla sequence:
+ * may be empty, or contain a sequence of netlink attributes
+ * representing the struct fields.
+ *
+ * The tag number of any field (regardless of containing struct)
+ * will be available as T_ ## field_name,
+ * so you cannot have the same field name in two differnt structs.
+ *
+ * The tag numbers themselves are per struct, though,
+ * so should always begin at 1 (not 0, that is the special "NLA_UNSPEC" type,
+ * which we won't use here).
+ * The tag numbers are used as index in the respective nla_policy array.
+ *
+ * GENL_struct(tag_name, tag_number, struct name, struct fields) - struct and policy
+ * genl_magic_struct.h
+ * generates the struct declaration,
+ * generates an entry in the tla enum,
+ * genl_magic_func.h
+ * generates an entry in the static tla policy
+ * with .type = NLA_NESTED
+ * generates the static <struct_name>_nl_policy definition,
+ * and static conversion functions
+ *
+ * genl_magic_func.h
+ *
+ * GENL_mc_group(group)
+ * genl_magic_struct.h
+ * does nothing
+ * genl_magic_func.h
+ * defines and registers the mcast group,
+ * and provides a send helper
+ *
+ * GENL_notification(op_name, op_num, mcast_group, tla list)
+ * These are notifications to userspace.
+ *
+ * genl_magic_struct.h
+ * generates an entry in the genl_ops enum,
+ * genl_magic_func.h
+ * does nothing
+ *
+ * mcast group: the name of the mcast group this notification should be
+ * expected on
+ * tla list: the list of expected top level attributes,
+ * for documentation and sanity checking.
+ *
+ * GENL_op(op_name, op_num, flags and handler, tla list) - "genl operations"
+ * These are requests from userspace.
+ *
+ * _op and _notification share the same "number space",
+ * op_nr will be assigned to "genlmsghdr->cmd"
+ *
+ * genl_magic_struct.h
+ * generates an entry in the genl_ops enum,
+ * genl_magic_func.h
+ * generates an entry in the static genl_ops array,
+ * and static register/unregister functions to
+ * genl_register_family_with_ops().
+ *
+ * flags and handler:
+ * GENL_op_init( .doit = x, .dumpit = y, .flags = something)
+ * GENL_doit(x) => .dumpit = NULL, .flags = GENL_ADMIN_PERM
+ * tla list: the list of expected top level attributes,
+ * for documentation and sanity checking.
+ */
+
+/*
+ * STRUCTS
+ */
+
+/* this is sent kernel -> userland on various error conditions, and contains
+ * informational textual info, which is supposedly human readable.
+ * The computer relevant return code is in the drbd_genlmsghdr.
+ */
+GENL_struct(DRBD_NLA_CFG_REPLY, 1, drbd_cfg_reply,
+ /* "arbitrary" size strings, nla_policy.len = 0 */
+ __str_field(1, DRBD_GENLA_F_MANDATORY, info_text, 0)
+)
+
+/* Configuration requests typically need a context to operate on.
+ * Possible keys are device minor (fits in the drbd_genlmsghdr),
+ * the replication link (aka connection) name,
+ * and/or the replication group (aka resource) name,
+ * and the volume id within the resource. */
+GENL_struct(DRBD_NLA_CFG_CONTEXT, 2, drbd_cfg_context,
+ __u32_field(1, DRBD_GENLA_F_MANDATORY, ctx_volume)
+ __str_field(2, DRBD_GENLA_F_MANDATORY, ctx_resource_name, 128)
+ __bin_field(3, DRBD_GENLA_F_MANDATORY, ctx_my_addr, 128)
+ __bin_field(4, DRBD_GENLA_F_MANDATORY, ctx_peer_addr, 128)
+)
+
+GENL_struct(DRBD_NLA_DISK_CONF, 3, disk_conf,
+ __str_field(1, DRBD_F_REQUIRED | DRBD_F_INVARIANT, backing_dev, 128)
+ __str_field(2, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev, 128)
+ __s32_field(3, DRBD_F_REQUIRED | DRBD_F_INVARIANT, meta_dev_idx)
+
+ /* use the resize command to try and change the disk_size */
+ __u64_field(4, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, disk_size)
+ /* we could change the max_bio_bvecs,
+ * but it won't propagate through the stack */
+ __u32_field(5, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, max_bio_bvecs)
+
+ __u32_field_def(6, DRBD_GENLA_F_MANDATORY, on_io_error, DRBD_ON_IO_ERROR_DEF)
+ __u32_field_def(7, DRBD_GENLA_F_MANDATORY, fencing, DRBD_FENCING_DEF)
+
+ __u32_field_def(8, DRBD_GENLA_F_MANDATORY, resync_rate, DRBD_RESYNC_RATE_DEF)
+ __s32_field_def(9, DRBD_GENLA_F_MANDATORY, resync_after, DRBD_MINOR_NUMBER_DEF)
+ __u32_field_def(10, DRBD_GENLA_F_MANDATORY, al_extents, DRBD_AL_EXTENTS_DEF)
+ __u32_field_def(11, DRBD_GENLA_F_MANDATORY, c_plan_ahead, DRBD_C_PLAN_AHEAD_DEF)
+ __u32_field_def(12, DRBD_GENLA_F_MANDATORY, c_delay_target, DRBD_C_DELAY_TARGET_DEF)
+ __u32_field_def(13, DRBD_GENLA_F_MANDATORY, c_fill_target, DRBD_C_FILL_TARGET_DEF)
+ __u32_field_def(14, DRBD_GENLA_F_MANDATORY, c_max_rate, DRBD_C_MAX_RATE_DEF)
+ __u32_field_def(15, DRBD_GENLA_F_MANDATORY, c_min_rate, DRBD_C_MIN_RATE_DEF)
+
+ __flg_field_def(16, DRBD_GENLA_F_MANDATORY, disk_barrier, DRBD_DISK_BARRIER_DEF)
+ __flg_field_def(17, DRBD_GENLA_F_MANDATORY, disk_flushes, DRBD_DISK_FLUSHES_DEF)
+ __flg_field_def(18, DRBD_GENLA_F_MANDATORY, disk_drain, DRBD_DISK_DRAIN_DEF)
+ __flg_field_def(19, DRBD_GENLA_F_MANDATORY, md_flushes, DRBD_MD_FLUSHES_DEF)
+ __u32_field_def(20, DRBD_GENLA_F_MANDATORY, disk_timeout, DRBD_DISK_TIMEOUT_DEF)
+ __u32_field_def(21, 0 /* OPTIONAL */, read_balancing, DRBD_READ_BALANCING_DEF)
+ /* 9: __u32_field_def(22, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF) */
+ __flg_field_def(23, 0 /* OPTIONAL */, al_updates, DRBD_AL_UPDATES_DEF)
+)
+
+GENL_struct(DRBD_NLA_RESOURCE_OPTS, 4, res_opts,
+ __str_field_def(1, DRBD_GENLA_F_MANDATORY, cpu_mask, 32)
+ __u32_field_def(2, DRBD_GENLA_F_MANDATORY, on_no_data, DRBD_ON_NO_DATA_DEF)
+)
+
+GENL_struct(DRBD_NLA_NET_CONF, 5, net_conf,
+ __str_field_def(1, DRBD_GENLA_F_MANDATORY | DRBD_F_SENSITIVE,
+ shared_secret, SHARED_SECRET_MAX)
+ __str_field_def(2, DRBD_GENLA_F_MANDATORY, cram_hmac_alg, SHARED_SECRET_MAX)
+ __str_field_def(3, DRBD_GENLA_F_MANDATORY, integrity_alg, SHARED_SECRET_MAX)
+ __str_field_def(4, DRBD_GENLA_F_MANDATORY, verify_alg, SHARED_SECRET_MAX)
+ __str_field_def(5, DRBD_GENLA_F_MANDATORY, csums_alg, SHARED_SECRET_MAX)
+ __u32_field_def(6, DRBD_GENLA_F_MANDATORY, wire_protocol, DRBD_PROTOCOL_DEF)
+ __u32_field_def(7, DRBD_GENLA_F_MANDATORY, connect_int, DRBD_CONNECT_INT_DEF)
+ __u32_field_def(8, DRBD_GENLA_F_MANDATORY, timeout, DRBD_TIMEOUT_DEF)
+ __u32_field_def(9, DRBD_GENLA_F_MANDATORY, ping_int, DRBD_PING_INT_DEF)
+ __u32_field_def(10, DRBD_GENLA_F_MANDATORY, ping_timeo, DRBD_PING_TIMEO_DEF)
+ __u32_field_def(11, DRBD_GENLA_F_MANDATORY, sndbuf_size, DRBD_SNDBUF_SIZE_DEF)
+ __u32_field_def(12, DRBD_GENLA_F_MANDATORY, rcvbuf_size, DRBD_RCVBUF_SIZE_DEF)
+ __u32_field_def(13, DRBD_GENLA_F_MANDATORY, ko_count, DRBD_KO_COUNT_DEF)
+ __u32_field_def(14, DRBD_GENLA_F_MANDATORY, max_buffers, DRBD_MAX_BUFFERS_DEF)
+ __u32_field_def(15, DRBD_GENLA_F_MANDATORY, max_epoch_size, DRBD_MAX_EPOCH_SIZE_DEF)
+ __u32_field_def(16, DRBD_GENLA_F_MANDATORY, unplug_watermark, DRBD_UNPLUG_WATERMARK_DEF)
+ __u32_field_def(17, DRBD_GENLA_F_MANDATORY, after_sb_0p, DRBD_AFTER_SB_0P_DEF)
+ __u32_field_def(18, DRBD_GENLA_F_MANDATORY, after_sb_1p, DRBD_AFTER_SB_1P_DEF)
+ __u32_field_def(19, DRBD_GENLA_F_MANDATORY, after_sb_2p, DRBD_AFTER_SB_2P_DEF)
+ __u32_field_def(20, DRBD_GENLA_F_MANDATORY, rr_conflict, DRBD_RR_CONFLICT_DEF)
+ __u32_field_def(21, DRBD_GENLA_F_MANDATORY, on_congestion, DRBD_ON_CONGESTION_DEF)
+ __u32_field_def(22, DRBD_GENLA_F_MANDATORY, cong_fill, DRBD_CONG_FILL_DEF)
+ __u32_field_def(23, DRBD_GENLA_F_MANDATORY, cong_extents, DRBD_CONG_EXTENTS_DEF)
+ __flg_field_def(24, DRBD_GENLA_F_MANDATORY, two_primaries, DRBD_ALLOW_TWO_PRIMARIES_DEF)
+ __flg_field(25, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, discard_my_data)
+ __flg_field_def(26, DRBD_GENLA_F_MANDATORY, tcp_cork, DRBD_TCP_CORK_DEF)
+ __flg_field_def(27, DRBD_GENLA_F_MANDATORY, always_asbp, DRBD_ALWAYS_ASBP_DEF)
+ __flg_field(28, DRBD_GENLA_F_MANDATORY | DRBD_F_INVARIANT, tentative)
+ __flg_field_def(29, DRBD_GENLA_F_MANDATORY, use_rle, DRBD_USE_RLE_DEF)
+ /* 9: __u32_field_def(30, DRBD_GENLA_F_MANDATORY, fencing_policy, DRBD_FENCING_DEF) */
+)
+
+GENL_struct(DRBD_NLA_SET_ROLE_PARMS, 6, set_role_parms,
+ __flg_field(1, DRBD_GENLA_F_MANDATORY, assume_uptodate)
+)
+
+GENL_struct(DRBD_NLA_RESIZE_PARMS, 7, resize_parms,
+ __u64_field(1, DRBD_GENLA_F_MANDATORY, resize_size)
+ __flg_field(2, DRBD_GENLA_F_MANDATORY, resize_force)
+ __flg_field(3, DRBD_GENLA_F_MANDATORY, no_resync)
+)
+
+GENL_struct(DRBD_NLA_STATE_INFO, 8, state_info,
+ /* the reason of the broadcast,
+ * if this is an event triggered broadcast. */
+ __u32_field(1, DRBD_GENLA_F_MANDATORY, sib_reason)
+ __u32_field(2, DRBD_F_REQUIRED, current_state)
+ __u64_field(3, DRBD_GENLA_F_MANDATORY, capacity)
+ __u64_field(4, DRBD_GENLA_F_MANDATORY, ed_uuid)
+
+ /* These are for broadcast from after state change work.
+ * prev_state and new_state are from the moment the state change took
+ * place, new_state is not neccessarily the same as current_state,
+ * there may have been more state changes since. Which will be
+ * broadcasted soon, in their respective after state change work. */
+ __u32_field(5, DRBD_GENLA_F_MANDATORY, prev_state)
+ __u32_field(6, DRBD_GENLA_F_MANDATORY, new_state)
+
+ /* if we have a local disk: */
+ __bin_field(7, DRBD_GENLA_F_MANDATORY, uuids, (UI_SIZE*sizeof(__u64)))
+ __u32_field(8, DRBD_GENLA_F_MANDATORY, disk_flags)
+ __u64_field(9, DRBD_GENLA_F_MANDATORY, bits_total)
+ __u64_field(10, DRBD_GENLA_F_MANDATORY, bits_oos)
+ /* and in case resync or online verify is active */
+ __u64_field(11, DRBD_GENLA_F_MANDATORY, bits_rs_total)
+ __u64_field(12, DRBD_GENLA_F_MANDATORY, bits_rs_failed)
+
+ /* for pre and post notifications of helper execution */
+ __str_field(13, DRBD_GENLA_F_MANDATORY, helper, 32)
+ __u32_field(14, DRBD_GENLA_F_MANDATORY, helper_exit_code)
+
+ __u64_field(15, 0, send_cnt)
+ __u64_field(16, 0, recv_cnt)
+ __u64_field(17, 0, read_cnt)
+ __u64_field(18, 0, writ_cnt)
+ __u64_field(19, 0, al_writ_cnt)
+ __u64_field(20, 0, bm_writ_cnt)
+ __u32_field(21, 0, ap_bio_cnt)
+ __u32_field(22, 0, ap_pending_cnt)
+ __u32_field(23, 0, rs_pending_cnt)
+)
+
+GENL_struct(DRBD_NLA_START_OV_PARMS, 9, start_ov_parms,
+ __u64_field(1, DRBD_GENLA_F_MANDATORY, ov_start_sector)
+ __u64_field(2, DRBD_GENLA_F_MANDATORY, ov_stop_sector)
+)
+
+GENL_struct(DRBD_NLA_NEW_C_UUID_PARMS, 10, new_c_uuid_parms,
+ __flg_field(1, DRBD_GENLA_F_MANDATORY, clear_bm)
+)
+
+GENL_struct(DRBD_NLA_TIMEOUT_PARMS, 11, timeout_parms,
+ __u32_field(1, DRBD_F_REQUIRED, timeout_type)
+)
+
+GENL_struct(DRBD_NLA_DISCONNECT_PARMS, 12, disconnect_parms,
+ __flg_field(1, DRBD_GENLA_F_MANDATORY, force_disconnect)
+)
+
+GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms,
+ __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach)
+)
+
+/*
+ * Notifications and commands (genlmsghdr->cmd)
+ */
+GENL_mc_group(events)
+
+ /* kernel -> userspace announcement of changes */
+GENL_notification(
+ DRBD_EVENT, 1, events,
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_STATE_INFO, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_GENLA_F_MANDATORY)
+ GENL_tla_expected(DRBD_NLA_SYNCER_CONF, DRBD_GENLA_F_MANDATORY)
+)
+
+ /* query kernel for specific or all info */
+GENL_op(
+ DRBD_ADM_GET_STATUS, 2,
+ GENL_op_init(
+ .doit = drbd_adm_get_status,
+ .dumpit = drbd_adm_get_status_all,
+ /* anyone may ask for the status,
+ * it is broadcasted anyways */
+ ),
+ /* To select the object .doit.
+ * Or a subset of objects in .dumpit. */
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)
+)
+
+ /* add DRBD minor devices as volumes to resources */
+GENL_op(DRBD_ADM_NEW_MINOR, 5, GENL_doit(drbd_adm_add_minor),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_DEL_MINOR, 6, GENL_doit(drbd_adm_delete_minor),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+
+ /* add or delete resources */
+GENL_op(DRBD_ADM_NEW_RESOURCE, 7, GENL_doit(drbd_adm_new_resource),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_DEL_RESOURCE, 8, GENL_doit(drbd_adm_del_resource),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_RESOURCE_OPTS, 9,
+ GENL_doit(drbd_adm_resource_opts),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_RESOURCE_OPTS, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(
+ DRBD_ADM_CONNECT, 10,
+ GENL_doit(drbd_adm_connect),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
+)
+
+GENL_op(
+ DRBD_ADM_CHG_NET_OPTS, 29,
+ GENL_doit(drbd_adm_net_opts),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_NET_CONF, DRBD_F_REQUIRED)
+)
+
+GENL_op(DRBD_ADM_DISCONNECT, 11, GENL_doit(drbd_adm_disconnect),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+
+GENL_op(DRBD_ADM_ATTACH, 12,
+ GENL_doit(drbd_adm_attach),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DISK_CONF, DRBD_F_REQUIRED)
+)
+
+GENL_op(DRBD_ADM_CHG_DISK_OPTS, 28,
+ GENL_doit(drbd_adm_disk_opts),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DISK_OPTS, DRBD_F_REQUIRED)
+)
+
+GENL_op(
+ DRBD_ADM_RESIZE, 13,
+ GENL_doit(drbd_adm_resize),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_RESIZE_PARMS, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(
+ DRBD_ADM_PRIMARY, 14,
+ GENL_doit(drbd_adm_set_role),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
+)
+
+GENL_op(
+ DRBD_ADM_SECONDARY, 15,
+ GENL_doit(drbd_adm_set_role),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_SET_ROLE_PARMS, DRBD_F_REQUIRED)
+)
+
+GENL_op(
+ DRBD_ADM_NEW_C_UUID, 16,
+ GENL_doit(drbd_adm_new_c_uuid),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_NEW_C_UUID_PARMS, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(
+ DRBD_ADM_START_OV, 17,
+ GENL_doit(drbd_adm_start_ov),
+ GENL_tla_expected(DRBD_NLA_START_OV_PARMS, DRBD_GENLA_F_MANDATORY)
+)
+
+GENL_op(DRBD_ADM_DETACH, 18, GENL_doit(drbd_adm_detach),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)
+ GENL_tla_expected(DRBD_NLA_DETACH_PARMS, DRBD_GENLA_F_MANDATORY))
+
+GENL_op(DRBD_ADM_INVALIDATE, 19, GENL_doit(drbd_adm_invalidate),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_INVAL_PEER, 20, GENL_doit(drbd_adm_invalidate_peer),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_PAUSE_SYNC, 21, GENL_doit(drbd_adm_pause_sync),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_RESUME_SYNC, 22, GENL_doit(drbd_adm_resume_sync),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_SUSPEND_IO, 23, GENL_doit(drbd_adm_suspend_io),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_RESUME_IO, 24, GENL_doit(drbd_adm_resume_io),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_OUTDATE, 25, GENL_doit(drbd_adm_outdate),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
+GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down),
+ GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED))
diff --git a/include/linux/drbd_genl_api.h b/include/linux/drbd_genl_api.h
new file mode 100644
index 0000000..9ef50d5
--- /dev/null
+++ b/include/linux/drbd_genl_api.h
@@ -0,0 +1,55 @@
+#ifndef DRBD_GENL_STRUCT_H
+#define DRBD_GENL_STRUCT_H
+
+/**
+ * struct drbd_genlmsghdr - DRBD specific header used in NETLINK_GENERIC requests
+ * @minor:
+ * For admin requests (user -> kernel): which minor device to operate on.
+ * For (unicast) replies or informational (broadcast) messages
+ * (kernel -> user): which minor device the information is about.
+ * If we do not operate on minors, but on connections or resources,
+ * the minor value shall be (~0), and the attribute DRBD_NLA_CFG_CONTEXT
+ * is used instead.
+ * @flags: possible operation modifiers (relevant only for user->kernel):
+ * DRBD_GENL_F_SET_DEFAULTS
+ * @volume:
+ * When creating a new minor (adding it to a resource), the resource needs
+ * to know which volume number within the resource this is supposed to be.
+ * The volume number corresponds to the same volume number on the remote side,
+ * whereas the minor number on the remote side may be different
+ * (union with flags).
+ * @ret_code: kernel->userland unicast cfg reply return code (union with flags);
+ */
+struct drbd_genlmsghdr {
+ __u32 minor;
+ union {
+ __u32 flags;
+ __s32 ret_code;
+ };
+};
+
+/* To be used in drbd_genlmsghdr.flags */
+enum {
+ DRBD_GENL_F_SET_DEFAULTS = 1,
+};
+
+enum drbd_state_info_bcast_reason {
+ SIB_GET_STATUS_REPLY = 1,
+ SIB_STATE_CHANGE = 2,
+ SIB_HELPER_PRE = 3,
+ SIB_HELPER_POST = 4,
+ SIB_SYNC_PROGRESS = 5,
+};
+
+/* hack around predefined gcc/cpp "linux=1",
+ * we cannot possibly include <1/drbd_genl.h> */
+#undef linux
+
+#include <linux/drbd.h>
+#define GENL_MAGIC_VERSION API_VERSION
+#define GENL_MAGIC_FAMILY drbd
+#define GENL_MAGIC_FAMILY_HDRSZ sizeof(struct drbd_genlmsghdr)
+#define GENL_MAGIC_INCLUDE_FILE <linux/drbd_genl.h>
+#include <linux/genl_magic_struct.h>
+
+#endif
diff --git a/include/linux/drbd_limits.h b/include/linux/drbd_limits.h
index fb670bf..1fa19c5 100644
--- a/include/linux/drbd_limits.h
+++ b/include/linux/drbd_limits.h
@@ -16,29 +16,37 @@
#define DEBUG_RANGE_CHECK 0
#define DRBD_MINOR_COUNT_MIN 1
-#define DRBD_MINOR_COUNT_MAX 256
+#define DRBD_MINOR_COUNT_MAX 255
#define DRBD_MINOR_COUNT_DEF 32
+#define DRBD_MINOR_COUNT_SCALE '1'
+
+#define DRBD_VOLUME_MAX 65535
#define DRBD_DIALOG_REFRESH_MIN 0
#define DRBD_DIALOG_REFRESH_MAX 600
+#define DRBD_DIALOG_REFRESH_SCALE '1'
/* valid port number */
#define DRBD_PORT_MIN 1
#define DRBD_PORT_MAX 0xffff
+#define DRBD_PORT_SCALE '1'
/* startup { */
/* if you want more than 3.4 days, disable */
#define DRBD_WFC_TIMEOUT_MIN 0
#define DRBD_WFC_TIMEOUT_MAX 300000
#define DRBD_WFC_TIMEOUT_DEF 0
+#define DRBD_WFC_TIMEOUT_SCALE '1'
#define DRBD_DEGR_WFC_TIMEOUT_MIN 0
#define DRBD_DEGR_WFC_TIMEOUT_MAX 300000
#define DRBD_DEGR_WFC_TIMEOUT_DEF 0
+#define DRBD_DEGR_WFC_TIMEOUT_SCALE '1'
#define DRBD_OUTDATED_WFC_TIMEOUT_MIN 0
#define DRBD_OUTDATED_WFC_TIMEOUT_MAX 300000
#define DRBD_OUTDATED_WFC_TIMEOUT_DEF 0
+#define DRBD_OUTDATED_WFC_TIMEOUT_SCALE '1'
/* }*/
/* net { */
@@ -47,75 +55,91 @@
#define DRBD_TIMEOUT_MIN 1
#define DRBD_TIMEOUT_MAX 600
#define DRBD_TIMEOUT_DEF 60 /* 6 seconds */
+#define DRBD_TIMEOUT_SCALE '1'
/* If backing disk takes longer than disk_timeout, mark the disk as failed */
#define DRBD_DISK_TIMEOUT_MIN 0 /* 0 = disabled */
#define DRBD_DISK_TIMEOUT_MAX 6000 /* 10 Minutes */
#define DRBD_DISK_TIMEOUT_DEF 0 /* disabled */
+#define DRBD_DISK_TIMEOUT_SCALE '1'
/* active connection retries when C_WF_CONNECTION */
#define DRBD_CONNECT_INT_MIN 1
#define DRBD_CONNECT_INT_MAX 120
#define DRBD_CONNECT_INT_DEF 10 /* seconds */
+#define DRBD_CONNECT_INT_SCALE '1'
/* keep-alive probes when idle */
#define DRBD_PING_INT_MIN 1
#define DRBD_PING_INT_MAX 120
#define DRBD_PING_INT_DEF 10
+#define DRBD_PING_INT_SCALE '1'
/* timeout for the ping packets.*/
#define DRBD_PING_TIMEO_MIN 1
#define DRBD_PING_TIMEO_MAX 300
#define DRBD_PING_TIMEO_DEF 5
+#define DRBD_PING_TIMEO_SCALE '1'
/* max number of write requests between write barriers */
#define DRBD_MAX_EPOCH_SIZE_MIN 1
#define DRBD_MAX_EPOCH_SIZE_MAX 20000
#define DRBD_MAX_EPOCH_SIZE_DEF 2048
+#define DRBD_MAX_EPOCH_SIZE_SCALE '1'
/* I don't think that a tcp send buffer of more than 10M is useful */
#define DRBD_SNDBUF_SIZE_MIN 0
#define DRBD_SNDBUF_SIZE_MAX (10<<20)
#define DRBD_SNDBUF_SIZE_DEF 0
+#define DRBD_SNDBUF_SIZE_SCALE '1'
#define DRBD_RCVBUF_SIZE_MIN 0
#define DRBD_RCVBUF_SIZE_MAX (10<<20)
#define DRBD_RCVBUF_SIZE_DEF 0
+#define DRBD_RCVBUF_SIZE_SCALE '1'
/* @4k PageSize -> 128kB - 512MB */
#define DRBD_MAX_BUFFERS_MIN 32
#define DRBD_MAX_BUFFERS_MAX 131072
#define DRBD_MAX_BUFFERS_DEF 2048
+#define DRBD_MAX_BUFFERS_SCALE '1'
/* @4k PageSize -> 4kB - 512MB */
#define DRBD_UNPLUG_WATERMARK_MIN 1
#define DRBD_UNPLUG_WATERMARK_MAX 131072
#define DRBD_UNPLUG_WATERMARK_DEF (DRBD_MAX_BUFFERS_DEF/16)
+#define DRBD_UNPLUG_WATERMARK_SCALE '1'
/* 0 is disabled.
* 200 should be more than enough even for very short timeouts */
#define DRBD_KO_COUNT_MIN 0
#define DRBD_KO_COUNT_MAX 200
-#define DRBD_KO_COUNT_DEF 0
+#define DRBD_KO_COUNT_DEF 7
+#define DRBD_KO_COUNT_SCALE '1'
/* } */
/* syncer { */
/* FIXME allow rate to be zero? */
-#define DRBD_RATE_MIN 1
+#define DRBD_RESYNC_RATE_MIN 1
/* channel bonding 10 GbE, or other hardware */
-#define DRBD_RATE_MAX (4 << 20)
-#define DRBD_RATE_DEF 250 /* kb/second */
+#define DRBD_RESYNC_RATE_MAX (4 << 20)
+#define DRBD_RESYNC_RATE_DEF 250
+#define DRBD_RESYNC_RATE_SCALE 'k' /* kilobytes */
/* less than 7 would hit performance unnecessarily.
- * 3833 is the largest prime that still does fit
- * into 64 sectors of activity log */
+ * 919 slots context information per transaction,
+ * 32k activity log, 4k transaction size,
+ * one transaction in flight:
+ * 919 * 7 = 6433 */
#define DRBD_AL_EXTENTS_MIN 7
-#define DRBD_AL_EXTENTS_MAX 3833
-#define DRBD_AL_EXTENTS_DEF 127
+#define DRBD_AL_EXTENTS_MAX 6433
+#define DRBD_AL_EXTENTS_DEF 1237
+#define DRBD_AL_EXTENTS_SCALE '1'
-#define DRBD_AFTER_MIN -1
-#define DRBD_AFTER_MAX 255
-#define DRBD_AFTER_DEF -1
+#define DRBD_MINOR_NUMBER_MIN -1
+#define DRBD_MINOR_NUMBER_MAX ((1 << 20) - 1)
+#define DRBD_MINOR_NUMBER_DEF -1
+#define DRBD_MINOR_NUMBER_SCALE '1'
/* } */
@@ -124,11 +148,12 @@
* the upper limit with 64bit kernel, enough ram and flexible meta data
* is 1 PiB, currently. */
/* DRBD_MAX_SECTORS */
-#define DRBD_DISK_SIZE_SECT_MIN 0
-#define DRBD_DISK_SIZE_SECT_MAX (1 * (2LLU << 40))
-#define DRBD_DISK_SIZE_SECT_DEF 0 /* = disabled = no user size... */
+#define DRBD_DISK_SIZE_MIN 0
+#define DRBD_DISK_SIZE_MAX (1 * (2LLU << 40))
+#define DRBD_DISK_SIZE_DEF 0 /* = disabled = no user size... */
+#define DRBD_DISK_SIZE_SCALE 's' /* sectors */
-#define DRBD_ON_IO_ERROR_DEF EP_PASS_ON
+#define DRBD_ON_IO_ERROR_DEF EP_DETACH
#define DRBD_FENCING_DEF FP_DONT_CARE
#define DRBD_AFTER_SB_0P_DEF ASB_DISCONNECT
#define DRBD_AFTER_SB_1P_DEF ASB_DISCONNECT
@@ -136,38 +161,59 @@
#define DRBD_RR_CONFLICT_DEF ASB_DISCONNECT
#define DRBD_ON_NO_DATA_DEF OND_IO_ERROR
#define DRBD_ON_CONGESTION_DEF OC_BLOCK
+#define DRBD_READ_BALANCING_DEF RB_PREFER_LOCAL
#define DRBD_MAX_BIO_BVECS_MIN 0
#define DRBD_MAX_BIO_BVECS_MAX 128
#define DRBD_MAX_BIO_BVECS_DEF 0
+#define DRBD_MAX_BIO_BVECS_SCALE '1'
#define DRBD_C_PLAN_AHEAD_MIN 0
#define DRBD_C_PLAN_AHEAD_MAX 300
-#define DRBD_C_PLAN_AHEAD_DEF 0 /* RS rate controller disabled by default */
+#define DRBD_C_PLAN_AHEAD_DEF 20
+#define DRBD_C_PLAN_AHEAD_SCALE '1'
#define DRBD_C_DELAY_TARGET_MIN 1
#define DRBD_C_DELAY_TARGET_MAX 100
#define DRBD_C_DELAY_TARGET_DEF 10
+#define DRBD_C_DELAY_TARGET_SCALE '1'
#define DRBD_C_FILL_TARGET_MIN 0
#define DRBD_C_FILL_TARGET_MAX (1<<20) /* 500MByte in sec */
-#define DRBD_C_FILL_TARGET_DEF 0 /* By default disabled -> controlled by delay_target */
+#define DRBD_C_FILL_TARGET_DEF 100 /* Try to place 50KiB in socket send buffer during resync */
+#define DRBD_C_FILL_TARGET_SCALE 's' /* sectors */
-#define DRBD_C_MAX_RATE_MIN 250 /* kByte/sec */
+#define DRBD_C_MAX_RATE_MIN 250
#define DRBD_C_MAX_RATE_MAX (4 << 20)
#define DRBD_C_MAX_RATE_DEF 102400
+#define DRBD_C_MAX_RATE_SCALE 'k' /* kilobytes */
-#define DRBD_C_MIN_RATE_MIN 0 /* kByte/sec */
+#define DRBD_C_MIN_RATE_MIN 0
#define DRBD_C_MIN_RATE_MAX (4 << 20)
-#define DRBD_C_MIN_RATE_DEF 4096
+#define DRBD_C_MIN_RATE_DEF 250
+#define DRBD_C_MIN_RATE_SCALE 'k' /* kilobytes */
#define DRBD_CONG_FILL_MIN 0
#define DRBD_CONG_FILL_MAX (10<<21) /* 10GByte in sectors */
#define DRBD_CONG_FILL_DEF 0
+#define DRBD_CONG_FILL_SCALE 's' /* sectors */
#define DRBD_CONG_EXTENTS_MIN DRBD_AL_EXTENTS_MIN
#define DRBD_CONG_EXTENTS_MAX DRBD_AL_EXTENTS_MAX
#define DRBD_CONG_EXTENTS_DEF DRBD_AL_EXTENTS_DEF
+#define DRBD_CONG_EXTENTS_SCALE DRBD_AL_EXTENTS_SCALE
-#undef RANGE
+#define DRBD_PROTOCOL_DEF DRBD_PROT_C
+
+#define DRBD_DISK_BARRIER_DEF 0
+#define DRBD_DISK_FLUSHES_DEF 1
+#define DRBD_DISK_DRAIN_DEF 1
+#define DRBD_MD_FLUSHES_DEF 1
+#define DRBD_TCP_CORK_DEF 1
+#define DRBD_AL_UPDATES_DEF 1
+
+#define DRBD_ALLOW_TWO_PRIMARIES_DEF 0
+#define DRBD_ALWAYS_ASBP_DEF 0
+#define DRBD_USE_RLE_DEF 1
+
#endif
diff --git a/include/linux/drbd_nl.h b/include/linux/drbd_nl.h
deleted file mode 100644
index f6a576d..0000000
--- a/include/linux/drbd_nl.h
+++ /dev/null
@@ -1,164 +0,0 @@
-/*
- PAKET( name,
- TYPE ( pn, pr, member )
- ...
- )
-
- You may never reissue one of the pn arguments
-*/
-
-#if !defined(NL_PACKET) || !defined(NL_STRING) || !defined(NL_INTEGER) || !defined(NL_BIT) || !defined(NL_INT64)
-#error "The macros NL_PACKET, NL_STRING, NL_INTEGER, NL_INT64 and NL_BIT needs to be defined"
-#endif
-
-NL_PACKET(primary, 1,
- NL_BIT( 1, T_MAY_IGNORE, primary_force)
-)
-
-NL_PACKET(secondary, 2, )
-
-NL_PACKET(disk_conf, 3,
- NL_INT64( 2, T_MAY_IGNORE, disk_size)
- NL_STRING( 3, T_MANDATORY, backing_dev, 128)
- NL_STRING( 4, T_MANDATORY, meta_dev, 128)
- NL_INTEGER( 5, T_MANDATORY, meta_dev_idx)
- NL_INTEGER( 6, T_MAY_IGNORE, on_io_error)
- NL_INTEGER( 7, T_MAY_IGNORE, fencing)
- NL_BIT( 37, T_MAY_IGNORE, use_bmbv)
- NL_BIT( 53, T_MAY_IGNORE, no_disk_flush)
- NL_BIT( 54, T_MAY_IGNORE, no_md_flush)
- /* 55 max_bio_size was available in 8.2.6rc2 */
- NL_INTEGER( 56, T_MAY_IGNORE, max_bio_bvecs)
- NL_BIT( 57, T_MAY_IGNORE, no_disk_barrier)
- NL_BIT( 58, T_MAY_IGNORE, no_disk_drain)
- NL_INTEGER( 89, T_MAY_IGNORE, disk_timeout)
-)
-
-NL_PACKET(detach, 4,
- NL_BIT( 88, T_MANDATORY, detach_force)
-)
-
-NL_PACKET(net_conf, 5,
- NL_STRING( 8, T_MANDATORY, my_addr, 128)
- NL_STRING( 9, T_MANDATORY, peer_addr, 128)
- NL_STRING( 10, T_MAY_IGNORE, shared_secret, SHARED_SECRET_MAX)
- NL_STRING( 11, T_MAY_IGNORE, cram_hmac_alg, SHARED_SECRET_MAX)
- NL_STRING( 44, T_MAY_IGNORE, integrity_alg, SHARED_SECRET_MAX)
- NL_INTEGER( 14, T_MAY_IGNORE, timeout)
- NL_INTEGER( 15, T_MANDATORY, wire_protocol)
- NL_INTEGER( 16, T_MAY_IGNORE, try_connect_int)
- NL_INTEGER( 17, T_MAY_IGNORE, ping_int)
- NL_INTEGER( 18, T_MAY_IGNORE, max_epoch_size)
- NL_INTEGER( 19, T_MAY_IGNORE, max_buffers)
- NL_INTEGER( 20, T_MAY_IGNORE, unplug_watermark)
- NL_INTEGER( 21, T_MAY_IGNORE, sndbuf_size)
- NL_INTEGER( 22, T_MAY_IGNORE, ko_count)
- NL_INTEGER( 24, T_MAY_IGNORE, after_sb_0p)
- NL_INTEGER( 25, T_MAY_IGNORE, after_sb_1p)
- NL_INTEGER( 26, T_MAY_IGNORE, after_sb_2p)
- NL_INTEGER( 39, T_MAY_IGNORE, rr_conflict)
- NL_INTEGER( 40, T_MAY_IGNORE, ping_timeo)
- NL_INTEGER( 67, T_MAY_IGNORE, rcvbuf_size)
- NL_INTEGER( 81, T_MAY_IGNORE, on_congestion)
- NL_INTEGER( 82, T_MAY_IGNORE, cong_fill)
- NL_INTEGER( 83, T_MAY_IGNORE, cong_extents)
- /* 59 addr_family was available in GIT, never released */
- NL_BIT( 60, T_MANDATORY, mind_af)
- NL_BIT( 27, T_MAY_IGNORE, want_lose)
- NL_BIT( 28, T_MAY_IGNORE, two_primaries)
- NL_BIT( 41, T_MAY_IGNORE, always_asbp)
- NL_BIT( 61, T_MAY_IGNORE, no_cork)
- NL_BIT( 62, T_MANDATORY, auto_sndbuf_size)
- NL_BIT( 70, T_MANDATORY, dry_run)
-)
-
-NL_PACKET(disconnect, 6,
- NL_BIT( 84, T_MAY_IGNORE, force)
-)
-
-NL_PACKET(resize, 7,
- NL_INT64( 29, T_MAY_IGNORE, resize_size)
- NL_BIT( 68, T_MAY_IGNORE, resize_force)
- NL_BIT( 69, T_MANDATORY, no_resync)
-)
-
-NL_PACKET(syncer_conf, 8,
- NL_INTEGER( 30, T_MAY_IGNORE, rate)
- NL_INTEGER( 31, T_MAY_IGNORE, after)
- NL_INTEGER( 32, T_MAY_IGNORE, al_extents)
-/* NL_INTEGER( 71, T_MAY_IGNORE, dp_volume)
- * NL_INTEGER( 72, T_MAY_IGNORE, dp_interval)
- * NL_INTEGER( 73, T_MAY_IGNORE, throttle_th)
- * NL_INTEGER( 74, T_MAY_IGNORE, hold_off_th)
- * feature will be reimplemented differently with 8.3.9 */
- NL_STRING( 52, T_MAY_IGNORE, verify_alg, SHARED_SECRET_MAX)
- NL_STRING( 51, T_MAY_IGNORE, cpu_mask, 32)
- NL_STRING( 64, T_MAY_IGNORE, csums_alg, SHARED_SECRET_MAX)
- NL_BIT( 65, T_MAY_IGNORE, use_rle)
- NL_INTEGER( 75, T_MAY_IGNORE, on_no_data)
- NL_INTEGER( 76, T_MAY_IGNORE, c_plan_ahead)
- NL_INTEGER( 77, T_MAY_IGNORE, c_delay_target)
- NL_INTEGER( 78, T_MAY_IGNORE, c_fill_target)
- NL_INTEGER( 79, T_MAY_IGNORE, c_max_rate)
- NL_INTEGER( 80, T_MAY_IGNORE, c_min_rate)
-)
-
-NL_PACKET(invalidate, 9, )
-NL_PACKET(invalidate_peer, 10, )
-NL_PACKET(pause_sync, 11, )
-NL_PACKET(resume_sync, 12, )
-NL_PACKET(suspend_io, 13, )
-NL_PACKET(resume_io, 14, )
-NL_PACKET(outdate, 15, )
-NL_PACKET(get_config, 16, )
-NL_PACKET(get_state, 17,
- NL_INTEGER( 33, T_MAY_IGNORE, state_i)
-)
-
-NL_PACKET(get_uuids, 18,
- NL_STRING( 34, T_MAY_IGNORE, uuids, (UI_SIZE*sizeof(__u64)))
- NL_INTEGER( 35, T_MAY_IGNORE, uuids_flags)
-)
-
-NL_PACKET(get_timeout_flag, 19,
- NL_BIT( 36, T_MAY_IGNORE, use_degraded)
-)
-
-NL_PACKET(call_helper, 20,
- NL_STRING( 38, T_MAY_IGNORE, helper, 32)
-)
-
-/* Tag nr 42 already allocated in drbd-8.1 development. */
-
-NL_PACKET(sync_progress, 23,
- NL_INTEGER( 43, T_MAY_IGNORE, sync_progress)
-)
-
-NL_PACKET(dump_ee, 24,
- NL_STRING( 45, T_MAY_IGNORE, dump_ee_reason, 32)
- NL_STRING( 46, T_MAY_IGNORE, seen_digest, SHARED_SECRET_MAX)
- NL_STRING( 47, T_MAY_IGNORE, calc_digest, SHARED_SECRET_MAX)
- NL_INT64( 48, T_MAY_IGNORE, ee_sector)
- NL_INT64( 49, T_MAY_IGNORE, ee_block_id)
- NL_STRING( 50, T_MAY_IGNORE, ee_data, 32 << 10)
-)
-
-NL_PACKET(start_ov, 25,
- NL_INT64( 66, T_MAY_IGNORE, start_sector)
- NL_INT64( 90, T_MANDATORY, stop_sector)
-)
-
-NL_PACKET(new_c_uuid, 26,
- NL_BIT( 63, T_MANDATORY, clear_bm)
-)
-
-#ifdef NL_RESPONSE
-NL_RESPONSE(return_code_only, 27)
-#endif
-
-#undef NL_PACKET
-#undef NL_INTEGER
-#undef NL_INT64
-#undef NL_BIT
-#undef NL_STRING
-#undef NL_RESPONSE
diff --git a/include/linux/drbd_tag_magic.h b/include/linux/drbd_tag_magic.h
deleted file mode 100644
index 82de1f9..0000000
--- a/include/linux/drbd_tag_magic.h
+++ /dev/null
@@ -1,84 +0,0 @@
-#ifndef DRBD_TAG_MAGIC_H
-#define DRBD_TAG_MAGIC_H
-
-#define TT_END 0
-#define TT_REMOVED 0xE000
-
-/* declare packet_type enums */
-enum packet_types {
-#define NL_PACKET(name, number, fields) P_ ## name = number,
-#define NL_RESPONSE(name, number) P_ ## name = number,
-#define NL_INTEGER(pn, pr, member)
-#define NL_INT64(pn, pr, member)
-#define NL_BIT(pn, pr, member)
-#define NL_STRING(pn, pr, member, len)
-#include <linux/drbd_nl.h>
- P_nl_after_last_packet,
-};
-
-/* These struct are used to deduce the size of the tag lists: */
-#define NL_PACKET(name, number, fields) \
- struct name ## _tag_len_struct { fields };
-#define NL_INTEGER(pn, pr, member) \
- int member; int tag_and_len ## member;
-#define NL_INT64(pn, pr, member) \
- __u64 member; int tag_and_len ## member;
-#define NL_BIT(pn, pr, member) \
- unsigned char member:1; int tag_and_len ## member;
-#define NL_STRING(pn, pr, member, len) \
- unsigned char member[len]; int member ## _len; \
- int tag_and_len ## member;
-#include <linux/drbd_nl.h>
-
-/* declare tag-list-sizes */
-static const int tag_list_sizes[] = {
-#define NL_PACKET(name, number, fields) 2 fields ,
-#define NL_INTEGER(pn, pr, member) + 4 + 4
-#define NL_INT64(pn, pr, member) + 4 + 8
-#define NL_BIT(pn, pr, member) + 4 + 1
-#define NL_STRING(pn, pr, member, len) + 4 + (len)
-#include <linux/drbd_nl.h>
-};
-
-/* The two highest bits are used for the tag type */
-#define TT_MASK 0xC000
-#define TT_INTEGER 0x0000
-#define TT_INT64 0x4000
-#define TT_BIT 0x8000
-#define TT_STRING 0xC000
-/* The next bit indicates if processing of the tag is mandatory */
-#define T_MANDATORY 0x2000
-#define T_MAY_IGNORE 0x0000
-#define TN_MASK 0x1fff
-/* The remaining 13 bits are used to enumerate the tags */
-
-#define tag_type(T) ((T) & TT_MASK)
-#define tag_number(T) ((T) & TN_MASK)
-
-/* declare tag enums */
-#define NL_PACKET(name, number, fields) fields
-enum drbd_tags {
-#define NL_INTEGER(pn, pr, member) T_ ## member = pn | TT_INTEGER | pr ,
-#define NL_INT64(pn, pr, member) T_ ## member = pn | TT_INT64 | pr ,
-#define NL_BIT(pn, pr, member) T_ ## member = pn | TT_BIT | pr ,
-#define NL_STRING(pn, pr, member, len) T_ ## member = pn | TT_STRING | pr ,
-#include <linux/drbd_nl.h>
-};
-
-struct tag {
- const char *name;
- int type_n_flags;
- int max_len;
-};
-
-/* declare tag names */
-#define NL_PACKET(name, number, fields) fields
-static const struct tag tag_descriptions[] = {
-#define NL_INTEGER(pn, pr, member) [ pn ] = { #member, TT_INTEGER | pr, sizeof(int) },
-#define NL_INT64(pn, pr, member) [ pn ] = { #member, TT_INT64 | pr, sizeof(__u64) },
-#define NL_BIT(pn, pr, member) [ pn ] = { #member, TT_BIT | pr, sizeof(int) },
-#define NL_STRING(pn, pr, member, len) [ pn ] = { #member, TT_STRING | pr, (len) },
-#include <linux/drbd_nl.h>
-};
-
-#endif
diff --git a/include/linux/genl_magic_func.h b/include/linux/genl_magic_func.h
new file mode 100644
index 0000000..023bc34
--- /dev/null
+++ b/include/linux/genl_magic_func.h
@@ -0,0 +1,422 @@
+#ifndef GENL_MAGIC_FUNC_H
+#define GENL_MAGIC_FUNC_H
+
+#include <linux/genl_magic_struct.h>
+
+/*
+ * Magic: declare tla policy {{{1
+ * Magic: declare nested policies
+ * {{{2
+ */
+#undef GENL_mc_group
+#define GENL_mc_group(group)
+
+#undef GENL_notification
+#define GENL_notification(op_name, op_num, mcast_group, tla_list)
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list)
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+ [tag_name] = { .type = NLA_NESTED },
+
+static struct nla_policy CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy)[] = {
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+static struct nla_policy s_name ## _nl_policy[] __read_mostly = \
+{ s_fields };
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, _type, __get, \
+ __put, __is_signed) \
+ [attr_nr] = { .type = nla_type },
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, _type, maxlen, \
+ __get, __put, __is_signed) \
+ [attr_nr] = { .type = nla_type, \
+ .len = maxlen - (nla_type == NLA_NUL_STRING) },
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+#ifndef __KERNEL__
+#ifndef pr_info
+#define pr_info(args...) fprintf(stderr, args);
+#endif
+#endif
+
+#ifdef GENL_MAGIC_DEBUG
+static void dprint_field(const char *dir, int nla_type,
+ const char *name, void *valp)
+{
+ __u64 val = valp ? *(__u32 *)valp : 1;
+ switch (nla_type) {
+ case NLA_U8: val = (__u8)val;
+ case NLA_U16: val = (__u16)val;
+ case NLA_U32: val = (__u32)val;
+ pr_info("%s attr %s: %d 0x%08x\n", dir,
+ name, (int)val, (unsigned)val);
+ break;
+ case NLA_U64:
+ val = *(__u64*)valp;
+ pr_info("%s attr %s: %lld 0x%08llx\n", dir,
+ name, (long long)val, (unsigned long long)val);
+ break;
+ case NLA_FLAG:
+ if (val)
+ pr_info("%s attr %s: set\n", dir, name);
+ break;
+ }
+}
+
+static void dprint_array(const char *dir, int nla_type,
+ const char *name, const char *val, unsigned len)
+{
+ switch (nla_type) {
+ case NLA_NUL_STRING:
+ if (len && val[len-1] == '\0')
+ len--;
+ pr_info("%s attr %s: [len:%u] '%s'\n", dir, name, len, val);
+ break;
+ default:
+ /* we can always show 4 byte,
+ * thats what nlattr are aligned to. */
+ pr_info("%s attr %s: [len:%u] %02x%02x%02x%02x ...\n",
+ dir, name, len, val[0], val[1], val[2], val[3]);
+ }
+}
+
+#define DPRINT_TLA(a, op, b) pr_info("%s %s %s\n", a, op, b);
+
+/* Name is a member field name of the struct s.
+ * If s is NULL (only parsing, no copy requested in *_from_attrs()),
+ * nla is supposed to point to the attribute containing the information
+ * corresponding to that struct member. */
+#define DPRINT_FIELD(dir, nla_type, name, s, nla) \
+ do { \
+ if (s) \
+ dprint_field(dir, nla_type, #name, &s->name); \
+ else if (nla) \
+ dprint_field(dir, nla_type, #name, \
+ (nla_type == NLA_FLAG) ? NULL \
+ : nla_data(nla)); \
+ } while (0)
+
+#define DPRINT_ARRAY(dir, nla_type, name, s, nla) \
+ do { \
+ if (s) \
+ dprint_array(dir, nla_type, #name, \
+ s->name, s->name ## _len); \
+ else if (nla) \
+ dprint_array(dir, nla_type, #name, \
+ nla_data(nla), nla_len(nla)); \
+ } while (0)
+#else
+#define DPRINT_TLA(a, op, b) do {} while (0)
+#define DPRINT_FIELD(dir, nla_type, name, s, nla) do {} while (0)
+#define DPRINT_ARRAY(dir, nla_type, name, s, nla) do {} while (0)
+#endif
+
+/*
+ * Magic: provide conversion functions {{{1
+ * populate struct from attribute table:
+ * {{{2
+ */
+
+/* processing of generic netlink messages is serialized.
+ * use one static buffer for parsing of nested attributes */
+static struct nlattr *nested_attr_tb[128];
+
+#ifndef BUILD_BUG_ON
+/* Force a compilation error if condition is true */
+#define BUILD_BUG_ON(condition) ((void)BUILD_BUG_ON_ZERO(condition))
+/* Force a compilation error if condition is true, but also produce a
+ result (of value 0 and type size_t), so the expression can be used
+ e.g. in a structure initializer (or where-ever else comma expressions
+ aren't permitted). */
+#define BUILD_BUG_ON_ZERO(e) (sizeof(struct { int:-!!(e); }))
+#define BUILD_BUG_ON_NULL(e) ((void *)sizeof(struct { int:-!!(e); }))
+#endif
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+/* *_from_attrs functions are static, but potentially unused */ \
+static int __ ## s_name ## _from_attrs(struct s_name *s, \
+ struct genl_info *info, bool exclude_invariants) \
+{ \
+ const int maxtype = ARRAY_SIZE(s_name ## _nl_policy)-1; \
+ struct nlattr *tla = info->attrs[tag_number]; \
+ struct nlattr **ntb = nested_attr_tb; \
+ struct nlattr *nla; \
+ int err; \
+ BUILD_BUG_ON(ARRAY_SIZE(s_name ## _nl_policy) > ARRAY_SIZE(nested_attr_tb)); \
+ if (!tla) \
+ return -ENOMSG; \
+ DPRINT_TLA(#s_name, "<=-", #tag_name); \
+ err = drbd_nla_parse_nested(ntb, maxtype, tla, s_name ## _nl_policy); \
+ if (err) \
+ return err; \
+ \
+ s_fields \
+ return 0; \
+} __attribute__((unused)) \
+static int s_name ## _from_attrs(struct s_name *s, \
+ struct genl_info *info) \
+{ \
+ return __ ## s_name ## _from_attrs(s, info, false); \
+} __attribute__((unused)) \
+static int s_name ## _from_attrs_for_change(struct s_name *s, \
+ struct genl_info *info) \
+{ \
+ return __ ## s_name ## _from_attrs(s, info, true); \
+} __attribute__((unused)) \
+
+#define __assign(attr_nr, attr_flag, name, nla_type, type, assignment...) \
+ nla = ntb[attr_nr]; \
+ if (nla) { \
+ if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
+ pr_info("<< must not change invariant attr: %s\n", #name); \
+ return -EEXIST; \
+ } \
+ assignment; \
+ } else if (exclude_invariants && ((attr_flag) & DRBD_F_INVARIANT)) { \
+ /* attribute missing from payload, */ \
+ /* which was expected */ \
+ } else if ((attr_flag) & DRBD_F_REQUIRED) { \
+ pr_info("<< missing attr: %s\n", #name); \
+ return -ENOMSG; \
+ }
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ __is_signed) \
+ __assign(attr_nr, attr_flag, name, nla_type, type, \
+ if (s) \
+ s->name = __get(nla); \
+ DPRINT_FIELD("<<", nla_type, name, s, nla))
+
+/* validate_nla() already checked nla_len <= maxlen appropriately. */
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, __is_signed) \
+ __assign(attr_nr, attr_flag, name, nla_type, type, \
+ if (s) \
+ s->name ## _len = \
+ __get(s->name, nla, maxlen); \
+ DPRINT_ARRAY("<<", nla_type, name, s, nla))
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields)
+
+/*
+ * Magic: define op number to op name mapping {{{1
+ * {{{2
+ */
+const char *CONCAT_(GENL_MAGIC_FAMILY, _genl_cmd_to_str)(__u8 cmd)
+{
+ switch (cmd) {
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list) \
+ case op_num: return #op_name;
+#include GENL_MAGIC_INCLUDE_FILE
+ default:
+ return "unknown";
+ }
+}
+
+#ifdef __KERNEL__
+#include <linux/stringify.h>
+/*
+ * Magic: define genl_ops {{{1
+ * {{{2
+ */
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list) \
+{ \
+ handler \
+ .cmd = op_name, \
+ .policy = CONCAT_(GENL_MAGIC_FAMILY, _tla_nl_policy), \
+},
+
+#define ZZZ_genl_ops CONCAT_(GENL_MAGIC_FAMILY, _genl_ops)
+static struct genl_ops ZZZ_genl_ops[] __read_mostly = {
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list)
+
+/*
+ * Define the genl_family, multicast groups, {{{1
+ * and provide register/unregister functions.
+ * {{{2
+ */
+#define ZZZ_genl_family CONCAT_(GENL_MAGIC_FAMILY, _genl_family)
+static struct genl_family ZZZ_genl_family __read_mostly = {
+ .id = GENL_ID_GENERATE,
+ .name = __stringify(GENL_MAGIC_FAMILY),
+ .version = GENL_MAGIC_VERSION,
+#ifdef GENL_MAGIC_FAMILY_HDRSZ
+ .hdrsize = NLA_ALIGN(GENL_MAGIC_FAMILY_HDRSZ),
+#endif
+ .maxattr = ARRAY_SIZE(drbd_tla_nl_policy)-1,
+};
+
+/*
+ * Magic: define multicast groups
+ * Magic: define multicast group registration helper
+ */
+#undef GENL_mc_group
+#define GENL_mc_group(group) \
+static struct genl_multicast_group \
+CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group) __read_mostly = { \
+ .name = #group, \
+}; \
+static int CONCAT_(GENL_MAGIC_FAMILY, _genl_multicast_ ## group)( \
+ struct sk_buff *skb, gfp_t flags) \
+{ \
+ unsigned int group_id = \
+ CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id; \
+ if (!group_id) \
+ return -EINVAL; \
+ return genlmsg_multicast(skb, 0, group_id, flags); \
+}
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void)
+{
+ int err = genl_register_family_with_ops(&ZZZ_genl_family,
+ ZZZ_genl_ops, ARRAY_SIZE(ZZZ_genl_ops));
+ if (err)
+ return err;
+#undef GENL_mc_group
+#define GENL_mc_group(group) \
+ err = genl_register_mc_group(&ZZZ_genl_family, \
+ &CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group)); \
+ if (err) \
+ goto fail; \
+ else \
+ pr_info("%s: mcg %s: %u\n", #group, \
+ __stringify(GENL_MAGIC_FAMILY), \
+ CONCAT_(GENL_MAGIC_FAMILY, _mcg_ ## group).id);
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+#undef GENL_mc_group
+#define GENL_mc_group(group)
+ return 0;
+fail:
+ genl_unregister_family(&ZZZ_genl_family);
+ return err;
+}
+
+void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void)
+{
+ genl_unregister_family(&ZZZ_genl_family);
+}
+
+/*
+ * Magic: provide conversion functions {{{1
+ * populate skb from struct.
+ * {{{2
+ */
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list)
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+static int s_name ## _to_skb(struct sk_buff *skb, struct s_name *s, \
+ const bool exclude_sensitive) \
+{ \
+ struct nlattr *tla = nla_nest_start(skb, tag_number); \
+ if (!tla) \
+ goto nla_put_failure; \
+ DPRINT_TLA(#s_name, "-=>", #tag_name); \
+ s_fields \
+ nla_nest_end(skb, tla); \
+ return 0; \
+ \
+nla_put_failure: \
+ if (tla) \
+ nla_nest_cancel(skb, tla); \
+ return -EMSGSIZE; \
+} \
+static inline int s_name ## _to_priv_skb(struct sk_buff *skb, \
+ struct s_name *s) \
+{ \
+ return s_name ## _to_skb(skb, s, 0); \
+} \
+static inline int s_name ## _to_unpriv_skb(struct sk_buff *skb, \
+ struct s_name *s) \
+{ \
+ return s_name ## _to_skb(skb, s, 1); \
+}
+
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ __is_signed) \
+ if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
+ DPRINT_FIELD(">>", nla_type, name, s, NULL); \
+ if (__put(skb, attr_nr, s->name)) \
+ goto nla_put_failure; \
+ }
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, __is_signed) \
+ if (!exclude_sensitive || !((attr_flag) & DRBD_F_SENSITIVE)) { \
+ DPRINT_ARRAY(">>",nla_type, name, s, NULL); \
+ if (__put(skb, attr_nr, min_t(int, maxlen, \
+ s->name ## _len + (nla_type == NLA_NUL_STRING)),\
+ s->name)) \
+ goto nla_put_failure; \
+ }
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+
+/* Functions for initializing structs to default values. */
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ __is_signed)
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, __is_signed)
+#undef __u32_field_def
+#define __u32_field_def(attr_nr, attr_flag, name, default) \
+ x->name = default;
+#undef __s32_field_def
+#define __s32_field_def(attr_nr, attr_flag, name, default) \
+ x->name = default;
+#undef __flg_field_def
+#define __flg_field_def(attr_nr, attr_flag, name, default) \
+ x->name = default;
+#undef __str_field_def
+#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
+ memset(x->name, 0, sizeof(x->name)); \
+ x->name ## _len = 0;
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+static void set_ ## s_name ## _defaults(struct s_name *x) __attribute__((unused)); \
+static void set_ ## s_name ## _defaults(struct s_name *x) { \
+s_fields \
+}
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+#endif /* __KERNEL__ */
+
+/* }}}1 */
+#endif /* GENL_MAGIC_FUNC_H */
+/* vim: set foldmethod=marker foldlevel=1 nofoldenable : */
diff --git a/include/linux/genl_magic_struct.h b/include/linux/genl_magic_struct.h
new file mode 100644
index 0000000..eecd19b
--- /dev/null
+++ b/include/linux/genl_magic_struct.h
@@ -0,0 +1,277 @@
+#ifndef GENL_MAGIC_STRUCT_H
+#define GENL_MAGIC_STRUCT_H
+
+#ifndef GENL_MAGIC_FAMILY
+# error "you need to define GENL_MAGIC_FAMILY before inclusion"
+#endif
+
+#ifndef GENL_MAGIC_VERSION
+# error "you need to define GENL_MAGIC_VERSION before inclusion"
+#endif
+
+#ifndef GENL_MAGIC_INCLUDE_FILE
+# error "you need to define GENL_MAGIC_INCLUDE_FILE before inclusion"
+#endif
+
+#include <linux/genetlink.h>
+#include <linux/types.h>
+
+#define CONCAT__(a,b) a ## b
+#define CONCAT_(a,b) CONCAT__(a,b)
+
+extern int CONCAT_(GENL_MAGIC_FAMILY, _genl_register)(void);
+extern void CONCAT_(GENL_MAGIC_FAMILY, _genl_unregister)(void);
+
+/*
+ * Extension of genl attribute validation policies {{{2
+ */
+
+/*
+ * @DRBD_GENLA_F_MANDATORY: By default, netlink ignores attributes it does not
+ * know about. This flag can be set in nlattr->nla_type to indicate that this
+ * attribute must not be ignored.
+ *
+ * We check and remove this flag in drbd_nla_check_mandatory() before
+ * validating the attribute types and lengths via nla_parse_nested().
+ */
+#define DRBD_GENLA_F_MANDATORY (1 << 14)
+
+/*
+ * Flags specific to drbd and not visible at the netlink layer, used in
+ * <struct>_from_attrs and <struct>_to_skb:
+ *
+ * @DRBD_F_REQUIRED: Attribute is required; a request without this attribute is
+ * invalid.
+ *
+ * @DRBD_F_SENSITIVE: Attribute includes sensitive information and must not be
+ * included in unpriviledged get requests or broadcasts.
+ *
+ * @DRBD_F_INVARIANT: Attribute is set when an object is initially created, but
+ * cannot subsequently be changed.
+ */
+#define DRBD_F_REQUIRED (1 << 0)
+#define DRBD_F_SENSITIVE (1 << 1)
+#define DRBD_F_INVARIANT (1 << 2)
+
+#define __nla_type(x) ((__u16)((x) & NLA_TYPE_MASK & ~DRBD_GENLA_F_MANDATORY))
+
+/* }}}1
+ * MAGIC
+ * multi-include macro expansion magic starts here
+ */
+
+/* MAGIC helpers {{{2 */
+
+/* possible field types */
+#define __flg_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U8, char, \
+ nla_get_u8, nla_put_u8, false)
+#define __u8_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U8, unsigned char, \
+ nla_get_u8, nla_put_u8, false)
+#define __u16_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U16, __u16, \
+ nla_get_u16, nla_put_u16, false)
+#define __u32_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U32, __u32, \
+ nla_get_u32, nla_put_u32, false)
+#define __s32_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U32, __s32, \
+ nla_get_u32, nla_put_u32, true)
+#define __u64_field(attr_nr, attr_flag, name) \
+ __field(attr_nr, attr_flag, name, NLA_U64, __u64, \
+ nla_get_u64, nla_put_u64, false)
+#define __str_field(attr_nr, attr_flag, name, maxlen) \
+ __array(attr_nr, attr_flag, name, NLA_NUL_STRING, char, maxlen, \
+ nla_strlcpy, nla_put, false)
+#define __bin_field(attr_nr, attr_flag, name, maxlen) \
+ __array(attr_nr, attr_flag, name, NLA_BINARY, char, maxlen, \
+ nla_memcpy, nla_put, false)
+
+/* fields with default values */
+#define __flg_field_def(attr_nr, attr_flag, name, default) \
+ __flg_field(attr_nr, attr_flag, name)
+#define __u32_field_def(attr_nr, attr_flag, name, default) \
+ __u32_field(attr_nr, attr_flag, name)
+#define __s32_field_def(attr_nr, attr_flag, name, default) \
+ __s32_field(attr_nr, attr_flag, name)
+#define __str_field_def(attr_nr, attr_flag, name, maxlen) \
+ __str_field(attr_nr, attr_flag, name, maxlen)
+
+#define GENL_op_init(args...) args
+#define GENL_doit(handler) \
+ .doit = handler, \
+ .flags = GENL_ADMIN_PERM,
+#define GENL_dumpit(handler) \
+ .dumpit = handler, \
+ .flags = GENL_ADMIN_PERM,
+
+/* }}}1
+ * Magic: define the enum symbols for genl_ops
+ * Magic: define the enum symbols for top level attributes
+ * Magic: define the enum symbols for nested attributes
+ * {{{2
+ */
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields)
+
+#undef GENL_mc_group
+#define GENL_mc_group(group)
+
+#undef GENL_notification
+#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
+ op_name = op_num,
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, tla_list) \
+ op_name = op_num,
+
+enum {
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
+#undef GENL_notification
+#define GENL_notification(op_name, op_num, mcast_group, tla_list)
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, attr_list)
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+ tag_name = tag_number,
+
+enum {
+#include GENL_MAGIC_INCLUDE_FILE
+};
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+enum { \
+ s_fields \
+};
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, \
+ __get, __put, __is_signed) \
+ T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, \
+ maxlen, __get, __put, __is_signed) \
+ T_ ## name = (__u16)(attr_nr | ((attr_flag) & DRBD_GENLA_F_MANDATORY)),
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+/* }}}1
+ * Magic: compile time assert unique numbers for operations
+ * Magic: -"- unique numbers for top level attributes
+ * Magic: -"- unique numbers for nested attributes
+ * {{{2
+ */
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields)
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, attr_list) \
+ case op_name:
+
+#undef GENL_notification
+#define GENL_notification(op_name, op_num, mcast_group, tla_list) \
+ case op_name:
+
+static inline void ct_assert_unique_operations(void)
+{
+ switch (0) {
+#include GENL_MAGIC_INCLUDE_FILE
+ ;
+ }
+}
+
+#undef GENL_op
+#define GENL_op(op_name, op_num, handler, attr_list)
+
+#undef GENL_notification
+#define GENL_notification(op_name, op_num, mcast_group, tla_list)
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+ case tag_number:
+
+static inline void ct_assert_unique_top_level_attributes(void)
+{
+ switch (0) {
+#include GENL_MAGIC_INCLUDE_FILE
+ ;
+ }
+}
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+static inline void ct_assert_unique_ ## s_name ## _attributes(void) \
+{ \
+ switch (0) { \
+ s_fields \
+ ; \
+ } \
+}
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ __is_signed) \
+ case attr_nr:
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, __is_signed) \
+ case attr_nr:
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+/* }}}1
+ * Magic: declare structs
+ * struct <name> {
+ * fields
+ * };
+ * {{{2
+ */
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+struct s_name { s_fields };
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ __is_signed) \
+ type name;
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, __is_signed) \
+ type name[maxlen]; \
+ __u32 name ## _len;
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+#undef GENL_struct
+#define GENL_struct(tag_name, tag_number, s_name, s_fields) \
+enum { \
+ s_fields \
+};
+
+#undef __field
+#define __field(attr_nr, attr_flag, name, nla_type, type, __get, __put, \
+ is_signed) \
+ F_ ## name ## _IS_SIGNED = is_signed,
+
+#undef __array
+#define __array(attr_nr, attr_flag, name, nla_type, type, maxlen, \
+ __get, __put, is_signed) \
+ F_ ## name ## _IS_SIGNED = is_signed,
+
+#include GENL_MAGIC_INCLUDE_FILE
+
+/* }}}1 */
+#endif /* GENL_MAGIC_STRUCT_H */
+/* vim: set foldmethod=marker nofoldenable : */
diff --git a/include/linux/idr.h b/include/linux/idr.h
index 87259a4..de7e190 100644
--- a/include/linux/idr.h
+++ b/include/linux/idr.h
@@ -152,4 +152,15 @@
void __init idr_init_cache(void);
+/**
+ * idr_for_each_entry - iterate over an idr's elements of a given type
+ * @idp: idr handle
+ * @entry: the type * to use as cursor
+ * @id: id entry's key
+ */
+#define idr_for_each_entry(idp, entry, id) \
+ for (id = 0, entry = (typeof(entry))idr_get_next((idp), &(id)); \
+ entry != NULL; \
+ ++id, entry = (typeof(entry))idr_get_next((idp), &(id)))
+
#endif /* __IDR_H__ */
diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h
index 7a71ffa..cbafae4 100644
--- a/include/linux/lru_cache.h
+++ b/include/linux/lru_cache.h
@@ -166,9 +166,11 @@
/* if we want to track a larger set of objects,
* it needs to become arch independend u64 */
unsigned lc_number;
-
/* special label when on free list */
#define LC_FREE (~0U)
+
+ /* for pending changes */
+ unsigned lc_new_number;
};
struct lru_cache {
@@ -176,6 +178,7 @@
struct list_head lru;
struct list_head free;
struct list_head in_use;
+ struct list_head to_be_changed;
/* the pre-created kmem cache to allocate the objects from */
struct kmem_cache *lc_cache;
@@ -186,7 +189,7 @@
size_t element_off;
/* number of elements (indices) */
- unsigned int nr_elements;
+ unsigned int nr_elements;
/* Arbitrary limit on maximum tracked objects. Practical limit is much
* lower due to allocation failures, probably. For typical use cases,
* nr_elements should be a few thousand at most.
@@ -194,18 +197,19 @@
* 8 high bits of .lc_index to be overloaded with flags in the future. */
#define LC_MAX_ACTIVE (1<<24)
+ /* allow to accumulate a few (index:label) changes,
+ * but no more than max_pending_changes */
+ unsigned int max_pending_changes;
+ /* number of elements currently on to_be_changed list */
+ unsigned int pending_changes;
+
/* statistics */
- unsigned used; /* number of lelements currently on in_use list */
- unsigned long hits, misses, starving, dirty, changed;
+ unsigned used; /* number of elements currently on in_use list */
+ unsigned long hits, misses, starving, locked, changed;
/* see below: flag-bits for lru_cache */
unsigned long flags;
- /* when changing the label of an index element */
- unsigned int new_number;
-
- /* for paranoia when changing the label of an index element */
- struct lc_element *changing_element;
void *lc_private;
const char *name;
@@ -221,10 +225,15 @@
/* debugging aid, to catch concurrent access early.
* user needs to guarantee exclusive access by proper locking! */
__LC_PARANOIA,
- /* if we need to change the set, but currently there is a changing
- * transaction pending, we are "dirty", and must deferr further
- * changing requests */
+
+ /* annotate that the set is "dirty", possibly accumulating further
+ * changes, until a transaction is finally triggered */
__LC_DIRTY,
+
+ /* Locked, no further changes allowed.
+ * Also used to serialize changing transactions. */
+ __LC_LOCKED,
+
/* if we need to change the set, but currently there is no free nor
* unused element available, we are "starving", and must not give out
* further references, to guarantee that eventually some refcnt will
@@ -236,9 +245,11 @@
};
#define LC_PARANOIA (1<<__LC_PARANOIA)
#define LC_DIRTY (1<<__LC_DIRTY)
+#define LC_LOCKED (1<<__LC_LOCKED)
#define LC_STARVING (1<<__LC_STARVING)
extern struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+ unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off);
extern void lc_reset(struct lru_cache *lc);
extern void lc_destroy(struct lru_cache *lc);
@@ -249,7 +260,7 @@
extern struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr);
extern struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr);
extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e);
-extern void lc_changed(struct lru_cache *lc, struct lc_element *e);
+extern void lc_committed(struct lru_cache *lc);
struct seq_file;
extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc);
@@ -258,16 +269,28 @@
void (*detail) (struct seq_file *, struct lc_element *));
/**
- * lc_try_lock - can be used to stop lc_get() from changing the tracked set
+ * lc_try_lock_for_transaction - can be used to stop lc_get() from changing the tracked set
+ * @lc: the lru cache to operate on
+ *
+ * Allows (expects) the set to be "dirty". Note that the reference counts and
+ * order on the active and lru lists may still change. Used to serialize
+ * changing transactions. Returns true if we aquired the lock.
+ */
+static inline int lc_try_lock_for_transaction(struct lru_cache *lc)
+{
+ return !test_and_set_bit(__LC_LOCKED, &lc->flags);
+}
+
+/**
+ * lc_try_lock - variant to stop lc_get() from changing the tracked set
* @lc: the lru cache to operate on
*
* Note that the reference counts and order on the active and lru lists may
- * still change. Returns true if we acquired the lock.
+ * still change. Only works on a "clean" set. Returns true if we aquired the
+ * lock, which means there are no pending changes, and any further attempt to
+ * change the set will not succeed until the next lc_unlock().
*/
-static inline int lc_try_lock(struct lru_cache *lc)
-{
- return !test_and_set_bit(__LC_DIRTY, &lc->flags);
-}
+extern int lc_try_lock(struct lru_cache *lc);
/**
* lc_unlock - unlock @lc, allow lc_get() to change the set again
@@ -276,14 +299,10 @@
static inline void lc_unlock(struct lru_cache *lc)
{
clear_bit(__LC_DIRTY, &lc->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(__LC_LOCKED, &lc->flags);
}
-static inline int lc_is_used(struct lru_cache *lc, unsigned int enr)
-{
- struct lc_element *e = lc_find(lc, enr);
- return e && e->refcnt;
-}
+extern bool lc_is_used(struct lru_cache *lc, unsigned int enr);
#define lc_entry(ptr, type, member) \
container_of(ptr, type, member)
diff --git a/lib/lru_cache.c b/lib/lru_cache.c
index a07e726..d71d894 100644
--- a/lib/lru_cache.c
+++ b/lib/lru_cache.c
@@ -44,8 +44,8 @@
} while (0)
#define RETURN(x...) do { \
- clear_bit(__LC_PARANOIA, &lc->flags); \
- smp_mb__after_clear_bit(); return x ; } while (0)
+ clear_bit_unlock(__LC_PARANOIA, &lc->flags); \
+ return x ; } while (0)
/* BUG() if e is not one of the elements tracked by lc */
#define PARANOIA_LC_ELEMENT(lc, e) do { \
@@ -55,9 +55,40 @@
BUG_ON(i >= lc_->nr_elements); \
BUG_ON(lc_->lc_element[i] != e_); } while (0)
+
+/* We need to atomically
+ * - try to grab the lock (set LC_LOCKED)
+ * - only if there is no pending transaction
+ * (neither LC_DIRTY nor LC_STARVING is set)
+ * Because of PARANOIA_ENTRY() above abusing lc->flags as well,
+ * it is not sufficient to just say
+ * return 0 == cmpxchg(&lc->flags, 0, LC_LOCKED);
+ */
+int lc_try_lock(struct lru_cache *lc)
+{
+ unsigned long val;
+ do {
+ val = cmpxchg(&lc->flags, 0, LC_LOCKED);
+ } while (unlikely (val == LC_PARANOIA));
+ /* Spin until no-one is inside a PARANOIA_ENTRY()/RETURN() section. */
+ return 0 == val;
+#if 0
+ /* Alternative approach, spin in case someone enters or leaves a
+ * PARANOIA_ENTRY()/RETURN() section. */
+ unsigned long old, new, val;
+ do {
+ old = lc->flags & LC_PARANOIA;
+ new = old | LC_LOCKED;
+ val = cmpxchg(&lc->flags, old, new);
+ } while (unlikely (val == (old ^ LC_PARANOIA)));
+ return old == val;
+#endif
+}
+
/**
* lc_create - prepares to track objects in an active set
* @name: descriptive name only used in lc_seq_printf_stats and lc_seq_dump_details
+ * @max_pending_changes: maximum changes to accumulate until a transaction is required
* @e_count: number of elements allowed to be active simultaneously
* @e_size: size of the tracked objects
* @e_off: offset to the &struct lc_element member in a tracked object
@@ -66,6 +97,7 @@
* or NULL on (allocation) failure.
*/
struct lru_cache *lc_create(const char *name, struct kmem_cache *cache,
+ unsigned max_pending_changes,
unsigned e_count, size_t e_size, size_t e_off)
{
struct hlist_head *slot = NULL;
@@ -98,12 +130,13 @@
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
+ INIT_LIST_HEAD(&lc->to_be_changed);
lc->name = name;
lc->element_size = e_size;
lc->element_off = e_off;
lc->nr_elements = e_count;
- lc->new_number = LC_FREE;
+ lc->max_pending_changes = max_pending_changes;
lc->lc_cache = cache;
lc->lc_element = element;
lc->lc_slot = slot;
@@ -117,6 +150,7 @@
e = p + e_off;
e->lc_index = i;
e->lc_number = LC_FREE;
+ e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
element[i] = e;
}
@@ -175,15 +209,15 @@
INIT_LIST_HEAD(&lc->in_use);
INIT_LIST_HEAD(&lc->lru);
INIT_LIST_HEAD(&lc->free);
+ INIT_LIST_HEAD(&lc->to_be_changed);
lc->used = 0;
lc->hits = 0;
lc->misses = 0;
lc->starving = 0;
- lc->dirty = 0;
+ lc->locked = 0;
lc->changed = 0;
+ lc->pending_changes = 0;
lc->flags = 0;
- lc->changing_element = NULL;
- lc->new_number = LC_FREE;
memset(lc->lc_slot, 0, sizeof(struct hlist_head) * lc->nr_elements);
for (i = 0; i < lc->nr_elements; i++) {
@@ -194,6 +228,7 @@
/* re-init it */
e->lc_index = i;
e->lc_number = LC_FREE;
+ e->lc_new_number = LC_FREE;
list_add(&e->list, &lc->free);
}
}
@@ -208,14 +243,14 @@
/* NOTE:
* total calls to lc_get are
* (starving + hits + misses)
- * misses include "dirty" count (update from an other thread in
+ * misses include "locked" count (update from an other thread in
* progress) and "changed", when this in fact lead to an successful
* update of the cache.
*/
return seq_printf(seq, "\t%s: used:%u/%u "
- "hits:%lu misses:%lu starving:%lu dirty:%lu changed:%lu\n",
+ "hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n",
lc->name, lc->used, lc->nr_elements,
- lc->hits, lc->misses, lc->starving, lc->dirty, lc->changed);
+ lc->hits, lc->misses, lc->starving, lc->locked, lc->changed);
}
static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr)
@@ -224,6 +259,27 @@
}
+static struct lc_element *__lc_find(struct lru_cache *lc, unsigned int enr,
+ bool include_changing)
+{
+ struct hlist_node *n;
+ struct lc_element *e;
+
+ BUG_ON(!lc);
+ BUG_ON(!lc->nr_elements);
+ hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
+ /* "about to be changed" elements, pending transaction commit,
+ * are hashed by their "new number". "Normal" elements have
+ * lc_number == lc_new_number. */
+ if (e->lc_new_number != enr)
+ continue;
+ if (e->lc_new_number == e->lc_number || include_changing)
+ return e;
+ break;
+ }
+ return NULL;
+}
+
/**
* lc_find - find element by label, if present in the hash table
* @lc: The lru_cache object
@@ -232,38 +288,28 @@
* Returns the pointer to an element, if the element with the requested
* "label" or element number is present in the hash table,
* or NULL if not found. Does not change the refcnt.
+ * Ignores elements that are "about to be used", i.e. not yet in the active
+ * set, but still pending transaction commit.
*/
struct lc_element *lc_find(struct lru_cache *lc, unsigned int enr)
{
- struct hlist_node *n;
- struct lc_element *e;
-
- BUG_ON(!lc);
- BUG_ON(!lc->nr_elements);
- hlist_for_each_entry(e, n, lc_hash_slot(lc, enr), colision) {
- if (e->lc_number == enr)
- return e;
- }
- return NULL;
+ return __lc_find(lc, enr, 0);
}
-/* returned element will be "recycled" immediately */
-static struct lc_element *lc_evict(struct lru_cache *lc)
+/**
+ * lc_is_used - find element by label
+ * @lc: The lru_cache object
+ * @enr: element number
+ *
+ * Returns true, if the element with the requested "label" or element number is
+ * present in the hash table, and is used (refcnt > 0).
+ * Also finds elements that are not _currently_ used but only "about to be
+ * used", i.e. on the "to_be_changed" list, pending transaction commit.
+ */
+bool lc_is_used(struct lru_cache *lc, unsigned int enr)
{
- struct list_head *n;
- struct lc_element *e;
-
- if (list_empty(&lc->lru))
- return NULL;
-
- n = lc->lru.prev;
- e = list_entry(n, struct lc_element, list);
-
- PARANOIA_LC_ELEMENT(lc, e);
-
- list_del(&e->list);
- hlist_del(&e->colision);
- return e;
+ struct lc_element *e = __lc_find(lc, enr, 1);
+ return e && e->refcnt;
}
/**
@@ -280,22 +326,34 @@
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt);
- e->lc_number = LC_FREE;
+ e->lc_number = e->lc_new_number = LC_FREE;
hlist_del_init(&e->colision);
list_move(&e->list, &lc->free);
RETURN();
}
-static struct lc_element *lc_get_unused_element(struct lru_cache *lc)
+static struct lc_element *lc_prepare_for_change(struct lru_cache *lc, unsigned new_number)
{
struct list_head *n;
+ struct lc_element *e;
- if (list_empty(&lc->free))
- return lc_evict(lc);
+ if (!list_empty(&lc->free))
+ n = lc->free.next;
+ else if (!list_empty(&lc->lru))
+ n = lc->lru.prev;
+ else
+ return NULL;
- n = lc->free.next;
- list_del(n);
- return list_entry(n, struct lc_element, list);
+ e = list_entry(n, struct lc_element, list);
+ PARANOIA_LC_ELEMENT(lc, e);
+
+ e->lc_new_number = new_number;
+ if (!hlist_unhashed(&e->colision))
+ __hlist_del(&e->colision);
+ hlist_add_head(&e->colision, lc_hash_slot(lc, new_number));
+ list_move(&e->list, &lc->to_be_changed);
+
+ return e;
}
static int lc_unused_element_available(struct lru_cache *lc)
@@ -308,6 +366,75 @@
return 0;
}
+static struct lc_element *__lc_get(struct lru_cache *lc, unsigned int enr, bool may_change)
+{
+ struct lc_element *e;
+
+ PARANOIA_ENTRY();
+ if (lc->flags & LC_STARVING) {
+ ++lc->starving;
+ RETURN(NULL);
+ }
+
+ e = __lc_find(lc, enr, 1);
+ /* if lc_new_number != lc_number,
+ * this enr is currently being pulled in already,
+ * and will be available once the pending transaction
+ * has been committed. */
+ if (e && e->lc_new_number == e->lc_number) {
+ ++lc->hits;
+ if (e->refcnt++ == 0)
+ lc->used++;
+ list_move(&e->list, &lc->in_use); /* Not evictable... */
+ RETURN(e);
+ }
+
+ ++lc->misses;
+ if (!may_change)
+ RETURN(NULL);
+
+ /* It has been found above, but on the "to_be_changed" list, not yet
+ * committed. Don't pull it in twice, wait for the transaction, then
+ * try again */
+ if (e)
+ RETURN(NULL);
+
+ /* To avoid races with lc_try_lock(), first, mark us dirty
+ * (using test_and_set_bit, as it implies memory barriers), ... */
+ test_and_set_bit(__LC_DIRTY, &lc->flags);
+
+ /* ... only then check if it is locked anyways. If lc_unlock clears
+ * the dirty bit again, that's not a problem, we will come here again.
+ */
+ if (test_bit(__LC_LOCKED, &lc->flags)) {
+ ++lc->locked;
+ RETURN(NULL);
+ }
+
+ /* In case there is nothing available and we can not kick out
+ * the LRU element, we have to wait ...
+ */
+ if (!lc_unused_element_available(lc)) {
+ __set_bit(__LC_STARVING, &lc->flags);
+ RETURN(NULL);
+ }
+
+ /* It was not present in the active set. We are going to recycle an
+ * unused (or even "free") element, but we won't accumulate more than
+ * max_pending_changes changes. */
+ if (lc->pending_changes >= lc->max_pending_changes)
+ RETURN(NULL);
+
+ e = lc_prepare_for_change(lc, enr);
+ BUG_ON(!e);
+
+ clear_bit(__LC_STARVING, &lc->flags);
+ BUG_ON(++e->refcnt != 1);
+ lc->used++;
+ lc->pending_changes++;
+
+ RETURN(e);
+}
/**
* lc_get - get element by label, maybe change the active set
@@ -336,110 +463,65 @@
* pointer to an UNUSED element with some different element number,
* where that different number may also be %LC_FREE.
*
- * In this case, the cache is marked %LC_DIRTY (blocking further changes),
- * and the returned element pointer is removed from the lru list and
- * hash collision chains. The user now should do whatever housekeeping
- * is necessary.
- * Then he must call lc_changed(lc,element_pointer), to finish
- * the change.
+ * In this case, the cache is marked %LC_DIRTY,
+ * so lc_try_lock() will no longer succeed.
+ * The returned element pointer is moved to the "to_be_changed" list,
+ * and registered with the new element number on the hash collision chains,
+ * so it is possible to pick it up from lc_is_used().
+ * Up to "max_pending_changes" (see lc_create()) can be accumulated.
+ * The user now should do whatever housekeeping is necessary,
+ * typically serialize on lc_try_lock_for_transaction(), then call
+ * lc_committed(lc) and lc_unlock(), to finish the change.
*
* NOTE: The user needs to check the lc_number on EACH use, so he recognizes
* any cache set change.
*/
struct lc_element *lc_get(struct lru_cache *lc, unsigned int enr)
{
- struct lc_element *e;
-
- PARANOIA_ENTRY();
- if (lc->flags & LC_STARVING) {
- ++lc->starving;
- RETURN(NULL);
- }
-
- e = lc_find(lc, enr);
- if (e) {
- ++lc->hits;
- if (e->refcnt++ == 0)
- lc->used++;
- list_move(&e->list, &lc->in_use); /* Not evictable... */
- RETURN(e);
- }
-
- ++lc->misses;
-
- /* In case there is nothing available and we can not kick out
- * the LRU element, we have to wait ...
- */
- if (!lc_unused_element_available(lc)) {
- __set_bit(__LC_STARVING, &lc->flags);
- RETURN(NULL);
- }
-
- /* it was not present in the active set.
- * we are going to recycle an unused (or even "free") element.
- * user may need to commit a transaction to record that change.
- * we serialize on flags & TF_DIRTY */
- if (test_and_set_bit(__LC_DIRTY, &lc->flags)) {
- ++lc->dirty;
- RETURN(NULL);
- }
-
- e = lc_get_unused_element(lc);
- BUG_ON(!e);
-
- clear_bit(__LC_STARVING, &lc->flags);
- BUG_ON(++e->refcnt != 1);
- lc->used++;
-
- lc->changing_element = e;
- lc->new_number = enr;
-
- RETURN(e);
-}
-
-/* similar to lc_get,
- * but only gets a new reference on an existing element.
- * you either get the requested element, or NULL.
- * will be consolidated into one function.
- */
-struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
-{
- struct lc_element *e;
-
- PARANOIA_ENTRY();
- if (lc->flags & LC_STARVING) {
- ++lc->starving;
- RETURN(NULL);
- }
-
- e = lc_find(lc, enr);
- if (e) {
- ++lc->hits;
- if (e->refcnt++ == 0)
- lc->used++;
- list_move(&e->list, &lc->in_use); /* Not evictable... */
- }
- RETURN(e);
+ return __lc_get(lc, enr, 1);
}
/**
- * lc_changed - tell @lc that the change has been recorded
+ * lc_try_get - get element by label, if present; do not change the active set
* @lc: the lru cache to operate on
- * @e: the element pending label change
+ * @enr: the label to look up
+ *
+ * Finds an element in the cache, increases its usage count,
+ * "touches" and returns it.
+ *
+ * Return values:
+ * NULL
+ * The cache was marked %LC_STARVING,
+ * or the requested label was not in the active set
+ *
+ * pointer to the element with the REQUESTED element number.
+ * In this case, it can be used right away
*/
-void lc_changed(struct lru_cache *lc, struct lc_element *e)
+struct lc_element *lc_try_get(struct lru_cache *lc, unsigned int enr)
{
+ return __lc_get(lc, enr, 0);
+}
+
+/**
+ * lc_committed - tell @lc that pending changes have been recorded
+ * @lc: the lru cache to operate on
+ *
+ * User is expected to serialize on explicit lc_try_lock_for_transaction()
+ * before the transaction is started, and later needs to lc_unlock() explicitly
+ * as well.
+ */
+void lc_committed(struct lru_cache *lc)
+{
+ struct lc_element *e, *tmp;
+
PARANOIA_ENTRY();
- BUG_ON(e != lc->changing_element);
- PARANOIA_LC_ELEMENT(lc, e);
- ++lc->changed;
- e->lc_number = lc->new_number;
- list_add(&e->list, &lc->in_use);
- hlist_add_head(&e->colision, lc_hash_slot(lc, lc->new_number));
- lc->changing_element = NULL;
- lc->new_number = LC_FREE;
- clear_bit(__LC_DIRTY, &lc->flags);
- smp_mb__after_clear_bit();
+ list_for_each_entry_safe(e, tmp, &lc->to_be_changed, list) {
+ /* count number of changes, not number of transactions */
+ ++lc->changed;
+ e->lc_number = e->lc_new_number;
+ list_move(&e->list, &lc->in_use);
+ }
+ lc->pending_changes = 0;
RETURN();
}
@@ -458,13 +540,12 @@
PARANOIA_ENTRY();
PARANOIA_LC_ELEMENT(lc, e);
BUG_ON(e->refcnt == 0);
- BUG_ON(e == lc->changing_element);
+ BUG_ON(e->lc_number != e->lc_new_number);
if (--e->refcnt == 0) {
/* move it to the front of LRU. */
list_move(&e->list, &lc->lru);
lc->used--;
- clear_bit(__LC_STARVING, &lc->flags);
- smp_mb__after_clear_bit();
+ clear_bit_unlock(__LC_STARVING, &lc->flags);
}
RETURN(e->refcnt);
}
@@ -504,16 +585,24 @@
void lc_set(struct lru_cache *lc, unsigned int enr, int index)
{
struct lc_element *e;
+ struct list_head *lh;
if (index < 0 || index >= lc->nr_elements)
return;
e = lc_element_by_index(lc, index);
- e->lc_number = enr;
+ BUG_ON(e->lc_number != e->lc_new_number);
+ BUG_ON(e->refcnt != 0);
+ e->lc_number = e->lc_new_number = enr;
hlist_del_init(&e->colision);
- hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
- list_move(&e->list, e->refcnt ? &lc->in_use : &lc->lru);
+ if (enr == LC_FREE)
+ lh = &lc->free;
+ else {
+ hlist_add_head(&e->colision, lc_hash_slot(lc, enr));
+ lh = &lc->lru;
+ }
+ list_move(&e->list, lh);
}
/**
@@ -553,8 +642,10 @@
EXPORT_SYMBOL(lc_find);
EXPORT_SYMBOL(lc_get);
EXPORT_SYMBOL(lc_put);
-EXPORT_SYMBOL(lc_changed);
+EXPORT_SYMBOL(lc_committed);
EXPORT_SYMBOL(lc_element_by_index);
EXPORT_SYMBOL(lc_index_of);
EXPORT_SYMBOL(lc_seq_printf_stats);
EXPORT_SYMBOL(lc_seq_dump_details);
+EXPORT_SYMBOL(lc_try_lock);
+EXPORT_SYMBOL(lc_is_used);