drbd: new configuration parameter c-min-rate
We now track the data rate of locally submitted resync related requests,
and can thus detect non-resync activity on the lower level device.
If the current sync rate is above c-min-rate, and the lower level device
appears to be busy, we throttle the resyncer.
Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index 0fce3f3..0fedcc0 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1513,6 +1513,7 @@
extern void resync_timer_fn(unsigned long data);
/* drbd_receiver.c */
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
const unsigned rw, const int fault_type);
extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 1ff8418a..db93eee 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -1098,6 +1098,8 @@
mdev->ov_left = mdev->rs_total
- BM_SECT_TO_BIT(mdev->ov_position);
mdev->rs_start = now;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
mdev->ov_last_oos_size = 0;
mdev->ov_last_oos_start = 0;
@@ -2706,7 +2708,8 @@
/* .c_plan_ahead = */ DRBD_C_PLAN_AHEAD_DEF,
/* .c_delay_target = */ DRBD_C_DELAY_TARGET_DEF,
/* .c_fill_target = */ DRBD_C_FILL_TARGET_DEF,
- /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF
+ /* .c_max_rate = */ DRBD_C_MAX_RATE_DEF,
+ /* .c_min_rate = */ DRBD_C_MIN_RATE_DEF
};
/* Have to use that way, because the layout differs between
@@ -2742,6 +2745,7 @@
atomic_set(&mdev->packet_seq, 0);
atomic_set(&mdev->pp_in_use, 0);
atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
mutex_init(&mdev->md_io_mutex);
mutex_init(&mdev->data.mutex);
@@ -2819,6 +2823,7 @@
mdev->rs_total =
mdev->rs_failed = 0;
mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
mdev->rs_mark_left[i] = 0;
mdev->rs_mark_time[i] = 0;
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 295b8d5..6b35d41 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -1604,7 +1604,8 @@
sc.c_plan_ahead = DRBD_C_PLAN_AHEAD_DEF;
sc.c_delay_target = DRBD_C_DELAY_TARGET_DEF;
sc.c_fill_target = DRBD_C_FILL_TARGET_DEF;
- sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+ sc.c_max_rate = DRBD_C_MAX_RATE_DEF;
+ sc.c_min_rate = DRBD_C_MIN_RATE_DEF;
} else
memcpy(&sc, &mdev->sync_conf, sizeof(struct syncer_conf));
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 346aed9..0d9967fef 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1561,6 +1561,7 @@
list_add(&e->w.list, &mdev->sync_ee);
spin_unlock_irq(&mdev->req_lock);
+ atomic_add(data_size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, WRITE, DRBD_FAULT_RS_WR) == 0)
return TRUE;
@@ -2017,17 +2018,66 @@
return FALSE;
}
+/* We may throttle resync, if the lower device seems to be busy,
+ * and current sync rate is above c_min_rate.
+ *
+ * To decide whether or not the lower device is busy, we use a scheme similar
+ * to MD RAID is_mddev_idle(): if the partition stats reveal "significant"
+ * (more than 64 sectors) of activity we cannot account for with our own resync
+ * activity, it obviously is "busy".
+ *
+ * The current sync rate used here uses only the most recent two step marks,
+ * to have a short time average so we can react faster.
+ */
+int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+{
+ struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
+ unsigned long db, dt, dbdt;
+ int curr_events;
+ int throttle = 0;
+
+ /* feature disabled? */
+ if (mdev->sync_conf.c_min_rate == 0)
+ return 0;
+
+ curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
+ (int)part_stat_read(&disk->part0, sectors[1]) -
+ atomic_read(&mdev->rs_sect_ev);
+ if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
+ unsigned long rs_left;
+ int i;
+
+ mdev->rs_last_events = curr_events;
+
+ /* sync speed average over the last 2*DRBD_SYNC_MARK_STEP,
+ * approx. */
+ i = (mdev->rs_last_mark + DRBD_SYNC_MARKS-2) % DRBD_SYNC_MARKS;
+ rs_left = drbd_bm_total_weight(mdev) - mdev->rs_failed;
+
+ dt = ((long)jiffies - (long)mdev->rs_mark_time[i]) / HZ;
+ if (!dt)
+ dt++;
+ db = mdev->rs_mark_left[i] - rs_left;
+ dbdt = Bit2KB(db/dt);
+
+ if (dbdt > mdev->sync_conf.c_min_rate)
+ throttle = 1;
+ }
+ return throttle;
+}
+
+
static int receive_DataRequest(struct drbd_conf *mdev, struct p_header *h)
{
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
struct drbd_epoch_entry *e;
struct digest_info *di = NULL;
+ struct p_block_req *p = (struct p_block_req *)h;
+ const int brps = sizeof(*p)-sizeof(*h);
int size, digest_size;
unsigned int fault_type;
- struct p_block_req *p =
- (struct p_block_req *)h;
- const int brps = sizeof(*p)-sizeof(*h);
+
if (drbd_recv(mdev, h->payload, brps) != brps)
return FALSE;
@@ -2099,8 +2149,9 @@
} else if (h->command == P_OV_REPLY) {
e->w.cb = w_e_end_ov_reply;
dec_rs_pending(mdev);
- /* drbd_rs_begin_io done when we sent this request */
- goto submit;
+ /* drbd_rs_begin_io done when we sent this request,
+ * but accounting still needs to be done. */
+ goto submit_for_resync;
}
break;
@@ -2128,9 +2179,36 @@
goto out_free_e;
}
+ /* Throttle, drbd_rs_begin_io and submit should become asynchronous
+ * wrt the receiver, but it is not as straightforward as it may seem.
+ * Various places in the resync start and stop logic assume resync
+ * requests are processed in order, requeuing this on the worker thread
+ * introduces a bunch of new code for synchronization between threads.
+ *
+ * Unlimited throttling before drbd_rs_begin_io may stall the resync
+ * "forever", throttling after drbd_rs_begin_io will lock that extent
+ * for application writes for the same time. For now, just throttle
+ * here, where the rest of the code expects the receiver to sleep for
+ * a while, anyways.
+ */
+
+ /* Throttle before drbd_rs_begin_io, as that locks out application IO;
+ * this defers syncer requests for some time, before letting at least
+ * on request through. The resync controller on the receiving side
+ * will adapt to the incoming rate accordingly.
+ *
+ * We cannot throttle here if remote is Primary/SyncTarget:
+ * we would also throttle its application reads.
+ * In that case, throttling is done on the SyncTarget only.
+ */
+ if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
+ msleep(100);
if (drbd_rs_begin_io(mdev, e->sector))
goto out_free_e;
+submit_for_resync:
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
+
submit:
inc_unacked(mdev);
spin_lock_irq(&mdev->req_lock);
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index f5d779b..99c937a 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -215,10 +215,8 @@
*/
void drbd_endio_pri(struct bio *bio, int error)
{
- unsigned long flags;
struct drbd_request *req = bio->bi_private;
struct drbd_conf *mdev = req->mdev;
- struct bio_and_error m;
enum drbd_req_event what;
int uptodate = bio_flagged(bio, BIO_UPTODATE);
@@ -244,12 +242,7 @@
bio_put(req->private_bio);
req->private_bio = ERR_PTR(error);
- spin_lock_irqsave(&mdev->req_lock, flags);
- __req_mod(req, what, &m);
- spin_unlock_irqrestore(&mdev->req_lock, flags);
-
- if (m.bio)
- complete_master_bio(mdev, &m);
+ req_mod(req, what);
}
int w_read_retry_remote(struct drbd_conf *mdev, struct drbd_work *w, int cancel)
@@ -376,6 +369,9 @@
if (!get_ldev(mdev))
return -EIO;
+ if (drbd_rs_should_slow_down(mdev))
+ goto defer;
+
/* GFP_TRY, because if there is no memory available right now, this may
* be rescheduled for later. It is "only" background resync, after all. */
e = drbd_alloc_ee(mdev, DRBD_MAGIC+0xbeef, sector, size, GFP_TRY);
@@ -387,6 +383,7 @@
list_add(&e->w.list, &mdev->read_ee);
spin_unlock_irq(&mdev->req_lock);
+ atomic_add(size >> 9, &mdev->rs_sect_ev);
if (drbd_submit_ee(mdev, e, READ, DRBD_FAULT_RS_RD) == 0)
return 0;
@@ -512,8 +509,9 @@
sector_t sector;
const sector_t capacity = drbd_get_capacity(mdev->this_bdev);
int max_segment_size;
- int number, i, rollback_i, size, pe, mx;
+ int number, rollback_i, size, pe, mx;
int align, queued, sndbuf;
+ int i = 0;
if (unlikely(cancel))
return 1;
@@ -549,7 +547,14 @@
mdev->c_sync_rate = mdev->sync_conf.rate;
number = SLEEP_TIME * mdev->c_sync_rate / ((BM_BLOCK_SIZE / 1024) * HZ);
}
- pe = atomic_read(&mdev->rs_pending_cnt);
+
+ /* Throttle resync on lower level disk activity, which may also be
+ * caused by application IO on Primary/SyncTarget.
+ * Keep this after the call to drbd_rs_controller, as that assumes
+ * to be called as precisely as possible every SLEEP_TIME,
+ * and would be confused otherwise. */
+ if (drbd_rs_should_slow_down(mdev))
+ goto requeue;
mutex_lock(&mdev->data.mutex);
if (mdev->data.socket)
@@ -563,6 +568,7 @@
mx = number;
/* Limit the number of pending RS requests to no more than the peer's receive buffer */
+ pe = atomic_read(&mdev->rs_pending_cnt);
if ((pe + number) > mx) {
number = mx - pe;
}
@@ -1492,6 +1498,8 @@
mdev->rs_failed = 0;
mdev->rs_paused = 0;
mdev->rs_same_csum = 0;
+ mdev->rs_last_events = 0;
+ mdev->rs_last_sect_ev = 0;
mdev->rs_total = tw;
mdev->rs_start = now;
for (i = 0; i < DRBD_SYNC_MARKS; i++) {
@@ -1516,6 +1524,7 @@
}
atomic_set(&mdev->rs_sect_in, 0);
+ atomic_set(&mdev->rs_sect_ev, 0);
mdev->rs_in_flight = 0;
mdev->rs_planed = 0;
spin_lock(&mdev->peer_seq_lock);