drbd: Implemented priority inheritance for resync requests

We only issue resync requests if there is no significant application IO
going on. = Application IO has higher priority than resnyc IO.

If application IO can not be started because the resync process locked
an resync_lru entry, start the IO operations necessary to release the
lock ASAP.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 33f6cc5..28f85d9 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -182,6 +182,7 @@
 	if (unlikely(tmp != NULL)) {
 		struct bm_extent  *bm_ext = lc_entry(tmp, struct bm_extent, lce);
 		if (test_bit(BME_NO_WRITES, &bm_ext->flags)) {
+			set_bit(BME_PRIORITY, &bm_ext->flags);
 			spin_unlock_irq(&mdev->al_lock);
 			return NULL;
 		}
@@ -1297,8 +1298,7 @@
 	}
 
 	if (lc_put(mdev->resync, &bm_ext->lce) == 0) {
-		clear_bit(BME_LOCKED, &bm_ext->flags);
-		clear_bit(BME_NO_WRITES, &bm_ext->flags);
+		bm_ext->flags = 0; /* clear BME_LOCKED, BME_NO_WRITES and BME_PRIORITY */
 		mdev->resync_locked--;
 		wake_up(&mdev->al_wait);
 	}
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index da02cce..366873d 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1326,6 +1326,7 @@
 
 #define BME_NO_WRITES  0  /* bm_extent.flags: no more requests on this one! */
 #define BME_LOCKED     1  /* bm_extent.flags: syncer active on this one. */
+#define BME_PRIORITY   2  /* finish resync IO on this extent ASAP! App IO waiting! */
 
 /* drbd_bitmap.c */
 /*
@@ -1552,7 +1553,7 @@
 extern void resync_timer_fn(unsigned long data);
 
 /* drbd_receiver.c */
-extern int drbd_rs_should_slow_down(struct drbd_conf *mdev);
+extern int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector);
 extern int drbd_submit_ee(struct drbd_conf *mdev, struct drbd_epoch_entry *e,
 		const unsigned rw, const int fault_type);
 extern int drbd_release_ee(struct drbd_conf *mdev, struct list_head *list);
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index ee9238e..0630a2e 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1862,10 +1862,11 @@
  * The current sync rate used here uses only the most recent two step marks,
  * to have a short time average so we can react faster.
  */
-int drbd_rs_should_slow_down(struct drbd_conf *mdev)
+int drbd_rs_should_slow_down(struct drbd_conf *mdev, sector_t sector)
 {
 	struct gendisk *disk = mdev->ldev->backing_bdev->bd_contains->bd_disk;
 	unsigned long db, dt, dbdt;
+	struct lc_element *tmp;
 	int curr_events;
 	int throttle = 0;
 
@@ -1873,9 +1874,22 @@
 	if (mdev->sync_conf.c_min_rate == 0)
 		return 0;
 
+	spin_lock_irq(&mdev->al_lock);
+	tmp = lc_find(mdev->resync, BM_SECT_TO_EXT(sector));
+	if (tmp) {
+		struct bm_extent *bm_ext = lc_entry(tmp, struct bm_extent, lce);
+		if (test_bit(BME_PRIORITY, &bm_ext->flags)) {
+			spin_unlock_irq(&mdev->al_lock);
+			return 0;
+		}
+		/* Do not slow down if app IO is already waiting for this extent */
+	}
+	spin_unlock_irq(&mdev->al_lock);
+
 	curr_events = (int)part_stat_read(&disk->part0, sectors[0]) +
 		      (int)part_stat_read(&disk->part0, sectors[1]) -
 			atomic_read(&mdev->rs_sect_ev);
+
 	if (!mdev->rs_last_events || curr_events - mdev->rs_last_events > 64) {
 		unsigned long rs_left;
 		int i;
@@ -2060,9 +2074,9 @@
 	 * we would also throttle its application reads.
 	 * In that case, throttling is done on the SyncTarget only.
 	 */
-	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev))
-		msleep(100);
-	if (drbd_rs_begin_io(mdev, e->sector))
+	if (mdev->state.peer != R_PRIMARY && drbd_rs_should_slow_down(mdev, sector))
+		schedule_timeout_uninterruptible(HZ/10);
+	if (drbd_rs_begin_io(mdev, sector))
 		goto out_free_e;
 
 submit_for_resync:
diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c
index 982d684..4008130 100644
--- a/drivers/block/drbd/drbd_worker.c
+++ b/drivers/block/drbd/drbd_worker.c
@@ -355,7 +355,7 @@
 	if (!get_ldev(mdev))
 		return -EIO;
 
-	if (drbd_rs_should_slow_down(mdev))
+	if (drbd_rs_should_slow_down(mdev, sector))
 		goto defer;
 
 	/* GFP_TRY, because if there is no memory available right now, this may
@@ -503,16 +503,6 @@
 		number = SLEEP_TIME * mdev->c_sync_rate  / ((BM_BLOCK_SIZE / 1024) * HZ);
 	}
 
-	/* Throttle resync on lower level disk activity, which may also be
-	 * caused by application IO on Primary/SyncTarget.
-	 * Keep this after the call to drbd_rs_controller, as that assumes
-	 * to be called as precisely as possible every SLEEP_TIME,
-	 * and would be confused otherwise. */
-	if (number && drbd_rs_should_slow_down(mdev)) {
-		mdev->c_sync_rate = 1;
-		number = 0;
-	}
-
 	/* ignore the amount of pending requests, the resync controller should
 	 * throttle down to incoming reply rate soon enough anyways. */
 	return number;
@@ -594,7 +584,8 @@
 
 		sector = BM_BIT_TO_SECT(bit);
 
-		if (drbd_try_rs_begin_io(mdev, sector)) {
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
 			mdev->bm_resync_fo = bit;
 			goto requeue;
 		}
@@ -719,7 +710,8 @@
 
 		size = BM_BLOCK_SIZE;
 
-		if (drbd_try_rs_begin_io(mdev, sector)) {
+		if (drbd_rs_should_slow_down(mdev, sector) ||
+		    drbd_try_rs_begin_io(mdev, sector)) {
 			mdev->ov_position = sector;
 			goto requeue;
 		}