dm snapshot: skip reading origin when overwriting complete chunk

If we write a full chunk in the snapshot, skip reading the origin device
because the whole chunk will be overwritten anyway.

This patch changes the snapshot write logic when a full chunk is written.
In this case:
  1. allocate the exception
  2. dispatch the bio (but don't report the bio completion to device mapper)
  3. write the exception record
  4. report the bio as completed

Callbacks must be done through the kcopyd thread, because callbacks must not
race with each other.  So we create two new functions:

  dm_kcopyd_prepare_callback: allocate a job structure and prepare the callback.
  (This function must not be called from interrupt context.)

  dm_kcopyd_do_callback: submit the previously prepared callback.
  (This function may be called from interrupt context.)

Performance test (on snapshots with 4k chunk size):
  without the patch:
    non-direct-io sequential write (dd):    17.7MB/s
    direct-io sequential write (dd):        20.9MB/s
    non-direct-io random write (mkfs.ext2): 0.44s

  with the patch:
    non-direct-io sequential write (dd):    26.5MB/s
    direct-io sequential write (dd):        33.2MB/s
    non-direct-io random write (mkfs.ext2): 0.27s

Signed-off-by: Mikulas Patocka <mpatocka@redhat.com>
Signed-off-by: Alasdair G Kergon <agk@redhat.com>
diff --git a/drivers/md/dm-snap.c b/drivers/md/dm-snap.c
index 94dee05..6f75887 100644
--- a/drivers/md/dm-snap.c
+++ b/drivers/md/dm-snap.c
@@ -170,6 +170,13 @@
 	 * kcopyd.
 	 */
 	int started;
+
+	/*
+	 * For writing a complete chunk, bypassing the copy.
+	 */
+	struct bio *full_bio;
+	bio_end_io_t *full_bio_end_io;
+	void *full_bio_private;
 };
 
 /*
@@ -1369,6 +1376,7 @@
 	struct dm_snapshot *s = pe->snap;
 	struct bio *origin_bios = NULL;
 	struct bio *snapshot_bios = NULL;
+	struct bio *full_bio = NULL;
 	int error = 0;
 
 	if (!success) {
@@ -1408,6 +1416,11 @@
 	dm_remove_exception(&pe->e);
 	snapshot_bios = bio_list_get(&pe->snapshot_bios);
 	origin_bios = bio_list_get(&pe->origin_bios);
+	full_bio = pe->full_bio;
+	if (full_bio) {
+		full_bio->bi_end_io = pe->full_bio_end_io;
+		full_bio->bi_private = pe->full_bio_private;
+	}
 	free_pending_exception(pe);
 
 	increment_pending_exceptions_done_count();
@@ -1415,10 +1428,15 @@
 	up_write(&s->lock);
 
 	/* Submit any pending write bios */
-	if (error)
+	if (error) {
+		if (full_bio)
+			bio_io_error(full_bio);
 		error_bios(snapshot_bios);
-	else
+	} else {
+		if (full_bio)
+			bio_endio(full_bio, 0);
 		flush_bios(snapshot_bios);
+	}
 
 	retry_origin_bios(s, origin_bios);
 }
@@ -1472,6 +1490,32 @@
 	dm_kcopyd_copy(s->kcopyd_client, &src, 1, &dest, 0, copy_callback, pe);
 }
 
+static void full_bio_end_io(struct bio *bio, int error)
+{
+	void *callback_data = bio->bi_private;
+
+	dm_kcopyd_do_callback(callback_data, 0, error ? 1 : 0);
+}
+
+static void start_full_bio(struct dm_snap_pending_exception *pe,
+			   struct bio *bio)
+{
+	struct dm_snapshot *s = pe->snap;
+	void *callback_data;
+
+	pe->full_bio = bio;
+	pe->full_bio_end_io = bio->bi_end_io;
+	pe->full_bio_private = bio->bi_private;
+
+	callback_data = dm_kcopyd_prepare_callback(s->kcopyd_client,
+						   copy_callback, pe);
+
+	bio->bi_end_io = full_bio_end_io;
+	bio->bi_private = callback_data;
+
+	generic_make_request(bio);
+}
+
 static struct dm_snap_pending_exception *
 __lookup_pending_exception(struct dm_snapshot *s, chunk_t chunk)
 {
@@ -1507,6 +1551,7 @@
 	bio_list_init(&pe->origin_bios);
 	bio_list_init(&pe->snapshot_bios);
 	pe->started = 0;
+	pe->full_bio = NULL;
 
 	if (s->store->type->prepare_exception(s->store, &pe->e)) {
 		free_pending_exception(pe);
@@ -1600,10 +1645,19 @@
 		}
 
 		remap_exception(s, &pe->e, bio, chunk);
-		bio_list_add(&pe->snapshot_bios, bio);
 
 		r = DM_MAPIO_SUBMITTED;
 
+		if (!pe->started &&
+		    bio->bi_size == (s->store->chunk_size << SECTOR_SHIFT)) {
+			pe->started = 1;
+			up_write(&s->lock);
+			start_full_bio(pe, bio);
+			goto out;
+		}
+
+		bio_list_add(&pe->snapshot_bios, bio);
+
 		if (!pe->started) {
 			/* this is protected by snap->lock */
 			pe->started = 1;