Try a little harder to honor number_ios more accurately

Previously, fio checked this on completion, which meant we could
overshoot number_ios by almost the iodepth setting. Check this before
issuing instead, taking the in-flight IOs into account as well.

Reported-by: Robert Elliott <Elliott@hp.com>
Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/backend.c b/backend.c
index 992033c..bab2026 100644
--- a/backend.c
+++ b/backend.c
@@ -625,6 +625,7 @@
 
 static int io_bytes_exceeded(struct thread_data *td)
 {
+	unsigned long long number_ios = 0;
 	unsigned long long bytes;
 
 	if (td_rw(td))
@@ -636,7 +637,13 @@
 	else
 		bytes = td->this_io_bytes[DDIR_TRIM];
 
-	return bytes >= td->o.size;
+	if (td->o.number_ios) {
+		number_ios = ddir_rw_sum(td->this_io_blocks);
+		number_ios += td->io_u_queued + td->io_u_in_flight;
+	}
+
+	return bytes >= td->o.size ||
+		(number_ios && number_ios >= td->o.number_ios);
 }
 
 /*
@@ -1128,6 +1135,14 @@
 		return 1;
 	}
 
+	if (td->o.number_ios) {
+		unsigned long long number_ios = ddir_rw_sum(td->this_io_blocks);
+
+		number_ios += td->io_u_queued + td->io_u_in_flight;
+		if (number_ios >= td->o.number_ios)
+			return 0;
+	}
+
 	if (td->o.size != -1ULL && ddir_rw_sum(td->io_bytes) < td->o.size) {
 		uint64_t diff;
 
diff --git a/io_u.c b/io_u.c
index 8e27708..0b86d9f 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1595,9 +1595,6 @@
 
 	if (!gtod_reduce(td))
 		add_iops_sample(td, idx, bytes, &icd->time);
-
-	if (td->o.number_ios && !--td->o.number_ios)
-		td->done = 1;
 }
 
 static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)