Improve precision of the io_limit setting

For async engines, the io_limit check only counts completed bytes. With
a high queue depth there can be a lot of I/O in flight when the check
runs, so we end up issuing more than the limit allows. Track the bytes
issued per data direction as well, and base the limit check on those
instead.
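
To make the overshoot concrete, here is a small standalone sketch (not
fio code; the io_limit, bs and iodepth values are made up for
illustration). It simulates submitting at a fixed queue depth while
gating new submissions only on completed bytes, the way the old check
behaved:

	#include <stdio.h>

	int main(void)
	{
		const unsigned long long io_limit = 1024 * 1024;	/* 1 MiB limit */
		const unsigned long long bs = 64 * 1024;		/* 64 KiB blocks */
		const int iodepth = 16;					/* allowed in-flight I/Os */
		unsigned long long issued = 0, completed = 0;
		int inflight = 0;

		/* Gate new submissions on completed bytes only. */
		while (completed < io_limit) {
			/* Top up the queue to full depth. */
			while (inflight < iodepth && completed < io_limit) {
				issued += bs;
				inflight++;
			}
			/* Reap a single completion per loop iteration. */
			completed += bs;
			inflight--;
		}

		printf("limit=%llu issued=%llu overshoot=%llu\n",
		       io_limit, issued, issued - io_limit);
		return 0;
	}

This prints an overshoot of nearly iodepth * bs bytes issued past the
limit. Gating on issued bytes instead, as the change below does, stops
submission once a full limit's worth of I/O has been queued.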

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/backend.c b/backend.c
index efabfa7..9012140 100644
--- a/backend.c
+++ b/backend.c
@@ -667,13 +667,13 @@
 	unsigned long long bytes, limit;
 
 	if (td_rw(td))
-		bytes = td->this_io_bytes[DDIR_READ] + td->this_io_bytes[DDIR_WRITE];
+		bytes = td->io_issue_bytes[DDIR_READ] + td->io_issue_bytes[DDIR_WRITE];
 	else if (td_write(td))
-		bytes = td->this_io_bytes[DDIR_WRITE];
+		bytes = td->io_issue_bytes[DDIR_WRITE];
 	else if (td_read(td))
-		bytes = td->this_io_bytes[DDIR_READ];
+		bytes = td->io_issue_bytes[DDIR_READ];
 	else
-		bytes = td->this_io_bytes[DDIR_TRIM];
+		bytes = td->io_issue_bytes[DDIR_TRIM];
 
 	if (td->o.io_limit)
 		limit = td->o.io_limit;
diff --git a/fio.h b/fio.h
index be2f23a..d28f8ce 100644
--- a/fio.h
+++ b/fio.h
@@ -235,7 +235,15 @@
 	uint64_t total_io_size;
 	uint64_t fill_device_size;
 
+	/*
+	 * Issue side
+	 */
 	uint64_t io_issues[DDIR_RWDIR_CNT];
+	uint64_t io_issue_bytes[DDIR_RWDIR_CNT];
+
+	/*
+	 * Completions
+	 */
 	uint64_t io_blocks[DDIR_RWDIR_CNT];
 	uint64_t this_io_blocks[DDIR_RWDIR_CNT];
 	uint64_t io_bytes[DDIR_RWDIR_CNT];
diff --git a/ioengines.c b/ioengines.c
index 6370a56..88f67d5 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -294,8 +294,10 @@
 					sizeof(struct timeval));
 	}
 
-	if (ddir_rw(acct_ddir(io_u)))
+	if (ddir_rw(acct_ddir(io_u))) {
 		td->io_issues[acct_ddir(io_u)]++;
+		td->io_issue_bytes[acct_ddir(io_u)] += io_u->xfer_buflen;
+	}
 
 	ret = td->io_ops->queue(td, io_u);