Improve submission latency calculation

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/engines/libaio.c b/engines/libaio.c
index 510ecab..a659ba9 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -19,6 +19,7 @@
 	io_context_t aio_ctx;
 	struct io_event *aio_events;
 	struct iocb **iocbs;
+	struct io_u **io_us;
 	int iocbs_nr;
 };
 
@@ -90,27 +91,49 @@
 	}
 
 	ld->iocbs[ld->iocbs_nr] = &io_u->iocb;
+	ld->io_us[ld->iocbs_nr] = io_u;
 	ld->iocbs_nr++;
 	return FIO_Q_QUEUED;
 }
 
+/* Give a submitted batch one shared issue time; report each io_u as queued. */
+static void fio_libaio_queued(struct thread_data *td, struct io_u **io_us,
+			      unsigned int nr)
+{
+	struct io_u **last = io_us + nr;
+	struct timeval issued;
+
+	fio_gettime(&issued, NULL);
+	for (; io_us < last; io_us++) {
+		struct io_u *io_u = *io_us;
+
+		memcpy(&io_u->issue_time, &issued, sizeof(issued));
+		io_u_queued(td, io_u);
+	}
+}
+
 static int fio_libaio_commit(struct thread_data *td)
 {
 	struct libaio_data *ld = td->io_ops->data;
 	struct iocb **iocbs;
+	struct io_u **io_us;
 	int ret, iocbs_nr;
 
 	if (!ld->iocbs_nr)
 		return 0;
 
 	iocbs_nr = ld->iocbs_nr;
+	io_us = ld->io_us;
 	iocbs = ld->iocbs;
 	do {
 		ret = io_submit(ld->aio_ctx, iocbs_nr, iocbs);
 		if (ret == iocbs_nr) {
+			fio_libaio_queued(td, io_us, ret);
 			ret = 0;
 			break;
 		} else if (ret > 0) {
+			fio_libaio_queued(td, io_us, ret);
+			io_us += ret;
 			iocbs += ret;
 			iocbs_nr -= ret;
 			continue;
@@ -141,11 +164,9 @@
 
 	if (ld) {
 		io_destroy(ld->aio_ctx);
-		if (ld->aio_events)
-			free(ld->aio_events);
-		if (ld->iocbs)
-			free(ld->iocbs);
-
+		free(ld->aio_events);
+		free(ld->iocbs);
+		free(ld->io_us);
 		free(ld);
 		td->io_ops->data = NULL;
 	}
@@ -166,6 +187,8 @@
 	memset(ld->aio_events, 0, td->iodepth * sizeof(struct io_event));
 	ld->iocbs = malloc(td->iodepth * sizeof(struct iocb *));
 	memset(ld->iocbs, 0, sizeof(struct iocb *));
+	ld->io_us = malloc(td->iodepth * sizeof(struct io_u *));
+	memset(ld->io_us, 0, td->iodepth * sizeof(struct io_u *));
 	ld->iocbs_nr = 0;
 
 	td->io_ops->data = ld;
diff --git a/fio.c b/fio.c
index 6176b77..4596544 100644
--- a/fio.c
+++ b/fio.c
@@ -385,6 +385,13 @@
 				ret = bytes_done;
 			break;
 		case FIO_Q_QUEUED:
+			/*
+			 * if the engine doesn't have a commit hook,
+			 * the io_u is really queued. if it does have such
+			 * a hook, it has to call io_u_queued() itself.
+			 */
+			if (td->io_ops->commit == NULL)
+				io_u_queued(td, io_u);
 			break;
 		case FIO_Q_BUSY:
 			requeue_io_u(td, &io_u);
@@ -399,9 +406,6 @@
 		if (ret < 0 || td->error)
 			break;
 
-		if (io_u)
-			add_slat_sample(td, io_u->ddir, mtime_since(&io_u->start_time, &io_u->issue_time));
-
 		/*
 		 * See if we need to complete some commands
 		 */
diff --git a/fio.h b/fio.h
index 12afa06..259166f 100644
--- a/fio.h
+++ b/fio.h
@@ -614,6 +614,7 @@
 extern void requeue_io_u(struct thread_data *, struct io_u **);
 extern long io_u_sync_complete(struct thread_data *, struct io_u *, endio_handler *);
 extern long io_u_queued_complete(struct thread_data *, int, endio_handler *);
+extern void io_u_queued(struct thread_data *, struct io_u *);
 
 /*
  * io engine entry points
diff --git a/io_u.c b/io_u.c
index ab46cbe..6234c42 100644
--- a/io_u.c
+++ b/io_u.c
@@ -555,3 +555,14 @@
 
 	return -1;
 }
+
+/*
+ * Adds a submission latency sample for io_u: the mtime_since() span from
+ * its start_time to its issue_time. Call once the io_u is really queued.
+ */
+void io_u_queued(struct thread_data *td, struct io_u *io_u)
+{
+	unsigned long lat = mtime_since(&io_u->start_time, &io_u->issue_time);
+
+	add_slat_sample(td, io_u->ddir, lat);
+}
diff --git a/ioengines.c b/ioengines.c
index 16ea928..2a11ed3 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -187,12 +187,15 @@
 
 int td_io_queue(struct thread_data *td, struct io_u *io_u)
 {
-	fio_gettime(&io_u->issue_time, NULL);
+	int ret;
+
 
 	if (io_u->ddir != DDIR_SYNC)
 		td->io_issues[io_u->ddir]++;
 
-	return td->io_ops->queue(td, io_u);
+	ret = td->io_ops->queue(td, io_u);
+	fio_gettime(&io_u->issue_time, NULL); /* stamped after ->queue() */
+	return ret;
 }
 
 int td_io_init(struct thread_data *td)