Improve rate limiting

If we have pending IO, we should commit it before going to sleep,
not just wait for IO that has already been issued.

Also improve the delay functions to return the time spent, so
that users don't have to track that separately if they care about
the precision.

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/fio_time.h b/fio_time.h
index 27520b0..79f324a 100644
--- a/fio_time.h
+++ b/fio_time.h
@@ -10,8 +10,8 @@
 extern uint64_t time_since_genesis(void);
 extern uint64_t mtime_since_genesis(void);
 extern uint64_t utime_since_genesis(void);
-extern void usec_spin(unsigned int);
-extern void usec_sleep(struct thread_data *, unsigned long);
+extern uint64_t usec_spin(unsigned int);
+extern uint64_t usec_sleep(struct thread_data *, unsigned long);
 extern void fill_start_time(struct timeval *);
 extern void set_genesis_time(void);
 extern int ramp_time_over(struct thread_data *);
diff --git a/io_u.c b/io_u.c
index 23a9e4a..5971d78 100644
--- a/io_u.c
+++ b/io_u.c
@@ -529,6 +529,12 @@
 	 * io's that have been actually submitted to an async engine,
 	 * and cur_depth is meaningless for sync engines.
 	 */
+	if (td->io_u_queued || td->cur_depth) {
+		int fio_unused ret;
+
+		ret = td_io_commit(td);
+	}
+
 	while (td->io_u_in_flight) {
 		int fio_unused ret;
 
@@ -539,7 +545,6 @@
 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 {
 	enum fio_ddir odir = ddir ^ 1;
-	struct timeval t;
 	long usec;
 
 	assert(ddir_rw(ddir));
@@ -574,9 +579,7 @@
 
 	io_u_quiesce(td);
 
-	fio_gettime(&t, NULL);
-	usec_sleep(td, usec);
-	usec = utime_since_now(&t);
+	usec = usec_sleep(td, usec);
 
 	td->rate_pending_usleep[ddir] -= usec;
 
diff --git a/time.c b/time.c
index b145dc5..f1833c7 100644
--- a/time.c
+++ b/time.c
@@ -9,25 +9,29 @@
 /*
  * busy looping version for the last few usec
  */
-void usec_spin(unsigned int usec)
+uint64_t usec_spin(unsigned int usec)
 {
 	struct timeval start;
+	uint64_t t;
 
 	fio_gettime(&start, NULL);
-	while (utime_since_now(&start) < usec)
+	while ((t = utime_since_now(&start)) < usec)
 		nop;
+
+	return t;
 }
 
-void usec_sleep(struct thread_data *td, unsigned long usec)
+uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
 {
 	struct timespec req;
 	struct timeval tv;
+	uint64_t t = 0;
 
 	do {
 		unsigned long ts = usec;
 
 		if (usec < ns_granularity) {
-			usec_spin(usec);
+			t += usec_spin(usec);
 			break;
 		}
 
@@ -46,11 +50,14 @@
 			break;
 
 		ts = utime_since_now(&tv);
+		t += ts;
 		if (ts >= usec)
 			break;
 
 		usec -= ts;
 	} while (!td->terminate);
+
+	return t;
 }
 
 uint64_t time_since_genesis(void)