Improve accuracy of rate= option

I noticed that the rate= option is not terribly precise in some cases.
 It gets worse the higher the selected rate.  For example:
$ fio -name=load -size=100g -ioengine=null -runtime=10 -rate=30m
One would expect that to read 300MB (307200KB) at close to 30MB/s
(30720KB/s).  However it writes 315024KB at 31499KB/s.  Further
experimentation shows that even higher rates can show bigger
discrepancies.  At the extreme end...
$ fio -name=load -size=100g -ioengine=null -runtime=10 -rate=500m
One would expect this to write 5000MB at a rate of 500MB/s
(512000KB/s).  However it writes close to double that (9536.8MB) at a
rate of over 953MB/s.  At a rate of 1GB/s and higher, the rate
limiting is effectively ignored.

This patch improves the accuracy of the rate= option across the whole
range of rates, at the cost of being very slightly more
computationally expensive.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/io_u.c b/io_u.c
index fc3ee49..0ff66f9 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1296,6 +1296,16 @@
 	add_iops_sample(td, idx, &icd->time);
 }
 
+static long long usec_for_io(struct thread_data *td, enum fio_ddir ddir)
+{
+	unsigned long long secs, remainder, bps, bytes;
+	bytes = td->this_io_bytes[ddir];
+	bps = td->rate_bps[ddir];
+	secs = bytes / bps;
+	remainder = bytes % bps;
+	return remainder * 1000000 / bps + secs * 1000000;
+}
+
 static void io_completed(struct thread_data *td, struct io_u *io_u,
 			 struct io_completion_data *icd)
 {
@@ -1354,14 +1364,12 @@
 
 			if (__should_check_rate(td, idx)) {
 				td->rate_pending_usleep[idx] =
-					((td->this_io_bytes[idx] *
-					  td->rate_nsec_cycle[idx]) / 1000 -
+					(usec_for_io(td, idx) -
 					 utime_since_now(&td->start));
 			}
-			if (__should_check_rate(td, idx ^ 1))
+			if (__should_check_rate(td, odx))
 				td->rate_pending_usleep[odx] =
-					((td->this_io_bytes[odx] *
-					  td->rate_nsec_cycle[odx]) / 1000 -
+					(usec_for_io(td, odx) -
 					 utime_since_now(&td->start));
 		}