Improve latency_target runs

Reset the stats when we have found our target, and then do
another latency_window run with those settings so that the
final results reflect the probed values.

Signed-off-by: Jens Axboe <axboe@fb.com>
diff --git a/fio.h b/fio.h
index d1180cd..9159b0c 100644
--- a/fio.h
+++ b/fio.h
@@ -262,6 +262,7 @@
 	unsigned int latency_qd_low;
 	unsigned int latency_failed;
 	uint64_t latency_ios;
+	int latency_end_run;
 
 	/*
 	 * read/write mixed workload state
@@ -504,6 +505,7 @@
  */
 extern void lat_target_check(struct thread_data *);
 extern void lat_target_init(struct thread_data *);
+extern void lat_target_reset(struct thread_data *);
 
 #define for_each_td(td, i)	\
 	for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
diff --git a/io_u.c b/io_u.c
index b84b3e2..619fa25 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1162,6 +1162,10 @@
 		return 1;
 
 	td->latency_qd_high = td->latency_qd;
+
+	if (td->latency_qd == td->latency_qd_low)
+		td->latency_qd_low--;
+
 	td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2;
 
 	dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
@@ -1186,6 +1190,8 @@
 
 void lat_target_init(struct thread_data *td)
 {
+	td->latency_end_run = 0;
+
 	if (td->o.latency_target) {
 		dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target);
 		fio_gettime(&td->latency_ts, NULL);
@@ -1197,9 +1203,16 @@
 		td->latency_qd = td->o.iodepth;
 }
 
+void lat_target_reset(struct thread_data *td)
+{
+	if (!td->latency_end_run)	/* final run: keep probed depth */
+		lat_target_init(td);
+}
+
 static void lat_target_success(struct thread_data *td)
 {
 	const unsigned int qd = td->latency_qd;
+	struct thread_options *o = &td->o;
 
 	td->latency_qd_low = td->latency_qd;
 
@@ -1208,20 +1221,32 @@
 	 * of bisecting from highest possible queue depth. If we have set
 	 * a limit other than td->o.iodepth, bisect between that.
 	 */
-	if (td->latency_qd_high != td->o.iodepth)
+	if (td->latency_qd_high != o->iodepth)
 		td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2;
 	else
 		td->latency_qd *= 2;
 
-	if (td->latency_qd > td->o.iodepth)
-		td->latency_qd = td->o.iodepth;
+	if (td->latency_qd > o->iodepth)
+		td->latency_qd = o->iodepth;
 
 	dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
+
 	/*
-	 * Same as last one, we are done
+	 * Same as last one, we are done. Let it run a latency cycle, so
+	 * we get only the results from the targeted depth.
 	 */
-	if (td->latency_qd == qd)
-		td->done = 1;
+	if (td->latency_qd == qd) {
+		if (td->latency_end_run) {
+			dprint(FD_RATE, "We are done\n");
+			td->done = 1;
+		} else {
+			dprint(FD_RATE, "Quiesce and final run\n");
+			io_u_quiesce(td);
+			td->latency_end_run = 1;
+			reset_all_stats(td);
+			reset_io_stats(td);
+		}
+	}
 
 	lat_new_cycle(td);
 }
diff --git a/libfio.c b/libfio.c
index 222cd16..f4aac2e 100644
--- a/libfio.c
+++ b/libfio.c
@@ -135,7 +135,7 @@
 	memcpy(&td->epoch, &tv, sizeof(tv));
 	memcpy(&td->start, &tv, sizeof(tv));
 
-	lat_target_init(td);
+	lat_target_reset(td);
 }
 
 void reset_fio_state(void)
diff --git a/stat.c b/stat.c
index bc01b51..e43db8f 100644
--- a/stat.c
+++ b/stat.c
@@ -1579,6 +1579,54 @@
 	ios->mean.u.f = ios->S.u.f = 0;
 }
 
+/*
+ * Zero the statistics accumulated in td->ts so a new measurement starts
+ * from scratch: the clat/slat/lat/bw/iops io_stat entries, io_bytes,
+ * runtime and io_u_plat for each of the DDIR_RWDIR_CNT indices, the
+ * io_u_* maps, total_submit/total_complete and the io_u counters.
+ */
+void reset_io_stats(struct thread_data *td)
+{
+	struct thread_stat *ts = &td->ts;
+	int i, j;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		reset_io_stat(&ts->clat_stat[i]);
+		reset_io_stat(&ts->slat_stat[i]);
+		reset_io_stat(&ts->lat_stat[i]);
+		reset_io_stat(&ts->bw_stat[i]);
+		reset_io_stat(&ts->iops_stat[i]);
+
+		ts->io_bytes[i] = 0;
+		ts->runtime[i] = 0;
+
+		for (j = 0; j < FIO_IO_U_PLAT_NR; j++)
+			ts->io_u_plat[i][j] = 0;
+	}
+
+	/*
+	 * NOTE(review): io_u_lat_u[] and io_u_lat_m[] are cleared only up
+	 * to FIO_IO_U_MAP_NR here; confirm that matches their real sizes.
+	 */
+	for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
+		ts->io_u_map[i] = 0;
+		ts->io_u_submit[i] = 0;
+		ts->io_u_complete[i] = 0;
+		ts->io_u_lat_u[i] = 0;
+		ts->io_u_lat_m[i] = 0;
+	}
+
+	/* Scalar totals: reset once, not on every map iteration */
+	ts->total_submit = 0;
+	ts->total_complete = 0;
+
+	/* NOTE(review): 3 presumably equals DDIR_RWDIR_CNT - confirm */
+	for (i = 0; i < 3; i++) {
+		ts->total_io_u[i] = 0;
+		ts->short_io_u[i] = 0;
+	}
+}
+
 static void _add_stat_to_log(struct io_log *iolog, unsigned long elapsed)
 {
 	/*
diff --git a/stat.h b/stat.h
index 7ad0c9d..bc4f6da 100644
--- a/stat.h
+++ b/stat.h
@@ -224,6 +224,7 @@
 extern void stat_calc_lat_m(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist);
+extern void reset_io_stats(struct thread_data *);
 
 static inline int usec_to_msec(unsigned long *min, unsigned long *max,
 			       double *mean, double *dev)