Add logging for queue submit and complet counts

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/HOWTO b/HOWTO
index e4fccb5..15ee367 100644
--- a/HOWTO
+++ b/HOWTO
@@ -824,6 +824,8 @@
     bw (KiB/s) : min=    0, max= 1196, per=51.00%, avg=664.02, stdev=681.68
   cpu        : usr=1.49%, sys=0.25%, ctx=7969, majf=0, minf=17
   IO depths    : 1=0.1%, 2=0.3%, 4=0.5%, 8=99.0%, 16=0.0%, 32=0.0%, >32=0.0%
+     submit    : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
+     complete  : 0=0.0%, 4=100.0%, 8=0.0%, 16=0.0%, 32=0.0%, 64=0.0%, >=64=0.0%
      issued r/w: total=0/32768, short=0/0
      lat (msec): 2=1.6%, 4=0.0%, 10=3.2%, 20=12.8%, 50=38.4%, 100=24.8%,
      lat (msec): 250=15.2%, 500=0.0%, 750=0.0%, 1000=0.0%, >=2048=0.0%
@@ -861,6 +863,11 @@
 		16= entries includes depths up to that value but higher
 		than the previous entry. In other words, it covers the
 		range from 16 to 31.
+IO submit=	How many pieces of IO were submitting in a single submit
+		call. Each entry denotes that amount and below, until
+		the previous entry - eg, 8=100% mean that we submitted
+		anywhere in between 5-8 ios per submit call.
+IO complete=	Like the above submit number, but for completions instead.
 IO issued=	The number of read/write requests issued, and how many
 		of them were short.
 IO latencies=	The distribution of IO completion latencies. This is the
diff --git a/engines/guasi.c b/engines/guasi.c
index e0bcaf4..b65061e 100644
--- a/engines/guasi.c
+++ b/engines/guasi.c
@@ -141,6 +141,7 @@
 	struct io_u *io_u;
 	struct timeval now;
 
+	io_u_mark_submit(td, nr);
 	fio_gettime(&now, NULL);
 	for (i = 0; i < nr; i++) {
 		io_u = io_us[i];
diff --git a/engines/libaio.c b/engines/libaio.c
index 7ef3ff8..7e69c23 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -144,13 +144,16 @@
 		ret = io_submit(ld->aio_ctx, ld->iocbs_nr, iocbs);
 		if (ret > 0) {
 			fio_libaio_queued(td, io_us, ret);
+			io_u_mark_submit(td, ret);
 			ld->iocbs_nr -= ret;
 			io_us += ret;
 			iocbs += ret;
 			ret = 0;
-		} else if (!ret || ret == -EAGAIN || ret == -EINTR)
+		} else if (!ret || ret == -EAGAIN || ret == -EINTR) {
+			if (!ret)
+				io_u_mark_submit(td, ret);
 			continue;
-		else
+		} else
 			break;
 	} while (ld->iocbs_nr);
 
diff --git a/engines/null.c b/engines/null.c
index 85d6847..eb80501 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -46,6 +46,7 @@
 	struct null_data *nd = td->io_ops->data;
 
 	if (!nd->events) {
+		io_u_mark_submit(td, nd->queued);
 		nd->events = nd->queued;
 		nd->queued = 0;
 	}
diff --git a/engines/sync.c b/engines/sync.c
index 46cb864..e966846 100644
--- a/engines/sync.c
+++ b/engines/sync.c
@@ -230,6 +230,7 @@
 	if (!sd->queued)
 		return 0;
 
+	io_u_mark_submit(td, sd->queued);
 	f = sd->last_file;
 
 	if (lseek(f->fd, sd->io_us[0]->offset, SEEK_SET) == -1) {
diff --git a/fio.c b/fio.c
index c79fad8..90bad97 100644
--- a/fio.c
+++ b/fio.c
@@ -412,10 +412,10 @@
 		 */
 		min_events = 0;
 		if (queue_full(td) || ret == FIO_Q_BUSY) {
-			min_events = 1;
-
-			if (td->cur_depth > td->o.iodepth_low)
+			if (td->cur_depth >= td->o.iodepth_low)
 				min_events = td->cur_depth - td->o.iodepth_low;
+			if (!min_events)
+				min_events = 1;
 		}
 
 		/*
@@ -545,16 +545,12 @@
 		/*
 		 * See if we need to complete some commands
 		 */
-		if (ret == FIO_Q_QUEUED || ret == FIO_Q_BUSY) {
+		if (queue_full(td) || ret == FIO_Q_BUSY) {
 			min_evts = 0;
-			if (queue_full(td) || ret == FIO_Q_BUSY) {
+			if (td->cur_depth >= td->o.iodepth_low)
+				min_evts = td->cur_depth - td->o.iodepth_low;
+			if (!min_evts)
 				min_evts = 1;
-
-				if (td->cur_depth > td->o.iodepth_low)
-					min_evts = td->cur_depth
-							- td->o.iodepth_low;
-			}
-
 			fio_gettime(&comp_time, NULL);
 			bytes_done = io_u_queued_complete(td, min_evts);
 			if (bytes_done < 0)
diff --git a/fio.h b/fio.h
index 25d05bd..472fe93 100644
--- a/fio.h
+++ b/fio.h
@@ -378,10 +378,14 @@
 	 * IO depth and latency stats
 	 */
 	unsigned int io_u_map[FIO_IO_U_MAP_NR];
+	unsigned int io_u_submit[FIO_IO_U_MAP_NR];
+	unsigned int io_u_complete[FIO_IO_U_MAP_NR];
 	unsigned int io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	unsigned int io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	unsigned long total_io_u[2];
 	unsigned long short_io_u[2];
+	unsigned long total_submit;
+	unsigned long total_complete;
 
 	unsigned long long io_bytes[2];
 	unsigned long runtime[2];
@@ -909,6 +913,8 @@
 extern void io_u_set_timeout(struct thread_data *);
 extern void io_u_mark_depth(struct thread_data *, unsigned int);
 extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int);
+void io_u_mark_complete(struct thread_data *, unsigned int);
+void io_u_mark_submit(struct thread_data *, unsigned int);
 
 /*
  * io engine entry points
diff --git a/io_u.c b/io_u.c
index 06fb3ce..ce08c59 100644
--- a/io_u.c
+++ b/io_u.c
@@ -422,6 +422,47 @@
 	return 0;
 }
 
+static void __io_u_mark_map(unsigned int *map, unsigned int nr)
+{
+	int index = 0;
+
+	switch (nr) {
+	default:
+		index = 6;
+		break;
+	case 33 ... 64:
+		index = 5;
+		break;
+	case 17 ... 32:
+		index = 4;
+		break;
+	case 9 ... 16:
+		index = 3;
+		break;
+	case 5 ... 8:
+		index = 2;
+		break;
+	case 1 ... 4:
+		index = 1;
+	case 0:
+		break;
+	}
+
+	map[index]++;
+}
+
+void io_u_mark_submit(struct thread_data *td, unsigned int nr)
+{
+	__io_u_mark_map(td->ts.io_u_submit, nr);
+	td->ts.total_submit++;
+}
+
+void io_u_mark_complete(struct thread_data *td, unsigned int nr)
+{
+	__io_u_mark_map(td->ts.io_u_complete, nr);
+	td->ts.total_complete++;
+}
+
 void io_u_mark_depth(struct thread_data *td, unsigned int nr)
 {
 	int index = 0;
diff --git a/ioengines.c b/ioengines.c
index 9e1c556..fb68763 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -198,6 +198,8 @@
 	if (td->io_ops->getevents)
 		r = td->io_ops->getevents(td, min, max, t);
 out:
+	if (r >= 0)
+		io_u_mark_complete(td, r);
 	dprint(FD_IO, "getevents: %d\n", r);
 	return r;
 }
@@ -237,6 +239,11 @@
 
 	unlock_file(td, io_u->file);
 
+	if (!td->io_ops->commit) {
+		io_u_mark_submit(td, 1);
+		io_u_mark_complete(td, 1);
+	}
+
 	if (ret == FIO_Q_COMPLETED) {
 		if (io_u->ddir != DDIR_SYNC) {
 			io_u_mark_depth(td, 1);
diff --git a/stat.c b/stat.c
index e3e084e..fe6acb8 100644
--- a/stat.c
+++ b/stat.c
@@ -124,7 +124,8 @@
 #define ts_total_io_u(ts)	\
 	((ts)->total_io_u[0] + (ts)->total_io_u[1])
 
-static void stat_calc_dist(struct thread_stat *ts, double *io_u_dist)
+static void stat_calc_dist(unsigned int *map, unsigned long total,
+			   double *io_u_dist)
 {
 	int i;
 
@@ -132,27 +133,33 @@
 	 * Do depth distribution calculations
 	 */
 	for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
-		io_u_dist[i] = (double) ts->io_u_map[i]
-					/ (double) ts_total_io_u(ts);
-		io_u_dist[i] *= 100.0;
-		if (io_u_dist[i] < 0.1 && ts->io_u_map[i])
-			io_u_dist[i] = 0.1;
+		if (total) {
+			io_u_dist[i] = (double) map[i] / (double) total;
+			io_u_dist[i] *= 100.0;
+			if (io_u_dist[i] < 0.1 && map[i])
+				io_u_dist[i] = 0.1;
+		} else
+			io_u_dist[i] = 0.0;
 	}
 }
 
 static void stat_calc_lat(struct thread_stat *ts, double *dst,
 			  unsigned int *src, int nr)
 {
+	unsigned long total = ts_total_io_u(ts);
 	int i;
 
 	/*
 	 * Do latency distribution calculations
 	 */
 	for (i = 0; i < nr; i++) {
-		dst[i] = (double) src[i] / (double) ts_total_io_u(ts);
-		dst[i] *= 100.0;
-		if (dst[i] < 0.01 && src[i])
-			dst[i] = 0.01;
+		if (total) {
+			dst[i] = (double) src[i] / (double) total;
+			dst[i] *= 100.0;
+			if (dst[i] < 0.01 && src[i])
+				dst[i] = 0.01;
+		} else
+			dst[i] = 0.0;
 	}
 }
 
@@ -342,19 +349,30 @@
 	log_info("  cpu          : usr=%3.2f%%, sys=%3.2f%%, ctx=%lu, majf=%lu,"
 		 " minf=%lu\n", usr_cpu, sys_cpu, ts->ctx, ts->majf, ts->minf);
 
-	stat_calc_dist(ts, io_u_dist);
-	stat_calc_lat_u(ts, io_u_lat_u);
-	stat_calc_lat_m(ts, io_u_lat_m);
-
+	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
 	log_info("  IO depths    : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%,"
 		 " 16=%3.1f%%, 32=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
 					io_u_dist[1], io_u_dist[2],
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
+
+	stat_calc_dist(ts->io_u_submit, ts->total_submit, io_u_dist);
+	log_info("     submit    : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
+		 " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
+					io_u_dist[1], io_u_dist[2],
+					io_u_dist[3], io_u_dist[4],
+					io_u_dist[5], io_u_dist[6]);
+	stat_calc_dist(ts->io_u_complete, ts->total_complete, io_u_dist);
+	log_info("     complete  : 0=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%, 16=%3.1f%%,"
+		 " 32=%3.1f%%, 64=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
+					io_u_dist[1], io_u_dist[2],
+					io_u_dist[3], io_u_dist[4],
+					io_u_dist[5], io_u_dist[6]);
 	log_info("     issued r/w: total=%lu/%lu, short=%lu/%lu\n",
 					ts->total_io_u[0], ts->total_io_u[1],
 					ts->short_io_u[0], ts->short_io_u[1]);
-
+	stat_calc_lat_u(ts, io_u_lat_u);
+	stat_calc_lat_m(ts, io_u_lat_m);
 	show_latencies(io_u_lat_u, io_u_lat_m);
 }
 
@@ -419,7 +437,7 @@
 	log_info(";%f%%;%f%%;%lu;%lu;%lu", usr_cpu, sys_cpu, ts->ctx, ts->majf,
 								ts->minf);
 
-	stat_calc_dist(ts, io_u_dist);
+	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
@@ -569,6 +587,10 @@
 
 		for (k = 0; k < FIO_IO_U_MAP_NR; k++)
 			ts->io_u_map[k] += td->ts.io_u_map[k];
+		for (k = 0; k < FIO_IO_U_MAP_NR; k++)
+			ts->io_u_submit[k] += td->ts.io_u_submit[k];
+		for (k = 0; k < FIO_IO_U_MAP_NR; k++)
+			ts->io_u_complete[k] += td->ts.io_u_complete[k];
 		for (k = 0; k < FIO_IO_U_LAT_U_NR; k++)
 			ts->io_u_lat_u[k] += td->ts.io_u_lat_u[k];
 		for (k = 0; k < FIO_IO_U_LAT_M_NR; k++)
@@ -581,6 +603,8 @@
 		}
 
 		ts->total_run_time += td->ts.total_run_time;
+		ts->total_submit += td->ts.total_submit;
+		ts->total_complete += td->ts.total_complete;
 	}
 
 	for (i = 0; i < nr_ts; i++) {