Add iodepth_batch_complete control

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/HOWTO b/HOWTO
index 12974f3..50ca467 100644
--- a/HOWTO
+++ b/HOWTO
@@ -433,11 +433,21 @@
 		job, can be overridden with a larger value for higher
 		concurrency.
 
+iodepth_batch_submit=int
 iodepth_batch=int This defines how many pieces of IO to submit at once.
 		It defaults to 1 which means that we submit each IO
 		as soon as it is available, but can be raised to submit
 		bigger batches of IO at the time.
 
+iodepth_batch_complete=int This defines how many pieces of IO to retrieve
+		at once. It defaults to 1, which means that we'll ask
+		for a minimum of 1 IO in the retrieval process from
+		the kernel. The IO retrieval will go on until the
+		queue depth drops to the limit set by iodepth_low. If
+		this variable is set to 0, then fio will always check
+		for completed events before queuing more IO. This helps
+		reduce IO latency, at the cost of more retrieval system calls.
+
 iodepth_low=int	The low water mark indicating when to start filling
 		the queue again. Defaults to the same as iodepth, meaning
 		that fio will attempt to keep the queue full at all times.
diff --git a/fio.c b/fio.c
index f0566ed..cedbfb0 100644
--- a/fio.c
+++ b/fio.c
@@ -355,7 +355,7 @@
 
 	io_u = NULL;
 	while (!td->terminate) {
-		int ret2;
+		int ret2, full;
 
 		io_u = __get_io_u(td);
 		if (!io_u)
@@ -435,19 +435,25 @@
 		 * if we can queue more, do so. but check if there are
 		 * completed io_u's first.
 		 */
-		min_events = 0;
-		if (queue_full(td) || ret == FIO_Q_BUSY) {
-			if (td->cur_depth >= td->o.iodepth_low)
-				min_events = td->cur_depth - td->o.iodepth_low;
-			if (!min_events)
+		full = queue_full(td) || ret == FIO_Q_BUSY;
+		if (full || !td->o.iodepth_batch_complete) {
+			min_events = td->o.iodepth_batch_complete;
+			if (full && !min_events)
 				min_events = 1;
-		}
 
-		/*
-		 * Reap required number of io units, if any, and do the
-		 * verification on them through the callback handler
-		 */
-		if (io_u_queued_complete(td, min_events) < 0)
+			do {
+				/*
+				 * Reap required number of io units, if any,
+				 * and do the verification on them through
+				 * the callback handler
+				 */
+				if (io_u_queued_complete(td, min_events) < 0) {
+					ret = -1;
+					break;
+				}
+			} while (full && (td->cur_depth > td->o.iodepth_low));
+		}
+		if (ret < 0)
 			break;
 	}
 
@@ -480,7 +486,7 @@
 		long bytes_done = 0;
 		int min_evts = 0;
 		struct io_u *io_u;
-		int ret2;
+		int ret2, full;
 
 		if (td->terminate)
 			break;
@@ -570,18 +576,25 @@
 		/*
 		 * See if we need to complete some commands
 		 */
-		if (queue_full(td) || ret == FIO_Q_BUSY) {
-			min_evts = 0;
-			if (td->cur_depth >= td->o.iodepth_low)
-				min_evts = td->cur_depth - td->o.iodepth_low;
-			if (!min_evts)
+		full = queue_full(td) || ret == FIO_Q_BUSY;
+		if (full || !td->o.iodepth_batch_complete) {
+			min_evts = td->o.iodepth_batch_complete;
+			if (full && !min_evts)
 				min_evts = 1;
+
 			fio_gettime(&comp_time, NULL);
-			bytes_done = io_u_queued_complete(td, min_evts);
-			if (bytes_done < 0)
-				break;
+
+			do {
+				ret = io_u_queued_complete(td, min_evts);
+				if (ret <= 0)
+					break;
+
+				bytes_done += ret;
+			} while (full && (td->cur_depth > td->o.iodepth_low));
 		}
 
+		if (ret < 0)
+			break;
 		if (!bytes_done)
 			continue;
 
diff --git a/fio.h b/fio.h
index 9d80237..207b2ec 100644
--- a/fio.h
+++ b/fio.h
@@ -418,6 +418,7 @@
 	unsigned int iodepth;
 	unsigned int iodepth_low;
 	unsigned int iodepth_batch;
+	unsigned int iodepth_batch_complete;
 
 	unsigned long long size;
 	unsigned int fill_device;
diff --git a/io_u.c b/io_u.c
index 3f71367..92b7076 100644
--- a/io_u.c
+++ b/io_u.c
@@ -971,19 +971,19 @@
 /*
  * Called to complete min_events number of io for the async engines.
  */
-long io_u_queued_complete(struct thread_data *td, int min_events)
+long io_u_queued_complete(struct thread_data *td, int min_evts)
 {
 	struct io_completion_data icd;
 	struct timespec *tvp = NULL;
 	int ret;
 	struct timespec ts = { .tv_sec = 0, .tv_nsec = 0, };
 
-	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_events);
+	dprint(FD_IO, "io_u_queued_completed: min=%d\n", min_evts);
 
-	if (!min_events)
+	if (!min_evts)
 		tvp = &ts;
 
-	ret = td_io_getevents(td, min_events, td->cur_depth, tvp);
+	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
 	if (ret < 0) {
 		td_verror(td, -ret, "td_io_getevents");
 		return ret;
diff --git a/ioengines.c b/ioengines.c
index 8975591..e447539 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -195,9 +195,13 @@
 		if (r < 0)
 			goto out;
 	}
+	if (max > td->cur_depth)
+		max = td->cur_depth;
+	if (min > max)
+		max = min;
 
 	r = 0;
-	if (td->io_ops->getevents)
+	if (max && td->io_ops->getevents)
 		r = td->io_ops->getevents(td, min, max, t);
 out:
 	if (r >= 0)
diff --git a/options.c b/options.c
index b398695..18787f8 100644
--- a/options.c
+++ b/options.c
@@ -612,6 +612,7 @@
 	},
 	{
 		.name	= "iodepth_batch",
+		.alias	= "iodepth_batch_submit",
 		.type	= FIO_OPT_INT,
 		.off1	= td_var_offset(iodepth_batch),
 		.help	= "Number of IO to submit in one go",
@@ -620,6 +621,15 @@
 		.def	= "1",
 	},
 	{
+		.name	= "iodepth_batch_complete",
+		.type	= FIO_OPT_INT,
+		.off1	= td_var_offset(iodepth_batch_complete),
+		.help	= "Number of IO to retrieve in one go",
+		.parent	= "iodepth",
+		.minval	= 0,
+		.def	= "1",
+	},
+	{
 		.name	= "iodepth_low",
 		.type	= FIO_OPT_INT,
 		.off1	= td_var_offset(iodepth_low),