Add options to have fio latency profile a device
This adds three new options:
- latency_target. This defines a specific latency target, in usec.
- latency_window. This defines the period over which fio samples.
- latency_percentile. This defines the percentage of IOs that must
meet the criteria specified by latency_target/latency_window.
With these options set, fio will run the described workload and
vary the queue depth between 1 and iodepth= to find the best
performing spot that meets the criteria specified by the three
options.
A sample job file is also added to demonstrate how to use this.
Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/HOWTO b/HOWTO
index 250bc58..044b572 100644
--- a/HOWTO
+++ b/HOWTO
@@ -881,6 +881,20 @@
the job to exit. The same format as rate is used for read vs
write separation.
+latency_target=int If set, fio will attempt to find the max performance
+ point that the given workload will run at while maintaining a
+ latency below this target. The values is given in microseconds.
+ See latency_window and latency_percentile
+
+latency_window=int Used with latency_target to specify the sample window
+ that the job is run at varying queue depths to test the
+ performance. The value is given in microseconds.
+
+latency_percentile=float The percentage of IOs that must fall within the
+ criteria specified by latency_target and latency_window. If not
+ set, this defaults to 100.0, meaning that all IOs must be equal
+ or below to the value set by latency_target.
+
max_latency=int If set, fio will exit the job if it exceeds this maximum
latency. It will exit with an ETIME error.
diff --git a/README b/README
index 1373c78..0acb04f 100644
--- a/README
+++ b/README
@@ -197,6 +197,8 @@
mutex Dump info only related to mutex up/down ops
profile Dump info related to profile extensions
time Dump info related to internal time keeping
+ net Dump info related to networking connections
+ rate Dump info related to IO rate switching
? or help Show available debug options.
One can specify multiple debug options: e.g. --debug=file,mem will enable
diff --git a/backend.c b/backend.c
index 2ec478c..101024d 100644
--- a/backend.c
+++ b/backend.c
@@ -649,6 +649,8 @@
else
td_set_runstate(td, TD_RUNNING);
+ lat_target_init(td);
+
while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
(!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td) ||
td->o.time_based) {
@@ -680,8 +682,11 @@
break;
io_u = get_io_u(td);
- if (!io_u)
+ if (!io_u) {
+ if (td->o.latency_target)
+ goto reap;
break;
+ }
ddir = io_u->ddir;
@@ -776,6 +781,7 @@
* can get BUSY even without IO queued, if the system is
* resource starved.
*/
+reap:
full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth);
if (full || !td->o.iodepth_batch_complete) {
min_evts = min(td->o.iodepth_batch_complete,
@@ -812,6 +818,8 @@
break;
}
}
+ if (!in_ramp_time(td) && td->o.latency_target)
+ lat_target_check(td);
if (td->o.thinktime) {
unsigned long long b;
diff --git a/cconv.c b/cconv.c
index 82383b2..dd61d10 100644
--- a/cconv.c
+++ b/cconv.c
@@ -199,6 +199,9 @@
o->flow_watermark = __le32_to_cpu(top->flow_watermark);
o->flow_sleep = le32_to_cpu(top->flow_sleep);
o->sync_file_range = le32_to_cpu(top->sync_file_range);
+ o->latency_target = le64_to_cpu(top->latency_target);
+ o->latency_window = le64_to_cpu(top->latency_window);
+ o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i));
o->compress_percentage = le32_to_cpu(top->compress_percentage);
o->compress_chunk = le32_to_cpu(top->compress_chunk);
@@ -348,6 +351,9 @@
top->flow_watermark = __cpu_to_le32(o->flow_watermark);
top->flow_sleep = cpu_to_le32(o->flow_sleep);
top->sync_file_range = cpu_to_le32(o->sync_file_range);
+ top->latency_target = __cpu_to_le64(o->latency_target);
+ top->latency_window = __cpu_to_le64(o->latency_window);
+ top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
top->compress_percentage = cpu_to_le32(o->compress_percentage);
top->compress_chunk = cpu_to_le32(o->compress_chunk);
diff --git a/client.c b/client.c
index 1557240..7975ce6 100644
--- a/client.c
+++ b/client.c
@@ -796,6 +796,11 @@
dst->first_error = le32_to_cpu(src->first_error);
dst->kb_base = le32_to_cpu(src->kb_base);
dst->unit_base = le32_to_cpu(src->unit_base);
+
+ dst->latency_depth = le32_to_cpu(src->latency_depth);
+ dst->latency_target = le64_to_cpu(src->latency_target);
+ dst->latency_window = le64_to_cpu(src->latency_window);
+ dst->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(src->latency_percentile.u.i));
}
static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src)
diff --git a/debug.h b/debug.h
index 6a74131..e248695 100644
--- a/debug.h
+++ b/debug.h
@@ -19,6 +19,7 @@
FD_PROFILE,
FD_TIME,
FD_NET,
+ FD_RATE,
FD_DEBUG_MAX,
};
diff --git a/examples/latency-profile.fio b/examples/latency-profile.fio
new file mode 100644
index 0000000..f5cd844
--- /dev/null
+++ b/examples/latency-profile.fio
@@ -0,0 +1,21 @@
+# Test job that demonstrates how to use the latency target
+# profiling. Fio will find the queue depth between 1..128
+# that fits within the latency constraints of this 4k random
+# read workload.
+
+[global]
+bs=4k
+rw=randread
+random_generator=lfsr
+direct=1
+ioengine=libaio
+iodepth=128
+# Set max acceptable latency to 500msec
+latency_target=500000
+# profile over a 5s window
+latency_window=5000000
+# 99.9% of IOs must be below the target
+latency_percentile=99.9
+
+[device]
+filename=/dev/sda
diff --git a/fio.1 b/fio.1
index 15a1ac5..9ee0eef 100644
--- a/fio.1
+++ b/fio.1
@@ -764,6 +764,23 @@
Average bandwidth for \fBrate\fR and \fBratemin\fR over this number of
milliseconds. Default: 1000ms.
.TP
+.BI latency_target \fR=\fPint
+If set, fio will attempt to find the max performance point that the given
+workload will run at while maintaining a latency below this target. The
+values is given in microseconds. See \fBlatency_window\fR and
+\fBlatency_percentile\fR.
+.TP
+.BI latency_window \fR=\fPint
+Used with \fBlatency_target\fR to specify the sample window that the job
+is run at varying queue depths to test the performance. The value is given
+in microseconds.
+.TP
+.BI latency_percentile \fR=\fPfloat
+The percentage of IOs that must fall within the criteria specified by
+\fBlatency_target\fR and \fBlatency_window\fR. If not set, this defaults
+to 100.0, meaning that all IOs must be equal or below to the value set
+by \fBlatency_target\fR.
+.TP
.BI max_latency \fR=\fPint
If set, fio will exit the job if it exceeds this maximum latency. It will exit
with an ETIME error.
diff --git a/fio.h b/fio.h
index d0fe83a..a0927bd 100644
--- a/fio.h
+++ b/fio.h
@@ -249,6 +249,16 @@
unsigned int ramp_time_over;
/*
+ * Time since last latency_window was started
+ */
+ struct timeval latency_ts;
+ unsigned int latency_qd;
+ unsigned int latency_qd_high;
+ unsigned int latency_qd_low;
+ unsigned int latency_failed;
+ uint64_t latency_ios;
+
+ /*
* read/write mixed workload state
*/
union {
@@ -483,6 +493,12 @@
extern int load_blktrace(struct thread_data *, const char *, int);
#endif
+/*
+ * Latency target helpers
+ */
+extern void lat_target_check(struct thread_data *);
+extern void lat_target_init(struct thread_data *);
+
#define for_each_td(td, i) \
for ((i) = 0, (td) = &threads[0]; (i) < (int) thread_number; (i)++, (td)++)
#define for_each_file(td, f, i) \
diff --git a/init.c b/init.c
index 4dd0c9a..ba22f78 100644
--- a/init.c
+++ b/init.c
@@ -1382,7 +1382,7 @@
printf("%s [options] [job options] <job file(s)>\n", name);
printf(" --debug=options\tEnable debug logging. May be one/more of:\n"
"\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n"
- "\t\t\tdiskutil,job,mutex,profile,time,net\n");
+ "\t\t\tdiskutil,job,mutex,profile,time,net,rate\n");
printf(" --parse-only\t\tParse options only, don't start any IO\n");
printf(" --output\t\tWrite output to file\n");
printf(" --runtime\t\tRuntime in seconds\n");
@@ -1482,6 +1482,10 @@
.help = "Network logging",
.shift = FD_NET,
},
+ { .name = "rate",
+ .help = "Rate logging",
+ .shift = FD_RATE,
+ },
{ .name = NULL, },
};
diff --git a/io_u.c b/io_u.c
index 7df57aa..ea6c251 100644
--- a/io_u.c
+++ b/io_u.c
@@ -1126,6 +1126,136 @@
return 0;
}
+static void lat_fatal(struct thread_data *td, struct io_completion_data *icd,
+ unsigned long tusec, unsigned long max_usec)
+{
+ if (!td->error)
+ log_err("fio: latency of %lu usec exceeds specified max (%lu usec)\n", tusec, max_usec);
+ td_verror(td, ETIMEDOUT, "max latency exceeded");
+ icd->error = ETIMEDOUT;
+}
+
+static void lat_new_cycle(struct thread_data *td)
+{
+ fio_gettime(&td->latency_ts, NULL);
+ td->latency_ios = ddir_rw_sum(td->io_blocks);
+ td->latency_failed = 0;
+}
+
+/*
+ * We had an IO outside the latency target. Reduce the queue depth. If we
+ * are at QD=1, then it's time to give up.
+ */
+static int __lat_target_failed(struct thread_data *td)
+{
+ if (td->latency_qd == 1)
+ return 1;
+
+ td->latency_qd_high = td->latency_qd;
+ td->latency_qd = (td->latency_qd + td->latency_qd_low) / 2;
+
+ dprint(FD_RATE, "Ramped down: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
+
+ /*
+ * When we ramp QD down, quiesce existing IO to prevent
+ * a storm of ramp downs due to pending higher depth.
+ */
+ io_u_quiesce(td);
+ lat_new_cycle(td);
+ return 0;
+}
+
+static int lat_target_failed(struct thread_data *td)
+{
+ if (td->o.latency_percentile.u.f == 100.0)
+ return __lat_target_failed(td);
+
+ td->latency_failed++;
+ return 0;
+}
+
+void lat_target_init(struct thread_data *td)
+{
+ if (td->o.latency_target) {
+ dprint(FD_RATE, "Latency target=%llu\n", td->o.latency_target);
+ fio_gettime(&td->latency_ts, NULL);
+ td->latency_qd = 1;
+ td->latency_qd_high = td->o.iodepth;
+ td->latency_qd_low = 1;
+ td->latency_ios = ddir_rw_sum(td->io_blocks);
+ } else
+ td->latency_qd = td->o.iodepth;
+}
+
+static void lat_target_success(struct thread_data *td)
+{
+ const unsigned int qd = td->latency_qd;
+
+ td->latency_qd_low = td->latency_qd;
+
+ /*
+ * If we haven't failed yet, we double up to a failing value instead
+ * of bisecting from highest possible queue depth. If we have set
+ * a limit other than td->o.iodepth, bisect between that.
+ */
+ if (td->latency_qd_high != td->o.iodepth)
+ td->latency_qd = (td->latency_qd + td->latency_qd_high) / 2;
+ else
+ td->latency_qd *= 2;
+
+ if (td->latency_qd > td->o.iodepth)
+ td->latency_qd = td->o.iodepth;
+
+ dprint(FD_RATE, "Ramped up: %d %d %d\n", td->latency_qd_low, td->latency_qd, td->latency_qd_high);
+ /*
+ * Same as last one, we are done
+ */
+ if (td->latency_qd == qd)
+ td->done = 1;
+
+ lat_new_cycle(td);
+}
+
+/*
+ * Check if we can bump the queue depth
+ */
+void lat_target_check(struct thread_data *td)
+{
+ uint64_t usec_window;
+ uint64_t ios;
+ double success_ios;
+
+ usec_window = utime_since_now(&td->latency_ts);
+ if (usec_window < td->o.latency_window)
+ return;
+
+ ios = ddir_rw_sum(td->io_blocks) - td->latency_ios;
+ success_ios = (double) (ios - td->latency_failed) / (double) ios;
+ success_ios *= 100.0;
+
+ dprint(FD_RATE, "Success rate: %.2f%% (target %.2f%%)\n", success_ios, td->o.latency_percentile.u.f);
+
+ if (success_ios >= td->o.latency_percentile.u.f)
+ lat_target_success(td);
+ else
+ __lat_target_failed(td);
+}
+
+/*
+ * If latency target is enabled, we might be ramping up or down and not
+ * using the full queue depth available.
+ */
+int queue_full(struct thread_data *td)
+{
+ const int qempty = io_u_qempty(&td->io_u_freelist);
+
+ if (qempty)
+ return 1;
+ if (!td->o.latency_target)
+ return 0;
+
+ return td->cur_depth >= td->latency_qd;
+}
struct io_u *__get_io_u(struct thread_data *td)
{
@@ -1136,7 +1266,7 @@
again:
if (!io_u_rempty(&td->io_u_requeues))
io_u = io_u_rpop(&td->io_u_requeues);
- else if (!io_u_qempty(&td->io_u_freelist)) {
+ else if (!queue_full(td)) {
io_u = io_u_qpop(&td->io_u_freelist);
io_u->buflen = 0;
@@ -1395,11 +1525,11 @@
icd->error = ops->io_u_lat(td, tusec);
}
- if (td->o.max_latency && tusec > td->o.max_latency) {
- if (!td->error)
- log_err("fio: latency of %lu usec exceeds specified max (%u usec)\n", tusec, td->o.max_latency);
- td_verror(td, ETIMEDOUT, "max latency exceeded");
- icd->error = ETIMEDOUT;
+ if (td->o.max_latency && tusec > td->o.max_latency)
+ lat_fatal(td, icd, tusec, td->o.max_latency);
+ if (td->o.latency_target && tusec > td->o.latency_target) {
+ if (lat_target_failed(td))
+ lat_fatal(td, icd, tusec, td->o.latency_target);
}
}
diff --git a/ioengine.h b/ioengine.h
index 31662eb..6dd2aa4 100644
--- a/ioengine.h
+++ b/ioengine.h
@@ -188,7 +188,6 @@
/*
* io unit handling
*/
-#define queue_full(td) io_u_qempty(&(td)->io_u_freelist)
extern struct io_u *__get_io_u(struct thread_data *);
extern struct io_u *get_io_u(struct thread_data *);
extern void put_io_u(struct thread_data *, struct io_u *);
@@ -204,6 +203,7 @@
extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
void io_u_mark_complete(struct thread_data *, unsigned int);
void io_u_mark_submit(struct thread_data *, unsigned int);
+int queue_full(struct thread_data *);
int do_io_u_sync(struct thread_data *, struct io_u *);
int do_io_u_trim(struct thread_data *, struct io_u *);
diff --git a/libfio.c b/libfio.c
index c26d6a3..7f36514 100644
--- a/libfio.c
+++ b/libfio.c
@@ -133,6 +133,8 @@
fio_gettime(&tv, NULL);
memcpy(&td->epoch, &tv, sizeof(tv));
memcpy(&td->start, &tv, sizeof(tv));
+
+ lat_target_init(td);
}
void reset_fio_state(void)
diff --git a/options.c b/options.c
index d2493e4..b0f4509 100644
--- a/options.c
+++ b/options.c
@@ -1043,6 +1043,10 @@
static struct opt_group fio_opt_cat_groups[] = {
{
+ .name = "Latency profiling",
+ .mask = FIO_OPT_G_LATPROF,
+ },
+ {
.name = "Rate",
.mask = FIO_OPT_G_RATE,
},
@@ -2623,7 +2627,38 @@
.off1 = td_var_offset(max_latency),
.help = "Maximum tolerated IO latency (usec)",
.category = FIO_OPT_C_IO,
- .group = FIO_OPT_G_RATE,
+ .group = FIO_OPT_G_LATPROF,
+ },
+ {
+ .name = "latency_target",
+ .lname = "Latency Target (usec)",
+ .type = FIO_OPT_STR_VAL_TIME,
+ .off1 = td_var_offset(latency_target),
+ .help = "Ramp to max queue depth supporting this latency",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_LATPROF,
+ },
+ {
+ .name = "latency_window",
+ .lname = "Latency Window (usec)",
+ .type = FIO_OPT_STR_VAL_TIME,
+ .off1 = td_var_offset(latency_window),
+ .help = "Time to sustain latency_target",
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_LATPROF,
+ },
+ {
+ .name = "latency_percentile",
+ .lname = "Latency Percentile",
+ .type = FIO_OPT_FLOAT_LIST,
+ .off1 = td_var_offset(latency_percentile),
+ .help = "Percentile of IOs must be below latency_target",
+ .def = "100",
+ .maxlen = 1,
+ .minfp = 0.0,
+ .maxfp = 100.0,
+ .category = FIO_OPT_C_IO,
+ .group = FIO_OPT_G_LATPROF,
},
{
.name = "invalidate",
diff --git a/options.h b/options.h
index 3795afd..fc36368 100644
--- a/options.h
+++ b/options.h
@@ -94,6 +94,7 @@
__FIO_OPT_G_NETIO,
__FIO_OPT_G_LIBAIO,
__FIO_OPT_G_ACT,
+ __FIO_OPT_G_LATPROF,
__FIO_OPT_G_NR,
FIO_OPT_G_RATE = (1U << __FIO_OPT_G_RATE),
@@ -122,6 +123,7 @@
FIO_OPT_G_NETIO = (1U << __FIO_OPT_G_NETIO),
FIO_OPT_G_LIBAIO = (1U << __FIO_OPT_G_LIBAIO),
FIO_OPT_G_ACT = (1U << __FIO_OPT_G_ACT),
+ FIO_OPT_G_LATPROF = (1U << __FIO_OPT_G_LATPROF),
FIO_OPT_G_INVALID = (1U << __FIO_OPT_G_NR),
};
diff --git a/parse.c b/parse.c
index e3f431e..6141c91 100644
--- a/parse.c
+++ b/parse.c
@@ -547,9 +547,11 @@
** Initialize precision to 0 and zero out list
** in case specified list is shorter than default
*/
- ul2 = 0;
- ilp = td_var(data, o->off2);
- *ilp = ul2;
+ if (o->off2) {
+ ul2 = 0;
+ ilp = td_var(data, o->off2);
+ *ilp = ul2;
+ }
flp = td_var(data, o->off1);
for(i = 0; i < o->maxlen; i++)
@@ -592,9 +594,11 @@
while (*++cp2 != '\0' && *cp2 >= '0' && *cp2 <= '9')
len++;
- ilp = td_var(data, o->off2);
- if (len > *ilp)
- *ilp = len;
+ if (o->off2) {
+ ilp = td_var(data, o->off2);
+ if (len > *ilp)
+ *ilp = len;
+ }
}
break;
diff --git a/server.c b/server.c
index 8324543..6e96b77 100644
--- a/server.c
+++ b/server.c
@@ -1042,6 +1042,11 @@
p.ts.kb_base = cpu_to_le32(ts->kb_base);
p.ts.unit_base = cpu_to_le32(ts->unit_base);
+ p.ts.latency_depth = cpu_to_le32(ts->latency_depth);
+ p.ts.latency_target = cpu_to_le64(ts->latency_target);
+ p.ts.latency_window = cpu_to_le64(ts->latency_window);
+ p.ts.latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f));
+
convert_gs(&p.rs, rs);
fio_net_send_cmd(server_fd, FIO_NET_CMD_TS, &p, sizeof(p), NULL, NULL);
diff --git a/server.h b/server.h
index 405370e..84901cf 100644
--- a/server.h
+++ b/server.h
@@ -38,7 +38,7 @@
};
enum {
- FIO_SERVER_VER = 27,
+ FIO_SERVER_VER = 28,
FIO_SERVER_MAX_FRAGMENT_PDU = 1024,
diff --git a/stat.c b/stat.c
index ac5ff16..5d50ae8 100644
--- a/stat.c
+++ b/stat.c
@@ -497,7 +497,6 @@
show_lat_m(io_u_lat_m);
}
-
void show_thread_status_normal(struct thread_stat *ts, struct group_run_stats *rs)
{
double usr_cpu, sys_cpu;
@@ -587,6 +586,13 @@
ts->first_error,
strerror(ts->first_error));
}
+ if (ts->latency_depth) {
+ log_info(" latency : target=%llu, window=%llu, percentile=%.2f%%, depth=%u\n",
+ (unsigned long long)ts->latency_target,
+ (unsigned long long)ts->latency_window,
+ ts->latency_percentile.u.f,
+ ts->latency_depth);
+ }
}
static void show_ddir_status_terse(struct thread_stat *ts,
@@ -975,6 +981,13 @@
json_object_add_value_int(root, "first_error", ts->first_error);
}
+ if (ts->latency_depth) {
+ json_object_add_value_int(root, "latency_depth", ts->latency_depth);
+ json_object_add_value_int(root, "latency_target", ts->latency_target);
+ json_object_add_value_float(root, "latency_percentile", ts->latency_percentile.u.f);
+ json_object_add_value_int(root, "latency_window", ts->latency_window);
+ }
+
/* Additional output if description is set */
if (strlen(ts->description))
json_object_add_value_string(root, "desc", ts->description);
@@ -1265,6 +1278,11 @@
}
}
+ ts->latency_depth = td->latency_qd;
+ ts->latency_target = td->o.latency_target;
+ ts->latency_percentile = td->o.latency_percentile;
+ ts->latency_window = td->o.latency_window;
+
sum_thread_stats(ts, &td->ts, idx);
}
diff --git a/stat.h b/stat.h
index 8190d1e..7ad0c9d 100644
--- a/stat.h
+++ b/stat.h
@@ -175,6 +175,11 @@
uint32_t kb_base;
uint32_t unit_base;
+
+ uint32_t latency_depth;
+ uint64_t latency_target;
+ fio_fp64_t latency_percentile;
+ uint64_t latency_window;
};
struct jobs_eta {
diff --git a/thread_options.h b/thread_options.h
index 44cbf91..7476d8c 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -240,6 +240,10 @@
unsigned long long number_ios;
unsigned int sync_file_range;
+
+ unsigned long long latency_target;
+ unsigned long long latency_window;
+ fio_fp64_t latency_percentile;
};
#define FIO_TOP_STR_MAX 256
@@ -446,6 +450,10 @@
uint64_t number_ios;
uint32_t sync_file_range;
+
+ uint64_t latency_target;
+ uint64_t latency_window;
+ fio_fp64_t latency_percentile;
} __attribute__((packed));
extern void convert_thread_options_to_cpu(struct thread_options *o, struct thread_options_pack *top);