Merge branch 'master' into gfio

Conflicts:
	Makefile
	backend.c
	client.c
	fio.h
	init.c
	io_ddir.h
	options.c
	server.h
	stat.c
	stat.h

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index ee017f0..08729ba 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.0.8
+DEF_VER=fio-2.0.9
 
 LF='
 '
diff --git a/HOWTO b/HOWTO
index 8a4e2bd..7170aa3 100644
--- a/HOWTO
+++ b/HOWTO
@@ -602,6 +602,16 @@
 				channel semantics (Send/Recv) for the
 				InfiniBand, RoCE and iWARP protocols.
 
+			falloc   IO engine that does regular fallocate to
+				 simulate data transfer as fio ioengine.
+				 DDIR_READ  does fallocate(,mode = keep_size,)
+				 DDIR_WRITE does fallocate(,mode = 0)
+				 DDIR_TRIM  does fallocate(,mode = punch_hole)
+
+			e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT
+				 ioctls to simulate defragment activity in
+				 response to DDIR_WRITE events
+
 			external Prefix to specify loading an external
 				IO engine object file. Append the engine
 				filename, eg ioengine=external:/tmp/foo.o
@@ -1315,6 +1325,14 @@
 [net] listen	For TCP network connections, tell fio to listen for incoming
 		connections rather than initiating an outgoing connection. The
 		hostname must be omitted if this option is used.
+[e4defrag] donorname=str
+	        File will be used as a block donor(swap extents between files)
+[e4defrag] inplace=int
+		Configure donor file blocks allocation strategy		
+		0(default): Preallocate donor's file on init
+		1 	  : allocate space immediately inside defragment event,
+			    and free right after event
+
 
 
 6.0 Interpreting the output
diff --git a/Makefile b/Makefile
index ea8c851..d851640 100644
--- a/Makefile
+++ b/Makefile
@@ -18,13 +18,14 @@
 		lib/num2str.c lib/ieee754.c $(wildcard crc/*.c) engines/cpu.c \
 		engines/mmap.c engines/sync.c engines/null.c engines/net.c \
 		memalign.c server.c client.c iolog.c backend.c libfio.c flow.c \
-		cconv.c lib/prio_tree.c
+		cconv.c lib/prio_tree.c json.c
 
 ifeq ($(UNAME), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c helpers.c cgroup.c trim.c \
 		engines/libaio.c engines/posixaio.c engines/sg.c \
 		engines/splice.c engines/syslet-rw.c engines/guasi.c \
-		engines/binject.c engines/rdma.c profiles/tiobench.c
+		engines/binject.c engines/rdma.c profiles/tiobench.c \
+		engines/fusion-aw.c engines/falloc.c engines/e4defrag.c
   LIBS += -lpthread -ldl -lrt -laio
   LDFLAGS += -rdynamic
 endif
diff --git a/README b/README
index 8bce835..535b077 100644
--- a/README
+++ b/README
@@ -121,12 +121,13 @@
 $ fio
 	--debug			Enable some debugging options (see below)
 	--output		Write output to file
-	--timeout		Runtime in seconds
+	--runtime		Runtime in seconds
 	--latency-log		Generate per-job latency logs
 	--bandwidth-log		Generate per-job bandwidth logs
 	--minimal		Minimal (terse) output
+	--output-format=type	Output format (terse,json,normal)
+	--terse-version=type	Terse version output format (default 3, or 2 or 4).
 	--version		Print version info and exit
-	--terse-version=type	Terse version output format (default 3, or 2).
 	--help			Print this page
 	--cmdhelp=cmd		Print command help, "all" for all of them
 	--enghelp=engine	Print ioengine help, or list available ioengines
@@ -138,8 +139,8 @@
 				May be "always", "never" or "auto"
 	--section=name		Only run specified section in job file.
 				Multiple sections can be specified.
-	--alloc-size=kb	Set smalloc pool to this size in kb (def 1024)
-	--warnings-fatal Fio parser warnings are fatal
+	--alloc-size=kb		Set smalloc pool to this size in kb (def 1024)
+	--warnings-fatal	Fio parser warnings are fatal
 	--max-jobs		Maximum number of threads/processes to support
 	--server=args		Start backend server. See Client/Server section.
 	--client=host		Connect to specified backend.
@@ -161,11 +162,11 @@
 
 	process		Dump info related to processes
 	file		Dump info related to file actions
-	io			Dump info related to IO queuing
-	mem			Dump info related to memory allocations
+	io		Dump info related to IO queuing
+	mem		Dump info related to memory allocations
 	blktrace	Dump info related to blktrace setup
 	verify		Dump info related to IO verification
-	all			Enable all debug options
+	all		Enable all debug options
 	random		Dump info related to random offset generation
 	parse		Dump info related to option matching and parsing
 	diskutil	Dump info related to disk utilization updates
diff --git a/backend.c b/backend.c
index f20857a..a0ac424 100644
--- a/backend.c
+++ b/backend.c
@@ -60,7 +60,7 @@
 static unsigned int nr_process = 0;
 static unsigned int nr_thread = 0;
 
-struct io_log *agg_io_log[2];
+struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 
 int groupid = 0;
 unsigned int thread_number = 0;
@@ -208,10 +208,12 @@
 {
 	int ret = 0;
 
-	if (bytes_done[0])
-		ret |= __check_min_rate(td, now, 0);
-	if (bytes_done[1])
-		ret |= __check_min_rate(td, now, 1);
+	if (bytes_done[DDIR_READ])
+		ret |= __check_min_rate(td, now, DDIR_READ);
+	if (bytes_done[DDIR_WRITE])
+		ret |= __check_min_rate(td, now, DDIR_WRITE);
+	if (bytes_done[DDIR_TRIM])
+		ret |= __check_min_rate(td, now, DDIR_TRIM);
 
 	return ret;
 }
@@ -545,11 +547,13 @@
 	unsigned long long bytes;
 
 	if (td_rw(td))
-		bytes = td->this_io_bytes[0] + td->this_io_bytes[1];
+		bytes = td->this_io_bytes[DDIR_READ] + td->this_io_bytes[DDIR_WRITE];
 	else if (td_write(td))
-		bytes = td->this_io_bytes[1];
+		bytes = td->this_io_bytes[DDIR_WRITE];
+	else if (td_read(td))
+		bytes = td->this_io_bytes[DDIR_READ];
 	else
-		bytes = td->this_io_bytes[0];
+		bytes = td->this_io_bytes[DDIR_TRIM];
 
 	return bytes >= td->o.size;
 }
@@ -572,7 +576,7 @@
 		(!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td) ||
 		td->o.time_based) {
 		struct timeval comp_time;
-		unsigned long bytes_done[2] = { 0, 0 };
+		unsigned long bytes_done[DDIR_RWDIR_CNT] = { 0, 0, 0 };
 		int min_evts = 0;
 		struct io_u *io_u;
 		int ret2, full;
@@ -649,8 +653,9 @@
 				requeue_io_u(td, &io_u);
 			} else {
 sync_done:
-				if (__should_check_rate(td, 0) ||
-				    __should_check_rate(td, 1))
+				if (__should_check_rate(td, DDIR_READ) ||
+				    __should_check_rate(td, DDIR_WRITE) ||
+				    __should_check_rate(td, DDIR_TRIM))
 					fio_gettime(&comp_time, NULL);
 
 				ret = io_u_sync_complete(td, io_u, bytes_done);
@@ -697,8 +702,9 @@
 			if (full && !min_evts)
 				min_evts = 1;
 
-			if (__should_check_rate(td, 0) ||
-			    __should_check_rate(td, 1))
+			if (__should_check_rate(td, DDIR_READ) ||
+			    __should_check_rate(td, DDIR_WRITE) ||
+			    __should_check_rate(td, DDIR_TRIM))
 				fio_gettime(&comp_time, NULL);
 
 			do {
@@ -711,7 +717,7 @@
 
 		if (ret < 0)
 			break;
-		if (!(bytes_done[0] + bytes_done[1]))
+		if (!ddir_rw_sum(bytes_done))
 			continue;
 
 		if (!in_ramp_time(td) && should_check_rate(td, bytes_done)) {
@@ -726,7 +732,7 @@
 		if (td->o.thinktime) {
 			unsigned long long b;
 
-			b = td->io_blocks[0] + td->io_blocks[1];
+			b = ddir_rw_sum(td->io_blocks);
 			if (!(b % td->o.thinktime_blocks)) {
 				int left;
 
@@ -772,7 +778,7 @@
 	/*
 	 * stop job if we failed doing any IO
 	 */
-	if ((td->this_io_bytes[0] + td->this_io_bytes[1]) == 0)
+	if (!ddir_rw_sum(td->this_io_bytes))
 		td->done = 1;
 }
 
@@ -800,6 +806,7 @@
 
 	max_units = td->o.iodepth;
 	max_bs = max(td->o.max_bs[DDIR_READ], td->o.max_bs[DDIR_WRITE]);
+	max_bs = max(td->o.max_bs[DDIR_TRIM], max_bs);
 	min_write = td->o.min_bs[DDIR_WRITE];
 	td->orig_buffer_size = (unsigned long long) max_bs
 					* (unsigned long long) max_units;
@@ -926,8 +933,6 @@
 
 static int keep_running(struct thread_data *td)
 {
-	unsigned long long io_done;
-
 	if (td->done)
 		return 0;
 	if (td->o.time_based)
@@ -937,9 +942,7 @@
 		return 1;
 	}
 
-	io_done = td->io_bytes[DDIR_READ] + td->io_bytes[DDIR_WRITE]
-			+ td->io_skip_bytes;
-	if (io_done < td->o.size)
+	if (ddir_rw_sum(td->io_bytes) < td->o.size)
 		return 1;
 
 	return 0;
@@ -1110,10 +1113,13 @@
 		memcpy(&td->iops_sample_time, &td->start, sizeof(td->start));
 		memcpy(&td->tv_cache, &td->start, sizeof(td->start));
 
-		if (td->o.ratemin[0] || td->o.ratemin[1]) {
-			memcpy(&td->lastrate[0], &td->bw_sample_time,
+		if (td->o.ratemin[DDIR_READ] || td->o.ratemin[DDIR_WRITE] ||
+				td->o.ratemin[DDIR_TRIM]) {
+		        memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
 						sizeof(td->bw_sample_time));
-			memcpy(&td->lastrate[1], &td->bw_sample_time,
+		        memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
+						sizeof(td->bw_sample_time));
+		        memcpy(&td->lastrate[DDIR_TRIM], &td->bw_sample_time,
 						sizeof(td->bw_sample_time));
 		}
 
@@ -1134,6 +1140,10 @@
 			elapsed = utime_since_now(&td->start);
 			td->ts.runtime[DDIR_WRITE] += elapsed;
 		}
+		if (td_trim(td) && td->io_bytes[DDIR_TRIM]) {
+			elapsed = utime_since_now(&td->start);
+			td->ts.runtime[DDIR_TRIM] += elapsed;
+		}
 
 		if (td->error || td->terminate)
 			break;
@@ -1156,11 +1166,13 @@
 	}
 
 	update_rusage_stat(td);
-	td->ts.runtime[0] = (td->ts.runtime[0] + 999) / 1000;
-	td->ts.runtime[1] = (td->ts.runtime[1] + 999) / 1000;
+	td->ts.runtime[DDIR_READ] = (td->ts.runtime[DDIR_READ] + 999) / 1000;
+	td->ts.runtime[DDIR_WRITE] = (td->ts.runtime[DDIR_WRITE] + 999) / 1000;
+	td->ts.runtime[DDIR_TRIM] = (td->ts.runtime[DDIR_TRIM] + 999) / 1000;
 	td->ts.total_run_time = mtime_since_now(&td->epoch);
-	td->ts.io_bytes[0] = td->io_bytes[0];
-	td->ts.io_bytes[1] = td->io_bytes[1];
+	td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
+	td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
+	td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
 
 	fio_unpin_memory(td);
 
@@ -1352,8 +1364,8 @@
 		continue;
 reaped:
 		(*nr_running)--;
-		(*m_rate) -= (td->o.ratemin[0] + td->o.ratemin[1]);
-		(*t_rate) -= (td->o.rate[0] + td->o.rate[1]);
+		(*m_rate) -= ddir_rw_sum(td->o.ratemin);
+		(*t_rate) -= ddir_rw_sum(td->o.rate);
 		if (!td->pid)
 			pending--;
 
@@ -1389,7 +1401,7 @@
 			nr_process++;
 	}
 
-	if (!terse_output) {
+	if (output_format == FIO_OUTPUT_NORMAL) {
 		log_info("Starting ");
 		if (nr_thread)
 			log_info("%d thread%s", nr_thread,
@@ -1580,8 +1592,8 @@
 				td_set_runstate(td, TD_RUNNING);
 			nr_running++;
 			nr_started--;
-			m_rate += td->o.ratemin[0] + td->o.ratemin[1];
-			t_rate += td->o.rate[0] + td->o.rate[1];
+			m_rate += ddir_rw_sum(td->o.ratemin);
+			t_rate += ddir_rw_sum(td->o.rate);
 			todo--;
 			fio_mutex_up(td->mutex);
 		}
@@ -1670,6 +1682,7 @@
 	if (write_bw_log) {
 		setup_log(&agg_io_log[DDIR_READ], 0, IO_LOG_TYPE_BW);
 		setup_log(&agg_io_log[DDIR_WRITE], 0, IO_LOG_TYPE_BW);
+		setup_log(&agg_io_log[DDIR_TRIM], 0, IO_LOG_TYPE_BW);
 	}
 
 	startup_mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
@@ -1693,6 +1706,8 @@
 			__finish_log(agg_io_log[DDIR_READ], "agg-read_bw.log");
 			__finish_log(agg_io_log[DDIR_WRITE],
 					"agg-write_bw.log");
+			__finish_log(agg_io_log[DDIR_TRIM],
+					"agg-trim_bw.log");
 		}
 	}
 
diff --git a/client.c b/client.c
index a9b63e2..7bd5284 100644
--- a/client.c
+++ b/client.c
@@ -850,7 +850,7 @@
 		log_info("\nDisk stats (read/write):\n");
 	}
 
-	print_disk_util(&du->dus, &du->agg, terse_output);
+	print_disk_util(&du->dus, &du->agg, output_format == FIO_OUTPUT_TERSE);
 }
 
 static void convert_jobs_eta(struct jobs_eta *je)
diff --git a/diskutil.c b/diskutil.c
index a3a5b4d..d2c0b97 100644
--- a/diskutil.c
+++ b/diskutil.c
@@ -597,10 +597,60 @@
 		log_info("\n");
 }
 
-void show_disk_util(int terse)
+static void print_disk_util_json(struct disk_util *du, struct json_array *array)
+{
+	double util = 0;
+	struct disk_util_stat *dus = &du->dus;
+	struct disk_util_agg *agg = &du->agg;
+	struct json_object *obj;
+
+	obj = json_create_object();
+	json_array_add_value_object(array, obj);
+
+	if (dus->msec)
+		util = (double) 100 * dus->io_ticks / (double) dus->msec;
+	if (util > 100.0)
+		util = 100.0;
+
+	/* Index 0 holds the read-side counters, index 1 the write side */
+	json_object_add_value_string(obj, "name", dus->name);
+	json_object_add_value_int(obj, "read_ios", dus->ios[0]);
+	json_object_add_value_int(obj, "write_ios", dus->ios[1]);
+	json_object_add_value_int(obj, "read_merges", dus->merges[0]);
+	json_object_add_value_int(obj, "write_merges", dus->merges[1]);
+	json_object_add_value_int(obj, "read_ticks", dus->ticks[0]);
+	json_object_add_value_int(obj, "write_ticks", dus->ticks[1]);
+	json_object_add_value_int(obj, "in_queue", dus->time_in_queue);
+	json_object_add_value_float(obj, "util", util);
+
+	/*
+	 * If the device has slaves, aggregate the stats for
+	 * those slave devices also.
+	 */
+	if (!agg->slavecount)
+		return;
+	json_object_add_value_int(obj, "aggr_read_ios",
+				agg->ios[0] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_write_ios",
+				agg->ios[1] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_read_merges",
+				agg->merges[0] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_write_merges",
+				agg->merges[1] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_read_ticks",
+				agg->ticks[0] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_write_ticks",
+				agg->ticks[1] / agg->slavecount);
+	json_object_add_value_int(obj, "aggr_in_queue",
+				agg->time_in_queue / agg->slavecount);
+	json_object_add_value_float(obj, "aggr_util", agg->max_util.u.f);
+}
+
+void show_disk_util(int terse, struct json_object *parent)
 {
 	struct flist_head *entry;
 	struct disk_util *du;
+	struct json_array *array = NULL;
 
 	fio_mutex_down(disk_util_mutex);
 
@@ -612,11 +662,19 @@
 	if (!terse)
 		log_info("\nDisk stats (read/write):\n");
 
+	if (terse && terse_version == 4) {
+		array = json_create_array();
+		json_object_add_value_array(parent, "disk_util", array);
+	}
+
 	flist_for_each(entry, &disk_list) {
 		du = flist_entry(entry, struct disk_util, list);
 
 		aggregate_slaves_stats(du);
-		print_disk_util(&du->dus, &du->agg, terse);
+		if (terse && terse_version == 4)
+			print_disk_util_json(du, array);
+		else
+			print_disk_util(&du->dus, &du->agg, terse);
 	}
 
 	fio_mutex_up(disk_util_mutex);
diff --git a/diskutil.h b/diskutil.h
index 88dde55..b223150 100644
--- a/diskutil.h
+++ b/diskutil.h
@@ -1,6 +1,6 @@
 #ifndef FIO_DISKUTIL_H
 #define FIO_DISKUTIL_H
-
+#include "json.h"
 #define FIO_DU_NAME_SZ		64
 
 /*
@@ -101,14 +101,14 @@
  */
 #ifdef FIO_HAVE_DISK_UTIL
 extern void print_disk_util(struct disk_util_stat *, struct disk_util_agg *, int terse);
-extern void show_disk_util(int terse);
+extern void show_disk_util(int terse, struct json_object *parent);
 extern void free_disk_util(void);
 extern void init_disk_util(struct thread_data *);
 extern int update_io_ticks(void);
 extern void setup_disk_util(void);
 #else
 #define print_disk_util(dus, agg, terse)
-#define show_disk_util(terse)
+#define show_disk_util(terse, parent)
 #define free_disk_util()
 #define init_disk_util(td)
 #define setup_disk_util()
diff --git a/engines/e4defrag.c b/engines/e4defrag.c
new file mode 100644
index 0000000..5affaa0
--- /dev/null
+++ b/engines/e4defrag.c
@@ -0,0 +1,215 @@
+/*
+ * ioe_e4defrag:  ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate
+ * defragment activity
+ *
+ */
+
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+
+#ifndef EXT4_IOC_MOVE_EXT
+#define EXT4_IOC_MOVE_EXT               _IOWR('f', 15, struct move_extent)
+struct move_extent {
+	__u32 reserved;         /* should be zero */
+	__u32 donor_fd;         /* donor file descriptor */
+	__u64 orig_start;       /* logical start offset in block for orig */
+	__u64 donor_start;      /* logical start offset in block for donor */
+	__u64 len;              /* block length to be moved */
+	__u64 moved_len;        /* moved block length */
+};
+#endif
+
+struct e4defrag_data {
+	int donor_fd;
+	int bsz;
+};
+
+struct e4defrag_options {
+	struct thread_data *td;
+	unsigned int inplace;
+	char * donor_name;
+};
+
+static struct fio_option options[] = {
+	{
+		.name	= "donorname",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= offsetof(struct e4defrag_options, donor_name),
+		.help	= "File used as a block donor",
+	},
+	{
+		.name	= "inplace",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct e4defrag_options, inplace),
+		.minval	= 0,
+		.maxval	= 1,
+		.help	= "Alloc and free space inside defrag event",
+	},
+	{
+		.name	= NULL,
+	},
+};
+
+static int fio_e4defrag_init(struct thread_data *td)
+{
+	int r;
+	struct e4defrag_options *o = td->eo;
+	struct e4defrag_data *ed;
+	struct stat stub;
+	char donor_name[PATH_MAX];
+
+	if (!o->donor_name || !strlen(o->donor_name)) {
+		log_err("'donorname' options required\n");
+		return 1;
+	}
+
+	ed = calloc(1, sizeof(*ed));
+	if (!ed) {
+		td_verror(td, ENOMEM, "io_queue_init");
+		return 1;
+	}
+
+	/* "<directory>/<donorname>"; snprintf keeps it inside the buffer */
+	snprintf(donor_name, sizeof(donor_name), "%s%s%s",
+		 td->o.directory ? td->o.directory : "",
+		 td->o.directory ? "/" : "", o->donor_name);
+	ed->donor_fd = open(donor_name, O_CREAT|O_WRONLY, 0644);
+	if (ed->donor_fd < 0) {
+		r = errno;	/* save before reporting can clobber it */
+		td_verror(td, r, "io_queue_init");
+		log_err("Can't open donor file %s err:%d\n", donor_name, r);
+		free(ed);
+		return 1;
+	}
+
+	if (!o->inplace) {
+		long long len = td->o.file_size_high - td->o.start_offset;
+		r = fallocate(ed->donor_fd, 0, td->o.start_offset, len);
+		if (r)
+			goto err;
+	}
+	r = fstat(ed->donor_fd, &stub);
+	if (r)
+		goto err;
+
+	ed->bsz = stub.st_blksize;
+	td->io_ops->data = ed;
+	return 0;
+err:
+	td_verror(td, errno, "io_queue_init");
+	close(ed->donor_fd);
+	free(ed);
+	return 1;
+}
+
+static void fio_e4defrag_cleanup(struct thread_data *td)
+{
+	struct e4defrag_data *ed = td->io_ops->data;
+	if (ed) {
+		if (ed->donor_fd >= 0)
+			close(ed->donor_fd);
+		free(ed);
+	}
+}
+
+
+/*
+ * Defragment one io_u: swap its block range with the donor via MOVE_EXT.
+ */
+static int fio_e4defrag_queue(struct thread_data *td, struct io_u *io_u)
+{
+	int ret;
+	unsigned long long len;
+	struct move_extent me;
+	struct fio_file *f = io_u->file;
+	struct e4defrag_data *ed = td->io_ops->data;
+	struct e4defrag_options *o = td->eo;
+
+	fio_ro_check(td, io_u);
+
+	/* Theoretically defragmentation should not change data, but it
+	 * changes data layout. So this function handles only DDIR_WRITE
+	 * in order to satisfy strict read only access pattern
+	 */
+	if (io_u->ddir != DDIR_WRITE) {
+		io_u->error = EINVAL;	/* errno is stale here: no call failed */
+		return FIO_Q_COMPLETED;
+	}
+
+	if (o->inplace) {
+		ret = fallocate(ed->donor_fd, 0, io_u->offset, io_u->xfer_buflen);
+		if (ret) {
+			io_u->error = errno;
+			goto out;
+		}
+	}
+
+	memset(&me, 0, sizeof(me));
+	me.donor_fd = ed->donor_fd;
+	me.orig_start = io_u->offset / ed->bsz;
+	me.donor_start = me.orig_start;
+	len = (io_u->offset + io_u->xfer_buflen + ed->bsz -1);
+	me.len = len / ed->bsz - me.orig_start;
+
+	ret = ioctl(f->fd, EXT4_IOC_MOVE_EXT, &me);
+	len = me.moved_len * ed->bsz;
+
+	if (io_u->file && ddir_rw(io_u->ddir))
+		io_u->file->file_pos = io_u->offset + len;
+
+	if (len > io_u->xfer_buflen)
+		len = io_u->xfer_buflen;
+
+	if (len != io_u->xfer_buflen) {
+		io_u->resid = io_u->xfer_buflen - len;
+		io_u->error = 0;
+	}
+	if (ret)
+		io_u->error = errno;
+
+	if (o->inplace) {
+		ret = ftruncate(ed->donor_fd, 0);
+		if (ret)
+			io_u->error = errno;
+	}
+out:
+	if (io_u->error)
+		td_verror(td, io_u->error, "xfer");
+	return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+	.name			= "e4defrag",
+	.version		= FIO_IOOPS_VERSION,
+	.init			= fio_e4defrag_init,
+	.queue			= fio_e4defrag_queue,
+	.open_file		= generic_open_file,
+	.close_file		= generic_close_file,
+	.get_file_size		= generic_get_file_size,
+	.flags			= FIO_SYNCIO,
+	.cleanup		= fio_e4defrag_cleanup,
+	.options		= options,
+	.option_struct_size	= sizeof(struct e4defrag_options),
+
+};
+
+static void fio_init fio_syncio_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/falloc.c b/engines/falloc.c
new file mode 100644
index 0000000..4977d9e
--- /dev/null
+++ b/engines/falloc.c
@@ -0,0 +1,119 @@
+/*
+ * falloc: ioengine for git://git.kernel.dk/fio.git
+ *
+ * IO engine that does regular fallocate to simulate data transfer 
+ * as fio ioengine.
+ * DDIR_READ  does fallocate(,mode = FALLOC_FL_KEEP_SIZE,)
+ * DDIR_WRITE does fallocate(,mode = 0) : fallocate with size extension
+ * DDIR_TRIM  does fallocate(,mode = FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)
+ *
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+#include <fcntl.h>
+
+#include "../fio.h"
+#include "../filehash.h"
+
+/*
+ * generic_open_file is not appropriate because it does not allow
+ * performing TRIM on a file
+ */
+int open_file(struct thread_data *td, struct fio_file *f)
+{
+	int from_hash = 0;
+
+	dprint(FD_FILE, "fd open %s\n", f->file_name);
+
+	if (f->filetype != FIO_TYPE_FILE) {
+		log_err("fio: only files are supported fallocate \n");
+		return 1;
+	}
+	if (!strcmp(f->file_name, "-")) {
+		log_err("fio: can't read/write to stdin/out\n");
+		return 1;
+	}
+
+open_again:
+	from_hash = file_lookup_open(f, O_CREAT|O_RDWR);
+
+	if (f->fd == -1) {
+		char buf[FIO_VERROR_SIZE];
+		int __e = errno;
+		snprintf(buf, sizeof(buf) - 1, "open(%s)", f->file_name);
+		td_verror(td, __e, buf);
+	}
+
+	if (!from_hash && f->fd != -1) {
+		if (add_file_hash(f)) {
+			int fio_unused ret;
+
+			/*
+			 * OK to ignore, we haven't done anything with it
+			 */
+			ret = generic_close_file(td, f);
+			goto open_again;
+		}
+	}
+
+	return 0;
+}
+
+#ifndef FALLOC_FL_KEEP_SIZE
+#define FALLOC_FL_KEEP_SIZE     0x01 /* default is extend size */
+#endif
+#ifndef FALLOC_FL_PUNCH_HOLE
+#define FALLOC_FL_PUNCH_HOLE    0x02 /* de-allocates range */
+#endif 
+/* Map the io_u's data direction onto a fallocate() mode and issue it. */
+static int fio_fallocate_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	int ret;
+	int flags = 0;
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir == DDIR_READ)
+		flags = FALLOC_FL_KEEP_SIZE;
+	else if (io_u->ddir == DDIR_WRITE)
+		flags = 0;
+	else if (io_u->ddir == DDIR_TRIM)
+		flags = FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE;
+
+	ret = fallocate(f->fd, flags, io_u->offset, io_u->xfer_buflen);
+	if (ret) {
+		io_u->error = errno;
+		if (io_u->error)
+			td_verror(td, io_u->error, "xfer");
+	}
+
+	/* fallocate() returns 0 on success, so advance by the request size */
+	if (io_u->file && ret == 0 && ddir_rw(io_u->ddir))
+		io_u->file->file_pos = io_u->offset + io_u->xfer_buflen;
+	return FIO_Q_COMPLETED;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "falloc",
+	.version	= FIO_IOOPS_VERSION,
+	.queue		= fio_fallocate_queue,
+	.open_file	= open_file,
+	.close_file	= generic_close_file,
+	.get_file_size	= generic_get_file_size,
+	.flags		= FIO_SYNCIO
+};
+
+static void fio_init fio_syncio_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_syncio_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/fusion-aw.c b/engines/fusion-aw.c
new file mode 100644
index 0000000..9aac43a
--- /dev/null
+++ b/engines/fusion-aw.c
@@ -0,0 +1,173 @@
+/*
+ * Custom fio(1) engine that submits synchronous atomic writes to file.
+ *
+ * Copyright (C) 2012 Fusion-io, Inc.
+ * Author: Santhosh Kumar Koundinya.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License as published by the Free
+ * Software Foundation; under version 2 of the License.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License version
+ * 2 for more details.
+ *
+ * You should have received a copy of the GNU General Public License Version 2
+ * along with this program; if not see <http://www.gnu.org/licenses/>
+ */
+
+#include <stdlib.h>
+#include <stdint.h>
+
+#include "../fio.h"
+
+#ifdef FIO_HAVE_FUSION_AW
+
+#include <vsl_dp_experimental/vectored_write.h>
+
+/* Fix sector size to 512 bytes independent of actual sector size, just like
+ * the linux kernel. */
+#define SECTOR_SHIFT    9
+#define SECTOR_SIZE    (1U<<SECTOR_SHIFT)
+
+struct acs_file_data {
+	struct vsl_iovec iov[IO_VECTOR_LIMIT];
+};
+
+static int queue(struct thread_data *td, struct io_u *io_u)
+{
+	int rc;
+	int iov_index;
+	off_t offset;
+	char *xfer_buf;
+	size_t xfer_buflen;
+	struct acs_file_data *d = io_u->file->file_data;
+
+	if (io_u->ddir != DDIR_WRITE) {
+		td_vmsg(td, -EIO, "only writes supported", "io_u->ddir");
+		rc = -EIO;
+		goto out;
+	}
+	if (io_u->xfer_buflen > IO_SIZE_MAX) {
+		td_vmsg(td, -EIO, "data too big", "io_u->xfer_buflen");
+		rc = -EIO;
+		goto out;
+	}
+	if (io_u->xfer_buflen & (SECTOR_SIZE - 1)) {
+		td_vmsg(td, -EIO, "unaligned data size", "io_u->xfer_buflen");
+		rc = -EIO;
+		goto out;
+	}
+
+	/* Chop up the write into minimal number of iovec's necessary */
+	iov_index = 0;
+	offset = io_u->offset;
+	xfer_buf = io_u->xfer_buf;
+	xfer_buflen = io_u->xfer_buflen;
+	while (xfer_buflen) {
+		struct vsl_iovec *iov = &d->iov[iov_index++];
+
+		iov->iov_len = xfer_buflen > IO_VECTOR_MAX_SIZE ?
+		    IO_VECTOR_MAX_SIZE : xfer_buflen;
+		iov->iov_base = (uint64_t) xfer_buf;
+		iov->sector = offset >> SECTOR_SHIFT;
+		iov->iov_flag = VSL_IOV_WRITE;
+
+		offset += iov->iov_len;
+		xfer_buf += iov->iov_len;
+		xfer_buflen -= iov->iov_len;
+	}
+	assert(xfer_buflen == 0);
+	assert(iov_index <= IO_VECTOR_LIMIT);
+
+	rc = vsl_vectored_write(io_u->file->fd, d->iov, iov_index, O_ATOMIC);
+	if (rc == -1) {
+		td_verror(td, -errno, "vsl_vectored_write");
+		rc = -EIO;
+		goto out;
+	} else {
+		io_u->error = 0;
+		io_u->file->file_pos = io_u->offset + rc;
+		rc = FIO_Q_COMPLETED;
+	}
+
+out:
+	if (rc < 0)
+		io_u->error = rc;
+
+	return rc;
+}
+
+static int open_file(struct thread_data *td, struct fio_file *f)
+{
+	int rc;
+	struct acs_file_data *d = NULL;
+
+	d = malloc(sizeof(*d));
+	if (!d) {
+		td_verror(td, -ENOMEM, "malloc");
+		rc = -ENOMEM;
+		goto error;
+	}
+	f->file_data = d;
+
+	rc = generic_open_file(td, f);
+
+out:
+	return rc;
+
+error:
+	f->fd = -1;
+	f->file_data = NULL;
+	if (d)
+		free(d);
+
+	goto out;
+}
+
+static int close_file(struct thread_data *td, struct fio_file *f)
+{
+	if (f->file_data) {
+		free(f->file_data);
+		f->file_data = NULL;
+	}
+
+	return generic_close_file(td, f);
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "fusion-aw-sync",
+	.version = FIO_IOOPS_VERSION,
+	.queue = queue,
+	.open_file = open_file,
+	.close_file = close_file,
+	.get_file_size = generic_get_file_size,
+	.flags = FIO_SYNCIO | FIO_RAWIO | FIO_MEMALIGN
+};
+
+#else /* !FIO_HAVE_FUSION_AW */
+
+static int fio_fusion_aw_eng_init(struct thread_data fio_unused *td)
+{
+	log_err("fio: fusion atomic write engine not available\n");
+	return 1;
+}
+
+static struct ioengine_ops ioengine = {
+	.name		= "fusion-aw-sync",
+	.version	= FIO_IOOPS_VERSION,
+	.init		= fio_fusion_aw_eng_init,
+};
+
+#endif /* FIO_HAVE_FUSION_AW */
+
+static void fio_init fio_fusion_aw_init(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_fusion_aw_exit(void)
+{
+	unregister_ioengine(&ioengine);
+}
diff --git a/engines/libaio.c b/engines/libaio.c
index 2b8c6da..748233c 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -257,11 +257,20 @@
 static int fio_libaio_init(struct thread_data *td)
 {
 	struct libaio_data *ld = malloc(sizeof(*ld));
-	int err;
+	struct libaio_options *o = td->eo;
+	int err = 0;
 
 	memset(ld, 0, sizeof(*ld));
 
-	err = io_queue_init(td->o.iodepth, &ld->aio_ctx);
+	/*
+	 * First try passing in INT_MAX for queue depth, since we don't
+	 * care about the user ring. If that fails, the kernel is too old
+	 * and we need the right depth.
+	 */
+	if (!o->userspace_reap)
+		err = io_queue_init(INT_MAX, &ld->aio_ctx);
+	if (o->userspace_reap || err == -EINVAL)
+		err = io_queue_init(td->o.iodepth, &ld->aio_ctx);
 	if (err) {
 		td_verror(td, -err, "io_queue_init");
 		log_err("fio: check /proc/sys/fs/aio-max-nr\n");
diff --git a/engines/sync.c b/engines/sync.c
index 3377f81..bd912e7 100644
--- a/engines/sync.c
+++ b/engines/sync.c
@@ -75,9 +75,10 @@
 		ret = pread(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
 	else if (io_u->ddir == DDIR_WRITE)
 		ret = pwrite(f->fd, io_u->xfer_buf, io_u->xfer_buflen, io_u->offset);
-	else if (io_u->ddir == DDIR_TRIM)
-		ret = do_io_u_trim(td, io_u);
-	else
+	else if (io_u->ddir == DDIR_TRIM) {
+		do_io_u_trim(td, io_u);
+		return FIO_Q_COMPLETED;
+	} else
 		ret = do_io_u_sync(td, io_u);
 
 	return fio_io_end(td, io_u, ret);
@@ -94,9 +95,10 @@
 		ret = read(f->fd, io_u->xfer_buf, io_u->xfer_buflen);
 	else if (io_u->ddir == DDIR_WRITE)
 		ret = write(f->fd, io_u->xfer_buf, io_u->xfer_buflen);
-	else if (io_u->ddir == DDIR_TRIM)
-		ret = do_io_u_trim(td, io_u);
-	else
+	else if (io_u->ddir == DDIR_TRIM) {
+		do_io_u_trim(td, io_u);
+		return FIO_Q_COMPLETED;
+	} else
 		ret = do_io_u_sync(td, io_u);
 
 	return fio_io_end(td, io_u, ret);
diff --git a/eta.c b/eta.c
index f491fea..600b046 100644
--- a/eta.c
+++ b/eta.c
@@ -53,11 +53,16 @@
 				c = 'r';
 			else
 				c = 'R';
-		} else {
+		} else if (td_write(td)) {
 			if (td_random(td))
 				c = 'w';
 			else
 				c = 'W';
+		} else {
+			if (td_random(td))
+				c = 'd';
+			else
+				c = 'D';
 		}
 		break;
 	case TD_PRE_READING:
@@ -150,7 +155,7 @@
 	if (td->runstate == TD_RUNNING || td->runstate == TD_VERIFYING) {
 		double perc, perc_t;
 
-		bytes_done = td->io_bytes[DDIR_READ] + td->io_bytes[DDIR_WRITE];
+		bytes_done = ddir_rw_sum(td->io_bytes);
 		perc = (double) bytes_done / (double) bytes_total;
 		if (perc > 1.0)
 			perc = 1.0;
@@ -171,6 +176,7 @@
 			|| td->runstate == TD_RAMP
 			|| td->runstate == TD_PRE_READING) {
 		int t_eta = 0, r_eta = 0;
+		unsigned long long rate_bytes;
 
 		/*
 		 * We can only guess - assume it'll run the full timeout
@@ -189,9 +195,9 @@
 					t_eta -= ramp_left;
 			}
 		}
-		if (td->o.rate[0] || td->o.rate[1]) {
-			r_eta = (bytes_total / 1024) /
-					(td->o.rate[0] + td->o.rate[1]);
+		rate_bytes = ddir_rw_sum(td->o.rate);
+		if (rate_bytes) {
+			r_eta = (bytes_total / 1024) / rate_bytes;
 			r_eta += td->o.start_delay;
 		}
 
@@ -218,23 +224,25 @@
 {
 	int i;
 
-	for (i = 0; i <= DDIR_WRITE; i++) {
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		unsigned long long diff;
 
 		diff = io_bytes[i] - prev_io_bytes[i];
 		rate[i] = ((1000 * diff) / mtime) / 1024;
+
+		prev_io_bytes[i] = io_bytes[i];
 	}
-	prev_io_bytes[0] = io_bytes[0];
-	prev_io_bytes[1] = io_bytes[1];
 }
 
 static void calc_iops(unsigned long mtime, unsigned long long *io_iops,
 		      unsigned long long *prev_io_iops, unsigned int *iops)
 {
-	iops[0] = ((io_iops[0] - prev_io_iops[0]) * 1000) / mtime;
-	iops[1] = ((io_iops[1] - prev_io_iops[1]) * 1000) / mtime;
-	prev_io_iops[0] = io_iops[0];
-	prev_io_iops[1] = io_iops[1];
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		iops[i] = ((io_iops[i] - prev_io_iops[i]) * 1000) / mtime;
+		prev_io_iops[i] = io_iops[i];
+	}
 }
 
 /*
@@ -246,26 +254,28 @@
 	struct thread_data *td;
 	int i;
 	unsigned long rate_time, disp_time, bw_avg_time, *eta_secs;
-	unsigned long long io_bytes[2];
-	unsigned long long io_iops[2];
+	unsigned long long io_bytes[DDIR_RWDIR_CNT];
+	unsigned long long io_iops[DDIR_RWDIR_CNT];
 	struct timeval now;
 
-	static unsigned long long rate_io_bytes[2];
-	static unsigned long long disp_io_bytes[2];
-	static unsigned long long disp_io_iops[2];
+	static unsigned long long rate_io_bytes[DDIR_RWDIR_CNT];
+	static unsigned long long disp_io_bytes[DDIR_RWDIR_CNT];
+	static unsigned long long disp_io_iops[DDIR_RWDIR_CNT];
 	static struct timeval rate_prev_time, disp_prev_time;
 
 	if (!force) {
-		if (temp_stall_ts || terse_output || eta_print == FIO_ETA_NEVER)
+		if (output_format != FIO_OUTPUT_NORMAL)
+			return 0;
+		if (temp_stall_ts || eta_print == FIO_ETA_NEVER)
 			return 0;
 
 		if (!isatty(STDOUT_FILENO) && (eta_print != FIO_ETA_ALWAYS))
 			return 0;
 	}
 
-	if (!rate_io_bytes[0] && !rate_io_bytes[1])
+	if (!ddir_rw_sum(rate_io_bytes))
 		fill_start_time(&rate_prev_time);
-	if (!disp_io_bytes[0] && !disp_io_bytes[1])
+	if (!ddir_rw_sum(disp_io_bytes))
 		fill_start_time(&disp_prev_time);
 
 	eta_secs = malloc(thread_number * sizeof(unsigned long));
@@ -273,8 +283,8 @@
 
 	je->elapsed_sec = (mtime_since_genesis() + 999) / 1000;
 
-	io_bytes[0] = io_bytes[1] = 0;
-	io_iops[0] = io_iops[1] = 0;
+	io_bytes[DDIR_READ] = io_bytes[DDIR_WRITE] = io_bytes[DDIR_TRIM] = 0;
+	io_iops[DDIR_READ] = io_iops[DDIR_WRITE] = io_iops[DDIR_TRIM] = 0;
 	bw_avg_time = ULONG_MAX;
 	for_each_td(td, i) {
 		if (is_power_of_2(td->o.kb_base))
@@ -297,6 +307,13 @@
 				je->m_rate[1] += td->o.ratemin[DDIR_WRITE];
 				je->m_iops[1] += td->o.rate_iops_min[DDIR_WRITE];
 			}
+			if (td_trim(td)) {
+				je->t_rate[2] += td->o.rate[DDIR_TRIM];
+				je->t_iops[2] += td->o.rate_iops[DDIR_TRIM];
+				je->m_rate[2] += td->o.ratemin[DDIR_TRIM];
+				je->m_iops[2] += td->o.rate_iops_min[DDIR_TRIM];
+			}
+
 			je->files_open += td->nr_open_files;
 		} else if (td->runstate == TD_RAMP) {
 			je->nr_running++;
@@ -312,10 +329,11 @@
 		check_str_update(td);
 
 		if (td->runstate > TD_RAMP) {
-			io_bytes[0] += td->io_bytes[0];
-			io_bytes[1] += td->io_bytes[1];
-			io_iops[0] += td->io_blocks[0];
-			io_iops[1] += td->io_blocks[1];
+			int ddir;
+			for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+				io_bytes[ddir] += td->io_bytes[ddir];
+				io_iops[ddir] += td->io_blocks[ddir];
+			}
 		}
 	}
 
@@ -344,6 +362,7 @@
 		memcpy(&rate_prev_time, &now, sizeof(now));
 		add_agg_sample(je->rate[DDIR_READ], DDIR_READ, 0);
 		add_agg_sample(je->rate[DDIR_WRITE], DDIR_WRITE, 0);
+		add_agg_sample(je->rate[DDIR_TRIM], DDIR_TRIM, 0);
 	}
 
 	disp_time = mtime_since(&disp_prev_time, &now);
@@ -397,10 +416,11 @@
 	}
 	if (je->eta_sec != INT_MAX && je->nr_running) {
 		char perc_str[32];
-		char *iops_str[2];
-		char *rate_str[2];
+		char *iops_str[DDIR_RWDIR_CNT];
+		char *rate_str[DDIR_RWDIR_CNT];
 		size_t left;
 		int l;
+		int ddir;
 
 		if ((!je->eta_sec && !eta_good) || je->nr_ramp == je->nr_running)
 			strcpy(perc_str, "-.-% done");
@@ -410,26 +430,28 @@
 			sprintf(perc_str, "%3.1f%% done", perc);
 		}
 
-		rate_str[0] = num2str(je->rate[0], 5, 1024, je->is_pow2);
-		rate_str[1] = num2str(je->rate[1], 5, 1024, je->is_pow2);
-
-		iops_str[0] = num2str(je->iops[0], 4, 1, 0);
-		iops_str[1] = num2str(je->iops[1], 4, 1, 0);
+		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+			rate_str[ddir] = num2str(je->rate[ddir], 5,
+						1024, je->is_pow2);
+			iops_str[ddir] = num2str(je->iops[ddir], 4, 1, 0);
+		}
 
 		left = sizeof(output) - (p - output) - 1;
 
-		l = snprintf(p, left, ": [%s] [%s] [%s/%s /s] [%s/%s iops] [eta %s]",
-				je->run_str, perc_str, rate_str[0],
-				rate_str[1], iops_str[0], iops_str[1], eta_str);
+		l = snprintf(p, left, ": [%s] [%s] [%s/%s/%s /s] [%s/%s/%s iops] [eta %s]",
+				je->run_str, perc_str, rate_str[DDIR_READ],
+				rate_str[DDIR_WRITE], rate_str[DDIR_TRIM],
+				iops_str[DDIR_READ], iops_str[DDIR_WRITE],
+				iops_str[DDIR_TRIM], eta_str);
 		p += l;
 		if (l >= 0 && l < linelen_last)
 			p += sprintf(p, "%*s", linelen_last - l, "");
 		linelen_last = l;
 
-		free(rate_str[0]);
-		free(rate_str[1]);
-		free(iops_str[0]);
-		free(iops_str[1]);
+		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
+			free(rate_str[ddir]);
+			free(iops_str[ddir]);
+		}
 	}
 	p += sprintf(p, "\r");
 
diff --git a/examples/e4defrag b/examples/e4defrag
new file mode 100644
index 0000000..cb94e85
--- /dev/null
+++ b/examples/e4defrag
@@ -0,0 +1,41 @@
+[global]
+ioengine=e4defrag
+directory=/scratch
+nrfiles=1
+filesize=100M
+size=100M
+bs=32k
+#group_reporting
+
+[isolated-e4defrag]
+# It is important to disable buffered io
+buffered=0
+donorname=file.def
+filename=file1
+inplace=0
+rw=write
+
+# Run e4defrag and aio-dio workers in parallel
+[e4defrag]
+stonewall
+time_based=30
+runtime=30
+ioengine=e4defrag
+buffered=0
+donorname=file.def
+filename=file1
+inplace=0
+rw=write
+
+[random-aio-32k]
+ioengine=libaio
+runtime=30
+verify=md5
+direct=1
+bs=64k
+iodepth=128
+filename=file1
+rw=randrw
+numjobs=4
+
+
diff --git a/examples/e4defrag2 b/examples/e4defrag2
new file mode 100644
index 0000000..818618a
--- /dev/null
+++ b/examples/e4defrag2
@@ -0,0 +1,88 @@
+#################################################
+# Hardcode defragmentation patterns
+# Please be careful, it can trigger a kernel panic
+#################################################
+[global]
+ioengine=e4defrag
+group_reporting
+directory=/scratch
+nrfiles=1
+filesize=100M
+size=100M
+donorname=file.def
+bs=32k
+
+###########
+# Run several defragmentation threads for different files, but
+# use shared donor file
+[parallel-e4defrag]
+buffered=0
+inplace=0
+rw=write
+numjobs=4
+
+########
+# Run two defragmentation threads, each thread use another's file
+# as donor file
+
+[e4defrag-1]
+stonewall
+inplace=0
+rw=write
+donorname=e4defrag-2
+
+[e4defrag-2]
+inplace=0
+rw=write
+donorname=e4defrag-1
+
+###########
+# Run random defragment activity 
+[e4defrag-fuzzer-4k]
+stonewall
+inplace=1
+bs=4k
+rw=randwrite
+filename=file
+donorname=file.def
+
+########
+# Run random e4defrag and various aio workers in parallel
+[e4defrag-fuzzer-4k]
+stonewall
+continue_on_error=all
+inplace=1
+bs=4k
+donorname=file3.def
+filename=file3
+time_based=30
+rw=randwrite
+
+[buffered-aio-32k]
+continue_on_error=none
+verify=md5
+buffered=1
+ioengine=libaio
+iodepth=128
+bs=32k
+filename=file3
+rw=randrw
+runtime=30
+time_based=30
+numjobs=4
+
+[direct-aio-32k]
+continue_on_error=none
+verify=md5
+buffered=0
+direct=1
+ioengine=libaio
+iodepth=128
+bs=32k
+filename=file3
+rw=randrw
+runtime=30
+time_based=30
+numjobs=4
+
+
diff --git a/examples/falloc b/examples/falloc
new file mode 100644
index 0000000..fa30731
--- /dev/null
+++ b/examples/falloc
@@ -0,0 +1,54 @@
+[global]
+ioengine=falloc
+iodepth=1
+direct=0
+buffered=0
+directory=/scratch
+nrfiles=1
+size=100M
+filesize=100M
+group_reporting
+
+
+# Run falloc and punch_hole threads in parallel
+# After activity file will be highly fragmented
+[falloc-fuzzer]
+stonewall
+runtime=10
+time_based=10
+bssplit=4k/10:64k/50:32k/40
+rw=randwrite
+numjobs=1
+filename=fragmented_file
+
+[punch hole-fuzzer]
+bs=4k
+runtime=10
+time_based=10
+rw=randtrim
+numjobs=2
+filename=fragmented_file
+
+## Measure IO performance on fragmented file
+[sequential aio-dio write]
+stonewall
+ioengine=libaio
+numjobs=1
+iodepth=128
+buffered=0
+direct=1
+rw=write
+bs=64k
+filename=fragmented_file
+
+[sequential buffered read]
+stonewall
+ioengine=sync
+numjobs=1
+iodepth=1
+buffered=1
+direct=0
+rw=read
+bs=64k
+filename=fragmented_file
+
diff --git a/examples/fusion-aw-sync.ini b/examples/fusion-aw-sync.ini
new file mode 100644
index 0000000..c4639f0
--- /dev/null
+++ b/examples/fusion-aw-sync.ini
@@ -0,0 +1,15 @@
+# Example Job File that randomly writes 8k worth of data atomically for
+# 60 seconds.
+[rw_aw_file_sync]
+rw=randwrite
+ioengine=fusion-aw-sync
+blocksize=8k
+blockalign=8k
+
+filename=/mnt/fs/file
+randrepeat=1
+fallocate=none
+direct=1
+invalidate=0
+runtime=60
+time_based
diff --git a/file.h b/file.h
index 68f9a6e..42fd58c 100644
--- a/file.h
+++ b/file.h
@@ -153,6 +153,7 @@
 extern int __must_check generic_open_file(struct thread_data *, struct fio_file *);
 extern int __must_check generic_close_file(struct thread_data *, struct fio_file *);
 extern int __must_check generic_get_file_size(struct thread_data *, struct fio_file *);
+extern int __must_check file_lookup_open(struct fio_file *f, int flags);
 extern int __must_check pre_read_files(struct thread_data *);
 extern int add_file(struct thread_data *, const char *);
 extern int add_file_exclusive(struct thread_data *, const char *);
diff --git a/filesetup.c b/filesetup.c
index 9c486be..79e29da 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -435,7 +435,7 @@
 	return ret;
 }
 
-static int file_lookup_open(struct fio_file *f, int flags)
+int file_lookup_open(struct fio_file *f, int flags)
 {
 	struct fio_file *__f;
 	int from_hash;
@@ -468,6 +468,11 @@
 
 	dprint(FD_FILE, "fd open %s\n", f->file_name);
 
+	if (td_trim(td) && f->filetype != FIO_TYPE_BD) {
+		log_err("fio: trim only applies to block device\n");
+		return 1;
+	}
+
 	if (!strcmp(f->file_name, "-")) {
 		if (td_rw(td)) {
 			log_err("fio: can't read/write to stdin/out\n");
@@ -482,14 +487,17 @@
 			f_out = stderr;
 	}
 
+	if (td_trim(td))
+		goto skip_flags;
 	if (td->o.odirect)
 		flags |= OS_O_DIRECT;
 	if (td->o.sync_io)
 		flags |= O_SYNC;
-	if (f->filetype != FIO_TYPE_FILE)
-		flags |= FIO_O_NOATIME;
 	if (td->o.create_on_open)
 		flags |= O_CREAT;
+skip_flags:
+	if (f->filetype != FIO_TYPE_FILE)
+		flags |= FIO_O_NOATIME;
 
 open_again:
 	if (td_write(td)) {
@@ -503,7 +511,7 @@
 			f->fd = dup(STDOUT_FILENO);
 		else
 			from_hash = file_lookup_open(f, flags);
-	} else {
+	} else if (td_read(td)) {
 		if (f->filetype == FIO_TYPE_CHAR && !read_only)
 			flags |= O_RDWR;
 		else
@@ -513,6 +521,9 @@
 			f->fd = dup(STDIN_FILENO);
 		else
 			from_hash = file_lookup_open(f, flags);
+	} else { //td trim
+		flags |= O_RDWR;
+		from_hash = file_lookup_open(f, flags);
 	}
 
 	if (f->fd == -1) {
@@ -755,8 +766,11 @@
 
 		if (f->io_size == -1ULL)
 			total_size = -1ULL;
-		else
+		else {
+                        if (td->o.size_percent)
+                                f->io_size = (f->io_size * td->o.size_percent) / 100;
 			total_size += f->io_size;
+		}
 
 		if (f->filetype == FIO_TYPE_FILE &&
 		    (f->io_size + f->file_offset) > f->real_file_size &&
@@ -770,9 +784,6 @@
 		}
 	}
 
-	if (td->o.size_percent)
-		total_size = (total_size * td->o.size_percent) / 100;
-
 	if (!td->o.size || td->o.size > total_size)
 		td->o.size = total_size;
 
@@ -781,7 +792,7 @@
 	 */
 	if (need_extend) {
 		temp_stall_ts = 1;
-		if (!terse_output)
+		if (output_format == FIO_OUTPUT_NORMAL)
 			log_info("%s: Laying out IO file(s) (%u file(s) /"
 				 " %lluMB)\n", td->o.name, need_extend,
 					extend_size >> 20);
diff --git a/fio.1 b/fio.1
index d94be33..3c0002c 100644
--- a/fio.1
+++ b/fio.1
@@ -20,8 +20,8 @@
 .BI \-\-output \fR=\fPfilename
 Write output to \fIfilename\fR.
 .TP
-.BI \-\-timeout \fR=\fPtimeout
-Limit run time to \fItimeout\fR seconds.
+.BI \-\-runtime \fR=\fPruntime
+Limit run time to \fIruntime\fR seconds.
 .TP
 .B \-\-latency\-log
 Generate per-job latency logs.
@@ -472,6 +472,21 @@
 .B external
 Loads an external I/O engine object file.  Append the engine filename as
 `:\fIenginepath\fR'.
+.TP
+.B falloc
+   IO engine that does regular Linux native fallocate call to simulate data
+transfer as fio ioengine
+.br
+  DDIR_READ  does fallocate(,mode = FALLOC_FL_KEEP_SIZE,)
+.br
+  DDIR_WRITE does fallocate(,mode = 0)
+.br
+  DDIR_TRIM does fallocate(,mode = FALLOC_FL_KEEP_SIZE|FALLOC_FL_PUNCH_HOLE)
+.TP
+.B e4defrag
+IO engine that does regular EXT4_IOC_MOVE_EXT ioctls to simulate defragment activity
+in request to DDIR_WRITE event
+.TP
 .RE
 .RE
 .TP
@@ -1063,6 +1078,20 @@
 For TCP network connections, tell fio to listen for incoming
 connections rather than initiating an outgoing connection. The
 hostname must be omitted if this option is used.
+.TP
+.BI (e4defrag,donorname) \fR=\fPstr
+File will be used as a block donor (swap extents between files)
+.TP
+.BI (e4defrag,inplace) \fR=\fPint
+Configure donor file block allocation strategy		
+.RS
+.BI 0(default) :
+Preallocate donor's file on init
+.TP
+.BI 1:
+allocate space immediately inside defragment event, and free right after event
+.RE
+.TP
 .SH OUTPUT
 While running, \fBfio\fR will display the status of the created jobs.  For
 example:
diff --git a/fio.h b/fio.h
index 95d9d77..4b3c63b 100644
--- a/fio.h
+++ b/fio.h
@@ -78,10 +78,10 @@
 	struct io_log *bw_log;
 	struct io_log *iops_log;
 
-	uint64_t stat_io_bytes[2];
+	uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
 	struct timeval bw_sample_time;
 
-	uint64_t stat_io_blocks[2];
+	uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
 	struct timeval iops_sample_time;
 
 	struct rusage ru_start;
@@ -180,21 +180,21 @@
 	/*
 	 * Rate state
 	 */
-	unsigned long long rate_bps[2];
-	long rate_pending_usleep[2];
-	unsigned long rate_bytes[2];
-	unsigned long rate_blocks[2];
-	struct timeval lastrate[2];
+	unsigned long long rate_bps[DDIR_RWDIR_CNT];
+	long rate_pending_usleep[DDIR_RWDIR_CNT];
+	unsigned long rate_bytes[DDIR_RWDIR_CNT];
+	unsigned long rate_blocks[DDIR_RWDIR_CNT];
+	struct timeval lastrate[DDIR_RWDIR_CNT];
 
 	unsigned long long total_io_size;
 	unsigned long long fill_device_size;
 
-	unsigned long io_issues[2];
-	unsigned long long io_blocks[2];
-	unsigned long long this_io_blocks[2];
-	unsigned long long io_bytes[2];
+	unsigned long io_issues[DDIR_RWDIR_CNT];
+	unsigned long long io_blocks[DDIR_RWDIR_CNT];
+	unsigned long long this_io_blocks[DDIR_RWDIR_CNT];
+	unsigned long long io_bytes[DDIR_RWDIR_CNT];
 	unsigned long long io_skip_bytes;
-	unsigned long long this_io_bytes[2];
+	unsigned long long this_io_bytes[DDIR_RWDIR_CNT];
 	unsigned long long zone_bytes;
 	struct fio_mutex *mutex;
 
@@ -312,7 +312,7 @@
 extern unsigned int thread_number;
 extern int shm_id;
 extern int groupid;
-extern int terse_output;
+extern int output_format;
 extern int temp_stall_ts;
 extern uintptr_t page_mask, page_size;
 extern int read_only;
@@ -513,10 +513,12 @@
 {
 	int ret = 0;
 
-	if (bytes_done[0])
-		ret |= __should_check_rate(td, 0);
-	if (bytes_done[1])
-		ret |= __should_check_rate(td, 1);
+	if (bytes_done[DDIR_READ])
+		ret |= __should_check_rate(td, DDIR_READ);
+	if (bytes_done[DDIR_WRITE])
+		ret |= __should_check_rate(td, DDIR_WRITE);
+	if (bytes_done[DDIR_TRIM])
+		ret |= __should_check_rate(td, DDIR_TRIM);
 
 	return ret;
 }
@@ -553,4 +555,10 @@
 
 #define ARRAY_SIZE(x) (sizeof((x)) / (sizeof((x)[0])))
 
+enum {
+	FIO_OUTPUT_TERSE	= 0,
+	FIO_OUTPUT_JSON,
+	FIO_OUTPUT_NORMAL,
+};
+
 #endif
diff --git a/gclient.c b/gclient.c
index 78a5c36..f317dc7 100644
--- a/gclient.c
+++ b/gclient.c
@@ -694,7 +694,7 @@
 	const int add_mask = 0x17e;
 	int i, j;
 
-	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
+	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
 
 	gtk_list_store_append(model, &iter);
 
diff --git a/init.c b/init.c
index ae96c6c..6604a18 100644
--- a/init.c
+++ b/init.c
@@ -36,7 +36,7 @@
 struct thread_data *threads = NULL;
 
 int exitall_on_terminate = 0;
-int terse_output = 0;
+int output_format = FIO_OUTPUT_NORMAL;
 int eta_print;
 FILE *f_out = NULL;
 FILE *f_err = NULL;
@@ -116,6 +116,11 @@
 		.val		= 'm' | FIO_CLIENT_FLAG,
 	},
 	{
+		.name		= (char *) "output-format",
+		.has_arg	= optional_argument,
+		.val		= 'F' | FIO_CLIENT_FLAG,
+	},
+	{
 		.name		= (char *) "version",
 		.has_arg	= no_argument,
 		.val		= 'v' | FIO_CLIENT_FLAG,
@@ -369,6 +374,8 @@
 		ret = __setup_rate(td, DDIR_READ);
 	if (td->o.rate[DDIR_WRITE] || td->o.rate_iops[DDIR_WRITE])
 		ret |= __setup_rate(td, DDIR_WRITE);
+	if (td->o.rate[DDIR_TRIM] || td->o.rate_iops[DDIR_TRIM])
+		ret |= __setup_rate(td, DDIR_TRIM);
 
 	return ret;
 }
@@ -377,7 +384,9 @@
 {
 	return o->min_bs[DDIR_READ] == o->max_bs[DDIR_READ] &&
 		o->min_bs[DDIR_WRITE] == o->max_bs[DDIR_WRITE] &&
-		o->min_bs[DDIR_READ] == o->min_bs[DDIR_WRITE];
+		o->min_bs[DDIR_TRIM] == o->max_bs[DDIR_TRIM] &&
+		o->min_bs[DDIR_READ] == o->min_bs[DDIR_WRITE] &&
+		o->min_bs[DDIR_READ] == o->min_bs[DDIR_TRIM];
 }
 
 /*
@@ -433,8 +442,14 @@
 		o->min_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];
 	if (!o->max_bs[DDIR_WRITE])
 		o->max_bs[DDIR_WRITE] = o->bs[DDIR_WRITE];
+	if (!o->min_bs[DDIR_TRIM])
+		o->min_bs[DDIR_TRIM] = o->bs[DDIR_TRIM];
+	if (!o->max_bs[DDIR_TRIM])
+		o->max_bs[DDIR_TRIM] = o->bs[DDIR_TRIM];
+
 
 	o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]);
+	o->rw_min_bs = min(o->min_bs[DDIR_TRIM], o->rw_min_bs);
 
 	/*
 	 * For random IO, allow blockalign offset other than min_bs.
@@ -443,9 +458,12 @@
 		o->ba[DDIR_READ] = o->min_bs[DDIR_READ];
 	if (!o->ba[DDIR_WRITE] || !td_random(td))
 		o->ba[DDIR_WRITE] = o->min_bs[DDIR_WRITE];
+	if (!o->ba[DDIR_TRIM] || !td_random(td))
+		o->ba[DDIR_TRIM] = o->min_bs[DDIR_TRIM];
 
 	if ((o->ba[DDIR_READ] != o->min_bs[DDIR_READ] ||
-	    o->ba[DDIR_WRITE] != o->min_bs[DDIR_WRITE]) &&
+	    o->ba[DDIR_WRITE] != o->min_bs[DDIR_WRITE] ||
+	    o->ba[DDIR_TRIM] != o->min_bs[DDIR_TRIM]) &&
 	    !o->norandommap) {
 		log_err("fio: Any use of blockalign= turns off randommap\n");
 		o->norandommap = 1;
@@ -498,15 +516,19 @@
 	if (o->open_files > o->nr_files || !o->open_files)
 		o->open_files = o->nr_files;
 
-	if (((o->rate[0] + o->rate[1]) && (o->rate_iops[0] + o->rate_iops[1]))||
-	    ((o->ratemin[0] + o->ratemin[1]) && (o->rate_iops_min[0] +
-		o->rate_iops_min[1]))) {
+	if (((o->rate[DDIR_READ] + o->rate[DDIR_WRITE] + o->rate[DDIR_TRIM]) &&
+	    (o->rate_iops[DDIR_READ] + o->rate_iops[DDIR_WRITE] + o->rate_iops[DDIR_TRIM])) ||
+	    ((o->ratemin[DDIR_READ] + o->ratemin[DDIR_WRITE] + o->ratemin[DDIR_TRIM]) &&
+	    (o->rate_iops_min[DDIR_READ] + o->rate_iops_min[DDIR_WRITE] + o->rate_iops_min[DDIR_TRIM]))) {
 		log_err("fio: rate and rate_iops are mutually exclusive\n");
 		ret = 1;
 	}
-	if ((o->rate[0] < o->ratemin[0]) || (o->rate[1] < o->ratemin[1]) ||
-	    (o->rate_iops[0] < o->rate_iops_min[0]) ||
-	    (o->rate_iops[1] < o->rate_iops_min[1])) {
+	if ((o->rate[DDIR_READ] < o->ratemin[DDIR_READ]) ||
+	    (o->rate[DDIR_WRITE] < o->ratemin[DDIR_WRITE]) ||
+	    (o->rate[DDIR_TRIM] < o->ratemin[DDIR_TRIM]) ||
+	    (o->rate_iops[DDIR_READ] < o->rate_iops_min[DDIR_READ]) ||
+	    (o->rate_iops[DDIR_WRITE] < o->rate_iops_min[DDIR_WRITE]) ||
+	    (o->rate_iops[DDIR_TRIM] < o->rate_iops_min[DDIR_TRIM])) {
 		log_err("fio: minimum rate exceeds rate\n");
 		ret = 1;
 	}
@@ -818,10 +840,12 @@
 	else
 		memcpy(td->ts.percentile_list, def_percentile_list, sizeof(def_percentile_list));
 
-	td->ts.clat_stat[0].min_val = td->ts.clat_stat[1].min_val = ULONG_MAX;
-	td->ts.slat_stat[0].min_val = td->ts.slat_stat[1].min_val = ULONG_MAX;
-	td->ts.lat_stat[0].min_val = td->ts.lat_stat[1].min_val = ULONG_MAX;
-	td->ts.bw_stat[0].min_val = td->ts.bw_stat[1].min_val = ULONG_MAX;
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		td->ts.clat_stat[i].min_val = ULONG_MAX;
+		td->ts.slat_stat[i].min_val = ULONG_MAX;
+		td->ts.lat_stat[i].min_val = ULONG_MAX;
+		td->ts.bw_stat[i].min_val = ULONG_MAX;
+	}
 	td->ddir_seq_nr = td->o.ddir_seq_nr;
 
 	if ((td->o.stonewall || td->o.new_group) && prev_group_jobs) {
@@ -853,24 +877,26 @@
 	if (!td->o.name)
 		td->o.name = strdup(jobname);
 
-	if (!terse_output) {
+	if (output_format == FIO_OUTPUT_NORMAL) {
 		if (!job_add_num) {
 			if (is_backend && !recursed)
 				fio_server_send_add_job(td);
 
 			if (!(td->io_ops->flags & FIO_NOIO)) {
-				char *c1, *c2, *c3, *c4;
+				char *c1, *c2, *c3, *c4, *c5, *c6;
 
 				c1 = fio_uint_to_kmg(td->o.min_bs[DDIR_READ]);
 				c2 = fio_uint_to_kmg(td->o.max_bs[DDIR_READ]);
 				c3 = fio_uint_to_kmg(td->o.min_bs[DDIR_WRITE]);
 				c4 = fio_uint_to_kmg(td->o.max_bs[DDIR_WRITE]);
+				c5 = fio_uint_to_kmg(td->o.min_bs[DDIR_TRIM]);
+				c6 = fio_uint_to_kmg(td->o.max_bs[DDIR_TRIM]);
 
-				log_info("%s: (g=%d): rw=%s, bs=%s-%s/%s-%s,"
+				log_info("%s: (g=%d): rw=%s, bs=%s-%s/%s-%s/%s-%s,"
 					 " ioengine=%s, iodepth=%u\n",
 						td->o.name, td->groupid,
 						ddir_str(td->o.td_ddir),
-						c1, c2, c3, c4,
+						c1, c2, c3, c4, c5, c6,
 						td->io_ops->name,
 						td->o.iodepth);
 
@@ -878,6 +904,8 @@
 				free(c2);
 				free(c3);
 				free(c4);
+				free(c5);
+				free(c6);
 			}
 		} else if (job_add_num == 1)
 			log_info("...\n");
@@ -1186,12 +1214,13 @@
 		"\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n"
 		"\t\t\tdiskutil,job,mutex,profile,time,net\n");
 	printf("  --output\t\tWrite output to file\n");
-	printf("  --timeout\t\tRuntime in seconds\n");
+	printf("  --runtime\t\tRuntime in seconds\n");
 	printf("  --latency-log\t\tGenerate per-job latency logs\n");
 	printf("  --bandwidth-log\tGenerate per-job bandwidth logs\n");
 	printf("  --minimal\t\tMinimal (terse) output\n");
-	printf("  --version\t\tPrint version info and exit\n");
+	printf("  --output-format=x\tOutput format (terse,json,normal)\n");
 	printf("  --terse-version=x\tSet terse version output format to 'x'\n");
+	printf("  --version\t\tPrint version info and exit\n");
 	printf("  --help\t\tPrint this page\n");
 	printf("  --cmdhelp=cmd\t\tPrint command help, \"all\" for all of"
 		" them\n");
@@ -1425,7 +1454,17 @@
 			f_err = f_out;
 			break;
 		case 'm':
-			terse_output = 1;
+			output_format = FIO_OUTPUT_TERSE;
+			break;
+		case 'F':
+			if (!strcmp(optarg, "minimal") ||
+			    !strcmp(optarg, "terse") ||
+			    !strcmp(optarg, "csv"))
+				output_format = FIO_OUTPUT_TERSE;
+			else if (!strcmp(optarg, "json"))
+				output_format = FIO_OUTPUT_JSON;
+			else
+				output_format = FIO_OUTPUT_NORMAL;
 			break;
 		case 'h':
 			if (!cur_client) {
@@ -1459,7 +1498,8 @@
 			break;
 		case 'V':
 			terse_version = atoi(optarg);
-			if (!(terse_version == 2 || terse_version == 3)) {
+			if (!(terse_version == 2 || terse_version == 3) ||
+			     (terse_version == 4)) {
 				log_err("fio: bad terse version format\n");
 				exit_val = 1;
 				do_exit++;
@@ -1708,7 +1748,7 @@
 		fio_gtod_cpu = def_thread.o.gtod_cpu;
 	}
 
-	if (!terse_output)
+	if (output_format == FIO_OUTPUT_NORMAL)
 		log_info("%s\n", fio_version_string);
 
 	return 0;
diff --git a/io_ddir.h b/io_ddir.h
index 908101a..f28f755 100644
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -3,12 +3,13 @@
 
 enum fio_ddir {
 	DDIR_READ = 0,
-	DDIR_WRITE,
-	DDIR_SYNC,
+	DDIR_WRITE = 1,
+	DDIR_TRIM = 2,
+	DDIR_RWDIR_CNT = 3,
+	DDIR_SYNC = 3,
 	DDIR_DATASYNC,
 	DDIR_SYNC_FILE_RANGE,
 	DDIR_WAIT,
-	DDIR_TRIM,
 	DDIR_INVAL = -1,
 };
 
@@ -16,14 +17,17 @@
 	TD_DDIR_READ		= 1 << 0,
 	TD_DDIR_WRITE		= 1 << 1,
 	TD_DDIR_RAND		= 1 << 2,
+	TD_DDIR_TRIM		= 1 << 3,
 	TD_DDIR_RW		= TD_DDIR_READ | TD_DDIR_WRITE,
 	TD_DDIR_RANDREAD	= TD_DDIR_READ | TD_DDIR_RAND,
 	TD_DDIR_RANDWRITE	= TD_DDIR_WRITE | TD_DDIR_RAND,
 	TD_DDIR_RANDRW		= TD_DDIR_RW | TD_DDIR_RAND,
+	TD_DDIR_RANDTRIM	= TD_DDIR_TRIM | TD_DDIR_RAND,
 };
 
 #define td_read(td)		((td)->o.td_ddir & TD_DDIR_READ)
 #define td_write(td)		((td)->o.td_ddir & TD_DDIR_WRITE)
+#define td_trim(td)		((td)->o.td_ddir & TD_DDIR_TRIM)
 #define td_rw(td)		(((td)->o.td_ddir & TD_DDIR_RW) == TD_DDIR_RW)
 #define td_random(td)		((td)->o.td_ddir & TD_DDIR_RAND)
 #define file_randommap(td, f)	(!(td)->o.norandommap && (f)->file_map)
@@ -36,15 +40,21 @@
 
 static inline int ddir_rw(enum fio_ddir ddir)
 {
-	return ddir == DDIR_READ || ddir == DDIR_WRITE;
+	return ddir == DDIR_READ || ddir == DDIR_WRITE || ddir == DDIR_TRIM;
 }
 
 static inline const char *ddir_str(enum fio_ddir ddir)
 {
 	const char *ddir_str[] = { NULL, "read", "write", "rw", NULL,
-				   "randread", "randwrite", "randrw" };
+				   "randread", "randwrite", "randrw",
+				   "trim", NULL, NULL, NULL, "randtrim" };
 
 	return ddir_str[ddir];
 }
 
+#define ddir_trim(ddir) ((ddir) == DDIR_TRIM)
+
+#define ddir_rw_sum(arr)	\
+	((arr)[DDIR_READ] + (arr)[DDIR_WRITE] + (arr)[DDIR_TRIM])
+
 #endif
diff --git a/io_u.c b/io_u.c
index 28a86f7..b0d51ef 100644
--- a/io_u.c
+++ b/io_u.c
@@ -15,7 +15,7 @@
 	int nr;				/* input */
 
 	int error;			/* output */
-	unsigned long bytes_done[2];	/* output */
+	unsigned long bytes_done[DDIR_RWDIR_CNT];	/* output */
 	struct timeval time;		/* output */
 };
 
@@ -543,6 +543,8 @@
 	if (td_rw(td) && __should_check_rate(td, odir))
 		td->rate_pending_usleep[odir] -= usec;
 
+	if (ddir_trim(ddir))
+		return ddir;
 	return ddir;
 }
 
@@ -599,8 +601,10 @@
 		ddir = td->rwmix_ddir;
 	} else if (td_read(td))
 		ddir = DDIR_READ;
-	else
+	else if (td_write(td))
 		ddir = DDIR_WRITE;
+	else
+		ddir = DDIR_TRIM;
 
 	td->rwmix_ddir = rate_ddir(td, ddir);
 	return td->rwmix_ddir;
@@ -1406,7 +1410,7 @@
 					(usec_for_io(td, idx) -
 					 utime_since_now(&td->start));
 			}
-			if (__should_check_rate(td, odx))
+			if (idx != DDIR_TRIM && __should_check_rate(td, odx))
 				td->rate_pending_usleep[odx] =
 					(usec_for_io(td, odx) -
 					 utime_since_now(&td->start));
@@ -1444,13 +1448,15 @@
 static void init_icd(struct thread_data *td, struct io_completion_data *icd,
 		     int nr)
 {
+	int ddir;
 	if (!td->o.disable_clat || !td->o.disable_bw)
 		fio_gettime(&icd->time, NULL);
 
 	icd->nr = nr;
 
 	icd->error = 0;
-	icd->bytes_done[0] = icd->bytes_done[1] = 0;
+	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+		icd->bytes_done[ddir] = 0;
 }
 
 static void ios_completed(struct thread_data *td,
@@ -1489,8 +1495,10 @@
 	}
 
 	if (bytes) {
-		bytes[0] += icd.bytes_done[0];
-		bytes[1] += icd.bytes_done[1];
+		int ddir;
+
+		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+			bytes[ddir] += icd.bytes_done[ddir];
 	}
 
 	return 0;
@@ -1527,8 +1535,10 @@
 	}
 
 	if (bytes) {
-		bytes[0] += icd.bytes_done[0];
-		bytes[1] += icd.bytes_done[1];
+		int ddir;
+
+		for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++)
+			bytes[ddir] += icd.bytes_done[ddir];
 	}
 
 	return 0;
diff --git a/ioengines.c b/ioengines.c
index b3c2e51..8b71e13 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -104,7 +104,9 @@
 	 * Unlike the included modules, external engines should have a
 	 * non-static ioengine structure that we can reference.
 	 */
-	ops = dlsym(dlhandle, "ioengine");
+	ops = dlsym(dlhandle, engine_lib);
+	if (!ops)
+		ops = dlsym(dlhandle, "ioengine");
 	if (!ops) {
 		td_vmsg(td, -1, dlerror(), "dlsym");
 		dlclose(dlhandle);
@@ -293,7 +295,7 @@
 			 "support direct IO, or iomem_align= is bad.\n");
 	}
 
-	if (!td->io_ops->commit) {
+	if (!td->io_ops->commit || ddir_trim(io_u->ddir)) {
 		io_u_mark_submit(td, 1);
 		io_u_mark_complete(td, 1);
 	}
@@ -302,8 +304,7 @@
 		if (ddir_rw(io_u->ddir)) {
 			io_u_mark_depth(td, 1);
 			td->ts.total_io_u[io_u->ddir]++;
-		} else if (io_u->ddir == DDIR_TRIM)
-			td->ts.total_io_u[2]++;
+		}
 	} else if (ret == FIO_Q_QUEUED) {
 		int r;
 
diff --git a/iolog.h b/iolog.h
index 122a982..4ad4e79 100644
--- a/iolog.h
+++ b/iolog.h
@@ -52,7 +52,7 @@
 	 * Windowed average, for logging single entries average over some
 	 * period of time.
 	 */
-	struct io_stat avg_window[2];
+	struct io_stat avg_window[DDIR_RWDIR_CNT];
 	unsigned long avg_msec;
 	unsigned long avg_last;
 };
@@ -123,7 +123,7 @@
 extern void finish_log(struct thread_data *, struct io_log *, const char *);
 extern void finish_log_named(struct thread_data *, struct io_log *, const char *, const char *);
 extern void __finish_log(struct io_log *, const char *);
-extern struct io_log *agg_io_log[2];
+extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 extern int write_bw_log;
 extern void add_agg_sample(unsigned long, enum fio_ddir, unsigned int);
 
diff --git a/json.c b/json.c
new file mode 100644
index 0000000..8efbbda
--- /dev/null
+++ b/json.c
@@ -0,0 +1,336 @@
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <errno.h>
+#include <stdarg.h>
+#include "json.h"
+#include "log.h"
+
+/*
+ * Allocate an empty, zero-filled JSON object; returns NULL if out of memory.
+ */
+struct json_object *json_create_object(void)
+{
+	return calloc(1, sizeof(struct json_object));
+}
+
+/*
+ * Allocate an empty, zero-filled JSON array; returns NULL if out of memory.
+ */
+struct json_array *json_create_array(void)
+{
+	return calloc(1, sizeof(struct json_array));
+}
+
+static struct json_pair *json_create_pair(const char *name, struct json_value *value)
+{
+	struct json_pair *pair = malloc(sizeof(struct json_pair));
+	if (pair) {
+		pair->name = strdup(name);
+		pair->value = value;
+
+		value->parent_type = JSON_PARENT_TYPE_PAIR;
+		value->parent_pair = pair;
+	}
+	return pair;
+}
+
+static struct json_value *json_create_value_int(long number)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_INTEGER;
+		value->integer_number = number;
+	}
+	return value;
+}
+
+/* Box @number as a JSON float value; returns NULL if out of memory. */
+static struct json_value *json_create_value_float(double number)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+	if (value) {
+		value->type = JSON_TYPE_FLOAT;
+		value->float_number = number;	/* double in, double stored */
+	}
+	return value;
+}
+
+static struct json_value *json_create_value_string(const char *str)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_STRING;
+		value->string = strdup(str);
+		if (!value->string) {
+			free(value);
+			value = NULL;
+		}
+	}
+	return value;
+}
+
+static struct json_value *json_create_value_object(struct json_object *obj)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_OBJECT;
+		value->object = obj;
+		obj->parent = value;
+	}
+	return value;
+}
+
+static struct json_value *json_create_value_array(struct json_array *array)
+{
+	struct json_value *value = malloc(sizeof(struct json_value));
+
+	if (value) {
+		value->type = JSON_TYPE_ARRAY;
+		value->array = array;
+		array->parent = value;
+	}
+	return value;
+}
+
+static void json_free_pair(struct json_pair *pair);
+static void json_free_value(struct json_value *value);
+
+void json_free_object(struct json_object *obj)
+{
+	int i;
+
+	for (i = 0; i < obj->pair_cnt; i++)
+		json_free_pair(obj->pairs[i]);
+	free(obj->pairs);
+	free(obj);
+}
+
+static void json_free_array(struct json_array *array)
+{
+	int i;
+
+	for (i = 0; i < array->value_cnt; i++)
+		json_free_value(array->values[i]);
+	free(array->values);
+	free(array);
+}
+
+static void json_free_pair(struct json_pair *pair)
+{
+	json_free_value(pair->value);
+	free(pair->name);
+	free(pair);
+}
+
+static void json_free_value(struct json_value *value)
+{
+	switch (value->type) {
+	case JSON_TYPE_STRING:
+		free(value->string);
+		break;
+	case JSON_TYPE_OBJECT:
+		json_free_object(value->object);
+		break;
+	case JSON_TYPE_ARRAY:
+		json_free_array(value->array);
+		break;
+	}
+	free(value);
+}
+
+static int json_array_add_value(struct json_array *array, struct json_value *value)
+{
+	struct json_value **values = realloc(array->values,
+		sizeof(struct json_value *) * (array->value_cnt + 1));
+
+	if (!values)
+		return ENOMEM;
+	values[array->value_cnt] = value;
+	array->value_cnt++;
+	array->values = values;
+
+	value->parent_type = JSON_PARENT_TYPE_ARRAY;
+	value->parent_array = array;
+	return 0;
+}
+
+static int json_object_add_pair(struct json_object *obj, struct json_pair *pair)
+{
+	struct json_pair **pairs = realloc(obj->pairs,
+		sizeof(struct json_pair *) * (obj->pair_cnt + 1));
+	if (!pairs)
+		return ENOMEM;
+	pairs[obj->pair_cnt] = pair;
+	obj->pair_cnt++;
+	obj->pairs = pairs;
+
+	pair->parent = obj;
+	return 0;
+}
+
+/* Add a @name -> value pair of @type (vararg) to @obj; 0 on success, else ENOMEM. */
+int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...)
+{
+	struct json_value *value;
+	struct json_pair *pair;
+	va_list args;
+
+	va_start(args, type);
+	if (type == JSON_TYPE_STRING)
+		value = json_create_value_string(va_arg(args, char *));
+	else if (type == JSON_TYPE_INTEGER)
+		value = json_create_value_int(va_arg(args, long));
+	else if (type == JSON_TYPE_FLOAT)
+		value = json_create_value_float(va_arg(args, double));
+	else if (type == JSON_TYPE_OBJECT)
+		value = json_create_value_object(va_arg(args, struct json_object *));
+	else
+		value = json_create_value_array(va_arg(args, struct json_array *));
+	va_end(args);
+
+	if (!value)
+		return ENOMEM;
+
+	pair = json_create_pair(name, value);
+	if (!pair) {
+		json_free_value(value);
+		return ENOMEM;
+	}
+	/* A pair whose strdup()'d name is NULL must not reach the printer. */
+	if (!pair->name || json_object_add_pair(obj, pair)) {
+		json_free_pair(pair);
+		return ENOMEM;
+	}
+	return 0;
+}
+
+static void json_print_array(struct json_array *array);
+int json_array_add_value_type(struct json_array *array, int type, ...)
+{
+	struct json_value *value;
+	va_list args;
+	int ret;
+
+	va_start(args, type);
+	if (type == JSON_TYPE_STRING)
+		value = json_create_value_string(va_arg(args, char *));
+	else if (type == JSON_TYPE_INTEGER)
+		value = json_create_value_int(va_arg(args, long));
+	else if (type == JSON_TYPE_FLOAT)
+		value = json_create_value_float(va_arg(args, double));
+	else if (type == JSON_TYPE_OBJECT)
+		value = json_create_value_object(va_arg(args, struct json_object *));
+	else
+		value = json_create_value_array(va_arg(args, struct json_array *));
+	va_end(args);
+
+	if (!value)
+		return ENOMEM;
+
+	ret = json_array_add_value(array, value);
+	if (ret) {
+		json_free_value(value);
+		return ENOMEM;
+	}
+	return 0;
+}
+
+static int json_value_level(struct json_value *value);
+static int json_pair_level(struct json_pair *pair);
+static int json_array_level(struct json_array *array);
+static int json_object_level(struct json_object *object)
+{
+	if (object->parent == NULL)
+		return 0;
+	return json_value_level(object->parent);
+}
+
+static int json_pair_level(struct json_pair *pair)
+{
+	return json_object_level(pair->parent) + 1;
+}
+
+static int json_array_level(struct json_array *array)
+{
+	return json_value_level(array->parent);
+}
+
+static int json_value_level(struct json_value *value)
+{
+	if (value->parent_type == JSON_PARENT_TYPE_PAIR)
+		return json_pair_level(value->parent_pair);
+	else
+		return json_array_level(value->parent_array) + 1;
+}
+
+static void json_print_level(int level)
+{
+	while (level-- > 0)
+		log_info("  ");
+}
+
+static void json_print_pair(struct json_pair *pair);
+static void json_print_array(struct json_array *array);
+static void json_print_value(struct json_value *value);
+void json_print_object(struct json_object *obj)
+{
+	int i;
+
+	log_info("{\n");
+	for (i = 0; i < obj->pair_cnt; i++) {
+		if (i > 0)
+			log_info(",\n");
+		json_print_pair(obj->pairs[i]);
+	}
+	log_info("\n");
+	json_print_level(json_object_level(obj));
+	log_info("}");
+}
+
+static void json_print_pair(struct json_pair *pair)
+{
+	json_print_level(json_pair_level(pair));
+	log_info("\"%s\" : ", pair->name);
+	json_print_value(pair->value);
+}
+
+static void json_print_array(struct json_array *array)
+{
+	int i;
+
+	log_info("[\n");
+	for (i = 0; i < array->value_cnt; i++) {
+		if (i > 0)
+			log_info(",\n");
+		json_print_level(json_value_level(array->values[i]));
+		json_print_value(array->values[i]);
+	}
+	log_info("\n");
+	json_print_level(json_array_level(array));
+	log_info("]");
+}
+
+static void json_print_value(struct json_value *value)
+{
+	switch (value->type) {
+	case JSON_TYPE_STRING:
+		log_info("\"%s\"", value->string);
+		break;
+	case JSON_TYPE_INTEGER:
+		log_info("%ld", value->integer_number);
+		break;
+	case JSON_TYPE_FLOAT:
+		log_info("%.2f", value->float_number);
+		break;
+	case JSON_TYPE_OBJECT:
+		json_print_object(value->object);
+		break;
+	case JSON_TYPE_ARRAY:
+		json_print_array(value->array);
+		break;
+	}
+}
diff --git a/json.h b/json.h
new file mode 100644
index 0000000..4d05e82
--- /dev/null
+++ b/json.h
@@ -0,0 +1,77 @@
+#ifndef __JSON__H
+#define __JSON__H
+struct json_object;
+struct json_array;
+struct json_pair;
+
+#define JSON_TYPE_STRING 0
+#define JSON_TYPE_INTEGER 1
+#define JSON_TYPE_FLOAT 2
+#define JSON_TYPE_OBJECT 3
+#define JSON_TYPE_ARRAY 4
+#define JSON_PARENT_TYPE_PAIR 0
+#define JSON_PARENT_TYPE_ARRAY 1
+struct json_value {
+	int type;	/* one of the JSON_TYPE_* constants; selects the member used below */
+	union {
+		long integer_number;
+		double float_number;
+		char *string;
+		struct json_object *object;
+		struct json_array *array;
+	};
+	int parent_type;	/* JSON_PARENT_TYPE_PAIR or JSON_PARENT_TYPE_ARRAY */
+	union {
+		struct json_pair *parent_pair;
+		struct json_array *parent_array;
+	};
+};
+
+struct json_array {
+	struct json_value **values;	/* elements, printed in index order */
+	int value_cnt;	/* number of entries in values */
+	struct json_value *parent;	/* value that holds this array */
+};
+
+struct json_object {
+	struct json_pair **pairs;	/* name/value members, printed in index order */
+	int pair_cnt;	/* number of entries in pairs */
+	struct json_value *parent;	/* presumably the value holding this object - confirm in json.c */
+};
+
+struct json_pair {
+	char *name;	/* member name, printed between double quotes */
+	struct json_value *value;	/* value printed after the name */
+	struct json_object *parent;	/* enclosing object; the pair's print depth is derived from it */
+};
+
+struct json_object *json_create_object(void);
+struct json_array *json_create_array(void);
+
+void json_free_object(struct json_object *obj);
+
+int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...);	/* type: JSON_TYPE_* tag of the one variadic value */
+#define json_object_add_value_int(obj, name, val) \
+	json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (val))	/* NOTE(review): val passes through "..." unconverted - confirm it matches what json.c reads */
+#define json_object_add_value_float(obj, name, val) \
+	json_object_add_value_type((obj), name, JSON_TYPE_FLOAT, (val))
+#define json_object_add_value_string(obj, name, val) \
+	json_object_add_value_type((obj), name, JSON_TYPE_STRING, (val))
+#define json_object_add_value_object(obj, name, val) \
+	json_object_add_value_type((obj), name, JSON_TYPE_OBJECT, (val))
+#define json_object_add_value_array(obj, name, val) \
+	json_object_add_value_type((obj), name, JSON_TYPE_ARRAY, (val))
+int json_array_add_value_type(struct json_array *array, int type, ...);	/* type: JSON_TYPE_* tag of the one variadic value */
+#define json_array_add_value_int(obj, val) \
+	json_array_add_value_type((obj), JSON_TYPE_INTEGER, (val))
+#define json_array_add_value_float(obj, val) \
+	json_array_add_value_type((obj), JSON_TYPE_FLOAT, (val))
+#define json_array_add_value_string(obj, val) \
+	json_array_add_value_type((obj), JSON_TYPE_STRING, (val))
+#define json_array_add_value_object(obj, val) \
+	json_array_add_value_type((obj), JSON_TYPE_OBJECT, (val))
+#define json_array_add_value_array(obj, val) \
+	json_array_add_value_type((obj), JSON_TYPE_ARRAY, (val))
+
+void json_print_object(struct json_object *obj);
+#endif
diff --git a/libfio.c b/libfio.c
index 36876dd..f680be8 100644
--- a/libfio.c
+++ b/libfio.c
@@ -74,13 +74,16 @@
 
 static void reset_io_counters(struct thread_data *td)
 {
-	td->stat_io_bytes[0] = td->stat_io_bytes[1] = 0;
-	td->this_io_bytes[0] = td->this_io_bytes[1] = 0;
-	td->stat_io_blocks[0] = td->stat_io_blocks[1] = 0;
-	td->this_io_blocks[0] = td->this_io_blocks[1] = 0;
+	int ddir;
+	for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+		td->stat_io_bytes[ddir] = 0;
+		td->this_io_bytes[ddir] = 0;
+		td->stat_io_blocks[ddir] = 0;
+		td->this_io_blocks[ddir] = 0;
+		td->rate_bytes[ddir] = 0;
+		td->rate_blocks[ddir] = 0;
+	}
 	td->zone_bytes = 0;
-	td->rate_bytes[0] = td->rate_bytes[1] = 0;
-	td->rate_blocks[0] = td->rate_blocks[1] = 0;
 
 	td->last_was_sync = 0;
 
@@ -115,16 +118,15 @@
 
 	reset_io_counters(td);
 
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		td->io_bytes[i] = 0;
 		td->io_blocks[i] = 0;
 		td->io_issues[i] = 0;
 		td->ts.total_io_u[i] = 0;
+		td->ts.runtime[i] = 0;
 	}
 
 	fio_gettime(&tv, NULL);
-	td->ts.runtime[0] = 0;
-	td->ts.runtime[1] = 0;
 	memcpy(&td->epoch, &tv, sizeof(tv));
 	memcpy(&td->start, &tv, sizeof(tv));
 }
diff --git a/options.c b/options.c
index 2201f59..0394456 100644
--- a/options.c
+++ b/options.c
@@ -165,7 +165,7 @@
 static int str_bssplit_cb(void *data, const char *input)
 {
 	struct thread_data *td = data;
-	char *str, *p, *odir;
+	char *str, *p, *odir, *ddir;
 	int ret = 0;
 
 	p = str = strdup(input);
@@ -175,7 +175,21 @@
 
 	odir = strchr(str, ',');
 	if (odir) {
-		ret = bssplit_ddir(&td->o, DDIR_WRITE, odir + 1);
+		ddir = strchr(odir + 1, ',');
+		if (ddir) {
+			ret = bssplit_ddir(&td->o, DDIR_TRIM, ddir + 1);
+			if (!ret)
+				*ddir = '\0';
+		} else {
+			char *op;
+
+			op = strdup(odir + 1);
+			ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+
+			free(op);
+		}
+		if (!ret)
+			ret = bssplit_ddir(&td->o, DDIR_WRITE, odir + 1);
 		if (!ret) {
 			*odir = '\0';
 			ret = bssplit_ddir(&td->o, DDIR_READ, str);
@@ -184,12 +198,15 @@
 		char *op;
 
 		op = strdup(str);
-
-		ret = bssplit_ddir(&td->o, DDIR_READ, str);
-		if (!ret)
-			ret = bssplit_ddir(&td->o, DDIR_WRITE, op);
-
+		ret = bssplit_ddir(&td->o, DDIR_WRITE, op);
 		free(op);
+		if (!ret) {
+			op = strdup(str);
+			ret = bssplit_ddir(&td->o, DDIR_TRIM, op);
+			free(op);
+		}
+		if (!ret)	/* keep an earlier WRITE/TRIM failure instead of overwriting it */
+			ret = bssplit_ddir(&td->o, DDIR_READ, str);
 	}
 
 	free(p);
@@ -950,6 +967,10 @@
 			    .oval = TD_DDIR_WRITE,
 			    .help = "Sequential write",
 			  },
+			  { .ival = "trim",
+			    .oval = TD_DDIR_TRIM,
+			    .help = "Sequential trim",
+			  },
 			  { .ival = "randread",
 			    .oval = TD_DDIR_RANDREAD,
 			    .help = "Random read",
@@ -958,6 +979,10 @@
 			    .oval = TD_DDIR_RANDWRITE,
 			    .help = "Random write",
 			  },
+			  { .ival = "randtrim",
+			    .oval = TD_DDIR_RANDTRIM,
+			    .help = "Random trim",
+			  },
 			  { .ival = "rw",
 			    .oval = TD_DDIR_RW,
 			    .help = "Sequential read and write mix",
@@ -1077,6 +1102,21 @@
 			    .help = "RDMA IO engine",
 			  },
 #endif
+#ifdef FIO_HAVE_FUSION_AW
+			  { .ival = "fusion-aw-sync",
+			    .help = "Fusion-io atomic write engine",
+			  },
+#endif
+#ifdef FIO_HAVE_E4_ENG
+			  { .ival = "e4defrag",
+			    .help = "ext4 defrag engine",
+			  },
+#endif
+#ifdef FIO_HAVE_FALLOC_ENG
+			  { .ival = "falloc",
+			    .help = "fallocate() file based engine",
+			  },
+#endif
 			  { .ival = "external",
 			    .help = "Load external engine (append name)",
 			  },
@@ -1200,6 +1240,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= td_var_offset(bs[DDIR_READ]),
 		.off2	= td_var_offset(bs[DDIR_WRITE]),
+		.off3	= td_var_offset(bs[DDIR_TRIM]),
 		.minval = 1,
 		.help	= "Block size unit",
 		.def	= "4k",
@@ -1216,6 +1257,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= td_var_offset(ba[DDIR_READ]),
 		.off2	= td_var_offset(ba[DDIR_WRITE]),
+		.off3	= td_var_offset(ba[DDIR_TRIM]),
 		.minval	= 1,
 		.help	= "IO block offset alignment",
 		.parent	= "rw",
@@ -1233,6 +1275,8 @@
 		.off2	= td_var_offset(max_bs[DDIR_READ]),
 		.off3	= td_var_offset(min_bs[DDIR_WRITE]),
 		.off4	= td_var_offset(max_bs[DDIR_WRITE]),
+		.off5	= td_var_offset(min_bs[DDIR_TRIM]),
+		.off6	= td_var_offset(max_bs[DDIR_TRIM]),
 		.minval = 1,
 		.help	= "Set block size range (in more detail than bs)",
 		.parent = "rw",
@@ -2115,8 +2159,9 @@
 		.name	= "rate",
 		.lname	= "I/O rate",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate[0]),
-		.off2	= td_var_offset(rate[1]),
+		.off1	= td_var_offset(rate[DDIR_READ]),
+		.off2	= td_var_offset(rate[DDIR_WRITE]),
+		.off3	= td_var_offset(rate[DDIR_TRIM]),
 		.help	= "Set bandwidth rate",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RATE,
@@ -2125,8 +2170,9 @@
 		.name	= "ratemin",
 		.lname	= "I/O min rate",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(ratemin[0]),
-		.off2	= td_var_offset(ratemin[1]),
+		.off1	= td_var_offset(ratemin[DDIR_READ]),
+		.off2	= td_var_offset(ratemin[DDIR_WRITE]),
+		.off3	= td_var_offset(ratemin[DDIR_TRIM]),
 		.help	= "Job must meet this rate or it will be shutdown",
 		.parent	= "rate",
 		.hide	= 1,
@@ -2137,8 +2183,9 @@
 		.name	= "rate_iops",
 		.lname	= "I/O rate IOPS",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate_iops[0]),
-		.off2	= td_var_offset(rate_iops[1]),
+		.off1	= td_var_offset(rate_iops[DDIR_READ]),
+		.off2	= td_var_offset(rate_iops[DDIR_WRITE]),
+		.off3	= td_var_offset(rate_iops[DDIR_TRIM]),
 		.help	= "Limit IO used to this number of IO operations/sec",
 		.hide	= 1,
 		.category = FIO_OPT_C_IO,
@@ -2148,8 +2195,9 @@
 		.name	= "rate_iops_min",
 		.lname	= "I/O min rate IOPS",
 		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(rate_iops_min[0]),
-		.off2	= td_var_offset(rate_iops_min[1]),
+		.off1	= td_var_offset(rate_iops_min[DDIR_READ]),
+		.off2	= td_var_offset(rate_iops_min[DDIR_WRITE]),
+		.off3	= td_var_offset(rate_iops_min[DDIR_TRIM]),
 		.help	= "Job must meet this rate or it will be shut down",
 		.parent	= "rate_iops",
 		.hide	= 1,
diff --git a/os/os-linux.h b/os/os-linux.h
index 3f5b2d8..081f5d6 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -45,6 +45,7 @@
 #define FIO_HAVE_CLOCK_MONOTONIC
 #define FIO_HAVE_GETTID
 #define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_E4_ENG
 
 /*
  * Can only enable this for newer glibcs, or the header and defines are
@@ -57,6 +58,10 @@
 #define FIO_HAVE_LINUX_FALLOCATE
 #endif
 
+#ifdef FIO_HAVE_LINUX_FALLOCATE
+#define FIO_HAVE_FALLOC_ENG
+#endif
+
 #ifdef SYNC_FILE_RANGE_WAIT_BEFORE
 #define FIO_HAVE_SYNC_FILE_RANGE
 #endif
diff --git a/os/windows/install.wxs b/os/windows/install.wxs
index f8da8fc..571492e 100755
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -10,7 +10,7 @@
 	<Product Id="2BA394F9-0D9E-4597-BB9D-6B18097D64BB"

 	  Codepage="1252" Language="1033"

 	  Manufacturer="fio" Name="fio"

-	  UpgradeCode="2338A332-5511-43cf-b9BD-5C60496CCFCC" Version="2.0.8">

+	  UpgradeCode="2338A332-5511-43cf-b9BD-5C60496CCFCC" Version="2.0.9">

 		<Package 

 		  Comments="Contact: Your local administrator"

 		  Description="Flexible IO Tester"

diff --git a/os/windows/posix.c b/os/windows/posix.c
index bfffe77..ce41ef8 100755
--- a/os/windows/posix.c
+++ b/os/windows/posix.c
@@ -7,6 +7,7 @@
 #include <netinet/in.h>
 #include <windows.h>
 #include <stddef.h>
+#include <string.h>
 #include <stdlib.h>
 #include <unistd.h>
 #include <dirent.h>
@@ -23,6 +24,24 @@
 extern unsigned long mtime_since_now(struct timeval *);
 extern void fio_gettime(struct timeval *, void *);
 
+/* These aren't defined in the MinGW headers */
+HRESULT WINAPI StringCchCopyA(
+  char *pszDest,
+  size_t cchDest,
+  const char *pszSrc);
+
+HRESULT WINAPI StringCchPrintfA(
+  char *pszDest,
+  size_t cchDest,
+  const char *pszFormat,
+  ...);
+
+int vsprintf_s(
+  char *buffer,
+  size_t numberOfElements,
+  const char *format,
+  va_list argptr);
+
 long sysconf(int name)
 {
 	long long val = -1;
@@ -119,11 +138,6 @@
 	return 0;
 }
 
-void syslog(int priority, const char *message, ... /* argument */)
-{
-	log_err("%s is not implemented\n", __func__);
-}
-
 int sigaction(int sig, const struct sigaction *act,
 		struct sigaction *oact)
 {
@@ -187,14 +201,43 @@
 	return (-1);
 }
 
+static HANDLE log_file = INVALID_HANDLE_VALUE;
+
 void openlog(const char *ident, int logopt, int facility)
 {
-	log_err("%s is not implemented\n", __func__);
+	if (log_file == INVALID_HANDLE_VALUE)	/* open only once; ident, logopt and facility are not used */
+		log_file = CreateFileA("syslog.txt", GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, 0, NULL);
 }
 
 void closelog(void)
 {
-	log_err("%s is not implemented\n", __func__);
+	CloseHandle(log_file);	/* NOTE(review): also reached when the log was never opened */
+	log_file = INVALID_HANDLE_VALUE;	/* lets a later openlog()/syslog() reopen the file */
+}
+
+void syslog(int priority, const char *message, ... /* argument */)
+{
+	va_list v;
+	int len;
+	char *output;
+	DWORD bytes_written;
+
+	if (log_file == INVALID_HANDLE_VALUE)	/* opened on demand when openlog() was skipped */
+		log_file = CreateFileA("syslog.txt", GENERIC_WRITE, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_ALWAYS, 0, NULL);
+	if (log_file == INVALID_HANDLE_VALUE) {
+		log_err("syslog: failed to open log file\n");
+		return;
+	}
+
+	va_start(v, message);	/* first pass only measures the formatted length */
+	len = _vscprintf(message, v);
+	va_end(v);
+	output = (len < 0) ? NULL : malloc(len + sizeof(char));
+	va_start(v, message);	/* restart: a va_list must not be reused once a callee consumed it */
+	if (output != NULL && vsprintf_s(output, len + sizeof(char), message, v) >= 0)
+		WriteFile(log_file, output, len, &bytes_written, NULL);
+	va_end(v);
+	free(output);
 }
 
 int kill(pid_t pid, int sig)
@@ -334,7 +377,7 @@
 
 int posix_fallocate(int fd, off_t offset, off_t len)
 {
-	const int BUFFER_SIZE = 64 * 1024 * 1024;
+	const int BUFFER_SIZE = 256 * 1024;
 	int rc = 0;
 	char *buf;
 	unsigned int write_len;
@@ -498,7 +541,6 @@
 /* Windows doesn't support advice for memory pages. Just ignore it. */
 int msync(void *addr, size_t len, int flags)
 {
-	log_err("%s is not implemented\n", __func__);
 	errno = ENOSYS;
 	return -1;
 }
@@ -665,23 +707,65 @@
 
 DIR *opendir(const char *dirname)
 {
-	log_err("%s is not implemented\n", __func__);
-	errno = ENOSYS;
-	return NULL;
+	struct dirent_ctx *dc = NULL;
+	/* See if we can open it. If not, we'll return an error here */
+	HANDLE file = CreateFileA(dirname, 0, FILE_SHARE_READ | FILE_SHARE_WRITE, NULL, OPEN_EXISTING, FILE_FLAG_BACKUP_SEMANTICS, NULL);
+
+	if (file != INVALID_HANDLE_VALUE) {
+		CloseHandle(file);
+		dc = malloc(sizeof(*dc));
+		if (dc != NULL) {	/* an allocation failure falls through and returns NULL */
+			StringCchCopyA(dc->dirname, MAX_PATH, dirname);
+			dc->find_handle = INVALID_HANDLE_VALUE;	/* the search starts on the first readdir() */
+		}
+	} else {
+		DWORD error = GetLastError();
+		if (error == ERROR_FILE_NOT_FOUND)
+			errno = ENOENT;
+		else if (error == ERROR_PATH_NOT_FOUND)
+			errno = ENOTDIR;
+		else if (error == ERROR_TOO_MANY_OPEN_FILES)
+			errno = ENFILE;
+		else if (error == ERROR_ACCESS_DENIED)
+			errno = EACCES;
+		else
+			errno = error;	/* unmapped Windows error codes are passed through as-is */
+	}
+	return dc;
 }
 
 int closedir(DIR *dirp)
 {
-	log_err("%s is not implemented\n", __func__);
-	errno = ENOSYS;
-	return -1;
+    if (dirp != NULL && dirp->find_handle != INVALID_HANDLE_VALUE)	/* a search handle exists only once readdir() started one */
+        FindClose(dirp->find_handle);
+
+    free(dirp);
+    return 0;
 }
 
 struct dirent *readdir(DIR *dirp)
 {
-	log_err("%s is not implemented\n", __func__);
-	errno = ENOSYS;
-	return NULL;
+	static struct dirent de;	/* single shared entry, overwritten by each call */
+	WIN32_FIND_DATAA find_data;	/* ANSI variant, matching the explicit *A calls below */
+
+	if (dirp == NULL)
+		return NULL;
+
+	if (dirp->find_handle == INVALID_HANDLE_VALUE) {
+		/* first call for this DIR: start a search over every entry in dirname */
+		char search_pattern[MAX_PATH];
+		StringCchPrintfA(search_pattern, MAX_PATH, "%s\\*", dirp->dirname);
+		dirp->find_handle = FindFirstFileA(search_pattern, &find_data);
+		if (dirp->find_handle == INVALID_HANDLE_VALUE)
+			return NULL;
+	} else if (!FindNextFileA(dirp->find_handle, &find_data)) {
+		return NULL;	/* no further entries, or the lookup failed */
+	}
+
+	StringCchCopyA(de.d_name, MAX_PATH, find_data.cFileName);
+	de.d_ino = 0;
+
+	return &de;
 }
 
 uid_t geteuid(void)
@@ -691,13 +775,6 @@
 	return -1;
 }
 
-int inet_aton(char *addr)
-{
-	log_err("%s is not implemented\n", __func__);
-	errno = ENOSYS;
-	return 0;
-}
-
 const char* inet_ntop(int af, const void *restrict src,
 		char *restrict dst, socklen_t size)
 {
diff --git a/os/windows/posix/include/dirent.h b/os/windows/posix/include/dirent.h
index ca4d4c9..eef6a88 100644
--- a/os/windows/posix/include/dirent.h
+++ b/os/windows/posix/include/dirent.h
@@ -1,13 +1,21 @@
 #ifndef DIRENT_H

 #define DIRENT_H

 

+#include <windows.h>

+

 struct dirent

 {

 	ino_t  d_ino;     /*  File serial number */

-	char   d_name[];  /* Name of entry */

+	char   d_name[MAX_PATH];  /* Name of entry */

 };

 

-typedef int DIR;

+struct dirent_ctx

+{

+	HANDLE find_handle;       /* Search handle; INVALID_HANDLE_VALUE until readdir() starts one */

+	char dirname[MAX_PATH];   /* Directory path copied in by opendir() */

+};

+

+typedef struct dirent_ctx DIR;

 

 DIR *opendir(const char *dirname);

 struct dirent *readdir(DIR *dirp);

diff --git a/parse.c b/parse.c
index 6317013..419e80f 100644
--- a/parse.c
+++ b/parse.c
@@ -413,11 +413,17 @@
 	case FIO_OPT_INT:
 	case FIO_OPT_STR_VAL: {
 		fio_opt_str_val_fn *fn = o->cb;
+		char tmp[128], *p;
+
+		strncpy(tmp, ptr, sizeof(tmp) - 1);
+		tmp[sizeof(tmp) - 1] = '\0';	/* strncpy() does not terminate tmp when ptr has 127+ chars */
+		if ((p = strchr(tmp, ',')) != NULL)
+			*p = '\0';
 
 		if (is_time)
-			ret = check_str_time(ptr, &ull);
+			ret = check_str_time(tmp, &ull);
 		else
-			ret = check_str_bytes(ptr, &ull, data);
+			ret = check_str_bytes(tmp, &ull, data);
 
 		if (ret)
 			break;
@@ -443,12 +449,32 @@
 					else
 						val_store(ilp, ull, o->off1, 0, data);
 				}
-				if (!more) {
+				if (curr == 1) {
 					if (o->roff2)
 						*(unsigned int *) o->roff2 = ull;
 					else if (o->off2)
 						val_store(ilp, ull, o->off2, 0, data);
 				}
+				if (curr == 2) {
+					if (o->roff3)
+						*(unsigned int *) o->roff3 = ull;
+					else if (o->off3)
+						val_store(ilp, ull, o->off3, 0, data);
+				}
+				if (!more) {
+					if (curr < 1) {
+						if (o->roff2)
+							*(unsigned int *) o->roff2 = ull;
+						else if (o->off2)
+							val_store(ilp, ull, o->off2, 0, data);
+					}
+					if (curr < 2) {
+						if (o->roff3)
+							*(unsigned int *) o->roff3 = ull;
+						else if (o->off3)
+							val_store(ilp, ull, o->off3, 0, data);
+					}
+				}
 			} else {
 				if (first) {
 					if (o->roff1)
@@ -597,12 +623,43 @@
 				else
 					val_store(ilp, ul2, o->off2, 0, data);
 			}
-			if (o->roff3 && o->roff4) {
-				*(unsigned int *) o->roff3 = ul1;
-				*(unsigned int *) o->roff4 = ul2;
-			} else if (o->off3 && o->off4) {
-				val_store(ilp, ul1, o->off3, 0, data);
-				val_store(ilp, ul2, o->off4, 0, data);
+			if (curr == 1) {
+				if (o->roff3 && o->roff4) {
+					*(unsigned int *) o->roff3 = ul1;
+					*(unsigned int *) o->roff4 = ul2;
+				} else if (o->off3 && o->off4) {
+					val_store(ilp, ul1, o->off3, 0, data);
+					val_store(ilp, ul2, o->off4, 0, data);
+				}
+			}
+			if (curr == 2) {
+				if (o->roff5 && o->roff6) {
+					*(unsigned int *) o->roff5 = ul1;
+					*(unsigned int *) o->roff6 = ul2;
+				} else if (o->off5 && o->off6) {
+					val_store(ilp, ul1, o->off5, 0, data);
+					val_store(ilp, ul2, o->off6, 0, data);
+				}
+			}
+			if (!more) {
+				if (curr < 1) {
+					if (o->roff3 && o->roff4) {
+						*(unsigned int *) o->roff3 = ul1;
+						*(unsigned int *) o->roff4 = ul2;
+					} else if (o->off3 && o->off4) {
+						val_store(ilp, ul1, o->off3, 0, data);
+						val_store(ilp, ul2, o->off4, 0, data);
+					}
+				}
+				if (curr < 2) {
+					if (o->roff5 && o->roff6) {
+						*(unsigned int *) o->roff5 = ul1;
+						*(unsigned int *) o->roff6 = ul2;
+					} else if (o->off5 && o->off6) {
+						val_store(ilp, ul1, o->off5, 0, data);
+						val_store(ilp, ul2, o->off6, 0, data);
+					}
+				}
 			}
 		}
 
@@ -706,7 +763,7 @@
 			ptr2 = strchr(ptr, ',');
 			if (ptr2 && *(ptr2 + 1) == '\0')
 				*ptr2 = '\0';
-			if (o->type != FIO_OPT_STR_MULTI) {
+			if (o->type != FIO_OPT_STR_MULTI && o->type != FIO_OPT_RANGE) {
 				if (!ptr2)
 					ptr2 = strchr(ptr, ':');
 				if (!ptr2)
diff --git a/parse.h b/parse.h
index 83cb5b1..7fee4fa 100644
--- a/parse.h
+++ b/parse.h
@@ -33,7 +33,7 @@
 };
 
 #define OPT_LEN_MAX 	4096
-#define PARSE_MAX_VP	16
+#define PARSE_MAX_VP	24
 
 /*
  * Option define
@@ -47,7 +47,9 @@
 	unsigned int off2;
 	unsigned int off3;
 	unsigned int off4;
-	void *roff1, *roff2, *roff3, *roff4;
+	unsigned int off5;
+	unsigned int off6;
+	void *roff1, *roff2, *roff3, *roff4, *roff5, *roff6;
 	unsigned int maxval;		/* max and min value */
 	int minval;
 	double maxfp;			/* max and min floating value */
diff --git a/server.h b/server.h
index a838126..938f20a 100644
--- a/server.h
+++ b/server.h
@@ -157,6 +157,12 @@
 extern void fio_server_send_ts(struct thread_stat *, struct group_run_stats *);
 extern void fio_server_send_gs(struct group_run_stats *);
 extern void fio_server_send_du(void);
+extern void fio_server_idle_loop(void);
+
+extern int fio_clients_connect(void);
+extern int fio_clients_send_ini(const char *);
+extern void fio_client_add_cmd_option(void *, const char *);
+extern void fio_client_add_ini_file(void *, const char *);
 
 extern int fio_recv_data(int sk, void *p, unsigned int len);
 extern int fio_send_data(int sk, const void *p, unsigned int len);
diff --git a/stat.c b/stat.c
index ef447b1..edeae87 100644
--- a/stat.c
+++ b/stat.c
@@ -10,6 +10,7 @@
 #include "fio.h"
 #include "diskutil.h"
 #include "lib/ieee754.h"
+#include "json.h"
 
 void update_rusage_stat(struct thread_data *td)
 {
@@ -257,12 +258,12 @@
 void show_group_stats(struct group_run_stats *rs)
 {
 	char *p1, *p2, *p3, *p4;
-	const char *ddir_str[] = { "   READ", "  WRITE" };
+	const char *ddir_str[] = { "   READ", "  WRITE" , "   TRIM"};
 	int i;
 
 	log_info("\nRun status group %d (all jobs):\n", rs->groupid);
 
-	for (i = 0; i <= DDIR_WRITE; i++) {
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		const int i2p = is_power_of_2(rs->kb_base);
 
 		if (!rs->max_run[i])
@@ -306,7 +307,7 @@
 static void stat_calc_lat(struct thread_stat *ts, double *dst,
 			  unsigned int *src, int nr)
 {
-	unsigned long total = ts_total_io_u(ts);
+	unsigned long total = ddir_rw_sum(ts->total_io_u);
 	int i;
 
 	/*
@@ -355,7 +356,7 @@
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 			     int ddir)
 {
-	const char *ddir_str[] = { "read ", "write" };
+	const char *ddir_str[] = { "read ", "write", "trim" };
 	unsigned long min, max, runt;
 	unsigned long long bw, iops;
 	double mean, dev;
@@ -488,9 +489,9 @@
 	time_t time_p;
 	char time_buf[64];
 
-
-	if (!(ts->io_bytes[0] + ts->io_bytes[1]) &&
-	    !(ts->total_io_u[0] + ts->total_io_u[1]))
+	if (!(ts->io_bytes[DDIR_READ] + ts->io_bytes[DDIR_WRITE] +
+	    ts->io_bytes[DDIR_TRIM]) && !(ts->total_io_u[DDIR_READ] +
+	    ts->total_io_u[DDIR_WRITE] + ts->total_io_u[DDIR_TRIM]))
 		return;
 
 	time(&time_p);
@@ -514,6 +515,8 @@
 		show_ddir_status(rs, ts, DDIR_READ);
 	if (ts->io_bytes[DDIR_WRITE])
 		show_ddir_status(rs, ts, DDIR_WRITE);
+	if (ts->io_bytes[DDIR_TRIM])
+		show_ddir_status(rs, ts, DDIR_TRIM);
 
 	show_latencies(ts);
 
@@ -531,7 +534,7 @@
 	log_info("  cpu          : usr=%3.2f%%, sys=%3.2f%%, ctx=%lu, majf=%lu,"
 		 " minf=%lu\n", usr_cpu, sys_cpu, ts->ctx, ts->majf, ts->minf);
 
-	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
+	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
 	log_info("  IO depths    : 1=%3.1f%%, 2=%3.1f%%, 4=%3.1f%%, 8=%3.1f%%,"
 		 " 16=%3.1f%%, 32=%3.1f%%, >=64=%3.1f%%\n", io_u_dist[0],
 					io_u_dist[1], io_u_dist[2],
@@ -635,6 +638,109 @@
 		log_info(";%lu;%lu;%f%%;%f;%f", 0UL, 0UL, 0.0, 0.0, 0.0);
 }
 
+/*
+ * Add a "read"/"write"/"trim" sub-object holding the stats of one data direction to parent.
+ */
+static void add_ddir_status_json(struct thread_stat *ts,
+		struct group_run_stats *rs, int ddir, struct json_object *parent)
+{
+	unsigned long min, max;
+	unsigned long long bw, iops;
+	unsigned int *ovals = NULL;
+	double mean, dev;
+	unsigned int len = 0, minv, maxv;
+	int i;
+	const char *ddirname[] = {"read", "write", "trim"};
+	struct json_object *dir_object, *tmp_object, *percentile_object;
+	char buf[120];
+	double p_of_agg = 100.0;
+
+	assert(ddir_rw(ddir));
+
+	dir_object = json_create_object();
+	json_object_add_value_object(parent, ddirname[ddir], dir_object);
+
+	iops = bw = 0;
+	if (ts->runtime[ddir]) {
+		uint64_t runt = ts->runtime[ddir];
+
+		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
+		iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt;
+	}
+
+	json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10);
+	json_object_add_value_int(dir_object, "bw", bw);
+	json_object_add_value_int(dir_object, "iops", iops);
+	json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]);
+
+	if (!calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) {
+		min = max = 0;
+		mean = dev = 0.0;
+	}
+	tmp_object = json_create_object();
+	json_object_add_value_object(dir_object, "slat", tmp_object);
+	json_object_add_value_int(tmp_object, "min", min);
+	json_object_add_value_int(tmp_object, "max", max);
+	json_object_add_value_float(tmp_object, "mean", mean);
+	json_object_add_value_float(tmp_object, "stddev", dev);
+
+	if (!calc_lat(&ts->clat_stat[ddir], &min, &max, &mean, &dev)) {
+		min = max = 0;
+		mean = dev = 0.0;
+	}
+	tmp_object = json_create_object();
+	json_object_add_value_object(dir_object, "clat", tmp_object);
+	json_object_add_value_int(tmp_object, "min", min);
+	json_object_add_value_int(tmp_object, "max", max);
+	json_object_add_value_float(tmp_object, "mean", mean);
+	json_object_add_value_float(tmp_object, "stddev", dev);
+
+	if (ts->clat_percentiles)	/* len stays 0 when percentiles are disabled */
+		len = calc_clat_percentiles(ts->io_u_plat[ddir],
+					ts->clat_stat[ddir].samples,
+					ts->percentile_list, &ovals, &maxv,
+					&minv);
+
+	percentile_object = json_create_object();
+	json_object_add_value_object(tmp_object, "percentile", percentile_object);
+	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
+		if (i >= len) {
+			json_object_add_value_int(percentile_object, "0.00", 0);
+			continue;
+		}
+		snprintf(buf, sizeof(buf) - 1, "%2.2f", ts->percentile_list[i].u.f);
+		json_object_add_value_int(percentile_object, (const char *)buf, ovals[i]);
+	}
+
+	if (!calc_lat(&ts->lat_stat[ddir], &min, &max, &mean, &dev)) {
+		min = max = 0;
+		mean = dev = 0.0;
+	}
+	tmp_object = json_create_object();
+	json_object_add_value_object(dir_object, "lat", tmp_object);
+	json_object_add_value_int(tmp_object, "min", min);
+	json_object_add_value_int(tmp_object, "max", max);
+	json_object_add_value_float(tmp_object, "mean", mean);
+	json_object_add_value_float(tmp_object, "stddev", dev);
+	free(ovals);
+
+	if (calc_lat(&ts->bw_stat[ddir], &min, &max, &mean, &dev)) {	/* non-zero means samples exist, as in the latency blocks above */
+		if (rs->agg[ddir]) {
+			p_of_agg = mean * 100 / (double) rs->agg[ddir];
+			if (p_of_agg > 100.0)
+				p_of_agg = 100.0;
+		}
+	} else {
+		min = max = 0;
+		p_of_agg = mean = dev = 0.0;
+	}
+	json_object_add_value_int(dir_object, "bw_min", min);
+	json_object_add_value_int(dir_object, "bw_max", max);
+	json_object_add_value_float(dir_object, "bw_agg", p_of_agg);	/* share of the group aggregate, not the mean */
+	json_object_add_value_float(dir_object, "bw_mean", mean);
+	json_object_add_value_float(dir_object, "bw_dev", dev);
+}
+
 static void show_thread_status_terse_v2(struct thread_stat *ts,
 					struct group_run_stats *rs)
 {
@@ -647,9 +753,11 @@
 	/* General Info */
 	log_info("2;%s;%d;%d", ts->name, ts->groupid, ts->error);
 	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, 0);
+	show_ddir_status_terse(ts, rs, DDIR_READ);
 	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, 1);
+	show_ddir_status_terse(ts, rs, DDIR_WRITE);
+	/* Log Trim Status */
+	show_ddir_status_terse(ts, rs, DDIR_TRIM);
 
 	/* CPU Usage */
 	if (ts->total_run_time) {
@@ -666,7 +774,7 @@
 								ts->minf);
 
 	/* Calc % distribution of IO depths, usecond, msecond latency */
-	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
+	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
@@ -693,10 +801,8 @@
 	log_info("\n");
 }
 
-#define FIO_TERSE_VERSION	"3"
-
-static void show_thread_status_terse_v3(struct thread_stat *ts,
-					struct group_run_stats *rs)
+static void show_thread_status_terse_v3_v4(struct thread_stat *ts,
+					   struct group_run_stats *rs, int ver)
 {
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
@@ -705,12 +811,15 @@
 	int i;
 
 	/* General Info */
-	log_info("%s;%s;%s;%d;%d", FIO_TERSE_VERSION, fio_version_string,
+	log_info("%d;%s;%s;%d;%d", ver, fio_version_string,
 					ts->name, ts->groupid, ts->error);
 	/* Log Read Status */
-	show_ddir_status_terse(ts, rs, 0);
+	show_ddir_status_terse(ts, rs, DDIR_READ);
 	/* Log Write Status */
-	show_ddir_status_terse(ts, rs, 1);
+	show_ddir_status_terse(ts, rs, DDIR_WRITE);
+	/* Log Trim Status */
+	if (ver == 4)
+		show_ddir_status_terse(ts, rs, DDIR_TRIM);
 
 	/* CPU Usage */
 	if (ts->total_run_time) {
@@ -727,7 +836,7 @@
 								ts->minf);
 
 	/* Calc % distribution of IO depths, usecond, msecond latency */
-	stat_calc_dist(ts->io_u_map, ts_total_io_u(ts), io_u_dist);
+	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
 	stat_calc_lat_u(ts, io_u_lat_u);
 	stat_calc_lat_m(ts, io_u_lat_m);
 
@@ -744,7 +853,7 @@
 		log_info(";%3.2f%%", io_u_lat_m[i]);
 
 	/* disk util stats, if any */
-	show_disk_util(1);
+	show_disk_util(1, NULL);
 
 	/* Additional output if continue_on_error set - default off*/
 	if (ts->continue_on_error)
@@ -757,13 +866,97 @@
 	log_info("\n");
 }
 
+/* Build and return a JSON object holding the per-job statistics found in ts. */
+static struct json_object *show_thread_status_json(struct thread_stat *ts,
+				    struct group_run_stats *rs)
+{
+	struct json_object *root, *tmp;
+	double io_u_dist[FIO_IO_U_MAP_NR];
+	double io_u_lat_u[FIO_IO_U_LAT_U_NR];
+	double io_u_lat_m[FIO_IO_U_LAT_M_NR];
+	double usr_cpu, sys_cpu;
+	int i;
+
+	root = json_create_object();
+	json_object_add_value_string(root, "jobname", ts->name);
+	json_object_add_value_int(root, "groupid", ts->groupid);
+	json_object_add_value_int(root, "error", ts->error);
+
+	add_ddir_status_json(ts, rs, DDIR_READ, root);
+	add_ddir_status_json(ts, rs, DDIR_WRITE, root);
+	add_ddir_status_json(ts, rs, DDIR_TRIM, root);
+
+	/* CPU Usage */
+	if (ts->total_run_time) {
+		double runt = (double) ts->total_run_time;
+
+		usr_cpu = (double) ts->usr_time * 100 / runt;
+		sys_cpu = (double) ts->sys_time * 100 / runt;
+	} else {
+		usr_cpu = 0;
+		sys_cpu = 0;
+	}
+	json_object_add_value_float(root, "usr_cpu", usr_cpu);
+	json_object_add_value_float(root, "sys_cpu", sys_cpu);
+	json_object_add_value_int(root, "ctx", ts->ctx);
+	json_object_add_value_int(root, "majf", ts->majf);
+	json_object_add_value_int(root, "minf", ts->minf);
+
+	/* Calc % distribution of IO depths, usecond, msecond latency */
+	stat_calc_dist(ts->io_u_map, ddir_rw_sum(ts->total_io_u), io_u_dist);
+	stat_calc_lat_u(ts, io_u_lat_u);
+	stat_calc_lat_m(ts, io_u_lat_m);
+
+	tmp = json_create_object();
+	json_object_add_value_object(root, "iodepth_level", tmp);
+	/* Only show fixed 7 I/O depth levels */
+	for (i = 0; i < 7; i++) {
+		char name[20];
+		if (i < 6)
+			snprintf(name, 19, "%d", 1 << i);
+		else
+			snprintf(name, 19, ">=%d", 1 << i);
+		json_object_add_value_float(tmp, (const char *)name, io_u_dist[i]);
+	}
+
+	tmp = json_create_object();
+	json_object_add_value_object(root, "latency_us", tmp);
+	/* Microsecond latency */
+	for (i = 0; i < FIO_IO_U_LAT_U_NR; i++) {
+		const char *ranges[] = { "2", "4", "10", "20", "50", "100",
+				 "250", "500", "750", "1000", };
+		json_object_add_value_float(tmp, ranges[i], io_u_lat_u[i]);
+	}
+	/* Millisecond latency */
+	tmp = json_create_object();
+	json_object_add_value_object(root, "latency_ms", tmp);
+	for (i = 0; i < FIO_IO_U_LAT_M_NR; i++) {
+		const char *ranges[] = { "2", "4", "10", "20", "50", "100",
+				 "250", "500", "750", "1000", "2000",
+				 ">=2000", };
+		json_object_add_value_float(tmp, ranges[i], io_u_lat_m[i]);
+	}
+
+	/* Additional output if continue_on_error set - default off*/
+	if (ts->continue_on_error) {
+		json_object_add_value_int(root, "total_err", ts->total_err_count);
+		json_object_add_value_int(root, "first_error", ts->first_error);	/* own key; a second "total_err" would duplicate the member name */
+	}
+
+	/* Additional output if description is set */
+	if (strlen(ts->description))
+		json_object_add_value_string(root, "desc", ts->description);
+
+	return root;
+}
+
 static void show_thread_status_terse(struct thread_stat *ts,
 				     struct group_run_stats *rs)
 {
 	if (terse_version == 2)
 		show_thread_status_terse_v2(ts, rs);
-	else if (terse_version == 3)
-		show_thread_status_terse_v3(ts, rs);
+	else if (terse_version == 3 || terse_version == 4)
+		show_thread_status_terse_v3_v4(ts, rs, terse_version);	/* ver == 4 additionally prints the trim stats */
 	else
 		log_err("fio: bad terse version!? %d\n", terse_version);
 }
@@ -807,7 +1000,7 @@
 {
 	int i;
 
-	for (i = 0; i < 2; i++) {
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		if (dst->max_run[i] < src->max_run[i])
 			dst->max_run[i] = src->max_run[i];
 		if (dst->min_run[i] && dst->min_run[i] > src->min_run[i])
@@ -827,7 +1020,7 @@
 {
 	int l, k;
 
-	for (l = 0; l <= DDIR_WRITE; l++) {
+	for (l = 0; l < DDIR_RWDIR_CNT; l++) {
 		sum_stat(&dst->clat_stat[l], &src->clat_stat[l], nr);
 		sum_stat(&dst->slat_stat[l], &src->slat_stat[l], nr);
 		sum_stat(&dst->lat_stat[l], &src->lat_stat[l], nr);
@@ -856,12 +1049,12 @@
 	for (k = 0; k < FIO_IO_U_LAT_M_NR; k++)
 		dst->io_u_lat_m[k] += src->io_u_lat_m[k];
 
-	for (k = 0; k <= 2; k++) {
+	for (k = 0; k < DDIR_RWDIR_CNT; k++) {
 		dst->total_io_u[k] += src->total_io_u[k];
 		dst->short_io_u[k] += src->short_io_u[k];
 	}
 
-	for (k = 0; k <= DDIR_WRITE; k++) {
+	for (k = 0; k < DDIR_RWDIR_CNT; k++) {
 		int m;
 		for (m = 0; m < FIO_IO_U_PLAT_NR; m++)
 			dst->io_u_plat[k][m] += src->io_u_plat[k][m];
@@ -874,9 +1067,11 @@
 
 void init_group_run_stat(struct group_run_stats *gs)
 {
+	int i;
 	memset(gs, 0, sizeof(*gs));
-	gs->min_bw[0] = gs->min_run[0] = ~0UL;
-	gs->min_bw[1] = gs->min_run[1] = ~0UL;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		gs->min_bw[i] = gs->min_run[i] = ~0UL;
 }
 
 void init_thread_stat(struct thread_stat *ts)
@@ -885,7 +1080,7 @@
 
 	memset(ts, 0, sizeof(*ts));
 
-	for (j = 0; j <= DDIR_WRITE; j++) {
+	for (j = 0; j < DDIR_RWDIR_CNT; j++) {
 		ts->lat_stat[j].min_val = -1UL;
 		ts->clat_stat[j].min_val = -1UL;
 		ts->slat_stat[j].min_val = -1UL;
@@ -901,6 +1096,8 @@
 	struct thread_stat *threadstats, *ts;
 	int i, j, nr_ts, last_ts, idx;
 	int kb_base_warned = 0;
+	struct json_object *root = NULL;
+	struct json_array *array = NULL;
 
 	runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1));
 
@@ -1007,7 +1204,7 @@
 		rs = &runstats[ts->groupid];
 		rs->kb_base = ts->kb_base;
 
-		for (j = 0; j <= DDIR_WRITE; j++) {
+		for (j = 0; j < DDIR_RWDIR_CNT; j++) {
 			if (!ts->runtime[j])
 				continue;
 			if (ts->runtime[j] < rs->min_run[j] || !rs->min_run[j])
@@ -1033,19 +1230,28 @@
 	}
 
 	for (i = 0; i < groupid + 1; i++) {
+		int ddir;
+
 		rs = &runstats[i];
 
-		if (rs->max_run[0])
-			rs->agg[0] = (rs->io_kb[0] * 1000) / rs->max_run[0];
-		if (rs->max_run[1])
-			rs->agg[1] = (rs->io_kb[1] * 1000) / rs->max_run[1];
+		for (ddir = 0; ddir < DDIR_RWDIR_CNT; ddir++) {
+			if (rs->max_run[ddir])
+				rs->agg[ddir] = (rs->io_kb[ddir] * 1000) /
+						rs->max_run[ddir];
+		}
 	}
 
 	/*
 	 * don't overwrite last signal output
 	 */
-	if (!terse_output)
+	if (output_format == FIO_OUTPUT_NORMAL)
 		log_info("\n");
+	else if (output_format == FIO_OUTPUT_JSON) {
+		root = json_create_object();
+		json_object_add_value_string(root, "fio version", fio_version_string);
+		array = json_create_array();
+		json_object_add_value_array(root, "jobs", array);
+	}
 
 	for (i = 0; i < nr_ts; i++) {
 		ts = &threadstats[i];
@@ -1053,11 +1259,22 @@
 
 		if (is_backend)
 			fio_server_send_ts(ts, rs);
-		else if (terse_output)
+		else if (output_format == FIO_OUTPUT_TERSE)
 			show_thread_status_terse(ts, rs);
-		else
+		else if (output_format == FIO_OUTPUT_JSON) {
+			struct json_object *tmp = show_thread_status_json(ts, rs);
+			json_array_add_value_object(array, tmp);
+		} else
 			show_thread_status(ts, rs);
 	}
+	if (output_format == FIO_OUTPUT_JSON) {
+		/* disk util stats, if any */
+		show_disk_util(1, root);
+
+		json_print_object(root);
+		log_info("\n");
+		json_free_object(root);
+	}
 
 	for (i = 0; i < groupid + 1; i++) {
 		rs = &runstats[i];
@@ -1065,14 +1282,14 @@
 		rs->groupid = i;
 		if (is_backend)
 			fio_server_send_gs(rs);
-		else if (!terse_output)
+		else if (output_format == FIO_OUTPUT_NORMAL)
 			show_group_stats(rs);
 	}
 
 	if (is_backend)
 		fio_server_send_du();
-	else if (!terse_output)
-		show_disk_util(0);
+	else if (output_format == FIO_OUTPUT_NORMAL)
+		show_disk_util(0, NULL);
 
 	free(runstats);
 	free(threadstats);
@@ -1094,10 +1311,13 @@
 			td->ts.runtime[DDIR_READ] += rt[i];
 		if (td_write(td) && td->io_bytes[DDIR_WRITE])
 			td->ts.runtime[DDIR_WRITE] += rt[i];
+		if (td_trim(td) && td->io_bytes[DDIR_TRIM])
+			td->ts.runtime[DDIR_TRIM] += rt[i];
 
 		update_rusage_stat(td);
-		td->ts.io_bytes[0] = td->io_bytes[0];
-		td->ts.io_bytes[1] = td->io_bytes[1];
+		td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
+		td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
+		td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
 		td->ts.total_run_time = mtime_since(&td->epoch, &tv);
 	}
 
@@ -1108,6 +1328,8 @@
 			td->ts.runtime[DDIR_READ] -= rt[i];
 		if (td_write(td) && td->io_bytes[DDIR_WRITE])
 			td->ts.runtime[DDIR_WRITE] -= rt[i];
+		if (td_trim(td) && td->io_bytes[DDIR_TRIM])
+			td->ts.runtime[DDIR_TRIM] -= rt[i];
 	}
 
 	free(rt);
@@ -1225,9 +1447,17 @@
 		mw = iolog->avg_window[DDIR_WRITE].mean.u.f + 0.50;
 		__add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed);
 	}
+	if (iolog->avg_window[DDIR_TRIM].samples) {
+		unsigned long mw;
+
+		mw = iolog->avg_window[DDIR_TRIM].mean.u.f + 0.50;
+		__add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed);
+	}
+
 
 	reset_io_stat(&iolog->avg_window[DDIR_READ]);
 	reset_io_stat(&iolog->avg_window[DDIR_WRITE]);
+	reset_io_stat(&iolog->avg_window[DDIR_TRIM]);
 	iolog->avg_last = elapsed;
 }
 
@@ -1312,7 +1542,7 @@
 	/*
 	 * Compute both read and write rates for the interval.
 	 */
-	for (ddir = DDIR_READ; ddir <= DDIR_WRITE; ddir++) {
+	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
 		uint64_t delta;
 
 		delta = td->this_io_bytes[ddir] - td->stat_io_bytes[ddir];
@@ -1347,7 +1577,7 @@
 	/*
 	 * Compute both read and write rates for the interval.
 	 */
-	for (ddir = DDIR_READ; ddir <= DDIR_WRITE; ddir++) {
+	for (ddir = DDIR_READ; ddir < DDIR_RWDIR_CNT; ddir++) {
 		uint64_t delta;
 
 		delta = td->this_io_blocks[ddir] - td->stat_io_blocks[ddir];
diff --git a/stat.h b/stat.h
index d7184aa..8a1536e 100644
--- a/stat.h
+++ b/stat.h
@@ -4,10 +4,10 @@
 #include "iolog.h"
 
 struct group_run_stats {
-	uint64_t max_run[2], min_run[2];
-	uint64_t max_bw[2], min_bw[2];
-	uint64_t io_kb[2];
-	uint64_t agg[2];
+	uint64_t max_run[DDIR_RWDIR_CNT], min_run[DDIR_RWDIR_CNT];
+	uint64_t max_bw[DDIR_RWDIR_CNT], min_bw[DDIR_RWDIR_CNT];
+	uint64_t io_kb[DDIR_RWDIR_CNT];
+	uint64_t agg[DDIR_RWDIR_CNT];
 	uint32_t kb_base;
 	uint32_t groupid;
 };
@@ -127,11 +127,11 @@
 	/*
 	 * bandwidth and latency stats
 	 */
-	struct io_stat clat_stat[2];		/* completion latency */
-	struct io_stat slat_stat[2];		/* submission latency */
-	struct io_stat lat_stat[2];		/* total latency */
-	struct io_stat bw_stat[2];		/* bandwidth stats */
-	struct io_stat iops_stat[2];		/* IOPS stats */
+	struct io_stat clat_stat[DDIR_RWDIR_CNT]; /* completion latency */
+	struct io_stat slat_stat[DDIR_RWDIR_CNT]; /* submission latency */
+	struct io_stat lat_stat[DDIR_RWDIR_CNT]; /* total latency */
+	struct io_stat bw_stat[DDIR_RWDIR_CNT]; /* bandwidth stats */
+	struct io_stat iops_stat[DDIR_RWDIR_CNT]; /* IOPS stats */
 
 	/*
 	 * fio system usage accounting
@@ -152,14 +152,14 @@
 	uint32_t io_u_complete[FIO_IO_U_MAP_NR];
 	uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR];
-	uint32_t io_u_plat[2][FIO_IO_U_PLAT_NR];
+	uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
 	uint64_t total_io_u[3];
 	uint64_t short_io_u[3];
 	uint64_t total_submit;
 	uint64_t total_complete;
 
-	uint64_t io_bytes[2];
-	uint64_t runtime[2];
+	uint64_t io_bytes[DDIR_RWDIR_CNT];
+	uint64_t runtime[DDIR_RWDIR_CNT];
 	uint64_t total_run_time;
 
 	/*
@@ -177,10 +177,10 @@
 	uint32_t nr_ramp;
 	uint32_t nr_pending;
 	uint32_t files_open;
-	uint32_t m_rate[2], t_rate[2];
-	uint32_t m_iops[2], t_iops[2];
-	uint32_t rate[2];
-	uint32_t iops[2];
+	uint32_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
+	uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
+	uint32_t rate[DDIR_RWDIR_CNT];
+	uint32_t iops[DDIR_RWDIR_CNT];
 	uint64_t elapsed_sec;
 	uint64_t eta_sec;
 	uint32_t is_pow2;
@@ -209,8 +209,6 @@
 extern void stat_calc_lat_u(struct thread_stat *ts, double *io_u_lat);
 extern void stat_calc_dist(unsigned int *map, unsigned long total, double *io_u_dist);
 
-#define ts_total_io_u(ts)	((ts)->total_io_u[0] + (ts)->total_io_u[1])
-
 static inline int usec_to_msec(unsigned long *min, unsigned long *max,
 			       double *mean, double *dev)
 {
diff --git a/thread_options.h b/thread_options.h
index a78684c..323dacd 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -61,12 +61,12 @@
 	unsigned long long file_size_high;
 	unsigned long long start_offset;
 
-	unsigned int bs[2];
-	unsigned int ba[2];
-	unsigned int min_bs[2];
-	unsigned int max_bs[2];
-	struct bssplit *bssplit[2];
-	unsigned int bssplit_nr[2];
+	unsigned int bs[DDIR_RWDIR_CNT];
+	unsigned int ba[DDIR_RWDIR_CNT];
+	unsigned int min_bs[DDIR_RWDIR_CNT];
+	unsigned int max_bs[DDIR_RWDIR_CNT];
+	struct bssplit *bssplit[DDIR_RWDIR_CNT];
+	unsigned int bssplit_nr[DDIR_RWDIR_CNT];
 
 	unsigned int nr_files;
 	unsigned int open_files;
@@ -181,11 +181,11 @@
 	char *exec_prerun;
 	char *exec_postrun;
 
-	unsigned int rate[2];
-	unsigned int ratemin[2];
+	unsigned int rate[DDIR_RWDIR_CNT];
+	unsigned int ratemin[DDIR_RWDIR_CNT];
 	unsigned int ratecycle;
-	unsigned int rate_iops[2];
-	unsigned int rate_iops_min[2];
+	unsigned int rate_iops[DDIR_RWDIR_CNT];
+	unsigned int rate_iops_min[DDIR_RWDIR_CNT];
 
 	char *ioscheduler;