resolved conflicts for merge of 90531c44 to master

Change-Id: Ia51e784b350063ea085d7cc62bd693aa3a5be455
diff --git a/.gitignore b/.gitignore
index 3993a30..c9d90fb 100644
--- a/.gitignore
+++ b/.gitignore
@@ -7,3 +7,5 @@
 /config.log
 /cscope.out
 /fio
+y.tab.*
+lex.yy.c
diff --git a/Android.mk b/Android.mk
index 7002f00..3e0b5ab 100644
--- a/Android.mk
+++ b/Android.mk
@@ -29,16 +29,18 @@
                   smalloc.c filehash.c helpers.c profile.c debug.c backend.c \
                   cconv.c client.c filelock.c flow.c gettime-thread.c idletime.c io_u_queue.c \
                   iolog.c json.c libfio.c memalign.c profiles/act.c profiles/tiobench.c server.c \
-                  td_error.c diskutil.c blktrace.c trim.c fifo.c cgroup.c \
+                  td_error.c diskutil.c blktrace.c trim.c fifo.c cgroup.c
 
 lib_src_files := lib/rbtree.c lib/flist_sort.c lib/getrusage.c lib/hweight.c lib/ieee754.c lib/lfsr.c \
                  lib/num2str.c lib/prio_tree.c lib/rand.c lib/zipf.c lib/inet_aton.c lib/axmap.c \
+                 lib/bloom.c lib/linux-dev-lookup.c lib/tp.c
 
 crc_src_files := crc/crc7.c crc/crc16.c crc/crc32.c crc/crc64.c crc/crc32c.c crc/crc32c-intel.c \
                  crc/sha1.c crc/sha256.c crc/sha512.c crc/md5.c crc/test.c crc/xxhash.c \
+                 crc/fnv.c crc/murmur3.c
 
-engines_src_files := engines/binject.c engines/cpu.c engines/mmap.c engines/null.c engines/net.c \
-                     engines/sg.c engines/sync.c \
+engines_src_files := engines/cpu.c engines/mmap.c engines/null.c engines/net.c \
+                     engines/sg.c engines/sync.c engines/gfapi.h
 
 engines_src_files_64 := engines/splice.c
 
@@ -56,7 +58,7 @@
 LOCAL_SHARED_LIBRARIES := libdl
 LOCAL_STATIC_LIBRARIES := libcutils libz
 
-LOCAL_CFLAGS += -DFIO_VERSION="\"fio-2.1.8-80-g890b\"" \
+LOCAL_CFLAGS += -DFIO_VERSION="\"fio-2.2.6\"" \
                 -DCONFIG_3ARG_AFFINITY \
                 -DCONFIG_CLOCK_GETTIME \
                 -DCONFIG_CLOCK_MONOTONIC \
@@ -66,7 +68,6 @@
                 -DCONFIG_IPV6 \
                 -DCONFIG_LINUX_FALLOCATE \
                 -DCONFIG_LITTLE_ENDIAN \
-                -DCONFIG_POSIX_FALLOCATE \
                 -DCONFIG_RLIMIT_MEMLOCK \
                 -DCONFIG_RUSAGE_THREAD \
                 -DCONFIG_SCHED_IDLE \
diff --git a/FIO-VERSION-GEN b/FIO-VERSION-GEN
index 4cb3545..9ae7b7d 100755
--- a/FIO-VERSION-GEN
+++ b/FIO-VERSION-GEN
@@ -1,7 +1,7 @@
 #!/bin/sh
 
 GVF=FIO-VERSION-FILE
-DEF_VER=fio-2.1.9
+DEF_VER=fio-2.2.6
 
 LF='
 '
diff --git a/HOWTO b/HOWTO
index 1c89d75..0f7909d 100644
--- a/HOWTO
+++ b/HOWTO
@@ -81,7 +81,7 @@
 and it will start doing what the job_file tells it to do. You can give
 more than one job file on the command line, fio will serialize the running
 of those files. Internally that is the same as using the 'stonewall'
-parameter described the the parameter section.
+parameter described in the parameter section.
 
 If the job file contains only one job, you may as well just give the
 parameters on the command line. The command line parameters are identical
@@ -159,6 +159,41 @@
 
 $ fio --name=random-writers --ioengine=libaio --iodepth=4 --rw=randwrite --bs=32k --direct=0 --size=64m --numjobs=4
 
+When fio is utilized as the basis of any reasonably large test suite, it
+might be desirable to share a set of standardized settings across multiple
+job files. Instead of copy/pasting such settings, any section may pull in
+an external .fio file with the 'include filename' directive, as in the
+following example:
+
+; -- start job file including.fio --
+[global]
+filename=/tmp/test
+filesize=1m
+include glob-include.fio
+
+[test]
+rw=randread
+bs=4k
+time_based=1
+runtime=10
+include test-include.fio
+; -- end job file including.fio --
+
+; -- start job file glob-include.fio --
+thread=1
+group_reporting=1
+; -- end job file glob-include.fio --
+
+; -- start job file test-include.fio --
+ioengine=libaio
+iodepth=4
+; -- end job file test-include.fio --
+
+Settings pulled into a section apply to that section only (except the global
+section). Include directives may be nested: any included file may contain
+further include directives. Include files may not contain [] sections.
+
+
 4.1 Environment variables
 -------------------------
 
@@ -218,7 +253,20 @@
 
 This section describes in details each parameter associated with a job.
 Some parameters take an option of a given type, such as an integer or
-a string. The following types are used:
+a string. Anywhere a numeric value is required, an arithmetic expression
+may be used, provided it is surrounded by parentheses. Supported operators
+are:
+
+	addition (+)
+	subtraction (-)
+	multiplication (*)
+	division (/)
+	modulus (%)
+	exponentiation (^)
+
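+As an illustration, a (hypothetical) job could express its sizes
+arithmetically:
+
+; -- start job file expr-example.fio --
+[expr-test]
+rw=randread
+bs=(4*1024)
+size=(16*1024*1024)
+; -- end job file expr-example.fio --
+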
+For time values in expressions, units are microseconds by default. This is
+different from time values not in expressions (i.e. not enclosed in
+parentheses). The following types are used:
 
 str	String. This is a sequence of alpha characters.
 time	Integer with possible time suffix. In seconds unless otherwise
@@ -344,7 +392,7 @@
 		For certain types of io the result may still be skewed a bit,
 		since the speed may be different. It is possible to specify
 		a number of IO's to do before getting a new offset, this is
-		one by appending a ':<nr>' to the end of the string given.
+		done by appending a ':<nr>' to the end of the string given.
 		For a random read, it would look like 'rw=randread:8' for
 		passing in an offset modifier with a value of 8. If the
 		suffix is used with a sequential IO pattern, then the value
@@ -391,12 +439,6 @@
 		If not set, the random sequence depends on the randrepeat
 		setting.
 
-use_os_rand=bool Fio can either use the random generator supplied by the OS
-		to generator random offsets, or it can use it's own internal
-		generator (based on Tausworthe). Default is to use the
-		internal generator, which is often of better quality and
-		faster.
-
 fallocate=str	Whether pre-allocation is performed when laying down files.
 		Accepted values are:
 
@@ -420,23 +462,26 @@
 
 size=int	The total size of file io for this job. Fio will run until
 		this many bytes has been transferred, unless runtime is
-		limited by other options (such as 'runtime', for instance).
-		Unless specific nrfiles and filesize options are given,
-		fio will divide this size between the available files
-		specified by the job. If not set, fio will use the full
-		size of the given files or devices. If the the files
-		do not exist, size must be given. It is also possible to
-		give size as a percentage between 1 and 100. If size=20%
-		is given, fio will use 20% of the full size of the given
-		files or devices.
+		limited by other options (such as 'runtime', for instance,
+		or increased/decreased by 'io_size'). Unless specific nrfiles
+		and filesize options are given, fio will divide this size
+		between the available files specified by the job. If not set,
+		fio will use the full size of the given files or devices.
+		If the files do not exist, size must be given. It is also
+		possible to give size as a percentage between 1 and 100. If
+		size=20% is given, fio will use 20% of the full size of the
+		given files or devices.
 
+io_size=int
 io_limit=int	Normally fio operates within the region set by 'size', which
 		means that the 'size' option sets both the region and size of
 		IO to be performed. Sometimes that is not what you want. With
 		this option, it is possible to define just the amount of IO
 		that fio should do. For instance, if 'size' is set to 20G and
-		'io_limit' is set to 5G, fio will perform IO within the first
-		20G but exit when 5G have been done.
+		'io_size' is set to 5G, fio will perform IO within the first
+		20G but exit when 5G have been done. The opposite is also
+		possible - if 'size' is set to 20G, and 'io_size' is set to
+		40G, then fio will do 40G of IO within the 0..20G region.
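+
+		For example, the second case above could be expressed as
+		(illustrative values)
+
+		size=20g
+		io_size=40g
+
+		which would issue 40G of IO, all of it within the first 20G
+		of the file or device.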
 
 filesize=int	Individual file sizes. May be a range, in which case fio
 		will select sizes for files at random within the given range
@@ -519,7 +564,7 @@
 		while having 90% 4k writes and 10% 8k writes, you would
 		specify:
 
-		bssplit=2k/50:4k/50,4k/90,8k/10
+		bssplit=2k/50:4k/50,4k/90:8k/10
 
 blocksize_unaligned
 bs_unaligned	If this option is given, any byte size value within bsrange
@@ -552,9 +597,12 @@
 buffer_compress_percentage=int	If this is set, then fio will attempt to
 		provide IO buffer content (on WRITEs) that compress to
 		the specified level. Fio does this by providing a mix of
-		random data and zeroes. Note that this is per block size
-		unit, for file/disk wide compression level that matches
-		this setting, you'll also want to set refill_buffers.
+		random data and a fixed pattern. The fixed pattern is either
+		zeroes, or the pattern specified by buffer_pattern. If the
+		pattern option is used, it might skew the compression ratio
+		slightly. Note that this is per block size unit, for file/disk
+		wide compression level that matches this setting, you'll also
+		want to set refill_buffers.
 
 buffer_compress_chunk=int	See buffer_compress_percentage. This
 		setting allows fio to manage how big the ranges of random
@@ -565,10 +613,20 @@
 		alternate random and zeroed data throughout the IO
 		buffer.
 
-buffer_pattern=str	If set, fio will fill the io buffers with this pattern.
-		If not set, the contents of io buffers is defined by the other
-		options related to buffer contents. The setting can be any
-		pattern of bytes, and can be prefixed with 0x for hex values.
+buffer_pattern=str	If set, fio will fill the io buffers with this
+		pattern. If not set, the contents of io buffers are defined by
+		the other options related to buffer contents. The setting can
+		be any pattern of bytes, and can be prefixed with 0x for hex
+		values. It may also be a string, in which case the string must
+		be wrapped with "".
+
+dedupe_percentage=int	If set, fio will generate this percentage of
+		identical buffers when writing. These buffers will be
+		naturally dedupable. The contents of the buffers depend on
+		what other buffer compression settings have been set. It's
+		possible to have the individual buffers either fully
+		compressible, or not at all. This option only controls the
+		distribution of unique buffers.
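+
+		As a rough illustration (values chosen arbitrarily), a write
+		job combining these buffer options might set
+
+		rw=write
+		refill_buffers
+		buffer_compress_percentage=50
+		dedupe_percentage=30
+
+		asking fio for buffers that compress to roughly 50%, with
+		roughly 30% of the written buffers being duplicates of each
+		other.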
 
 nrfiles=int	Number of files to use for this job. Defaults to 1.
 
@@ -670,15 +728,45 @@
 				channel semantics (Send/Recv) for the
 				InfiniBand, RoCE and iWARP protocols.
 
-			falloc   IO engine that does regular fallocate to
-				 simulate data transfer as fio ioengine.
-				 DDIR_READ  does fallocate(,mode = keep_size,)
-				 DDIR_WRITE does fallocate(,mode = 0)
-				 DDIR_TRIM  does fallocate(,mode = punch_hole)
+			falloc	IO engine that does regular fallocate to
+				simulate data transfer as fio ioengine.
+				DDIR_READ  does fallocate(,mode = keep_size,)
+				DDIR_WRITE does fallocate(,mode = 0)
+				DDIR_TRIM  does fallocate(,mode = punch_hole)
 
 			e4defrag IO engine that does regular EXT4_IOC_MOVE_EXT
-				 ioctls to simulate defragment activity in
-				 request to DDIR_WRITE event
+				ioctls to simulate defragment activity in
+				request to DDIR_WRITE event
+
+			rbd	IO engine supporting direct access to Ceph
+				Rados Block Devices (RBD) via librbd without
+				the need to use the kernel rbd driver. This
+				ioengine defines engine specific options.
+
+			gfapi	Using Glusterfs libgfapi sync interface to
+				direct access to Glusterfs volumes without
+				having to go through FUSE.
+
+			gfapi_async Using Glusterfs libgfapi async interface
+				to direct access to Glusterfs volumes without
+				having to go through FUSE. This ioengine
+				defines engine specific options.
+
+			libhdfs	Read and write through Hadoop (HDFS).
+				The 'filename' option is used to specify the
+				host and port of the hdfs name-node to connect
+				to. This engine interprets offsets a little
+				differently. In HDFS, files once created
+				cannot be modified, so random writes are not
+				possible. To imitate this, the libhdfs engine
+				expects a bunch of small files to be created
+				over HDFS, and the engine will randomly pick a
+				file from those files based on the offset
+				generated by the fio backend (see the example
+				job file on how to create such files; use the
+				rw=write option). Please note that you might
+				want to set the necessary environment variables
+				to work with hdfs/libhdfs properly.
 
 			external Prefix to specify loading an external
 				IO engine object file. Append the engine
@@ -736,18 +824,21 @@
 		caps the file size at real_size - offset.
 
 offset_increment=int	If this is provided, then the real offset becomes
-		the offset + offset_increment * thread_number, where the
-		thread number is a counter that starts at 0 and is incremented
-		for each job. This option is useful if there are several jobs
-		which are intended to operate on a file in parallel in disjoint
-		segments, with even spacing between the starting points.
+		offset + offset_increment * thread_number, where the thread
+		number is a counter that starts at 0 and is incremented for
+		each sub-job (i.e. when the numjobs option is specified). This
+		option is useful if there are several jobs which are intended
+		to operate on a file in parallel in disjoint segments, with
+		even spacing between the starting points.
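+
+		As an example (illustrative device name and sizes), four
+		sub-jobs writing disjoint 1G segments of the same device
+		could be set up with
+
+		rw=write
+		filename=/dev/sdX
+		numjobs=4
+		size=1g
+		offset_increment=1g
+
+		so that sub-job number N (counting from 0) starts its IO at
+		offset N * 1g.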
 
 number_ios=int	Fio will normally perform IOs until it has exhausted the size
 		of the region set by size=, or if it exhaust the allocated
 		time (or hits an error condition). With this setting, the
 		range/size can be set independently of the number of IOs to
 		perform. When fio reaches this number, it will exit normally
-		and report status.
+		and report status. Note that this does not extend the amount
+		of IO that will be done; it will only stop fio if this
+		condition is met before other end-of-job criteria.
 
 fsync=int	If writing to a file, issue a sync of the dirty data
 		for every number of blocks given. For example, if you give
@@ -828,10 +919,10 @@
 		random IO. If this option is given, fio will just get a
 		new random offset without looking at past io history. This
 		means that some blocks may not be read or written, and that
-		some blocks may be read/written more than once. This option
-		is mutually exclusive with verify= if and only if multiple
-		blocksizes (via bsrange=) are used, since fio only tracks
-		complete rewrites of blocks.
+		some blocks may be read/written more than once. If this option
+		is used with verify= and multiple blocksizes (via bsrange=),
+		only intact blocks are verified, i.e., partially-overwritten
+		blocks are ignored.
 
 softrandommap=bool See norandommap. If fio runs with the random block map
 		enabled and it fails to allocate the map, if this option is
@@ -1231,6 +1322,21 @@
 		if verify_backlog_batch is larger than verify_backlog, some
 		blocks will be verified more than once.
 
+verify_state_save=bool	When a job exits during the write phase of a verify
+		workload, save its current state. This allows fio to replay
+		up until that point, if the verify state is loaded for the
+		verify read phase. The format of the filename is, roughly,
+		<type>-<jobname>-<jobindex>-verify.state. <type> is "local"
+		for a local run, "sock" for a client/server socket connection,
+		and "ip" (192.168.0.1, for instance) for a networked
+		client/server connection.
+
+verify_state_load=bool	If a verify termination trigger was used, fio stores
+		the current write state of each thread. This can be used at
+		verification time so that fio knows how far it should verify.
+		Without this information, fio will run a full verification
+		pass, according to the settings in the job file used.
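+
+		A minimal sketch of how verify_state_save and
+		verify_state_load pair up (illustrative settings; a real job
+		will typically set additional verify options): the write
+		phase job sets
+
+		rw=write
+		verify=crc32c
+		verify_state_save=1
+
+		and the later read/verify job of the same region sets
+
+		rw=read
+		verify=crc32c
+		verify_state_load=1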
+
 stonewall
 wait_for_previous Wait for preceding jobs in the job file to exit, before
 		starting this one. Can be used to insert serialization
@@ -1307,7 +1413,9 @@
 		jobs in their lifetime. The included fio_generate_plots
 		script uses gnuplot to turn these text files into nice
 		graphs. See write_lat_log for behaviour of given
-		filename. For this option, the suffix is _bw.log.
+		filename. For this option, the suffix is _bw.x.log, where
+		x is the index of the job (1..N, where N is the number of
+		jobs).
 
 write_lat_log=str Same as write_bw_log, except that this option stores io
 		submission, completion, and total latencies instead. If no
@@ -1317,14 +1425,16 @@
 
 		write_lat_log=foo
 
-		The actual log names will be foo_slat.log, foo_clat.log,
-		and foo_lat.log. This helps fio_generate_plot fine the logs
-		automatically.
+		The actual log names will be foo_slat.x.log, foo_clat.x.log,
+		and foo_lat.x.log, where x is the index of the job (1..N,
+		where N is the number of jobs). This helps fio_generate_plots
+		find the logs automatically.
 
 write_iops_log=str Same as write_bw_log, but writes IOPS. If no filename is
 		given with this option, the default filename of
-		"jobname_type.log" is used. Even if the filename is given,
-		fio will still append the type of log.
+		"jobname_type.x.log" is used, where x is the index of the job
+		(1..N, where N is the number of jobs). Even if the filename
+		is given, fio will still append the type of log.
 
 log_avg_msec=int By default, fio will log an entry in the iops, latency,
 		or bw log for every IO that completes. When writing to the
@@ -1333,6 +1443,29 @@
 		specified period of time, reducing the resolution of the log.
 		Defaults to 0.
 
+log_offset=int	If this is set, the iolog options will include the byte
+		offset for the IO entry as well as the other data values.
+
+log_compression=int	If this is set, fio will compress the IO logs as
+		it goes, to keep the memory footprint lower. When a log
+		reaches the specified size, that chunk is removed and
+		compressed in the background. Given that IO logs are
+		fairly highly compressible, this yields a nice memory
+		savings for longer runs. The downside is that the
+		compression will consume some background CPU cycles, so
+		it may impact the run. This, however, is also true if
+		the logging ends up consuming most of the system memory.
+		So pick your poison. The IO logs are saved normally at the
+		end of a run, by decompressing the chunks and storing them
+		in the specified log file. This feature depends on the
+		availability of zlib.
+
+log_store_compressed=bool	If set, and log_compression is also set,
+		fio will store the log files in a compressed format. They
+		can be decompressed with fio, using the --inflate-log
+		command line parameter. The files will be stored with a
+		.fz suffix.
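+
+		A small sketch combining the two options (illustrative
+		values):
+
+		write_bw_log=foo
+		log_compression=10M
+		log_store_compressed=1
+
+		The stored logs (named as described for write_bw_log above,
+		carrying the .fz suffix) can later be decompressed with the
+		--inflate-log command line parameter.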
+
 lockmem=int	Pin down the specified amount of memory with mlock(2). Can
 		potentially be used instead of removing memory or booting
 		with less memory to simulate a smaller amount of memory.
@@ -1522,7 +1655,9 @@
 		address.
 
 [netsplice] port=int
-[net] port=int	The TCP or UDP port to bind to or connect to.
+[net] port=int	The TCP or UDP port to bind to or connect to. If this is used
+with numjobs to spawn multiple instances of the same job type, then this will
+be the starting port number since fio will use a range of ports.
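+
+For example (illustrative values), combining
+
+numjobs=4
+port=8888
+
+makes the four sub-jobs use ports 8888..8891.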
 
 [netsplice] interface=str
 [net] interface=str  The IP address of the network interface used to send or
@@ -1554,6 +1689,7 @@
 [net] listen	For TCP network connections, tell fio to listen for incoming
 		connections rather than initiating an outgoing connection. The
 		hostname must be omitted if this option is used.
+
 [net] pingpong	Normaly a network writer will just continue writing data, and
 		a network reader will just consume packages. If pingpong=1
 		is set, a writer will send its normal payload to the reader,
@@ -1566,6 +1702,10 @@
 		single reader when multiple readers are listening to the same
 		address.
 
+[net] window_size	Set the desired socket buffer size for the connection.
+
+[net] mss	Set the TCP maximum segment size (TCP_MAXSEG).
+
 [e4defrag] donorname=str
 	        File will be used as a block donor(swap extents between files)
 [e4defrag] inplace=int
@@ -1607,6 +1747,15 @@
 X		Thread reaped, exited with an error.
 K		Thread reaped, exited due to signal.
 
+Fio will condense the thread string so as not to take up more space on the
+command line than is needed. For instance, if you have 10 readers and 10
+writers running, the output would look like this:
+
+Jobs: 20 (f=20): [R(10),W(10)] [4.0% done] [2103MB/0KB/0KB /s] [538K/0/0 iops] [eta 57m:36s]
+
+Fio will still maintain the ordering, though. So the above means that jobs
+1..10 are readers, and 11..20 are writers.
+
 The other values are fairly self explanatory - number of threads
 currently running and doing io, rate of io since last check (read speed
 listed first, then write speed), and the estimated completion percentage
@@ -1850,3 +1999,81 @@
 and standard deviation of time to complete an unit work is reported in "unit
 work" section. Options can be chosen to report detailed percpu idleness or
 overall system idleness by aggregating percpu stats.
+
+
+10.0 Verification and triggers
+------------------------------
+When data verification is done, fio is usually run in one of two ways. The
+first is a normal write job of some sort with verify enabled. When the
+write phase has completed, fio switches to reads and verifies everything
+it wrote. The second model is running just the write phase, and then later
+on running the same job (but with reads instead of writes) to repeat the
+same IO patterns and verify the contents. Both of these methods depend
+on the write phase being completed, as fio otherwise has no idea how much
+data was written.
+
+With verification triggers, fio supports dumping the current write state
+to local files. Then a subsequent read verify workload can load this state
+and know exactly where to stop. This is useful for testing cases where
+power is cut to a server in a managed fashion, for instance.
+
+A verification trigger consists of two things:
+
+1) Storing the write state of each job
+2) Executing a trigger command
+
+The write state is relatively small, on the order of hundreds of bytes
+to single kilobytes. It contains information on the number of completions
+done, the last X completions, etc.
+
+A trigger is invoked either through creation ('touch') of a specified
+file in the system, or through a timeout setting. If fio is run with
+--trigger-file=/tmp/trigger-file, then it will continually check for
+the existence of /tmp/trigger-file. When it sees this file, it will
+fire off the trigger (thus saving state, and executing the trigger
+command).
+
+For client/server runs, there's both a local and remote trigger. If
+fio is running as a server backend, it will send the job states back
+to the client for safe storage, then execute the remote trigger, if
+specified. If a local trigger is specified, the server will still send
+back the write state, but the client will then execute the trigger.
+
+10.1 Verification trigger example
+---------------------------------
+Let's say we want to run a powercut test on the remote machine 'server'.
+Our write workload is in write-test.fio. We want to cut power to 'server'
+at some point during the run, and we'll run this test from the safety
+of our local machine, 'localbox'. On the server, we'll start the fio
+backend normally:
+
+server# fio --server
+
+and on the client, we'll fire off the workload:
+
+localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger-remote="bash -c \"echo b > /proc/sysrq-trigger\""
+
+We set /tmp/my-trigger as the trigger file, and we tell fio to execute
+
+echo b > /proc/sysrq-trigger
+
+on the server once it has received the trigger and sent us the write
+state. This will work, but it's not _really_ cutting power to the server,
+it's merely abruptly rebooting it. If we have a remote way of cutting
+power to the server through IPMI or similar, we could do that through
+a local trigger command instead. Let's assume we have a script that does
+an IPMI reboot of a given hostname, ipmi-reboot. On localbox, we could
+then run fio with a local trigger instead:
+
+localbox$ fio --client=server --trigger-file=/tmp/my-trigger --trigger="ipmi-reboot server"
+
+For this case, fio would wait for the server to send us the write state,
+then execute 'ipmi-reboot server' when that happened.
+
+10.2 Loading verify state
+-------------------------
+To load stored write state, the read verification job file must contain
+the verify_state_load option. If that is set, fio will load the previously
+stored state. For a local fio run this is done by loading the files directly,
+and on a client/server run, the server backend will ask the client to send
+the files over and load them from there.
diff --git a/LICENSE b/MORAL-LICENSE
similarity index 82%
rename from LICENSE
rename to MORAL-LICENSE
index d7c0b1b..8ef3f26 100644
--- a/LICENSE
+++ b/MORAL-LICENSE
@@ -1,8 +1,8 @@
 As specified by the COPYING file, fio is free software published under version
-2 of the GPL license. That covers the copying part of the license. By using fio,
-you are also promising to uphold the following moral obligations:
+2 of the GPL license. That covers the copying part of the license. When using
+fio, you are encouraged to uphold the following moral obligations:
 
-- If you publish results that are done using fio, it must be clearly stated
+- If you publish results that are done using fio, it should be clearly stated
   that fio was used. The specific version should also be listed.
 
 - If you develop features or bug fixes for fio, they should be sent upstream
diff --git a/Makefile b/Makefile
index a0f0f71..52e515b 100644
--- a/Makefile
+++ b/Makefile
@@ -35,7 +35,15 @@
 		cconv.c lib/prio_tree.c json.c lib/zipf.c lib/axmap.c \
 		lib/lfsr.c gettime-thread.c helpers.c lib/flist_sort.c \
 		lib/hweight.c lib/getrusage.c idletime.c td_error.c \
-		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c
+		profiles/tiobench.c profiles/act.c io_u_queue.c filelock.c \
+		lib/tp.c lib/bloom.c
+
+ifdef CONFIG_LIBHDFS
+  HDFSFLAGS= -I $(JAVA_HOME)/include -I $(JAVA_HOME)/include/linux -I $(FIO_LIBHDFS_INCLUDE)
+  HDFSLIB= -Wl,-rpath $(JAVA_HOME)/jre/lib/amd64/server -L$(JAVA_HOME)/jre/lib/amd64/server -ljvm $(FIO_LIBHDFS_LIB)/libhdfs.a
+  CFLAGS += $(HDFSFLAGS)
+  SOURCE += engines/libhdfs.c
+endif
 
 ifdef CONFIG_64BIT_LLP64
   CFLAGS += -DBITS_PER_LONG=32
@@ -91,15 +99,24 @@
 ifndef CONFIG_INET_ATON
   SOURCE += lib/inet_aton.c
 endif
+ifdef CONFIG_GFAPI
+  SOURCE += engines/glusterfs.c
+  SOURCE += engines/glusterfs_sync.c
+  SOURCE += engines/glusterfs_async.c
+  ifdef CONFIG_GF_FADVISE
+    CFLAGS += "-DGFAPI_USE_FADVISE"
+  endif
+endif
 
 ifeq ($(CONFIG_TARGET_OS), Linux)
   SOURCE += diskutil.c fifo.c blktrace.c cgroup.c trim.c engines/sg.c \
-		engines/binject.c
+		engines/binject.c lib/linux-dev-lookup.c
   LIBS += -lpthread -ldl
   LDFLAGS += -rdynamic
 endif
 ifeq ($(CONFIG_TARGET_OS), Android)
-  SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c
+  SOURCE += diskutil.c fifo.c blktrace.c trim.c profiles/tiobench.c \
+		lib/linux-dev-lookup.c
   LIBS += -ldl
   LDFLAGS += -rdynamic
 endif
@@ -119,6 +136,10 @@
   LIBS	 += -lpthread -lrt
   LDFLAGS += -rdynamic
 endif
+ifeq ($(CONFIG_TARGET_OS), DragonFly)
+  LIBS	 += -lpthread -lrt
+  LDFLAGS += -rdynamic
+endif
 ifeq ($(CONFIG_TARGET_OS), AIX)
   LIBS	 += -lpthread -ldl -lrt
   CPPFLAGS += -D_LARGE_FILES -D__ppc__
@@ -141,13 +162,19 @@
 OBJS = $(SOURCE:.c=.o)
 
 FIO_OBJS = $(OBJS) fio.o
+
 GFIO_OBJS = $(OBJS) gfio.o graph.o tickmarks.o ghelpers.o goptions.o gerror.o \
 			gclient.o gcompat.o cairo_text_helpers.o printing.o
 
+ifdef CONFIG_ARITHMETIC
+FIO_OBJS += lex.yy.o y.tab.o
+GFIO_OBJS += lex.yy.o y.tab.o
+endif
+
 -include $(OBJS:.o=.d)
 
 T_SMALLOC_OBJS = t/stest.o
-T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o
+T_SMALLOC_OBJS += gettime.o mutex.o smalloc.o t/log.o t/debug.o
 T_SMALLOC_PROGS = t/stest
 
 T_IEEE_OBJS = t/ieee754.o
@@ -156,33 +183,53 @@
 
 T_ZIPF_OBS = t/genzipf.o
 T_ZIPF_OBJS += t/log.o lib/ieee754.o lib/rand.o lib/zipf.o t/genzipf.o
-T_ZIPF_PROGS = t/genzipf
+T_ZIPF_PROGS = t/fio-genzipf
 
 T_AXMAP_OBJS = t/axmap.o
 T_AXMAP_OBJS += lib/lfsr.o lib/axmap.o
 T_AXMAP_PROGS = t/axmap
 
 T_LFSR_TEST_OBJS = t/lfsr-test.o
-T_LFSR_TEST_OBJS += lib/lfsr.o
+T_LFSR_TEST_OBJS += lib/lfsr.o gettime.o t/log.o t/debug.o
 T_LFSR_TEST_PROGS = t/lfsr-test
 
+ifeq ($(CONFIG_TARGET_OS), Linux)
+T_BTRACE_FIO_OBJS = t/btrace2fio.o
+T_BTRACE_FIO_OBJS += fifo.o lib/flist_sort.o t/log.o lib/linux-dev-lookup.o
+T_BTRACE_FIO_PROGS = t/fio-btrace2fio
+endif
+
+T_DEDUPE_OBJS = t/dedupe.o
+T_DEDUPE_OBJS += lib/rbtree.o t/log.o mutex.o smalloc.o gettime.o crc/md5.o \
+		memalign.o lib/bloom.o t/debug.o crc/xxhash.o crc/murmur3.o \
+		crc/crc32c.o crc/crc32c-intel.o crc/fnv.o
+T_DEDUPE_PROGS = t/fio-dedupe
+
 T_OBJS = $(T_SMALLOC_OBJS)
 T_OBJS += $(T_IEEE_OBJS)
 T_OBJS += $(T_ZIPF_OBJS)
 T_OBJS += $(T_AXMAP_OBJS)
 T_OBJS += $(T_LFSR_TEST_OBJS)
+T_OBJS += $(T_BTRACE_FIO_OBJS)
+T_OBJS += $(T_DEDUPE_OBJS)
 
-T_PROGS = $(T_SMALLOC_PROGS)
-T_PROGS += $(T_IEEE_PROGS)
+T_TEST_PROGS = $(T_SMALLOC_PROGS)
+T_TEST_PROGS += $(T_IEEE_PROGS)
 T_PROGS += $(T_ZIPF_PROGS)
-T_PROGS += $(T_AXMAP_PROGS)
-T_PROGS += $(T_LFSR_TEST_PROGS)
+T_TEST_PROGS += $(T_AXMAP_PROGS)
+T_TEST_PROGS += $(T_LFSR_TEST_PROGS)
+T_PROGS += $(T_BTRACE_FIO_PROGS)
+T_PROGS += $(T_DEDUPE_PROGS)
+
+PROGS += $(T_PROGS)
 
 ifneq ($(findstring $(MAKEFLAGS),s),s)
 ifndef V
 	QUIET_CC	= @echo '   ' CC $@;
-	QUIET_LINK	= @echo '   ' LINK $@;
-	QUIET_DEP	= @echo '   ' DEP $@;
+	QUIET_LINK	= @echo ' ' LINK $@;
+	QUIET_DEP	= @echo '  ' DEP $@;
+	QUIET_YACC	= @echo ' ' YACC $@;
+	QUIET_LEX	= @echo '  ' LEX $@;
 endif
 endif
 
@@ -202,7 +249,7 @@
 sharedir = $(prefix)/share/fio
 endif
 
-all: $(PROGS) $(SCRIPTS) FORCE
+all: $(PROGS) $(T_TEST_PROGS) $(SCRIPTS) FORCE
 
 .PHONY: all install clean
 .PHONY: FORCE cscope
@@ -213,7 +260,7 @@
 
 override CFLAGS += -DFIO_VERSION='"$(FIO_VERSION)"'
 
-.c.o: FORCE FIO-VERSION-FILE
+%.o : %.c
 	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
 	@$(CC) -MM $(CFLAGS) $(CPPFLAGS) $*.c > $*.d
 	@mv -f $*.d $*.d.tmp
@@ -222,6 +269,31 @@
 		sed -e 's/^ *//' -e 's/$$/:/' >> $*.d
 	@rm -f $*.d.tmp
 
+ifdef CONFIG_ARITHMETIC
+lex.yy.c: exp/expression-parser.l
+	$(QUIET_LEX)$(LEX) exp/expression-parser.l
+
+lex.yy.o: lex.yy.c y.tab.h
+	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+
+y.tab.o: y.tab.c y.tab.h
+	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+
+y.tab.c: exp/expression-parser.y
+	$(QUIET_YACC)$(YACC) -l -d -b y exp/expression-parser.y
+
+y.tab.h: y.tab.c
+
+lexer.h: lex.yy.c
+
+exp/test-expression-parser.o: exp/test-expression-parser.c
+	$(QUIET_CC)$(CC) -o $@ $(CFLAGS) $(CPPFLAGS) -c $<
+exp/test-expression-parser: exp/test-expression-parser.o
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) $< y.tab.o lex.yy.o -o $@ $(LIBS)
+
+parse.o: lex.yy.o y.tab.o
+endif
+
 init.o: FIO-VERSION-FILE init.c
 	$(QUIET_CC)$(CC) -o init.o $(CFLAGS) $(CPPFLAGS) -c init.c
 
@@ -259,12 +331,12 @@
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_IEEE_OBJS) $(LIBS)
 
 fio: $(FIO_OBJS)
-	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(FIO_OBJS) $(LIBS) $(HDFSLIB)
 
 gfio: $(GFIO_OBJS)
-	$(QUIET_LINK)$(CC) $(LDFLAGS) -o gfio $(GFIO_OBJS) $(LIBS) $(GTK_LDFLAGS)
+	$(QUIET_LINK)$(CC) $(filter-out -static, $(LDFLAGS)) -o gfio $(GFIO_OBJS) $(LIBS) $(GFIO_LIBS) $(GTK_LDFLAGS) $(HDFSLIB)
 
-t/genzipf: $(T_ZIPF_OBJS)
+t/fio-genzipf: $(T_ZIPF_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_ZIPF_OBJS) $(LIBS)
 
 t/axmap: $(T_AXMAP_OBJS)
@@ -273,8 +345,16 @@
 t/lfsr-test: $(T_LFSR_TEST_OBJS)
 	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_LFSR_TEST_OBJS) $(LIBS)
 
+ifeq ($(CONFIG_TARGET_OS), Linux)
+t/fio-btrace2fio: $(T_BTRACE_FIO_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_BTRACE_FIO_OBJS) $(LIBS)
+endif
+
+t/fio-dedupe: $(T_DEDUPE_OBJS)
+	$(QUIET_LINK)$(CC) $(LDFLAGS) $(CFLAGS) -o $@ $(T_DEDUPE_OBJS) $(LIBS)
+
 clean: FORCE
-	-rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h
+	@rm -f .depend $(FIO_OBJS) $(GFIO_OBJS) $(OBJS) $(T_OBJS) $(PROGS) $(T_PROGS) $(T_TEST_PROGS) core.* core gfio FIO-VERSION-FILE *.d lib/*.d crc/*.d engines/*.d profiles/*.d t/*.d config-host.mak config-host.h y.tab.[ch] lex.yy.c exp/*.[do] lexer.h
 
 distclean: clean FORCE
 	@rm -f cscope.out fio.pdf fio_generate_plots.pdf fio2gnuplot.pdf
diff --git a/README b/README
index f8aaef2..18d1c4f 100644
--- a/README
+++ b/README
@@ -26,6 +26,17 @@
 
 	http://brick.kernel.dk/snaps/
 
+There are also two official mirrors. Both of these are synced within
+an hour of commits landing at git.kernel.dk. So if the main repo is
+down for some reason, either one of those is safe to use:
+
+	git://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+	https://git.kernel.org/pub/scm/linux/kernel/git/axboe/fio.git
+
+or
+
+	https://github.com/axboe/fio.git
+
 
 Binary packages
 ---------------
@@ -101,6 +112,9 @@
  $ make CROSS_COMPILE=/path/to/toolchain/prefix
 Configure will attempt to determine the target platform automatically.
 
+It's possible to build fio for ESX as well; use the --esx switch to
+configure.
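+
+For example:
+
+$ ./configure --esx
+$ make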
+
 
 Windows
 -------
@@ -134,7 +148,6 @@
 	--parse-only		Parse options only, don't start any IO
 	--output		Write output to file
 	--runtime		Runtime in seconds
-	--latency-log		Generate per-job latency logs
 	--bandwidth-log		Generate per-job bandwidth logs
 	--minimal		Minimal (terse) output
 	--output-format=type	Output format (terse,json,normal)
@@ -160,9 +173,11 @@
 	--max-jobs		Maximum number of threads/processes to support
 	--server=args		Start backend server. See Client/Server section.
 	--client=host		Connect to specified backend.
+	--remote-config=file	Tell fio server to load this local file
 	--idle-prof=option	Report cpu idleness on a system or percpu basis
 				(option=system,percpu) or run unit work
 				calibration only (option=calibrate).
+	--inflate-log=log	Inflate and output compressed log
 
 
 Any parameters following the options will be assumed to be job files,
@@ -196,6 +211,7 @@
 	time		Dump info related to internal time keeping
 	net		Dump info related to networking connections
 	rate		Dump info related to IO rate switching
+	compress	Dump info related to log compress/decompress
 	? or help	Show available debug options.
 
 One can specify multiple debug options: e.g. --debug=file,mem will enable
@@ -290,6 +306,14 @@
 
 fio --client=<server1> <job file(s)> --client=<server2> <job file(s)>
 
+If the job file is located on the fio server, then you can tell the server
+to load a local file as well. This is done by using --remote-config:
+
+fio --client=server --remote-config /path/to/file.fio
+
+Then the fio server will open this local (to the server) job file instead
+of being passed one from the client.
+
 
 Platforms
 ---------
diff --git a/arch/arch-aarch64.h b/arch/arch-aarch64.h
new file mode 100644
index 0000000..a6cfaf2
--- /dev/null
+++ b/arch/arch-aarch64.h
@@ -0,0 +1,35 @@
+#ifndef ARCH_AARCH64_H
+#define ARCH_AARCH64_H
+
+#include <unistd.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/wait.h>
+
+#define FIO_ARCH	(arch_aarch64)
+
+#ifndef __NR_ioprio_set
+#define __NR_ioprio_set		30
+#define __NR_ioprio_get		31
+#endif
+
+#define nop		do { __asm__ __volatile__ ("yield"); } while (0)
+#define read_barrier()	do { __sync_synchronize(); } while (0)
+#define write_barrier()	do { __sync_synchronize(); } while (0)
+
+static inline int arch_ffz(unsigned long bitmask)
+{
+	unsigned long count, reversed_bits;
+	if (~bitmask == 0)	/* ffz() in lib/ffz.h does this. */
+		return 63;
+
+	__asm__ __volatile__ ("rbit %1, %2\n"
+			      "clz %0, %1\n" : 
+			      "=r"(count), "=&r"(reversed_bits) :
+			      "r"(~bitmask));
+	return count;
+}
+
+#define ARCH_HAVE_FFZ
+
+#endif
diff --git a/arch/arch-arm.h b/arch/arch-arm.h
index 7cd9502..bab886e 100644
--- a/arch/arch-arm.h
+++ b/arch/arch-arm.h
@@ -19,7 +19,8 @@
 #endif
 
 #if defined (__ARM_ARCH_4__) || defined (__ARM_ARCH_4T__) \
-	|| defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \
+	|| defined (__ARM_ARCH_5__) || defined (__ARM_ARCH_5T__) || defined (__ARM_ARCH_5E__)\
+	|| defined (__ARM_ARCH_5TE__) || defined (__ARM_ARCH_5TEJ__) \
 	|| defined(__ARM_ARCH_6__)  || defined(__ARM_ARCH_6J__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__)
 #define nop             __asm__ __volatile__("mov\tr0,r0\t@ nop\n\t")
 #define read_barrier()	__asm__ __volatile__ ("" : : : "memory")
diff --git a/arch/arch-ppc.h b/arch/arch-ppc.h
index 0f043bc..d4a080c 100644
--- a/arch/arch-ppc.h
+++ b/arch/arch-ppc.h
@@ -82,6 +82,7 @@
 	return ret;
 }
 
+#if 0
 static void atb_child(void)
 {
 	arch_flags |= ARCH_FLAG_1;
@@ -106,6 +107,7 @@
 			arch_flags |= ARCH_FLAG_1;
 	}
 }
+#endif
 
 #define ARCH_HAVE_INIT
 extern int tsc_reliable;
diff --git a/arch/arch-s390.h b/arch/arch-s390.h
index 169282b..cc7a1d1 100644
--- a/arch/arch-s390.h
+++ b/arch/arch-s390.h
@@ -40,6 +40,7 @@
 
 #define ARCH_CPU_CLOCK_CYCLES_PER_USEC 1
 #define ARCH_HAVE_CPU_CLOCK
+#undef ARCH_CPU_CLOCK_WRAPS
 
 #define ARCH_HAVE_INIT
 extern int tsc_reliable;
diff --git a/arch/arch.h b/arch/arch.h
index 31d96d4..5671b9a 100644
--- a/arch/arch.h
+++ b/arch/arch.h
@@ -14,6 +14,7 @@
 	arch_sh,
 	arch_hppa,
 	arch_mips,
+	arch_aarch64,
 
 	arch_generic,
 
@@ -29,6 +30,8 @@
 
 extern unsigned long arch_flags;
 
+#define ARCH_CPU_CLOCK_WRAPS
+
 #if defined(__i386__)
 #include "arch-x86.h"
 #elif defined(__x86_64__)
@@ -53,6 +56,8 @@
 #include "arch-sh.h"
 #elif defined(__hppa__)
 #include "arch-hppa.h"
+#elif defined(__aarch64__)
+#include "arch-aarch64.h"
 #else
 #warning "Unknown architecture, attempting to use generic model."
 #include "arch-generic.h"
diff --git a/backend.c b/backend.c
index d1d5571..fdb7413 100644
--- a/backend.c
+++ b/backend.c
@@ -53,9 +53,13 @@
 #include "lib/getrusage.h"
 #include "idletime.h"
 #include "err.h"
+#include "lib/tp.h"
 
-static pthread_t disk_util_thread;
-static struct fio_mutex *disk_thread_mutex;
+static pthread_t helper_thread;
+static pthread_mutex_t helper_lock;
+pthread_cond_t helper_cond;
+int helper_do_stat = 0;
+
 static struct fio_mutex *startup_mutex;
 static struct flist_head *cgroup_list;
 static char *cgroup_mnt;
@@ -72,7 +76,7 @@
 int shm_id = 0;
 int temp_stall_ts;
 unsigned long done_secs = 0;
-volatile int disk_util_exit = 0;
+volatile int helper_exit = 0;
 
 #define PAGE_ALIGN(buf)	\
 	(char *) (((uintptr_t) (buf) + page_mask) & ~page_mask)
@@ -86,7 +90,7 @@
 			fio_server_got_signal(sig);
 		else {
 			log_info("\nfio: terminating on signal %d\n", sig);
-			fflush(stdout);
+			log_info_flush();
 			exit_value = 128;
 		}
 
@@ -390,7 +394,7 @@
 			 * fill_device option is set.
 			 */
 			td_clear_error(td);
-			td->terminate = 1;
+			fio_mark_td_terminate(td);
 			return 1;
 		} else {
 			/*
@@ -414,6 +418,34 @@
 	}
 }
 
+static int wait_for_completions(struct thread_data *td, struct timeval *time,
+				uint64_t *bytes_done)
+{
+	const int full = queue_full(td);
+	int min_evts = 0;
+	int ret;
+
+	/*
+	 * if the queue is full, we MUST reap at least 1 event
+	 */
+	min_evts = min(td->o.iodepth_batch_complete, td->cur_depth);
+	if (full && !min_evts)
+		min_evts = 1;
+
+	if (time && (__should_check_rate(td, DDIR_READ) ||
+	    __should_check_rate(td, DDIR_WRITE) ||
+	    __should_check_rate(td, DDIR_TRIM)))
+		fio_gettime(time, NULL);
+
+	do {
+		ret = io_u_queued_complete(td, min_evts, bytes_done);
+		if (ret < 0)
+			break;
+	} while (full && (td->cur_depth > td->o.iodepth_low));
+
+	return ret;
+}
+
 /*
  * The main verify engine. Runs over the writes we previously submitted,
  * reads the blocks back in, and checks the crc/md5 of the data.
@@ -459,7 +491,7 @@
 		if (runtime_exceeded(td, &td->tv_cache)) {
 			__update_tv_cache(td);
 			if (runtime_exceeded(td, &td->tv_cache)) {
-				td->terminate = 1;
+				fio_mark_td_terminate(td);
 				break;
 			}
 		}
@@ -522,12 +554,19 @@
 				break;
 		}
 
+		if (verify_state_should_stop(td, io_u)) {
+			put_io_u(td, io_u);
+			break;
+		}
+
 		if (td->o.verify_async)
 			io_u->end_io = verify_io_u_async;
 		else
 			io_u->end_io = verify_io_u;
 
 		ddir = io_u->ddir;
+		if (!td->o.disable_slat)
+			fio_gettime(&io_u->start_time, NULL);
 
 		ret = td_io_queue(td, io_u);
 		switch (ret) {
@@ -590,27 +629,9 @@
 		 */
 reap:
 		full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth);
-		if (full || !td->o.iodepth_batch_complete) {
-			min_events = min(td->o.iodepth_batch_complete,
-					 td->cur_depth);
-			/*
-			 * if the queue is full, we MUST reap at least 1 event
-			 */
-			if (full && !min_events)
-				min_events = 1;
+		if (full || !td->o.iodepth_batch_complete)
+			ret = wait_for_completions(td, NULL, bytes_done);
 
-			do {
-				/*
-				 * Reap required number of io units, if any,
-				 * and do the verification on them through
-				 * the callback handler
-				 */
-				if (io_u_queued_complete(td, min_events, bytes_done) < 0) {
-					ret = -1;
-					break;
-				}
-			} while (full && (td->cur_depth > td->o.iodepth_low));
-		}
 		if (ret < 0)
 			break;
 	}
@@ -637,13 +658,35 @@
 	if (!td->o.number_ios)
 		return 0;
 
-	number_ios = ddir_rw_sum(td->this_io_blocks);
+	number_ios = ddir_rw_sum(td->io_blocks);
 	number_ios += td->io_u_queued + td->io_u_in_flight;
 
-	return number_ios >= td->o.number_ios;
+	return number_ios >= (td->o.number_ios * td->loops);
 }
 
-static int io_bytes_exceeded(struct thread_data *td)
+static int io_issue_bytes_exceeded(struct thread_data *td)
+{
+	unsigned long long bytes, limit;
+
+	if (td_rw(td))
+		bytes = td->io_issue_bytes[DDIR_READ] + td->io_issue_bytes[DDIR_WRITE];
+	else if (td_write(td))
+		bytes = td->io_issue_bytes[DDIR_WRITE];
+	else if (td_read(td))
+		bytes = td->io_issue_bytes[DDIR_READ];
+	else
+		bytes = td->io_issue_bytes[DDIR_TRIM];
+
+	if (td->o.io_limit)
+		limit = td->o.io_limit;
+	else
+		limit = td->o.size;
+
+	limit *= td->loops;
+	return bytes >= limit || exceeds_number_ios(td);
+}
+
+static int io_complete_bytes_exceeded(struct thread_data *td)
 {
 	unsigned long long bytes, limit;
 
@@ -661,6 +704,7 @@
 	else
 		limit = td->o.size;
 
+	limit *= td->loops;
 	return bytes >= limit || exceeds_number_ios(td);
 }
 
@@ -684,21 +728,26 @@
 
 	lat_target_init(td);
 
+	total_bytes = td->o.size;
+	/*
+	 * Allow random overwrite workloads to write up to io_limit
+	 * before starting verification phase as 'size' doesn't apply.
+	 */
+	if (td_write(td) && td_random(td) && td->o.norandommap)
+		total_bytes = max(total_bytes, (uint64_t) td->o.io_limit);
 	/*
 	 * If verify_backlog is enabled, we'll run the verify in this
 	 * handler as well. For that case, we may need up to twice the
 	 * amount of bytes.
 	 */
-	total_bytes = td->o.size;
 	if (td->o.verify != VERIFY_NONE &&
 	   (td_write(td) && td->o.verify_backlog))
 		total_bytes += td->o.size;
 
 	while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
-		(!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td) ||
+		(!flist_empty(&td->trim_list)) || !io_issue_bytes_exceeded(td) ||
 		td->o.time_based) {
 		struct timeval comp_time;
-		int min_evts = 0;
 		struct io_u *io_u;
 		int ret2, full;
 		enum fio_ddir ddir;
@@ -713,7 +762,7 @@
 		if (runtime_exceeded(td, &td->tv_cache)) {
 			__update_tv_cache(td);
 			if (runtime_exceeded(td, &td->tv_cache)) {
-				td->terminate = 1;
+				fio_mark_td_terminate(td);
 				break;
 			}
 		}
@@ -749,9 +798,14 @@
 		    ((io_u->flags & IO_U_F_VER_LIST) || !td_rw(td))) {
 
 			if (!td->o.verify_pattern_bytes) {
-				io_u->rand_seed = __rand(&td->__verify_state);
+				io_u->rand_seed = __rand(&td->verify_state);
 				if (sizeof(int) != sizeof(long *))
-					io_u->rand_seed *= __rand(&td->__verify_state);
+					io_u->rand_seed *= __rand(&td->verify_state);
+			}
+
+			if (verify_state_should_stop(td, io_u)) {
+				put_io_u(td, io_u);
+				break;
 			}
 
 			if (td->o.verify_async)
@@ -857,28 +911,8 @@
 		 */
 reap:
 		full = queue_full(td) || (ret == FIO_Q_BUSY && td->cur_depth);
-		if (full || !td->o.iodepth_batch_complete) {
-			min_evts = min(td->o.iodepth_batch_complete,
-					td->cur_depth);
-			/*
-			 * if the queue is full, we MUST reap at least 1 event
-			 */
-			if (full && !min_evts)
-				min_evts = 1;
-
-			if (__should_check_rate(td, DDIR_READ) ||
-			    __should_check_rate(td, DDIR_WRITE) ||
-			    __should_check_rate(td, DDIR_TRIM))
-				fio_gettime(&comp_time, NULL);
-
-			do {
-				ret = io_u_queued_complete(td, min_evts, bytes_done);
-				if (ret < 0)
-					break;
-
-			} while (full && (td->cur_depth > td->o.iodepth_low));
-		}
-
+		if (full || !td->o.iodepth_batch_complete)
+			ret = wait_for_completions(td, &comp_time, bytes_done);
 		if (ret < 0)
 			break;
 		if (!ddir_rw_sum(bytes_done) && !(td->io_ops->flags & FIO_NOIO))
@@ -921,7 +955,7 @@
 
 	if (td->o.fill_device && td->error == ENOSPC) {
 		td->error = 0;
-		td->terminate = 1;
+		fio_mark_td_terminate(td);
 	}
 	if (!td->error) {
 		struct fio_file *f;
@@ -973,6 +1007,9 @@
 	io_u_rexit(&td->io_u_requeues);
 	io_u_qexit(&td->io_u_freelist);
 	io_u_qexit(&td->io_u_all);
+
+	if (td->last_write_comp)
+		sfree(td->last_write_comp);
 }
 
 static int init_io_u(struct thread_data *td)
@@ -1089,6 +1126,14 @@
 		p += max_bs;
 	}
 
+	if (td->o.verify != VERIFY_NONE) {
+		td->last_write_comp = scalloc(max_units, sizeof(uint64_t));
+		if (!td->last_write_comp) {
+			log_err("fio: failed to alloc write comp data\n");
+			return 1;
+		}
+	}
+
 	return 0;
 }
 
@@ -1217,7 +1262,7 @@
 	td_set_runstate(td, TD_RUNNING);
 
 	while ((td->o.read_iolog_file && !flist_empty(&td->io_log_list)) ||
-		(!flist_empty(&td->trim_list)) || !io_bytes_exceeded(td)) {
+		(!flist_empty(&td->trim_list)) || !io_complete_bytes_exceeded(td)) {
 		struct io_u *io_u;
 		int ret;
 
@@ -1270,11 +1315,6 @@
 	} else
 		td->pid = gettid();
 
-	/*
-	 * fio_time_init() may not have been called yet if running as a server
-	 */
-	fio_time_init();
-
 	fio_local_clock_init(o->use_thread);
 
 	dprint(FD_PROCESS, "jobs pid=%d started\n", (int) td->pid);
@@ -1325,7 +1365,7 @@
 	 * Set affinity first, in case it has an impact on the memory
 	 * allocations.
 	 */
-	if (o->cpumask_set) {
+	if (fio_option_is_set(o, cpumask)) {
 		if (o->cpus_allowed_policy == FIO_CPUS_SPLIT) {
 			ret = fio_cpus_split(&o->cpumask, td->thread_number - 1);
 			if (!ret) {
@@ -1344,16 +1384,16 @@
 
 #ifdef CONFIG_LIBNUMA
 	/* numa node setup */
-	if (o->numa_cpumask_set || o->numa_memmask_set) {
+	if (fio_option_is_set(o, numa_cpunodes) ||
+	    fio_option_is_set(o, numa_memnodes)) {
 		struct bitmask *mask;
-		int ret;
 
 		if (numa_available() < 0) {
 			td_verror(td, errno, "Does not support NUMA API\n");
 			goto err;
 		}
 
-		if (o->numa_cpumask_set) {
+		if (fio_option_is_set(o, numa_cpunodes)) {
 			mask = numa_parse_nodestring(o->numa_cpunodes);
 			ret = numa_run_on_node_mask(mask);
 			numa_free_nodemask(mask);
@@ -1364,8 +1404,7 @@
 			}
 		}
 
-		if (o->numa_memmask_set) {
-
+		if (fio_option_is_set(o, numa_memnodes)) {
 			mask = NULL;
 			if (o->numa_memnodes)
 				mask = numa_parse_nodestring(o->numa_memnodes);
@@ -1411,7 +1450,8 @@
 	if (o->verify_async && verify_async_init(td))
 		goto err;
 
-	if (o->ioprio) {
+	if (fio_option_is_set(o, ioprio) ||
+	    fio_option_is_set(o, ioprio_class)) {
 		ret = ioprio_set(IOPRIO_WHO_PROCESS, 0, o->ioprio_class, o->ioprio);
 		if (ret == -1) {
 			td_verror(td, errno, "ioprio_set");
@@ -1448,6 +1488,9 @@
 			goto err;
 	}
 
+	if (td->flags & TD_F_COMPRESS_LOG)
+		tp_init(&td->tp_data);
+
 	fio_verify_init(td);
 
 	fio_gettime(&td->epoch, NULL);
@@ -1483,18 +1526,21 @@
 
 		clear_state = 1;
 
+		fio_mutex_down(stat_mutex);
 		if (td_read(td) && td->io_bytes[DDIR_READ]) {
-			elapsed = utime_since_now(&td->start);
+			elapsed = mtime_since_now(&td->start);
 			td->ts.runtime[DDIR_READ] += elapsed;
 		}
 		if (td_write(td) && td->io_bytes[DDIR_WRITE]) {
-			elapsed = utime_since_now(&td->start);
+			elapsed = mtime_since_now(&td->start);
 			td->ts.runtime[DDIR_WRITE] += elapsed;
 		}
 		if (td_trim(td) && td->io_bytes[DDIR_TRIM]) {
-			elapsed = utime_since_now(&td->start);
+			elapsed = mtime_since_now(&td->start);
 			td->ts.runtime[DDIR_TRIM] += elapsed;
 		}
+		fio_gettime(&td->start, NULL);
+		fio_mutex_up(stat_mutex);
 
 		if (td->error || td->terminate)
 			break;
@@ -1510,25 +1556,40 @@
 
 		do_verify(td, verify_bytes);
 
-		td->ts.runtime[DDIR_READ] += utime_since_now(&td->start);
+		fio_mutex_down(stat_mutex);
+		td->ts.runtime[DDIR_READ] += mtime_since_now(&td->start);
+		fio_gettime(&td->start, NULL);
+		fio_mutex_up(stat_mutex);
 
 		if (td->error || td->terminate)
 			break;
 	}
 
 	update_rusage_stat(td);
-	td->ts.runtime[DDIR_READ] = (td->ts.runtime[DDIR_READ] + 999) / 1000;
-	td->ts.runtime[DDIR_WRITE] = (td->ts.runtime[DDIR_WRITE] + 999) / 1000;
-	td->ts.runtime[DDIR_TRIM] = (td->ts.runtime[DDIR_TRIM] + 999) / 1000;
 	td->ts.total_run_time = mtime_since_now(&td->epoch);
 	td->ts.io_bytes[DDIR_READ] = td->io_bytes[DDIR_READ];
 	td->ts.io_bytes[DDIR_WRITE] = td->io_bytes[DDIR_WRITE];
 	td->ts.io_bytes[DDIR_TRIM] = td->io_bytes[DDIR_TRIM];
 
+	if (td->o.verify_state_save && !(td->flags & TD_F_VSTATE_SAVED) &&
+	    (td->o.verify != VERIFY_NONE && td_write(td))) {
+		struct all_io_list *state;
+		size_t sz;
+
+		state = get_all_io_list(td->thread_number, &sz);
+		if (state) {
+			__verify_save_state(state, "local");
+			free(state);
+		}
+	}
+
 	fio_unpin_memory(td);
 
 	fio_writeout_logs(td);
 
+	if (td->flags & TD_F_COMPRESS_LOG)
+		tp_exit(&td->tp_data);
+
 	if (o->exec_postrun)
 		exec_string(o, o->exec_postrun, (const char *)"postrun");
 
@@ -1547,11 +1608,12 @@
 	cleanup_io_u(td);
 	close_ioengine(td);
 	cgroup_shutdown(td, &cgroup_mnt);
+	verify_free_state(td);
 
-	if (o->cpumask_set) {
-		int ret = fio_cpuset_exit(&o->cpumask);
-
-		td_verror(td, ret, "fio_cpuset_exit");
+	if (fio_option_is_set(o, cpumask)) {
+		ret = fio_cpuset_exit(&o->cpumask);
+		if (ret)
+			td_verror(td, ret, "fio_cpuset_exit");
 	}
 
 	/*
@@ -1560,13 +1622,17 @@
 	if (o->write_iolog_file)
 		write_iolog_close(td);
 
-	fio_mutex_remove(td->rusage_sem);
-	td->rusage_sem = NULL;
-
 	fio_mutex_remove(td->mutex);
 	td->mutex = NULL;
 
 	td_set_runstate(td, TD_EXITED);
+
+	/*
+	 * Do this last after setting our runstate to exited, so we
+	 * know that the stat thread is signaled.
+	 */
+	check_update_rusage(td);
+
 	return (void *) (uintptr_t) td->error;
 }
 
@@ -1580,7 +1646,7 @@
 	struct thread_data *td;
 	void *data, *ret;
 
-#ifndef __hpux
+#if !defined(__hpux) && !defined(CONFIG_NO_SHM)
 	data = shmat(shmid, NULL, 0);
 	if (data == (void *) -1) {
 		int __err = errno;
@@ -1601,6 +1667,13 @@
 	return (int) (uintptr_t) ret;
 }
 
+static void dump_td_info(struct thread_data *td)
+{
+	log_err("fio: job '%s' hasn't exited in %lu seconds, it appears to "
+		"be stuck. Doing forceful exit of this job.\n", td->o.name,
+			(unsigned long) time_since_now(&td->terminate_time));
+}
+
 /*
  * Run over the job map and reap the threads that have exited, if any.
  */
@@ -1679,6 +1752,17 @@
 		}
 
 		/*
+		 * If the job is stuck, do a forceful timeout of it and
+		 * move on.
+		 */
+		if (td->terminate &&
+		    time_since_now(&td->terminate_time) >= FIO_REAP_TIMEOUT) {
+			dump_td_info(td);
+			td_set_runstate(td, TD_REAPED);
+			goto reaped;
+		}
+
+		/*
 		 * thread is not dead, continue
 		 */
 		pending++;
@@ -1701,9 +1785,80 @@
 		fio_terminate_threads(TERMINATE_ALL);
 }
 
+static int __check_trigger_file(void)
+{
+	struct stat sb;
+
+	if (!trigger_file)
+		return 0;
+
+	if (stat(trigger_file, &sb))
+		return 0;
+
+	if (unlink(trigger_file) < 0)
+		log_err("fio: failed to unlink %s: %s\n", trigger_file,
+							strerror(errno));
+
+	return 1;
+}
+
+static int trigger_timedout(void)
+{
+	if (trigger_timeout)
+		return time_since_genesis() >= trigger_timeout;
+
+	return 0;
+}
+
+void exec_trigger(const char *cmd)
+{
+	int ret;
+
+	if (!cmd)
+		return;
+
+	ret = system(cmd);
+	if (ret == -1)
+		log_err("fio: failed executing %s trigger\n", cmd);
+}
+
+void check_trigger_file(void)
+{
+	if (__check_trigger_file() || trigger_timedout()) {
+		if (nr_clients)
+			fio_clients_send_trigger(trigger_remote_cmd);
+		else {
+			verify_save_state();
+			fio_terminate_threads(TERMINATE_ALL);
+			exec_trigger(trigger_cmd);
+		}
+	}
+}
+
+static int fio_verify_load_state(struct thread_data *td)
+{
+	int ret;
+
+	if (!td->o.verify_state)
+		return 0;
+
+	if (is_backend) {
+		void *data;
+
+		ret = fio_server_get_verify_state(td->o.name,
+					td->thread_number - 1, &data);
+		if (!ret)
+			verify_convert_assign_state(td, data);
+	} else
+		ret = verify_load_state(td, "local");
+
+	return ret;
+}
+
 static void do_usleep(unsigned int usecs)
 {
 	check_for_running_stats();
+	check_trigger_file();
 	usleep(usecs);
 }
 
@@ -1743,7 +1898,7 @@
 						nr_process > 1 ? "es" : "");
 		}
 		log_info("\n");
-		fflush(stdout);
+		log_info_flush();
 	}
 
 	todo = thread_number;
@@ -1757,12 +1912,16 @@
 		if (!td->o.create_serialize)
 			continue;
 
+		if (fio_verify_load_state(td))
+			goto reap;
+
 		/*
 		 * do file setup here so it happens sequentially,
 		 * we don't want X number of threads getting their
 		 * client data interspersed on disk
 		 */
 		if (setup_files(td)) {
+reap:
 			exit_value++;
 			if (td->error)
 				log_err("fio: pid=%d, err=%d/%s\n",
@@ -1950,57 +2109,71 @@
 	update_io_ticks();
 }
 
-void wait_for_disk_thread_exit(void)
+static void wait_for_helper_thread_exit(void)
 {
-	fio_mutex_down(disk_thread_mutex);
+	void *ret;
+
+	helper_exit = 1;
+	pthread_cond_signal(&helper_cond);
+	pthread_join(helper_thread, &ret);
 }
 
 static void free_disk_util(void)
 {
-	disk_util_start_exit();
-	wait_for_disk_thread_exit();
 	disk_util_prune_entries();
+
+	pthread_cond_destroy(&helper_cond);
 }
 
-static void *disk_thread_main(void *data)
+static void *helper_thread_main(void *data)
 {
 	int ret = 0;
 
 	fio_mutex_up(startup_mutex);
 
-	while (threads && !ret) {
-		usleep(DISK_UTIL_MSEC * 1000);
-		if (!threads)
-			break;
+	while (!ret) {
+		uint64_t sec = DISK_UTIL_MSEC / 1000;
+		uint64_t nsec = (DISK_UTIL_MSEC % 1000) * 1000000;
+		struct timespec ts;
+		struct timeval tv;
+
+		gettimeofday(&tv, NULL);
+		ts.tv_sec = tv.tv_sec + sec;
+		ts.tv_nsec = (tv.tv_usec * 1000) + nsec;
+
+		if (ts.tv_nsec >= 1000000000ULL) {
+			ts.tv_nsec -= 1000000000ULL;
+			ts.tv_sec++;
+		}
+
+		pthread_cond_timedwait(&helper_cond, &helper_lock, &ts);
+
 		ret = update_io_ticks();
 
+		if (helper_do_stat) {
+			helper_do_stat = 0;
+			__show_running_run_stats();
+		}
+
 		if (!is_backend)
 			print_thread_status();
 	}
 
-	fio_mutex_up(disk_thread_mutex);
 	return NULL;
 }
 
-static int create_disk_util_thread(void)
+static int create_helper_thread(void)
 {
 	int ret;
 
 	setup_disk_util();
 
-	disk_thread_mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
+	pthread_cond_init(&helper_cond, NULL);
+	pthread_mutex_init(&helper_lock, NULL);
 
-	ret = pthread_create(&disk_util_thread, NULL, disk_thread_main, NULL);
+	ret = pthread_create(&helper_thread, NULL, helper_thread_main, NULL);
 	if (ret) {
-		fio_mutex_remove(disk_thread_mutex);
-		log_err("Can't create disk util thread: %s\n", strerror(ret));
-		return 1;
-	}
-
-	ret = pthread_detach(disk_util_thread);
-	if (ret) {
-		fio_mutex_remove(disk_thread_mutex);
-		log_err("Can't detatch disk util thread: %s\n", strerror(ret));
+		log_err("Can't create helper thread: %s\n", strerror(ret));
 		return 1;
 	}
 
@@ -2025,9 +2198,13 @@
 		return 0;
 
 	if (write_bw_log) {
-		setup_log(&agg_io_log[DDIR_READ], 0, IO_LOG_TYPE_BW);
-		setup_log(&agg_io_log[DDIR_WRITE], 0, IO_LOG_TYPE_BW);
-		setup_log(&agg_io_log[DDIR_TRIM], 0, IO_LOG_TYPE_BW);
+		struct log_params p = {
+			.log_type = IO_LOG_TYPE_BW,
+		};
+
+		setup_log(&agg_io_log[DDIR_READ], &p, "agg-read_bw.log");
+		setup_log(&agg_io_log[DDIR_WRITE], &p, "agg-write_bw.log");
+		setup_log(&agg_io_log[DDIR_TRIM], &p, "agg-trim_bw.log");
 	}
 
 	startup_mutex = fio_mutex_init(FIO_MUTEX_LOCKED);
@@ -2036,26 +2213,34 @@
 
 	set_genesis_time();
 	stat_init();
-	create_disk_util_thread();
+	create_helper_thread();
 
 	cgroup_list = smalloc(sizeof(*cgroup_list));
 	INIT_FLIST_HEAD(cgroup_list);
 
 	run_threads();
 
+	wait_for_helper_thread_exit();
+
 	if (!fio_abort) {
-		show_run_stats();
+		__show_run_stats();
 		if (write_bw_log) {
-			__finish_log(agg_io_log[DDIR_READ], "agg-read_bw.log");
-			__finish_log(agg_io_log[DDIR_WRITE],
-					"agg-write_bw.log");
-			__finish_log(agg_io_log[DDIR_TRIM],
-					"agg-write_bw.log");
+			for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+				struct io_log *log = agg_io_log[i];
+
+				flush_log(log);
+				free_log(log);
+			}
 		}
 	}
 
-	for_each_td(td, i)
+	for_each_td(td, i) {
 		fio_options_free(td);
+		if (td->rusage_sem) {
+			fio_mutex_remove(td->rusage_sem);
+			td->rusage_sem = NULL;
+		}
+	}
 
 	free_disk_util();
 	cgroup_kill(cgroup_list);
@@ -2063,7 +2248,6 @@
 	sfree(cgroup_mnt);
 
 	fio_mutex_remove(startup_mutex);
-	fio_mutex_remove(disk_thread_mutex);
 	stat_exit();
 	return exit_value;
 }
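
For reference, a minimal standalone sketch of the pattern the new helper thread relies on: sleep on pthread_cond_timedwait() against an absolute deadline built from gettimeofday(), so wait_for_helper_thread_exit() can wake it immediately with pthread_cond_signal() instead of waiting out a full sleep interval. Everything below (names, the 250 ms interval) is illustrative, not fio code; build with -lpthread.

#include <pthread.h>
#include <stdio.h>
#include <sys/time.h>
#include <unistd.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static int exit_flag;

static void *worker(void *unused)
{
	(void) unused;

	pthread_mutex_lock(&lock);
	while (!exit_flag) {
		struct timespec ts;
		struct timeval tv;

		/* build an absolute deadline 250ms from now */
		gettimeofday(&tv, NULL);
		ts.tv_sec = tv.tv_sec;
		ts.tv_nsec = tv.tv_usec * 1000 + 250 * 1000000L;
		if (ts.tv_nsec >= 1000000000L) {
			ts.tv_nsec -= 1000000000L;
			ts.tv_sec++;
		}

		/* returns early if signalled, otherwise times out */
		pthread_cond_timedwait(&cond, &lock, &ts);
		printf("periodic work (timeout or early wakeup)\n");
	}
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	pthread_t thread;

	pthread_create(&thread, NULL, worker, NULL);
	sleep(1);

	pthread_mutex_lock(&lock);
	exit_flag = 1;
	pthread_cond_signal(&cond);	/* wake the worker immediately */
	pthread_mutex_unlock(&lock);

	pthread_join(thread, NULL);
	return 0;
}
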
diff --git a/blktrace.c b/blktrace.c
index 29eed50..9afc5be 100644
--- a/blktrace.c
+++ b/blktrace.c
@@ -9,6 +9,7 @@
 #include "flist.h"
 #include "fio.h"
 #include "blktrace_api.h"
+#include "lib/linux-dev-lookup.h"
 
 #define TRACE_FIFO_SIZE	8192
 
@@ -108,67 +109,6 @@
 	return 0;
 }
 
-static int lookup_device(struct thread_data *td, char *path, unsigned int maj,
-			 unsigned int min)
-{
-	struct dirent *dir;
-	struct stat st;
-	int found = 0;
-	DIR *D;
-
-	D = opendir(path);
-	if (!D)
-		return 0;
-
-	while ((dir = readdir(D)) != NULL) {
-		char full_path[256];
-
-		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
-			continue;
-
-		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
-		if (lstat(full_path, &st) == -1) {
-			perror("lstat");
-			break;
-		}
-
-		if (S_ISDIR(st.st_mode)) {
-			found = lookup_device(td, full_path, maj, min);
-			if (found) {
-				strcpy(path, full_path);
-				break;
-			}
-		}
-
-		if (!S_ISBLK(st.st_mode))
-			continue;
-
-		/*
-		 * If replay_redirect is set then always return this device
-		 * upon lookup which overrides the device lookup based on
-		 * major minor in the actual blktrace
-		 */
-		if (td->o.replay_redirect) {
-			dprint(FD_BLKTRACE, "device lookup: %d/%d\n overridden"
-					" with: %s\n", maj, min,
-					td->o.replay_redirect);
-			strcpy(path, td->o.replay_redirect);
-			found = 1;
-			break;
-		}
-
-		if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
-			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);
-			strcpy(path, full_path);
-			found = 1;
-			break;
-		}
-	}
-
-	closedir(D);
-	return found;
-}
-
 #define FMINORBITS	20
 #define FMINORMASK	((1U << FMINORBITS) - 1)
 #define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
@@ -212,9 +152,16 @@
 		}
 
 	strcpy(dev, "/dev");
-	if (lookup_device(td, dev, maj, min)) {
+	if (blktrace_lookup_device(td->o.replay_redirect, dev, maj, min)) {
 		int fileno;
 
+		if (td->o.replay_redirect)
+			dprint(FD_BLKTRACE, "device lookup: %d/%d"
+					" overridden with: %s\n", maj, min,
+					td->o.replay_redirect);
+		else
+			dprint(FD_BLKTRACE, "device lookup: %d/%d\n", maj, min);
+
 		dprint(FD_BLKTRACE, "add devices %s\n", dev);
 		fileno = add_file_exclusive(td, dev);
 		td->o.open_files++;
@@ -260,11 +207,11 @@
 {
 	switch (t->action) {
 	case BLK_TN_PROCESS:
-		log_info("blktrace: got process notify: %x, %d\n",
+		dprint(FD_BLKTRACE, "got process notify: %x, %d\n",
 				t->action, t->pid);
 		break;
 	case BLK_TN_TIMESTAMP:
-		log_info("blktrace: got timestamp notify: %x, %d\n",
+		dprint(FD_BLKTRACE, "got timestamp notify: %x, %d\n",
 				t->action, t->pid);
 		break;
 	case BLK_TN_MESSAGE:
@@ -275,8 +222,10 @@
 	}
 }
 
-static void handle_trace_discard(struct thread_data *td, struct blk_io_trace *t,
-				 unsigned long long ttime, unsigned long *ios)
+static void handle_trace_discard(struct thread_data *td,
+				 struct blk_io_trace *t,
+				 unsigned long long ttime,
+				 unsigned long *ios, unsigned int *bs)
 {
 	struct io_piece *ipo = malloc(sizeof(*ipo));
 	int fileno;
@@ -284,7 +233,10 @@
 	init_ipo(ipo);
 	fileno = trace_add_file(td, t->device);
 
-	ios[DDIR_WRITE]++;
+	ios[DDIR_TRIM]++;
+	if (t->bytes > bs[DDIR_TRIM])
+		bs[DDIR_TRIM] = t->bytes;
+
 	td->o.size += t->bytes;
 
 	memset(ipo, 0, sizeof(*ipo));
@@ -329,20 +281,30 @@
  * due to internal workings of the block layer.
  */
 static void handle_trace(struct thread_data *td, struct blk_io_trace *t,
-			 unsigned long long ttime, unsigned long *ios,
-			 unsigned int *bs)
+			 unsigned long *ios, unsigned int *bs)
 {
+	static unsigned long long last_ttime;
+	unsigned long long delay;
+
 	if ((t->action & 0xffff) != __BLK_TA_QUEUE)
 		return;
-	if (t->action & BLK_TC_ACT(BLK_TC_PC))
-		return;
+
+	if (!(t->action & BLK_TC_ACT(BLK_TC_NOTIFY))) {
+		if (!last_ttime || td->o.no_stall) {
+			last_ttime = t->time;
+			delay = 0;
+		} else {
+			delay = t->time - last_ttime;
+			last_ttime = t->time;
+		}
+	}
 
 	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
 		handle_trace_notify(t);
 	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
-		handle_trace_discard(td, t, ttime, ios);
+		handle_trace_discard(td, t, delay, ios, bs);
 	else
-		handle_trace_fs(td, t, ttime, ios, bs);
+		handle_trace_fs(td, t, delay, ios, bs);
 }
 
 static void byteswap_trace(struct blk_io_trace *t)
@@ -360,17 +322,20 @@
 	t->pdu_len = fio_swap16(t->pdu_len);
 }
 
+static int t_is_write(struct blk_io_trace *t)
+{
+	return (t->action & BLK_TC_ACT(BLK_TC_WRITE | BLK_TC_DISCARD)) != 0;
+}
+
 /*
  * Load a blktrace file by reading all the blk_io_trace entries, and storing
  * them as io_pieces like the fio text version would do.
  */
 int load_blktrace(struct thread_data *td, const char *filename, int need_swap)
 {
-	unsigned long long ttime, delay;
 	struct blk_io_trace t;
-	unsigned long ios[2], skipped_writes;
-	unsigned int cpu;
-	unsigned int rw_bs[2];
+	unsigned long ios[DDIR_RWDIR_CNT], skipped_writes;
+	unsigned int rw_bs[DDIR_RWDIR_CNT];
 	struct fifo *fifo;
 	int fd, i, old_state;
 	struct fio_file *f;
@@ -388,8 +353,6 @@
 
 	td->o.size = 0;
 
-	cpu = 0;
-	ttime = 0;
 	ios[0] = ios[1] = 0;
 	rw_bs[0] = rw_bs[1] = 0;
 	skipped_writes = 0;
@@ -434,33 +397,14 @@
 				depth = max(depth, this_depth);
 				this_depth = 0;
 			}
-			if (!ttime) {
-				ttime = t.time;
-				cpu = t.cpu;
-			}
 
-			delay = 0;
-			if (cpu == t.cpu)
-				delay = t.time - ttime;
-			if ((t.action & BLK_TC_ACT(BLK_TC_WRITE)) && read_only)
+			if (t_is_write(&t) && read_only) {
 				skipped_writes++;
-			else {
-				/*
-				 * set delay to zero if no_stall enabled for
-				 * fast replay
-				 */
-				if (td->o.no_stall)
-					delay = 0;
-
-				handle_trace(td, &t, delay, ios, rw_bs);
+				continue;
 			}
-
-			ttime = t.time;
-			cpu = t.cpu;
-		} else {
-			delay = 0;
-			handle_trace(td, &t, delay, ios, rw_bs);
 		}
+
+		handle_trace(td, &t, ios, rw_bs);
 	} while (1);
 
 	for (i = 0; i < td->files_index; i++) {
@@ -492,7 +436,7 @@
 	if (!ios[DDIR_READ] && !ios[DDIR_WRITE]) {
 		log_err("fio: found no ios in blktrace data\n");
 		return 1;
-	} else if (ios[DDIR_READ] && !ios[DDIR_READ]) {
+	} else if (ios[DDIR_READ] && !ios[DDIR_WRITE]) {
 		td->o.td_ddir = TD_DDIR_READ;
 		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
 	} else if (!ios[DDIR_READ] && ios[DDIR_WRITE]) {
@@ -502,6 +446,7 @@
 		td->o.td_ddir = TD_DDIR_RW;
 		td->o.max_bs[DDIR_READ] = rw_bs[DDIR_READ];
 		td->o.max_bs[DDIR_WRITE] = rw_bs[DDIR_WRITE];
+		td->o.max_bs[DDIR_TRIM] = rw_bs[DDIR_TRIM];
 	}
 
 	/*
@@ -514,8 +459,8 @@
 	 * we don't know if this option was set or not. it defaults to 1,
 	 * so we'll just guess that we should override it if it's still 1
 	 */
-	if (td->o.iodepth != 1)
-		td->o.iodepth = depth;
+	if (td->o.iodepth == 1)
+		td->o.iodepth = td->o.iodepth_low = depth;
 
 	return 0;
 err:
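
A small sketch of the delay bookkeeping that handle_trace() now does internally: the first event (or a no_stall replay) gets a zero delay, and every later event gets the gap to the previous trace timestamp. The helper name and values below are made up for illustration.

#include <stdint.h>
#include <stdio.h>

/* returns the replay delay for one trace event */
static uint64_t next_delay(uint64_t event_time, int no_stall)
{
	static uint64_t last_time;
	uint64_t delay;

	if (!last_time || no_stall)
		delay = 0;
	else
		delay = event_time - last_time;

	last_time = event_time;
	return delay;
}

int main(void)
{
	uint64_t times[] = { 1000, 1250, 1900 };
	int i;

	/* prints 0, 250, 650 */
	for (i = 0; i < 3; i++)
		printf("delay %llu\n",
			(unsigned long long) next_delay(times[i], 0));
	return 0;
}
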
diff --git a/cconv.c b/cconv.c
index 2f7177d..0fca764 100644
--- a/cconv.c
+++ b/cconv.c
@@ -49,6 +49,9 @@
 {
 	int i, j;
 
+	for (i = 0; i < NR_OPTS_SZ; i++)
+		o->set_options[i] = le64_to_cpu(top->set_options[i]);
+
 	string_to_cpu(&o->description, top->description);
 	string_to_cpu(&o->name, top->name);
 	string_to_cpu(&o->directory, top->directory);
@@ -131,6 +134,7 @@
 	o->verifysort = le32_to_cpu(top->verifysort);
 	o->verifysort_nr = le32_to_cpu(top->verifysort_nr);
 	o->experimental_verify = le32_to_cpu(top->experimental_verify);
+	o->verify_state = le32_to_cpu(top->verify_state);
 	o->verify_interval = le32_to_cpu(top->verify_interval);
 	o->verify_offset = le32_to_cpu(top->verify_offset);
 
@@ -149,8 +153,10 @@
 	o->rand_repeatable = le32_to_cpu(top->rand_repeatable);
 	o->allrand_repeatable = le32_to_cpu(top->allrand_repeatable);
 	o->rand_seed = le64_to_cpu(top->rand_seed);
-	o->use_os_rand = le32_to_cpu(top->use_os_rand);
 	o->log_avg_msec = le32_to_cpu(top->log_avg_msec);
+	o->log_offset = le32_to_cpu(top->log_offset);
+	o->log_gz = le32_to_cpu(top->log_gz);
+	o->log_gz_store = le32_to_cpu(top->log_gz_store);
 	o->norandommap = le32_to_cpu(top->norandommap);
 	o->softrandommap = le32_to_cpu(top->softrandommap);
 	o->bs_unaligned = le32_to_cpu(top->bs_unaligned);
@@ -191,8 +197,6 @@
 	o->stonewall = le32_to_cpu(top->stonewall);
 	o->new_group = le32_to_cpu(top->new_group);
 	o->numjobs = le32_to_cpu(top->numjobs);
-	o->cpumask_set = le32_to_cpu(top->cpumask_set);
-	o->verify_cpumask_set = le32_to_cpu(top->verify_cpumask_set);
 	o->cpus_allowed_policy = le32_to_cpu(top->cpus_allowed_policy);
 	o->iolog = le32_to_cpu(top->iolog);
 	o->rwmixcycle = le32_to_cpu(top->rwmixcycle);
@@ -215,7 +219,6 @@
 	o->unified_rw_rep = le32_to_cpu(top->unified_rw_rep);
 	o->gtod_reduce = le32_to_cpu(top->gtod_reduce);
 	o->gtod_cpu = le32_to_cpu(top->gtod_cpu);
-	o->gtod_offload = le32_to_cpu(top->gtod_offload);
 	o->clocksource = le32_to_cpu(top->clocksource);
 	o->no_stall = le32_to_cpu(top->no_stall);
 	o->trim_percentage = le32_to_cpu(top->trim_percentage);
@@ -238,6 +241,7 @@
 	o->latency_percentile.u.f = fio_uint64_to_double(le64_to_cpu(top->latency_percentile.u.i));
 	o->compress_percentage = le32_to_cpu(top->compress_percentage);
 	o->compress_chunk = le32_to_cpu(top->compress_chunk);
+	o->dedupe_percentage = le32_to_cpu(top->dedupe_percentage);
 
 	o->trim_backlog = le64_to_cpu(top->trim_backlog);
 
@@ -254,6 +258,9 @@
 {
 	int i, j;
 
+	for (i = 0; i < NR_OPTS_SZ; i++)
+		top->set_options[i] = cpu_to_le64(o->set_options[i]);
+
 	string_to_net(top->description, o->description);
 	string_to_net(top->name, o->name);
 	string_to_net(top->directory, o->directory);
@@ -305,6 +312,7 @@
 	top->verifysort = cpu_to_le32(o->verifysort);
 	top->verifysort_nr = cpu_to_le32(o->verifysort_nr);
 	top->experimental_verify = cpu_to_le32(o->experimental_verify);
+	top->verify_state = cpu_to_le32(o->verify_state);
 	top->verify_interval = cpu_to_le32(o->verify_interval);
 	top->verify_offset = cpu_to_le32(o->verify_offset);
 	top->verify_pattern_bytes = cpu_to_le32(o->verify_pattern_bytes);
@@ -319,8 +327,10 @@
 	top->rand_repeatable = cpu_to_le32(o->rand_repeatable);
 	top->allrand_repeatable = cpu_to_le32(o->allrand_repeatable);
 	top->rand_seed = __cpu_to_le64(o->rand_seed);
-	top->use_os_rand = cpu_to_le32(o->use_os_rand);
 	top->log_avg_msec = cpu_to_le32(o->log_avg_msec);
+	top->log_offset = cpu_to_le32(o->log_offset);
+	top->log_gz = cpu_to_le32(o->log_gz);
+	top->log_gz_store = cpu_to_le32(o->log_gz_store);
 	top->norandommap = cpu_to_le32(o->norandommap);
 	top->softrandommap = cpu_to_le32(o->softrandommap);
 	top->bs_unaligned = cpu_to_le32(o->bs_unaligned);
@@ -348,8 +358,6 @@
 	top->stonewall = cpu_to_le32(o->stonewall);
 	top->new_group = cpu_to_le32(o->new_group);
 	top->numjobs = cpu_to_le32(o->numjobs);
-	top->cpumask_set = cpu_to_le32(o->cpumask_set);
-	top->verify_cpumask_set = cpu_to_le32(o->verify_cpumask_set);
 	top->cpus_allowed_policy = cpu_to_le32(o->cpus_allowed_policy);
 	top->iolog = cpu_to_le32(o->iolog);
 	top->rwmixcycle = cpu_to_le32(o->rwmixcycle);
@@ -372,7 +380,6 @@
 	top->unified_rw_rep = cpu_to_le32(o->unified_rw_rep);
 	top->gtod_reduce = cpu_to_le32(o->gtod_reduce);
 	top->gtod_cpu = cpu_to_le32(o->gtod_cpu);
-	top->gtod_offload = cpu_to_le32(o->gtod_offload);
 	top->clocksource = cpu_to_le32(o->clocksource);
 	top->no_stall = cpu_to_le32(o->no_stall);
 	top->trim_percentage = cpu_to_le32(o->trim_percentage);
@@ -395,6 +402,7 @@
 	top->latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(o->latency_percentile.u.f));
 	top->compress_percentage = cpu_to_le32(o->compress_percentage);
 	top->compress_chunk = cpu_to_le32(o->compress_chunk);
+	top->dedupe_percentage = cpu_to_le32(o->dedupe_percentage);
 
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		top->bs[i] = cpu_to_le32(o->bs[i]);
diff --git a/client.c b/client.c
index af6621d..760ec85 100644
--- a/client.c
+++ b/client.c
@@ -23,6 +23,7 @@
 #include "server.h"
 #include "flist.h"
 #include "hash.h"
+#include "verify.h"
 
 static void handle_du(struct fio_client *client, struct fio_net_cmd *cmd);
 static void handle_ts(struct fio_client *client, struct fio_net_cmd *cmd);
@@ -60,7 +61,8 @@
 static struct json_object *root = NULL;
 static struct json_array *clients_array = NULL;
 static struct json_array *du_array = NULL;
-static int do_output_all_clients;
+
+static int error_clients;
 
 #define FIO_CLIENT_HASH_BITS	7
 #define FIO_CLIENT_HASH_SZ	(1 << FIO_CLIENT_HASH_BITS)
@@ -89,6 +91,30 @@
 		INIT_FLIST_HEAD(&client_hash[i]);
 }
 
+static int read_data(int fd, void *data, size_t size)
+{
+	ssize_t ret;
+
+	while (size) {
+		ret = read(fd, data, size);
+		if (ret < 0) {
+			if (errno == EAGAIN || errno == EINTR)
+				continue;
+			break;
+		} else if (!ret)
+			break;
+		else {
+			data += ret;
+			size -= ret;
+		}
+	}
+
+	if (size)
+		return EAGAIN;
+
+	return 0;
+}
+
 static void fio_client_json_init(void)
 {
 	if (output_format != FIO_OUTPUT_JSON)
@@ -141,13 +167,19 @@
 		free(client->argv);
 	if (client->name)
 		free(client->name);
-	while (client->nr_ini_file)
-		free(client->ini_file[--client->nr_ini_file]);
-	if (client->ini_file)
-		free(client->ini_file);
+	while (client->nr_files) {
+		struct client_file *cf = &client->files[--client->nr_files];
+
+		free(cf->file);
+	}
+	if (client->files)
+		free(client->files);
 
 	if (!client->did_stat)
-		sum_stat_clients -= client->nr_stat;
+		sum_stat_clients--;
+
+	if (client->error)
+		error_clients++;
 
 	free(client);
 }
@@ -262,17 +294,29 @@
 	return NULL;
 }
 
-void fio_client_add_ini_file(void *cookie, const char *ini_file)
+int fio_client_add_ini_file(void *cookie, const char *ini_file, int remote)
 {
 	struct fio_client *client = cookie;
+	struct client_file *cf;
 	size_t new_size;
+	void *new_files;
+
+	if (!client)
+		return 1;
 
 	dprint(FD_NET, "client <%s>: add ini %s\n", client->hostname, ini_file);
 
-	new_size = (client->nr_ini_file + 1) * sizeof(char *);
-	client->ini_file = realloc(client->ini_file, new_size);
-	client->ini_file[client->nr_ini_file] = strdup(ini_file);
-	client->nr_ini_file++;
+	new_size = (client->nr_files + 1) * sizeof(struct client_file);
+	new_files = realloc(client->files, new_size);
+	if (!new_files)
+		return 1;
+
+	client->files = new_files;
+	cf = &client->files[client->nr_files];
+	cf->file = strdup(ini_file);
+	cf->remote = remote;
+	client->nr_files++;
+	return 0;
 }
 
 int fio_client_add(struct client_ops *ops, const char *hostname, void **cookie)
@@ -323,10 +367,27 @@
 	return 0;
 }
 
+static const char *server_name(struct fio_client *client, char *buf,
+			       size_t bufsize)
+{
+	const char *from;
+
+	if (client->ipv6)
+		from = inet_ntop(AF_INET6, (struct sockaddr *) &client->addr6.sin6_addr, buf, bufsize);
+	else if (client->is_sock)
+		from = "sock";
+	else
+		from = inet_ntop(AF_INET, (struct sockaddr *) &client->addr.sin_addr, buf, bufsize);
+
+	return from;
+}
+
 static void probe_client(struct fio_client *client)
 {
 	struct cmd_client_probe_pdu pdu;
+	const char *sname;
 	uint64_t tag;
+	char buf[64];
 
 	dprint(FD_NET, "client: send probe\n");
 
@@ -336,6 +397,10 @@
 	pdu.flags = 0;
 #endif
 
+	sname = server_name(client, buf, sizeof(buf));
+	memset(pdu.server, 0, sizeof(pdu.server));
+	strncpy((char *) pdu.server, sname, sizeof(pdu.server) - 1);
+
 	fio_net_send_cmd(client->fd, FIO_NET_CMD_PROBE, &pdu, sizeof(pdu), &tag, &client->cmd_list);
 }
 
@@ -599,11 +664,34 @@
 	return flist_empty(&client_list);
 }
 
+static int __fio_client_send_remote_ini(struct fio_client *client,
+					const char *filename)
+{
+	struct cmd_load_file_pdu *pdu;
+	size_t p_size;
+	int ret;
+
+	dprint(FD_NET, "send remote ini %s to %s\n", filename, client->hostname);
+
+	p_size = sizeof(*pdu) + strlen(filename) + 1;
+	pdu = malloc(p_size);
+	memset(pdu, 0, p_size);
+	pdu->name_len = strlen(filename);
+	strcpy((char *) pdu->file, filename);
+	pdu->client_type = cpu_to_le16((uint16_t) client->type);
+
+	client->sent_job = 1;
+	ret = fio_net_send_cmd(client->fd, FIO_NET_CMD_LOAD_FILE, pdu, p_size,
+				NULL, NULL);
+	free(pdu);
+	return ret;
+}
+
 /*
  * Send file contents to server backend. We could use sendfile(), but to remain
  * more portable lets just read/write the darn thing.
  */
-static int __fio_client_send_ini(struct fio_client *client, const char *filename)
+static int __fio_client_send_local_ini(struct fio_client *client,
+				       const char *filename)
 {
 	struct cmd_job_pdu *pdu;
 	size_t p_size;
@@ -617,15 +705,13 @@
 
 	fd = open(filename, O_RDONLY);
 	if (fd < 0) {
-		int ret = -errno;
-
+		ret = -errno;
 		log_err("fio: job file <%s> open: %s\n", filename, strerror(errno));
 		return ret;
 	}
 
 	if (fstat(fd, &sb) < 0) {
-		int ret = -errno;
-
+		ret = -errno;
 		log_err("fio: job file stat: %s\n", strerror(errno));
 		close(fd);
 		return ret;
@@ -637,21 +723,7 @@
 
 	len = sb.st_size;
 	p = buf;
-	do {
-		ret = read(fd, p, len);
-		if (ret > 0) {
-			len -= ret;
-			if (!len)
-				break;
-			p += ret;
-			continue;
-		} else if (!ret)
-			break;
-		else if (errno == EAGAIN || errno == EINTR)
-			continue;
-	} while (1);
-
-	if (len) {
+	if (read_data(fd, p, len)) {
 		log_err("fio: failed reading job file %s\n", filename);
 		close(fd);
 		free(pdu);
@@ -668,17 +740,28 @@
 	return ret;
 }
 
-int fio_client_send_ini(struct fio_client *client, const char *filename)
+int fio_client_send_ini(struct fio_client *client, const char *filename,
+			int remote)
 {
 	int ret;
 
-	ret = __fio_client_send_ini(client, filename);
+	if (!remote)
+		ret = __fio_client_send_local_ini(client, filename);
+	else
+		ret = __fio_client_send_remote_ini(client, filename);
+
 	if (!ret)
 		client->sent_job = 1;
 
 	return ret;
 }
 
+static int fio_client_send_cf(struct fio_client *client,
+			      struct client_file *cf)
+{
+	return fio_client_send_ini(client, cf->file, cf->remote);
+}
+
 int fio_clients_send_ini(const char *filename)
 {
 	struct fio_client *client;
@@ -687,18 +770,23 @@
 	flist_for_each_safe(entry, tmp, &client_list) {
 		client = flist_entry(entry, struct fio_client, list);
 
-		if (client->nr_ini_file) {
+		if (client->nr_files) {
 			int i;
 
-			for (i = 0; i < client->nr_ini_file; i++) {
-				const char *ini = client->ini_file[i];
+			for (i = 0; i < client->nr_files; i++) {
+				struct client_file *cf;
 
-				if (fio_client_send_ini(client, ini)) {
+				cf = &client->files[i];
+
+				if (fio_client_send_cf(client, cf)) {
 					remove_client(client);
 					break;
 				}
 			}
-		} else if (!filename || fio_client_send_ini(client, filename))
+		}
+		if (client->sent_job)
+			continue;
+		if (!filename || fio_client_send_ini(client, filename, 0))
 			remove_client(client);
 	}
 
@@ -754,6 +842,7 @@
 	dst->minf		= le64_to_cpu(src->minf);
 	dst->majf		= le64_to_cpu(src->majf);
 	dst->clat_percentiles	= le64_to_cpu(src->clat_percentiles);
+	dst->percentile_precision = le64_to_cpu(src->percentile_precision);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
 		fio_fp64_t *fps = &src->percentile_list[i];
@@ -780,6 +869,7 @@
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		dst->total_io_u[i]	= le64_to_cpu(src->total_io_u[i]);
 		dst->short_io_u[i]	= le64_to_cpu(src->short_io_u[i]);
+		dst->drop_io_u[i]	= le64_to_cpu(src->drop_io_u[i]);
 	}
 
 	dst->total_submit	= le64_to_cpu(src->total_submit);
@@ -823,9 +913,11 @@
 }
 
 static void json_object_add_client_info(struct json_object *obj,
-struct fio_client *client)
+					struct fio_client *client)
 {
-	json_object_add_value_string(obj, "hostname", client->hostname);
+	const char *hostname = client->hostname ? client->hostname : "";
+
+	json_object_add_value_string(obj, "hostname", hostname);
 	json_object_add_value_int(obj, "port", client->port);
 }
 
@@ -841,7 +933,7 @@
 		json_array_add_value_object(clients_array, tsobj);
 	}
 
-	if (!do_output_all_clients)
+	if (sum_stat_clients <= 1)
 		return;
 
 	sum_thread_stats(&client_ts, &p->ts, sum_stat_nr);
@@ -890,16 +982,16 @@
 	int i;
 
 	for (i = 0; i < 2; i++) {
-		agg->ios[i]	= le32_to_cpu(agg->ios[i]);
-		agg->merges[i]	= le32_to_cpu(agg->merges[i]);
+		agg->ios[i]	= le64_to_cpu(agg->ios[i]);
+		agg->merges[i]	= le64_to_cpu(agg->merges[i]);
 		agg->sectors[i]	= le64_to_cpu(agg->sectors[i]);
-		agg->ticks[i]	= le32_to_cpu(agg->ticks[i]);
+		agg->ticks[i]	= le64_to_cpu(agg->ticks[i]);
 	}
 
-	agg->io_ticks		= le32_to_cpu(agg->io_ticks);
-	agg->time_in_queue	= le32_to_cpu(agg->time_in_queue);
+	agg->io_ticks		= le64_to_cpu(agg->io_ticks);
+	agg->time_in_queue	= le64_to_cpu(agg->time_in_queue);
 	agg->slavecount		= le32_to_cpu(agg->slavecount);
-	agg->max_util.u.f	= fio_uint64_to_double(__le64_to_cpu(agg->max_util.u.i));
+	agg->max_util.u.f	= fio_uint64_to_double(le64_to_cpu(agg->max_util.u.i));
 }
 
 static void convert_dus(struct disk_util_stat *dus)
@@ -907,14 +999,14 @@
 	int i;
 
 	for (i = 0; i < 2; i++) {
-		dus->s.ios[i]		= le32_to_cpu(dus->s.ios[i]);
-		dus->s.merges[i]	= le32_to_cpu(dus->s.merges[i]);
+		dus->s.ios[i]		= le64_to_cpu(dus->s.ios[i]);
+		dus->s.merges[i]	= le64_to_cpu(dus->s.merges[i]);
 		dus->s.sectors[i]	= le64_to_cpu(dus->s.sectors[i]);
-		dus->s.ticks[i]		= le32_to_cpu(dus->s.ticks[i]);
+		dus->s.ticks[i]		= le64_to_cpu(dus->s.ticks[i]);
 	}
 
-	dus->s.io_ticks		= le32_to_cpu(dus->s.io_ticks);
-	dus->s.time_in_queue	= le32_to_cpu(dus->s.time_in_queue);
+	dus->s.io_ticks		= le64_to_cpu(dus->s.io_ticks);
+	dus->s.time_in_queue	= le64_to_cpu(dus->s.time_in_queue);
 	dus->s.msec		= le64_to_cpu(dus->s.msec);
 }
 
@@ -987,7 +1079,12 @@
 		dst->eta_sec = je->eta_sec;
 
 	dst->nr_threads		+= je->nr_threads;
-	/* we need to handle je->run_str too ... */
+
+	/*
+	 * This won't be correct for multiple strings, but at least it
+	 * works for the basic cases.
+	 */
+	strcpy((char *) dst->run_str, (char *) je->run_str);
 }
 
 void fio_client_dec_jobs_eta(struct client_eta *eta, client_eta_op eta_fn)
@@ -1098,9 +1195,6 @@
 	client->jobs = le32_to_cpu(pdu->jobs);
 	client->nr_stat = le32_to_cpu(pdu->stat_outputs);
 
-	if (sum_stat_clients > 1)
-		do_output_all_clients = 1;
-
 	sum_stat_clients += client->nr_stat;
 }
 
@@ -1149,9 +1243,9 @@
 	/*
 	 * Get header first, it's not compressed
 	 */
-	nr_samples = le32_to_cpu(pdu->nr_samples);
+	nr_samples = le64_to_cpu(pdu->nr_samples);
 
-	total = nr_samples * sizeof(struct io_sample);
+	total = nr_samples * __log_entry_sz(le32_to_cpu(pdu->log_offset));
 	ret = malloc(total + sizeof(*pdu));
 	ret->nr_samples = nr_samples;
 
@@ -1201,7 +1295,8 @@
 {
 	struct cmd_iolog_pdu *pdu = (struct cmd_iolog_pdu *) cmd->payload;
 	struct cmd_iolog_pdu *ret;
-	int i;
+	uint64_t i;
+	void *samples;
 
 	/*
 	 * Convert if compressed and we support it. If it's not
@@ -1220,23 +1315,73 @@
 	} else
 		ret = pdu;
 
+	ret->nr_samples		= le64_to_cpu(ret->nr_samples);
 	ret->thread_number	= le32_to_cpu(ret->thread_number);
-	ret->nr_samples		= le32_to_cpu(ret->nr_samples);
 	ret->log_type		= le32_to_cpu(ret->log_type);
 	ret->compressed		= le32_to_cpu(ret->compressed);
+	ret->log_offset		= le32_to_cpu(ret->log_offset);
 
+	samples = &ret->samples[0];
 	for (i = 0; i < ret->nr_samples; i++) {
-		struct io_sample *s = &ret->samples[i];
+		struct io_sample *s;
 
-		s->time	= le64_to_cpu(s->time);
-		s->val	= le64_to_cpu(s->val);
-		s->ddir	= le32_to_cpu(s->ddir);
-		s->bs	= le32_to_cpu(s->bs);
+		s = __get_sample(samples, ret->log_offset, i);
+		s->time		= le64_to_cpu(s->time);
+		s->val		= le64_to_cpu(s->val);
+		s->__ddir	= le32_to_cpu(s->__ddir);
+		s->bs		= le32_to_cpu(s->bs);
+
+		if (ret->log_offset) {
+			struct io_sample_offset *so = (void *) s;
+
+			so->offset = le64_to_cpu(so->offset);
+		}
 	}
 
 	return ret;
 }
 
+static void sendfile_reply(int fd, struct cmd_sendfile_reply *rep,
+			   size_t size, uint64_t tag)
+{
+	rep->error = cpu_to_le32(rep->error);
+	fio_net_send_cmd(fd, FIO_NET_CMD_SENDFILE, rep, size, &tag, NULL);
+}
+
+static int send_file(struct fio_client *client, struct cmd_sendfile *pdu,
+		     uint64_t tag)
+{
+	struct cmd_sendfile_reply *rep;
+	struct stat sb;
+	size_t size;
+	int fd;
+
+	size = sizeof(*rep);
+	rep = malloc(size);
+
+	if (stat((char *)pdu->path, &sb) < 0) {
+fail:
+		rep->error = errno;
+		sendfile_reply(client->fd, rep, size, tag);
+		free(rep);
+		return 1;
+	}
+
+	size += sb.st_size;
+	rep = realloc(rep, size);
+	rep->size = cpu_to_le32((uint32_t) sb.st_size);
+
+	fd = open((char *)pdu->path, O_RDONLY);
+	if (fd == -1)
+		goto fail;
+
+	rep->error = read_data(fd, &rep->data, sb.st_size);
+	sendfile_reply(client->fd, rep, size, tag);
+	free(rep);
+	close(fd);
+	return 0;
+}
+
 int fio_handle_client(struct fio_client *client)
 {
 	struct client_ops *ops = client->ops;
@@ -1256,12 +1401,10 @@
 		if (ops->quit)
 			ops->quit(client, cmd);
 		remove_client(client);
-		free(cmd);
 		break;
 	case FIO_NET_CMD_TEXT:
 		convert_text(cmd);
 		ops->text(client, cmd);
-		free(cmd);
 		break;
 	case FIO_NET_CMD_DU: {
 		struct cmd_du_pdu *du = (struct cmd_du_pdu *) cmd->payload;
@@ -1270,7 +1413,6 @@
 		convert_agg(&du->agg);
 
 		ops->disk_util(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_TS: {
@@ -1280,7 +1422,6 @@
 		convert_gs(&p->rs, &p->rs);
 
 		ops->thread_status(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_GS: {
@@ -1289,7 +1430,6 @@
 		convert_gs(gs, gs);
 
 		ops->group_stats(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_ETA: {
@@ -1298,26 +1438,22 @@
 		remove_reply_cmd(client, cmd);
 		convert_jobs_eta(je);
 		handle_eta(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_PROBE:
 		remove_reply_cmd(client, cmd);
 		ops->probe(client, cmd);
-		free(cmd);
 		break;
 	case FIO_NET_CMD_SERVER_START:
 		client->state = Client_running;
 		if (ops->job_start)
 			ops->job_start(client, cmd);
-		free(cmd);
 		break;
 	case FIO_NET_CMD_START: {
 		struct cmd_start_pdu *pdu = (struct cmd_start_pdu *) cmd->payload;
 
 		pdu->jobs = le32_to_cpu(pdu->jobs);
 		ops->start(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_STOP: {
@@ -1328,7 +1464,6 @@
 		client->error = le32_to_cpu(pdu->error);
 		client->signal = le32_to_cpu(pdu->signal);
 		ops->stop(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_ADD_JOB: {
@@ -1339,7 +1474,6 @@
 
 		if (ops->add_job)
 			ops->add_job(client, cmd);
-		free(cmd);
 		break;
 		}
 	case FIO_NET_CMD_IOLOG:
@@ -1349,22 +1483,63 @@
 			pdu = convert_iolog(cmd);
 			ops->iolog(client, pdu);
 		}
-		free(cmd);
 		break;
 	case FIO_NET_CMD_UPDATE_JOB:
 		ops->update_job(client, cmd);
 		remove_reply_cmd(client, cmd);
-		free(cmd);
 		break;
+	case FIO_NET_CMD_VTRIGGER: {
+		struct all_io_list *pdu = (struct all_io_list *) cmd->payload;
+		char buf[64];
+
+		__verify_save_state(pdu, server_name(client, buf, sizeof(buf)));
+		exec_trigger(trigger_cmd);
+		break;
+		}
+	case FIO_NET_CMD_SENDFILE: {
+		struct cmd_sendfile *pdu = (struct cmd_sendfile *) cmd->payload;
+		send_file(client, pdu, cmd->tag);
+		break;
+		}
 	default:
 		log_err("fio: unknown client op: %s\n", fio_server_op(cmd->opcode));
-		free(cmd);
 		break;
 	}
 
+	free(cmd);
 	return 1;
 }
 
+int fio_clients_send_trigger(const char *cmd)
+{
+	struct flist_head *entry;
+	struct fio_client *client;
+	size_t slen;
+
+	dprint(FD_NET, "client: send vtrigger: %s\n", cmd);
+
+	if (!cmd)
+		slen = 0;
+	else
+		slen = strlen(cmd);
+
+	flist_for_each(entry, &client_list) {
+		struct cmd_vtrigger_pdu *pdu;
+
+		client = flist_entry(entry, struct fio_client, list);
+
+		pdu = malloc(sizeof(*pdu) + slen);
+		pdu->len = cpu_to_le16((uint16_t) slen);
+		if (slen)
+			memcpy(pdu->cmd, cmd, slen);
+		fio_net_send_cmd(client->fd, FIO_NET_CMD_VTRIGGER, pdu,
+					sizeof(*pdu) + slen, NULL, NULL);
+		free(pdu);
+	}
+
+	return 0;
+}
+
 static void request_client_etas(struct client_ops *ops)
 {
 	struct fio_client *client;
@@ -1374,8 +1549,7 @@
 
 	dprint(FD_NET, "client: request eta (%d)\n", nr_clients);
 
-	eta = malloc(sizeof(*eta));
-	memset(&eta->eta, 0, sizeof(eta->eta));
+	eta = calloc(1, sizeof(*eta) + __THREAD_RUNSTR_SZ(REAL_MAX_JOBS));
 	eta->pending = nr_clients;
 
 	flist_for_each(entry, &client_list) {
@@ -1447,6 +1621,7 @@
 		else
 			log_err("fio: client %s timed out\n", client->hostname);
 
+		client->error = ETIMEDOUT;
 		remove_client(client);
 		ret = 1;
 	}
@@ -1492,6 +1667,7 @@
 
 		do {
 			struct timeval tv;
+			int timeout;
 
 			fio_gettime(&tv, NULL);
 			if (mtime_since(&eta_tv, &tv) >= 900) {
@@ -1502,7 +1678,11 @@
 					break;
 			}
 
-			ret = poll(pfds, nr_clients, ops->eta_msec);
+			check_trigger_file();
+
+			timeout = min(100u, ops->eta_msec);
+
+			ret = poll(pfds, nr_clients, timeout);
 			if (ret < 0) {
 				if (errno == EINTR)
 					continue;
@@ -1535,5 +1715,5 @@
 	fio_client_json_fini();
 
 	free(pfds);
-	return retval;
+	return retval || error_clients;
 }
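
The new read_data() helper is the usual full-read loop: retry on EINTR/EAGAIN, advance past short reads, and report failure if any bytes remain. A compilable sketch of the same pattern, with illustrative names:

#include <errno.h>
#include <unistd.h>

/* read exactly 'size' bytes, or return EAGAIN if we could not */
int read_all(int fd, void *data, size_t size)
{
	char *p = data;

	while (size) {
		ssize_t ret = read(fd, p, size);

		if (ret < 0) {
			if (errno == EAGAIN || errno == EINTR)
				continue;
			break;
		} else if (!ret)
			break;		/* EOF before we got everything */

		p += ret;
		size -= ret;
	}

	return size ? EAGAIN : 0;
}
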
diff --git a/client.h b/client.h
index c8ff23e..8818de2 100644
--- a/client.h
+++ b/client.h
@@ -20,6 +20,11 @@
 	Client_exited		= 5,
 };
 
+struct client_file {
+	char *file;
+	int remote;
+};
+
 struct fio_client {
 	struct flist_head list;
 	struct flist_head hash_list;
@@ -64,8 +69,8 @@
 	struct client_ops *ops;
 	void *client_data;
 
-	char **ini_file;
-	unsigned int nr_ini_file;
+	struct client_file *files;
+	unsigned int nr_files;
 };
 
 struct cmd_iolog_pdu;
@@ -119,19 +124,20 @@
 extern int fio_clients_connect(void);
 extern int fio_start_client(struct fio_client *);
 extern int fio_start_all_clients(void);
-extern int fio_client_send_ini(struct fio_client *, const char *);
 extern int fio_clients_send_ini(const char *);
+extern int fio_client_send_ini(struct fio_client *, const char *, int);
 extern int fio_handle_clients(struct client_ops *);
 extern int fio_client_add(struct client_ops *, const char *, void **);
 extern struct fio_client *fio_client_add_explicit(struct client_ops *, const char *, int, int);
 extern void fio_client_add_cmd_option(void *, const char *);
-extern void fio_client_add_ini_file(void *, const char *);
+extern int fio_client_add_ini_file(void *, const char *, int);
 extern int fio_client_terminate(struct fio_client *);
 extern void fio_clients_terminate(void);
 extern struct fio_client *fio_get_client(struct fio_client *);
 extern void fio_put_client(struct fio_client *);
 extern int fio_client_update_options(struct fio_client *, struct thread_options *, uint64_t *);
 extern int fio_client_wait_for_reply(struct fio_client *, uint64_t);
+extern int fio_clients_send_trigger(const char *);
 
 #define FIO_CLIENT_DEF_ETA_MSEC		900
 
diff --git a/compiler/compiler-gcc4.h b/compiler/compiler-gcc4.h
index f136611..e8701cf 100644
--- a/compiler/compiler-gcc4.h
+++ b/compiler/compiler-gcc4.h
@@ -5,4 +5,13 @@
 #define __must_check		__attribute__((warn_unused_result))
 #endif
 
+#define GCC_VERSION (__GNUC__ * 10000		\
+			+ __GNUC_MINOR__ * 100	\
+			+ __GNUC_PATCHLEVEL__)
+
+#if GCC_VERSION >= 40300
+#define __compiletime_warning(message)	__attribute__((warning(message)))
+#define __compiletime_error(message)	__attribute__((error(message)))
+#endif
+
 #endif
diff --git a/compiler/compiler.h b/compiler/compiler.h
index 0a0213b..40e857c 100644
--- a/compiler/compiler.h
+++ b/compiler/compiler.h
@@ -22,4 +22,37 @@
 
 #define fio_unlikely(x)	__builtin_expect(!!(x), 0)
 
+/*
+ * Check at compile time that something is of a particular type.
+ * Always evaluates to 1 so you may use it easily in comparisons.
+ */
+#define typecheck(type,x) \
+({	type __dummy; \
+	typeof(x) __dummy2; \
+	(void)(&__dummy == &__dummy2); \
+	1; \
+})
+
+#ifndef __compiletime_error
+#define __compiletime_error(message)
+#endif
+#ifndef __compiletime_error_fallback
+#define __compiletime_error_fallback(condition)	do { } while (0)
+#endif
+
+#define __compiletime_assert(condition, msg, prefix, suffix)		\
+	do {								\
+		int __cond = !(condition);				\
+		extern void prefix ## suffix(void) __compiletime_error(msg); \
+		if (__cond)						\
+			prefix ## suffix();				\
+		__compiletime_error_fallback(__cond);			\
+	} while (0)
+
+#define _compiletime_assert(condition, msg, prefix, suffix) \
+	__compiletime_assert(condition, msg, prefix, suffix)
+
+#define compiletime_assert(condition, msg) \
+	_compiletime_assert(condition, msg, __compiletime_assert_, __LINE__)
+
 #endif
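
A hypothetical use of the new compiletime_assert(): guard a structure layout so a size change breaks the build (on gcc >= 4.3, where __compiletime_error() maps to the error attribute; older compilers fall back to a no-op). The struct and size below are invented for the example and assume compiler/compiler.h is on the include path.

#include <stdint.h>
#include "compiler/compiler.h"

struct wire_header {
	uint32_t magic;
	uint32_t length;
};

int main(void)
{
	/* fails to build if the on-wire size ever changes */
	compiletime_assert(sizeof(struct wire_header) == 8,
			   "wire_header must stay 8 bytes");
	return 0;
}
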
diff --git a/configure b/configure
index 2ba1daf..892335b 100755
--- a/configure
+++ b/configure
@@ -133,7 +133,8 @@
 # default options
 show_help="no"
 exit_val=0
-gfio="no"
+gfio_check="no"
+libhdfs="no"
 
 # parse options
 for opt do
@@ -141,17 +142,31 @@
   case "$opt" in
   --cpu=*) cpu="$optarg"
   ;;
+  #  esx is cross-compiled and cannot be detected through simple uname calls
+  --esx)
+  esx="yes"
+  ;;
   --cc=*) CC="$optarg"
   ;;
   --extra-cflags=*) CFLAGS="$CFLAGS $optarg"
   ;;
   --build-32bit-win) build_32bit_win="yes"
   ;;
+  --build-static) build_static="yes"
+  ;;
   --enable-gfio)
-  gfio="yes"
+  gfio_check="yes"
   ;;
   --disable-numa) disable_numa="yes"
   ;;
+  --disable-rbd) disable_rbd="yes"
+  ;;
+  --disable-gfapi) disable_gfapi="yes"
+  ;;
+  --enable-libhdfs) libhdfs="yes"
+  ;;
+  --disable-shm) output_sym "CONFIG_NO_SHM"
+  ;;
   --help)
     show_help="yes"
     ;;
@@ -167,8 +182,11 @@
   echo "--cc=                  Specify compiler to use"
   echo "--extra-cflags=        Specify extra CFLAGS to pass to compiler"
   echo "--build-32bit-win      Enable 32-bit build on Windows"
+  echo "--build-static         Build a static fio"
+  echo "--esx                  Configure build options for esx"
   echo "--enable-gfio          Enable building of gtk gfio"
   echo "--disable-numa         Disable libnuma even if found"
+  echo "--enable-libhdfs       Enable hdfs support"
   exit $exit_val
 fi
 
@@ -387,6 +405,16 @@
 echo
 
 ##########################################
+# See if we need to build a static build
+if test "$build_static" = "yes" ; then
+  CFLAGS="$CFLAGS -ffunction-sections -fdata-sections"
+  LDFLAGS="$LDFLAGS -static -Wl,--gc-sections"
+else
+  build_static="no"
+fi
+echo "Static build                  $build_static"
+
+##########################################
 # check for wordsize
 wordsize="0"
 cat > $TMPC <<EOF
@@ -859,7 +887,7 @@
 int main(int argc, char **argv)
 {
   struct bitmask *mask = numa_parse_nodestring(NULL);
-  return 0;
+  return mask->size == 0;
 }
 EOF
 if compile_prog "" "" "libnuma api"; then
@@ -969,7 +997,8 @@
 
 ##########################################
 # Check if we have required gtk/glib support for gfio
-if test "$gfio" = "yes" ; then
+gfio="no"
+if test "$gfio_check" = "yes" ; then
   cat > $TMPC << EOF
 #include <glib.h>
 #include <cairo.h>
@@ -983,6 +1012,8 @@
 }
 EOF
 GTK_CFLAGS=$(pkg-config --cflags gtk+-2.0 gthread-2.0)
+ORG_LDFLAGS=$LDFLAGS
+LDFLAGS=$(echo $LDFLAGS | sed s/"-static"//g)
 if test "$?" != "0" ; then
   echo "configure: gtk and gthread not found"
   exit 1
@@ -996,7 +1027,7 @@
   r=$($TMPE)
   if test "$r" != "0" ; then
     gfio="yes"
-    LIBS="$LIBS $GTK_LIBS"
+    GFIO_LIBS="$LIBS $GTK_LIBS"
     CFLAGS="$CFLAGS $GTK_CFLAGS"
   else
     echo "GTK found, but need version 2.18 or higher"
@@ -1006,9 +1037,12 @@
   echo "Please install gtk and gdk libraries"
   gfio="no"
 fi
+LDFLAGS=$ORG_LDFLAGS
 fi
 
-echo "gtk 2.18 or higher            $gfio"
+if test "$gfio_check" = "yes" ; then
+  echo "gtk 2.18 or higher            $gfio"
+fi
 
 # Check whether we have getrusage(RUSAGE_THREAD)
 rusage_thread="no"
@@ -1062,6 +1096,45 @@
 echo "TCP_NODELAY                   $tcp_nodelay"
 
 ##########################################
+# Check whether we have SO_SNDBUF
+window_size="no"
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+int main(int argc, char **argv)
+{
+  setsockopt(0, SOL_SOCKET, SO_SNDBUF, NULL, 0);
+  setsockopt(0, SOL_SOCKET, SO_RCVBUF, NULL, 0);
+}
+EOF
+if compile_prog "" "" "SO_SNDBUF"; then
+  window_size="yes"
+fi
+echo "Net engine window_size        $window_size"
+
+##########################################
+# Check whether we have TCP_MAXSEG
+mss="no"
+cat > $TMPC << EOF
+#include <stdio.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <netinet/tcp.h>
+#include <arpa/inet.h>
+#include <netinet/in.h>
+int main(int argc, char **argv)
+{
+  return setsockopt(0, IPPROTO_TCP, TCP_MAXSEG, NULL, 0);
+}
+EOF
+if compile_prog "" "" "TCP_MAXSEG"; then
+  mss="yes"
+fi
+echo "TCP_MAXSEG                    $mss"
+
+##########################################
 # Check whether we have RLIMIT_MEMLOCK
 rlimit_memlock="no"
 cat > $TMPC << EOF
@@ -1141,13 +1214,33 @@
   return 0;
 }
 EOF
-if compile_prog "" "-lrbd -lrados" "rbd"; then
+if test "$disable_rbd" != "yes"  && compile_prog "" "-lrbd -lrados" "rbd"; then
   LIBS="-lrbd -lrados $LIBS"
   rbd="yes"
 fi
 echo "Rados Block Device engine     $rbd"
 
 ##########################################
+# check for rbd_invalidate_cache()
+rbd_inval="no"
+if test "$rbd" = "yes"; then
+cat > $TMPC << EOF
+#include <rbd/librbd.h>
+
+int main(int argc, char **argv)
+{
+  rbd_image_t image;
+
+  return rbd_invalidate_cache(image);
+}
+EOF
+if compile_prog "" "-lrbd -lrados" "rbd"; then
+  rbd_inval="yes"
+fi
+echo "rbd_invalidate_cache          $rbd_inval"
+fi
+
+##########################################
 # Check whether we have setvbuf
 setvbuf="no"
 cat > $TMPC << EOF
@@ -1165,6 +1258,64 @@
 fi
 echo "setvbuf                       $setvbuf"
 
+# check for gfapi
+gfapi="no"
+cat > $TMPC << EOF
+#include <glusterfs/api/glfs.h>
+
+int main(int argc, char **argv)
+{
+
+  glfs_t *g = glfs_new("foo");
+
+  return 0;
+}
+EOF
+if test "$disable_gfapi" != "yes"  && compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then
+  LIBS="-lgfapi -lglusterfs $LIBS"
+  gfapi="yes"
+fi
+echo "Gluster API engine            $gfapi"
+
+##########################################
+# check for gfapi fadvise support
+if test "$gfapi" = "yes" ; then
+gf_fadvise="no"
+cat > $TMPC << EOF
+#include <glusterfs/api/glfs.h>
+
+int main(int argc, char **argv)
+{
+  struct glfs_fd *fd;
+  int ret = glfs_fadvise(fd, 0, 0, 1);
+
+  return 0;
+}
+EOF
+if compile_prog "" "-lgfapi -lglusterfs" "gfapi"; then
+  gf_fadvise="yes"
+fi
+echo "Gluster API use fadvise       $gf_fadvise"
+fi
+
+##########################################
+# check for gfapi trim support
+gf_trim="no"
+if test "$gfapi" = "yes" ; then
+cat > $TMPC << EOF
+#include <glusterfs/api/glfs.h>
+
+int main(int argc, char **argv)
+{
+  return glfs_discard_async(NULL, 0, 0);
+}
+EOF
+if compile_prog "" "-lgfapi -lglusterfs" "gf trim"; then
+  gf_trim="yes"
+fi
+echo "Gluster API trim support      $gf_trim"
+fi
+
 ##########################################
 # Check if we support stckf on s390
 s390_z196_facilities="no"
@@ -1195,6 +1346,73 @@
   fi
 fi
 echo "s390_z196_facilities          $s390_z196_facilities"
+
+##########################################
+# Check if we have required environment variables configured for libhdfs
+if test "$libhdfs" = "yes" ; then
+  hdfs_conf_error=0
+  if test "$JAVA_HOME" = "" ; then
+    echo "configure: JAVA_HOME should be defined to jdk/jvm path"
+    hdfs_conf_error=1
+  fi
+  if test "$FIO_LIBHDFS_INCLUDE" = "" ; then
+    echo "configure: FIO_LIBHDFS_INCLUDE should be defined to libhdfs inlude path"
+    hdfs_conf_error=1
+  fi
+  if test "$FIO_LIBHDFS_LIB" = "" ; then
+    echo "configure: FIO_LIBHDFS_LIB should be defined to libhdfs library path"
+    hdfs_conf_error=1
+  fi
+  if test "$hdfs_conf_error" = "1" ; then
+    exit 1
+  fi
+fi
+echo "HDFS engine                   $libhdfs"
+
+# Check if we have lex/yacc available
+yacc="no"
+yacc_is_bison="no"
+lex="no"
+arith="no"
+if test "$targetos" != "SunOS" ; then
+LEX=$(which lex 2> /dev/null)
+if test -x "$LEX" ; then
+  lex="yes"
+fi
+YACC=$(which bison 2> /dev/null)
+if test -x "$YACC" ; then
+  yacc="yes"
+  yacc_is_bison="yes"
+else
+  YACC=$(which yacc 2> /dev/null)
+  if test -x "$YACC" ; then
+    yacc="yes"
+  fi
+fi
+if test "$yacc" = "yes" && test "$lex" = "yes" ; then
+  arith="yes"
+fi
+
+if test "$arith" = "yes" ; then
+cat > $TMPC << EOF
+extern int yywrap(void);
+
+int main(int argc, char **argv)
+{
+  yywrap();
+  return 0;
+}
+EOF
+if compile_prog "" "-ll" "lex"; then
+  LIBS="-ll $LIBS"
+else
+  arith="no"
+fi
+fi
+fi
+
+echo "lex/yacc for arithmetic       $arith"
+
 #############################################################################
 
 if test "$wordsize" = "64" ; then
@@ -1301,12 +1519,22 @@
 if test "$gfio" = "yes" ; then
   echo "CONFIG_GFIO=y" >> $config_host_mak
 fi
+if test "$esx" = "yes" ; then
+  output_sym "CONFIG_ESX"
+  output_sym "CONFIG_NO_SHM"
+fi
 if test "$sched_idle" = "yes" ; then
   output_sym "CONFIG_SCHED_IDLE"
 fi
 if test "$tcp_nodelay" = "yes" ; then
   output_sym "CONFIG_TCP_NODELAY"
 fi
+if test "$window_size" = "yes" ; then
+  output_sym "CONFIG_NET_WINDOWSIZE"
+fi
+if test "$mss" = "yes" ; then
+  output_sym "CONFIG_NET_MSS"
+fi
 if test "$rlimit_memlock" = "yes" ; then
   output_sym "CONFIG_RLIMIT_MEMLOCK"
 fi
@@ -1319,6 +1547,9 @@
 if test "$rbd" = "yes" ; then
   output_sym "CONFIG_RBD"
 fi
+if test "$rbd_inval" = "yes" ; then
+  output_sym "CONFIG_RBD_INVAL"
+fi
 if test "$setvbuf" = "yes" ; then
   output_sym "CONFIG_SETVBUF"
 fi
@@ -1326,8 +1557,37 @@
   output_sym "CONFIG_S390_Z196_FACILITIES"
   CFLAGS="$CFLAGS -march=z9-109"
 fi
+if test "$gfapi" = "yes" ; then
+  output_sym "CONFIG_GFAPI"
+fi
+if test "$gf_fadvise" = "yes" ; then
+  output_sym "CONFIG_GF_FADVISE"
+fi
+if test "$gf_trim" = "yes" ; then
+  output_sym "CONFIG_GF_TRIM"
+fi
+if test "$libhdfs" = "yes" ; then
+  output_sym "CONFIG_LIBHDFS"
+  echo "JAVA_HOME=$JAVA_HOME" >> $config_host_mak
+  echo "FIO_LIBHDFS_INCLUDE=$FIO_LIBHDFS_INCLUDE" >> $config_host_mak
+  echo "FIO_LIBHDFS_LIB=$FIO_LIBHDFS_LIB" >> $config_host_mak
+fi
+if test "$arith" = "yes" ; then
+  output_sym "CONFIG_ARITHMETIC"
+  if test "$yacc_is_bison" = "yes" ; then
+    echo "YACC=$YACC -y" >> $config_host_mak
+  else
+    echo "YACC=$YACC" >> $config_host_mak
+  fi
+fi
+
+if test "$zlib" = "no" ; then
+  echo "Consider installing zlib-dev (zlib-devel), some fio features depend on it."
+fi
 
 echo "LIBS+=$LIBS" >> $config_host_mak
+echo "GFIO_LIBS+=$GFIO_LIBS" >> $config_host_mak
 echo "CFLAGS+=$CFLAGS" >> $config_host_mak
+echo "LDFLAGS+=$LDFLAGS" >> $config_host_mak
 echo "CC=$cc" >> $config_host_mak
 echo "BUILD_CFLAGS=$BUILD_CFLAGS $CFLAGS" >> $config_host_mak
diff --git a/crc/fnv.c b/crc/fnv.c
new file mode 100644
index 0000000..04c0560
--- /dev/null
+++ b/crc/fnv.c
@@ -0,0 +1,16 @@
+#include "fnv.h"
+
+#define FNV_PRIME	0x100000001b3ULL
+
+uint64_t fnv(const void *buf, uint32_t len, uint64_t hval)
+{
+	const uint64_t *ptr = buf;
+	const uint64_t *end = (void *) buf + len;
+
+	while (ptr < end) {
+		hval *= FNV_PRIME;
+		hval ^= (uint64_t) *ptr++;
+	}
+
+	return hval;
+}
diff --git a/crc/fnv.h b/crc/fnv.h
new file mode 100644
index 0000000..ef2b77b
--- /dev/null
+++ b/crc/fnv.h
@@ -0,0 +1,8 @@
+#ifndef FIO_FNV_H
+#define FIO_FNV_H
+
+#include <inttypes.h>
+
+uint64_t fnv(const void *, uint32_t, uint64_t);
+
+#endif
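
An illustrative caller of the new fnv() helper (link against crc/fnv.c). The seed shown is the standard 64-bit FNV offset basis, chosen only for the example; note the implementation consumes the buffer in 64-bit words, so pass a length that is a multiple of 8.

#include <stdio.h>
#include "crc/fnv.h"	/* path relative to the fio tree */

int main(void)
{
	uint64_t buf[64] = { 1, 2, 3, };
	uint64_t h;

	/* hash the whole 512-byte buffer in one call */
	h = fnv(buf, sizeof(buf), 0xcbf29ce484222325ULL);
	printf("fnv: %llx\n", (unsigned long long) h);
	return 0;
}
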
diff --git a/crc/md5.c b/crc/md5.c
index 0da85e4..64fe48a 100644
--- a/crc/md5.c
+++ b/crc/md5.c
@@ -125,3 +125,23 @@
 
 	memcpy(mctx->block, data, len);
 }
+
+void fio_md5_final(struct fio_md5_ctx *mctx)
+{
+	const unsigned int offset = mctx->byte_count & 0x3f;
+	char *p = (char *)mctx->block + offset;
+	int padding = 56 - (offset + 1);
+
+	*p++ = 0x80;
+	if (padding < 0) {
+		memset(p, 0x00, padding + sizeof (uint64_t));
+		md5_transform(mctx->hash, mctx->block);
+		p = (char *)mctx->block;
+		padding = 56;
+	}
+
+	memset(p, 0, padding);
+	mctx->block[14] = mctx->byte_count << 3;
+	mctx->block[15] = mctx->byte_count >> 29;
+	md5_transform(mctx->hash, mctx->block);
+}
diff --git a/crc/md5.h b/crc/md5.h
index 668f0e9..54e350c 100644
--- a/crc/md5.h
+++ b/crc/md5.h
@@ -23,6 +23,7 @@
 };
 
 extern void fio_md5_update(struct fio_md5_ctx *, const uint8_t *, unsigned int);
+extern void fio_md5_final(struct fio_md5_ctx *);
 extern void fio_md5_init(struct fio_md5_ctx *);
 
 #endif
diff --git a/crc/murmur3.c b/crc/murmur3.c
new file mode 100644
index 0000000..e316f59
--- /dev/null
+++ b/crc/murmur3.c
@@ -0,0 +1,68 @@
+#include "murmur3.h"
+
+static inline uint32_t rotl32(uint32_t x, int8_t r)
+{
+	return (x << r) | (x >> (32 - r));
+}
+
+//-----------------------------------------------------------------------------
+// Finalization mix - force all bits of a hash block to avalanche
+
+static inline uint32_t fmix32(uint32_t h)
+{
+	h ^= h >> 16;
+	h *= 0x85ebca6b;
+	h ^= h >> 13;
+	h *= 0xc2b2ae35;
+	h ^= h >> 16;
+
+	return h;
+}
+
+static uint32_t murmur3_tail(const uint8_t *data, const int nblocks,
+			     uint32_t len, const uint32_t c1,
+			     const uint32_t c2, uint32_t h1)
+{
+	const uint8_t *tail = (const uint8_t *)(data + nblocks * 4);
+
+	uint32_t k1 = 0;
+	switch (len & 3) {
+	case 3:
+		k1 ^= tail[2] << 16;
+	case 2:
+		k1 ^= tail[1] << 8;
+	case 1:
+		k1 ^= tail[0];
+		k1 *= c1;
+		k1 = rotl32(k1, 15);
+		k1 *= c2;
+		h1 ^= k1;
+	};
+
+	return fmix32(h1 ^ len);
+}
+
+uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed)
+{
+	const uint8_t *data = (const uint8_t *)key;
+	const int nblocks = len / 4;
+	uint32_t h1 = seed;
+	const uint32_t c1 = 0xcc9e2d51;
+	const uint32_t c2 = 0x1b873593;
+	const uint32_t *blocks = (const uint32_t *)(data + nblocks * 4);
+	int i;
+
+	for (i = -nblocks; i; i++) {
+		uint32_t k1 = blocks[i];
+
+		k1 *= c1;
+		k1 = rotl32(k1, 15);
+		k1 *= c2;
+
+		h1 ^= k1;
+		h1 = rotl32(h1, 13);
+		h1 = h1 * 5 + 0xe6546b64;
+	}
+
+	return murmur3_tail(data, nblocks, len, c1, c2, h1);
+}
diff --git a/crc/murmur3.h b/crc/murmur3.h
new file mode 100644
index 0000000..89f6500
--- /dev/null
+++ b/crc/murmur3.h
@@ -0,0 +1,8 @@
+#ifndef FIO_MURMUR3_H
+#define FIO_MURMUR3_H
+
+#include <inttypes.h>
+
+uint32_t murmurhash3(const void *key, uint32_t len, uint32_t seed);
+
+#endif
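
An illustrative caller of the new murmurhash3() helper (link against crc/murmur3.c); the key and seed are arbitrary example values.

#include <stdio.h>
#include <string.h>
#include "crc/murmur3.h"	/* path relative to the fio tree */

int main(void)
{
	const char *key = "fio";
	uint32_t h = murmurhash3(key, strlen(key), 0x8989);

	printf("murmur3: %x\n", h);
	return 0;
}
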
diff --git a/crc/sha1.c b/crc/sha1.c
index 117fbd9..8d64c8e 100644
--- a/crc/sha1.c
+++ b/crc/sha1.c
@@ -55,7 +55,7 @@
 		memcpy(ctx->W, data, len);
 }
 
-void fio_sha1_final(unsigned char hashout[20], struct fio_sha1_ctx *ctx)
+void fio_sha1_final(struct fio_sha1_ctx *ctx)
 {
 	static const unsigned char pad[64] = { 0x80 };
 	unsigned int padlen[2];
@@ -69,11 +69,6 @@
 	i = ctx->size & 63;
 	fio_sha1_update(ctx, pad, 1+ (63 & (55 - i)));
 	fio_sha1_update(ctx, padlen, 8);
-
-	/* Output hash
-	 */
-	for (i = 0; i < 5; i++)
-		((unsigned int *)hashout)[i] = htonl(ctx->H[i]);
 }
 
 #if defined(__i386__) || defined(__x86_64__)
diff --git a/crc/sha1.h b/crc/sha1.h
index 14af44a..75317f7 100644
--- a/crc/sha1.h
+++ b/crc/sha1.h
@@ -15,6 +15,6 @@
 
 void fio_sha1_init(struct fio_sha1_ctx *);
 void fio_sha1_update(struct fio_sha1_ctx *, const void *dataIn, unsigned long len);
-void fio_sha1_final(unsigned char hashout[20], struct fio_sha1_ctx *);
+void fio_sha1_final(struct fio_sha1_ctx *);
 
 #endif
diff --git a/crc/sha256.c b/crc/sha256.c
index 3a72a5b..2fd17a3 100644
--- a/crc/sha256.c
+++ b/crc/sha256.c
@@ -237,37 +237,57 @@
 	sctx->state[5] = H5;
 	sctx->state[6] = H6;
 	sctx->state[7] = H7;
-	sctx->count[0] = sctx->count[1] = 0;
+	sctx->count = 0;
 }
 
 void fio_sha256_update(struct fio_sha256_ctx *sctx, const uint8_t *data,
 		       unsigned int len)
 {
-	unsigned int i, idx, part_len;
+	unsigned int partial, done;
+	const uint8_t *src;
 
-	/* Compute number of bytes mod 128 */
-	idx = (unsigned int)((sctx->count[0] >> 3) & 0x3f);
+	partial = sctx->count & 0x3f;
+	sctx->count += len;
+	done = 0;
+	src = data;
 
-	/* Update number of bits */
-	if ((sctx->count[0] += (len << 3)) < (len << 3)) {
-		sctx->count[1]++;
-		sctx->count[1] += (len >> 29);
+	if ((partial + len) > 63) {
+		if (partial) {
+			done = -partial;
+			memcpy(sctx->buf + partial, data, done + 64);
+			src = sctx->buf;
+		}
+
+		do {
+			sha256_transform(sctx->state, src);
+			done += 64;
+			src = data + done;
+		} while (done + 63 < len);
+
+		partial = 0;
 	}
+	memcpy(sctx->buf + partial, src, len - done);
+}
 
-	part_len = 64 - idx;
+void fio_sha256_final(struct fio_sha256_ctx *sctx)
+{
+	uint64_t bits;
+	unsigned int index, pad_len;
+	int i;
+	static const uint8_t padding[64] = { 0x80, };
 
-	/* Transform as many times as possible. */
-	if (len >= part_len) {
-		memcpy(&sctx->buf[idx], data, part_len);
-		sha256_transform(sctx->state, sctx->buf);
+	/* Save number of bits */
+	bits = (uint64_t) sctx->count << 3;
 
-		for (i = part_len; i + 63 < len; i += 64)
-			sha256_transform(sctx->state, &data[i]);
-		idx = 0;
-	} else {
-		i = 0;
-	}
-	
-	/* Buffer remaining input */
-	memcpy(&sctx->buf[idx], &data[i], len-i);
+	/* Pad out to 56 mod 64. */
+	index = sctx->count & 0x3f;
+	pad_len = (index < 56) ? (56 - index) : ((64+56) - index);
+	fio_sha256_update(sctx, padding, pad_len);
+
+	/* Append length (before padding) */
+	fio_sha256_update(sctx, (const uint8_t *)&bits, sizeof(bits));
+
+	/* Store state in digest */
+	for (i = 0; i < 8; i++)
+		sctx->buf[i] = sctx->state[i];
 }
diff --git a/crc/sha256.h b/crc/sha256.h
index c7aa28f..b636033 100644
--- a/crc/sha256.h
+++ b/crc/sha256.h
@@ -1,13 +1,17 @@
 #ifndef FIO_SHA256_H
 #define FIO_SHA256_H
 
+#define SHA256_DIGEST_SIZE	32
+#define SHA256_BLOCK_SIZE	64
+
 struct fio_sha256_ctx {
-	uint32_t count[2];
-	uint32_t state[8];
+	uint32_t count;
+	uint32_t state[SHA256_DIGEST_SIZE / 4];
 	uint8_t *buf;
 };
 
 void fio_sha256_init(struct fio_sha256_ctx *);
 void fio_sha256_update(struct fio_sha256_ctx *, const uint8_t *, unsigned int);
+void fio_sha256_final(struct fio_sha256_ctx *);
 
 #endif
diff --git a/crc/test.c b/crc/test.c
index 3773b71..dbc5653 100644
--- a/crc/test.c
+++ b/crc/test.c
@@ -17,6 +17,9 @@
 #include "../crc/sha256.h"
 #include "../crc/sha512.h"
 #include "../crc/xxhash.h"
+#include "../crc/murmur3.h"
+#include "../crc/fnv.h"
+#include "../hash.h"
 
 #include "test.h"
 
@@ -26,7 +29,8 @@
 struct test_type {
 	const char *name;
 	unsigned int mask;
-	uint64_t (*fn)(void);
+	void (*fn)(struct test_type *, void *, size_t);
+	uint32_t output;
 };
 
 enum {
@@ -40,224 +44,140 @@
 	T_SHA256	= 1U << 7,
 	T_SHA512	= 1U << 8,
 	T_XXHASH	= 1U << 9,
+	T_MURMUR3	= 1U << 10,
+	T_JHASH		= 1U << 11,
+	T_FNV		= 1U << 12,
 };
 
-static void randomize_buf(void *buf, unsigned int size, int seed)
-{
-	struct frand_state state;
-
-	init_rand_seed(&state, seed);
-	fill_random_buf(&state, buf, size);
-}
-
-static uint64_t t_md5(void)
+static void t_md5(struct test_type *t, void *buf, size_t size)
 {
 	uint32_t digest[4];
 	struct fio_md5_ctx ctx = { .hash = digest };
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
 	fio_md5_init(&ctx);
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
-	for (i = 0; i < NR_CHUNKS; i++)
-		fio_md5_update(&ctx, buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_md5_update(&ctx, buf, size);
+		fio_md5_final(&ctx);
+	}
 }
 
-static uint64_t t_crc64(void)
+static void t_crc64(struct test_type *t, void *buf, size_t size)
 {
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc64(buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_crc64(buf, size);
 }
 
-static uint64_t t_crc32(void)
+static void t_crc32(struct test_type *t, void *buf, size_t size)
 {
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc32(buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_crc32(buf, size);
 }
 
-static uint64_t t_crc32c(void)
+static void t_crc32c(struct test_type *t, void *buf, size_t size)
 {
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc32c(buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_crc32c(buf, size);
 }
 
-static uint64_t t_crc16(void)
+static void t_crc16(struct test_type *t, void *buf, size_t size)
 {
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc16(buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_crc16(buf, size);
 }
 
-static uint64_t t_crc7(void)
+static void t_crc7(struct test_type *t, void *buf, size_t size)
 {
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_crc7(buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_crc7(buf, size);
 }
 
-static uint64_t t_sha1(void)
+static void t_sha1(struct test_type *t, void *buf, size_t size)
 {
 	uint32_t sha[5];
 	struct fio_sha1_ctx ctx = { .H = sha };
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
 	fio_sha1_init(&ctx);
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
-	for (i = 0; i < NR_CHUNKS; i++)
-		fio_sha1_update(&ctx, buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha1_update(&ctx, buf, size);
+		fio_sha1_final(&ctx);
+	}
 }
 
-static uint64_t t_sha256(void)
+static void t_sha256(struct test_type *t, void *buf, size_t size)
 {
 	uint8_t sha[64];
 	struct fio_sha256_ctx ctx = { .buf = sha };
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
 	fio_sha256_init(&ctx);
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
-	for (i = 0; i < NR_CHUNKS; i++)
-		fio_sha256_update(&ctx, buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+	for (i = 0; i < NR_CHUNKS; i++) {
+		fio_sha256_update(&ctx, buf, size);
+		fio_sha256_final(&ctx);
+	}
 }
 
-static uint64_t t_sha512(void)
+static void t_sha512(struct test_type *t, void *buf, size_t size)
 {
 	uint8_t sha[128];
 	struct fio_sha512_ctx ctx = { .buf = sha };
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
 	fio_sha512_init(&ctx);
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		fio_sha512_update(&ctx, buf, CHUNK);
-
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+		fio_sha512_update(&ctx, buf, size);
 }
 
-static uint64_t t_xxhash(void)
+static void t_murmur3(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		murmurhash3(buf, size, 0x8989);
+}
+
+static void t_jhash(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += jhash(buf, size, 0x8989);
+}
+
+static void t_fnv(struct test_type *t, void *buf, size_t size)
+{
+	int i;
+
+	for (i = 0; i < NR_CHUNKS; i++)
+		t->output += fnv(buf, size, 0x8989);
+}
+
+static void t_xxhash(struct test_type *t, void *buf, size_t size)
 {
 	void *state;
-	struct timeval s;
-	uint64_t ret;
-	void *buf;
 	int i;
 
 	state = XXH32_init(0x8989);
 
-	buf = malloc(CHUNK);
-	randomize_buf(buf, CHUNK, 0x8989);
-
-	fio_gettime(&s, NULL);
 	for (i = 0; i < NR_CHUNKS; i++)
-		XXH32_update(state, buf, CHUNK);
+		XXH32_update(state, buf, size);
 
-	XXH32_digest(state);
-	ret = utime_since_now(&s);
-	free(buf);
-	return ret;
+	t->output = XXH32_digest(state);
 }
 
 static struct test_type t[] = {
@@ -312,6 +232,21 @@
 		.fn = t_xxhash,
 	},
 	{
+		.name = "murmur3",
+		.mask = T_MURMUR3,
+		.fn = t_murmur3,
+	},
+	{
+		.name = "jhash",
+		.mask = T_JHASH,
+		.fn = t_jhash,
+	},
+	{
+		.name = "fnv",
+		.mask = T_FNV,
+		.fn = t_fnv,
+	},
+	{
 		.name = NULL,
 	},
 };
@@ -345,14 +280,16 @@
 	for (i = 0; t[i].name; i++)
 		printf("%s\n", t[i].name);
 
-	return 0;
+	return 1;
 }
 
 int fio_crctest(const char *type)
 {
 	unsigned int test_mask = 0;
 	uint64_t mb = CHUNK * NR_CHUNKS;
-	int i;
+	struct frand_state state;
+	int i, first = 1;
+	void *buf;
 
 	crc32c_intel_probe();
 
@@ -363,18 +300,50 @@
 	else
 		test_mask = get_test_mask(type);
 
+	if (!test_mask) {
+		fprintf(stderr, "fio: unknown hash `%s`. Available:\n", type);
+		return list_types();
+	}
+
+	buf = malloc(CHUNK);
+	init_rand_seed(&state, 0x8989);
+	fill_random_buf(&state, buf, CHUNK);
+
 	for (i = 0; t[i].name; i++) {
+		struct timeval tv;
 		double mb_sec;
 		uint64_t usec;
+		char pre[3];
 
 		if (!(t[i].mask & test_mask))
 			continue;
 
-		usec = t[i].fn();
-		mb_sec = (double) mb / (double) usec;
-		mb_sec /= (1.024 * 1.024);
-		printf("%s:\t%.2f MB/sec\n", t[i].name, mb_sec);
+		/*
+		 * For first run, make sure CPUs are spun up and that
+		 * we've touched the data.
+		 */
+		if (first) {
+			usec_spin(100000);
+			t[i].fn(&t[i], buf, CHUNK);
+		}
+
+		fio_gettime(&tv, NULL);
+		t[i].fn(&t[i], buf, CHUNK);
+		usec = utime_since_now(&tv);
+
+		if (usec) {
+			mb_sec = (double) mb / (double) usec;
+			mb_sec /= (1.024 * 1.024);
+			if (strlen(t[i].name) >= 7)
+				sprintf(pre, "\t");
+			else
+				sprintf(pre, "\t\t");
+			printf("%s:%s%8.2f MB/sec\n", t[i].name, pre, mb_sec);
+		} else
+			printf("%s:inf MB/sec\n", t[i].name);
+		first = 0;
 	}
 
+	free(buf);
 	return 0;
 }
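
The reworked crc/test loop above times each hash over NR_CHUNKS passes of a CHUNK-sized buffer and then converts elapsed microseconds into MB/sec. A minimal sketch of that arithmetic, with illustrative CHUNK/NR_CHUNKS and timing values (the real constants live in fio's crc test, not here):

    #include <stdio.h>
    #include <stdint.h>

    /* Illustrative sizes; the real CHUNK/NR_CHUNKS come from fio's crc test. */
    #define CHUNK     131072ULL
    #define NR_CHUNKS 2048ULL

    int main(void)
    {
        uint64_t bytes = CHUNK * NR_CHUNKS;     /* total data hashed */
        uint64_t usec = 250000;                 /* pretend the loop took 250 ms */

        /* bytes per microsecond == 10^6 bytes per second (decimal MB/s) */
        double mb_sec = (double) bytes / (double) usec;

        /* dividing by 1.024 * 1.024 rescales decimal MB/s to binary MiB/s */
        mb_sec /= (1.024 * 1.024);

        printf("%8.2f MB/sec\n", mb_sec);
        return 0;
    }
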
diff --git a/crc/xxhash.c b/crc/xxhash.c
index eedaecb..4736c52 100644
--- a/crc/xxhash.c
+++ b/crc/xxhash.c
@@ -221,7 +221,7 @@
 }
 
 
-uint32_t XXH32(const void* input, int len, uint32_t seed)
+uint32_t XXH32(const void* input, uint32_t len, uint32_t seed)
 {
 #if 0
     // Simple version, good for code maintenance, but unfortunately slow for small inputs
diff --git a/crc/xxhash.h b/crc/xxhash.h
index e80a91d..8850d20 100644
--- a/crc/xxhash.h
+++ b/crc/xxhash.h
@@ -88,7 +88,7 @@
 // Simple Hash Functions
 //****************************
 
-unsigned int XXH32 (const void* input, int len, unsigned int seed);
+uint32_t XXH32 (const void* input, uint32_t len, uint32_t seed);
 
 /*
 XXH32() :
diff --git a/debug.h b/debug.h
index e248695..923fa39 100644
--- a/debug.h
+++ b/debug.h
@@ -20,6 +20,7 @@
 	FD_TIME,
 	FD_NET,
 	FD_RATE,
+	FD_COMPRESS,
 	FD_DEBUG_MAX,
 };
 
diff --git a/diskutil.c b/diskutil.c
index cb285cf..52d87f6 100644
--- a/diskutil.c
+++ b/diskutil.c
@@ -30,7 +30,7 @@
 	while (!flist_empty(&du->slaves)) {
 		struct disk_util *slave;
 
-		slave = flist_entry(du->slaves.next, struct disk_util, slavelist);
+		slave = flist_first_entry(&du->slaves, struct disk_util, slavelist);
 		flist_del(&slave->slavelist);
 		slave->users--;
 	}
@@ -62,14 +62,18 @@
 
 	dprint(FD_DISKUTIL, "%s: %s", du->path, p);
 
-	ret = sscanf(p, "%u %u %llu %u %u %u %llu %u %u %u %u\n",
-					&dus->s.ios[0],
-					&dus->s.merges[0], &sectors[0],
-					&dus->s.ticks[0], &dus->s.ios[1],
-					&dus->s.merges[1], &sectors[1],
-					&dus->s.ticks[1], &in_flight,
-					&dus->s.io_ticks,
-					&dus->s.time_in_queue);
+	ret = sscanf(p, "%llu %llu %llu %llu %llu %llu %llu %llu %u %llu %llu\n",
+				(unsigned long long *) &dus->s.ios[0],
+				(unsigned long long *) &dus->s.merges[0],
+				&sectors[0],
+				(unsigned long long *) &dus->s.ticks[0],
+				(unsigned long long *) &dus->s.ios[1],
+				(unsigned long long *) &dus->s.merges[1],
+				&sectors[1],
+				(unsigned long long *) &dus->s.ticks[1],
+				&in_flight,
+				(unsigned long long *) &dus->s.io_ticks,
+				(unsigned long long *) &dus->s.time_in_queue);
 	fclose(f);
 	dprint(FD_DISKUTIL, "%s: stat read ok? %d\n", du->path, ret == 1);
 	dus->s.sectors[0] = sectors[0];
@@ -117,7 +121,7 @@
 
 	fio_mutex_down(disk_util_mutex);
 
-	if (!disk_util_exit) {
+	if (!helper_exit) {
 		flist_for_each(entry, &disk_list) {
 			du = flist_entry(entry, struct disk_util, list);
 			update_io_tick_disk(du);
@@ -497,26 +501,27 @@
 		return;
 
 	if (!terse) {
-		log_info(", aggrios=%u/%u, aggrmerge=%u/%u, aggrticks=%u/%u,"
-				" aggrin_queue=%u, aggrutil=%3.2f%%",
-				agg->ios[0] / agg->slavecount,
-				agg->ios[1] / agg->slavecount,
-				agg->merges[0] / agg->slavecount,
-				agg->merges[1] / agg->slavecount,
-				agg->ticks[0] / agg->slavecount,
-				agg->ticks[1] / agg->slavecount,
-				agg->time_in_queue / agg->slavecount,
-				agg->max_util.u.f);
+		log_info(", aggrios=%llu/%llu, aggrmerge=%llu/%llu, "
+			 "aggrticks=%llu/%llu, aggrin_queue=%llu, "
+			 "aggrutil=%3.2f%%",
+			(unsigned long long) agg->ios[0] / agg->slavecount,
+			(unsigned long long) agg->ios[1] / agg->slavecount,
+			(unsigned long long) agg->merges[0] / agg->slavecount,
+			(unsigned long long) agg->merges[1] / agg->slavecount,
+			(unsigned long long) agg->ticks[0] / agg->slavecount,
+			(unsigned long long) agg->ticks[1] / agg->slavecount,
+			(unsigned long long) agg->time_in_queue / agg->slavecount,
+			agg->max_util.u.f);
 	} else {
-		log_info(";slaves;%u;%u;%u;%u;%u;%u;%u;%3.2f%%",
-				agg->ios[0] / agg->slavecount,
-				agg->ios[1] / agg->slavecount,
-				agg->merges[0] / agg->slavecount,
-				agg->merges[1] / agg->slavecount,
-				agg->ticks[0] / agg->slavecount,
-				agg->ticks[1] / agg->slavecount,
-				agg->time_in_queue / agg->slavecount,
-				agg->max_util.u.f);
+		log_info(";slaves;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
+			(unsigned long long) agg->ios[0] / agg->slavecount,
+			(unsigned long long) agg->ios[1] / agg->slavecount,
+			(unsigned long long) agg->merges[0] / agg->slavecount,
+			(unsigned long long) agg->merges[1] / agg->slavecount,
+			(unsigned long long) agg->ticks[0] / agg->slavecount,
+			(unsigned long long) agg->ticks[1] / agg->slavecount,
+			(unsigned long long) agg->time_in_queue / agg->slavecount,
+			agg->max_util.u.f);
 	}
 }
 
@@ -562,7 +567,7 @@
 	while (!flist_empty(&disk_list)) {
 		struct disk_util *du;
 
-		du = flist_entry(disk_list.next, struct disk_util, list);
+		du = flist_first_entry(&disk_list, struct disk_util, list);
 		flist_del(&du->list);
 		disk_util_free(du);
 	}
@@ -586,19 +591,28 @@
 		if (agg->slavecount)
 			log_info("  ");
 
-		log_info("  %s: ios=%u/%u, merge=%u/%u, ticks=%u/%u, "
-			 "in_queue=%u, util=%3.2f%%", dus->name,
-					dus->s.ios[0], dus->s.ios[1],
-					dus->s.merges[0], dus->s.merges[1],
-					dus->s.ticks[0], dus->s.ticks[1],
-					dus->s.time_in_queue, util);
+		log_info("  %s: ios=%llu/%llu, merge=%llu/%llu, "
+			 "ticks=%llu/%llu, in_queue=%llu, util=%3.2f%%",
+				dus->name,
+				(unsigned long long) dus->s.ios[0],
+				(unsigned long long) dus->s.ios[1],
+				(unsigned long long) dus->s.merges[0],
+				(unsigned long long) dus->s.merges[1],
+				(unsigned long long) dus->s.ticks[0],
+				(unsigned long long) dus->s.ticks[1],
+				(unsigned long long) dus->s.time_in_queue,
+				util);
 	} else {
-		log_info(";%s;%u;%u;%u;%u;%u;%u;%u;%3.2f%%",
-					dus->name, dus->s.ios[0],
-					dus->s.ios[1], dus->s.merges[0],
-					dus->s.merges[1], dus->s.ticks[0],
-					dus->s.ticks[1],
-					dus->s.time_in_queue, util);
+		log_info(";%s;%llu;%llu;%llu;%llu;%llu;%llu;%llu;%3.2f%%",
+				dus->name,
+				(unsigned long long) dus->s.ios[0],
+				(unsigned long long) dus->s.ios[1],
+				(unsigned long long) dus->s.merges[0],
+				(unsigned long long) dus->s.merges[1],
+				(unsigned long long) dus->s.ticks[0],
+				(unsigned long long) dus->s.ticks[1],
+				(unsigned long long) dus->s.time_in_queue,
+				util);
 	}
 
 	/*
@@ -680,6 +694,9 @@
 	struct flist_head *entry;
 	struct disk_util *du;
 
+	if (!disk_util_mutex)
+		return;
+
 	fio_mutex_down(disk_util_mutex);
 
 	if (flist_empty(&disk_list)) {
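
The sscanf() change above widens the disk stat counters to 64 bits by scanning with %llu. A minimal sketch of the same parse, assuming the usual 11-field block-device 'stat' layout, using unsigned long long temporaries so no pointer casts are needed; the struct and field names below are illustrative, not fio's:

    #include <stdio.h>
    #include <stdint.h>

    struct dstat {
        uint64_t ios[2], merges[2], sectors[2], ticks[2];
        uint64_t io_ticks, time_in_queue;
        uint32_t in_flight;
    };

    static int parse_stat_line(const char *p, struct dstat *d)
    {
        unsigned long long v[10];
        unsigned int in_flight;
        int ret;

        ret = sscanf(p, "%llu %llu %llu %llu %llu %llu %llu %llu %u %llu %llu",
                     &v[0], &v[1], &v[2], &v[3], &v[4], &v[5], &v[6], &v[7],
                     &in_flight, &v[8], &v[9]);
        if (ret != 11)
            return -1;

        d->ios[0] = v[0]; d->merges[0] = v[1]; d->sectors[0] = v[2];
        d->ticks[0] = v[3];
        d->ios[1] = v[4]; d->merges[1] = v[5]; d->sectors[1] = v[6];
        d->ticks[1] = v[7];
        d->in_flight = in_flight;
        d->io_ticks = v[8];
        d->time_in_queue = v[9];
        return 0;
    }

    int main(void)
    {
        struct dstat d;
        const char *line = "8 2 160 12 4 1 40 6 0 18 18";

        if (!parse_stat_line(line, &d))
            printf("read ios=%llu write ios=%llu\n",
                   (unsigned long long) d.ios[0],
                   (unsigned long long) d.ios[1]);
        return 0;
    }
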
diff --git a/diskutil.h b/diskutil.h
index d86e4ec..c0ae0ed 100644
--- a/diskutil.h
+++ b/diskutil.h
@@ -3,15 +3,15 @@
 #include "json.h"
 #define FIO_DU_NAME_SZ		64
 
-extern volatile int disk_util_exit;
+extern volatile int helper_exit;
 
 struct disk_util_stats {
-	uint32_t ios[2];
-	uint32_t merges[2];
+	uint64_t ios[2];
+	uint64_t merges[2];
 	uint64_t sectors[2];
-	uint32_t ticks[2];
-	uint32_t io_ticks;
-	uint32_t time_in_queue;
+	uint64_t ticks[2];
+	uint64_t io_ticks;
+	uint64_t time_in_queue;
 	uint64_t msec;
 };
 
@@ -24,13 +24,14 @@
 };
 
 struct disk_util_agg {
-	uint32_t ios[2];
-	uint32_t merges[2];
+	uint64_t ios[2];
+	uint64_t merges[2];
 	uint64_t sectors[2];
-	uint32_t ticks[2];
-	uint32_t io_ticks;
-	uint32_t time_in_queue;
+	uint64_t ticks[2];
+	uint64_t io_ticks;
+	uint64_t time_in_queue;
 	uint32_t slavecount;
+	uint32_t pad;
 	fio_fp64_t max_util;
 };
 
@@ -100,8 +101,6 @@
 
 extern struct flist_head disk_list;
 
-extern void wait_for_disk_thread_exit(void);
-
 /*
  * disk util stuff
  */
@@ -127,12 +126,8 @@
 
 static inline int update_io_ticks(void)
 {
-	return disk_util_exit;
+	return helper_exit;
 }
 #endif
 
-static inline void disk_util_start_exit(void)
-{
-	disk_util_exit = 1;
-}
 #endif
diff --git a/engines/binject.c b/engines/binject.c
index 43e3169..f8e83cd 100644
--- a/engines/binject.c
+++ b/engines/binject.c
@@ -62,14 +62,14 @@
 static unsigned int binject_read_commands(struct thread_data *td, void *p,
 					  int left, int *err)
 {
-	struct binject_file *bf;
 	struct fio_file *f;
 	int i, ret, events;
 
 one_more:
 	events = 0;
 	for_each_file(td, f, i) {
-		bf = (struct binject_file *) (uintptr_t) f->engine_data;
+		struct binject_file *bf = FILE_ENG_DATA(f);
+
 		ret = read(bf->fd, p, left * sizeof(struct b_user_cmd));
 		if (ret < 0) {
 			if (errno == EAGAIN)
@@ -91,20 +91,20 @@
 }
 
 static int fio_binject_getevents(struct thread_data *td, unsigned int min,
-			      unsigned int max, struct timespec fio_unused *t)
+				 unsigned int max,
+				 const struct timespec fio_unused *t)
 {
 	struct binject_data *bd = td->io_ops->data;
 	int left = max, ret, r = 0, ev_index = 0;
 	void *buf = bd->cmds;
 	unsigned int i, events;
 	struct fio_file *f;
-	struct binject_file *bf;
 
 	/*
 	 * Fill in the file descriptors
 	 */
 	for_each_file(td, f, i) {
-		bf = (struct binject_file *) (uintptr_t) f->engine_data;
+		struct binject_file *bf = FILE_ENG_DATA(f);
 
 		/*
 		 * don't block for min events == 0
@@ -154,7 +154,7 @@
 
 	if (!min) {
 		for_each_file(td, f, i) {
-			bf = (struct binject_file *) (uintptr_t) f->engine_data;
+			struct binject_file *bf = FILE_ENG_DATA(f);
 
 			if (bd->fd_flags[i] == -1)
 				continue;
@@ -173,7 +173,7 @@
 static int fio_binject_doio(struct thread_data *td, struct io_u *io_u)
 {
 	struct b_user_cmd *buc = &io_u->buc;
-	struct binject_file *bf = (struct binject_file *) (uintptr_t) io_u->file->engine_data;
+	struct binject_file *bf = FILE_ENG_DATA(io_u->file);
 	int ret;
 
 	ret = write(bf->fd, buc, sizeof(*buc));
@@ -187,7 +187,7 @@
 {
 	struct binject_data *bd = td->io_ops->data;
 	struct b_user_cmd *buc = &io_u->buc;
-	struct binject_file *bf = (struct binject_file *) (uintptr_t) io_u->file->engine_data;
+	struct binject_file *bf = FILE_ENG_DATA(io_u->file);
 
 	if (io_u->xfer_buflen & (bf->bs - 1)) {
 		log_err("read/write not sector aligned\n");
@@ -329,12 +329,12 @@
 
 static int fio_binject_close_file(struct thread_data *td, struct fio_file *f)
 {
-	struct binject_file *bf = (struct binject_file *) (uintptr_t) f->engine_data;
+	struct binject_file *bf = FILE_ENG_DATA(f);
 
 	if (bf) {
 		binject_unmap_dev(td, bf);
 		free(bf);
-		f->engine_data = 0;
+		FILE_SET_ENG_DATA(f, NULL);
 		return generic_close_file(td, f);
 	}
 
@@ -363,7 +363,7 @@
 	bf = malloc(sizeof(*bf));
 	bf->bs = bs;
 	bf->minor = bf->fd = -1;
-	f->engine_data = (uintptr_t) bf;
+	FILE_SET_ENG_DATA(f, bf);
 
 	if (binject_map_dev(td, bf, f->fd)) {
 err_close:
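
The binject changes above swap the open-coded uintptr_t casts on f->engine_data for the FILE_ENG_DATA()/FILE_SET_ENG_DATA() helpers. A rough standalone sketch of what such accessors reduce to, using a stand-in file struct (the real macros live in fio's headers and may differ in detail):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    /* Stand-in for the file object; only the field the macros touch. */
    struct demo_file {
        uintptr_t engine_data;
    };

    /* Roughly what the fio helpers boil down to: hide the uintptr_t casts. */
    #define FILE_SET_ENG_DATA(f, data)  ((f)->engine_data = (uintptr_t) (data))
    #define FILE_ENG_DATA(f)            ((void *) (uintptr_t) (f)->engine_data)

    struct binject_file_demo {
        int fd;
        unsigned int bs;
    };

    int main(void)
    {
        struct demo_file f = { .engine_data = 0 };
        struct binject_file_demo *bf = calloc(1, sizeof(*bf));

        bf->bs = 512;
        FILE_SET_ENG_DATA(&f, bf);

        struct binject_file_demo *again = FILE_ENG_DATA(&f);
        printf("bs=%u\n", again->bs);

        FILE_SET_ENG_DATA(&f, NULL);
        free(bf);
        return 0;
    }
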
diff --git a/engines/cpu.c b/engines/cpu.c
index 85598ef..7e4d737 100644
--- a/engines/cpu.c
+++ b/engines/cpu.c
@@ -8,7 +8,7 @@
 #include "../fio.h"
 
 struct cpu_options {
-	struct thread_data *td;
+	void *pad;
 	unsigned int cpuload;
 	unsigned int cpucycle;
 	unsigned int exit_io_done;
diff --git a/engines/e4defrag.c b/engines/e4defrag.c
index 3599ab8..d6113a9 100644
--- a/engines/e4defrag.c
+++ b/engines/e4defrag.c
@@ -36,7 +36,7 @@
 };
 
 struct e4defrag_options {
-	struct thread_data *td;
+	void *pad;
 	unsigned int inplace;
 	char * donor_name;
 };
@@ -98,8 +98,8 @@
 	}
 
 	if (!o->inplace) {
-		long long len = td->o.file_size_high - td->o.start_offset;
-		r = fallocate(ed->donor_fd, 0, td->o.start_offset, len);
+		long long __len = td->o.file_size_high - td->o.start_offset;
+		r = fallocate(ed->donor_fd, 0, td->o.start_offset, __len);
 		if (r)
 			goto err;
 	}
diff --git a/engines/fusion-aw.c b/engines/fusion-aw.c
index 23f623a..77844ff 100644
--- a/engines/fusion-aw.c
+++ b/engines/fusion-aw.c
@@ -36,8 +36,8 @@
 
 static int queue(struct thread_data *td, struct io_u *io_u)
 {
+	struct fas_data *d = FILE_ENG_DATA(io_u->file);
 	int rc;
-	struct fas_data *d = (struct fas_data *) io_u->file->engine_data;
 
 	if (io_u->ddir != DDIR_WRITE) {
 		td_vmsg(td, EINVAL, "only writes supported", "io_u->ddir");
@@ -94,7 +94,7 @@
 		goto error;
 	}
 	d->nvm_handle = -1;
-	f->engine_data = (uintptr_t) d;
+	FILE_SET_ENG_DATA(f, d);
 
 	rc = generic_open_file(td, f);
 
@@ -144,19 +144,19 @@
 	free(d);
 error:
 	f->fd = -1;
-	f->engine_data = 0;
+	FILE_SET_ENG_DATA(f, NULL);
 	goto out;
 }
 
 static int close_file(struct thread_data *td, struct fio_file *f)
 {
-	struct fas_data *d = (struct fas_data *) f->engine_data;
+	struct fas_data *d = FILE_ENG_DATA(f);
 
 	if (d) {
 		if (d->nvm_handle != -1)
 			nvm_release_handle(d->nvm_handle);
 		free(d);
-		f->engine_data = 0;
+		FILE_SET_ENG_DATA(f, NULL);
 	}
 
 	return generic_close_file(td, f);
diff --git a/engines/gfapi.h b/engines/gfapi.h
new file mode 100644
index 0000000..1028431
--- /dev/null
+++ b/engines/gfapi.h
@@ -0,0 +1,22 @@
+#include <glusterfs/api/glfs.h>
+#include "../fio.h"
+
+struct gf_options {
+	void *pad;
+	char *gf_vol;
+	char *gf_brick;
+};
+
+struct gf_data {
+	glfs_t *fs;
+	glfs_fd_t *fd;
+	struct io_u **aio_events;
+};
+
+extern struct fio_option gfapi_options[];
+extern int fio_gf_setup(struct thread_data *td);
+extern void fio_gf_cleanup(struct thread_data *td);
+extern int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_open_file(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_close_file(struct thread_data *td, struct fio_file *f);
+extern int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f);
diff --git a/engines/glusterfs.c b/engines/glusterfs.c
new file mode 100644
index 0000000..507cd25
--- /dev/null
+++ b/engines/glusterfs.c
@@ -0,0 +1,305 @@
+/*
+ * glusterfs engine
+ *
+ * common interface to GlusterFS's gfapi
+ *
+ */
+
+#include "gfapi.h"
+
+struct fio_option gfapi_options[] = {
+	{
+	 .name = "volume",
+	 .lname = "Glusterfs volume",
+	 .type = FIO_OPT_STR_STORE,
+	 .help = "Name of the Glusterfs volume",
+	 .off1 = offsetof(struct gf_options, gf_vol),
+	 .category = FIO_OPT_C_ENGINE,
+	 .group = FIO_OPT_G_GFAPI,
+	 },
+	{
+	 .name = "brick",
+	 .lname = "Glusterfs brick name",
+	 .type = FIO_OPT_STR_STORE,
+	 .help = "Name of the Glusterfs brick to connect",
+	 .off1 = offsetof(struct gf_options, gf_brick),
+	 .category = FIO_OPT_C_ENGINE,
+	 .group = FIO_OPT_G_GFAPI,
+	 },
+	{
+	 .name = NULL,
+	 },
+};
+
+int fio_gf_setup(struct thread_data *td)
+{
+	int r = 0;
+	struct gf_data *g = NULL;
+	struct gf_options *opt = td->eo;
+	struct stat sb = { 0, };
+
+	dprint(FD_IO, "fio setup\n");
+
+	if (td->io_ops->data)
+		return 0;
+
+	g = malloc(sizeof(struct gf_data));
+	if (!g) {
+		log_err("malloc failed.\n");
+		return -ENOMEM;
+	}
+	g->fs = NULL;
+	g->fd = NULL;
+	g->aio_events = NULL;
+
+	g->fs = glfs_new(opt->gf_vol);
+	if (!g->fs) {
+		log_err("glfs_new failed.\n");
+		goto cleanup;
+	}
+	glfs_set_logging(g->fs, "/tmp/fio_gfapi.log", 7);
+	/* default to tcp */
+	r = glfs_set_volfile_server(g->fs, "tcp", opt->gf_brick, 0);
+	if (r) {
+		log_err("glfs_set_volfile_server failed.\n");
+		goto cleanup;
+	}
+	r = glfs_init(g->fs);
+	if (r) {
+		log_err("glfs_init failed. Is glusterd running on brick?\n");
+		goto cleanup;
+	}
+	sleep(2);
+	r = glfs_lstat(g->fs, ".", &sb);
+	if (r) {
+		log_err("glfs_lstat failed.\n");
+		goto cleanup;
+	}
+	dprint(FD_FILE, "fio setup %p\n", g->fs);
+	td->io_ops->data = g;
+	return 0;
+cleanup:
+	if (g->fs)
+		glfs_fini(g->fs);
+	free(g);
+	td->io_ops->data = NULL;
+	return r;
+}
+
+void fio_gf_cleanup(struct thread_data *td)
+{
+	struct gf_data *g = td->io_ops->data;
+
+	if (g) {
+		if (g->aio_events)
+			free(g->aio_events);
+		if (g->fd)
+			glfs_close(g->fd);
+		if (g->fs)
+			glfs_fini(g->fs);
+		free(g);
+		td->io_ops->data = NULL;
+	}
+}
+
+int fio_gf_get_file_size(struct thread_data *td, struct fio_file *f)
+{
+	struct stat buf;
+	int ret;
+	struct gf_data *g = td->io_ops->data;
+
+	dprint(FD_FILE, "get file size %s\n", f->file_name);
+
+	if (!g || !g->fs) {
+		return 0;
+	}
+	if (fio_file_size_known(f))
+		return 0;
+
+	ret = glfs_lstat(g->fs, f->file_name, &buf);
+	if (ret < 0) {
+		log_err("glfs_lstat failed.\n");
+		return ret;
+	}
+
+	f->real_file_size = buf.st_size;
+	fio_file_set_size_known(f);
+
+	return 0;
+
+}
+
+int fio_gf_open_file(struct thread_data *td, struct fio_file *f)
+{
+
+	int flags = 0;
+	int ret = 0;
+	struct gf_data *g = td->io_ops->data;
+	struct stat sb = { 0, };
+
+	if (td_write(td)) {
+		if (!read_only)
+			flags = O_RDWR;
+	} else if (td_read(td)) {
+		if (!read_only)
+			flags = O_RDWR;
+		else
+			flags = O_RDONLY;
+	}
+
+	if (td->o.odirect)
+		flags |= OS_O_DIRECT;
+	if (td->o.sync_io)
+		flags |= O_SYNC;
+
+	dprint(FD_FILE, "fio file %s open mode %s td rw %s\n", f->file_name,
+	       flags & O_RDONLY ? "ro" : "rw", td_read(td) ? "read" : "write");
+	g->fd = glfs_creat(g->fs, f->file_name, flags, 0644);
+	if (!g->fd) {
+		ret = errno;
+		log_err("glfs_creat failed.\n");
+		return ret;
+	}
+	/* file for read doesn't exist or is shorter than required; create/extend it */
+	if (td_read(td)) {
+		if (glfs_lstat(g->fs, f->file_name, &sb)
+		    || sb.st_size < f->real_file_size) {
+			dprint(FD_FILE, "fio extend file %s from %ld to %ld\n",
+			       f->file_name, sb.st_size, f->real_file_size);
+			ret = glfs_ftruncate(g->fd, f->real_file_size);
+			if (ret) {
+				log_err("failed fio extend file %s to %ld\n",
+					f->file_name, f->real_file_size);
+			} else {
+				unsigned long long left;
+				unsigned int bs;
+				char *b;
+				int r;
+
+				/* fill the file, copied from extend_file */
+				b = malloc(td->o.max_bs[DDIR_WRITE]);
+
+				left = f->real_file_size;
+				while (left && !td->terminate) {
+					bs = td->o.max_bs[DDIR_WRITE];
+					if (bs > left)
+						bs = left;
+
+					fill_io_buffer(td, b, bs, bs);
+
+					r = glfs_write(g->fd, b, bs, 0);
+					dprint(FD_IO,
+					       "fio write %d of %ld file %s\n",
+					       r, f->real_file_size,
+					       f->file_name);
+
+					if (r > 0) {
+						left -= r;
+						continue;
+					} else {
+						if (r < 0) {
+							int __e = errno;
+
+							if (__e == ENOSPC) {
+								if (td->o.
+								    fill_device)
+									break;
+								log_info
+								    ("fio: ENOSPC on laying out "
+								     "file, stopping\n");
+								break;
+							}
+							td_verror(td, errno,
+								  "write");
+						} else
+							td_verror(td, EIO,
+								  "write");
+
+						break;
+					}
+				}
+
+				if (b)
+					free(b);
+				glfs_lseek(g->fd, 0, SEEK_SET);
+
+				if (td->terminate && td->o.unlink) {
+					dprint(FD_FILE, "terminate unlink %s\n",
+					       f->file_name);
+					glfs_unlink(g->fs, f->file_name);
+				} else if (td->o.create_fsync) {
+					if (glfs_fsync(g->fd) < 0) {
+						dprint(FD_FILE,
+						       "failed to sync, close %s\n",
+						       f->file_name);
+						td_verror(td, errno, "fsync");
+						glfs_close(g->fd);
+						g->fd = NULL;
+						return 1;
+					}
+				}
+			}
+		}
+	}
+#if defined(GFAPI_USE_FADVISE)
+	{
+		int r = 0;
+		if (td_random(td)) {
+			r = glfs_fadvise(g->fd, 0, f->real_file_size,
+					 POSIX_FADV_RANDOM);
+		} else {
+			r = glfs_fadvise(g->fd, 0, f->real_file_size,
+					 POSIX_FADV_SEQUENTIAL);
+		}
+		if (r) {
+			dprint(FD_FILE, "fio %p fadvise %s status %d\n", g->fs,
+			       f->file_name, r);
+		}
+	}
+#endif
+	dprint(FD_FILE, "fio %p created %s\n", g->fs, f->file_name);
+	f->fd = -1;
+	f->shadow_fd = -1;
+	td->o.open_files ++;
+	return ret;
+}
+
+int fio_gf_close_file(struct thread_data *td, struct fio_file *f)
+{
+	int ret = 0;
+	struct gf_data *g = td->io_ops->data;
+
+	dprint(FD_FILE, "fd close %s\n", f->file_name);
+
+	if (g) {
+		if (g->fd && glfs_close(g->fd) < 0)
+			ret = errno;
+		g->fd = NULL;
+	}
+
+	return ret;
+}
+
+int fio_gf_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+	int ret = 0;
+	struct gf_data *g = td->io_ops->data;
+
+	dprint(FD_FILE, "fd unlink %s\n", f->file_name);
+
+	if (g) {
+		if (g->fd && glfs_close(g->fd) < 0)
+			ret = errno;
+
+		glfs_unlink(g->fs, f->file_name);
+
+		if (g->fs)
+			glfs_fini(g->fs);
+
+		g->fd = NULL;
+		free(g);
+	}
+	td->io_ops->data = NULL;
+
+	return ret;
+}
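
fio_gf_open_file() above lays out a too-short read file in max_bs-sized writes, shrinking the final piece and stopping on ENOSPC. A minimal sketch of the same loop shape, assuming plain POSIX pwrite() on a local file purely for illustration (the engine itself uses glfs_write()):

    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <unistd.h>

    /*
     * Layout loop in the same shape as fio_gf_open_file(): write the file in
     * max_bs-sized pieces, shrink the last piece, stop early on ENOSPC.
     */
    static int lay_out_file(int fd, unsigned long long size, size_t max_bs)
    {
        char *b = malloc(max_bs);
        unsigned long long left = size, off = 0;

        if (!b)
            return -1;
        memset(b, 0x5a, max_bs);

        while (left) {
            size_t bs = max_bs > left ? left : max_bs;
            ssize_t r = pwrite(fd, b, bs, off);

            if (r > 0) {
                left -= r;
                off += r;
                continue;
            }
            if (r < 0 && errno == ENOSPC)
                break;          /* device full: stop laying out */
            free(b);
            return -1;          /* any other error is fatal */
        }

        free(b);
        return 0;
    }

    int main(void)
    {
        int fd = open("layout.tmp", O_CREAT | O_WRONLY | O_TRUNC, 0644);

        if (fd < 0 || lay_out_file(fd, 1 << 20, 64 * 1024))
            perror("layout");
        if (fd >= 0)
            close(fd);
        return 0;
    }
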
diff --git a/engines/glusterfs_async.c b/engines/glusterfs_async.c
new file mode 100644
index 0000000..7c2c139
--- /dev/null
+++ b/engines/glusterfs_async.c
@@ -0,0 +1,191 @@
+/*
+ * glusterfs engine
+ *
+ * IO engine using Glusterfs's gfapi async interface
+ *
+ */
+#include "gfapi.h"
+#define NOT_YET 1
+struct fio_gf_iou {
+	struct io_u *io_u;
+	int io_complete;
+};
+
+static struct io_u *fio_gf_event(struct thread_data *td, int event)
+{
+	struct gf_data *gf_data = td->io_ops->data;
+
+	dprint(FD_IO, "%s\n", __FUNCTION__);
+	return gf_data->aio_events[event];
+}
+
+static int fio_gf_getevents(struct thread_data *td, unsigned int min,
+			    unsigned int max, const struct timespec *t)
+{
+	struct gf_data *g = td->io_ops->data;
+	unsigned int events = 0;
+	struct io_u *io_u;
+	int i;
+
+	dprint(FD_IO, "%s\n", __FUNCTION__);
+	do {
+		io_u_qiter(&td->io_u_all, io_u, i) {
+			struct fio_gf_iou *io;
+
+			if (!(io_u->flags & IO_U_F_FLIGHT))
+				continue;
+
+			io = io_u->engine_data;
+			if (io->io_complete) {
+				io->io_complete = 0;
+				g->aio_events[events] = io_u;
+				events++;
+
+				if (events >= max)
+					break;
+			}
+
+		}
+		if (events < min)
+			usleep(100);
+		else
+			break;
+
+	} while (1);
+
+	return events;
+}
+
+static void fio_gf_io_u_free(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_gf_iou *io = io_u->engine_data;
+
+	if (io) {
+		if (io->io_complete)
+			log_err("incomplete IO found.\n");
+		io_u->engine_data = NULL;
+		free(io);
+	}
+}
+
+static int fio_gf_io_u_init(struct thread_data *td, struct io_u *io_u)
+{
+	dprint(FD_FILE, "%s\n", __FUNCTION__);
+
+	if (!io_u->engine_data) {
+		struct fio_gf_iou *io;
+
+		io = malloc(sizeof(struct fio_gf_iou));
+		if (!io) {
+			td_verror(td, errno, "malloc");
+			return 1;
+		}
+		io->io_complete = 0;
+		io->io_u = io_u;
+		io_u->engine_data = io;
+	}
+	return 0;
+}
+
+static void gf_async_cb(glfs_fd_t * fd, ssize_t ret, void *data)
+{
+	struct io_u *io_u = data;
+	struct fio_gf_iou *iou = io_u->engine_data;
+
+	dprint(FD_IO, "%s ret %lu\n", __FUNCTION__, ret);
+	iou->io_complete = 1;
+}
+
+static int fio_gf_async_queue(struct thread_data fio_unused * td,
+			      struct io_u *io_u)
+{
+	struct gf_data *g = td->io_ops->data;
+	int r;
+
+	dprint(FD_IO, "%s op %s\n", __FUNCTION__, io_ddir_name(io_u->ddir));
+
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir == DDIR_READ)
+		r = glfs_pread_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen,
+				     io_u->offset, 0, gf_async_cb, io_u);
+	else if (io_u->ddir == DDIR_WRITE)
+		r = glfs_pwrite_async(g->fd, io_u->xfer_buf, io_u->xfer_buflen,
+				      io_u->offset, 0, gf_async_cb, io_u);
+#if defined(CONFIG_GF_TRIM)
+	else if (io_u->ddir == DDIR_TRIM)
+		r = glfs_discard_async(g->fd, io_u->offset, io_u->xfer_buflen,
+				       gf_async_cb, io_u);
+#endif
+	else if (io_u->ddir == DDIR_DATASYNC)
+		r = glfs_fdatasync_async(g->fd, gf_async_cb, io_u);
+	else if (io_u->ddir == DDIR_SYNC)
+		r = glfs_fsync_async(g->fd, gf_async_cb, io_u);
+	else
+		r = EINVAL;
+
+	if (r) {
+		log_err("glfs queue failed.\n");
+		io_u->error = r;
+		goto failed;
+	}
+	return FIO_Q_QUEUED;
+
+failed:
+	io_u->error = r;
+	td_verror(td, io_u->error, "xfer");
+	return FIO_Q_COMPLETED;
+}
+
+int fio_gf_async_setup(struct thread_data *td)
+{
+	struct gf_data *g;
+	int r;
+
+#if defined(NOT_YET)
+	log_err("the async interface is still very experimental...\n");
+#endif
+	r = fio_gf_setup(td);
+	if (r)
+		return r;
+
+	td->o.use_thread = 1;
+	g = td->io_ops->data;
+	g->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!g->aio_events) {
+		r = -ENOMEM;
+		fio_gf_cleanup(td);
+		return r;
+	}
+
+	return r;
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "gfapi_async",
+	.version = FIO_IOOPS_VERSION,
+	.init = fio_gf_async_setup,
+	.cleanup = fio_gf_cleanup,
+	.queue = fio_gf_async_queue,
+	.open_file = fio_gf_open_file,
+	.close_file = fio_gf_close_file,
+	.unlink_file = fio_gf_unlink_file,
+	.get_file_size = fio_gf_get_file_size,
+	.getevents = fio_gf_getevents,
+	.event = fio_gf_event,
+	.io_u_init = fio_gf_io_u_init,
+	.io_u_free = fio_gf_io_u_free,
+	.options = gfapi_options,
+	.option_struct_size = sizeof(struct gf_options),
+	.flags = FIO_DISKLESSIO,
+};
+
+static void fio_init fio_gf_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_gf_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
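
The async engine above completes I/O from gf_async_cb(), which only flips io_complete; fio_gf_getevents() then polls the in-flight list until at least 'min' completions are collected, sleeping briefly between sweeps. A toy sketch of that reap pattern with a fixed slot array (struct, sizes and names are illustrative):

    #include <stdio.h>
    #include <unistd.h>

    #define DEPTH 4

    struct slot {
        int in_flight;
        int io_complete;    /* set by the completion callback */
    };

    /*
     * Same shape as fio_gf_getevents(): sweep all in-flight slots, harvest
     * the ones whose callback fired, and sleep briefly until at least 'min'
     * events have been collected.
     */
    static unsigned int reap(struct slot *s, unsigned int min, unsigned int max,
                             int *events_out)
    {
        unsigned int events = 0;

        do {
            int i;

            for (i = 0; i < DEPTH && events < max; i++) {
                if (!s[i].in_flight || !s[i].io_complete)
                    continue;
                s[i].io_complete = 0;
                s[i].in_flight = 0;
                events_out[events++] = i;
            }
            if (events < min)
                usleep(100);
        } while (events < min);

        return events;
    }

    int main(void)
    {
        struct slot s[DEPTH] = {
            { .in_flight = 1, .io_complete = 1 },
            { .in_flight = 1, .io_complete = 1 },
        };
        int done[DEPTH];
        unsigned int n = reap(s, 1, DEPTH, done);

        printf("reaped %u events, first slot %d\n", n, done[0]);
        return 0;
    }
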
diff --git a/engines/glusterfs_sync.c b/engines/glusterfs_sync.c
new file mode 100644
index 0000000..6de4ee2
--- /dev/null
+++ b/engines/glusterfs_sync.c
@@ -0,0 +1,98 @@
+/*
+ * glusterfs engine
+ *
+ * IO engine using Glusterfs's gfapi sync interface
+ *
+ */
+
+#include "gfapi.h"
+
+#define LAST_POS(f)	((f)->engine_data)
+static int fio_gf_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct fio_file *f = io_u->file;
+	struct gf_data *g = td->io_ops->data;
+
+	dprint(FD_FILE, "fio prep\n");
+
+	if (!ddir_rw(io_u->ddir))
+		return 0;
+
+	if (LAST_POS(f) != -1ULL && LAST_POS(f) == io_u->offset)
+		return 0;
+
+	if (glfs_lseek(g->fd, io_u->offset, SEEK_SET) < 0) {
+		td_verror(td, errno, "lseek");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int fio_gf_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct gf_data *g = td->io_ops->data;
+	int ret = 0;
+
+	dprint(FD_FILE, "fio queue len %lu\n", io_u->xfer_buflen);
+	fio_ro_check(td, io_u);
+
+	if (io_u->ddir == DDIR_READ)
+		ret = glfs_read(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0);
+	else if (io_u->ddir == DDIR_WRITE)
+		ret = glfs_write(g->fd, io_u->xfer_buf, io_u->xfer_buflen, 0);
+	else if (io_u->ddir == DDIR_SYNC)
+		ret = glfs_fsync(g->fd);
+	else if (io_u->ddir == DDIR_DATASYNC)
+		ret = glfs_fdatasync(g->fd);
+	else {
+		log_err("unsupported operation.\n");
+		return -EINVAL;
+	}
+	dprint(FD_FILE, "fio len %lu ret %d\n", io_u->xfer_buflen, ret);
+	if (io_u->file && ret >= 0 && ddir_rw(io_u->ddir))
+		LAST_POS(io_u->file) = io_u->offset + ret;
+
+	if (ret != (int)io_u->xfer_buflen) {
+		if (ret >= 0) {
+			io_u->resid = io_u->xfer_buflen - ret;
+			io_u->error = 0;
+			return FIO_Q_COMPLETED;
+		} else
+			io_u->error = errno;
+	}
+
+	if (io_u->error) {
+		log_err("IO failed.\n");
+		td_verror(td, io_u->error, "xfer");
+	}
+
+	return FIO_Q_COMPLETED;
+
+}
+
+static struct ioengine_ops ioengine = {
+	.name = "gfapi",
+	.version = FIO_IOOPS_VERSION,
+	.init = fio_gf_setup,
+	.cleanup = fio_gf_cleanup,
+	.prep = fio_gf_prep,
+	.queue = fio_gf_queue,
+	.open_file = fio_gf_open_file,
+	.close_file = fio_gf_close_file,
+	.unlink_file = fio_gf_unlink_file,
+	.get_file_size = fio_gf_get_file_size,
+	.options = gfapi_options,
+	.option_struct_size = sizeof(struct gf_options),
+	.flags = FIO_SYNCIO | FIO_DISKLESSIO,
+};
+
+static void fio_init fio_gf_register(void)
+{
+	register_ioengine(&ioengine);
+}
+
+static void fio_exit fio_gf_unregister(void)
+{
+	unregister_ioengine(&ioengine);
+}
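
The sync engine's LAST_POS() bookkeeping above skips glfs_lseek() whenever the previous I/O already left the file position at the next offset. A minimal sketch of the same idea on a plain file descriptor (the path in main() is only an example):

    #include <fcntl.h>
    #include <stdio.h>
    #include <sys/types.h>
    #include <unistd.h>

    /*
     * Remember where the previous read left the file position and only
     * lseek() when the next I/O is not already there.
     */
    struct tracked_fd {
        int fd;
        off_t last_pos;     /* -1 means "unknown" */
    };

    static ssize_t tracked_read(struct tracked_fd *t, void *buf, size_t len,
                                off_t offset)
    {
        ssize_t ret;

        if (t->last_pos != offset && lseek(t->fd, offset, SEEK_SET) < 0)
            return -1;

        ret = read(t->fd, buf, len);
        if (ret > 0)
            t->last_pos = offset + ret;
        return ret;
    }

    int main(void)
    {
        struct tracked_fd t = { .last_pos = -1 };
        char buf[16];

        t.fd = open("/etc/hostname", O_RDONLY);
        if (t.fd < 0)
            return 1;

        /* Second call starts where the first ended, so no lseek() is issued. */
        tracked_read(&t, buf, sizeof(buf), 0);
        tracked_read(&t, buf, sizeof(buf), t.last_pos);

        close(t.fd);
        return 0;
    }
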
diff --git a/engines/guasi.c b/engines/guasi.c
index c9c7429..c586f09 100644
--- a/engines/guasi.c
+++ b/engines/guasi.c
@@ -80,7 +80,7 @@
 }
 
 static int fio_guasi_getevents(struct thread_data *td, unsigned int min,
-			       unsigned int max, struct timespec *t)
+			       unsigned int max, const struct timespec *t)
 {
 	struct guasi_data *ld = td->io_ops->data;
 	int n, r;
diff --git a/engines/libaio.c b/engines/libaio.c
index 9cc910d..d4f4830 100644
--- a/engines/libaio.c
+++ b/engines/libaio.c
@@ -13,16 +13,31 @@
 
 #include "../fio.h"
 
+static int fio_libaio_commit(struct thread_data *td);
+
 struct libaio_data {
 	io_context_t aio_ctx;
 	struct io_event *aio_events;
 	struct iocb **iocbs;
 	struct io_u **io_us;
-	int iocbs_nr;
+
+	/*
+	 * Basic ring buffer. 'head' is incremented in _queue(), and
+	 * 'tail' is incremented in _commit(). We keep 'queued' so
+	 * that we know if the ring is full or empty, when
+	 * 'head' == 'tail'. 'entries' is the ring size, and
+	 * 'is_pow2' is just an optimization to use AND instead of
+	 * modulus to get the remainder on ring increment.
+	 */
+	int is_pow2;
+	unsigned int entries;
+	unsigned int queued;
+	unsigned int head;
+	unsigned int tail;
 };
 
 struct libaio_options {
-	struct thread_data *td;
+	void *pad;
 	unsigned int userspace_reap;
 };
 
@@ -41,6 +56,15 @@
 	},
 };
 
+static inline void ring_inc(struct libaio_data *ld, unsigned int *val,
+			    unsigned int add)
+{
+	if (ld->is_pow2)
+		*val = (*val + add) & (ld->entries - 1);
+	else
+		*val = (*val + add) % ld->entries;
+}
+
 static int fio_libaio_prep(struct thread_data fio_unused *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
@@ -117,13 +141,19 @@
 }
 
 static int fio_libaio_getevents(struct thread_data *td, unsigned int min,
-				unsigned int max, struct timespec *t)
+				unsigned int max, const struct timespec *t)
 {
 	struct libaio_data *ld = td->io_ops->data;
 	struct libaio_options *o = td->eo;
 	unsigned actual_min = td->o.iodepth_batch_complete == 0 ? 0 : min;
+	struct timespec __lt, *lt = NULL;
 	int r, events = 0;
 
+	if (t) {
+		__lt = *t;
+		lt = &__lt;
+	}
+
 	do {
 		if (o->userspace_reap == 1
 		    && actual_min == 0
@@ -133,12 +163,15 @@
 				ld->aio_events + events);
 		} else {
 			r = io_getevents(ld->aio_ctx, actual_min,
-				max, ld->aio_events + events, t);
+				max, ld->aio_events + events, lt);
 		}
-		if (r >= 0)
+		if (r > 0)
 			events += r;
-		else if (r == -EAGAIN)
+		else if ((min && r == 0) || r == -EAGAIN) {
+			fio_libaio_commit(td);
 			usleep(100);
+		} else if (r != -EINTR)
+			break;
 	} while (events < min);
 
 	return r < 0 ? r : events;
@@ -150,7 +183,7 @@
 
 	fio_ro_check(td, io_u);
 
-	if (ld->iocbs_nr == (int) td->o.iodepth)
+	if (ld->queued == td->o.iodepth)
 		return FIO_Q_BUSY;
 
 	/*
@@ -160,7 +193,7 @@
 	 * have pending io, to let fio complete those first.
 	 */
 	if (ddir_sync(io_u->ddir)) {
-		if (ld->iocbs_nr)
+		if (ld->queued)
 			return FIO_Q_BUSY;
 
 		do_io_u_sync(td, io_u);
@@ -168,16 +201,17 @@
 	}
 
 	if (io_u->ddir == DDIR_TRIM) {
-		if (ld->iocbs_nr)
+		if (ld->queued)
 			return FIO_Q_BUSY;
 
 		do_io_u_trim(td, io_u);
 		return FIO_Q_COMPLETED;
 	}
 
-	ld->iocbs[ld->iocbs_nr] = &io_u->iocb;
-	ld->io_us[ld->iocbs_nr] = io_u;
-	ld->iocbs_nr++;
+	ld->iocbs[ld->head] = &io_u->iocb;
+	ld->io_us[ld->head] = io_u;
+	ring_inc(ld, &ld->head, 1);
+	ld->queued++;
 	return FIO_Q_QUEUED;
 }
 
@@ -205,29 +239,67 @@
 	struct libaio_data *ld = td->io_ops->data;
 	struct iocb **iocbs;
 	struct io_u **io_us;
-	int ret;
+	struct timeval tv;
+	int ret, wait_start = 0;
 
-	if (!ld->iocbs_nr)
+	if (!ld->queued)
 		return 0;
 
-	io_us = ld->io_us;
-	iocbs = ld->iocbs;
 	do {
-		ret = io_submit(ld->aio_ctx, ld->iocbs_nr, iocbs);
+		long nr = ld->queued;
+
+		nr = min((unsigned int) nr, ld->entries - ld->tail);
+		io_us = ld->io_us + ld->tail;
+		iocbs = ld->iocbs + ld->tail;
+
+		ret = io_submit(ld->aio_ctx, nr, iocbs);
 		if (ret > 0) {
 			fio_libaio_queued(td, io_us, ret);
 			io_u_mark_submit(td, ret);
-			ld->iocbs_nr -= ret;
-			io_us += ret;
-			iocbs += ret;
+
+			ld->queued -= ret;
+			ring_inc(ld, &ld->tail, ret);
 			ret = 0;
-		} else if (!ret || ret == -EAGAIN || ret == -EINTR) {
+			wait_start = 0;
+		} else if (ret == -EINTR || !ret) {
 			if (!ret)
 				io_u_mark_submit(td, ret);
+			wait_start = 0;
 			continue;
+		} else if (ret == -EAGAIN) {
+			/*
+			 * If we get EAGAIN, we should break out without
+			 * error and let the upper layer reap some
+			 * events for us. If we have no queued IO, we
+			 * must loop here. If we loop for more than 30s,
+			 * just error out, something must be buggy in the
+			 * IO path.
+			 */
+			if (ld->queued) {
+				ret = 0;
+				break;
+			}
+			if (!wait_start) {
+				fio_gettime(&tv, NULL);
+				wait_start = 1;
+			} else if (mtime_since_now(&tv) > 30000) {
+				log_err("fio: aio appears to be stalled, giving up\n");
+				break;
+			}
+			usleep(1);
+			continue;
+		} else if (ret == -ENOMEM) {
+			/*
+			 * If we get -ENOMEM, reap events if we can. If
+			 * we cannot, treat it as a fatal event since there's
+			 * nothing we can do about it.
+			 */
+			if (ld->queued)
+				ret = 0;
+			break;
 		} else
 			break;
-	} while (ld->iocbs_nr);
+	} while (ld->queued);
 
 	return ret;
 }
@@ -254,11 +326,11 @@
 
 static int fio_libaio_init(struct thread_data *td)
 {
-	struct libaio_data *ld = malloc(sizeof(*ld));
 	struct libaio_options *o = td->eo;
+	struct libaio_data *ld;
 	int err = 0;
 
-	memset(ld, 0, sizeof(*ld));
+	ld = calloc(1, sizeof(*ld));
 
 	/*
 	 * First try passing in 0 for queue depth, since we don't
@@ -276,13 +348,11 @@
 		return 1;
 	}
 
-	ld->aio_events = malloc(td->o.iodepth * sizeof(struct io_event));
-	memset(ld->aio_events, 0, td->o.iodepth * sizeof(struct io_event));
-	ld->iocbs = malloc(td->o.iodepth * sizeof(struct iocb *));
-	memset(ld->iocbs, 0, sizeof(struct iocb *));
-	ld->io_us = malloc(td->o.iodepth * sizeof(struct io_u *));
-	memset(ld->io_us, 0, td->o.iodepth * sizeof(struct io_u *));
-	ld->iocbs_nr = 0;
+	ld->entries = td->o.iodepth;
+	ld->is_pow2 = is_power_of_2(ld->entries);
+	ld->aio_events = calloc(ld->entries, sizeof(struct io_event));
+	ld->iocbs = calloc(ld->entries, sizeof(struct iocb *));
+	ld->io_us = calloc(ld->entries, sizeof(struct io_u *));
 
 	td->io_ops->data = ld;
 	return 0;
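
The libaio changes above replace the single iocbs_nr counter with a head/tail ring so _queue() and _commit() can advance independently, and ring_inc() uses an AND mask when the ring size is a power of two. A toy standalone version of that bookkeeping (struct and sizes are illustrative):

    #include <stdio.h>

    struct ring {
        unsigned int entries;
        int is_pow2;
        unsigned int head, tail, queued;
    };

    /* Wrap with a cheap AND mask when possible, modulus otherwise. */
    static inline void ring_inc(struct ring *r, unsigned int *val,
                                unsigned int add)
    {
        if (r->is_pow2)
            *val = (*val + add) & (r->entries - 1);
        else
            *val = (*val + add) % r->entries;
    }

    int main(void)
    {
        struct ring r = { .entries = 8, .is_pow2 = 1 };
        int i;

        /* queue 5 entries, then commit 3 of them */
        for (i = 0; i < 5; i++) {
            ring_inc(&r, &r.head, 1);
            r.queued++;
        }
        r.queued -= 3;
        ring_inc(&r, &r.tail, 3);

        printf("head=%u tail=%u queued=%u\n", r.head, r.tail, r.queued);
        return 0;
    }
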
diff --git a/engines/libhdfs.c b/engines/libhdfs.c
new file mode 100644
index 0000000..658cd6a
--- /dev/null
+++ b/engines/libhdfs.c
@@ -0,0 +1,239 @@
+/*
+ * libhdfs engine
+ *
+ * this engine helps perform read/write operations on an HDFS cluster using
+ * libhdfs. HDFS does not support modifying data once a file has been created.
+ *
+ * so to mimic that, create many files of a small size (e.g. 256k), and this
+ * engine selects a file based on the offset generated by fio.
+ *
+ * thus, random reads and writes can also be achieved with this logic.
+ *
+ * NOTE: please set the environment variables FIO_HDFS_BS and FIO_HDFS_FCOUNT
+ * to appropriate values for this engine to work properly
+ *
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <sys/uio.h>
+#include <errno.h>
+#include <assert.h>
+
+#include "../fio.h"
+
+#include "hdfs.h"
+
+struct hdfsio_data {
+	char host[256];
+	int port;
+	hdfsFS fs;
+	hdfsFile fp;
+	unsigned long fsbs;
+	unsigned long fscount;
+	unsigned long curr_file_id;
+	unsigned int numjobs;
+	unsigned int fid_correction;
+};
+
+static int fio_hdfsio_setup_fs_params(struct hdfsio_data *hd)
+{
+	/* make sure that hdfsConnect is invoked before executing this function */
+	hdfsSetWorkingDirectory(hd->fs, "/.perftest");
+	hd->fp = hdfsOpenFile(hd->fs, ".fcount", O_RDONLY, 0, 0, 0);
+	if (hd->fp) {
+		hdfsRead(hd->fs, hd->fp, &(hd->fscount), sizeof(hd->fscount));
+		hdfsCloseFile(hd->fs, hd->fp);
+	}
+	hd->fp = hdfsOpenFile(hd->fs, ".fbs", O_RDONLY, 0, 0, 0);
+	if (hd->fp) {
+		hdfsRead(hd->fs, hd->fp, &(hd->fsbs), sizeof(hd->fsbs));
+		hdfsCloseFile(hd->fs, hd->fp);
+	}
+
+	return 0;
+}
+
+static int fio_hdfsio_prep(struct thread_data *td, struct io_u *io_u)
+{
+	struct hdfsio_data *hd;
+	hdfsFileInfo *fi;
+	unsigned long f_id;
+	char fname[80];
+	int open_flags = 0;
+
+	hd = td->io_ops->data;
+
+	if (hd->curr_file_id == -1) {
+		/* see comment in fio_hdfsio_setup() function */
+		fio_hdfsio_setup_fs_params(hd);
+	}
+
+	/* find out file id based on the offset generated by fio */
+	f_id = (io_u->offset / hd->fsbs) + hd->fid_correction;
+
+	if (f_id == hd->curr_file_id) {
+		/* file is already open */
+		return 0;
+	}
+
+	if (hd->curr_file_id != -1) {
+		hdfsCloseFile(hd->fs, hd->fp);
+	}
+
+	if (io_u->ddir == DDIR_READ) {
+		open_flags = O_RDONLY;
+	} else if (io_u->ddir == DDIR_WRITE) {
+		open_flags = O_WRONLY;
+	} else {
+		log_err("hdfs: Invalid I/O Operation\n");
+	}
+
+	hd->curr_file_id = f_id;
+	do {
+		sprintf(fname, ".f%lu", f_id);
+		fi = hdfsGetPathInfo(hd->fs, fname);
+		if (fi->mSize >= hd->fsbs || io_u->ddir == DDIR_WRITE) {
+			/* file has enough data to read OR file is opened in write mode */
+			hd->fp =
+			    hdfsOpenFile(hd->fs, fname, open_flags, 0, 0,
+					 hd->fsbs);
+			if (hd->fp) {
+				break;
+			}
+		}
+		/* file is empty, so try next file for reading */
+		f_id = (f_id + 1) % hd->fscount;
+	} while (1);
+
+	return 0;
+}
+
+static int fio_io_end(struct thread_data *td, struct io_u *io_u, int ret)
+{
+	if (ret != (int)io_u->xfer_buflen) {
+		if (ret >= 0) {
+			io_u->resid = io_u->xfer_buflen - ret;
+			io_u->error = 0;
+			return FIO_Q_COMPLETED;
+		} else
+			io_u->error = errno;
+	}
+
+	if (io_u->error)
+		td_verror(td, io_u->error, "xfer");
+
+	return FIO_Q_COMPLETED;
+}
+
+static int fio_hdfsio_queue(struct thread_data *td, struct io_u *io_u)
+{
+	struct hdfsio_data *hd;
+	int ret = 0;
+
+	hd = td->io_ops->data;
+
+	if (io_u->ddir == DDIR_READ) {
+		ret =
+		    hdfsRead(hd->fs, hd->fp, io_u->xfer_buf, io_u->xfer_buflen);
+	} else if (io_u->ddir == DDIR_WRITE) {
+		ret =
+		    hdfsWrite(hd->fs, hd->fp, io_u->xfer_buf,
+			      io_u->xfer_buflen);
+	} else {
+		log_err("hdfs: Invalid I/O Operation\n");
+	}
+
+	return fio_io_end(td, io_u, ret);
+}
+
+int fio_hdfsio_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct hdfsio_data *hd;
+
+	hd = td->io_ops->data;
+	hd->fs = hdfsConnect(hd->host, hd->port);
+	hdfsSetWorkingDirectory(hd->fs, "/.perftest");
+	hd->fid_correction = (getpid() % hd->numjobs);
+
+	return 0;
+}
+
+int fio_hdfsio_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct hdfsio_data *hd;
+
+	hd = td->io_ops->data;
+	hdfsDisconnect(hd->fs);
+
+	return 0;
+}
+
+static int fio_hdfsio_setup(struct thread_data *td)
+{
+	struct hdfsio_data *hd;
+	struct fio_file *f;
+	static unsigned int numjobs = 1;	/* at least one job has to be there! */
+	numjobs = (td->o.numjobs > numjobs) ? td->o.numjobs : numjobs;
+
+	if (!td->io_ops->data) {
+		hd = malloc(sizeof(*hd));
+
+		memset(hd, 0, sizeof(*hd));
+		td->io_ops->data = hd;
+
+		/* separate host and port from filename */
+		*(strchr(td->o.filename, ',')) = ' ';
+		sscanf(td->o.filename, "%s%d", hd->host, &(hd->port));
+
+		/* read fbs and fcount and based on that set f->real_file_size */
+		f = td->files[0];
+#if 0
+		/* IMHO, this should be done here instead of fio_hdfsio_prep()
+		 * but somehow calling it here doesn't seem to work,
+		 * some problem with libhdfs that needs to be debugged */
+		hd->fs = hdfsConnect(hd->host, hd->port);
+		fio_hdfsio_setup_fs_params(hd);
+		hdfsDisconnect(hd->fs);
+#else
+		/* so, as an alternative, use environment variables */
+		if (getenv("FIO_HDFS_FCOUNT") && getenv("FIO_HDFS_BS")) {
+			hd->fscount = atol(getenv("FIO_HDFS_FCOUNT"));
+			hd->fsbs = atol(getenv("FIO_HDFS_BS"));
+		} else {
+			log_err("FIO_HDFS_FCOUNT and/or FIO_HDFS_BS not set.\n");
+			return 1;
+		}
+#endif
+		f->real_file_size = hd->fscount * hd->fsbs;
+
+		td->o.nr_files = 1;
+		hd->curr_file_id = -1;
+		hd->numjobs = numjobs;
+		fio_file_set_size_known(f);
+	}
+
+	return 0;
+}
+
+static struct ioengine_ops ioengine_hdfs = {
+	.name = "libhdfs",
+	.version = FIO_IOOPS_VERSION,
+	.setup = fio_hdfsio_setup,
+	.prep = fio_hdfsio_prep,
+	.queue = fio_hdfsio_queue,
+	.open_file = fio_hdfsio_open_file,
+	.close_file = fio_hdfsio_close_file,
+	.flags = FIO_SYNCIO,
+};
+
+static void fio_init fio_hdfsio_register(void)
+{
+	register_ioengine(&ioengine_hdfs);
+}
+
+static void fio_exit fio_hdfsio_unregister(void)
+{
+	unregister_ioengine(&ioengine_hdfs);
+}
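
fio_hdfsio_prep() above maps each fio-generated offset to one of 'fscount' fixed-size immutable files. A minimal sketch of that mapping, with illustrative FIO_HDFS_BS/FIO_HDFS_FCOUNT values (the real prep code only wraps the id when it has to skip empty files):

    #include <stdio.h>

    /*
     * HDFS files are immutable, so the engine spreads I/O across 'fscount'
     * fixed-size files and picks the file from the offset fio generated.
     */
    static unsigned long file_id_for_offset(unsigned long long offset,
                                            unsigned long fsbs,
                                            unsigned long fscount,
                                            unsigned int fid_correction)
    {
        unsigned long f_id = (offset / fsbs) + fid_correction;

        return f_id % fscount;  /* wrap, as the retry loop in prep() does */
    }

    int main(void)
    {
        unsigned long fsbs = 256 * 1024;    /* FIO_HDFS_BS */
        unsigned long fscount = 16;         /* FIO_HDFS_FCOUNT */

        printf("offset 0     -> .f%lu\n",
               file_id_for_offset(0, fsbs, fscount, 0));
        printf("offset 1 MiB -> .f%lu\n",
               file_id_for_offset(1 << 20, fsbs, fscount, 0));
        return 0;
    }
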
diff --git a/engines/mmap.c b/engines/mmap.c
index 8c04a19..69add78 100644
--- a/engines/mmap.c
+++ b/engines/mmap.c
@@ -22,9 +22,16 @@
 static unsigned long mmap_map_size;
 static unsigned long mmap_map_mask;
 
+struct fio_mmap_data {
+	void *mmap_ptr;
+	size_t mmap_sz;
+	off_t mmap_off;
+};
+
 static int fio_mmap_file(struct thread_data *td, struct fio_file *f,
 			 size_t length, off_t off)
 {
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
 	int flags = 0;
 
 	if (td_rw(td))
@@ -37,28 +44,38 @@
 	} else
 		flags = PROT_READ;
 
-	f->mmap_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off);
-	if (f->mmap_ptr == MAP_FAILED) {
-		f->mmap_ptr = NULL;
+	fmd->mmap_ptr = mmap(NULL, length, flags, MAP_SHARED, f->fd, off);
+	if (fmd->mmap_ptr == MAP_FAILED) {
+		fmd->mmap_ptr = NULL;
 		td_verror(td, errno, "mmap");
 		goto err;
 	}
 
 	if (!td_random(td)) {
-		if (posix_madvise(f->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) {
+		if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_SEQUENTIAL) < 0) {
 			td_verror(td, errno, "madvise");
 			goto err;
 		}
 	} else {
-		if (posix_madvise(f->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) {
+		if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_RANDOM) < 0) {
 			td_verror(td, errno, "madvise");
 			goto err;
 		}
 	}
+	if (posix_madvise(fmd->mmap_ptr, length, POSIX_MADV_DONTNEED) < 0) {
+		td_verror(td, errno, "madvise");
+		goto err;
+	}
+
+#ifdef FIO_MADV_FREE
+	if (f->filetype == FIO_TYPE_BD)
+		(void) posix_madvise(fmd->mmap_ptr, fmd->mmap_sz, FIO_MADV_FREE);
+#endif
+
 
 err:
-	if (td->error && f->mmap_ptr)
-		munmap(f->mmap_ptr, length);
+	if (td->error && fmd->mmap_ptr)
+		munmap(fmd->mmap_ptr, length);
 
 	return td->error;
 }
@@ -69,19 +86,20 @@
 static int fio_mmapio_prep_limited(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
 
 	if (io_u->buflen > mmap_map_size) {
 		log_err("fio: bs too big for mmap engine\n");
 		return EIO;
 	}
 
-	f->mmap_sz = mmap_map_size;
-	if (f->mmap_sz  > f->io_size)
-		f->mmap_sz = f->io_size;
+	fmd->mmap_sz = mmap_map_size;
+	if (fmd->mmap_sz  > f->io_size)
+		fmd->mmap_sz = f->io_size;
 
-	f->mmap_off = io_u->offset;
+	fmd->mmap_off = io_u->offset;
 
-	return fio_mmap_file(td, f, f->mmap_sz, f->mmap_off);
+	return fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off);
 }
 
 /*
@@ -90,15 +108,21 @@
 static int fio_mmapio_prep_full(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
 	int ret;
 
 	if (fio_file_partial_mmap(f))
 		return EINVAL;
+	if (io_u->offset != (size_t) io_u->offset ||
+	    f->io_size != (size_t) f->io_size) {
+		fio_file_set_partial_mmap(f);
+		return EINVAL;
+	}
 
-	f->mmap_sz = f->io_size;
-	f->mmap_off = 0;
+	fmd->mmap_sz = f->io_size;
+	fmd->mmap_off = 0;
 
-	ret = fio_mmap_file(td, f, f->mmap_sz, f->mmap_off);
+	ret = fio_mmap_file(td, f, fmd->mmap_sz, fmd->mmap_off);
 	if (ret)
 		fio_file_set_partial_mmap(f);
 
@@ -108,22 +132,23 @@
 static int fio_mmapio_prep(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
 	int ret;
 
 	/*
 	 * It fits within existing mapping, use it
 	 */
-	if (io_u->offset >= f->mmap_off &&
-	    io_u->offset + io_u->buflen < f->mmap_off + f->mmap_sz)
+	if (io_u->offset >= fmd->mmap_off &&
+	    io_u->offset + io_u->buflen < fmd->mmap_off + fmd->mmap_sz)
 		goto done;
 
 	/*
 	 * unmap any existing mapping
 	 */
-	if (f->mmap_ptr) {
-		if (munmap(f->mmap_ptr, f->mmap_sz) < 0)
+	if (fmd->mmap_ptr) {
+		if (munmap(fmd->mmap_ptr, fmd->mmap_sz) < 0)
 			return errno;
-		f->mmap_ptr = NULL;
+		fmd->mmap_ptr = NULL;
 	}
 
 	if (fio_mmapio_prep_full(td, io_u)) {
@@ -134,7 +159,7 @@
 	}
 
 done:
-	io_u->mmap_data = f->mmap_ptr + io_u->offset - f->mmap_off -
+	io_u->mmap_data = fmd->mmap_ptr + io_u->offset - fmd->mmap_off -
 				f->file_offset;
 	return 0;
 }
@@ -142,6 +167,7 @@
 static int fio_mmapio_queue(struct thread_data *td, struct io_u *io_u)
 {
 	struct fio_file *f = io_u->file;
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
 
 	fio_ro_check(td, io_u);
 
@@ -150,7 +176,7 @@
 	else if (io_u->ddir == DDIR_WRITE)
 		memcpy(io_u->mmap_data, io_u->xfer_buf, io_u->xfer_buflen);
 	else if (ddir_sync(io_u->ddir)) {
-		if (msync(f->mmap_ptr, f->mmap_sz, MS_SYNC)) {
+		if (msync(fmd->mmap_ptr, fmd->mmap_sz, MS_SYNC)) {
 			io_u->error = errno;
 			td_verror(td, io_u->error, "msync");
 		}
@@ -205,14 +231,45 @@
 	return 0;
 }
 
+static int fio_mmapio_open_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mmap_data *fmd;
+	int ret;
+
+	ret = generic_open_file(td, f);
+	if (ret)
+		return ret;
+
+	fmd = calloc(1, sizeof(*fmd));
+	if (!fmd) {
+		int fio_unused ret;
+		ret = generic_close_file(td, f);
+		return 1;
+	}
+
+	FILE_SET_ENG_DATA(f, fmd);
+	return 0;
+}
+
+static int fio_mmapio_close_file(struct thread_data *td, struct fio_file *f)
+{
+	struct fio_mmap_data *fmd = FILE_ENG_DATA(f);
+
+	FILE_SET_ENG_DATA(f, NULL);
+	free(fmd);
+	fio_file_clear_partial_mmap(f);
+
+	return generic_close_file(td, f);
+}
+
 static struct ioengine_ops ioengine = {
 	.name		= "mmap",
 	.version	= FIO_IOOPS_VERSION,
 	.init		= fio_mmapio_init,
 	.prep		= fio_mmapio_prep,
 	.queue		= fio_mmapio_queue,
-	.open_file	= generic_open_file,
-	.close_file	= generic_close_file,
+	.open_file	= fio_mmapio_open_file,
+	.close_file	= fio_mmapio_close_file,
 	.get_file_size	= generic_get_file_size,
 	.flags		= FIO_SYNCIO | FIO_NOEXTEND,
 };
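
The mmap engine above now keeps the mapping window in per-file engine data and, in fio_mmapio_prep(), reuses the current window whenever the next I/O fits inside it. A small sketch of just that reuse test (struct name is illustrative):

    #include <stddef.h>
    #include <stdio.h>

    /*
     * Keep the existing window if [offset, offset + buflen) lies inside
     * [mmap_off, mmap_off + mmap_sz); otherwise the engine unmaps and remaps.
     */
    struct mmap_window {
        unsigned long long mmap_off;
        size_t mmap_sz;
    };

    static int io_fits_window(const struct mmap_window *w,
                              unsigned long long offset, unsigned long buflen)
    {
        return offset >= w->mmap_off &&
               offset + buflen < w->mmap_off + w->mmap_sz;
    }

    int main(void)
    {
        struct mmap_window w = { .mmap_off = 0, .mmap_sz = 1 << 20 };

        printf("4k @ 64k fits: %d\n", io_fits_window(&w, 64 * 1024, 4096));
        printf("4k @ 1M fits:  %d\n", io_fits_window(&w, 1 << 20, 4096));
        return 0;
    }
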
diff --git a/engines/net.c b/engines/net.c
index 8087207..cd19535 100644
--- a/engines/net.c
+++ b/engines/net.c
@@ -21,14 +21,18 @@
 #include <sys/un.h>
 
 #include "../fio.h"
+#include "../verify.h"
 
 struct netio_data {
 	int listenfd;
 	int use_splice;
+	int seq_off;
 	int pipes[2];
 	struct sockaddr_in addr;
 	struct sockaddr_in6 addr6;
 	struct sockaddr_un addr_un;
+	uint64_t udp_send_seq;
+	uint64_t udp_recv_seq;
 };
 
 struct netio_options {
@@ -39,6 +43,8 @@
 	unsigned int pingpong;
 	unsigned int nodelay;
 	unsigned int ttl;
+	unsigned int window_size;
+	unsigned int mss;
 	char *intfc;
 };
 
@@ -47,10 +53,17 @@
 	uint32_t cmd;
 };
 
+struct udp_seq {
+	uint64_t magic;
+	uint64_t seq;
+	uint64_t bs;
+};
+
 enum {
 	FIO_LINK_CLOSE = 0x89,
 	FIO_LINK_OPEN_CLOSE_MAGIC = 0x6c696e6b,
 	FIO_LINK_OPEN = 0x98,
+	FIO_UDP_SEQ_MAGIC = 0x657375716e556563ULL,
 
 	FIO_TYPE_TCP	= 1,
 	FIO_TYPE_UDP	= 2,
@@ -165,6 +178,30 @@
 		.category = FIO_OPT_C_ENGINE,
 		.group	= FIO_OPT_G_NETIO,
 	},
+#ifdef CONFIG_NET_WINDOWSIZE
+	{
+		.name	= "window_size",
+		.lname	= "Window Size",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct netio_options, window_size),
+		.minval	= 0,
+		.help	= "Set socket buffer window size",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NETIO,
+	},
+#endif
+#ifdef CONFIG_NET_MSS
+	{
+		.name	= "mss",
+		.lname	= "Maximum segment size",
+		.type	= FIO_OPT_INT,
+		.off1	= offsetof(struct netio_options, mss),
+		.minval	= 0,
+		.help	= "Set TCP maximum segment size",
+		.category = FIO_OPT_C_ENGINE,
+		.group	= FIO_OPT_G_NETIO,
+	},
+#endif
 	{
 		.name	= NULL,
 	},
@@ -185,6 +222,65 @@
 	return o->proto == FIO_TYPE_UDP_V6 || o->proto == FIO_TYPE_TCP_V6;
 }
 
+static int set_window_size(struct thread_data *td, int fd)
+{
+#ifdef CONFIG_NET_WINDOWSIZE
+	struct netio_options *o = td->eo;
+	unsigned int wss;
+	int snd, rcv, ret;
+
+	if (!o->window_size)
+		return 0;
+
+	rcv = o->listen || o->pingpong;
+	snd = !o->listen || o->pingpong;
+	wss = o->window_size;
+	ret = 0;
+
+	if (rcv) {
+		ret = setsockopt(fd, SOL_SOCKET, SO_RCVBUF, (void *) &wss,
+					sizeof(wss));
+		if (ret < 0)
+			td_verror(td, errno, "rcvbuf window size");
+	}
+	if (snd && !ret) {
+		ret = setsockopt(fd, SOL_SOCKET, SO_SNDBUF, (void *) &wss,
+					sizeof(wss));
+		if (ret < 0)
+			td_verror(td, errno, "sndbuf window size");
+	}
+
+	return ret;
+#else
+	td_verror(td, -EINVAL, "setsockopt window size");
+	return -1;
+#endif
+}
+
+static int set_mss(struct thread_data *td, int fd)
+{
+#ifdef CONFIG_NET_MSS
+	struct netio_options *o = td->eo;
+	unsigned int mss;
+	int ret;
+
+	if (!o->mss || !is_tcp(o))
+		return 0;
+
+	mss = o->mss;
+	ret = setsockopt(fd, IPPROTO_TCP, TCP_MAXSEG, (void *) &mss,
+				sizeof(mss));
+	if (ret < 0)
+		td_verror(td, errno, "setsockopt TCP_MAXSEG");
+
+	return ret;
+#else
+	td_verror(td, -EINVAL, "setsockopt TCP_MAXSEG");
+	return -1;
+#endif
+}
+
+
 /*
  * Return -1 for error and 'nr events' for a positive number
  * of events
@@ -384,6 +480,47 @@
 }
 #endif
 
+static void store_udp_seq(struct netio_data *nd, struct io_u *io_u)
+{
+	struct udp_seq *us;
+
+	if (io_u->xfer_buflen < sizeof(*us))
+		return;
+
+	us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us);
+	us->magic = cpu_to_le64((uint64_t) FIO_UDP_SEQ_MAGIC);
+	us->bs = cpu_to_le64((uint64_t) io_u->xfer_buflen);
+	us->seq = cpu_to_le64(nd->udp_send_seq++);
+}
+
+static void verify_udp_seq(struct thread_data *td, struct netio_data *nd,
+			   struct io_u *io_u)
+{
+	struct udp_seq *us;
+	uint64_t seq;
+
+	if (io_u->xfer_buflen < sizeof(*us))
+		return;
+
+	if (nd->seq_off)
+		return;
+
+	us = io_u->xfer_buf + io_u->xfer_buflen - sizeof(*us);
+	if (le64_to_cpu(us->magic) != FIO_UDP_SEQ_MAGIC)
+		return;
+	if (le64_to_cpu(us->bs) != io_u->xfer_buflen) {
+		nd->seq_off = 1;
+		return;
+	}
+
+	seq = le64_to_cpu(us->seq);
+
+	if (seq != nd->udp_recv_seq)
+		td->ts.drop_io_u[io_u->ddir] += seq - nd->udp_recv_seq;
+
+	nd->udp_recv_seq = seq + 1;
+}
+
 static int fio_netio_send(struct thread_data *td, struct io_u *io_u)
 {
 	struct netio_data *nd = td->io_ops->data;
@@ -403,6 +540,9 @@
 				len = sizeof(nd->addr);
 			}
 
+			if (td->o.verify == VERIFY_NONE)
+				store_udp_seq(nd, io_u);
+
 			ret = sendto(io_u->file->fd, io_u->xfer_buf,
 					io_u->xfer_buflen, flags, to, len);
 		} else {
@@ -428,7 +568,7 @@
 	return ret;
 }
 
-static int is_udp_close(struct io_u *io_u, int len)
+static int is_close_msg(struct io_u *io_u, int len)
 {
 	struct udp_close_msg *msg;
 
@@ -436,9 +576,9 @@
 		return 0;
 
 	msg = io_u->xfer_buf;
-	if (ntohl(msg->magic) != FIO_LINK_OPEN_CLOSE_MAGIC)
+	if (le32_to_cpu(msg->magic) != FIO_LINK_OPEN_CLOSE_MAGIC)
 		return 0;
-	if (ntohl(msg->cmd) != FIO_LINK_CLOSE)
+	if (le32_to_cpu(msg->cmd) != FIO_LINK_CLOSE)
 		return 0;
 
 	return 1;
@@ -470,13 +610,19 @@
 
 			ret = recvfrom(io_u->file->fd, io_u->xfer_buf,
 					io_u->xfer_buflen, flags, from, len);
-			if (is_udp_close(io_u, ret)) {
+
+			if (is_close_msg(io_u, ret)) {
 				td->done = 1;
 				return 0;
 			}
 		} else {
 			ret = recv(io_u->file->fd, io_u->xfer_buf,
 					io_u->xfer_buflen, flags);
+
+			if (is_close_msg(io_u, ret)) {
+				td->done = 1;
+				return 0;
+			}
 		}
 		if (ret > 0)
 			break;
@@ -489,6 +635,9 @@
 		flags |= MSG_WAITALL;
 	} while (1);
 
+	if (is_udp(o) && td->o.verify == VERIFY_NONE)
+		verify_udp_seq(td, nd, io_u);
+
 	return ret;
 }
 
@@ -515,11 +664,13 @@
 		ret = 0;	/* must be a SYNC */
 
 	if (ret != (int) io_u->xfer_buflen) {
-		if (ret >= 0) {
+		if (ret > 0) {
 			io_u->resid = io_u->xfer_buflen - ret;
 			io_u->error = 0;
 			return FIO_Q_COMPLETED;
-		} else {
+		} else if (!ret)
+			return FIO_Q_BUSY;
+		else {
 			int err = errno;
 
 			if (ddir == DDIR_WRITE && err == EMSGSIZE)
@@ -601,6 +752,15 @@
 	}
 #endif
 
+	if (set_window_size(td, f->fd)) {
+		close(f->fd);
+		return 1;
+	}
+	if (set_mss(td, f->fd)) {
+		close(f->fd);
+		return 1;
+	}
+
 	if (is_udp(o)) {
 		if (!fio_netio_is_multicast(td->o.filename))
 			return 0;
@@ -715,7 +875,7 @@
 	return 1;
 }
 
-static void fio_netio_udp_close(struct thread_data *td, struct fio_file *f)
+static void fio_netio_send_close(struct thread_data *td, struct fio_file *f)
 {
 	struct netio_data *nd = td->io_ops->data;
 	struct netio_options *o = td->eo;
@@ -732,8 +892,8 @@
 		len = sizeof(nd->addr);
 	}
 
-	msg.magic = htonl(FIO_LINK_OPEN_CLOSE_MAGIC);
-	msg.cmd = htonl(FIO_LINK_CLOSE);
+	msg.magic = cpu_to_le32((uint32_t) FIO_LINK_OPEN_CLOSE_MAGIC);
+	msg.cmd = cpu_to_le32((uint32_t) FIO_LINK_CLOSE);
 
 	ret = sendto(f->fd, (void *) &msg, sizeof(msg), MSG_WAITALL, to, len);
 	if (ret < 0)
@@ -742,14 +902,10 @@
 
 static int fio_netio_close_file(struct thread_data *td, struct fio_file *f)
 {
-	struct netio_options *o = td->eo;
-
 	/*
-	 * If this is an UDP connection, notify the receiver that we are
-	 * closing down the link
+	 * Notify the receiver that we are closing down the link
 	 */
-	if (is_udp(o))
-		fio_netio_udp_close(td, f);
+	fio_netio_send_close(td, f);
 
 	return generic_close_file(td, f);
 }
@@ -784,10 +940,11 @@
 		return -1;
 	}
 
+	fio_gettime(&td->start, NULL);
 	return 0;
 }
 
-static int fio_netio_udp_send_open(struct thread_data *td, struct fio_file *f)
+static int fio_netio_send_open(struct thread_data *td, struct fio_file *f)
 {
 	struct netio_data *nd = td->io_ops->data;
 	struct netio_options *o = td->eo;
@@ -833,7 +990,7 @@
 
 	if (is_udp(o)) {
 		if (td_write(td))
-			ret = fio_netio_udp_send_open(td, f);
+			ret = fio_netio_send_open(td, f);
 		else {
 			int state;
 
@@ -1042,6 +1199,15 @@
 	}
 #endif
 
+	if (set_window_size(td, fd)) {
+		close(fd);
+		return 1;
+	}
+	if (set_mss(td, fd)) {
+		close(fd);
+		return 1;
+	}
+
 	if (td->o.filename) {
 		if (!is_udp(o) || !fio_netio_is_multicast(td->o.filename)) {
 			log_err("fio: hostname not valid for non-multicast inbound network IO\n");
@@ -1148,6 +1314,8 @@
 		return 1;
 	}
 
+	o->port += td->subjob_number;
+
 	if (!is_tcp(o)) {
 		if (o->listen) {
 			log_err("fio: listen only valid for TCP proto IO\n");
@@ -1213,7 +1381,7 @@
 
 static void fio_netio_terminate(struct thread_data *td)
 {
-	kill(td->pid, SIGUSR2);
+	kill(td->pid, SIGTERM);
 }
 
 #ifdef CONFIG_LINUX_SPLICE
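
The net engine above appends a udp_seq trailer to each UDP payload when verify is off, so the receiver can count dropped datagrams as gaps in the sequence numbers. A minimal sketch of the store/check pair; endianness conversion (cpu_to_le64/le64_to_cpu) is deliberately omitted here:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    #define SEQ_MAGIC 0x657375716e556563ULL

    /* Trailer packed into the tail of every payload. */
    struct udp_seq {
        uint64_t magic;
        uint64_t seq;
        uint64_t bs;
    };

    static void store_seq(void *buf, size_t len, uint64_t *send_seq)
    {
        struct udp_seq us = {
            .magic = SEQ_MAGIC, .seq = (*send_seq)++, .bs = len,
        };

        if (len >= sizeof(us))
            memcpy((char *) buf + len - sizeof(us), &us, sizeof(us));
    }

    /* Returns how many datagrams went missing before this one. */
    static uint64_t check_seq(const void *buf, size_t len, uint64_t *recv_seq)
    {
        struct udp_seq us;
        uint64_t dropped = 0;

        if (len < sizeof(us))
            return 0;
        memcpy(&us, (const char *) buf + len - sizeof(us), sizeof(us));
        if (us.magic != SEQ_MAGIC || us.bs != len)
            return 0;

        if (us.seq != *recv_seq)
            dropped = us.seq - *recv_seq;
        *recv_seq = us.seq + 1;
        return dropped;
    }

    int main(void)
    {
        char pkt[64];
        uint64_t tx = 0, rx = 0;

        store_seq(pkt, sizeof(pkt), &tx);   /* seq 0 */
        store_seq(pkt, sizeof(pkt), &tx);   /* seq 1: pretend seq 0 was lost */
        printf("dropped: %llu\n",
               (unsigned long long) check_seq(pkt, sizeof(pkt), &rx));
        return 0;
    }
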
diff --git a/engines/null.c b/engines/null.c
index eb17b11..6000930 100644
--- a/engines/null.c
+++ b/engines/null.c
@@ -32,7 +32,7 @@
 
 static int fio_null_getevents(struct thread_data *td, unsigned int min_events,
 			      unsigned int fio_unused max,
-			      struct timespec fio_unused *t)
+			      const struct timespec fio_unused *t)
 {
 	struct null_data *nd = (struct null_data *) td->io_ops->data;
 	int ret = 0;
@@ -119,7 +119,7 @@
 	.init		= fio_null_init,
 	.cleanup	= fio_null_cleanup,
 	.open_file	= fio_null_open,
-	.flags		= FIO_DISKLESSIO,
+	.flags		= FIO_DISKLESSIO | FIO_FAKEIO,
 };
 
 static void fio_init fio_null_register(void)
@@ -152,7 +152,7 @@
 	ioengine->init           = fio_null_init;
 	ioengine->cleanup        = fio_null_cleanup;
 	ioengine->open_file      = fio_null_open;
-	ioengine->flags	         = FIO_DISKLESSIO;
+	ioengine->flags	         = FIO_DISKLESSIO | FIO_FAKEIO;
 }
 }
 #endif /* FIO_EXTERNAL_ENGINE */
diff --git a/engines/posixaio.c b/engines/posixaio.c
index 2df26af..8ab88fb 100644
--- a/engines/posixaio.c
+++ b/engines/posixaio.c
@@ -91,7 +91,7 @@
 #define SUSPEND_ENTRIES	8
 
 static int fio_posixaio_getevents(struct thread_data *td, unsigned int min,
-				  unsigned int max, struct timespec *t)
+				  unsigned int max, const struct timespec *t)
 {
 	struct posixaio_data *pd = td->io_ops->data;
 	os_aiocb_t *suspend_list[SUSPEND_ENTRIES];
diff --git a/engines/rbd.c b/engines/rbd.c
index ff35373..3688577 100644
--- a/engines/rbd.c
+++ b/engines/rbd.c
@@ -11,6 +11,8 @@
 
 struct fio_rbd_iou {
 	struct io_u *io_u;
+	rbd_completion_t completion;
+	int io_seen;
 	int io_complete;
 };
 
@@ -19,110 +21,121 @@
 	rados_ioctx_t io_ctx;
 	rbd_image_t image;
 	struct io_u **aio_events;
+	struct io_u **sort_events;
 };
 
 struct rbd_options {
-	struct thread_data *td;
+	void *pad;
 	char *rbd_name;
 	char *pool_name;
 	char *client_name;
+	int busy_poll;
 };
 
 static struct fio_option options[] = {
 	{
-	 .name     = "rbdname",
-	 .lname    = "rbd engine rbdname",
-	 .type     = FIO_OPT_STR_STORE,
-	 .help     = "RBD name for RBD engine",
-	 .off1     = offsetof(struct rbd_options, rbd_name),
-	 .category = FIO_OPT_C_ENGINE,
-	 .group    = FIO_OPT_G_RBD,
-	 },
+		.name		= "rbdname",
+		.lname		= "rbd engine rbdname",
+		.type		= FIO_OPT_STR_STORE,
+		.help		= "RBD name for RBD engine",
+		.off1		= offsetof(struct rbd_options, rbd_name),
+		.category	= FIO_OPT_C_ENGINE,
+		.group		= FIO_OPT_G_RBD,
+	},
 	{
-	 .name     = "pool",
-	 .lname    = "rbd engine pool",
-	 .type     = FIO_OPT_STR_STORE,
-	 .help     = "Name of the pool hosting the RBD for the RBD engine",
-	 .off1     = offsetof(struct rbd_options, pool_name),
-	 .category = FIO_OPT_C_ENGINE,
-	 .group    = FIO_OPT_G_RBD,
-	 },
+		.name		= "pool",
+		.lname		= "rbd engine pool",
+		.type		= FIO_OPT_STR_STORE,
+		.help		= "Name of the pool hosting the RBD for the RBD engine",
+		.off1		= offsetof(struct rbd_options, pool_name),
+		.category	= FIO_OPT_C_ENGINE,
+		.group		= FIO_OPT_G_RBD,
+	},
 	{
-	 .name     = "clientname",
-	 .lname    = "rbd engine clientname",
-	 .type     = FIO_OPT_STR_STORE,
-	 .help     = "Name of the ceph client to access the RBD for the RBD engine",
-	 .off1     = offsetof(struct rbd_options, client_name),
-	 .category = FIO_OPT_C_ENGINE,
-	 .group    = FIO_OPT_G_RBD,
-	 },
+		.name		= "clientname",
+		.lname		= "rbd engine clientname",
+		.type		= FIO_OPT_STR_STORE,
+		.help		= "Name of the ceph client to access the RBD for the RBD engine",
+		.off1		= offsetof(struct rbd_options, client_name),
+		.category	= FIO_OPT_C_ENGINE,
+		.group		= FIO_OPT_G_RBD,
+	},
 	{
-	 .name = NULL,
-	 },
+		.name		= "busy_poll",
+		.lname		= "Busy poll",
+		.type		= FIO_OPT_BOOL,
+		.help		= "Busy poll for completions instead of sleeping",
+		.off1		= offsetof(struct rbd_options, busy_poll),
+		.def		= "0",
+		.category	= FIO_OPT_C_ENGINE,
+		.group		= FIO_OPT_G_RBD,
+	},
+	{
+		.name = NULL,
+	},
 };
 
 static int _fio_setup_rbd_data(struct thread_data *td,
 			       struct rbd_data **rbd_data_ptr)
 {
-	struct rbd_data *rbd_data;
+	struct rbd_data *rbd;
 
 	if (td->io_ops->data)
 		return 0;
 
-	rbd_data = malloc(sizeof(struct rbd_data));
-	if (!rbd_data)
+	rbd = calloc(1, sizeof(struct rbd_data));
+	if (!rbd)
 		goto failed;
 
-	memset(rbd_data, 0, sizeof(struct rbd_data));
-
-	rbd_data->aio_events = malloc(td->o.iodepth * sizeof(struct io_u *));
-	if (!rbd_data->aio_events)
+	rbd->aio_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!rbd->aio_events)
 		goto failed;
 
-	memset(rbd_data->aio_events, 0, td->o.iodepth * sizeof(struct io_u *));
+	rbd->sort_events = calloc(td->o.iodepth, sizeof(struct io_u *));
+	if (!rbd->sort_events)
+		goto failed;
 
-	*rbd_data_ptr = rbd_data;
-
+	*rbd_data_ptr = rbd;
 	return 0;
 
 failed:
+	if (rbd)
+		free(rbd);
 	return 1;
 
 }
 
 static int _fio_rbd_connect(struct thread_data *td)
 {
-	struct rbd_data *rbd_data = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops->data;
 	struct rbd_options *o = td->eo;
 	int r;
 
-	r = rados_create(&(rbd_data->cluster), o->client_name);
+	r = rados_create(&rbd->cluster, o->client_name);
 	if (r < 0) {
 		log_err("rados_create failed.\n");
 		goto failed_early;
 	}
 
-	r = rados_conf_read_file(rbd_data->cluster, NULL);
+	r = rados_conf_read_file(rbd->cluster, NULL);
 	if (r < 0) {
 		log_err("rados_conf_read_file failed.\n");
 		goto failed_early;
 	}
 
-	r = rados_connect(rbd_data->cluster);
+	r = rados_connect(rbd->cluster);
 	if (r < 0) {
 		log_err("rados_connect failed.\n");
 		goto failed_shutdown;
 	}
 
-	r = rados_ioctx_create(rbd_data->cluster, o->pool_name,
-			       &(rbd_data->io_ctx));
+	r = rados_ioctx_create(rbd->cluster, o->pool_name, &rbd->io_ctx);
 	if (r < 0) {
 		log_err("rados_ioctx_create failed.\n");
 		goto failed_shutdown;
 	}
 
-	r = rbd_open(rbd_data->io_ctx, o->rbd_name, &(rbd_data->image),
-		     NULL /*snap */ );
+	r = rbd_open(rbd->io_ctx, o->rbd_name, &rbd->image, NULL /*snap */ );
 	if (r < 0) {
 		log_err("rbd_open failed.\n");
 		goto failed_open;
@@ -130,104 +143,188 @@
 	return 0;
 
 failed_open:
-	rados_ioctx_destroy(rbd_data->io_ctx);
+	rados_ioctx_destroy(rbd->io_ctx);
+	rbd->io_ctx = NULL;
 failed_shutdown:
-	rados_shutdown(rbd_data->cluster);
+	rados_shutdown(rbd->cluster);
+	rbd->cluster = NULL;
 failed_early:
 	return 1;
 }
 
-static void _fio_rbd_disconnect(struct rbd_data *rbd_data)
+static void _fio_rbd_disconnect(struct rbd_data *rbd)
 {
-	if (!rbd_data)
+	if (!rbd)
 		return;
 
 	/* shutdown everything */
-	if (rbd_data->image) {
-		rbd_close(rbd_data->image);
-		rbd_data->image = NULL;
+	if (rbd->image) {
+		rbd_close(rbd->image);
+		rbd->image = NULL;
 	}
 
-	if (rbd_data->io_ctx) {
-		rados_ioctx_destroy(rbd_data->io_ctx);
-		rbd_data->io_ctx = NULL;
+	if (rbd->io_ctx) {
+		rados_ioctx_destroy(rbd->io_ctx);
+		rbd->io_ctx = NULL;
 	}
 
-	if (rbd_data->cluster) {
-		rados_shutdown(rbd_data->cluster);
-		rbd_data->cluster = NULL;
+	if (rbd->cluster) {
+		rados_shutdown(rbd->cluster);
+		rbd->cluster = NULL;
 	}
 }
 
-static void _fio_rbd_finish_write_aiocb(rbd_completion_t comp, void *data)
+static void _fio_rbd_finish_aiocb(rbd_completion_t comp, void *data)
 {
-	struct io_u *io_u = (struct io_u *)data;
-	struct fio_rbd_iou *fio_rbd_iou =
-	    (struct fio_rbd_iou *)io_u->engine_data;
+	struct fio_rbd_iou *fri = data;
+	struct io_u *io_u = fri->io_u;
+	ssize_t ret;
 
-	fio_rbd_iou->io_complete = 1;
-
-	/* if write needs to be verified - we should not release comp here
-	   without fetching the result */
-
-	rbd_aio_release(comp);
-	/* TODO handle error */
-
-	return;
-}
-
-static void _fio_rbd_finish_read_aiocb(rbd_completion_t comp, void *data)
-{
-	struct io_u *io_u = (struct io_u *)data;
-	struct fio_rbd_iou *fio_rbd_iou =
-	    (struct fio_rbd_iou *)io_u->engine_data;
-
-	fio_rbd_iou->io_complete = 1;
-
-	/* if read needs to be verified - we should not release comp here
-	   without fetching the result */
-	rbd_aio_release(comp);
-
-	/* TODO handle error */
-
-	return;
+	/*
+	 * Looks like return value is 0 for success, or < 0 for
+	 * a specific error. So we have to assume that it can't do
+	 * partial completions.
+	 */
+	fri->io_complete = 1;
+	
+	ret = rbd_aio_get_return_value(fri->completion);
+	if (ret < 0) {
+		io_u->error = ret;
+		io_u->resid = io_u->xfer_buflen;
+	} else
+		io_u->error = 0;
 }
 
 static struct io_u *fio_rbd_event(struct thread_data *td, int event)
 {
-	struct rbd_data *rbd_data = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops->data;
 
-	return rbd_data->aio_events[event];
+	return rbd->aio_events[event];
+}
+
+static inline int fri_check_complete(struct rbd_data *rbd, struct io_u *io_u,
+				     unsigned int *events)
+{
+	struct fio_rbd_iou *fri = io_u->engine_data;
+
+	if (fri->io_complete) {
+		fri->io_seen = 1;
+		rbd->aio_events[*events] = io_u;
+		(*events)++;
+
+		rbd_aio_release(fri->completion);
+		return 1;
+	}
+
+	return 0;
+}
+
+static inline int rbd_io_u_seen(struct io_u *io_u)
+{
+	struct fio_rbd_iou *fri = io_u->engine_data;
+
+	return fri->io_seen;
+}
+
+static void rbd_io_u_wait_complete(struct io_u *io_u)
+{
+	struct fio_rbd_iou *fri = io_u->engine_data;
+
+	rbd_aio_wait_for_complete(fri->completion);
+}
+
+static int rbd_io_u_cmp(const void *p1, const void *p2)
+{
+	const struct io_u **a = (const struct io_u **) p1;
+	const struct io_u **b = (const struct io_u **) p2;
+	uint64_t at, bt;
+
+	at = utime_since_now(&(*a)->start_time);
+	bt = utime_since_now(&(*b)->start_time);
+
+	if (at < bt)
+		return -1;
+	else if (at == bt)
+		return 0;
+	else
+		return 1;
+}
+
+static int rbd_iter_events(struct thread_data *td, unsigned int *events,
+			   unsigned int min_evts, int wait)
+{
+	struct rbd_data *rbd = td->io_ops->data;
+	unsigned int this_events = 0;
+	struct io_u *io_u;
+	int i, sidx;
+
+	sidx = 0;
+	io_u_qiter(&td->io_u_all, io_u, i) {
+		if (!(io_u->flags & IO_U_F_FLIGHT))
+			continue;
+		if (rbd_io_u_seen(io_u))
+			continue;
+
+		if (fri_check_complete(rbd, io_u, events))
+			this_events++;
+		else if (wait)
+			rbd->sort_events[sidx++] = io_u;
+	}
+
+	if (!wait || !sidx)
+		return this_events;
+
+	/*
+	 * Sort events, oldest issue first, then wait on as many as we
+	 * need in order of age. If we have enough events, stop waiting,
+	 * and just check if any of the older ones are done.
+	 */
+	if (sidx > 1)
+		qsort(rbd->sort_events, sidx, sizeof(struct io_u *), rbd_io_u_cmp);
+
+	for (i = 0; i < sidx; i++) {
+		io_u = rbd->sort_events[i];
+
+		if (fri_check_complete(rbd, io_u, events)) {
+			this_events++;
+			continue;
+		}
+
+		/*
+		 * Stop waiting when we have enough, but continue checking
+		 * all pending IOs if they are complete.
+		 */
+		if (*events >= min_evts)
+			continue;
+
+		rbd_io_u_wait_complete(io_u);
+
+		if (fri_check_complete(rbd, io_u, events))
+			this_events++;
+	}
+
+	return this_events;
 }
 
 static int fio_rbd_getevents(struct thread_data *td, unsigned int min,
-			     unsigned int max, struct timespec *t)
+			     unsigned int max, const struct timespec *t)
 {
-	struct rbd_data *rbd_data = td->io_ops->data;
-	unsigned int events = 0;
-	struct io_u *io_u;
-	int i;
-	struct fio_rbd_iou *fov;
+	unsigned int this_events, events = 0;
+	struct rbd_options *o = td->eo;
+	int wait = 0;
 
 	do {
-		io_u_qiter(&td->io_u_all, io_u, i) {
-			if (!(io_u->flags & IO_U_F_FLIGHT))
-				continue;
+		this_events = rbd_iter_events(td, &events, min, wait);
 
-			fov = (struct fio_rbd_iou *)io_u->engine_data;
-
-			if (fov->io_complete) {
-				fov->io_complete = 0;
-				rbd_data->aio_events[events] = io_u;
-				events++;
-			}
-
-		}
-		if (events < min)
-			usleep(100);
-		else
+		if (events >= min)
 			break;
+		if (this_events)
+			continue;
 
+		if (!o->busy_poll)
+			wait = 1;
+		else
+			nop;
 	} while (1);
 
 	return events;
@@ -235,65 +332,60 @@
 
 static int fio_rbd_queue(struct thread_data *td, struct io_u *io_u)
 {
+	struct rbd_data *rbd = td->io_ops->data;
+	struct fio_rbd_iou *fri = io_u->engine_data;
 	int r = -1;
-	struct rbd_data *rbd_data = td->io_ops->data;
-	rbd_completion_t comp;
 
 	fio_ro_check(td, io_u);
 
-	if (io_u->ddir == DDIR_WRITE) {
-		r = rbd_aio_create_completion(io_u,
-					      (rbd_callback_t)
-					      _fio_rbd_finish_write_aiocb,
-					      &comp);
-		if (r < 0) {
-			log_err
-			    ("rbd_aio_create_completion for DDIR_WRITE failed.\n");
-			goto failed;
-		}
+	fri->io_seen = 0;
+	fri->io_complete = 0;
 
-		r = rbd_aio_write(rbd_data->image, io_u->offset,
-				  io_u->xfer_buflen, io_u->xfer_buf, comp);
+	r = rbd_aio_create_completion(fri, _fio_rbd_finish_aiocb,
+						&fri->completion);
+	if (r < 0) {
+		log_err("rbd_aio_create_completion failed.\n");
+		goto failed;
+	}
+
+	if (io_u->ddir == DDIR_WRITE) {
+		r = rbd_aio_write(rbd->image, io_u->offset, io_u->xfer_buflen,
+					 io_u->xfer_buf, fri->completion);
 		if (r < 0) {
 			log_err("rbd_aio_write failed.\n");
-			goto failed;
+			goto failed_comp;
 		}
 
 	} else if (io_u->ddir == DDIR_READ) {
-		r = rbd_aio_create_completion(io_u,
-					      (rbd_callback_t)
-					      _fio_rbd_finish_read_aiocb,
-					      &comp);
-		if (r < 0) {
-			log_err
-			    ("rbd_aio_create_completion for DDIR_READ failed.\n");
-			goto failed;
-		}
-
-		r = rbd_aio_read(rbd_data->image, io_u->offset,
-				 io_u->xfer_buflen, io_u->xfer_buf, comp);
+		r = rbd_aio_read(rbd->image, io_u->offset, io_u->xfer_buflen,
+					io_u->xfer_buf, fri->completion);
 
 		if (r < 0) {
 			log_err("rbd_aio_read failed.\n");
-			goto failed;
+			goto failed_comp;
 		}
-
+	} else if (io_u->ddir == DDIR_TRIM) {
+		r = rbd_aio_discard(rbd->image, io_u->offset,
+					io_u->xfer_buflen, fri->completion);
+		if (r < 0) {
+			log_err("rbd_aio_discard failed.\n");
+			goto failed_comp;
+		}
 	} else if (io_u->ddir == DDIR_SYNC) {
-		r = rbd_flush(rbd_data->image);
+		r = rbd_aio_flush(rbd->image, fri->completion);
 		if (r < 0) {
 			log_err("rbd_flush failed.\n");
-			goto failed;
+			goto failed_comp;
 		}
-
-		return FIO_Q_COMPLETED;
 	} else {
 		dprint(FD_IO, "%s: Warning: unhandled ddir: %d\n", __func__,
 		       io_u->ddir);
-		return FIO_Q_COMPLETED;
+		goto failed_comp;
 	}
 
 	return FIO_Q_QUEUED;
-
+failed_comp:
+	rbd_aio_release(fri->completion);
 failed:
 	io_u->error = r;
 	td_verror(td, io_u->error, "xfer");
@@ -314,43 +406,43 @@
 
 failed:
 	return 1;
-
 }
 
 static void fio_rbd_cleanup(struct thread_data *td)
 {
-	struct rbd_data *rbd_data = td->io_ops->data;
+	struct rbd_data *rbd = td->io_ops->data;
 
-	if (rbd_data) {
-		_fio_rbd_disconnect(rbd_data);
-		free(rbd_data->aio_events);
-		free(rbd_data);
+	if (rbd) {
+		_fio_rbd_disconnect(rbd);
+		free(rbd->aio_events);
+		free(rbd->sort_events);
+		free(rbd);
 	}
-
 }
 
 static int fio_rbd_setup(struct thread_data *td)
 {
-	int r = 0;
 	rbd_image_info_t info;
 	struct fio_file *f;
-	struct rbd_data *rbd_data = NULL;
+	struct rbd_data *rbd = NULL;
 	int major, minor, extra;
+	int r;
 
 	/* log version of librbd. No cluster connection required. */
 	rbd_version(&major, &minor, &extra);
 	log_info("rbd engine: RBD version: %d.%d.%d\n", major, minor, extra);
 
 	/* allocate engine specific structure to deal with librbd. */
-	r = _fio_setup_rbd_data(td, &rbd_data);
+	r = _fio_setup_rbd_data(td, &rbd);
 	if (r) {
 		log_err("fio_setup_rbd_data failed.\n");
 		goto cleanup;
 	}
-	td->io_ops->data = rbd_data;
+	td->io_ops->data = rbd;
 
-	/* librbd does not allow us to run first in the main thread and later in a
-	 * fork child. It needs to be the same process context all the time. 
+	/* librbd does not allow us to run first in the main thread and later
+	 * in a fork child. It needs to be the same process context all the
+	 * time. 
 	 */
 	td->o.use_thread = 1;
 
@@ -365,7 +457,7 @@
 	}
 
 	/* get size of the RADOS block device */
-	r = rbd_stat(rbd_data->image, &info, sizeof(info));
+	r = rbd_stat(rbd->image, &info, sizeof(info));
 	if (r < 0) {
 		log_err("rbd_status failed.\n");
 		goto disconnect;
@@ -387,11 +479,11 @@
 	/* disconnect, then we were only connected to determine
 	 * the size of the RBD.
 	 */
-	_fio_rbd_disconnect(rbd_data);
+	_fio_rbd_disconnect(rbd);
 	return 0;
 
 disconnect:
-	_fio_rbd_disconnect(rbd_data);
+	_fio_rbd_disconnect(rbd);
 cleanup:
 	fio_rbd_cleanup(td);
 	return r;
@@ -402,41 +494,52 @@
 	return 0;
 }
 
+static int fio_rbd_invalidate(struct thread_data *td, struct fio_file *f)
+{
+#if defined(CONFIG_RBD_INVAL)
+	struct rbd_data *rbd = td->io_ops->data;
+
+	return rbd_invalidate_cache(rbd->image);
+#else
+	return 0;
+#endif
+}
+
 static void fio_rbd_io_u_free(struct thread_data *td, struct io_u *io_u)
 {
-	struct fio_rbd_iou *o = io_u->engine_data;
+	struct fio_rbd_iou *fri = io_u->engine_data;
 
-	if (o) {
+	if (fri) {
 		io_u->engine_data = NULL;
-		free(o);
+		free(fri);
 	}
 }
 
 static int fio_rbd_io_u_init(struct thread_data *td, struct io_u *io_u)
 {
-	struct fio_rbd_iou *o;
+	struct fio_rbd_iou *fri;
 
-	o = malloc(sizeof(*o));
-	o->io_complete = 0;
-	o->io_u = io_u;
-	io_u->engine_data = o;
+	fri = calloc(1, sizeof(*fri));
+	fri->io_u = io_u;
+	io_u->engine_data = fri;
 	return 0;
 }
 
 static struct ioengine_ops ioengine = {
-	.name               = "rbd",
-	.version            = FIO_IOOPS_VERSION,
-	.setup              = fio_rbd_setup,
-	.init               = fio_rbd_init,
-	.queue              = fio_rbd_queue,
-	.getevents          = fio_rbd_getevents,
-	.event              = fio_rbd_event,
-	.cleanup            = fio_rbd_cleanup,
-	.open_file          = fio_rbd_open,
-	.options            = options,
-	.io_u_init          = fio_rbd_io_u_init,
-	.io_u_free          = fio_rbd_io_u_free,
-	.option_struct_size = sizeof(struct rbd_options),
+	.name			= "rbd",
+	.version		= FIO_IOOPS_VERSION,
+	.setup			= fio_rbd_setup,
+	.init			= fio_rbd_init,
+	.queue			= fio_rbd_queue,
+	.getevents		= fio_rbd_getevents,
+	.event			= fio_rbd_event,
+	.cleanup		= fio_rbd_cleanup,
+	.open_file		= fio_rbd_open,
+	.invalidate		= fio_rbd_invalidate,
+	.options		= options,
+	.io_u_init		= fio_rbd_io_u_init,
+	.io_u_free		= fio_rbd_io_u_free,
+	.option_struct_size	= sizeof(struct rbd_options),
 };
 
 static void fio_init fio_rbd_register(void)
diff --git a/engines/rdma.c b/engines/rdma.c
index af50187..5081202 100644
--- a/engines/rdma.c
+++ b/engines/rdma.c
@@ -524,7 +524,7 @@
 }
 
 static int fio_rdmaio_getevents(struct thread_data *td, unsigned int min,
-				unsigned int max, struct timespec *t)
+				unsigned int max, const struct timespec *t)
 {
 	struct rdmaio_data *rd = td->io_ops->data;
 	enum ibv_wc_opcode comp_opcode;
diff --git a/engines/sg.c b/engines/sg.c
index 1a027da..6272b79 100644
--- a/engines/sg.c
+++ b/engines/sg.c
@@ -62,7 +62,8 @@
 }
 
 static int fio_sgio_getevents(struct thread_data *td, unsigned int min,
-			      unsigned int max, struct timespec fio_unused *t)
+			      unsigned int max,
+			      const struct timespec fio_unused *t)
 {
 	struct sgio_data *sd = td->io_ops->data;
 	int left = max, ret, r = 0;
diff --git a/engines/skeleton_external.c b/engines/skeleton_external.c
index f9a0e1c..63a6f8d 100644
--- a/engines/skeleton_external.c
+++ b/engines/skeleton_external.c
@@ -38,7 +38,7 @@
  * numbers. Required.
  */
 static int fio_skeleton_getevents(struct thread_data *td, unsigned int min,
-				  unsigned int max, struct timespec *t)
+				  unsigned int max, const struct timespec *t)
 {
 	return 0;
 }
diff --git a/engines/solarisaio.c b/engines/solarisaio.c
index 137dc22..55a0cb9 100644
--- a/engines/solarisaio.c
+++ b/engines/solarisaio.c
@@ -73,7 +73,7 @@
 }
 
 static int fio_solarisaio_getevents(struct thread_data *td, unsigned int min,
-				    unsigned int max, struct timespec *t)
+				    unsigned int max, const struct timespec *t)
 {
 	struct solarisaio_data *sd = td->io_ops->data;
 	struct timeval tv;
diff --git a/engines/sync.c b/engines/sync.c
index 1329946..48aafff 100644
--- a/engines/sync.c
+++ b/engines/sync.c
@@ -63,8 +63,10 @@
 			io_u->error = errno;
 	}
 
-	if (io_u->error)
+	if (io_u->error) {
+		io_u_log_error(td, io_u);
 		td_verror(td, io_u->error, "xfer");
+	}
 
 	return FIO_Q_COMPLETED;
 }
@@ -138,7 +140,7 @@
 
 static int fio_vsyncio_getevents(struct thread_data *td, unsigned int min,
 				 unsigned int max,
-				 struct timespec fio_unused *t)
+				 const struct timespec fio_unused *t)
 {
 	struct syncio_data *sd = td->io_ops->data;
 	int ret;
diff --git a/engines/windowsaio.c b/engines/windowsaio.c
index 16df740..ec8222c 100644
--- a/engines/windowsaio.c
+++ b/engines/windowsaio.c
@@ -37,7 +37,7 @@
 
 static BOOL timeout_expired(DWORD start_count, DWORD end_count);
 static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-					unsigned int max, struct timespec *t);
+				unsigned int max, const struct timespec *t);
 static struct io_u *fio_windowsaio_event(struct thread_data *td, int event);
 static int fio_windowsaio_queue(struct thread_data *td,
 				  struct io_u *io_u);
@@ -256,7 +256,8 @@
 }
 
 static int fio_windowsaio_getevents(struct thread_data *td, unsigned int min,
-					unsigned int max, struct timespec *t)
+				    unsigned int max,
+				    const struct timespec *t)
 {
 	struct windowsaio_data *wd = td->io_ops->data;
 	unsigned int dequeued = 0;
diff --git a/eta.c b/eta.c
index 7500082..167bf5f 100644
--- a/eta.c
+++ b/eta.c
@@ -7,14 +7,33 @@
 
 #include "fio.h"
 
-static char run_str[REAL_MAX_JOBS + 1];
+static char __run_str[REAL_MAX_JOBS + 1];
+static char run_str[__THREAD_RUNSTR_SZ(REAL_MAX_JOBS)];
+
+static void update_condensed_str(char *rstr, char *run_str_condensed)
+{
+	if (*rstr) {
+		while (*rstr) {
+			int nr = 1;
+
+			*run_str_condensed++ = *rstr++;
+			while (*(rstr - 1) == *rstr) {
+				rstr++;
+				nr++;
+			}
+			run_str_condensed += sprintf(run_str_condensed, "(%u),", nr);
+		}
+		run_str_condensed--;
+	}
+	*run_str_condensed = '\0';
+}
 
 /*
  * Sets the status of the 'td' in the printed status map.
  */
 static void check_str_update(struct thread_data *td)
 {
-	char c = run_str[td->thread_number - 1];
+	char c = __run_str[td->thread_number - 1];
 
 	switch (td->runstate) {
 	case TD_REAPED:
@@ -91,7 +110,8 @@
 		log_err("state %d\n", td->runstate);
 	}
 
-	run_str[td->thread_number - 1] = c;
+	__run_str[td->thread_number - 1] = c;
+	update_condensed_str(__run_str, run_str);
 }
 
 /*
@@ -214,11 +234,11 @@
 		 * if given, otherwise assume it'll run at the specified rate.
 		 */
 		if (td->o.timeout) {
-			uint64_t timeout = td->o.timeout;
+			uint64_t __timeout = td->o.timeout;
 			uint64_t start_delay = td->o.start_delay;
 			uint64_t ramp_time = td->o.ramp_time;
 
-			t_eta = timeout + start_delay + ramp_time;
+			t_eta = __timeout + start_delay + ramp_time;
 			t_eta /= 1000000ULL;
 
 			if (in_ramp_time(td)) {
@@ -372,10 +392,9 @@
 		} else if (td->runstate == TD_RAMP) {
 			je->nr_running++;
 			je->nr_ramp++;
-		} else if (td->runstate == TD_SETTING_UP) {
-			je->nr_running++;
+		} else if (td->runstate == TD_SETTING_UP)
 			je->nr_setting_up++;
-		} else if (td->runstate < TD_RUNNING)
+		else if (td->runstate < TD_RUNNING)
 			je->nr_pending++;
 
 		if (je->elapsed_sec >= 3)
@@ -446,7 +465,8 @@
 		return 0;
 
 	je->nr_threads = thread_number;
-	memcpy(je->run_str, run_str, thread_number * sizeof(char));
+	update_condensed_str(__run_str, run_str);
+	memcpy(je->run_str, run_str, strlen(run_str));
 	return 1;
 }
 
@@ -535,8 +555,7 @@
 	if (!eta_new_line_init) {
 		fio_gettime(&disp_eta_new_line, NULL);
 		eta_new_line_init = 1;
-	} else if (eta_new_line &&
-		   mtime_since_now(&disp_eta_new_line) > eta_new_line * 1000) {
+	} else if (eta_new_line && mtime_since_now(&disp_eta_new_line) > eta_new_line) {
 		fio_gettime(&disp_eta_new_line, NULL);
 		eta_new_line_pending = 1;
 	}
@@ -544,19 +563,35 @@
 	fflush(stdout);
 }
 
+struct jobs_eta *get_jobs_eta(int force, size_t *size)
+{
+	struct jobs_eta *je;
+
+	if (!thread_number)
+		return NULL;
+
+	*size = sizeof(*je) + THREAD_RUNSTR_SZ;
+	je = malloc(*size);
+	if (!je)
+		return NULL;
+	memset(je, 0, *size);
+
+	if (!calc_thread_status(je, force)) {
+		free(je);
+		return NULL;
+	}
+
+	*size = sizeof(*je) + strlen((char *) je->run_str) + 1;
+	return je;
+}
+
 void print_thread_status(void)
 {
 	struct jobs_eta *je;
 	size_t size;
 
-	if (!thread_number)
-		return;
-
-	size = sizeof(*je) + thread_number * sizeof(char) + 1;
-	je = malloc(size);
-	memset(je, 0, size);
-
-	if (calc_thread_status(je, 0))
+	je = get_jobs_eta(0, &size);
+	if (je)
 		display_thread_status(je);
 
 	free(je);
@@ -564,5 +599,6 @@
 
 void print_status_init(int thr_number)
 {
-	run_str[thr_number] = 'P';
+	__run_str[thr_number] = 'P';
+	update_condensed_str(__run_str, run_str);
 }
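The update_condensed_str() helper added above turns the per-thread status map into a run-length encoded string, so very wide job counts no longer print one character per job. A minimal standalone sketch of the same condensing logic (the real function in eta.c is static; this copy exists purely for illustration):

#include <assert.h>
#include <stdio.h>
#include <string.h>

/* Same run-length condensing as update_condensed_str() in eta.c. */
static void condense(const char *rstr, char *out)
{
	if (*rstr) {
		while (*rstr) {
			unsigned int nr = 1;

			*out++ = *rstr++;
			while (*(rstr - 1) == *rstr) {
				rstr++;
				nr++;
			}
			out += sprintf(out, "(%u),", nr);
		}
		out--;		/* back up over the trailing comma */
	}
	*out = '\0';
}

int main(void)
{
	char buf[64];

	/* e.g. a status map with two 'P' entries and four 'R' entries */
	condense("PPRRRR", buf);
	assert(!strcmp(buf, "P(2),R(4)"));
	printf("%s\n", buf);
	return 0;
}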
diff --git a/examples/gfapi.fio b/examples/gfapi.fio
new file mode 100644
index 0000000..ccc8123
--- /dev/null
+++ b/examples/gfapi.fio
@@ -0,0 +1,16 @@
+# Test opening a file from multiple jobs.
+# Originally authored by Castor Fu
+[global]
+ioengine=gfapi
+volume=vol
+brick=localhost
+create_on_open=1
+rw=write
+
+[reopen_file_test]
+nrfiles=4
+filesize=16k
+size=64k
+openfiles=2
+rw=write
+filename_format=reopen_test.$filenum
diff --git a/examples/libhdfs.fio b/examples/libhdfs.fio
new file mode 100644
index 0000000..d5c0ba6
--- /dev/null
+++ b/examples/libhdfs.fio
@@ -0,0 +1,8 @@
+[global]
+runtime=300
+
+[hdfs]
+filename=dfs-perftest-base.dfs-perftest-base,9000
+ioengine=libhdfs
+rw=read
+bs=256k
diff --git a/examples/rbd.fio b/examples/rbd.fio
index fcb494a..c6901f4 100644
--- a/examples/rbd.fio
+++ b/examples/rbd.fio
@@ -1,9 +1,9 @@
 ######################################################################
 # Example test for the RBD engine.
 # 
-# Runs a 4k random write test agains a RBD via librbd  
+# Runs a 4k random write test against a RBD via librbd
 #
-# NOTE: Make sure you have either a RBD named 'fio_test' or change 
+# NOTE: Make sure you have either a RBD named 'fio_test' or change
 #       the rbdname parameter.
 ######################################################################
 [global]
@@ -15,7 +15,6 @@
 clientname=admin
 pool=rbd
 rbdname=fio_test
-invalidate=0	# mandatory
 rw=randwrite
 bs=4k
 
diff --git a/examples/ssd-test.fio b/examples/ssd-test.fio
index c84cf50..2b6a590 100644
--- a/examples/ssd-test.fio
+++ b/examples/ssd-test.fio
@@ -14,7 +14,7 @@
 bs=4k
 ioengine=libaio
 iodepth=4
-size=1g
+size=10g
 direct=1
 runtime=60
 directory=/mount-point-of-ssd
diff --git a/exp/README.md b/exp/README.md
new file mode 100644
index 0000000..48c11c9
--- /dev/null
+++ b/exp/README.md
@@ -0,0 +1,7 @@
+simple-expression-parser
+========================
+
+A simple expression parser for arithmetic expressions made with bison + flex
+
+To use, see the example test-expression-parser.c
+
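As a rough sketch of how the parser can be driven from C (the entry point and its signature are taken from expression-parser.y below; the expression, expected value and error handling here are illustrative only):

#include <stdio.h>

/* Defined in exp/expression-parser.y; link against the generated parser. */
extern int evaluate_arithmetic_expression(const char *buffer, long long *ival,
					  double *dval, double implied_units,
					  int is_time);

int main(void)
{
	long long ival;
	double dval;

	/* "(2+3)*4k" should evaluate to 5 * 4 * 1024 = 20480 */
	if (evaluate_arithmetic_expression("(2+3)*4k", &ival, &dval, 1.0, 0))
		fprintf(stderr, "parse error\n");
	else
		printf("%lld (%f)\n", ival, dval);
	return 0;
}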
diff --git a/exp/expression-parser.l b/exp/expression-parser.l
new file mode 100644
index 0000000..50bd383
--- /dev/null
+++ b/exp/expression-parser.l
@@ -0,0 +1,184 @@
+%{
+
+/*
+ * (C) Copyright 2014, Stephen M. Cameron.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include "y.tab.h"
+
+#define YYSTYPE PARSER_VALUE_TYPE
+
+extern int lexer_input(char *buffer, unsigned int *nbytes, int buffersize);
+
+#undef YY_INPUT
+#define YY_INPUT(buffer, bytes_read, bytes_requested)			\
+({									\
+	int __ret;							\
+	unsigned int __bread = bytes_read;				\
+	__ret = lexer_input((buffer), &__bread, (bytes_requested));	\
+	bytes_read = __bread;						\
+	__ret;								\
+})
+
+extern int yyerror(long long *result, double *dresult,
+		int *has_error, int *units_specified, const char *msg);
+
+static void __attribute__((unused)) yyunput(int c, char *buf_ptr);
+static int __attribute__((unused)) input(void);
+
+/* set by parser -- this is another thing which makes the parser thread-unsafe :(. */
+int lexer_value_is_time = 0; /* for determining if "m" suffix means mega- or minutes */
+
+#define set_suffix_value(yylval, i_val, d_val, has_d_val) \
+	(yylval).v.dval = (d_val); \
+	(yylval).v.ival = (i_val); \
+	(yylval).v.has_dval = (has_d_val); \
+	(yylval).v.has_error = 0;
+
+%}
+
+%%
+
+
+[kK]|[kK][bB] 	{
+			set_suffix_value(yylval, 1024, 1024.0, 0);
+			return SUFFIX;
+		}
+[Mm][bB]	{
+			set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[mM][sS]	{
+			set_suffix_value(yylval, 1000, 1000.0, 1);
+			return SUFFIX;
+		}
+[uU][sS]	{
+			set_suffix_value(yylval, 1, 1.0, 1);
+			return SUFFIX;
+		}
+[gG]|[Gg][Bb]	{
+			set_suffix_value(yylval, 1024LL * 1024 * 1024, 1024.0 * 1024.0 * 1024, 0);
+			return SUFFIX;
+		}
+[tT]|[tT][bB]	{	
+			set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024,
+						1024.0 * 1024.0 * 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[pP]|[pP][bB]	{	
+			set_suffix_value(yylval, 1024LL * 1024 * 1024 * 1024 * 1024,
+					1024.0 * 1024.0 * 1024.0 * 1024.0 * 1024.0, 0);
+			return SUFFIX;
+		}
+[kK][iI][Bb]	{
+			set_suffix_value(yylval, 1000LL, 1000.0, 0);
+			return SUFFIX;
+		}
+[mM][Ii][bB]	{
+			set_suffix_value(yylval, 1000000LL, 1000000.0 , 0);
+			return SUFFIX;
+		}
+[gG][iI][Bb]	{
+			set_suffix_value(yylval, 1000000000LL, 1000000000.0 , 0);
+			return SUFFIX;
+		}
+[pP][iI][Bb]	{	
+			set_suffix_value(yylval, 1000000000000LL, 1000000000000.0 , 0);
+			return SUFFIX;
+		}
+[sS]		{
+			set_suffix_value(yylval, 1000000LL, 1000000.0 , 0);
+			return SUFFIX;
+		}
+[mM]		{
+			if (!lexer_value_is_time) {
+				set_suffix_value(yylval, 1024 * 1024, 1024.0 * 1024.0, 0);
+			} else {
+				set_suffix_value(yylval, 60LL * 1000000LL, 60.0 * 1000000.0, 0);
+			}
+			return SUFFIX;
+		}
+[dD]		{
+			set_suffix_value(yylval, 60LL * 60LL * 24LL * 1000000LL,
+						60.0 * 60.0 * 24.0 * 1000000.0, 0);
+			return SUFFIX;
+		}
+[hH]		{	
+			set_suffix_value(yylval, 60LL * 60LL * 1000000LL,
+					60.0 * 60.0 * 1000000.0, 0);
+			return SUFFIX;
+		}
+[ \t] ; /* ignore whitespace */
+[#:,].* ; /* ignore comments, and everything after colons and commas */
+[0-9]*[.][0-9]+|[0-9]*[.]?[0-9]+[eE][-+]*[0-9]+ {
+			int rc;
+			double dval;
+
+			rc = sscanf(yytext, "%lf", &dval);
+			if (rc == 1) {
+				yylval.v.dval = dval;
+				yylval.v.ival = (long long) dval;
+				yylval.v.has_dval = 1;
+				yylval.v.has_error = 0;
+				return NUMBER;
+			} else {
+				yyerror(0, 0, 0, 0, "bad number\n");
+				yylval.v.has_error = 1;
+				return NUMBER;
+			}
+		}
+0x[0-9a-fA-F]+ {
+		int rc, intval;
+		rc = sscanf(yytext, "%x", &intval);
+		if (rc == 1) {
+			yylval.v.ival = intval;
+			yylval.v.dval = (double) intval;
+			yylval.v.has_dval = 0;
+			yylval.v.has_error = 0;
+			return NUMBER;
+		} else {
+			yyerror(0, 0, 0, 0, "bad number\n");
+			yylval.v.has_error = 1;
+			return NUMBER;
+		}
+	}
+[0-9]+	{
+		int rc, intval;
+		rc = sscanf(yytext, "%d", &intval);
+		if (rc == 1) {
+			yylval.v.ival = intval;
+			yylval.v.dval = (double) intval;
+			yylval.v.has_dval = 0;
+			yylval.v.has_error = 0;
+			return NUMBER;
+		} else {
+			yyerror(0, 0, 0, 0, "bad number\n");
+			yylval.v.has_error = 1;
+			return NUMBER;
+		}
+	}
+\n	return 0;
+[+-/*()^%]	return yytext[0];
+
+.	{
+		yylval.v.has_error = 1;
+		return NUMBER;	
+	}
+%%
+
diff --git a/exp/expression-parser.y b/exp/expression-parser.y
new file mode 100644
index 0000000..d664b8e
--- /dev/null
+++ b/exp/expression-parser.y
@@ -0,0 +1,247 @@
+%{
+
+/*
+ * (C) Copyright 2014, Stephen M. Cameron.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+#include <math.h>
+
+struct parser_value_type {
+	double dval;
+	long long ival;
+	int has_dval;
+	int has_error;
+};
+
+typedef union valtype {
+	struct parser_value_type v;
+} PARSER_VALUE_TYPE;
+
+#define YYSTYPE PARSER_VALUE_TYPE
+
+int yyerror(__attribute__((unused)) long long *result,
+		__attribute__((unused)) double *dresult,
+		__attribute__((unused)) int *has_error,
+		__attribute__((unused)) int *units_specified,
+		__attribute__((unused)) const char *msg);
+
+extern int yylex(void);
+extern void yyrestart(FILE *file);
+extern int lexer_value_is_time;
+
+%}
+
+%union valtype {
+	struct parser_value_type {
+		double dval;
+		long long ival;
+		int has_dval;
+		int has_error;
+	} v;
+};
+
+%token <v> NUMBER
+%token <v> BYE
+%token <v> SUFFIX 
+%left '-' '+'
+%right SUFFIX
+%left '*' '/'
+%right '^'
+%left '%'
+%nonassoc UMINUS
+%parse-param { long long *result }
+%parse-param { double *dresult }
+%parse-param { int *has_error }
+%parse-param { int *units_specified }
+
+%type <v> expression
+%%
+
+top_level:	expression {
+				*result = $1.ival;
+				*dresult = $1.dval;
+				*has_error = $1.has_error;
+			}
+		| expression error {
+				*result = $1.ival;
+				*dresult = $1.dval;
+				*has_error = 1;
+			}
+expression:	expression '+' expression { 
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival + $3.ival;
+			else
+				$$.ival = (long long) ($1.dval + $3.dval);
+			$$.dval = $1.dval + $3.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	|	expression '-' expression {
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival - $3.ival; 
+			else
+				$$.ival = (long long) ($1.dval - $3.dval); 
+			$$.dval = $1.dval - $3.dval; 
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	|	expression '*' expression {
+			if (!$1.has_dval && !$3.has_dval)
+				$$.ival = $1.ival * $3.ival;
+			else
+				$$.ival = (long long) ($1.dval * $3.dval);
+			$$.dval = $1.dval * $3.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	|	expression '/' expression {
+			if ($3.ival == 0)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else
+				$$.ival = $1.ival / $3.ival;
+			if ($3.dval < 1e-20 && $3.dval > -1e-20)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else
+				$$.dval = $1.dval / $3.dval;
+			if ($3.has_dval || $1.has_dval)
+				$$.ival = (long long) $$.dval;
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	|	'-' expression %prec UMINUS {
+			$$.ival = -$2.ival;
+			$$.dval = -$2.dval;
+			$$.has_error = $2.has_error;
+		}
+	|	'(' expression ')' { $$ = $2; }
+	|	expression SUFFIX {
+			if (!$1.has_dval && !$2.has_dval)
+				$$.ival = $1.ival * $2.ival;
+			else
+				$$.ival = (long long) $1.dval * $2.dval;
+			if ($1.has_dval || $2.has_dval)
+				$$.dval = $1.dval * $2.dval;
+			else
+				$$.dval = $1.ival * $2.ival;
+			$$.has_error = $1.has_error || $2.has_error;
+			*units_specified = 1;
+		}
+	|	expression '%' expression {
+			if ($1.has_dval || $3.has_dval)
+				yyerror(0, 0, 0, 0, "modulo on floats");
+			if ($3.ival == 0)
+				yyerror(0, 0, 0, 0, "divide by zero");
+			else {
+				$$.ival = $1.ival % $3.ival;
+				$$.dval = $$.ival;
+			}
+			$$.has_error = $1.has_error || $3.has_error;
+		}
+	|	expression '^' expression {
+			$$.has_error = $1.has_error || $3.has_error;
+			if (!$1.has_dval && !$3.has_dval) {
+				int i;
+
+				if ($3.ival == 0) {
+					$$.ival = 1;
+				} else if ($3.ival > 0) {
+					long long tmp = $1.ival;
+					$$.ival = 1.0;
+					for (i = 0; i < $3.ival; i++)
+						$$.ival *= tmp;
+				}  else {
+					/* integers, 2^-3, ok, we now have doubles */
+					double tmp;
+					if ($1.ival == 0 && $3.ival == 0) {
+						tmp = 1.0;
+						$$.has_error = 1;
+					} else {
+						double x = (double) $1.ival;
+						double y = (double) $3.ival;
+						tmp = pow(x, y);
+					}
+					$$.ival = (long long) tmp;
+				}
+				$$.dval = pow($1.dval, $3.dval);
+			} else {
+				$$.dval = pow($1.dval, $3.dval);
+				$$.ival = (long long) $$.dval;
+			}
+		}
+	|	NUMBER { $$ = $1; };
+%%
+#include <stdio.h>
+
+/* Urgh.  yacc and lex are kind of horrible.  This is not thread safe, obviously. */
+static int lexer_read_offset = 0;
+static char lexer_input_buffer[1000];
+
+int lexer_input(char* buffer, unsigned int *bytes_read, int bytes_requested)
+{
+	int bytes_left = strlen(lexer_input_buffer) - lexer_read_offset;
+
+	if (bytes_requested > bytes_left )
+		bytes_requested = bytes_left;
+	memcpy(buffer, &lexer_input_buffer[lexer_read_offset], bytes_requested);
+	*bytes_read = bytes_requested;
+	lexer_read_offset += bytes_requested;
+	return 0;
+}
+
+static void setup_to_parse_string(const char *string)
+{
+	unsigned int len;
+
+	len = strlen(string);
+	if (len > sizeof(lexer_input_buffer) - 3)
+		len = sizeof(lexer_input_buffer) - 3;
+
+	strncpy(lexer_input_buffer, string, len);
+	lexer_input_buffer[len] = '\0'; 
+	lexer_input_buffer[len + 1] = '\0';  /* lex/yacc want string double null terminated! */
+	lexer_read_offset = 0;
+}
+
+int evaluate_arithmetic_expression(const char *buffer, long long *ival, double *dval,
+					double implied_units, int is_time)
+{
+	int rc, units_specified = 0, has_error = 0;
+
+	lexer_value_is_time = is_time;
+	setup_to_parse_string(buffer);
+	rc = yyparse(ival, dval, &has_error, &units_specified);
+	yyrestart(NULL);
+	if (rc || has_error) {
+		*ival = 0;
+		*dval = 0;
+		has_error = 1;
+	}
+	if (!units_specified) {
+		*ival = (long long) ((double) *ival * implied_units);
+		*dval = *dval * implied_units;
+	}
+	return has_error;
+}
+
+int yyerror(__attribute__((unused)) long long *result,
+		__attribute__((unused)) double *dresult,
+		__attribute__((unused)) int *has_error,
+		__attribute__((unused)) int *units_specified,
+		__attribute__((unused)) const char *msg)
+{
+	/* We do not need to do anything here. */
+	return 0;
+}
+
diff --git a/exp/test-expression-parser.c b/exp/test-expression-parser.c
new file mode 100644
index 0000000..bf3fb3e
--- /dev/null
+++ b/exp/test-expression-parser.c
@@ -0,0 +1,54 @@
+/*
+ * (C) Copyright 2014, Stephen M. Cameron.
+ *
+ * The license below covers all files distributed with fio unless otherwise
+ * noted in the file itself.
+ *
+ *  This program is free software; you can redistribute it and/or modify
+ *  it under the terms of the GNU General Public License version 2 as
+ *  published by the Free Software Foundation.
+ *
+ *  This program is distributed in the hope that it will be useful,
+ *  but WITHOUT ANY WARRANTY; without even the implied warranty of
+ *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ *  GNU General Public License for more details.
+ *
+ *  You should have received a copy of the GNU General Public License
+ *  along with this program; if not, write to the Free Software
+ *  Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
+ *
+ */
+
+#include <stdio.h>
+#include <string.h>
+
+#include "../y.tab.h"
+
+extern int evaluate_arithmetic_expression(const char *buffer, long long *ival,
+					  double *dval, double implied_units, int is_time);
+ 
+int main(int argc, char *argv[])
+{
+	int rc, bye = 0;
+	long long result;
+	double dresult;
+	char buffer[100];
+
+	do {
+		if (fgets(buffer, 90, stdin) == NULL)
+			break;
+		rc = strlen(buffer);
+		if (rc > 0 && buffer[rc - 1] == '\n')
+			buffer[rc - 1] = '\0';
+		rc = evaluate_arithmetic_expression(buffer, &result, &dresult, 1.0, 0);
+		if (!rc) {
+			printf("%lld (%20.20lf)\n", result, dresult);
+		} else {
+			fprintf(stderr, "Syntax error\n");
+			result = 0;
+			dresult = 0;
+		}
+	} while (!bye);
+	return 0;
+}
+
diff --git a/fifo.h b/fifo.h
index 7491365..4b775b0 100644
--- a/fifo.h
+++ b/fifo.h
@@ -1,3 +1,5 @@
+#ifndef FIO_FIFO_H
+#define FIO_FIFO_H
 /*
  * A simple FIFO implementation.
  *
@@ -18,6 +20,8 @@
  * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
  *
  */
+#include "minmax.h"
+
 struct fifo {
 	unsigned char *buffer;	/* the buffer holding the data */
 	unsigned int size;	/* the size of the allocated buffer */
@@ -40,19 +44,4 @@
 	return fifo->size - fifo->in + fifo->out;
 }
 
-#ifndef min
-#define min(x,y) ({ \
-	typeof(x) _x = (x);	\
-	typeof(y) _y = (y);	\
-	(void) (&_x == &_y);		\
-	_x < _y ? _x : _y; })
-#endif
-
-#ifndef max
-#define max(x,y) ({ \
-	typeof(x) _x = (x);	\
-	typeof(y) _y = (y);	\
-	(void) (&_x == &_y);		\
-	_x > _y ? _x : _y; })
-
 #endif
diff --git a/file.h b/file.h
index add7773..f7a1eae 100644
--- a/file.h
+++ b/file.h
@@ -27,6 +27,8 @@
 	FIO_FILE_size_known	= 1 << 4,	/* size has been set */
 	FIO_FILE_hashed		= 1 << 5,	/* file is on hash */
 	FIO_FILE_partial_mmap	= 1 << 6,	/* can't do full mmap */
+	FIO_FILE_axmap		= 1 << 7,	/* uses axmap */
+	FIO_FILE_lfsr		= 1 << 8,	/* lfsr is used */
 };
 
 enum file_lock_mode {
@@ -77,10 +79,6 @@
 	unsigned int major, minor;
 	int fileno;
 
-	void *mmap_ptr;
-	size_t mmap_sz;
-	off_t mmap_off;
-
 	/*
 	 * size of the file, offset into file, and io size from that offset
 	 */
@@ -88,8 +86,11 @@
 	uint64_t file_offset;
 	uint64_t io_size;
 
-	uint64_t last_pos;
-	uint64_t last_start;
+	/*
+	 * Track last end and last start of IO for a given data direction
+	 */
+	uint64_t last_pos[DDIR_RWDIR_CNT];
+	uint64_t last_start[DDIR_RWDIR_CNT];
 
 	uint64_t first_write;
 	uint64_t last_write;
@@ -108,11 +109,12 @@
 	};
 
 	/*
-	 * block map for random io
+	 * block map or LFSR for random io
 	 */
-	struct axmap *io_axmap;
-
-	struct fio_lfsr lfsr;
+	union {
+		struct axmap *io_axmap;
+		struct fio_lfsr lfsr;
+	};
 
 	/*
 	 * Used for zipf random distribution
@@ -125,6 +127,10 @@
 	struct disk_util *du;
 };
 
+#define FILE_ENG_DATA(f)	((void *) (uintptr_t) (f)->engine_data)
+#define FILE_SET_ENG_DATA(f, data)	\
+	((f)->engine_data = (uintptr_t) (data))
+
 struct file_name {
 	struct flist_head list;
 	char *filename;
@@ -151,6 +157,8 @@
 FILE_FLAG_FNS(size_known);
 FILE_FLAG_FNS(hashed);
 FILE_FLAG_FNS(partial_mmap);
+FILE_FLAG_FNS(axmap);
+FILE_FLAG_FNS(lfsr);
 #undef FILE_FLAG_FNS
 
 /*
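The FILE_ENG_DATA()/FILE_SET_ENG_DATA() helpers added above give IO engines a uniform way to hang private per-file state off f->engine_data. A hypothetical engine-side sketch (my_file_state, my_open_file and my_close_file are made-up names; assumes the usual fio engine headers are included):

/* Hypothetical per-file engine state, stored via the new helpers. */
struct my_file_state {
	int flags;
};

static int my_open_file(struct thread_data *td, struct fio_file *f)
{
	struct my_file_state *s = calloc(1, sizeof(*s));

	if (!s)
		return 1;

	FILE_SET_ENG_DATA(f, s);	/* stash pointer in f->engine_data */
	return generic_open_file(td, f);
}

static int my_close_file(struct thread_data *td, struct fio_file *f)
{
	struct my_file_state *s = FILE_ENG_DATA(f);

	free(s);
	FILE_SET_ENG_DATA(f, NULL);
	return generic_close_file(td, f);
}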
diff --git a/filelock.c b/filelock.c
index b252a97..b113007 100644
--- a/filelock.c
+++ b/filelock.c
@@ -5,6 +5,7 @@
  */
 #include <inttypes.h>
 #include <string.h>
+#include <unistd.h>
 #include <assert.h>
 
 #include "flist.h"
@@ -20,36 +21,99 @@
 	struct flist_head list;
 	unsigned int references;
 };
+
+#define MAX_FILELOCKS	128
 	
-static struct flist_head *filelock_list;
-static struct fio_mutex *filelock_lock;
+static struct filelock_data {
+	struct flist_head list;
+	struct fio_mutex lock;
+
+	struct flist_head free_list;
+	struct fio_filelock ffs[MAX_FILELOCKS];
+} *fld;
+
+static void put_filelock(struct fio_filelock *ff)
+{
+	flist_add(&ff->list, &fld->free_list);
+}
+
+static struct fio_filelock *__get_filelock(void)
+{
+	struct fio_filelock *ff;
+
+	if (flist_empty(&fld->free_list))
+		return NULL;
+
+	ff = flist_first_entry(&fld->free_list, struct fio_filelock, list);
+	flist_del_init(&ff->list);
+	return ff;
+}
+
+static struct fio_filelock *get_filelock(int trylock, int *retry)
+{
+	struct fio_filelock *ff;
+
+	do {
+		ff = __get_filelock();
+		if (ff || trylock)
+			break;
+
+		fio_mutex_up(&fld->lock);
+		usleep(1000);
+		fio_mutex_down(&fld->lock);
+		*retry = 1;
+	} while (1);
+
+	return ff;
+}
 
 int fio_filelock_init(void)
 {
-	filelock_list = smalloc(sizeof(*filelock_list));
-	if (!filelock_list)
+	int i;
+
+	fld = smalloc(sizeof(*fld));
+	if (!fld)
 		return 1;
 
-	INIT_FLIST_HEAD(filelock_list);
-	filelock_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
-	if (!filelock_lock) {
-		sfree(filelock_list);
-		return 1;
+	INIT_FLIST_HEAD(&fld->list);
+	INIT_FLIST_HEAD(&fld->free_list);
+
+	if (__fio_mutex_init(&fld->lock, FIO_MUTEX_UNLOCKED))
+		goto err;
+
+	for (i = 0; i < MAX_FILELOCKS; i++) {
+		struct fio_filelock *ff = &fld->ffs[i];
+
+		if (__fio_mutex_init(&ff->lock, FIO_MUTEX_UNLOCKED))
+			goto err;
+		flist_add_tail(&ff->list, &fld->free_list);
 	}
 
 	return 0;
+err:
+	fio_filelock_exit();
+	return 1;
 }
 
 void fio_filelock_exit(void)
 {
-	if (!filelock_list)
+	if (!fld)
 		return;
 
-	assert(flist_empty(filelock_list));
-	sfree(filelock_list);
-	filelock_list = NULL;
-	fio_mutex_remove(filelock_lock);
-	filelock_lock = NULL;
+	assert(flist_empty(&fld->list));
+	__fio_mutex_remove(&fld->lock);
+
+	while (!flist_empty(&fld->free_list)) {
+		struct fio_filelock *ff;
+
+		ff = flist_first_entry(&fld->free_list, struct fio_filelock, list);
+
+		flist_del_init(&ff->list);
+		__fio_mutex_remove(&ff->lock);
+	}
+
+	sfree(fld);
+	fld = NULL;
 }
 
 static struct fio_filelock *fio_hash_find(uint32_t hash)
@@ -57,7 +121,7 @@
 	struct flist_head *entry;
 	struct fio_filelock *ff;
 
-	flist_for_each(entry, filelock_list) {
+	flist_for_each(entry, &fld->list) {
 		ff = flist_entry(entry, struct fio_filelock, list);
 		if (ff->hash == hash)
 			return ff;
@@ -66,38 +130,68 @@
 	return NULL;
 }
 
-static struct fio_filelock *fio_hash_get(uint32_t hash)
+static struct fio_filelock *fio_hash_get(uint32_t hash, int trylock)
 {
 	struct fio_filelock *ff;
 
 	ff = fio_hash_find(hash);
 	if (!ff) {
-		ff = smalloc(sizeof(*ff));
+		int retry = 0;
+
+		ff = get_filelock(trylock, &retry);
+		if (!ff)
+			return NULL;
+
+		/*
+		 * If we dropped the main lock, re-lookup the hash in case
+		 * someone else added it meanwhile. If it's now there,
+		 * just return that.
+		 */
+		if (retry) {
+			struct fio_filelock *__ff;
+
+			__ff = fio_hash_find(hash);
+			if (__ff) {
+				put_filelock(ff);
+				return __ff;
+			}
+		}
+
 		ff->hash = hash;
-		__fio_mutex_init(&ff->lock, FIO_MUTEX_UNLOCKED);
 		ff->references = 0;
-		flist_add(&ff->list, filelock_list);
+		flist_add(&ff->list, &fld->list);
 	}
 
 	return ff;
 }
 
-int fio_trylock_file(const char *fname)
+static int __fio_lock_file(const char *fname, int trylock)
 {
 	struct fio_filelock *ff;
 	uint32_t hash;
 
 	hash = jhash(fname, strlen(fname), 0);
 
-	fio_mutex_down(filelock_lock);
-	ff = fio_hash_get(hash);
-	ff->references++;
-	fio_mutex_up(filelock_lock);
+	fio_mutex_down(&fld->lock);
+	ff = fio_hash_get(hash, trylock);
+	if (ff)
+		ff->references++;
+	fio_mutex_up(&fld->lock);
+
+	if (!ff) {
+		assert(!trylock);
+		return 1;
+	}
+
+	if (!trylock) {
+		fio_mutex_down(&ff->lock);
+		return 0;
+	}
 
 	if (!fio_mutex_down_trylock(&ff->lock))
 		return 0;
 
-	fio_mutex_down(filelock_lock);
+	fio_mutex_down(&fld->lock);
 
 	/*
 	 * If we raced and the only reference to the lock is us, we can
@@ -108,7 +202,7 @@
 		ff = NULL;
 	}
 
-	fio_mutex_up(filelock_lock);
+	fio_mutex_up(&fld->lock);
 
 	if (ff) {
 		fio_mutex_down(&ff->lock);
@@ -118,19 +212,14 @@
 	return 1;
 }
 
+int fio_trylock_file(const char *fname)
+{
+	return __fio_lock_file(fname, 1);
+}
+
 void fio_lock_file(const char *fname)
 {
-	struct fio_filelock *ff;
-	uint32_t hash;
-
-	hash = jhash(fname, strlen(fname), 0);
-
-	fio_mutex_down(filelock_lock);
-	ff = fio_hash_get(hash);
-	ff->references++;
-	fio_mutex_up(filelock_lock);
-
-	fio_mutex_down(&ff->lock);
+	__fio_lock_file(fname, 0);
 }
 
 void fio_unlock_file(const char *fname)
@@ -140,18 +229,18 @@
 
 	hash = jhash(fname, strlen(fname), 0);
 
-	fio_mutex_down(filelock_lock);
+	fio_mutex_down(&fld->lock);
 
 	ff = fio_hash_find(hash);
 	if (ff) {
-		ff->references--;
+		int refs = --ff->references;
 		fio_mutex_up(&ff->lock);
-		if (!ff->references) {
-			flist_del(&ff->list);
-			sfree(ff);
+		if (!refs) {
+			flist_del_init(&ff->list);
+			put_filelock(ff);
 		}
 	} else
 		log_err("fio: file not found for unlocking\n");
 
-	fio_mutex_up(filelock_lock);
+	fio_mutex_up(&fld->lock);
 }
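With the rework above, lock objects come out of a fixed pool of MAX_FILELOCKS entries guarded by one mutex, but the external API is unchanged: fio_lock_file() blocks, fio_trylock_file() returns 0 only when the lock was obtained, and both pair with fio_unlock_file(). An illustrative caller (do_exclusive_work() is a made-up placeholder; assumes filelock.h is included):

static void update_shared_file(const char *fname)
{
	if (fio_trylock_file(fname)) {
		/* lock is busy; note it, then block until it is ours */
		log_info("fio: waiting for lock on %s\n", fname);
		fio_lock_file(fname);
	}

	do_exclusive_work(fname);	/* placeholder for the serialized work */

	fio_unlock_file(fname);
}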
diff --git a/filesetup.c b/filesetup.c
index ad7fb85..0fb5589 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -59,7 +59,7 @@
 
 	if (unlink_file || new_layout) {
 		dprint(FD_FILE, "layout unlink %s\n", f->file_name);
-		if ((unlink(f->file_name) < 0) && (errno != ENOENT)) {
+		if ((td_io_unlink_file(td, f) < 0) && (errno != ENOENT)) {
 			td_verror(td, errno, "unlink");
 			return 1;
 		}
@@ -172,7 +172,7 @@
 
 	if (td->terminate) {
 		dprint(FD_FILE, "terminate unlink %s\n", f->file_name);
-		unlink(f->file_name);
+		td_io_unlink_file(td, f);
 	} else if (td->o.create_fsync) {
 		if (fsync(f->fd) < 0) {
 			td_verror(td, errno, "fsync");
@@ -261,16 +261,9 @@
 	unsigned long long ret, sized;
 	unsigned long r;
 
-	if (td->o.use_os_rand) {
-		r = os_random_long(&td->file_size_state);
-		sized = td->o.file_size_high - td->o.file_size_low;
-		ret = (unsigned long long) ((double) sized * (r / (OS_RAND_MAX + 1.0)));
-	} else {
-		r = __rand(&td->__file_size_state);
-		sized = td->o.file_size_high - td->o.file_size_low;
-		ret = (unsigned long long) ((double) sized * (r / (FRAND_MAX + 1.0)));
-	}
-
+	r = __rand(&td->file_size_state);
+	sized = td->o.file_size_high - td->o.file_size_low;
+	ret = (unsigned long long) ((double) sized * (r / (FRAND_MAX + 1.0)));
 	ret += td->o.file_size_low;
 	ret -= (ret % td->o.rw_min_bs);
 	return ret;
@@ -390,6 +383,10 @@
 {
 	int ret = 0;
 
+#ifdef CONFIG_ESX
+	return 0;
+#endif
+
 	if (len == -1ULL)
 		len = f->io_size;
 	if (off == -1ULL)
@@ -401,15 +398,11 @@
 	dprint(FD_IO, "invalidate cache %s: %llu/%llu\n", f->file_name, off,
 								len);
 
-	if (f->mmap_ptr) {
-		ret = posix_madvise(f->mmap_ptr, f->mmap_sz, POSIX_MADV_DONTNEED);
-#ifdef FIO_MADV_FREE
-		if (f->filetype == FIO_TYPE_BD)
-			(void) posix_madvise(f->mmap_ptr, f->mmap_sz, FIO_MADV_FREE);
-#endif
-	} else if (f->filetype == FIO_TYPE_FILE) {
+	if (td->io_ops->invalidate)
+		ret = td->io_ops->invalidate(td, f);
+	else if (f->filetype == FIO_TYPE_FILE)
 		ret = posix_fadvise(f->fd, off, len, POSIX_FADV_DONTNEED);
-	} else if (f->filetype == FIO_TYPE_BD) {
+	else if (f->filetype == FIO_TYPE_BD) {
 		ret = blockdev_invalidate_cache(f);
 		if (ret < 0 && errno == EACCES && geteuid()) {
 			if (!root_warn) {
@@ -603,6 +596,7 @@
 		}
 
 		td_verror(td, __e, buf);
+		return 1;
 	}
 
 	if (!from_hash && f->fd != -1) {
@@ -655,6 +649,7 @@
 			if (td->error != ENOENT) {
 				log_err("%s\n", td->verror);
 				err = 1;
+				break;
 			}
 			clear_error(td);
 		}
@@ -750,7 +745,7 @@
 		return f->real_file_size;
 
 	return td->o.start_offset +
-		(td->thread_number - 1) * td->o.offset_increment;
+		td->subjob_number * td->o.offset_increment;
 }
 
 /*
@@ -890,7 +885,7 @@
 		}
 	}
 
-	if (!o->size || o->size > total_size)
+	if (!o->size || (total_size && o->size > total_size))
 		o->size = total_size;
 
 	if (o->size < td_min_bs(td)) {
@@ -990,12 +985,12 @@
 {
 	unsigned int range_size, seed;
 	unsigned long nranges;
-	uint64_t file_size;
+	uint64_t fsize;
 
 	range_size = min(td->o.min_bs[DDIR_READ], td->o.min_bs[DDIR_WRITE]);
-	file_size = min(f->real_file_size, f->io_size);
+	fsize = min(f->real_file_size, f->io_size);
 
-	nranges = (file_size + range_size - 1) / range_size;
+	nranges = (fsize + range_size - 1) / range_size;
 
 	seed = jhash(f->file_name, strlen(f->file_name), 0) * td->thread_number;
 	if (!td->o.rand_repeatable)
@@ -1040,21 +1035,25 @@
 		return 0;
 
 	for_each_file(td, f, i) {
-		uint64_t file_size = min(f->real_file_size, f->io_size);
+		uint64_t fsize = min(f->real_file_size, f->io_size);
 
-		blocks = file_size / (unsigned long long) td->o.rw_min_bs;
+		blocks = fsize / (unsigned long long) td->o.rw_min_bs;
 
 		if (td->o.random_generator == FIO_RAND_GEN_LFSR) {
 			unsigned long seed;
 
 			seed = td->rand_seeds[FIO_RAND_BLOCK_OFF];
 
-			if (!lfsr_init(&f->lfsr, blocks, seed, 0))
+			if (!lfsr_init(&f->lfsr, blocks, seed, 0)) {
+				fio_file_set_lfsr(f);
 				continue;
+			}
 		} else if (!td->o.norandommap) {
 			f->io_axmap = axmap_new(blocks);
-			if (f->io_axmap)
+			if (f->io_axmap) {
+				fio_file_set_axmap(f);
 				continue;
+			}
 		} else if (td->o.norandommap)
 			continue;
 
@@ -1092,6 +1091,11 @@
 	dprint(FD_FILE, "close files\n");
 
 	for_each_file(td, f, i) {
+		if (td->o.unlink && f->filetype == FIO_TYPE_FILE) {
+			dprint(FD_FILE, "free unlink %s\n", f->file_name);
+			td_io_unlink_file(td, f);
+		}
+
 		if (fio_file_open(f))
 			td_io_close_file(td, f);
 
@@ -1099,13 +1103,15 @@
 
 		if (td->o.unlink && f->filetype == FIO_TYPE_FILE) {
 			dprint(FD_FILE, "free unlink %s\n", f->file_name);
-			unlink(f->file_name);
+			td_io_unlink_file(td, f);
 		}
 
 		sfree(f->file_name);
 		f->file_name = NULL;
-		axmap_free(f->io_axmap);
-		f->io_axmap = NULL;
+		if (fio_file_axmap(f)) {
+			axmap_free(f->io_axmap);
+			f->io_axmap = NULL;
+		}
 		sfree(f);
 	}
 
@@ -1522,17 +1528,24 @@
 void free_release_files(struct thread_data *td)
 {
 	close_files(td);
+	td->o.nr_files = 0;
+	td->o.open_files = 0;
 	td->files_index = 0;
 	td->nr_normal_files = 0;
 }
 
 void fio_file_reset(struct thread_data *td, struct fio_file *f)
 {
-	f->last_pos = f->file_offset;
-	f->last_start = -1ULL;
-	if (f->io_axmap)
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		f->last_pos[i] = f->file_offset;
+		f->last_start[i] = -1ULL;
+	}
+
+	if (fio_file_axmap(f))
 		axmap_reset(f->io_axmap);
-	if (td->o.random_generator == FIO_RAND_GEN_LFSR)
+	else if (fio_file_lfsr(f))
 		lfsr_reset(&f->lfsr, td->rand_seeds[FIO_RAND_BLOCK_OFF]);
 }
 
diff --git a/fio.1 b/fio.1
index eeb036e..84d71a0 100644
--- a/fio.1
+++ b/fio.1
@@ -1,4 +1,4 @@
-.TH fio 1 "October 2013" "User Manual"
+.TH fio 1 "December 2014" "User Manual"
 .SH NAME
 fio \- flexible I/O tester
 .SH SYNOPSIS
@@ -20,12 +20,12 @@
 .BI \-\-output \fR=\fPfilename
 Write output to \fIfilename\fR.
 .TP
+.BI \-\-output-format \fR=\fPformat
+Set the reporting format to \fInormal\fR, \fIterse\fR, or \fIjson\fR.
+.TP
 .BI \-\-runtime \fR=\fPruntime
 Limit run time to \fIruntime\fR seconds.
 .TP
-.B \-\-latency\-log
-Generate per-job latency logs.
-.TP
 .B \-\-bandwidth\-log
 Generate per-job bandwidth logs.
 .TP
@@ -75,7 +75,7 @@
 Turn on safety read-only checks, preventing any attempted write.
 .TP
 .BI \-\-section \fR=\fPsec
-Only run section \fIsec\fR from job file. Multiple of these options can be given, adding more sections to run.
+Only run section \fIsec\fR from job file. This option can be used multiple times to add more sections to run.
 .TP
 .BI \-\-alloc\-size \fR=\fPkb
 Set the internal smalloc pool size to \fIkb\fP kilobytes.
@@ -115,7 +115,29 @@
 may override any parameter set in global sections.
 .SH "JOB PARAMETERS"
 .SS Types
-Some parameters may take arguments of a specific type.  The types used are:
+Some parameters may take arguments of a specific type.
+Anywhere a numeric value is required, an arithmetic expression may be used,
+provided it is surrounded by parentheses. Supported operators are:
+.RS
+.RS
+.TP
+.B addition (+)
+.TP
+.B subtraction (-)
+.TP
+.B multiplication (*)
+.TP
+.B division (/)
+.TP
+.B modulus (%)
+.TP
+.B exponentiation (^)
+.RE
+.RE
+.P
+For time values in expressions, units are microseconds by default. This
+differs from time values not in expressions (not enclosed in parentheses).
+The types used are:
 .TP
 .I str
 String: a sequence of alphanumeric characters.
@@ -182,8 +204,8 @@
 device, \\.\PhysicalDrive1 for the second etc. Note: Windows and FreeBSD
 prevent write access to areas of the disk containing in-use data
 (e.g. filesystems). If the wanted filename does need to include a colon, then
-escape that with a '\' character. For instance, if the filename is
-"/dev/dsk/foo@3,0:c", then you would use filename="/dev/dsk/foo@3,0\:c".
+escape that with a '\\' character. For instance, if the filename is
+"/dev/dsk/foo@3,0:c", then you would use filename="/dev/dsk/foo@3,0\\:c".
 .TP
 .BI filename_format \fR=\fPstr
 If sharing multiple files between jobs, it is usually necessary to have
@@ -225,7 +247,7 @@
 No locking. This is the default.
 .TP
 .B exclusive
-Only one thread or process may do IO at the time, excluding all others.
+Only one thread or process may do IO at a time, excluding all others.
 .TP
 .B readwrite
 Read-write locking on the file. Many readers may access the file at the same
@@ -310,7 +332,7 @@
 .BI unified_rw_reporting \fR=\fPbool
 Fio normally reports statistics on a per data direction basis, meaning that
 read, write, and trim are accounted and reported separately. If this option is
-set, the fio will sum the results and report them as "mixed" instead.
+set, fio sums the results and reports them as "mixed" instead.
 .TP
 .BI randrepeat \fR=\fPbool
 Seed the random number generator used for random I/O patterns in a predictable
@@ -325,12 +347,6 @@
 control what sequence of output is being generated. If not set, the random
 sequence depends on the \fBrandrepeat\fR setting.
 .TP
-.BI use_os_rand \fR=\fPbool
-Fio can either use the random generator supplied by the OS to generator random
-offsets, or it can use it's own internal generator (based on Tausworthe).
-Default is to use the internal generator, which is often of better quality and
-faster. Default: false.
-.TP
 .BI fallocate \fR=\fPstr
 Whether pre-allocation is performed when laying down files. Accepted values
 are:
@@ -359,26 +375,28 @@
 .RE
 .TP
 .BI fadvise_hint \fR=\fPbool
-Use of \fBposix_fadvise\fR\|(2) to advise the kernel what I/O patterns
+Use \fBposix_fadvise\fR\|(2) to advise the kernel what I/O patterns
 are likely to be issued. Default: true.
 .TP
 .BI size \fR=\fPint
 Total size of I/O for this job.  \fBfio\fR will run until this many bytes have
-been transferred, unless limited by other options (\fBruntime\fR, for instance).
-Unless \fBnrfiles\fR and \fBfilesize\fR options are given, this amount will be
-divided between the available files for the job. If not set, fio will use the
-full size of the given files or devices. If the files do not exist, size
-must be given. It is also possible to give size as a percentage between 1 and
-100. If size=20% is given, fio will use 20% of the full size of the given
-files or devices.
+been transferred, unless limited by other options (\fBruntime\fR, for instance,
+or increased/decreased by \fBio_size\fR). Unless \fBnrfiles\fR and
+\fBfilesize\fR options are given, this amount will be divided between the
+available files for the job. If not set, fio will use the full size of the
+given files or devices. If the files do not exist, size must be given. It is
+also possible to give size as a percentage between 1 and 100. If size=20% is
+given, fio will use 20% of the full size of the given files or devices.
 .TP
-.BI io_limit \fR=\fPint
+.BI io_size \fR=\fPint "\fR,\fB io_limit \fR=\fPint
 Normally fio operates within the region set by \fBsize\fR, which means that
 the \fBsize\fR option sets both the region and size of IO to be performed.
 Sometimes that is not what you want. With this option, it is possible to
 define just the amount of IO that fio should do. For instance, if \fBsize\fR
 is set to 20G and \fBio_limit\fR is set to 5G, fio will perform IO within
-the first 20G but exit when 5G have been done.
+the first 20G but exit when 5G have been done. The opposite is also
+possible - if \fBsize\fR is set to 20G, and \fBio_size\fR is set to 40G, then
+fio will do 40G of IO within the 0..20G region.
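A hypothetical job fragment matching the first case described above (device name and sizes are purely illustrative):

[io-limited]
filename=/dev/sdX
size=20g
io_size=5g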
 .TP
 .BI fill_device \fR=\fPbool "\fR,\fB fill_fs" \fR=\fPbool
 Sets size to something really large and waits for ENOSPC (no space left on
@@ -443,7 +461,7 @@
 blocksize setting.
 .TP
 .B zero_buffers
-Initialise buffers with all zeros. Default: fill buffers with random data.
+Initialize buffers with all zeros. Default: fill buffers with random data.
 The resulting IO buffers will not be completely zeroed, unless
 \fPscramble_buffers\fR is also turned off.
 .TP
@@ -463,9 +481,12 @@
 .BI buffer_compress_percentage \fR=\fPint
 If this is set, then fio will attempt to provide IO buffer content (on WRITEs)
 that compress to the specified level. Fio does this by providing a mix of
-random data and zeroes. Note that this is per block size unit, for file/disk
-wide compression level that matches this setting, you'll also want to set
-\fBrefill_buffers\fR.
+random data and a fixed pattern. The fixed pattern is either zeroes, or the
+pattern specified by \fBbuffer_pattern\fR. If the pattern option is used, it
+might skew the compression ratio slightly. Note that this is per block size
+unit. If you want a file or disk wide compression level that matches this
+setting, you'll also want to set \fBrefill_buffers\fR, so that the same buffer
+contents are not simply repeated across the whole file.
 .TP
 .BI buffer_compress_chunk \fR=\fPint
 See \fBbuffer_compress_percentage\fR. This setting allows fio to manage how
@@ -475,10 +496,18 @@
 size, fio can alternate random and zeroed data throughout the IO buffer.
 .TP
 .BI buffer_pattern \fR=\fPstr
-If set, fio will fill the io buffers with this pattern. If not set, the contents
-of io buffers is defined by the other options related to buffer contents. The
+If set, fio will fill the IO buffers with this pattern. If not set, the contents
+of IO buffers is defined by the other options related to buffer contents. The
 setting can be any pattern of bytes, and can be prefixed with 0x for hex
-values.
+values. It may also be a string, in which case the string must be wrapped
+with double quotes ("").
+.TP
+.BI dedupe_percentage \fR=\fPint
+If set, fio will generate this percentage of identical buffers when writing.
+These buffers will be naturally dedupable. The contents of the buffers depend
+on what other buffer compression settings have been set. It's possible to have
+the individual buffers be either fully compressible or not compressible at
+all. This option only controls the distribution of unique buffers.
 .TP
 .BI nrfiles \fR=\fPint
 Number of files to use for this job.  Default: 1.
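A sketch of how the buffer-content options above might be combined in a job
file; all values are illustrative only:

[compressible-dedupable]
rw=write
bs=64k
size=1G
; roughly half of each block should compress away
buffer_compress_percentage=50
; roughly 30% of written blocks are identical, hence dedupable
dedupe_percentage=30
; refill buffers on each IO so the ratio also holds file/disk wide
refill_buffers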
@@ -495,13 +524,13 @@
 Choose a file at random.
 .TP
 .B roundrobin
-Round robin over open files (default).
+Round robin over opened files (default).
 .TP
 .B sequential
 Do each file in the set sequentially.
 .RE
 .P
-The number of I/Os to issue before switching a new file can be specified by
+The number of I/Os to issue before switching to a new file can be specified by
 appending `:\fIint\fR' to the service type.
 .RE
 .TP
@@ -602,6 +631,27 @@
 IO engine supporting direct access to Ceph Rados Block Devices (RBD) via librbd 
 without the need to use the kernel rbd driver. This ioengine defines engine specific 
 options.
+.TP
+.B gfapi
+Using the Glusterfs libgfapi sync interface to directly access Glusterfs
+volumes without having to go through FUSE. This ioengine defines engine
+specific options.
+.TP
+.B gfapi_async
+Using the Glusterfs libgfapi async interface to directly access Glusterfs
+volumes without having to go through FUSE. This ioengine defines engine
+specific options.
+.TP
+.B libhdfs
+Read and write through Hadoop (HDFS). The \fBfilename\fR option is used to
+specify the host,port of the HDFS name-node to connect to. This engine
+interprets offsets a little differently. In HDFS, files once created cannot be
+modified, so random writes are not possible. To imitate this, the libhdfs
+engine expects a bunch of small files to be created over HDFS, and the engine
+will randomly pick a file out of those based on the offset generated by the
+fio backend (see the example job file on how to create such files, using the
+rw=write option). Note that you might need to set the necessary environment
+variables to work with hdfs/libhdfs properly.
 .RE
 .P
 .RE
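Going only by the description above, a libhdfs job might be sketched roughly
as follows; the name-node host and port are placeholders and additional
environment setup (not shown) may be required:

[hdfs-populate]
ioengine=libhdfs
; host,port of the HDFS name-node (placeholder values)
filename=namenode.example.com,9000
; create the set of small files the engine expects
rw=write
bs=256k
size=1G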
@@ -609,7 +659,7 @@
 .BI iodepth \fR=\fPint
 Number of I/O units to keep in flight against the file. Note that increasing
 iodepth beyond 1 will not affect synchronous ioengines (except for small
-degress when verify_async is in use). Even async engines my impose OS
+degrees when verify_async is in use). Even async engines may impose OS
 restrictions causing the desired depth not to be achieved.  This may happen on
 Linux when using libaio and not setting \fBdirect\fR=1, since buffered IO is
 not async on that OS. Keep an eye on the IO depth distribution in the
@@ -647,17 +697,20 @@
 .TP
 .BI offset_increment \fR=\fPint
 If this is provided, then the real offset becomes the
-offset + offset_increment * thread_number, where the thread number is a counter
-that starts at 0 and is incremented for each job. This option is useful if
-there are several jobs which are intended to operate on a file in parallel in
-disjoint segments, with even spacing between the starting points.
+offset + offset_increment * thread_number, where the thread number is a
+counter that starts at 0 and is incremented for each sub-job (i.e. when the
+numjobs option is specified). This option is useful if there are several jobs
+which are intended to operate on a file in parallel in disjoint segments, with
+even spacing between the starting points.
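For example, with the hypothetical settings below, the four sub-jobs start at
offsets 0, 1G, 2G and 3G respectively (offset + offset_increment *
thread_number, with thread_number running 0..3); the device path is a
placeholder:

[disjoint-segments]
filename=/dev/sdX
rw=write
numjobs=4
offset=0
offset_increment=1G
size=1G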
 .TP
 .BI number_ios \fR=\fPint
 Fio will normally perform IOs until it has exhausted the size of the region
 set by \fBsize\fR, or if it exhaust the allocated time (or hits an error
 condition). With this setting, the range/size can be set independently of
 the number of IOs to perform. When fio reaches this number, it will exit
-normally and report status.
+normally and report status. Note that this does not extend the amount
+of IO that will be done; it will only stop fio earlier if this condition is
+met before other end-of-job criteria are reached.
 .TP
 .BI fsync \fR=\fPint
 How many I/Os to perform before issuing an \fBfsync\fR\|(2) of dirty data.  If
@@ -901,7 +954,7 @@
 .BI startdelay \fR=\fPirange
 Delay start of job for the specified number of seconds. Supports all time
 suffixes to allow specification of hours, minutes, seconds and
-milliseconds - seconds are the default if a unit is ommited.
+milliseconds - seconds are the default if a unit is omitted.
 Can be given as a range which causes each thread to choose randomly out of the
 range.
 .TP
@@ -1125,6 +1178,17 @@
 .BI experimental_verify \fR=\fPbool
 Enable experimental verification.
 .TP
+.BI verify_state_save \fR=\fPbool
+When a job exits during the write phase of a verify workload, save its
+current state. This allows fio to replay up until that point, if the
+verify state is loaded for the verify read phase.
+.TP
+.BI verify_state_load \fR=\fPbool
+If a verify termination trigger was used, fio stores the current write
+state of each thread. This can be used at verification time so that fio
+knows how far it should verify. Without this information, fio will run
+a full verification pass, according to the settings in the job file used.
+.TP
 .B stonewall "\fR,\fP wait_for_previous"
 Wait for preceding jobs in the job file to exit before starting this one.
 \fBstonewall\fR implies \fBnew_group\fR.
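A rough two-phase sketch of how the verify state options above could be used;
option values are illustrative, and the trigger handling that actually cuts
the write phase short is added elsewhere in this series:

; phase 1: write with verification data, saving state if the job exits early
[write-phase]
rw=write
bs=4k
size=1G
verify=crc32c
verify_state_save=1

; phase 2 (a later run): load the saved state so fio only verifies what was written
[verify-phase]
rw=read
bs=4k
size=1G
verify=crc32c
verify_state_load=1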
@@ -1181,17 +1245,21 @@
 store data of the bandwidth of the jobs in their lifetime. The included
 fio_generate_plots script uses gnuplot to turn these text files into nice
 graphs. See \fBwrite_lat_log\fR for behaviour of given filename. For this
-option, the postfix is _bw.log.
+option, the postfix is _bw.x.log, where x is the index of the job (1..N,
+where N is the number of jobs).
 .TP
 .BI write_lat_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes I/O completion latencies.  If no
-filename is given with this option, the default filename of "jobname_type.log"
-is used. Even if the filename is given, fio will still append the type of log.
+filename is given with this option, the default filename of
+"jobname_type.x.log" is used, where x is the index of the job (1..N, where
+N is the number of jobs). Even if the filename is given, fio will still
+append the type of log.
 .TP
 .BI write_iops_log \fR=\fPstr
 Same as \fBwrite_bw_log\fR, but writes IOPS. If no filename is given with this
-option, the default filename of "jobname_type.log" is used. Even if the
-filename is given, fio will still append the type of log.
+option, the default filename of "jobname_type.x.log" is used, where x is the
+index of the job (1..N, where N is the number of jobs). Even if the filename
+is given, fio will still append the type of log.
 .TP
 .BI log_avg_msec \fR=\fPint
 By default, fio will log an entry in the iops, latency, or bw log for every
@@ -1200,6 +1268,27 @@
 over the specified period of time, reducing the resolution of the log.
 Defaults to 0.
 .TP
+.BI log_offset \fR=\fPbool
+If this is set, the iolog options will include the byte offset for the IO
+entry as well as the other data values.
+.TP
+.BI log_compression \fR=\fPint
+If this is set, fio will compress the IO logs as it goes, to keep the memory
+footprint lower. When a log reaches the specified size, that chunk is removed
+and compressed in the background. Given that IO logs are fairly highly
+compressible, this yields a nice memory savings for longer runs. The downside
+is that the compression will consume some background CPU cycles, so it may
+impact the run. This, however, is also true if the logging ends up consuming
+most of the system memory. So pick your poison. The IO logs are saved
+normally at the end of a run, by decompressing the chunks and storing them
+in the specified log file. This feature depends on the availability of zlib.
+.TP
+.BI log_store_compressed \fR=\fPbool
+If set, and \fBlog_compression\fR is also set, fio will store the log files in
+a compressed format. They can be decompressed with fio, using the
+\fB\-\-inflate-log\fR command line parameter. The files will be stored with a
+\fB\.fz\fR suffix.
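For instance, a job using the options above could keep its bandwidth log
compressed in 10M chunks and store it compressed on disk; per the init.c
changes later in this patch the result would be named foo_bw.1.log.fz, which
can then be expanded with \fB\-\-inflate-log\fR. Names and sizes are
illustrative, and zlib support is required:

[compressed-logging]
rw=randread
bs=4k
size=1G
write_bw_log=foo
log_compression=10M
log_store_compressed=1

fio \-\-inflate-log=foo_bw.1.log.fz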
+.TP
 .BI disable_lat \fR=\fPbool
 Disable measurements of total latency numbers. Useful only for cutting
 back the number of calls to \fBgettimeofday\fR\|(2), as that does impact performance at
@@ -1367,7 +1456,7 @@
 .SS "Ioengine Parameters List"
 Some parameters are only valid when a specific ioengine is in use. These are
 used identically to normal parameters, with the caveat that when used on the
-command line, the must come after the ioengine that defines them is selected.
+command line, they must come after the ioengine that defines them.
 .TP
 .BI (cpu)cpuload \fR=\fPint
 Attempt to use the specified percentage of CPU cycles.
@@ -1392,7 +1481,9 @@
 used and must be omitted unless it is a valid UDP multicast address.
 .TP
 .BI (net,netsplice)port \fR=\fPint
-The TCP or UDP port to bind to or connect to.
+The TCP or UDP port to bind to or connect to. If this is used with
+\fBnumjobs\fR to spawn multiple instances of the same job type, then
+this will be the starting port number since fio will use a range of ports.
 .TP
 .BI (net,netsplice)interface \fR=\fPstr
 The IP address of the network interface used to send or receive UDP multicast
@@ -1438,7 +1529,7 @@
 .TP
 .BI (net, pingpong) \fR=\fPbool
 Normally a network writer will just continue writing data, and a network reader
-will just consume packages. If pingpong=1 is set, a writer will send its normal
+will just consume packets. If pingpong=1 is set, a writer will send its normal
 payload to the reader, then wait for the reader to send the same payload back.
 This allows fio to measure network latencies. The submission and completion
 latencies then measure local time spent sending or receiving, and the
@@ -1446,6 +1537,12 @@
 send back. For UDP multicast traffic pingpong=1 should only be set for a single
 reader when multiple readers are listening to the same address.
 .TP
+.BI (net, window_size) \fR=\fPint
+Set the desired socket buffer size for the connection.
+.TP
+.BI (net, mss) \fR=\fPint
+Set the TCP maximum segment size (TCP_MAXSEG).
+.TP
 .BI (e4defrag,donorname) \fR=\fPstr
 File will be used as a block donor (swap extents between files)
 .TP
@@ -1769,11 +1866,19 @@
 You can connect to multiple clients as well, to do that you could run:
 
 fio \-\-client=server2 \-\-client=server2 <job file(s)>
+
+If the job file is located on the fio server, then you can tell the server
+to load a local file as well. This is done by using \-\-remote-config:
+
+fio \-\-client=server \-\-remote-config /path/to/file.fio
+
+Then the fio server will open this local (to the server) job file instead
+of being passed one from the client.
 .SH AUTHORS
 
 .B fio
 was written by Jens Axboe <jens.axboe@oracle.com>,
-now Jens Axboe <jaxboe@fusionio.com>.
+now Jens Axboe <axboe@fb.com>.
 .br
 This man page was written by Aaron Carroll <aaronc@cse.unsw.edu.au> based
 on documentation by Jens Axboe.
diff --git a/fio.c b/fio.c
index 7e6b06d..9adc29a 100644
--- a/fio.c
+++ b/fio.c
@@ -43,6 +43,8 @@
 	fio_time_init();
 
 	if (nr_clients) {
+		set_genesis_time();
+
 		if (fio_start_all_clients())
 			return 1;
 		return fio_handle_clients(&fio_client_ops);
diff --git a/fio.h b/fio.h
index 4d4af0a..f688084 100644
--- a/fio.h
+++ b/fio.h
@@ -73,6 +73,8 @@
 	TD_F_PROFILE_OPS	= 64,
 	TD_F_COMPRESS		= 128,
 	TD_F_NOIO		= 256,
+	TD_F_COMPRESS_LOG	= 512,
+	TD_F_VSTATE_SAVED	= 1024,
 };
 
 enum {
@@ -88,6 +90,7 @@
 	FIO_RAND_SEQ_RAND_WRITE_OFF,
 	FIO_RAND_SEQ_RAND_TRIM_OFF,
 	FIO_RAND_START_DELAY,
+	FIO_DEDUPE_OFF,
 	FIO_RAND_NR_OFFS,
 };
 
@@ -101,6 +104,7 @@
 	char verror[FIO_VERROR_SIZE];
 	pthread_t thread;
 	unsigned int thread_number;
+	unsigned int subjob_number;
 	unsigned int groupid;
 	struct thread_stat ts;
 
@@ -112,12 +116,21 @@
 	struct io_log *bw_log;
 	struct io_log *iops_log;
 
+	struct tp_data *tp_data;
+
 	uint64_t stat_io_bytes[DDIR_RWDIR_CNT];
 	struct timeval bw_sample_time;
 
 	uint64_t stat_io_blocks[DDIR_RWDIR_CNT];
 	struct timeval iops_sample_time;
 
+	/*
+	 * Tracks the last iodepth number of completed writes, if data
+	 * verification is enabled
+	 */
+	uint64_t *last_write_comp;
+	unsigned int last_write_idx;
+
 	volatile int update_rusage;
 	struct fio_mutex *rusage_sem;
 	struct rusage ru_start;
@@ -132,12 +145,12 @@
 	unsigned int nr_normal_files;
 	union {
 		unsigned int next_file;
-		os_random_state_t next_file_state;
-		struct frand_state __next_file_state;
+		struct frand_state next_file_state;
 	};
 	int error;
 	int sig;
 	int done;
+	int stop_io;
 	pid_t pid;
 	char *orig_buffer;
 	size_t orig_buffer_size;
@@ -155,28 +168,20 @@
 
 	unsigned long rand_seeds[FIO_RAND_NR_OFFS];
 
-	union {
-		os_random_state_t bsrange_state;
-		struct frand_state __bsrange_state;
-	};
-	union {
-		os_random_state_t verify_state;
-		struct frand_state __verify_state;
-	};
-	union {
-		os_random_state_t trim_state;
-		struct frand_state __trim_state;
-	};
-	union {
-		os_random_state_t delay_state;
-		struct frand_state __delay_state;
-	};
+	struct frand_state bsrange_state;
+	struct frand_state verify_state;
+	struct frand_state trim_state;
+	struct frand_state delay_state;
 
 	struct frand_state buf_state;
+	struct frand_state buf_state_prev;
+	struct frand_state dedupe_state;
 
 	unsigned int verify_batch;
 	unsigned int trim_batch;
 
+	struct thread_io_list *vstate;
+
 	int shm_id;
 
 	/*
@@ -230,7 +235,16 @@
 	uint64_t total_io_size;
 	uint64_t fill_device_size;
 
-	unsigned long io_issues[DDIR_RWDIR_CNT];
+	/*
+	 * Issue side
+	 */
+	uint64_t io_issues[DDIR_RWDIR_CNT];
+	uint64_t io_issue_bytes[DDIR_RWDIR_CNT];
+	uint64_t loops;
+
+	/*
+	 * Completions
+	 */
 	uint64_t io_blocks[DDIR_RWDIR_CNT];
 	uint64_t this_io_blocks[DDIR_RWDIR_CNT];
 	uint64_t io_bytes[DDIR_RWDIR_CNT];
@@ -242,15 +256,14 @@
 	/*
 	 * State for random io, a bitmap of blocks done vs not done
 	 */
-	union {
-		os_random_state_t random_state;
-		struct frand_state __random_state;
-	};
+	struct frand_state random_state;
 
 	struct timeval start;	/* start of this loop */
 	struct timeval epoch;	/* time job was started */
 	struct timeval last_issue;
+	long time_offset;
 	struct timeval tv_cache;
+	struct timeval terminate_time;
 	unsigned int tv_cache_nr;
 	unsigned int tv_cache_mask;
 	unsigned int ramp_time_over;
@@ -269,10 +282,7 @@
 	/*
 	 * read/write mixed workload state
 	 */
-	union {
-		os_random_state_t rwmix_state;
-		struct frand_state __rwmix_state;
-	};
+	struct frand_state rwmix_state;
 	unsigned long rwmix_issues;
 	enum fio_ddir rwmix_ddir;
 	unsigned int ddir_seq_nr;
@@ -280,10 +290,7 @@
 	/*
 	 * rand/seq mixed workload state
 	 */
-	union {
-		os_random_state_t seq_rand_state[DDIR_RWDIR_CNT];
-		struct frand_state __seq_rand_state[DDIR_RWDIR_CNT];
-	};
+	struct frand_state seq_rand_state[DDIR_RWDIR_CNT];
 
 	/*
 	 * IO history logs for verification. We use a tree for sorting,
@@ -318,10 +325,7 @@
 	/*
 	 * For generating file sizes
 	 */
-	union {
-		os_random_state_t file_size_state;
-		struct frand_state __file_size_state;
-	};
+	struct frand_state file_size_state;
 
 	/*
 	 * Error counts
@@ -395,10 +399,16 @@
 extern int log_syslog;
 extern int status_interval;
 extern const char fio_version_string[];
+extern int helper_do_stat;
+extern pthread_cond_t helper_cond;
+extern char *trigger_file;
+extern char *trigger_cmd;
+extern char *trigger_remote_cmd;
+extern long long trigger_timeout;
 
 extern struct thread_data *threads;
 
-static inline void fio_ro_check(struct thread_data *td, struct io_u *io_u)
+static inline void fio_ro_check(const struct thread_data *td, struct io_u *io_u)
 {
 	assert(!(io_u->ddir == DDIR_WRITE && !td_write(td)));
 }
@@ -437,7 +447,7 @@
 extern void options_mem_dupe(void *data, struct fio_option *options);
 extern void td_fill_rand_seeds(struct thread_data *);
 extern void add_job_opts(const char **, int);
-extern char *num2str(unsigned long, int, int, int, int);
+extern char *num2str(uint64_t, int, int, int, int);
 extern int ioengine_load(struct thread_data *);
 extern int parse_dryrun(void);
 extern int fio_running_or_pending_io_threads(void);
@@ -483,8 +493,15 @@
 extern int td_bump_runstate(struct thread_data *, int);
 extern void td_restore_runstate(struct thread_data *, int);
 
+/*
+ * Allow 60 seconds for a job to quit on its own, otherwise reap with
+ * a vengeance.
+ */
+#define FIO_REAP_TIMEOUT	60
+
 #define TERMINATE_ALL		(-1)
 extern void fio_terminate_threads(int);
+extern void fio_mark_td_terminate(struct thread_data *);
 
 /*
  * Memory helpers
@@ -588,7 +605,7 @@
 	return min(td->o.min_bs[DDIR_TRIM], min_bs);
 }
 
-static inline int is_power_of_2(unsigned long val)
+static inline int is_power_of_2(uint64_t val)
 {
 	return (val != 0 && ((val & (val - 1)) == 0));
 }
@@ -634,6 +651,9 @@
 	FIO_RAND_DIST_PARETO,
 };
 
+#define FIO_DEF_ZIPF		1.1
+#define FIO_DEF_PARETO		0.2
+
 enum {
 	FIO_RAND_GEN_TAUSWORTHE = 0,
 	FIO_RAND_GEN_LFSR,
@@ -644,4 +664,7 @@
 	FIO_CPUS_SPLIT,
 };
 
+extern void exec_trigger(const char *);
+extern void check_trigger_file(void);
+
 #endif
diff --git a/fio_time.h b/fio_time.h
index c550a55..79f324a 100644
--- a/fio_time.h
+++ b/fio_time.h
@@ -1,15 +1,17 @@
 #ifndef FIO_TIME_H
 #define FIO_TIME_H
 
-extern uint64_t utime_since(struct timeval *, struct timeval *);
-extern uint64_t utime_since_now(struct timeval *);
-extern uint64_t mtime_since(struct timeval *, struct timeval *);
-extern uint64_t mtime_since_now(struct timeval *);
-extern uint64_t time_since_now(struct timeval *);
+struct thread_data;
+extern uint64_t utime_since(const struct timeval *, const struct timeval *);
+extern uint64_t utime_since_now(const struct timeval *);
+extern uint64_t mtime_since(const struct timeval *, const struct timeval *);
+extern uint64_t mtime_since_now(const struct timeval *);
+extern uint64_t time_since_now(const struct timeval *);
+extern uint64_t time_since_genesis(void);
 extern uint64_t mtime_since_genesis(void);
 extern uint64_t utime_since_genesis(void);
-extern void usec_spin(unsigned int);
-extern void usec_sleep(struct thread_data *, unsigned long);
+extern uint64_t usec_spin(unsigned int);
+extern uint64_t usec_sleep(struct thread_data *, unsigned long);
 extern void fill_start_time(struct timeval *);
 extern void set_genesis_time(void);
 extern int ramp_time_over(struct thread_data *);
diff --git a/flist.h b/flist.h
index 8e13041..d453e79 100644
--- a/flist.h
+++ b/flist.h
@@ -140,6 +140,22 @@
 		__flist_splice(list, head, head->next);
 }
 
+static inline void flist_splice_tail(struct flist_head *list,
+				     struct flist_head *head)
+{
+	if (!flist_empty(list))
+		__flist_splice(list, head->prev, head);
+}
+
+static inline void flist_splice_tail_init(struct flist_head *list,
+					  struct flist_head *head)
+{
+	if (!flist_empty(list)) {
+		__flist_splice(list, head->prev, head);
+		INIT_FLIST_HEAD(list);
+	}
+}
+
 static inline void flist_splice_init(struct flist_head *list,
 				    struct flist_head *head)
 {
@@ -158,6 +174,9 @@
 #define flist_entry(ptr, type, member) \
 	container_of(ptr, type, member)
 
+#define flist_first_entry(ptr, type, member) \
+	flist_entry((ptr)->next, type, member)
+
 /**
  * flist_for_each	-	iterate over a list
  * @pos:	the &struct flist_head to use as a loop counter.
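The flist_first_entry() helper added above is what the gclient.c, goptions.c
and graph.c hunks below are converted to use. A minimal usage sketch, assuming
fio's flist.h; struct item and its fields are hypothetical:

#include <stdlib.h>
#include "flist.h"	/* fio's list helpers, including the new flist_first_entry() */

struct item {
	int value;
	struct flist_head list;	/* links this item into an flist */
};

/* Drain and free a list, always taking the first entry */
static void drain(struct flist_head *head)
{
	while (!flist_empty(head)) {
		struct item *i = flist_first_entry(head, struct item, list);

		flist_del(&i->list);
		free(i);
	}
}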
diff --git a/gclient.c b/gclient.c
index d236f86..42bc761 100644
--- a/gclient.c
+++ b/gclient.c
@@ -694,7 +694,7 @@
 
 static void gfio_client_iolog(struct fio_client *client, struct cmd_iolog_pdu *pdu)
 {
-	printf("got iolog: name=%s, type=%u, entries=%u\n", pdu->name, pdu->log_type, pdu->nr_samples);
+	printf("got iolog: name=%s, type=%u, entries=%lu\n", pdu->name, pdu->log_type, (unsigned long) pdu->nr_samples);
 	free(pdu);
 }
 
diff --git a/gettime-thread.c b/gettime-thread.c
index 3d49034..2dc976f 100644
--- a/gettime-thread.c
+++ b/gettime-thread.c
@@ -8,11 +8,16 @@
 
 struct timeval *fio_tv = NULL;
 int fio_gtod_offload = 0;
-int fio_gtod_cpu = -1;
 static pthread_t gtod_thread;
+#ifdef FIO_HAVE_CPU_AFFINITY
+static os_cpu_mask_t fio_gtod_cpumask;
+#endif
 
 void fio_gtod_init(void)
 {
+	if (fio_tv)
+		return;
+
 	fio_tv = smalloc(sizeof(struct timeval));
 	if (!fio_tv)
 		log_err("fio: smalloc pool exhausted\n");
@@ -20,14 +25,27 @@
 
 static void fio_gtod_update(void)
 {
-	if (fio_tv)
-		gettimeofday(fio_tv, NULL);
+	if (fio_tv) {
+		struct timeval __tv;
+
+		gettimeofday(&__tv, NULL);
+		fio_tv->tv_sec = __tv.tv_sec;
+		write_barrier();
+		fio_tv->tv_usec = __tv.tv_usec;
+		write_barrier();
+	}
 }
 
+struct gtod_cpu_data {
+	struct fio_mutex *mutex;
+	unsigned int cpu;
+};
+
 static void *gtod_thread_main(void *data)
 {
 	struct fio_mutex *mutex = data;
 
+	fio_setaffinity(gettid(), fio_gtod_cpumask);
 	fio_mutex_up(mutex);
 
 	/*
@@ -56,7 +74,7 @@
 
 	pthread_attr_init(&attr);
 	pthread_attr_setstacksize(&attr, PTHREAD_STACK_MIN);
-	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, NULL);
+	ret = pthread_create(&gtod_thread, &attr, gtod_thread_main, mutex);
 	pthread_attr_destroy(&attr);
 	if (ret) {
 		log_err("Can't create gtod thread: %s\n", strerror(ret));
@@ -77,4 +95,9 @@
 	return ret;
 }
 
-
+void fio_gtod_set_cpu(unsigned int cpu)
+{
+#ifdef FIO_HAVE_CPU_AFFINITY
+	fio_cpu_set(&fio_gtod_cpumask, cpu);
+#endif
+}
diff --git a/gettime.c b/gettime.c
index fa750ec..d1c8eb9 100644
--- a/gettime.c
+++ b/gettime.c
@@ -16,19 +16,25 @@
 #if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC)
 static unsigned long cycles_per_usec;
 static unsigned long inv_cycles_per_usec;
+static uint64_t max_cycles_for_mult;
+#endif
+#ifdef ARCH_CPU_CLOCK_WRAPS
+static unsigned long long cycles_start, cycles_wrap;
 #endif
 int tsc_reliable = 0;
 
 struct tv_valid {
-	struct timeval last_tv;
 	uint64_t last_cycles;
 	int last_tv_valid;
+	int warned;
 };
+#ifdef ARCH_HAVE_CPU_CLOCK
 #ifdef CONFIG_TLS_THREAD
 static __thread struct tv_valid static_tv_valid;
 #else
 static pthread_key_t tv_tls_key;
 #endif
+#endif
 
 enum fio_cs fio_clock_source = FIO_PREFERRED_CLOCK_SOURCE;
 int fio_clock_source_set = 0;
@@ -64,7 +70,7 @@
 	return NULL;
 }
 
-static struct gtod_log *find_log(void *caller)
+static void inc_caller(void *caller)
 {
 	struct gtod_log *log = find_hash(caller);
 
@@ -80,16 +86,13 @@
 		flist_add_tail(&log->list, &hash[h]);
 	}
 
-	return log;
+	log->calls++;
 }
 
 static void gtod_log_caller(void *caller)
 {
-	if (gtod_inited) {
-		struct gtod_log *log = find_log(caller);
-
-		log->calls++;
-	}
+	if (gtod_inited)
+		inc_caller(caller);
 }
 
 static void fio_exit fio_dump_gtod(void)
@@ -136,16 +139,8 @@
 }
 #endif
 
-static void *__fio_gettime(struct timeval *tp)
+static void __fio_gettime(struct timeval *tp)
 {
-	struct tv_valid *tv;
-
-#ifdef CONFIG_TLS_THREAD
-	tv = &static_tv_valid;
-#else
-	tv = pthread_getspecific(tv_tls_key);
-#endif
-
 	switch (fio_clock_source) {
 #ifdef CONFIG_GETTIMEOFDAY
 	case CS_GTOD:
@@ -169,18 +164,34 @@
 #ifdef ARCH_HAVE_CPU_CLOCK
 	case CS_CPUCLOCK: {
 		uint64_t usecs, t;
+		struct tv_valid *tv;
+
+#ifdef CONFIG_TLS_THREAD
+		tv = &static_tv_valid;
+#else
+		tv = pthread_getspecific(tv_tls_key);
+#endif
 
 		t = get_cpu_clock();
-		if (tv && t < tv->last_cycles) {
-			dprint(FD_TIME, "CPU clock going back in time\n");
-			t = tv->last_cycles;
-		} else if (tv)
-			tv->last_cycles = t;
+#ifdef ARCH_CPU_CLOCK_WRAPS
+		if (t < cycles_start && !cycles_wrap)
+			cycles_wrap = 1;
+		else if (cycles_wrap && t >= cycles_start && !tv->warned) {
+			log_err("fio: double CPU clock wrap\n");
+			tv->warned = 1;
+		}
 
+		t -= cycles_start;
+#endif
+		tv->last_cycles = t;
+		tv->last_tv_valid = 1;
 #ifdef ARCH_CPU_CLOCK_CYCLES_PER_USEC
 		usecs = t / ARCH_CPU_CLOCK_CYCLES_PER_USEC;
 #else
-		usecs = (t * inv_cycles_per_usec) / 16777216UL;
+		if (t < max_cycles_for_mult)
+			usecs = (t * inv_cycles_per_usec) / 16777216UL;
+		else
+			usecs = t / cycles_per_usec;
 #endif
 		tp->tv_sec = usecs / 1000000;
 		tp->tv_usec = usecs % 1000000;
@@ -191,8 +202,6 @@
 		log_err("fio: invalid clock source %d\n", fio_clock_source);
 		break;
 	}
-
-	return tv;
 }
 
 #ifdef FIO_DEBUG_TIME
@@ -201,36 +210,16 @@
 void fio_gettime(struct timeval *tp, void fio_unused *caller)
 #endif
 {
-	struct tv_valid *tv;
-
 #ifdef FIO_DEBUG_TIME
 	if (!caller)
 		caller = __builtin_return_address(0);
 
 	gtod_log_caller(caller);
 #endif
-	if (fio_unlikely(fio_tv)) {
-		memcpy(tp, fio_tv, sizeof(*tp));
+	if (fio_unlikely(fio_gettime_offload(tp)))
 		return;
-	}
 
-	tv = __fio_gettime(tp);
-
-	/*
-	 * If Linux is using the tsc clock on non-synced processors,
-	 * sometimes time can appear to drift backwards. Fix that up.
-	 */
-	if (tv) {
-		if (tv->last_tv_valid) {
-			if (tp->tv_sec < tv->last_tv.tv_sec)
-				tp->tv_sec = tv->last_tv.tv_sec;
-			else if (tv->last_tv.tv_sec == tp->tv_sec &&
-				 tp->tv_usec < tv->last_tv.tv_usec)
-				tp->tv_usec = tv->last_tv.tv_usec;
-		}
-		tv->last_tv_valid = 1;
-		memcpy(&tv->last_tv, tp, sizeof(*tp));
-	}
+	__fio_gettime(tp);
 }
 
 #if defined(ARCH_HAVE_CPU_CLOCK) && !defined(ARCH_CPU_CLOCK_CYCLES_PER_USEC)
@@ -269,7 +258,7 @@
 static int calibrate_cpu_clock(void)
 {
 	double delta, mean, S;
-	uint64_t avg, cycles[NR_TIME_ITERS];
+	uint64_t minc, maxc, avg, cycles[NR_TIME_ITERS];
 	int i, samples;
 
 	cycles[0] = get_cycles_per_usec();
@@ -292,10 +281,14 @@
 
 	S = sqrt(S / (NR_TIME_ITERS - 1.0));
 
-	samples = avg = 0;
+	minc = -1ULL;
+	maxc = samples = avg = 0;
 	for (i = 0; i < NR_TIME_ITERS; i++) {
 		double this = cycles[i];
 
+		minc = min(cycles[i], minc);
+		maxc = max(cycles[i], maxc);
+
 		if ((fmax(this, mean) - fmin(this, mean)) > S)
 			continue;
 		samples++;
@@ -311,12 +304,21 @@
 
 	avg /= samples;
 	avg = (avg + 5) / 10;
+	minc /= 10;
+	maxc /= 10;
 	dprint(FD_TIME, "avg: %llu\n", (unsigned long long) avg);
-	dprint(FD_TIME, "mean=%f, S=%f\n", mean, S);
+	dprint(FD_TIME, "min=%llu, max=%llu, mean=%f, S=%f\n",
+			(unsigned long long) minc,
+			(unsigned long long) maxc, mean, S);
 
 	cycles_per_usec = avg;
 	inv_cycles_per_usec = 16777216UL / cycles_per_usec;
+	max_cycles_for_mult = ~0ULL / inv_cycles_per_usec;
 	dprint(FD_TIME, "inv_cycles_per_usec=%lu\n", inv_cycles_per_usec);
+#ifdef ARCH_CPU_CLOCK_WRAPS
+	cycles_start = get_cpu_clock();
+	dprint(FD_TIME, "cycles_start=%llu\n", cycles_start);
+#endif
 	return 0;
 }
 #else
@@ -336,8 +338,10 @@
 	struct tv_valid *t;
 
 	t = calloc(1, sizeof(*t));
-	if (pthread_setspecific(tv_tls_key, t))
+	if (pthread_setspecific(tv_tls_key, t)) {
 		log_err("fio: can't set TLS key\n");
+		assert(0);
+	}
 }
 
 static void kill_tv_tls_key(void *data)
@@ -371,13 +375,13 @@
 	 * runs at a constant rate and is synced across CPU cores.
 	 */
 	if (tsc_reliable) {
-		if (!fio_clock_source_set)
+		if (!fio_clock_source_set && !fio_monotonic_clocktest(0))
 			fio_clock_source = CS_CPUCLOCK;
 	} else if (fio_clock_source == CS_CPUCLOCK)
 		log_info("fio: clocksource=cpu may not be reliable\n");
 }
 
-uint64_t utime_since(struct timeval *s, struct timeval *e)
+uint64_t utime_since(const struct timeval *s, const struct timeval *e)
 {
 	long sec, usec;
 	uint64_t ret;
@@ -400,7 +404,7 @@
 	return ret;
 }
 
-uint64_t utime_since_now(struct timeval *s)
+uint64_t utime_since_now(const struct timeval *s)
 {
 	struct timeval t;
 
@@ -408,7 +412,7 @@
 	return utime_since(s, &t);
 }
 
-uint64_t mtime_since(struct timeval *s, struct timeval *e)
+uint64_t mtime_since(const struct timeval *s, const struct timeval *e)
 {
 	long sec, usec, ret;
 
@@ -429,7 +433,7 @@
 	return ret;
 }
 
-uint64_t mtime_since_now(struct timeval *s)
+uint64_t mtime_since_now(const struct timeval *s)
 {
 	struct timeval t;
 	void *p = __builtin_return_address(0);
@@ -438,7 +442,7 @@
 	return mtime_since(s, &t);
 }
 
-uint64_t time_since_now(struct timeval *s)
+uint64_t time_since_now(const struct timeval *s)
 {
 	return mtime_since_now(s) / 1000;
 }
@@ -446,7 +450,8 @@
 #if defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK)  && \
     defined(CONFIG_SFAA)
 
-#define CLOCK_ENTRIES	100000
+#define CLOCK_ENTRIES_DEBUG	100000
+#define CLOCK_ENTRIES_TEST	10000
 
 struct clock_entry {
 	uint32_t seq;
@@ -457,8 +462,10 @@
 struct clock_thread {
 	pthread_t thread;
 	int cpu;
+	int debug;
 	pthread_mutex_t lock;
 	pthread_mutex_t started;
+	unsigned long nr_entries;
 	uint32_t *seq;
 	struct clock_entry *entries;
 };
@@ -476,12 +483,20 @@
 	uint32_t last_seq;
 	int i;
 
-	memset(&cpu_mask, 0, sizeof(cpu_mask));
+	if (fio_cpuset_init(&cpu_mask)) {
+		int __err = errno;
+
+		log_err("clock cpuset init failed: %s\n", strerror(__err));
+		goto err_out;
+	}
+
 	fio_cpu_set(&cpu_mask, t->cpu);
 
 	if (fio_setaffinity(gettid(), cpu_mask) == -1) {
-		log_err("clock setaffinity failed\n");
-		return (void *) 1;
+		int __err = errno;
+
+		log_err("clock setaffinity failed: %s\n", strerror(__err));
+		goto err;
 	}
 
 	pthread_mutex_lock(&t->lock);
@@ -489,7 +504,7 @@
 
 	last_seq = 0;
 	c = &t->entries[0];
-	for (i = 0; i < CLOCK_ENTRIES; i++, c++) {
+	for (i = 0; i < t->nr_entries; i++, c++) {
 		uint32_t seq;
 		uint64_t tsc;
 
@@ -505,17 +520,26 @@
 		c->tsc = tsc;
 	}
 
-	log_info("cs: cpu%3d: %llu clocks seen\n", t->cpu,
-		(unsigned long long) t->entries[i - 1].tsc - t->entries[0].tsc);
+	if (t->debug) {
+		unsigned long long clocks;
+
+		clocks = t->entries[i - 1].tsc - t->entries[0].tsc;
+		log_info("cs: cpu%3d: %llu clocks seen\n", t->cpu, clocks);
+	}
 
 	/*
 	 * The most common platform clock breakage is returning zero
 	 * indefinitely. Check for that and return failure.
 	 */
 	if (!t->entries[i - 1].tsc && !t->entries[0].tsc)
-		return (void *) 1;
+		goto err;
 
+	fio_cpuset_exit(&cpu_mask);
 	return NULL;
+err:
+	fio_cpuset_exit(&cpu_mask);
+err_out:
+	return (void *) 1;
 }
 
 static int clock_cmp(const void *p1, const void *p2)
@@ -529,34 +553,49 @@
 	return c1->seq - c2->seq;
 }
 
-int fio_monotonic_clocktest(void)
+int fio_monotonic_clocktest(int debug)
 {
-	struct clock_thread *threads;
+	struct clock_thread *cthreads;
 	unsigned int nr_cpus = cpus_online();
 	struct clock_entry *entries;
-	unsigned long tentries, failed = 0;
+	unsigned long nr_entries, tentries, failed = 0;
 	struct clock_entry *prev, *this;
 	uint32_t seq = 0;
 	unsigned int i;
 
-	log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no");
+	if (debug) {
+		log_info("cs: reliable_tsc: %s\n", tsc_reliable ? "yes" : "no");
 
-	fio_debug |= 1U << FD_TIME;
+#ifdef FIO_INC_DEBUG
+		fio_debug |= 1U << FD_TIME;
+#endif
+		nr_entries = CLOCK_ENTRIES_DEBUG;
+	} else
+		nr_entries = CLOCK_ENTRIES_TEST;
+
 	calibrate_cpu_clock();
-	fio_debug &= ~(1U << FD_TIME);
 
-	threads = malloc(nr_cpus * sizeof(struct clock_thread));
-	tentries = CLOCK_ENTRIES * nr_cpus;
+	if (debug) {
+#ifdef FIO_INC_DEBUG
+		fio_debug &= ~(1U << FD_TIME);
+#endif
+	}
+
+	cthreads = malloc(nr_cpus * sizeof(struct clock_thread));
+	tentries = nr_entries * nr_cpus;
 	entries = malloc(tentries * sizeof(struct clock_entry));
 
-	log_info("cs: Testing %u CPUs\n", nr_cpus);
+	if (debug)
+		log_info("cs: Testing %u CPUs\n", nr_cpus);
 
 	for (i = 0; i < nr_cpus; i++) {
-		struct clock_thread *t = &threads[i];
+		struct clock_thread *t = &cthreads[i];
 
 		t->cpu = i;
+		t->debug = debug;
 		t->seq = &seq;
-		t->entries = &entries[i * CLOCK_ENTRIES];
+		t->nr_entries = nr_entries;
+		t->entries = &entries[i * nr_entries];
 		pthread_mutex_init(&t->lock, NULL);
 		pthread_mutex_init(&t->started, NULL);
 		pthread_mutex_lock(&t->lock);
@@ -568,29 +607,30 @@
 	}
 
 	for (i = 0; i < nr_cpus; i++) {
-		struct clock_thread *t = &threads[i];
+		struct clock_thread *t = &cthreads[i];
 
 		pthread_mutex_lock(&t->started);
 	}
 
 	for (i = 0; i < nr_cpus; i++) {
-		struct clock_thread *t = &threads[i];
+		struct clock_thread *t = &cthreads[i];
 
 		pthread_mutex_unlock(&t->lock);
 	}
 
 	for (i = 0; i < nr_cpus; i++) {
-		struct clock_thread *t = &threads[i];
+		struct clock_thread *t = &cthreads[i];
 		void *ret;
 
 		pthread_join(t->thread, &ret);
 		if (ret)
 			failed++;
 	}
-	free(threads);
+	free(cthreads);
 
 	if (failed) {
-		log_err("Clocksource test: %lu threads failed\n", failed);
+		if (debug)
+			log_err("Clocksource test: %lu threads failed\n", failed);
 		goto err;
 	}
 
@@ -607,6 +647,11 @@
 		if (prev->tsc > this->tsc) {
 			uint64_t diff = prev->tsc - this->tsc;
 
+			if (!debug) {
+				failed++;
+				break;
+			}
+
 			log_info("cs: CPU clock mismatch (diff=%llu):\n",
 						(unsigned long long) diff);
 			log_info("\t CPU%3u: TSC=%llu, SEQ=%u\n", prev->cpu, (unsigned long long) prev->tsc, prev->seq);
@@ -617,11 +662,12 @@
 		prev = this;
 	}
 
-	if (failed)
-		log_info("cs: Failed: %lu\n", failed);
-	else
-		log_info("cs: Pass!\n");
-
+	if (debug) {
+		if (failed)
+			log_info("cs: Failed: %lu\n", failed);
+		else
+			log_info("cs: Pass!\n");
+	}
 err:
 	free(entries);
 	return !!failed;
@@ -629,10 +675,11 @@
 
 #else /* defined(FIO_HAVE_CPU_AFFINITY) && defined(ARCH_HAVE_CPU_CLOCK) */
 
-int fio_monotonic_clocktest(void)
+int fio_monotonic_clocktest(int debug)
 {
-	log_info("cs: current platform does not support CPU clocks\n");
-	return 0;
+	if (debug)
+		log_info("cs: current platform does not support CPU clocks\n");
+	return 1;
 }
 
 #endif
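To make the fixed-point conversion above concrete, assume (purely for
illustration) a calibrated cycles_per_usec of 3000, i.e. a 3 GHz TSC:

  inv_cycles_per_usec = 16777216 / 3000        = 5592
  usecs               = (t * 5592) / 16777216  (cheap multiply and divide by 2^24,
                                                but only while t * 5592 fits in 64 bits)
  max_cycles_for_mult = ~0ULL / 5592           = roughly 3.3e15 cycles, about 12 days
                                                of uptime at 3 GHz

Once t reaches max_cycles_for_mult, the new fallback path simply divides:
usecs = t / cycles_per_usec.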
diff --git a/gettime.h b/gettime.h
index f0ad20c..86d55bd 100644
--- a/gettime.h
+++ b/gettime.h
@@ -1,6 +1,8 @@
 #ifndef FIO_GETTIME_H
 #define FIO_GETTIME_H
 
+#include "arch/arch.h"
+
 /*
  * Clock sources
  */
@@ -15,9 +17,27 @@
 extern void fio_gtod_init(void);
 extern void fio_clock_init(void);
 extern int fio_start_gtod_thread(void);
-extern int fio_monotonic_clocktest(void);
+extern int fio_monotonic_clocktest(int debug);
 extern void fio_local_clock_init(int);
 
 extern struct timeval *fio_tv;
 
+static inline int fio_gettime_offload(struct timeval *tv)
+{
+	time_t last_sec;
+
+	if (!fio_tv)
+		return 0;
+
+	do {
+		read_barrier();
+		last_sec = tv->tv_sec = fio_tv->tv_sec;
+		tv->tv_usec = fio_tv->tv_usec;
+	} while (fio_tv->tv_sec != last_sec);
+
+	return 1;
+}
+
+extern void fio_gtod_set_cpu(unsigned int cpu);
+
 #endif
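Taken together with the fio_gtod_update() change in gettime-thread.c earlier
in this patch, the offload path above amounts to a small publish/re-check
protocol on the shared timeval. A condensed sketch, not the exact fio code,
using fio's write_barrier()/read_barrier() macros and a 'shared' pointer
standing in for the global fio_tv:

/* writer (gtod thread): publish tv_sec before tv_usec, separated by barriers */
static void publish_time(struct timeval *shared, const struct timeval *now)
{
	shared->tv_sec = now->tv_sec;
	write_barrier();
	shared->tv_usec = now->tv_usec;
	write_barrier();
}

/* reader: copy both fields, retry if tv_sec moved underneath us */
static void read_time(const struct timeval *shared, struct timeval *tv)
{
	time_t last_sec;

	do {
		read_barrier();
		last_sec = tv->tv_sec = shared->tv_sec;
		tv->tv_usec = shared->tv_usec;
	} while (shared->tv_sec != last_sec);
}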
diff --git a/gfio.c b/gfio.c
index 65302e6..42d536e 100644
--- a/gfio.c
+++ b/gfio.c
@@ -444,12 +444,12 @@
 	while (!flist_empty(&gc->o_list)) {
 		struct gfio_client_options *gco;
 
-		gco = flist_entry(gc->o_list.next, struct gfio_client_options, list);
+		gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list);
 		flist_del(&gco->list);
 		free(gco);
 	}
 
-	ret = fio_client_send_ini(gc->client, ge->job_file);
+	ret = fio_client_send_ini(gc->client, ge->job_file, 0);
 	if (!ret)
 		return 0;
 
@@ -1687,7 +1687,9 @@
 	gtk_init(argc, argv);
 	settings = gtk_settings_get_default();
 	gtk_settings_set_long_property(settings, "gtk_tooltip_timeout", 10, "gfio setting");
+#if !GLIB_CHECK_VERSION(2, 36, 0)
 	g_type_init();
+#endif
 	gdk_color_parse("#fffff4", &gfio_color_lightyellow);
 	gdk_color_parse("white", &gfio_color_white);
 
diff --git a/goptions.c b/goptions.c
index 5b5c89e..c01b6cc 100644
--- a/goptions.c
+++ b/goptions.c
@@ -1433,7 +1433,7 @@
 		goto done;
 
 	while (!flist_empty(&gjv->changed_list)) {
-		gopt = flist_entry(gjv->changed_list.next, struct gopt, changed_list);
+		gopt = flist_first_entry(&gjv->changed_list, struct gopt, changed_list);
 		flist_del_init(&gopt->changed_list);
 	}
 
@@ -1577,7 +1577,7 @@
 
 	gjv = calloc(1, sizeof(*gjv));
 	INIT_FLIST_HEAD(&gjv->changed_list);
-	gco = flist_entry(gc->o_list.next, struct gfio_client_options, list);
+	gco = flist_first_entry(&gc->o_list, struct gfio_client_options, list);
 	gjv->o = &gco->o;
 	gjv->dialog = dialog;
 	gjv->client = gc;
diff --git a/graph.c b/graph.c
index 5c865dc..c45954c 100644
--- a/graph.c
+++ b/graph.c
@@ -687,7 +687,7 @@
 	 */
 	while (!(v->flags & GV_F_ON_PRIO)) {
 		assert(!flist_empty(&v->alias));
-		v = flist_entry(v->alias.next, struct graph_value, alias);
+		v = flist_first_entry(&v->alias, struct graph_value, alias);
 	}
 
 	prio_tree_remove(&l->prio_tree, &v->node);
@@ -698,7 +698,7 @@
 	while (!flist_empty(&v->alias)) {
 		struct graph_value *a;
 
-		a = flist_entry(v->alias.next, struct graph_value, alias);
+		a = flist_first_entry(&v->alias, struct graph_value, alias);
 		flist_del_init(&a->alias);
 
 		__graph_value_drop(l, a);
@@ -773,7 +773,7 @@
 			to_drop = 2;
 
 		while (to_drop-- && !flist_empty(&i->value_list)) {
-			x = flist_entry(i->value_list.next, struct graph_value, list);
+			x = flist_first_entry(&i->value_list, struct graph_value, list);
 			graph_value_drop(i, x);
 
 			/*
@@ -836,7 +836,7 @@
 	struct graph_value *i;
 
 	while (!flist_empty(&l->value_list)) {
-		i = flist_entry(l->value_list.next, struct graph_value, list);
+		i = flist_first_entry(&l->value_list, struct graph_value, list);
 		graph_value_drop(l, i);
 	}
 }
@@ -846,7 +846,7 @@
 	struct graph_label *i;
 
 	while (!flist_empty(&g->label_list)) {
-		i = flist_entry(g->label_list.next, struct graph_label, list);
+		i = flist_first_entry(&g->label_list, struct graph_label, list);
 		flist_del(&i->list);
 		graph_free_values(i);
 		free(i);
@@ -1010,7 +1010,7 @@
 					}
 				}
 				if (!flist_empty(&v->alias))
-					v = flist_entry(v->alias.next, struct graph_value, alias);
+					v = flist_first_entry(&v->alias, struct graph_value, alias);
 			} while (v != rootv);
 		} while ((n = prio_tree_next(&iter)) != NULL);
 
diff --git a/idletime.c b/idletime.c
index a366d2b..db272fe 100644
--- a/idletime.c
+++ b/idletime.c
@@ -43,16 +43,26 @@
 	return tunit / CALIBRATE_SCALE;
 }
 
+static void free_cpu_affinity(struct idle_prof_thread *ipt)
+{
+#if defined(FIO_HAVE_CPU_AFFINITY)
+	fio_cpuset_exit(&ipt->cpu_mask);
+#endif
+}
+
 static int set_cpu_affinity(struct idle_prof_thread *ipt)
 {
 #if defined(FIO_HAVE_CPU_AFFINITY)
-	os_cpu_mask_t cpu_mask;
+	if (fio_cpuset_init(&ipt->cpu_mask)) {
+		log_err("fio: cpuset init failed\n");
+		return -1;
+	}
 
-	memset(&cpu_mask, 0, sizeof(cpu_mask));
-	fio_cpu_set(&cpu_mask, ipt->cpu);
+	fio_cpu_set(&ipt->cpu_mask, ipt->cpu);
 
-	if (fio_setaffinity(gettid(), cpu_mask)) {
+	if (fio_setaffinity(gettid(), ipt->cpu_mask)) {
 		log_err("fio: fio_setaffinity failed\n");
+		fio_cpuset_exit(&ipt->cpu_mask);
 		return -1;
 	}
 
@@ -98,7 +108,7 @@
 	if (retval == -1) {
 		ipt->state = TD_EXITED;
 		pthread_mutex_unlock(&ipt->init_lock);
-		return NULL;
+		goto do_exit;
 	}
 
 	ipt->state = TD_INITIALIZED;
@@ -113,13 +123,13 @@
 	/* exit if other threads failed to initialize */
 	if (ipc.status == IDLE_PROF_STATUS_ABORT) {
 		pthread_mutex_unlock(&ipt->start_lock);
-		return NULL;
+		goto do_exit;
 	}
 
 	/* exit if we are doing calibration only */
 	if (ipc.status == IDLE_PROF_STATUS_CALI_STOP) {
 		pthread_mutex_unlock(&ipt->start_lock);
-		return NULL;
+		goto do_exit;
 	}
 
 	fio_gettime(&ipt->tps, NULL);
@@ -143,6 +153,8 @@
 	ipt->state = TD_EXITED;
 	pthread_mutex_unlock(&ipt->start_lock);
 
+do_exit:
+	free_cpu_affinity(ipt);
 	return NULL;
 }
 
diff --git a/idletime.h b/idletime.h
index 819da25..bd6dcef 100644
--- a/idletime.h
+++ b/idletime.h
@@ -34,6 +34,8 @@
 	pthread_cond_t  cond;
 	pthread_mutex_t init_lock;
 	pthread_mutex_t start_lock;
+
+	os_cpu_mask_t cpu_mask;
 };
 
 struct idle_prof_common {
diff --git a/init.c b/init.c
index 4c5a8dd..7aedf2b 100644
--- a/init.c
+++ b/init.c
@@ -64,7 +64,10 @@
 int read_only = 0;
 int status_interval = 0;
 
-static int write_lat_log;
+char *trigger_file = NULL;
+long long trigger_timeout = 0;
+char *trigger_cmd = NULL;
+char *trigger_remote_cmd = NULL;
 
 static int prev_group_jobs;
 
@@ -172,6 +175,13 @@
 		.has_arg	= required_argument,
 		.val		= 'x' | FIO_CLIENT_FLAG,
 	},
+#ifdef CONFIG_ZLIB
+	{
+		.name		= (char *) "inflate-log",
+		.has_arg	= required_argument,
+		.val		= 'X' | FIO_CLIENT_FLAG,
+	},
+#endif
 	{
 		.name		= (char *) "alloc-size",
 		.has_arg	= required_argument,
@@ -212,6 +222,11 @@
 		.val		= 'C',
 	},
 	{
+		.name		= (char *) "remote-config",
+		.has_arg	= required_argument,
+		.val		= 'R',
+	},
+	{
 		.name		= (char *) "cpuclock-test",
 		.has_arg	= no_argument,
 		.val		= 'T',
@@ -232,21 +247,45 @@
 		.val		= 'L',
 	},
 	{
+		.name		= (char *) "trigger-file",
+		.has_arg	= required_argument,
+		.val		= 'W',
+	},
+	{
+		.name		= (char *) "trigger-timeout",
+		.has_arg	= required_argument,
+		.val		= 'B',
+	},
+	{
+		.name		= (char *) "trigger",
+		.has_arg	= required_argument,
+		.val		= 'H',
+	},
+	{
+		.name		= (char *) "trigger-remote",
+		.has_arg	= required_argument,
+		.val		= 'J',
+	},
+	{
 		.name		= NULL,
 	},
 };
 
 void free_threads_shm(void)
 {
-	struct shmid_ds sbuf;
-
 	if (threads) {
 		void *tp = threads;
+#ifndef CONFIG_NO_SHM
+		struct shmid_ds sbuf;
 
 		threads = NULL;
 		shmdt(tp);
 		shmctl(shm_id, IPC_RMID, &sbuf);
 		shm_id = -1;
+#else
+		threads = NULL;
+		free(tp);
+#endif
 	}
 }
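A hedged sketch of how the trigger options registered in the table above might
be driven from the command line; the job file, trigger file path, timeout and
command are placeholders, and the actual trigger execution lives in code
outside this hunk:

fio --client=server jobs.fio --trigger-file=/tmp/fio-trigger \
    --trigger-timeout=60 --trigger="reboot-command server"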
 
@@ -259,6 +298,11 @@
 		free_threads_shm();
 	}
 
+	free(trigger_file);
+	free(trigger_cmd);
+	free(trigger_remote_cmd);
+	trigger_file = trigger_cmd = trigger_remote_cmd = NULL;
+
 	options_free(fio_options, &def_thread);
 	fio_filelock_exit();
 	scleanup();
@@ -287,6 +331,7 @@
 		size += file_hash_size;
 		size += sizeof(unsigned int);
 
+#ifndef CONFIG_NO_SHM
 		shm_id = shmget(0, size, IPC_CREAT | 0600);
 		if (shm_id != -1)
 			break;
@@ -294,10 +339,16 @@
 			perror("shmget");
 			break;
 		}
+#else
+		threads = malloc(size);
+		if (threads)
+			break;
+#endif
 
 		max_jobs >>= 1;
 	} while (max_jobs);
 
+#ifndef CONFIG_NO_SHM
 	if (shm_id == -1)
 		return 1;
 
@@ -306,6 +357,7 @@
 		perror("shmat");
 		return 1;
 	}
+#endif
 
 	memset(threads, 0, max_jobs * sizeof(struct thread_data));
 	hash = (void *) threads + max_jobs * sizeof(struct thread_data);
@@ -330,7 +382,7 @@
  * Return a free job structure.
  */
 static struct thread_data *get_new_job(int global, struct thread_data *parent,
-				       int preserve_eo)
+				       int preserve_eo, const char *jobname)
 {
 	struct thread_data *td;
 
@@ -363,6 +415,10 @@
 	profile_add_hooks(td);
 
 	td->thread_number = thread_number;
+	td->subjob_number = 0;
+
+	if (jobname)
+		td->o.name = strdup(jobname);
 
 	if (!parent->o.group_reporting)
 		stat_number++;
@@ -386,6 +442,9 @@
 	if (td->io_ops)
 		free_ioengine(td);
 
+	if (td->o.name)
+		free(td->o.name);
+
 	memset(&threads[td->thread_number - 1], 0, sizeof(*td));
 	thread_number--;
 }
@@ -441,13 +500,8 @@
 
 	delayrange = td->o.start_delay_high - td->o.start_delay;
 
-	if (td->o.use_os_rand) {
-		r = os_random_long(&td->delay_state);
-		delayrange = (unsigned long long) ((double) delayrange * (r / (OS_RAND_MAX + 1.0)));
-	} else {
-		r = __rand(&td->__delay_state);
-		delayrange = (unsigned long long) ((double) delayrange * (r / (FRAND_MAX + 1.0)));
-	}
+	r = __rand(&td->delay_state);
+	delayrange = (unsigned long long) ((double) delayrange * (r / (FRAND_MAX + 1.0)));
 
 	delayrange += td->o.start_delay;
 	return delayrange;
@@ -511,7 +565,6 @@
 	if (!o->max_bs[DDIR_TRIM])
 		o->max_bs[DDIR_TRIM] = o->bs[DDIR_TRIM];
 
-
 	o->rw_min_bs = min(o->min_bs[DDIR_READ], o->min_bs[DDIR_WRITE]);
 	o->rw_min_bs = min(o->min_bs[DDIR_TRIM], o->rw_min_bs);
 
@@ -543,8 +596,7 @@
 	if (o->norandommap && o->verify != VERIFY_NONE
 	    && !fixed_block_size(o))  {
 		log_err("fio: norandommap given for variable block sizes, "
-			"verify disabled\n");
-		o->verify = VERIFY_NONE;
+			"verify limited\n");
 		ret = warnings_fatal;
 	}
 	if (o->bs_unaligned && (o->odirect || td->io_ops->flags & FIO_RAWIO))
@@ -612,6 +664,15 @@
 		if (o->max_bs[DDIR_WRITE] != o->min_bs[DDIR_WRITE] &&
 		    !o->verify_interval)
 			o->verify_interval = o->min_bs[DDIR_WRITE];
+
+		/*
+		 * Verify interval must be smaller or equal to the
+		 * write size.
+		 */
+		if (o->verify_interval > o->min_bs[DDIR_WRITE])
+			o->verify_interval = o->min_bs[DDIR_WRITE];
+		else if (td_read(td) && o->verify_interval > o->min_bs[DDIR_READ])
+			o->verify_interval = o->min_bs[DDIR_READ];
 	}
 
 	if (o->pre_read) {
@@ -696,6 +757,16 @@
 		ret = 1;
 	}
 
+	if (fio_option_is_set(o, gtod_cpu)) {
+		fio_gtod_init();
+		fio_gtod_set_cpu(o->gtod_cpu);
+		fio_gtod_offload = 1;
+	}
+
+	td->loops = o->loops;
+	if (!td->loops)
+		td->loops = 1;
+
 	return ret;
 }
 
@@ -749,44 +820,18 @@
 	return 1;
 }
 
-static void td_fill_rand_seeds_os(struct thread_data *td)
-{
-	os_random_seed(td->rand_seeds[FIO_RAND_BS_OFF], &td->bsrange_state);
-	os_random_seed(td->rand_seeds[FIO_RAND_VER_OFF], &td->verify_state);
-	os_random_seed(td->rand_seeds[FIO_RAND_MIX_OFF], &td->rwmix_state);
-
-	if (td->o.file_service_type == FIO_FSERVICE_RANDOM)
-		os_random_seed(td->rand_seeds[FIO_RAND_FILE_OFF], &td->next_file_state);
-
-	os_random_seed(td->rand_seeds[FIO_RAND_FILE_SIZE_OFF], &td->file_size_state);
-	os_random_seed(td->rand_seeds[FIO_RAND_TRIM_OFF], &td->trim_state);
-	os_random_seed(td->rand_seeds[FIO_RAND_START_DELAY], &td->delay_state);
-
-	if (!td_random(td))
-		return;
-
-	if (td->o.rand_repeatable)
-		td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number;
-
-	os_random_seed(td->rand_seeds[FIO_RAND_BLOCK_OFF], &td->random_state);
-
-	os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF], &td->seq_rand_state[DDIR_READ]);
-	os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF], &td->seq_rand_state[DDIR_WRITE]);
-	os_random_seed(td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF], &td->seq_rand_state[DDIR_TRIM]);
-}
-
 static void td_fill_rand_seeds_internal(struct thread_data *td)
 {
-	init_rand_seed(&td->__bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF]);
-	init_rand_seed(&td->__verify_state, td->rand_seeds[FIO_RAND_VER_OFF]);
-	init_rand_seed(&td->__rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF]);
+	init_rand_seed(&td->bsrange_state, td->rand_seeds[FIO_RAND_BS_OFF]);
+	init_rand_seed(&td->verify_state, td->rand_seeds[FIO_RAND_VER_OFF]);
+	init_rand_seed(&td->rwmix_state, td->rand_seeds[FIO_RAND_MIX_OFF]);
 
 	if (td->o.file_service_type == FIO_FSERVICE_RANDOM)
-		init_rand_seed(&td->__next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF]);
+		init_rand_seed(&td->next_file_state, td->rand_seeds[FIO_RAND_FILE_OFF]);
 
-	init_rand_seed(&td->__file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF]);
-	init_rand_seed(&td->__trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF]);
-	init_rand_seed(&td->__delay_state, td->rand_seeds[FIO_RAND_START_DELAY]);
+	init_rand_seed(&td->file_size_state, td->rand_seeds[FIO_RAND_FILE_SIZE_OFF]);
+	init_rand_seed(&td->trim_state, td->rand_seeds[FIO_RAND_TRIM_OFF]);
+	init_rand_seed(&td->delay_state, td->rand_seeds[FIO_RAND_START_DELAY]);
 
 	if (!td_random(td))
 		return;
@@ -794,26 +839,28 @@
 	if (td->o.rand_repeatable)
 		td->rand_seeds[FIO_RAND_BLOCK_OFF] = FIO_RANDSEED * td->thread_number;
 
-	init_rand_seed(&td->__random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF]);
-	init_rand_seed(&td->__seq_rand_state[DDIR_READ], td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF]);
-	init_rand_seed(&td->__seq_rand_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF]);
-	init_rand_seed(&td->__seq_rand_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF]);
+	init_rand_seed(&td->random_state, td->rand_seeds[FIO_RAND_BLOCK_OFF]);
+	init_rand_seed(&td->seq_rand_state[DDIR_READ], td->rand_seeds[FIO_RAND_SEQ_RAND_READ_OFF]);
+	init_rand_seed(&td->seq_rand_state[DDIR_WRITE], td->rand_seeds[FIO_RAND_SEQ_RAND_WRITE_OFF]);
+	init_rand_seed(&td->seq_rand_state[DDIR_TRIM], td->rand_seeds[FIO_RAND_SEQ_RAND_TRIM_OFF]);
 }
 
 void td_fill_rand_seeds(struct thread_data *td)
 {
 	if (td->o.allrand_repeatable) {
-		for (int i = 0; i < FIO_RAND_NR_OFFS; i++)
+		unsigned int i;
+
+		for (i = 0; i < FIO_RAND_NR_OFFS; i++)
 			td->rand_seeds[i] = FIO_RANDSEED * td->thread_number
 			       	+ i;
 	}
 
-	if (td->o.use_os_rand)
-		td_fill_rand_seeds_os(td);
-	else
-		td_fill_rand_seeds_internal(td);
+	td_fill_rand_seeds_internal(td);
 
 	init_rand_seed(&td->buf_state, td->rand_seeds[FIO_RAND_BUF_OFF]);
+	frand_copy(&td->buf_state_prev, &td->buf_state);
+
+	init_rand_seed(&td->dedupe_state, td->rand_seeds[FIO_DEDUPE_OFF]);
 }
 
 /*
@@ -888,7 +935,17 @@
 		td->flags |= TD_F_READ_IOLOG;
 	if (o->refill_buffers)
 		td->flags |= TD_F_REFILL_BUFFERS;
-	if (o->scramble_buffers)
+	/*
+	 * Always scramble buffers if asked to
+	 */
+	if (o->scramble_buffers && fio_option_is_set(o, scramble_buffers))
+		td->flags |= TD_F_SCRAMBLE_BUFFERS;
+	/*
+	 * But also scramble buffers, unless we were explicitly asked
+	 * to zero them.
+	 */
+	if (o->scramble_buffers && !(o->zero_buffers &&
+	    fio_option_is_set(o, zero_buffers)))
 		td->flags |= TD_F_SCRAMBLE_BUFFERS;
 	if (o->verify != VERIFY_NONE)
 		td->flags |= TD_F_VER_NONE;
@@ -982,8 +1039,14 @@
 				ret = snprintf(dst, dst_left, "%s", jobname);
 				if (ret < 0)
 					break;
-				dst += ret;
-				dst_left -= ret;
+				else if (ret > dst_left) {
+					log_err("fio: truncated filename\n");
+					dst += dst_left;
+					dst_left = 0;
+				} else {
+					dst += ret;
+					dst_left -= ret;
+				}
 				break;
 				}
 			case FPRE_JOBNUM: {
@@ -992,8 +1055,14 @@
 				ret = snprintf(dst, dst_left, "%d", jobnum);
 				if (ret < 0)
 					break;
-				dst += ret;
-				dst_left -= ret;
+				else if (ret > dst_left) {
+					log_err("fio: truncated filename\n");
+					dst += dst_left;
+					dst_left = 0;
+				} else {
+					dst += ret;
+					dst_left -= ret;
+				}
 				break;
 				}
 			case FPRE_FILENUM: {
@@ -1002,8 +1071,14 @@
 				ret = snprintf(dst, dst_left, "%d", filenum);
 				if (ret < 0)
 					break;
-				dst += ret;
-				dst_left -= ret;
+				else if (ret > dst_left) {
+					log_err("fio: truncated filename\n");
+					dst += dst_left;
+					dst_left = 0;
+				} else {
+					dst += ret;
+					dst_left -= ret;
+				}
 				break;
 				}
 			default:
@@ -1038,6 +1113,7 @@
 	char fname[PATH_MAX];
 	int numjobs, file_alloced;
 	struct thread_options *o = &td->o;
+	char logname[PATH_MAX + 32];
 
 	/*
 	 * the def_thread is just for options, it's not a real job
@@ -1127,15 +1203,72 @@
 	if (setup_rate(td))
 		goto err;
 
-	if (o->lat_log_file || write_lat_log) {
-		setup_log(&td->lat_log, o->log_avg_msec, IO_LOG_TYPE_LAT);
-		setup_log(&td->slat_log, o->log_avg_msec, IO_LOG_TYPE_SLAT);
-		setup_log(&td->clat_log, o->log_avg_msec, IO_LOG_TYPE_CLAT);
+	if (o->lat_log_file) {
+		struct log_params p = {
+			.td = td,
+			.avg_msec = o->log_avg_msec,
+			.log_type = IO_LOG_TYPE_LAT,
+			.log_offset = o->log_offset,
+			.log_gz = o->log_gz,
+			.log_gz_store = o->log_gz_store,
+		};
+		const char *suf;
+
+		if (p.log_gz_store)
+			suf = "log.fz";
+		else
+			suf = "log";
+
+		snprintf(logname, sizeof(logname), "%s_lat.%d.%s",
+				o->lat_log_file, td->thread_number, suf);
+		setup_log(&td->lat_log, &p, logname);
+		snprintf(logname, sizeof(logname), "%s_slat.%d.%s",
+				o->lat_log_file, td->thread_number, suf);
+		setup_log(&td->slat_log, &p, logname);
+		snprintf(logname, sizeof(logname), "%s_clat.%d.%s",
+				o->lat_log_file, td->thread_number, suf);
+		setup_log(&td->clat_log, &p, logname);
 	}
-	if (o->bw_log_file || write_bw_log)
-		setup_log(&td->bw_log, o->log_avg_msec, IO_LOG_TYPE_BW);
-	if (o->iops_log_file)
-		setup_log(&td->iops_log, o->log_avg_msec, IO_LOG_TYPE_IOPS);
+	if (o->bw_log_file) {
+		struct log_params p = {
+			.td = td,
+			.avg_msec = o->log_avg_msec,
+			.log_type = IO_LOG_TYPE_BW,
+			.log_offset = o->log_offset,
+			.log_gz = o->log_gz,
+			.log_gz_store = o->log_gz_store,
+		};
+		const char *suf;
+
+		if (p.log_gz_store)
+			suf = "log.fz";
+		else
+			suf = "log";
+
+		snprintf(logname, sizeof(logname), "%s_bw.%d.%s",
+				o->bw_log_file, td->thread_number, suf);
+		setup_log(&td->bw_log, &p, logname);
+	}
+	if (o->iops_log_file) {
+		struct log_params p = {
+			.td = td,
+			.avg_msec = o->log_avg_msec,
+			.log_type = IO_LOG_TYPE_IOPS,
+			.log_offset = o->log_offset,
+			.log_gz = o->log_gz,
+			.log_gz_store = o->log_gz_store,
+		};
+		const char *suf;
+
+		if (p.log_gz_store)
+			suf = "log.fz";
+		else
+			suf = "log";
+
+		snprintf(logname, sizeof(logname), "%s_iops.%d.%s",
+				o->iops_log_file, td->thread_number, suf);
+		setup_log(&td->iops_log, &p, logname);
+	}
 
 	if (!o->name)
 		o->name = strdup(jobname);
@@ -1190,7 +1323,7 @@
 	 */
 	numjobs = o->numjobs;
 	while (--numjobs) {
-		struct thread_data *td_new = get_new_job(0, td, 1);
+		struct thread_data *td_new = get_new_job(0, td, 1, jobname);
 
 		if (!td_new)
 			goto err;
@@ -1198,6 +1331,7 @@
 		td_new->o.numjobs = 1;
 		td_new->o.stonewall = 0;
 		td_new->o.new_group = 0;
+		td_new->subjob_number = numjobs;
 
 		if (file_alloced) {
 			if (td_new->files) {
@@ -1248,11 +1382,11 @@
 			sprintf(jobname, "%s", o[i] + 5);
 		}
 		if (in_global && !td_parent)
-			td_parent = get_new_job(1, &def_thread, 0);
+			td_parent = get_new_job(1, &def_thread, 0, jobname);
 		else if (!in_global && !td) {
 			if (!td_parent)
 				td_parent = &def_thread;
-			td = get_new_job(0, td_parent, 0);
+			td = get_new_job(0, td_parent, 0, jobname);
 		}
 		if (in_global)
 			fio_options_parse(td_parent, (char **) &o[i], 1, 0);
@@ -1300,11 +1434,12 @@
 /*
  * This is our [ini] type file parser.
  */
-int parse_jobs_ini(char *file, int is_buf, int stonewall_flag, int type)
+int __parse_jobs_ini(struct thread_data *td,
+		char *file, int is_buf, int stonewall_flag, int type,
+		int nested, char *name, char ***popts, int *aopts, int *nopts)
 {
-	unsigned int global;
-	struct thread_data *td;
-	char *string, *name;
+	unsigned int global = 0;
+	char *string;
 	FILE *f;
 	char *p;
 	int ret = 0, stonewall;
@@ -1314,6 +1449,9 @@
 	char **opts;
 	int i, alloc_opts, num_opts;
 
+	dprint(FD_PARSE, "Parsing ini file %s\n", file);
+	assert(td || !nested);
+
 	if (is_buf)
 		f = NULL;
 	else {
@@ -1323,7 +1461,11 @@
 			f = fopen(file, "r");
 
 		if (!f) {
-			perror("fopen job file");
+			int __err = errno;
+
+			log_err("fio: unable to open '%s' job file\n", file);
+			if (td)
+				td_verror(td, __err, "job file open");
 			return 1;
 		}
 	}
@@ -1333,12 +1475,23 @@
 	/*
 	 * it's really 256 + small bit, 280 should suffice
 	 */
-	name = malloc(280);
-	memset(name, 0, 280);
+	if (!nested) {
+		name = malloc(280);
+		memset(name, 0, 280);
+	}
 
-	alloc_opts = 8;
-	opts = malloc(sizeof(char *) * alloc_opts);
-	num_opts = 0;
+	opts = NULL;
+	if (nested && popts) {
+		opts = *popts;
+		alloc_opts = *aopts;
+		num_opts = *nopts;
+	}
+
+	if (!opts) {
+		alloc_opts = 8;
+		opts = malloc(sizeof(char *) * alloc_opts);
+		num_opts = 0;
+	}
 
 	stonewall = stonewall_flag;
 	do {
@@ -1359,58 +1512,73 @@
 		strip_blank_front(&p);
 		strip_blank_end(p);
 
+		dprint(FD_PARSE, "%s\n", p);
 		if (is_empty_or_comment(p))
 			continue;
-		if (sscanf(p, "[%255[^\n]]", name) != 1) {
-			if (inside_skip)
+
+		if (!nested) {
+			if (sscanf(p, "[%255[^\n]]", name) != 1) {
+				if (inside_skip)
+					continue;
+
+				log_err("fio: option <%s> outside of "
+					"[] job section\n", p);
+				ret = 1;
+				break;
+			}
+
+			name[strlen(name) - 1] = '\0';
+
+			if (skip_this_section(name)) {
+				inside_skip = 1;
 				continue;
-			log_err("fio: option <%s> outside of [] job section\n",
-									p);
-			break;
+			} else
+				inside_skip = 0;
+
+			dprint(FD_PARSE, "Parsing section [%s]\n", name);
+
+			global = !strncmp(name, "global", 6);
+
+			if (dump_cmdline) {
+				if (first_sect)
+					log_info("fio ");
+				if (!global)
+					log_info("--name=%s ", name);
+				first_sect = 0;
+			}
+
+			td = get_new_job(global, &def_thread, 0, name);
+			if (!td) {
+				ret = 1;
+				break;
+			}
+
+			/*
+			 * Separate multiple job files by a stonewall
+			 */
+			if (!global && stonewall) {
+				td->o.stonewall = stonewall;
+				stonewall = 0;
+			}
+
+			num_opts = 0;
+			memset(opts, 0, alloc_opts * sizeof(char *));
 		}
-
-		name[strlen(name) - 1] = '\0';
-
-		if (skip_this_section(name)) {
-			inside_skip = 1;
-			continue;
-		} else
-			inside_skip = 0;
-
-		global = !strncmp(name, "global", 6);
-
-		if (dump_cmdline) {
-			if (first_sect)
-				log_info("fio ");
-			if (!global)
-				log_info("--name=%s ", name);
-			first_sect = 0;
-		}
-
-		td = get_new_job(global, &def_thread, 0);
-		if (!td) {
-			ret = 1;
-			break;
-		}
-
-		/*
-		 * Separate multiple job files by a stonewall
-		 */
-		if (!global && stonewall) {
-			td->o.stonewall = stonewall;
-			stonewall = 0;
-		}
-
-		num_opts = 0;
-		memset(opts, 0, alloc_opts * sizeof(char *));
+		else
+			skip_fgets = 1;
 
 		while (1) {
-			if (is_buf)
-				p = strsep(&file, "\n");
+			if (!skip_fgets) {
+				if (is_buf)
+					p = strsep(&file, "\n");
+				else
+					p = fgets(string, 4096, f);
+				if (!p)
+					break;
+				dprint(FD_PARSE, "%s", p);
+			}
 			else
-				p = fgets(string, 4096, f);
-			if (!p)
-				break;
+				skip_fgets = 0;
 
 			if (is_empty_or_comment(p))
 				continue;
@@ -1422,12 +1590,30 @@
 			 * fgets() a new line at the top.
 			 */
 			if (p[0] == '[') {
+				if (nested) {
+					log_err("No new sections in included files\n");
+					return 1;
+				}
+
 				skip_fgets = 1;
 				break;
 			}
 
 			strip_blank_end(p);
 
+			if (!strncmp(p, "include", strlen("include"))) {
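+				/*
+				 * 'include <file>': recursively parse the
+				 * given file, appending its options to the
+				 * current job section.
+				 */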
+				char *filename = p + strlen("include") + 1;
+
+				if ((ret = __parse_jobs_ini(td, filename,
+						is_buf, stonewall_flag, type, 1,
+						name, &opts, &alloc_opts, &num_opts))) {
+					log_err("Error %d while parsing include file %s\n",
+						ret, filename);
+					break;
+				}
+				continue;
+			}
+
 			if (num_opts == alloc_opts) {
 				alloc_opts <<= 1;
 				opts = realloc(opts,
@@ -1438,6 +1624,13 @@
 			num_opts++;
 		}
 
+		if (nested) {
+			*popts = opts;
+			*aopts = alloc_opts;
+			*nopts = num_opts;
+			goto out;
+		}
+
 		ret = fio_options_parse(td, opts, num_opts, dump_cmdline);
 		if (!ret)
 			ret = add_job(td, name, 0, 0, type);
@@ -1460,14 +1653,22 @@
 		i++;
 	}
 
-	free(string);
-	free(name);
 	free(opts);
+out:
+	free(string);
+	if (!nested)
+		free(name);
 	if (!is_buf && f != stdin)
 		fclose(f);
 	return ret;
 }
 
+int parse_jobs_ini(char *file, int is_buf, int stonewall_flag, int type)
+{
+	return __parse_jobs_ini(NULL, file, is_buf, stonewall_flag, type,
+			0, NULL, NULL, NULL, NULL);
+}
+
 static int fill_def_thread(void)
 {
 	memset(&def_thread, 0, sizeof(def_thread));
@@ -1488,11 +1689,10 @@
 	printf("%s [options] [job options] <job file(s)>\n", name);
 	printf("  --debug=options\tEnable debug logging. May be one/more of:\n"
 		"\t\t\tprocess,file,io,mem,blktrace,verify,random,parse,\n"
-		"\t\t\tdiskutil,job,mutex,profile,time,net,rate\n");
+		"\t\t\tdiskutil,job,mutex,profile,time,net,rate,compress\n");
 	printf("  --parse-only\t\tParse options only, don't start any IO\n");
 	printf("  --output\t\tWrite output to file\n");
 	printf("  --runtime\t\tRuntime in seconds\n");
-	printf("  --latency-log\t\tGenerate per-job latency logs\n");
 	printf("  --bandwidth-log\tGenerate per-job bandwidth logs\n");
 	printf("  --minimal\t\tMinimal (terse) output\n");
 	printf("  --output-format=x\tOutput format (terse,json,normal)\n");
@@ -1524,9 +1724,17 @@
 	printf("  --server=args\t\tStart a backend fio server\n");
 	printf("  --daemonize=pidfile\tBackground fio server, write pid to file\n");
 	printf("  --client=hostname\tTalk to remote backend fio server at hostname\n");
+	printf("  --remote-config=file\tTell fio server to load this local job file\n");
 	printf("  --idle-prof=option\tReport cpu idleness on a system or percpu basis\n"
 		"\t\t\t(option=system,percpu) or run unit work\n"
 		"\t\t\tcalibration only (option=calibrate)\n");
+#ifdef CONFIG_ZLIB
+	printf("  --inflate-log=log\tInflate and output compressed log\n");
+#endif
+	printf("  --trigger-file=file\tExecute trigger cmd when file exists\n");
+	printf("  --trigger-timeout=t\tExecute trigger at this time\n");
+	printf("  --trigger=cmd\t\tSet this command as local trigger\n");
+	printf("  --trigger-remote=cmd\tSet this command as remote trigger\n");
 	printf("\nFio was written by Jens Axboe <jens.axboe@oracle.com>");
 	printf("\n                   Jens Axboe <jaxboe@fusionio.com>");
 	printf("\n                   Jens Axboe <axboe@fb.com>\n");
@@ -1594,6 +1802,10 @@
 	  .help = "Rate logging",
 	  .shift = FD_RATE,
 	},
+	{ .name = "compress",
+	  .help = "Log compression logging",
+	  .shift = FD_COMPRESS,
+	},
 	{ .name = NULL, },
 };
 
@@ -1700,6 +1912,30 @@
 	fio_client_add_cmd_option(client, opt);
 }
 
+static void show_closest_option(const char *name)
+{
+	int best_option, best_distance;
+	int i, distance;
+
+	while (*name == '-')
+		name++;
+
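+	/*
+	 * Find the known long option with the smallest string distance
+	 * to the unrecognized one, so we can suggest it.
+	 */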
+	best_option = -1;
+	best_distance = INT_MAX;
+	i = 0;
+	while (l_opts[i].name) {
+		distance = string_distance(name, l_opts[i].name);
+		if (distance < best_distance) {
+			best_distance = distance;
+			best_option = i;
+		}
+		i++;
+	}
+
+	if (best_option != -1)
+		log_err("Did you mean %s?\n", l_opts[best_option].name);
+}
+
 int parse_cmd_line(int argc, char *argv[], int client_type)
 {
 	struct thread_data *td = NULL;
@@ -1733,13 +1969,15 @@
 			}
 			break;
 		case 'l':
-			write_lat_log = 1;
+			log_err("fio: --latency-log is deprecated. Use per-job latency log options.\n");
+			do_exit++;
+			exit_val = 1;
 			break;
 		case 'b':
 			write_bw_log = 1;
 			break;
 		case 'o':
-			if (f_out)
+			if (f_out && f_out != stdout)
 				fclose(f_out);
 
 			f_out = fopen(optarg, "w+");
@@ -1824,12 +2062,12 @@
 		case 'E': {
 			long long t = 0;
 
-			if (str_to_decimal(optarg, &t, 0, NULL, 1)) {
+			if (check_str_time(optarg, &t, 1)) {
 				log_err("fio: failed parsing eta time %s\n", optarg);
 				exit_val = 1;
 				do_exit++;
 			}
-			eta_new_line = t;
+			eta_new_line = t / 1000;
 			break;
 			}
 		case 'd':
@@ -1856,6 +2094,13 @@
 			nr_job_sections++;
 			break;
 			}
+#ifdef CONFIG_ZLIB
+		case 'X':
+			exit_val = iolog_file_inflate(optarg);
+			did_arg++;
+			do_exit++;
+			break;
+#endif
 		case 'p':
 			did_arg = 1;
 			if (exec_profile)
@@ -1883,9 +2128,15 @@
 				if (is_section && skip_this_section(val))
 					continue;
 
-				td = get_new_job(global, &def_thread, 1);
-				if (!td || ioengine_load(td))
-					goto out_free;
+				td = get_new_job(global, &def_thread, 1, NULL);
+				if (!td || ioengine_load(td)) {
+					if (td) {
+						put_job(td);
+						td = NULL;
+					}
+					do_exit++;
+					break;
+				}
 				fio_options_set_ioengine_opts(l_opts, td);
 			}
 
@@ -1906,8 +2157,12 @@
 
 			if (!ret && !strcmp(opt, "ioengine")) {
 				free_ioengine(td);
-				if (ioengine_load(td))
-					goto out_free;
+				if (ioengine_load(td)) {
+					put_job(td);
+					td = NULL;
+					do_exit++;
+					break;
+				}
 				fio_options_set_ioengine_opts(l_opts, td);
 			}
 			break;
@@ -1935,6 +2190,7 @@
 			break;
 		case 'S':
 			did_arg = 1;
+#ifndef CONFIG_NO_SHM
 			if (nr_clients) {
 				log_err("fio: can't be both client and server\n");
 				do_exit++;
@@ -1945,6 +2201,11 @@
 				fio_server_set_arg(optarg);
 			is_backend = 1;
 			backend = 1;
+#else
+			log_err("fio: client/server requires SHM support\n");
+			do_exit++;
+			exit_val = 1;
+#endif
 			break;
 		case 'D':
 			if (pid_file)
@@ -1983,14 +2244,22 @@
 				    !strncmp(argv[optind], "-", 1))
 					break;
 
-				fio_client_add_ini_file(cur_client, argv[optind]);
+				if (fio_client_add_ini_file(cur_client, argv[optind], 0))
+					break;
 				optind++;
 			}
 			break;
+		case 'R':
+			did_arg = 1;
+			if (fio_client_add_ini_file(cur_client, optarg, 1)) {
+				do_exit++;
+				exit_val = 1;
+			}
+			break;
 		case 'T':
 			did_arg = 1;
 			do_exit++;
-			exit_val = fio_monotonic_clocktest();
+			exit_val = fio_monotonic_clocktest(1);
 			break;
 		case 'G':
 			did_arg = 1;
@@ -2000,18 +2269,42 @@
 		case 'L': {
 			long long val;
 
-			if (check_str_time(optarg, &val, 0)) {
+			if (check_str_time(optarg, &val, 1)) {
 				log_err("fio: failed parsing time %s\n", optarg);
 				do_exit++;
 				exit_val = 1;
 				break;
 			}
-			status_interval = val * 1000;
+			status_interval = val / 1000;
 			break;
 			}
+		case 'W':
+			if (trigger_file)
+				free(trigger_file);
+			trigger_file = strdup(optarg);
+			break;
+		case 'H':
+			if (trigger_cmd)
+				free(trigger_cmd);
+			trigger_cmd = strdup(optarg);
+			break;
+		case 'J':
+			if (trigger_remote_cmd)
+				free(trigger_remote_cmd);
+			trigger_remote_cmd = strdup(optarg);
+			break;
+		case 'B':
+			if (check_str_time(optarg, &trigger_timeout, 1)) {
+				log_err("fio: failed parsing time %s\n", optarg);
+				do_exit++;
+				exit_val = 1;
+			}
+			trigger_timeout /= 1000000;
+			break;
 		case '?':
 			log_err("%s: unrecognized option '%s'\n", argv[0],
 							argv[optind - 1]);
+			show_closest_option(argv[optind - 1]);
 		default:
 			do_exit++;
 			exit_val = 1;
@@ -2129,12 +2422,6 @@
 		return 0;
 	}
 
-	if (def_thread.o.gtod_offload) {
-		fio_gtod_init();
-		fio_gtod_offload = 1;
-		fio_gtod_cpu = def_thread.o.gtod_cpu;
-	}
-
 	if (output_format == FIO_OUTPUT_NORMAL)
 		log_info("%s\n", fio_version_string);
 
diff --git a/io_ddir.h b/io_ddir.h
index eb25c50..b16a6b9 100644
--- a/io_ddir.h
+++ b/io_ddir.h
@@ -10,9 +10,21 @@
 	DDIR_DATASYNC,
 	DDIR_SYNC_FILE_RANGE,
 	DDIR_WAIT,
+	DDIR_LAST,
 	DDIR_INVAL = -1,
 };
 
+static inline const char *io_ddir_name(enum fio_ddir ddir)
+{
+	const char *name[] = { "read", "write", "trim", "sync", "datasync",
+				"sync_file_range", "wait", };
+
+	if (ddir < DDIR_LAST)
+		return name[ddir];
+
+	return "invalid";
+}
+
 enum td_ddir {
 	TD_DDIR_READ		= 1 << 0,
 	TD_DDIR_WRITE		= 1 << 1,
@@ -30,7 +42,7 @@
 #define td_trim(td)		((td)->o.td_ddir & TD_DDIR_TRIM)
 #define td_rw(td)		(((td)->o.td_ddir & TD_DDIR_RW) == TD_DDIR_RW)
 #define td_random(td)		((td)->o.td_ddir & TD_DDIR_RAND)
-#define file_randommap(td, f)	(!(td)->o.norandommap && (f)->io_axmap)
+#define file_randommap(td, f)	(!(td)->o.norandommap && fio_file_axmap((f)))
 
 static inline int ddir_sync(enum fio_ddir ddir)
 {
@@ -45,15 +57,13 @@
 
 static inline const char *ddir_str(enum td_ddir ddir)
 {
-	const char *ddir_str[] = { NULL, "read", "write", "rw", NULL,
-				   "randread", "randwrite", "randrw",
-				   "trim", NULL, NULL, NULL, "randtrim" };
+	const char *__str[] = { NULL, "read", "write", "rw", NULL,
+				"randread", "randwrite", "randrw",
+				"trim", NULL, NULL, NULL, "randtrim" };
 
-	return ddir_str[ddir];
+	return __str[ddir];
 }
 
-#define ddir_trim(ddir) ((ddir) == DDIR_TRIM)
-
 #define ddir_rw_sum(arr)	\
 	((arr)[DDIR_READ] + (arr)[DDIR_WRITE] + (arr)[DDIR_TRIM])
 
diff --git a/io_u.c b/io_u.c
index e132fd9..f61fee8 100644
--- a/io_u.c
+++ b/io_u.c
@@ -68,6 +68,9 @@
 	if (td->o.zone_range)
 		max_size = td->o.zone_range;
 
+	if (td->o.min_bs[ddir] > td->o.ba[ddir])
+		max_size -= td->o.min_bs[ddir] - td->o.ba[ddir];
+
 	max_blocks = max_size / (uint64_t) td->o.ba[ddir];
 	if (!max_blocks)
 		return 0;
@@ -83,32 +86,26 @@
 static int __get_next_rand_offset(struct thread_data *td, struct fio_file *f,
 				  enum fio_ddir ddir, uint64_t *b)
 {
-	uint64_t r, lastb;
-
-	lastb = last_block(td, f, ddir);
-	if (!lastb)
-		return 1;
+	uint64_t r;
 
 	if (td->o.random_generator == FIO_RAND_GEN_TAUSWORTHE) {
-		uint64_t rmax;
+		uint64_t lastb;
 
-		rmax = td->o.use_os_rand ? OS_RAND_MAX : FRAND_MAX;
+		lastb = last_block(td, f, ddir);
+		if (!lastb)
+			return 1;
 
-		if (td->o.use_os_rand) {
-			rmax = OS_RAND_MAX;
-			r = os_random_long(&td->random_state);
-		} else {
-			rmax = FRAND_MAX;
-			r = __rand(&td->__random_state);
-		}
+		r = __rand(&td->random_state);
 
 		dprint(FD_RANDOM, "off rand %llu\n", (unsigned long long) r);
 
-		*b = lastb * (r / ((uint64_t) rmax + 1.0));
+		*b = lastb * (r / ((uint64_t) FRAND_MAX + 1.0));
 	} else {
 		uint64_t off = 0;
 
-		if (lfsr_next(&f->lfsr, &off, lastb))
+		assert(fio_file_lfsr(f));
+
+		if (lfsr_next(&f->lfsr, &off))
 			return 1;
 
 		*b = off;
@@ -200,13 +197,8 @@
 	if (td->o.perc_rand[ddir] == 100)
 		return 1;
 
-	if (td->o.use_os_rand) {
-		r = os_random_long(&td->seq_rand_state[ddir]);
-		v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
-	} else {
-		r = __rand(&td->__seq_rand_state[ddir]);
-		v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
-	}
+	r = __rand(&td->seq_rand_state[ddir]);
+	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
 
 	return v <= td->o.perc_rand[ddir];
 }
@@ -221,9 +213,8 @@
 		return get_off_from_method(td, f, ddir, b);
 
 	if (!flist_empty(&td->next_rand_list)) {
-		struct rand_off *r;
 fetch:
-		r = flist_entry(td->next_rand_list.next, struct rand_off, list);
+		r = flist_first_entry(&td->next_rand_list, struct rand_off, list);
 		flist_del(&r->list);
 		*b = r->off;
 		free(r);
@@ -263,7 +254,7 @@
 	}
 
 	dprint(FD_IO, "%s: rand offset failed, last=%llu, size=%llu\n",
-			f->file_name, (unsigned long long) f->last_pos,
+			f->file_name, (unsigned long long) f->last_pos[ddir],
 			(unsigned long long) f->real_file_size);
 	return 1;
 }
@@ -271,20 +262,32 @@
 static int get_next_seq_offset(struct thread_data *td, struct fio_file *f,
 			       enum fio_ddir ddir, uint64_t *offset)
 {
+	struct thread_options *o = &td->o;
+
 	assert(ddir_rw(ddir));
 
-	if (f->last_pos >= f->io_size + get_start_offset(td, f) && td->o.time_based)
-		f->last_pos = f->last_pos - f->io_size;
+	if (f->last_pos[ddir] >= f->io_size + get_start_offset(td, f) &&
+	    o->time_based)
+		f->last_pos[ddir] = f->last_pos[ddir] - f->io_size;
 
-	if (f->last_pos < f->real_file_size) {
+	if (f->last_pos[ddir] < f->real_file_size) {
 		uint64_t pos;
 
-		if (f->last_pos == f->file_offset && td->o.ddir_seq_add < 0)
-			f->last_pos = f->real_file_size;
+		if (f->last_pos[ddir] == f->file_offset && o->ddir_seq_add < 0)
+			f->last_pos[ddir] = f->real_file_size;
 
-		pos = f->last_pos - f->file_offset;
-		if (pos)
-			pos += td->o.ddir_seq_add;
+		pos = f->last_pos[ddir] - f->file_offset;
+		if (pos && o->ddir_seq_add) {
+			pos += o->ddir_seq_add;
+
+			/*
+			 * If we reach beyond the end of the file
+			 * with holed IO, wrap around to the
+			 * beginning again.
+			 */
+			if (pos >= f->real_file_size)
+				pos = f->file_offset;
+		}
 
 		*offset = pos;
 		return 0;
@@ -332,8 +335,8 @@
 				*is_random = 0;
 			}
 		} else if (td->o.rw_seq == RW_SEQ_IDENT) {
-			if (f->last_start != -1ULL)
-				offset = f->last_start - f->file_offset;
+			if (f->last_start[ddir] != -1ULL)
+				offset = f->last_start[ddir] - f->file_offset;
 			else
 				offset = 0;
 			ret = 0;
@@ -424,7 +427,7 @@
 	int ddir = io_u->ddir;
 	unsigned int buflen = 0;
 	unsigned int minbs, maxbs;
-	unsigned long r, rand_max;
+	unsigned long r;
 
 	assert(ddir_rw(ddir));
 
@@ -443,20 +446,12 @@
 	if (!io_u_fits(td, io_u, minbs))
 		return 0;
 
-	if (td->o.use_os_rand)
-		rand_max = OS_RAND_MAX;
-	else
-		rand_max = FRAND_MAX;
-
 	do {
-		if (td->o.use_os_rand)
-			r = os_random_long(&td->bsrange_state);
-		else
-			r = __rand(&td->__bsrange_state);
+		r = __rand(&td->bsrange_state);
 
 		if (!td->o.bssplit_nr[ddir]) {
 			buflen = 1 + (unsigned int) ((double) maxbs *
-					(r / (rand_max + 1.0)));
+					(r / (FRAND_MAX + 1.0)));
 			if (buflen < minbs)
 				buflen = minbs;
 		} else {
@@ -468,7 +463,7 @@
 
 				buflen = bsp->bs;
 				perc += bsp->perc;
-				if ((r <= ((rand_max / 100L) * perc)) &&
+				if ((r <= ((FRAND_MAX / 100L) * perc)) &&
 				    io_u_fits(td, io_u, buflen))
 					break;
 			}
@@ -517,13 +512,8 @@
 	unsigned int v;
 	unsigned long r;
 
-	if (td->o.use_os_rand) {
-		r = os_random_long(&td->rwmix_state);
-		v = 1 + (int) (100.0 * (r / (OS_RAND_MAX + 1.0)));
-	} else {
-		r = __rand(&td->__rwmix_state);
-		v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
-	}
+	r = __rand(&td->rwmix_state);
+	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
 
 	if (v <= td->o.rwmix[DDIR_READ])
 		return DDIR_READ;
@@ -542,6 +532,12 @@
 	 * io's that have been actually submitted to an async engine,
 	 * and cur_depth is meaningless for sync engines.
 	 */
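+	/* commit anything queued but not yet submitted, before waiting */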
+	if (td->io_u_queued || td->cur_depth) {
+		int fio_unused ret;
+
+		ret = td_io_commit(td);
+	}
+
 	while (td->io_u_in_flight) {
 		int fio_unused ret;
 
@@ -552,7 +548,6 @@
 static enum fio_ddir rate_ddir(struct thread_data *td, enum fio_ddir ddir)
 {
 	enum fio_ddir odir = ddir ^ 1;
-	struct timeval t;
 	long usec;
 
 	assert(ddir_rw(ddir));
@@ -587,9 +582,7 @@
 
 	io_u_quiesce(td);
 
-	fio_gettime(&t, NULL);
-	usec_sleep(td, usec);
-	usec = utime_since_now(&t);
+	usec = usec_sleep(td, usec);
 
 	td->rate_pending_usleep[ddir] -= usec;
 
@@ -597,8 +590,8 @@
 	if (td_rw(td) && __should_check_rate(td, odir))
 		td->rate_pending_usleep[odir] -= usec;
 
-	if (ddir_trim(ddir))
-		return ddir;
+	if (ddir == DDIR_TRIM)
+		return DDIR_TRIM;
 
 	return ddir;
 }
@@ -688,10 +681,10 @@
 {
 	td_io_u_lock(td);
 
-	if (io_u->file && !(io_u->flags & IO_U_F_FREE_DEF))
+	if (io_u->file && !(io_u->flags & IO_U_F_NO_FILE_PUT))
 		put_file_log(td, io_u->file);
+
 	io_u->file = NULL;
-	io_u->flags &= ~IO_U_F_FREE_DEF;
 	io_u->flags |= IO_U_F_FREE;
 
 	if (io_u->flags & IO_U_F_IN_CUR_DEPTH)
@@ -748,9 +741,17 @@
 	 * See if it's time to switch to a new zone
 	 */
 	if (td->zone_bytes >= td->o.zone_size && td->o.zone_skip) {
+		struct fio_file *f = io_u->file;
+
 		td->zone_bytes = 0;
-		io_u->file->file_offset += td->o.zone_range + td->o.zone_skip;
-		io_u->file->last_pos = io_u->file->file_offset;
+		f->file_offset += td->o.zone_range + td->o.zone_skip;
+
+		/*
+		 * Wrap from the beginning, if we exceed the file size
+		 */
+		if (f->file_offset >= f->real_file_size)
+			f->file_offset = f->real_file_size - f->file_offset;
+		f->last_pos[io_u->ddir] = f->file_offset;
 		td->io_skip_bytes += td->o.zone_skip;
 	}
 
@@ -967,15 +968,9 @@
 		int opened = 0;
 		unsigned long r;
 
-		if (td->o.use_os_rand) {
-			r = os_random_long(&td->next_file_state);
-			fno = (unsigned int) ((double) td->o.nr_files
-				* (r / (OS_RAND_MAX + 1.0)));
-		} else {
-			r = __rand(&td->__next_file_state);
-			fno = (unsigned int) ((double) td->o.nr_files
+		r = __rand(&td->next_file_state);
+		fno = (unsigned int) ((double) td->o.nr_files
 				* (r / (FRAND_MAX + 1.0)));
-		}
 
 		f = td->files[fno];
 		if (fio_file_done(f))
@@ -1100,7 +1095,7 @@
 
 static struct fio_file *get_next_file(struct thread_data *td)
 {
-	if (!(td->flags & TD_F_PROFILE_OPS)) {
+	if (td->flags & TD_F_PROFILE_OPS) {
 		struct prof_io_ops *ops = &td->prof_io_ops;
 
 		if (ops->get_next_file)
@@ -1281,7 +1276,7 @@
  * If latency target is enabled, we might be ramping up or down and not
  * using the full queue depth available.
  */
-int queue_full(struct thread_data *td)
+int queue_full(const struct thread_data *td)
 {
 	const int qempty = io_u_qempty(&td->io_u_freelist);
 
@@ -1297,6 +1292,9 @@
 {
 	struct io_u *io_u = NULL;
 
+	if (td->stop_io)
+		return NULL;
+
 	td_io_u_lock(td);
 
 again:
@@ -1313,9 +1311,9 @@
 
 	if (io_u) {
 		assert(io_u->flags & IO_U_F_FREE);
-		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_FREE_DEF);
-		io_u->flags &= ~(IO_U_F_TRIMMED | IO_U_F_BARRIER);
-		io_u->flags &= ~IO_U_F_VER_LIST;
+		io_u->flags &= ~(IO_U_F_FREE | IO_U_F_NO_FILE_PUT |
+				 IO_U_F_TRIMMED | IO_U_F_BARRIER |
+				 IO_U_F_VER_LIST);
 
 		io_u->error = 0;
 		io_u->acct_ddir = -1;
@@ -1481,13 +1479,14 @@
 			goto err_put;
 		}
 
-		f->last_start = io_u->offset;
-		f->last_pos = io_u->offset + io_u->buflen;
+		f->last_start[io_u->ddir] = io_u->offset;
+		f->last_pos[io_u->ddir] = io_u->offset + io_u->buflen;
 
 		if (io_u->ddir == DDIR_WRITE) {
 			if (td->flags & TD_F_REFILL_BUFFERS) {
 				io_u_fill_buffer(td, io_u,
-					io_u->xfer_buflen, io_u->xfer_buflen);
+					td->o.min_bs[DDIR_WRITE],
+					io_u->xfer_buflen);
 			} else if ((td->flags & TD_F_SCRAMBLE_BUFFERS) &&
 				   !(td->flags & TD_F_COMPRESS))
 				do_scramble = 1;
@@ -1528,21 +1527,16 @@
 void io_u_log_error(struct thread_data *td, struct io_u *io_u)
 {
 	enum error_type_bit eb = td_error_type(io_u->ddir, io_u->error);
-	const char *msg[] = { "read", "write", "sync", "datasync",
-				"sync_file_range", "wait", "trim" };
 
 	if (td_non_fatal_error(td, eb, io_u->error) && !td->o.error_dump)
 		return;
 
-	log_err("fio: io_u error");
-
-	if (io_u->file)
-		log_err(" on file %s", io_u->file->file_name);
-
-	log_err(": %s\n", strerror(io_u->error));
-
-	log_err("     %s offset=%llu, buflen=%lu\n", msg[io_u->ddir],
-					io_u->offset, io_u->xfer_buflen);
+	log_err("fio: io_u error%s%s: %s: %s offset=%llu, buflen=%lu\n",
+		io_u->file ? " on file " : "",
+		io_u->file ? io_u->file->file_name : "",
+		strerror(io_u->error),
+		io_ddir_name(io_u->ddir),
+		io_u->offset, io_u->xfer_buflen);
 
 	if (!td->error)
 		td_verror(td, io_u->error, "io_u error");
@@ -1567,7 +1561,7 @@
 		unsigned long tusec;
 
 		tusec = utime_since(&io_u->start_time, &icd->time);
-		add_lat_sample(td, idx, tusec, bytes);
+		add_lat_sample(td, idx, tusec, bytes, io_u->offset);
 
 		if (td->flags & TD_F_PROFILE_OPS) {
 			struct prof_io_ops *ops = &td->prof_io_ops;
@@ -1585,7 +1579,7 @@
 	}
 
 	if (!td->o.disable_clat) {
-		add_clat_sample(td, idx, lusec, bytes);
+		add_clat_sample(td, idx, lusec, bytes, io_u->offset);
 		io_u_mark_latency(td, lusec);
 	}
 
@@ -1607,10 +1601,12 @@
 	return remainder * 1000000 / bps + secs * 1000000;
 }
 
-static void io_completed(struct thread_data *td, struct io_u *io_u,
+static void io_completed(struct thread_data *td, struct io_u **io_u_ptr,
 			 struct io_completion_data *icd)
 {
-	struct fio_file *f;
+	struct io_u *io_u = *io_u_ptr;
+	enum fio_ddir ddir = io_u->ddir;
+	struct fio_file *f = io_u->file;
 
 	dprint_io_u(io_u, "io complete");
 
@@ -1635,9 +1631,8 @@
 
 	td_io_u_unlock(td);
 
-	if (ddir_sync(io_u->ddir)) {
+	if (ddir_sync(ddir)) {
 		td->last_was_sync = 1;
-		f = io_u->file;
 		if (f) {
 			f->first_write = -1ULL;
 			f->last_write = -1ULL;
@@ -1646,23 +1641,21 @@
 	}
 
 	td->last_was_sync = 0;
-	td->last_ddir = io_u->ddir;
+	td->last_ddir = ddir;
 
-	if (!io_u->error && ddir_rw(io_u->ddir)) {
+	if (!io_u->error && ddir_rw(ddir)) {
 		unsigned int bytes = io_u->buflen - io_u->resid;
-		const enum fio_ddir idx = io_u->ddir;
-		const enum fio_ddir odx = io_u->ddir ^ 1;
+		const enum fio_ddir oddir = ddir ^ 1;
 		int ret;
 
-		td->io_blocks[idx]++;
-		td->this_io_blocks[idx]++;
-		td->io_bytes[idx] += bytes;
+		td->io_blocks[ddir]++;
+		td->this_io_blocks[ddir]++;
+		td->io_bytes[ddir] += bytes;
 
 		if (!(io_u->flags & IO_U_F_VER_LIST))
-			td->this_io_bytes[idx] += bytes;
+			td->this_io_bytes[ddir] += bytes;
 
-		if (idx == DDIR_WRITE) {
-			f = io_u->file;
+		if (ddir == DDIR_WRITE) {
 			if (f) {
 				if (f->first_write == -1ULL ||
 				    io_u->offset < f->first_write)
@@ -1671,27 +1664,37 @@
 				    ((io_u->offset + bytes) > f->last_write))
 					f->last_write = io_u->offset + bytes;
 			}
+			if (td->last_write_comp) {
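+				/*
+				 * Track recently completed write offsets in
+				 * a ring sized to the queue depth.
+				 */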
+				int idx = td->last_write_idx++;
+
+				td->last_write_comp[idx] = io_u->offset;
+				if (td->last_write_idx == td->o.iodepth)
+					td->last_write_idx = 0;
+			}
 		}
 
 		if (ramp_time_over(td) && (td->runstate == TD_RUNNING ||
 					   td->runstate == TD_VERIFYING)) {
-			account_io_completion(td, io_u, icd, idx, bytes);
+			account_io_completion(td, io_u, icd, ddir, bytes);
 
-			if (__should_check_rate(td, idx)) {
-				td->rate_pending_usleep[idx] =
-					(usec_for_io(td, idx) -
+			if (__should_check_rate(td, ddir)) {
+				td->rate_pending_usleep[ddir] =
+					(usec_for_io(td, ddir) -
 					 utime_since_now(&td->start));
 			}
-			if (idx != DDIR_TRIM && __should_check_rate(td, odx))
-				td->rate_pending_usleep[odx] =
-					(usec_for_io(td, odx) -
+			if (ddir != DDIR_TRIM &&
+			    __should_check_rate(td, oddir)) {
+				td->rate_pending_usleep[oddir] =
+					(usec_for_io(td, oddir) -
 					 utime_since_now(&td->start));
+			}
 		}
 
-		icd->bytes_done[idx] += bytes;
+		icd->bytes_done[ddir] += bytes;
 
 		if (io_u->end_io) {
-			ret = io_u->end_io(td, io_u);
+			ret = io_u->end_io(td, io_u_ptr);
+			io_u = *io_u_ptr;
 			if (ret && !icd->error)
 				icd->error = ret;
 		}
@@ -1700,9 +1703,11 @@
 		io_u_log_error(td, io_u);
 	}
 	if (icd->error) {
-		enum error_type_bit eb = td_error_type(io_u->ddir, icd->error);
+		enum error_type_bit eb = td_error_type(ddir, icd->error);
+
 		if (!td_non_fatal_error(td, eb, icd->error))
 			return;
+
 		/*
 		 * If there is a non_fatal error, then add to the error count
 		 * and clear all the errors.
@@ -1710,7 +1715,8 @@
 		update_error_count(td, icd->error);
 		td_clear_error(td);
 		icd->error = 0;
-		io_u->error = 0;
+		if (io_u)
+			io_u->error = 0;
 	}
 }
 
@@ -1738,9 +1744,9 @@
 	for (i = 0; i < icd->nr; i++) {
 		io_u = td->io_ops->event(td, i);
 
-		io_completed(td, io_u, icd);
+		io_completed(td, &io_u, icd);
 
-		if (!(io_u->flags & IO_U_F_FREE_DEF))
+		if (io_u)
 			put_io_u(td, io_u);
 	}
 }
@@ -1754,9 +1760,9 @@
 	struct io_completion_data icd;
 
 	init_icd(td, &icd, 1);
-	io_completed(td, io_u, &icd);
+	io_completed(td, &io_u, &icd);
 
-	if (!(io_u->flags & IO_U_F_FREE_DEF))
+	if (io_u)
 		put_io_u(td, io_u);
 
 	if (icd.error) {
@@ -1789,6 +1795,8 @@
 
 	if (!min_evts)
 		tvp = &ts;
+	else if (min_evts > td->cur_depth)
+		min_evts = td->cur_depth;
 
 	ret = td_io_getevents(td, min_evts, td->o.iodepth_batch_complete, tvp);
 	if (ret < 0) {
@@ -1823,30 +1831,74 @@
 		unsigned long slat_time;
 
 		slat_time = utime_since(&io_u->start_time, &io_u->issue_time);
-		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen);
+		add_slat_sample(td, io_u->ddir, slat_time, io_u->xfer_buflen,
+				io_u->offset);
 	}
 }
 
+/*
+ * See if we should reuse the last seed, if dedupe is enabled
+ */
+static struct frand_state *get_buf_state(struct thread_data *td)
+{
+	unsigned int v;
+	unsigned long r;
+
+	if (!td->o.dedupe_percentage)
+		return &td->buf_state;
+	else if (td->o.dedupe_percentage == 100)
+		return &td->buf_state_prev;
+
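+	/*
+	 * Roll a percentage; on a dedupe hit, replay the previous buffer
+	 * state so the generated data repeats.
+	 */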
+	r = __rand(&td->dedupe_state);
+	v = 1 + (int) (100.0 * (r / (FRAND_MAX + 1.0)));
+
+	if (v <= td->o.dedupe_percentage)
+		return &td->buf_state_prev;
+
+	return &td->buf_state;
+}
+
+static void save_buf_state(struct thread_data *td, struct frand_state *rs)
+{
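+	/* remember the state we just used, so a dedupe hit can replay it */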
+	if (rs == &td->buf_state)
+		frand_copy(&td->buf_state_prev, rs);
+}
+
 void fill_io_buffer(struct thread_data *td, void *buf, unsigned int min_write,
 		    unsigned int max_bs)
 {
-	if (td->o.buffer_pattern_bytes)
-		fill_buffer_pattern(td, buf, max_bs);
-	else if (!td->o.zero_buffers) {
+	struct thread_options *o = &td->o;
+
+	if (o->compress_percentage || o->dedupe_percentage) {
 		unsigned int perc = td->o.compress_percentage;
+		struct frand_state *rs;
+		unsigned int left = max_bs;
 
-		if (perc) {
-			unsigned int seg = min_write;
+		do {
+			rs = get_buf_state(td);
 
-			seg = min(min_write, td->o.compress_chunk);
-			if (!seg)
-				seg = min_write;
+			min_write = min(min_write, left);
 
-			fill_random_buf_percentage(&td->buf_state, buf,
-						perc, seg, max_bs);
-		} else
-			fill_random_buf(&td->buf_state, buf, max_bs);
-	} else
+			if (perc) {
+				unsigned int seg = min_write;
+
+				seg = min(min_write, td->o.compress_chunk);
+				if (!seg)
+					seg = min_write;
+
+				fill_random_buf_percentage(rs, buf, perc, seg,
+					min_write, o->buffer_pattern,
+						   o->buffer_pattern_bytes);
+			} else
+				fill_random_buf(rs, buf, min_write);
+
+			buf += min_write;
+			left -= min_write;
+			save_buf_state(td, rs);
+		} while (left);
+	} else if (o->buffer_pattern_bytes)
+		fill_buffer_pattern(td, buf, max_bs);
+	else
 		memset(buf, 0, max_bs);
 }
 
diff --git a/io_u_queue.h b/io_u_queue.h
index 5b6cad0..bda40d5 100644
--- a/io_u_queue.h
+++ b/io_u_queue.h
@@ -12,8 +12,13 @@
 
 static inline struct io_u *io_u_qpop(struct io_u_queue *q)
 {
-	if (q->nr)
-		return q->io_us[--q->nr];
+	if (q->nr) {
+		const unsigned int next = --q->nr;
+		struct io_u *io_u = q->io_us[next];
+
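+		/* clear the slot, so a stale pointer can't be popped again */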
+		q->io_us[next] = NULL;
+		return io_u;
+	}
 
 	return NULL;
 }
@@ -23,7 +28,7 @@
 	q->io_us[q->nr++] = io_u;
 }
 
-static inline int io_u_qempty(struct io_u_queue *q)
+static inline int io_u_qempty(const struct io_u_queue *q)
 {
 	return !q->nr;
 }
diff --git a/ioengine.h b/ioengine.h
index 6e3c717..85923fc 100644
--- a/ioengine.h
+++ b/ioengine.h
@@ -15,12 +15,12 @@
 #include <guasi.h>
 #endif
 
-#define FIO_IOOPS_VERSION	18
+#define FIO_IOOPS_VERSION	21
 
 enum {
 	IO_U_F_FREE		= 1 << 0,
 	IO_U_F_FLIGHT		= 1 << 1,
-	IO_U_F_FREE_DEF		= 1 << 2,
+	IO_U_F_NO_FILE_PUT	= 1 << 2,
 	IO_U_F_IN_CUR_DEPTH	= 1 << 3,
 	IO_U_F_BUSY_OK		= 1 << 4,
 	IO_U_F_TRIMMED		= 1 << 5,
@@ -90,7 +90,7 @@
 	/*
 	 * Callback for io completion
 	 */
-	int (*end_io)(struct thread_data *, struct io_u *);
+	int (*end_io)(struct thread_data *, struct io_u **);
 
 	union {
 #ifdef CONFIG_LIBAIO
@@ -137,12 +137,14 @@
 	int (*prep)(struct thread_data *, struct io_u *);
 	int (*queue)(struct thread_data *, struct io_u *);
 	int (*commit)(struct thread_data *);
-	int (*getevents)(struct thread_data *, unsigned int, unsigned int, struct timespec *);
+	int (*getevents)(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
 	struct io_u *(*event)(struct thread_data *, int);
 	int (*cancel)(struct thread_data *, struct io_u *);
 	void (*cleanup)(struct thread_data *);
 	int (*open_file)(struct thread_data *, struct fio_file *);
 	int (*close_file)(struct thread_data *, struct fio_file *);
+	int (*invalidate)(struct thread_data *, struct fio_file *);
+	int (*unlink_file)(struct thread_data *, struct fio_file *);
 	int (*get_file_size)(struct thread_data *, struct fio_file *);
 	void (*terminate)(struct thread_data *);
 	int (*io_u_init)(struct thread_data *, struct io_u *);
@@ -165,6 +167,7 @@
 	FIO_BARRIER	= 1 << 8,	/* engine supports barriers */
 	FIO_MEMALIGN	= 1 << 9,	/* engine wants aligned memory */
 	FIO_BIT_BASED	= 1 << 10,	/* engine uses a bit base (e.g. uses Kbit as opposed to KB) */
+	FIO_FAKEIO	= 1 << 11,	/* engine pretends to do IO */
 };
 
 /*
@@ -179,10 +182,11 @@
 extern int __must_check td_io_prep(struct thread_data *, struct io_u *);
 extern int __must_check td_io_queue(struct thread_data *, struct io_u *);
 extern int __must_check td_io_sync(struct thread_data *, struct fio_file *);
-extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, struct timespec *);
+extern int __must_check td_io_getevents(struct thread_data *, unsigned int, unsigned int, const struct timespec *);
 extern int __must_check td_io_commit(struct thread_data *);
 extern int __must_check td_io_open_file(struct thread_data *, struct fio_file *);
 extern int td_io_close_file(struct thread_data *, struct fio_file *);
+extern int td_io_unlink_file(struct thread_data *, struct fio_file *);
 extern int __must_check td_io_get_file_size(struct thread_data *, struct fio_file *);
 
 extern struct ioengine_ops *load_ioengine(struct thread_data *, const char *);
@@ -211,10 +215,10 @@
 extern void io_u_fill_buffer(struct thread_data *td, struct io_u *, unsigned int, unsigned int);
 void io_u_mark_complete(struct thread_data *, unsigned int);
 void io_u_mark_submit(struct thread_data *, unsigned int);
-int queue_full(struct thread_data *);
+int queue_full(const struct thread_data *);
 
-int do_io_u_sync(struct thread_data *, struct io_u *);
-int do_io_u_trim(struct thread_data *, struct io_u *);
+int do_io_u_sync(const struct thread_data *, struct io_u *);
+int do_io_u_trim(const struct thread_data *, struct io_u *);
 
 #ifdef FIO_INC_DEBUG
 static inline void dprint_io_u(struct io_u *io_u, const char *p)
diff --git a/ioengines.c b/ioengines.c
index 0f94d0d..00098d6 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -220,7 +220,7 @@
 }
 
 int td_io_getevents(struct thread_data *td, unsigned int min, unsigned int max,
-		    struct timespec *t)
+		    const struct timespec *t)
 {
 	int r = 0;
 
@@ -294,13 +294,20 @@
 					sizeof(struct timeval));
 	}
 
-	if (ddir_rw(acct_ddir(io_u)))
+	if (ddir_rw(acct_ddir(io_u))) {
 		td->io_issues[acct_ddir(io_u)]++;
+		td->io_issue_bytes[acct_ddir(io_u)] += io_u->xfer_buflen;
+	}
 
 	ret = td->io_ops->queue(td, io_u);
 
 	unlock_file(td, io_u->file);
 
+	if (ret == FIO_Q_BUSY && ddir_rw(acct_ddir(io_u))) {
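+		/* the engine didn't accept this IO, undo the issue accounting */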
+		td->io_issues[acct_ddir(io_u)]--;
+		td->io_issue_bytes[acct_ddir(io_u)] -= io_u->xfer_buflen;
+	}
+
 	/*
 	 * If an error was seen and the io engine didn't propagate it
 	 * back to 'td', do so.
@@ -321,7 +328,7 @@
 			 "support direct IO, or iomem_align= is bad.\n");
 	}
 
-	if (!td->io_ops->commit || ddir_trim(io_u->ddir)) {
+	if (!td->io_ops->commit || io_u->ddir == DDIR_TRIM) {
 		io_u_mark_submit(td, 1);
 		io_u_mark_complete(td, 1);
 	}
@@ -506,6 +513,14 @@
 	return put_file(td, f);
 }
 
+int td_io_unlink_file(struct thread_data *td, struct fio_file *f)
+{
+	if (td->io_ops->unlink_file)
+		return td->io_ops->unlink_file(td, f);
+	else
+		return unlink(f->file_name);
+}
+
 int td_io_get_file_size(struct thread_data *td, struct fio_file *f)
 {
 	if (!td->io_ops->get_file_size)
@@ -514,7 +529,8 @@
 	return td->io_ops->get_file_size(td, f);
 }
 
-static int do_sync_file_range(struct thread_data *td, struct fio_file *f)
+static int do_sync_file_range(const struct thread_data *td,
+			      struct fio_file *f)
 {
 	off64_t offset, nbytes;
 
@@ -527,7 +543,7 @@
 	return sync_file_range(f->fd, offset, nbytes, td->o.sync_file_range);
 }
 
-int do_io_u_sync(struct thread_data *td, struct io_u *io_u)
+int do_io_u_sync(const struct thread_data *td, struct io_u *io_u)
 {
 	int ret;
 
@@ -553,7 +569,7 @@
 	return ret;
 }
 
-int do_io_u_trim(struct thread_data *td, struct io_u *io_u)
+int do_io_u_trim(const struct thread_data *td, struct io_u *io_u)
 {
 #ifndef FIO_HAVE_TRIM
 	io_u->error = EINVAL;
diff --git a/iolog.c b/iolog.c
index cac1aba..dfa329f 100644
--- a/iolog.c
+++ b/iolog.c
@@ -6,11 +6,19 @@
 #include <stdlib.h>
 #include <libgen.h>
 #include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <unistd.h>
+#ifdef CONFIG_ZLIB
+#include <zlib.h>
+#endif
+
 #include "flist.h"
 #include "fio.h"
 #include "verify.h"
 #include "trim.h"
 #include "filelock.h"
+#include "lib/tp.h"
 
 static const char iolog_ver2[] = "fio version 2 iolog";
 
@@ -20,19 +28,14 @@
 	td->total_io_size += ipo->len;
 }
 
-void log_io_u(struct thread_data *td, struct io_u *io_u)
+void log_io_u(const struct thread_data *td, const struct io_u *io_u)
 {
-	const char *act[] = { "read", "write", "sync", "datasync",
-				"sync_file_range", "wait", "trim" };
-
-	assert(io_u->ddir <= 6);
-
 	if (!td->o.write_iolog_file)
 		return;
 
 	fprintf(td->iolog_f, "%s %s %llu %lu\n", io_u->file->file_name,
-						act[io_u->ddir], io_u->offset,
-						io_u->buflen);
+						io_ddir_name(io_u->ddir),
+						io_u->offset, io_u->buflen);
 }
 
 void log_file(struct thread_data *td, struct fio_file *f,
@@ -57,20 +60,22 @@
 
 static void iolog_delay(struct thread_data *td, unsigned long delay)
 {
-	unsigned long usec = utime_since_now(&td->last_issue);
-	unsigned long this_delay;
+	uint64_t usec = utime_since_now(&td->last_issue);
+	uint64_t this_delay;
+	struct timeval tv;
 
+	if (delay < td->time_offset) {
+		td->time_offset = 0;
+		return;
+	}
+
+	delay -= td->time_offset;
 	if (delay < usec)
 		return;
 
 	delay -= usec;
 
-	/*
-	 * less than 100 usec delay, just regard it as noise
-	 */
-	if (delay < 100)
-		return;
-
+	fio_gettime(&tv, NULL);
 	while (delay && !td->terminate) {
 		this_delay = delay;
 		if (this_delay > 500000)
@@ -79,6 +84,12 @@
 		usec_sleep(td, this_delay);
 		delay -= this_delay;
 	}
+
+	usec = utime_since_now(&tv);
+	if (usec > delay)
+		td->time_offset = usec - delay;
+	else
+		td->time_offset = 0;
 }
 
 static int ipo_special(struct thread_data *td, struct io_piece *ipo)
@@ -105,7 +116,7 @@
 		td_io_close_file(td, f);
 		break;
 	case FIO_LOG_UNLINK_FILE:
-		unlink(f->file_name);
+		td_io_unlink_file(td, f);
 		break;
 	default:
 		log_err("fio: bad file action %d\n", ipo->file_action);
@@ -123,7 +134,7 @@
 	while (!flist_empty(&td->io_log_list)) {
 		int ret;
 
-		ipo = flist_entry(td->io_log_list.next, struct io_piece, list);
+		ipo = flist_first_entry(&td->io_log_list, struct io_piece, list);
 		flist_del(&ipo->list);
 		remove_trim_entry(td, ipo);
 
@@ -176,7 +187,7 @@
 	}
 
 	while (!flist_empty(&td->io_hist_list)) {
-		ipo = flist_entry(td->io_hist_list.next, struct io_piece, list);
+		ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list);
 		flist_del(&ipo->list);
 		remove_trim_entry(td, ipo);
 		td->io_hist_len--;
@@ -239,6 +250,7 @@
 	p = &td->io_hist_tree.rb_node;
 	parent = NULL;
 	while (*p) {
+		int overlap = 0;
 		parent = *p;
 
 		__ipo = rb_entry(parent, struct io_piece, rb_node);
@@ -246,11 +258,18 @@
 			p = &(*p)->rb_left;
 		else if (ipo->file > __ipo->file)
 			p = &(*p)->rb_right;
-		else if (ipo->offset < __ipo->offset)
+		else if (ipo->offset < __ipo->offset) {
 			p = &(*p)->rb_left;
-		else if (ipo->offset > __ipo->offset)
+			overlap = ipo->offset + ipo->len > __ipo->offset;
+		}
+		else if (ipo->offset > __ipo->offset) {
 			p = &(*p)->rb_right;
-		else {
+			overlap = __ipo->offset + __ipo->len > ipo->offset;
+		}
+		else
+			overlap = 1;
+
+		if (overlap) {
 			dprint(FD_IO, "iolog: overlap %llu/%lu, %llu/%lu",
 				__ipo->offset, __ipo->len,
 				ipo->offset, ipo->len);
@@ -285,7 +304,7 @@
 	td->io_hist_len--;
 }
 
-void trim_io_piece(struct thread_data *td, struct io_u *io_u)
+void trim_io_piece(struct thread_data *td, const struct io_u *io_u)
 {
 	struct io_piece *ipo = io_u->ipo;
 
@@ -410,6 +429,7 @@
 				td->o.max_bs[rw] = bytes;
 			ipo->fileno = fileno;
 			ipo->file_action = file_action;
+			td->o.size += bytes;
 		}
 
 		queue_io_piece(td, ipo);
@@ -538,16 +558,35 @@
 	return ret;
 }
 
-void setup_log(struct io_log **log, unsigned long avg_msec, int log_type)
+void setup_log(struct io_log **log, struct log_params *p,
+	       const char *filename)
 {
-	struct io_log *l = malloc(sizeof(*l));
+	struct io_log *l;
 
-	memset(l, 0, sizeof(*l));
+	l = calloc(1, sizeof(*l));
 	l->nr_samples = 0;
 	l->max_samples = 1024;
-	l->log_type = log_type;
-	l->log = malloc(l->max_samples * sizeof(struct io_sample));
-	l->avg_msec = avg_msec;
+	l->log_type = p->log_type;
+	l->log_offset = p->log_offset;
+	l->log_gz = p->log_gz;
+	l->log_gz_store = p->log_gz_store;
+	l->log = malloc(l->max_samples * log_entry_sz(l));
+	l->avg_msec = p->avg_msec;
+	l->filename = strdup(filename);
+	l->td = p->td;
+
+	if (l->log_offset)
+		l->log_ddir_mask = LOG_OFFSET_SAMPLE_BIT;
+
+	INIT_FLIST_HEAD(&l->chunk_list);
+
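+	/* log compression needs an owning thread_data to queue work against */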
+	if (l->log_gz && !p->td)
+		l->log_gz = 0;
+	else if (l->log_gz) {
+		pthread_mutex_init(&l->chunk_lock, NULL);
+		p->td->flags |= TD_F_COMPRESS_LOG;
+	}
+
 	*log = l;
 }
 
@@ -577,13 +616,329 @@
 }
 #endif
 
-void __finish_log(struct io_log *log, const char *name)
+void free_log(struct io_log *log)
 {
-	unsigned int i;
+	free(log->log);
+	free(log->filename);
+	free(log);
+}
+
+static void flush_samples(FILE *f, void *samples, uint64_t sample_size)
+{
+	struct io_sample *s;
+	int log_offset;
+	uint64_t i, nr_samples;
+
+	if (!sample_size)
+		return;
+
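+	/* the first sample's ddir flags tell us whether offsets were logged */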
+	s = __get_sample(samples, 0, 0);
+	log_offset = (s->__ddir & LOG_OFFSET_SAMPLE_BIT) != 0;
+
+	nr_samples = sample_size / __log_entry_sz(log_offset);
+
+	for (i = 0; i < nr_samples; i++) {
+		s = __get_sample(samples, log_offset, i);
+
+		if (!log_offset) {
+			fprintf(f, "%lu, %lu, %u, %u\n",
+					(unsigned long) s->time,
+					(unsigned long) s->val,
+					io_sample_ddir(s), s->bs);
+		} else {
+			struct io_sample_offset *so = (void *) s;
+
+			fprintf(f, "%lu, %lu, %u, %u, %llu\n",
+					(unsigned long) s->time,
+					(unsigned long) s->val,
+					io_sample_ddir(s), s->bs,
+					(unsigned long long) so->offset);
+		}
+	}
+}
+
+#ifdef CONFIG_ZLIB
+
+struct iolog_flush_data {
+	struct tp_work work;
+	struct io_log *log;
+	void *samples;
+	uint64_t nr_samples;
+};
+
+struct iolog_compress {
+	struct flist_head list;
+	void *buf;
+	size_t len;
+	unsigned int seq;
+};
+
+#define GZ_CHUNK	131072
+
+static struct iolog_compress *get_new_chunk(unsigned int seq)
+{
+	struct iolog_compress *c;
+
+	c = malloc(sizeof(*c));
+	INIT_FLIST_HEAD(&c->list);
+	c->buf = malloc(GZ_CHUNK);
+	c->len = 0;
+	c->seq = seq;
+	return c;
+}
+
+static void free_chunk(struct iolog_compress *ic)
+{
+	free(ic->buf);
+	free(ic);
+}
+
+static int z_stream_init(z_stream *stream, int gz_hdr)
+{
+	int wbits = 15;
+
+	stream->zalloc = Z_NULL;
+	stream->zfree = Z_NULL;
+	stream->opaque = Z_NULL;
+	stream->next_in = Z_NULL;
+
+	/*
+	 * zlib magic - add 32 for auto-detection of gz header or not,
+	 * if we decide to store files in a gzip friendly format.
+	 */
+	if (gz_hdr)
+		wbits += 32;
+
+	if (inflateInit2(stream, wbits) != Z_OK)
+		return 1;
+
+	return 0;
+}
+
+struct inflate_chunk_iter {
+	unsigned int seq;
+	int err;
+	void *buf;
+	size_t buf_size;
+	size_t buf_used;
+	size_t chunk_sz;
+};
+
+static void finish_chunk(z_stream *stream, FILE *f,
+			 struct inflate_chunk_iter *iter)
+{
+	int ret;
+
+	ret = inflateEnd(stream);
+	if (ret != Z_OK)
+		log_err("fio: failed to end log inflation (%d)\n", ret);
+
+	flush_samples(f, iter->buf, iter->buf_used);
+	free(iter->buf);
+	iter->buf = NULL;
+	iter->buf_size = iter->buf_used = 0;
+}
+
+/*
+ * Iterative chunk inflation. Handles cases where we cross into a new
+ * sequence, flushing and finishing the previous chunk if needed.
+ */
+static size_t inflate_chunk(struct iolog_compress *ic, int gz_hdr, FILE *f,
+			    z_stream *stream, struct inflate_chunk_iter *iter)
+{
+	size_t ret;
+
+	dprint(FD_COMPRESS, "inflate chunk size=%lu, seq=%u\n",
+				(unsigned long) ic->len, ic->seq);
+
+	if (ic->seq != iter->seq) {
+		if (iter->seq)
+			finish_chunk(stream, f, iter);
+
+		z_stream_init(stream, gz_hdr);
+		iter->seq = ic->seq;
+	}
+
+	stream->avail_in = ic->len;
+	stream->next_in = ic->buf;
+
+	if (!iter->buf_size) {
+		iter->buf_size = iter->chunk_sz;
+		iter->buf = malloc(iter->buf_size);
+	}
+
+	while (stream->avail_in) {
+		size_t this_out = iter->buf_size - iter->buf_used;
+		int err;
+
+		stream->avail_out = this_out;
+		stream->next_out = iter->buf + iter->buf_used;
+
+		err = inflate(stream, Z_NO_FLUSH);
+		if (err < 0) {
+			log_err("fio: failed inflating log: %d\n", err);
+			iter->err = err;
+			break;
+		}
+
+		iter->buf_used += this_out - stream->avail_out;
+
+		if (!stream->avail_out) {
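+			/* output buffer full, grow it and keep inflating */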
+			iter->buf_size += iter->chunk_sz;
+			iter->buf = realloc(iter->buf, iter->buf_size);
+			continue;
+		}
+
+		if (err == Z_STREAM_END)
+			break;
+	}
+
+	ret = (void *) stream->next_in - ic->buf;
+
+	dprint(FD_COMPRESS, "inflated to size=%lu\n", (unsigned long) ret);
+
+	return ret;
+}
+
+/*
+ * Inflate stored compressed chunks, or write them directly to the log
+ * file if so instructed.
+ */
+static int inflate_gz_chunks(struct io_log *log, FILE *f)
+{
+	struct inflate_chunk_iter iter = { .chunk_sz = log->log_gz, };
+	z_stream stream;
+
+	while (!flist_empty(&log->chunk_list)) {
+		struct iolog_compress *ic;
+
+		ic = flist_first_entry(&log->chunk_list, struct iolog_compress, list);
+		flist_del(&ic->list);
+
+		if (log->log_gz_store) {
+			size_t ret;
+
+			dprint(FD_COMPRESS, "log write chunk size=%lu, "
+				"seq=%u\n", (unsigned long) ic->len, ic->seq);
+
+			ret = fwrite(ic->buf, ic->len, 1, f);
+			if (ret != 1 || ferror(f)) {
+				iter.err = errno;
+				log_err("fio: error writing compressed log\n");
+			}
+		} else
+			inflate_chunk(ic, log->log_gz_store, f, &stream, &iter);
+
+		free_chunk(ic);
+	}
+
+	if (iter.seq) {
+		finish_chunk(&stream, f, &iter);
+		free(iter.buf);
+	}
+
+	return iter.err;
+}
+
+/*
+ * Open a compressed log file, decompress the stored chunks, and
+ * write them to stdout. The chunks are stored sequentially in the
+ * file, so we iterate over them and inflate them one-by-one.
+ */
+int iolog_file_inflate(const char *file)
+{
+	struct inflate_chunk_iter iter = { .chunk_sz = 64 * 1024 * 1024, };
+	struct iolog_compress ic;
+	z_stream stream;
+	struct stat sb;
+	ssize_t ret;
+	size_t total;
 	void *buf;
 	FILE *f;
 
-	f = fopen(name, "a");
+	f = fopen(file, "r");
+	if (!f) {
+		perror("fopen");
+		return 1;
+	}
+
+	if (stat(file, &sb) < 0) {
+		fclose(f);
+		perror("stat");
+		return 1;
+	}
+
+	ic.buf = buf = malloc(sb.st_size);
+	ic.len = sb.st_size;
+	ic.seq = 1;
+
+	ret = fread(ic.buf, ic.len, 1, f);
+	if (ret < 0) {
+		perror("fread");
+		fclose(f);
+		free(buf);
+		return 1;
+	} else if (ret != 1) {
+		log_err("fio: short read on reading log\n");
+		fclose(f);
+		free(buf);
+		return 1;
+	}
+
+	fclose(f);
+
+	/*
+	 * Each chunk will return Z_STREAM_END. We don't know how many
+	 * chunks are in the file, so we just keep looping and incrementing
+	 * the sequence number until we have consumed the whole compressed
+	 * file.
+	 */
+	total = ic.len;
+	do {
+		size_t iret;
+
+		iret = inflate_chunk(&ic, 1, stdout, &stream, &iter);
+		total -= iret;
+		if (!total)
+			break;
+		if (iter.err)
+			break;
+
+		ic.seq++;
+		ic.len -= iret;
+		ic.buf += iret;
+	} while (1);
+
+	if (iter.seq) {
+		finish_chunk(&stream, stdout, &iter);
+		free(iter.buf);
+	}
+
+	free(buf);
+	return iter.err;
+}
+
+#else
+
+static int inflate_gz_chunks(struct io_log *log, FILE *f)
+{
+	return 0;
+}
+
+int iolog_file_inflate(const char *file)
+{
+	log_err("fio: log inflation not possible without zlib\n");
+	return 1;
+}
+
+#endif
+
+void flush_log(struct io_log *log)
+{
+	void *buf;
+	FILE *f;
+
+	f = fopen(log->filename, "w");
 	if (!f) {
 		perror("fopen log");
 		return;
@@ -591,99 +946,248 @@
 
 	buf = set_file_buffer(f);
 
-	for (i = 0; i < log->nr_samples; i++) {
-		fprintf(f, "%lu, %lu, %u, %u\n",
-				(unsigned long) log->log[i].time,
-				(unsigned long) log->log[i].val,
-				log->log[i].ddir, log->log[i].bs);
-	}
+	inflate_gz_chunks(log, f);
+
+	flush_samples(f, log->log, log->nr_samples * log_entry_sz(log));
 
 	fclose(f);
 	clear_file_buffer(buf);
-	free(log->log);
-	free(log);
 }
 
-static int finish_log_named(struct thread_data *td, struct io_log *log,
-			    const char *prefix, const char *postfix,
-			    int trylock)
+static int finish_log(struct thread_data *td, struct io_log *log, int trylock)
 {
-	char file_name[256];
-
-	snprintf(file_name, sizeof(file_name), "%s_%s.log", prefix, postfix);
+	if (td->tp_data)
+		iolog_flush(log, 1);
 
 	if (trylock) {
-		if (fio_trylock_file(file_name))
+		if (fio_trylock_file(log->filename))
 			return 1;
 	} else
-		fio_lock_file(file_name);
+		fio_lock_file(log->filename);
 
-	if (td->client_type == FIO_CLIENT_TYPE_GUI) {
-		fio_send_iolog(td, log, file_name);
-		free(log->log);
-		free(log);
-	} else
-		__finish_log(log, file_name);
+	if (td->client_type == FIO_CLIENT_TYPE_GUI)
+		fio_send_iolog(td, log, log->filename);
+	else
+		flush_log(log);
 
-	fio_unlock_file(file_name);
+	fio_unlock_file(log->filename);
+	free_log(log);
 	return 0;
 }
 
-static int finish_log(struct thread_data *td, struct io_log *log,
-		      const char *name, int trylock)
+#ifdef CONFIG_ZLIB
+
+/*
+ * Invoked from our compress helper thread, when logging would have exceeded
+ * the specified memory limit. Compresses the previously stored
+ * entries.
+ */
+static int gz_work(struct tp_work *work)
 {
-	return finish_log_named(td, log, td->o.name, name, trylock);
+	struct iolog_flush_data *data;
+	struct iolog_compress *c;
+	struct flist_head list;
+	unsigned int seq;
+	z_stream stream;
+	size_t total = 0;
+	int ret;
+
+	INIT_FLIST_HEAD(&list);
+
+	data = container_of(work, struct iolog_flush_data, work);
+
+	stream.zalloc = Z_NULL;
+	stream.zfree = Z_NULL;
+	stream.opaque = Z_NULL;
+
+	ret = deflateInit(&stream, Z_DEFAULT_COMPRESSION);
+	if (ret != Z_OK) {
+		log_err("fio: failed to init gz stream\n");
+		return 0;
+	}
+
+	seq = ++data->log->chunk_seq;
+
+	stream.next_in = (void *) data->samples;
+	stream.avail_in = data->nr_samples * log_entry_sz(data->log);
+
+	dprint(FD_COMPRESS, "deflate input size=%lu, seq=%u\n",
+				(unsigned long) stream.avail_in, seq);
+	do {
+		c = get_new_chunk(seq);
+		stream.avail_out = GZ_CHUNK;
+		stream.next_out = c->buf;
+		ret = deflate(&stream, Z_NO_FLUSH);
+		if (ret < 0) {
+			log_err("fio: deflate log (%d)\n", ret);
+			free_chunk(c);
+			goto err;
+		}
+
+		c->len = GZ_CHUNK - stream.avail_out;
+		flist_add_tail(&c->list, &list);
+		total += c->len;
+	} while (stream.avail_in);
+
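+	/*
+	 * Finish the stream. Z_FINISH can still produce output, possibly
+	 * spilling into additional chunks.
+	 */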
+	stream.next_out = c->buf + c->len;
+	stream.avail_out = GZ_CHUNK - c->len;
+
+	ret = deflate(&stream, Z_FINISH);
+	if (ret == Z_STREAM_END)
+		c->len = GZ_CHUNK - stream.avail_out;
+	else {
+		do {
+			c = get_new_chunk(seq);
+			stream.avail_out = GZ_CHUNK;
+			stream.next_out = c->buf;
+			ret = deflate(&stream, Z_FINISH);
+			c->len = GZ_CHUNK - stream.avail_out;
+			total += c->len;
+			flist_add_tail(&c->list, &list);
+		} while (ret != Z_STREAM_END);
+	}
+
+	dprint(FD_COMPRESS, "deflated to size=%lu\n", (unsigned long) total);
+
+	ret = deflateEnd(&stream);
+	if (ret != Z_OK)
+		log_err("fio: deflateEnd %d\n", ret);
+
+	free(data->samples);
+
+	if (!flist_empty(&list)) {
+		pthread_mutex_lock(&data->log->chunk_lock);
+		flist_splice_tail(&list, &data->log->chunk_list);
+		pthread_mutex_unlock(&data->log->chunk_lock);
+	}
+
+	ret = 0;
+done:
+	if (work->wait) {
+		work->done = 1;
+		pthread_cond_signal(&work->cv);
+	} else
+		free(data);
+
+	return ret;
+err:
+	while (!flist_empty(&list)) {
+		c = flist_first_entry(&list, struct iolog_compress, list);
+		flist_del(&c->list);
+		free_chunk(c);
+	}
+	ret = 1;
+	goto done;
 }
 
-static int write_this_log(struct thread_data *td, struct io_log *log,
-			  const char *log_file, const char *name, int try)
+/*
+ * Queue work item to compress the existing log entries. We copy the
+ * samples, and reset the log sample count to 0 (so the logging will
+ * continue to use the memory associated with the log). If called with
+ * wait == 1, will not return until the log compression has completed.
+ */
+int iolog_flush(struct io_log *log, int wait)
 {
-	int ret;
+	struct tp_data *tdat = log->td->tp_data;
+	struct iolog_flush_data *data;
+	size_t sample_size;
+
+	data = malloc(sizeof(*data));
+	if (!data)
+		return 1;
+
+	data->log = log;
+
+	sample_size = log->nr_samples * log_entry_sz(log);
+	data->samples = malloc(sample_size);
+	if (!data->samples) {
+		free(data);
+		return 1;
+	}
+
+	memcpy(data->samples, log->log, sample_size);
+	data->nr_samples = log->nr_samples;
+	data->work.fn = gz_work;
+	log->nr_samples = 0;
+
+	if (wait) {
+		pthread_mutex_init(&data->work.lock, NULL);
+		pthread_cond_init(&data->work.cv, NULL);
+		data->work.wait = 1;
+	} else
+		data->work.wait = 0;
+
+	data->work.prio = 1;
+	tp_queue_work(tdat, &data->work);
+
+	if (wait) {
+		pthread_mutex_lock(&data->work.lock);
+		while (!data->work.done)
+			pthread_cond_wait(&data->work.cv, &data->work.lock);
+		pthread_mutex_unlock(&data->work.lock);
+		free(data);
+	}
+
+	return 0;
+}
+
+#else
+
+int iolog_flush(struct io_log *log, int wait)
+{
+	return 1;
+}
+
+#endif
+
+static int write_iops_log(struct thread_data *td, int try)
+{
+	struct io_log *log = td->iops_log;
 
 	if (!log)
 		return 0;
 
-	if (log_file)
-		ret = finish_log_named(td, log, log_file, name, try);
-	else
-		ret = finish_log(td, log, name, try);
-
-	return ret;
-}
-
-static int write_iops_log(struct thread_data *td, int try)
-{
-	struct thread_options *o = &td->o;
-
-	return write_this_log(td, td->iops_log, o->iops_log_file, "iops", try);
+	return finish_log(td, log, try);
 }
 
 static int write_slat_log(struct thread_data *td, int try)
 {
-	struct thread_options *o = &td->o;
+	struct io_log *log = td->slat_log;
 
-	return write_this_log(td, td->slat_log, o->lat_log_file, "slat", try);
+	if (!log)
+		return 0;
+
+	return finish_log(td, log, try);
 }
 
 static int write_clat_log(struct thread_data *td, int try)
 {
-	struct thread_options *o = &td->o;
+	struct io_log *log = td->clat_log;
 
-	return write_this_log(td, td->clat_log, o->lat_log_file, "clat" , try);
+	if (!log)
+		return 0;
+
+	return finish_log(td, log, try);
 }
 
 static int write_lat_log(struct thread_data *td, int try)
 {
-	struct thread_options *o = &td->o;
+	struct io_log *log = td->lat_log;
 
-	return write_this_log(td, td->lat_log, o->lat_log_file, "lat", try);
+	if (!log)
+		return 0;
+
+	return finish_log(td, log, try);
 }
 
 static int write_bandw_log(struct thread_data *td, int try)
 {
-	struct thread_options *o = &td->o;
+	struct io_log *log = td->bw_log;
 
-	return write_this_log(td, td->bw_log, o->bw_log_file, "bw", try);
+	if (!log)
+		return 0;
+
+	return finish_log(td, log, try);
 }
 
 enum {
diff --git a/iolog.h b/iolog.h
index 3af5668..a1e32ae 100644
--- a/iolog.h
+++ b/iolog.h
@@ -24,10 +24,15 @@
 struct io_sample {
 	uint64_t time;
 	uint64_t val;
-	uint32_t ddir;
+	uint32_t __ddir;
 	uint32_t bs;
 };
 
+struct io_sample_offset {
+	struct io_sample s;
+	uint64_t offset;
+};
+
 enum {
 	IO_LOG_TYPE_LAT = 1,
 	IO_LOG_TYPE_CLAT,
@@ -43,9 +48,15 @@
 	/*
 	 * Entries already logged
 	 */
-	unsigned long nr_samples;
-	unsigned long max_samples;
-	struct io_sample *log;
+	uint64_t nr_samples;
+	uint64_t max_samples;
+	void *log;
+
+	unsigned int log_ddir_mask;
+
+	char *filename;
+
+	struct thread_data *td;
 
 	unsigned int log_type;
 
@@ -55,14 +66,71 @@
 	unsigned int disabled;
 
 	/*
+	 * Log offsets
+	 */
+	unsigned int log_offset;
+
+	/*
+	 * Max size of log entries before a chunk is compressed
+	 */
+	unsigned int log_gz;
+
+	/*
+	 * Don't deflate for storing, just store the compressed bits
+	 */
+	unsigned int log_gz_store;
+
+	/*
 	 * Windowed average, for logging single entries average over some
 	 * period of time.
 	 */
 	struct io_stat avg_window[DDIR_RWDIR_CNT];
 	unsigned long avg_msec;
 	unsigned long avg_last;
+
+	pthread_mutex_t chunk_lock;
+	unsigned int chunk_seq;
+	struct flist_head chunk_list;
 };
 
+/*
+ * If the upper bit is set, then we have the offset as well
+ */
+#define LOG_OFFSET_SAMPLE_BIT	0x80000000U
+#define io_sample_ddir(io)	((io)->__ddir & ~LOG_OFFSET_SAMPLE_BIT)
+
+static inline void io_sample_set_ddir(struct io_log *log,
+				      struct io_sample *io,
+				      enum fio_ddir ddir)
+{
+	io->__ddir = ddir | log->log_ddir_mask;
+}
+
+static inline size_t __log_entry_sz(int log_offset)
+{
+	if (log_offset)
+		return sizeof(struct io_sample_offset);
+	else
+		return sizeof(struct io_sample);
+}
+
+static inline size_t log_entry_sz(struct io_log *log)
+{
+	return __log_entry_sz(log->log_offset);
+}
+
+static inline struct io_sample *__get_sample(void *samples, int log_offset,
+					     uint64_t sample)
+{
+	return samples + sample * __log_entry_sz(log_offset);
+}
+
+static inline struct io_sample *get_sample(struct io_log *iolog,
+					   uint64_t sample)
+{
+	return __get_sample(iolog->log, iolog->log_offset, sample);
+}
+
 enum {
 	IP_F_ONRB	= 1,
 	IP_F_ONLIST	= 2,
@@ -106,38 +174,54 @@
 
 struct io_u;
 extern int __must_check read_iolog_get(struct thread_data *, struct io_u *);
-extern void log_io_u(struct thread_data *, struct io_u *);
+extern void log_io_u(const struct thread_data *, const struct io_u *);
 extern void log_file(struct thread_data *, struct fio_file *, enum file_log_act);
 extern int __must_check init_iolog(struct thread_data *td);
 extern void log_io_piece(struct thread_data *, struct io_u *);
 extern void unlog_io_piece(struct thread_data *, struct io_u *);
-extern void trim_io_piece(struct thread_data *, struct io_u *);
+extern void trim_io_piece(struct thread_data *, const struct io_u *);
 extern void queue_io_piece(struct thread_data *, struct io_piece *);
 extern void prune_io_piece_log(struct thread_data *);
 extern void write_iolog_close(struct thread_data *);
 
+#ifdef CONFIG_ZLIB
+extern int iolog_file_inflate(const char *);
+#endif
+
 /*
  * Logging
  */
+struct log_params {
+	struct thread_data *td;
+	unsigned long avg_msec;
+	int log_type;
+	int log_offset;
+	int log_gz;
+	int log_gz_store;
+	int log_compress;
+};
+
 extern void finalize_logs(struct thread_data *td);
 extern void add_lat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int);
+				unsigned int, uint64_t);
 extern void add_clat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int);
+				unsigned int, uint64_t);
 extern void add_slat_sample(struct thread_data *, enum fio_ddir, unsigned long,
-				unsigned int);
+				unsigned int, uint64_t);
 extern void add_bw_sample(struct thread_data *, enum fio_ddir, unsigned int,
 				struct timeval *);
 extern void add_iops_sample(struct thread_data *, enum fio_ddir, unsigned int,
 				struct timeval *);
 extern void init_disk_util(struct thread_data *);
 extern void update_rusage_stat(struct thread_data *);
-extern void setup_log(struct io_log **, unsigned long, int);
-extern void __finish_log(struct io_log *, const char *);
+extern void setup_log(struct io_log **, struct log_params *, const char *);
+extern void flush_log(struct io_log *);
+extern void free_log(struct io_log *);
 extern struct io_log *agg_io_log[DDIR_RWDIR_CNT];
 extern int write_bw_log;
 extern void add_agg_sample(unsigned long, enum fio_ddir, unsigned int);
 extern void fio_writeout_logs(struct thread_data *);
+extern int iolog_flush(struct io_log *, int);
 
 static inline void init_ipo(struct io_piece *ipo)
 {
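For reference, the sample accessors added to iolog.h above pack an optional offset behind each struct io_sample and reuse the top bit of __ddir as a flag. A minimal sketch of how a consumer might walk such a log follows; iterate_samples() is a hypothetical helper, not part of this change:

/*
 * Illustrative only: walk every logged sample, honoring the variable
 * entry size and the ddir flag bit.
 */
static void iterate_samples(struct io_log *log)
{
	uint64_t i;

	for (i = 0; i < log->nr_samples; i++) {
		struct io_sample *s = get_sample(log, i);
		enum fio_ddir ddir = io_sample_ddir(s);

		if (log->log_offset) {
			struct io_sample_offset *so = (void *) s;

			/* so->offset holds the IO offset of this sample */
			(void) so->offset;
		}
		(void) ddir;
	}
}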
diff --git a/json.c b/json.c
index 7480a61..6145ee4 100644
--- a/json.c
+++ b/json.c
@@ -8,18 +8,12 @@
 
 struct json_object *json_create_object(void)
 {
-	struct json_object *obj = malloc(sizeof(struct json_object));
-	if (obj)
-		memset(obj, 0, sizeof(struct json_object));
-	return obj;
+	return calloc(1, sizeof(struct json_object));
 }
 
 struct json_array *json_create_array(void)
 {
-	struct json_array *array = malloc(sizeof(struct json_array));
-	if (array)
-		memset(array, 0, sizeof(struct json_array));
-	return array;
+	return calloc(1, sizeof(struct json_array));
 }
 
 static struct json_pair *json_create_pair(const char *name, struct json_value *value)
diff --git a/json.h b/json.h
index 081afd6..962c11c 100644
--- a/json.h
+++ b/json.h
@@ -52,7 +52,7 @@
 
 int json_object_add_value_type(struct json_object *obj, const char *name, int type, ...);
 #define json_object_add_value_int(obj, name, val) \
-	json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (val))
+	json_object_add_value_type((obj), name, JSON_TYPE_INTEGER, (long long) (val))
 #define json_object_add_value_float(obj, name, val) \
 	json_object_add_value_type((obj), name, JSON_TYPE_FLOAT, (val))
 #define json_object_add_value_string(obj, name, val) \
diff --git a/lib/axmap.c b/lib/axmap.c
index 15cd635..9153df5 100644
--- a/lib/axmap.c
+++ b/lib/axmap.c
@@ -22,7 +22,6 @@
 
 #include "../arch/arch.h"
 #include "axmap.h"
-#include "../smalloc.h"
 #include "../minmax.h"
 
 #if BITS_PER_LONG == 64
@@ -33,7 +32,7 @@
 #error "Number of arch bits unknown"
 #endif
 
-#define BLOCKS_PER_UNIT		(1UL << UNIT_SHIFT)
+#define BLOCKS_PER_UNIT		(1U << UNIT_SHIFT)
 #define BLOCKS_PER_UNIT_MASK	(BLOCKS_PER_UNIT - 1)
 
 #define firstfree_valid(b)	((b)->first_free != (uint64_t) -1)
@@ -80,10 +79,10 @@
 		return;
 
 	for (i = 0; i < axmap->nr_levels; i++)
-		sfree(axmap->levels[i].map);
+		free(axmap->levels[i].map);
 
-	sfree(axmap->levels);
-	sfree(axmap);
+	free(axmap->levels);
+	free(axmap);
 }
 
 struct axmap *axmap_new(unsigned long nr_bits)
@@ -91,7 +90,7 @@
 	struct axmap *axmap;
 	unsigned int i, levels;
 
-	axmap = smalloc(sizeof(*axmap));
+	axmap = malloc(sizeof(*axmap));
 	if (!axmap)
 		return NULL;
 
@@ -103,7 +102,7 @@
 	}
 
 	axmap->nr_levels = levels;
-	axmap->levels = smalloc(axmap->nr_levels * sizeof(struct axmap_level));
+	axmap->levels = malloc(axmap->nr_levels * sizeof(struct axmap_level));
 	axmap->nr_bits = nr_bits;
 
 	for (i = 0; i < axmap->nr_levels; i++) {
@@ -111,7 +110,7 @@
 
 		al->level = i;
 		al->map_size = (nr_bits + BLOCKS_PER_UNIT - 1) >> UNIT_SHIFT;
-		al->map = smalloc(al->map_size * sizeof(unsigned long));
+		al->map = malloc(al->map_size * sizeof(unsigned long));
 		if (!al->map)
 			goto err;
 
@@ -123,9 +122,10 @@
 err:
 	for (i = 0; i < axmap->nr_levels; i++)
 		if (axmap->levels[i].map)
-			sfree(axmap->levels[i].map);
+			free(axmap->levels[i].map);
 
-	sfree(axmap->levels);
+	free(axmap->levels);
+	free(axmap);
 	return NULL;
 }
 
@@ -369,7 +369,7 @@
 	return (uint64_t) -1ULL;
 }
 
-uint64_t axmap_first_free(struct axmap *axmap)
+static uint64_t axmap_first_free(struct axmap *axmap)
 {
 	if (firstfree_valid(axmap))
 		return axmap->first_free;
diff --git a/lib/axmap.h b/lib/axmap.h
index edfeba8..3705a1d 100644
--- a/lib/axmap.h
+++ b/lib/axmap.h
@@ -11,7 +11,6 @@
 void axmap_set(struct axmap *axmap, uint64_t bit_nr);
 unsigned int axmap_set_nr(struct axmap *axmap, uint64_t bit_nr, unsigned int nr_bits);
 int axmap_isset(struct axmap *axmap, uint64_t bit_nr);
-uint64_t axmap_first_free(struct axmap *axmap);
 uint64_t axmap_next_free(struct axmap *axmap, uint64_t bit_nr);
 void axmap_reset(struct axmap *axmap);
 
diff --git a/lib/bloom.c b/lib/bloom.c
new file mode 100644
index 0000000..ee4ba0b
--- /dev/null
+++ b/lib/bloom.c
@@ -0,0 +1,119 @@
+#include <stdlib.h>
+#include <inttypes.h>
+
+#include "bloom.h"
+#include "../hash.h"
+#include "../minmax.h"
+#include "../crc/xxhash.h"
+#include "../crc/murmur3.h"
+#include "../crc/crc32c.h"
+#include "../crc/fnv.h"
+
+struct bloom {
+	uint64_t nentries;
+
+	uint32_t *map;
+};
+
+#define BITS_PER_INDEX	(sizeof(uint32_t) * 8)
+#define BITS_INDEX_MASK	(BITS_PER_INDEX - 1)
+
+struct bloom_hash {
+	unsigned int seed;
+	uint32_t (*fn)(const void *, uint32_t, uint32_t);
+};
+
+static uint32_t bloom_crc32c(const void *buf, uint32_t len, uint32_t seed)
+{
+	return fio_crc32c(buf, len);
+}
+
+static uint32_t bloom_fnv(const void *buf, uint32_t len, uint32_t seed)
+{
+	return fnv(buf, len, seed);
+}
+
+#define BLOOM_SEED	0x8989
+
+struct bloom_hash hashes[] = {
+	{
+		.seed = BLOOM_SEED,
+		.fn = jhash,
+	},
+	{
+		.seed = BLOOM_SEED,
+		.fn = XXH32,
+	},
+	{
+		.seed = BLOOM_SEED,
+		.fn = murmurhash3,
+	},
+	{
+		.seed = BLOOM_SEED,
+		.fn = bloom_crc32c,
+	},
+	{
+		.seed = BLOOM_SEED,
+		.fn = bloom_fnv,
+	},
+};
+
+#define N_HASHES	5
+
+#define MIN_ENTRIES	1073741824UL
+
+struct bloom *bloom_new(uint64_t entries)
+{
+	struct bloom *b;
+	size_t no_uints;
+
+	crc32c_intel_probe();
+
+	b = malloc(sizeof(*b));
+	b->nentries = entries;
+	no_uints = (entries + BITS_PER_INDEX - 1) / BITS_PER_INDEX;
+	no_uints = max((unsigned long) no_uints, MIN_ENTRIES);
+	b->map = calloc(no_uints, sizeof(uint32_t));
+	if (!b->map) {
+		free(b);
+		return NULL;
+	}
+
+	return b;
+}
+
+void bloom_free(struct bloom *b)
+{
+	free(b->map);
+	free(b);
+}
+
+static int __bloom_check(struct bloom *b, uint32_t *data, unsigned int nwords,
+			 int set)
+{
+	uint32_t hash[N_HASHES];
+	int i, was_set;
+
+	for (i = 0; i < N_HASHES; i++) {
+		hash[i] = hashes[i].fn(data, nwords, hashes[i].seed);
+		hash[i] = hash[i] % b->nentries;
+	}
+
+	was_set = 0;
+	for (i = 0; i < N_HASHES; i++) {
+		const unsigned int index = hash[i] / BITS_PER_INDEX;
+		const unsigned int bit = hash[i] & BITS_INDEX_MASK;
+
+		if (b->map[index] & (1U << bit))
+			was_set++;
+		if (set)
+			b->map[index] |= 1U << bit;
+	}
+
+	return was_set == N_HASHES;
+}
+
+int bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords)
+{
+	return __bloom_check(b, data, nwords, 1);
+}
diff --git a/lib/bloom.h b/lib/bloom.h
new file mode 100644
index 0000000..127ed9b
--- /dev/null
+++ b/lib/bloom.h
@@ -0,0 +1,12 @@
+#ifndef FIO_BLOOM_H
+#define FIO_BLOOM_H
+
+#include <inttypes.h>
+
+struct bloom;
+
+struct bloom *bloom_new(uint64_t entries);
+void bloom_free(struct bloom *b);
+int bloom_set(struct bloom *b, uint32_t *data, unsigned int nwords);
+
+#endif
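For reference, bloom_set() doubles as a membership probe: it returns non-zero only when every hash bit for the input was already set. A minimal usage sketch, with hypothetical caller names that are not part of this change:

/*
 * Illustrative only: a non-zero return means this block of 32-bit
 * words has probably been seen before.
 */
static int note_block(struct bloom *b, uint32_t *blk, unsigned int nwords)
{
	return bloom_set(b, blk, nwords);
}

A filter would typically be created once with bloom_new(), sized for the expected number of entries, and released with bloom_free().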
diff --git a/lib/lfsr.c b/lib/lfsr.c
index 329ef85..0c0072c 100644
--- a/lib/lfsr.c
+++ b/lib/lfsr.c
@@ -11,7 +11,7 @@
  * The memory overhead of the following tap table should be relatively small,
  * no more than 400 bytes.
  */
-static uint8_t taps[64][FIO_MAX_TAPS] =
+static uint8_t lfsr_taps[64][FIO_MAX_TAPS] =
 {
 	{0}, {0}, {0},		//LFSRs with less than 3 bits cannot exist
 	{3, 2},			//Tap position for 3-bit LFSR
@@ -124,7 +124,7 @@
  * c. Check if the calculated value exceeds the desirable range. In this case,
  *    go back to b, else return.
  */
-int lfsr_next(struct fio_lfsr *fl, uint64_t *off, uint64_t last)
+int lfsr_next(struct fio_lfsr *fl, uint64_t *off)
 {
 	if (fl->num_vals++ > fl->max_val)
 		return 1;
@@ -158,12 +158,12 @@
 
 	/*
 	 * For an LFSR, there is always a prohibited state (all ones).
-	 * Thus, if we need to find the proper LFSR for our size, we must take that
-	 * into account.
+	 * Thus, if we need to find the proper LFSR for our size, we must
+	 * take that into account.
 	 */
 	for (i = 3; i < 64; i++)
 		if ((1UL << i) > size)
-			return taps[i];
+			return lfsr_taps[i];
 
 	return NULL;
 }
@@ -234,15 +234,15 @@
 int lfsr_init(struct fio_lfsr *fl, uint64_t nums, unsigned long seed,
 		unsigned int spin)
 {
-	uint8_t *lfsr_taps;
+	uint8_t *taps;
 
-	lfsr_taps = find_lfsr(nums);
-	if (!lfsr_taps)
+	taps = find_lfsr(nums);
+	if (!taps)
 		return 1;
 
 	fl->max_val = nums - 1;
-	fl->xormask = lfsr_create_xormask(lfsr_taps);
-	fl->cached_bit = 1UL << (lfsr_taps[0] - 1);
+	fl->xormask = lfsr_create_xormask(taps);
+	fl->cached_bit = 1UL << (taps[0] - 1);
 
 	if (prepare_spin(fl, spin))
 		return 1;
diff --git a/lib/lfsr.h b/lib/lfsr.h
index 187abf2..c2d5569 100644
--- a/lib/lfsr.h
+++ b/lib/lfsr.h
@@ -22,7 +22,7 @@
 	unsigned int spin;
 };
 
-int lfsr_next(struct fio_lfsr *fl, uint64_t *off, uint64_t);
+int lfsr_next(struct fio_lfsr *fl, uint64_t *off);
 int lfsr_init(struct fio_lfsr *fl, uint64_t size,
 		unsigned long seed, unsigned int spin);
 int lfsr_reset(struct fio_lfsr *fl, unsigned long seed);
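With the unused 'last' argument gone, lfsr_next() keeps a simple contract: it returns 0 and produces the next value until the sequence is exhausted, then returns non-zero. A minimal sketch of a caller (hypothetical function, not part of this change):

/*
 * Illustrative only: intended to visit each value in [0, nums) once,
 * in pseudo-random order.
 */
static void walk_all(uint64_t nums, unsigned long seed)
{
	struct fio_lfsr fl;
	uint64_t off;

	if (lfsr_init(&fl, nums, seed, 0))
		return;

	while (!lfsr_next(&fl, &off)) {
		/* off is a unique index in [0, nums) */
	}
}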
diff --git a/lib/linux-dev-lookup.c b/lib/linux-dev-lookup.c
new file mode 100644
index 0000000..4d5f356
--- /dev/null
+++ b/lib/linux-dev-lookup.c
@@ -0,0 +1,66 @@
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <dirent.h>
+#include <string.h>
+#include <stdio.h>
+#include <unistd.h>
+
+#include "../os/os.h"
+
+int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
+			   unsigned int min)
+{
+	struct dirent *dir;
+	struct stat st;
+	int found = 0;
+	DIR *D;
+
+	D = opendir(path);
+	if (!D)
+		return 0;
+
+	while ((dir = readdir(D)) != NULL) {
+		char full_path[256];
+
+		if (!strcmp(dir->d_name, ".") || !strcmp(dir->d_name, ".."))
+			continue;
+
+		sprintf(full_path, "%s%s%s", path, FIO_OS_PATH_SEPARATOR, dir->d_name);
+		if (lstat(full_path, &st) == -1) {
+			perror("lstat");
+			break;
+		}
+
+		if (S_ISDIR(st.st_mode)) {
+			found = blktrace_lookup_device(redirect, full_path,
+								maj, min);
+			if (found) {
+				strcpy(path, full_path);
+				break;
+			}
+		}
+
+		if (!S_ISBLK(st.st_mode))
+			continue;
+
+		/*
+		 * If replay_redirect is set, always return this device
+		 * upon lookup, overriding the lookup based on the
+		 * major/minor numbers in the actual blktrace
+		 */
+		if (redirect) {
+			strcpy(path, redirect);
+			found = 1;
+			break;
+		}
+
+		if (maj == major(st.st_rdev) && min == minor(st.st_rdev)) {
+			strcpy(path, full_path);
+			found = 1;
+			break;
+		}
+	}
+
+	closedir(D);
+	return found;
+}
diff --git a/lib/linux-dev-lookup.h b/lib/linux-dev-lookup.h
new file mode 100644
index 0000000..144f33a
--- /dev/null
+++ b/lib/linux-dev-lookup.h
@@ -0,0 +1,7 @@
+#ifndef LINUX_DEV_LOOKUP
+#define LINUX_DEV_LOOKUP
+
+int blktrace_lookup_device(const char *redirect, char *path, unsigned int maj,
+			   unsigned int min);
+
+#endif
diff --git a/lib/num2str.c b/lib/num2str.c
index 8961868..0ed05f3 100644
--- a/lib/num2str.c
+++ b/lib/num2str.c
@@ -9,7 +9,7 @@
 /*
  * Cheesy number->string conversion, complete with carry rounding error.
  */
-char *num2str(unsigned long num, int maxlen, int base, int pow2, int unit_base)
+char *num2str(uint64_t num, int maxlen, int base, int pow2, int unit_base)
 {
 	const char *postfix[] = { "", "K", "M", "G", "P", "E" };
 	const char *byte_postfix[] = { "", "B", "bit" };
@@ -36,7 +36,7 @@
 
 	modulo = -1U;
 	while (post_index < sizeof(postfix)) {
-		sprintf(tmp, "%lu", num);
+		sprintf(tmp, "%llu", (unsigned long long) num);
 		if (strlen(tmp) <= maxlen)
 			break;
 
@@ -51,12 +51,12 @@
 		if (post_index >= ARRAY_LENGTH(postfix))
 			post_index = 0;
 
-		sprintf(buf, "%lu%s%s", num, postfix[post_index],
-			byte_postfix[byte_post_index]);
+		sprintf(buf, "%llu%s%s", (unsigned long long) num,
+			postfix[post_index], byte_postfix[byte_post_index]);
 		return buf;
 	}
 
-	sprintf(tmp, "%lu", num);
+	sprintf(tmp, "%llu", (unsigned long long) num);
 	decimals = maxlen - strlen(tmp);
 	if (decimals <= 1) {
 		if (carry)
@@ -72,7 +72,7 @@
 		modulo = (modulo + 9) / 10;
 	} while (1);
 
-	sprintf(buf, "%lu.%u%s%s", num, modulo, postfix[post_index],
-		byte_postfix[byte_post_index]);
+	sprintf(buf, "%llu.%u%s%s", (unsigned long long) num, modulo,
+			postfix[post_index], byte_postfix[byte_post_index]);
 	return buf;
 }
diff --git a/lib/rand.c b/lib/rand.c
index a79fb9c..185b679 100644
--- a/lib/rand.c
+++ b/lib/rand.c
@@ -34,6 +34,7 @@
 */
 
 #include <string.h>
+#include <assert.h>
 #include "rand.h"
 #include "../hash.h"
 
@@ -68,11 +69,26 @@
 
 void __fill_random_buf(void *buf, unsigned int len, unsigned long seed)
 {
-	long *ptr = buf;
+	void *ptr = buf;
 
-	while ((void *) ptr - buf < len) {
-		*ptr = seed;
-		ptr++;
+	while (len) {
+		int this_len;
+
+		if (len >= sizeof(int64_t)) {
+			*((int64_t *) ptr) = seed;
+			this_len = sizeof(int64_t);
+		} else if (len >= sizeof(int32_t)) {
+			*((int32_t *) ptr) = seed;
+			this_len = sizeof(int32_t);
+		} else if (len >= sizeof(int16_t)) {
+			*((int16_t *) ptr) = seed;
+			this_len = sizeof(int16_t);
+		} else {
+			*((int8_t *) ptr) = seed;
+			this_len = sizeof(int8_t);
+		}
+		ptr += this_len;
+		len -= this_len;
 		seed *= GOLDEN_RATIO_PRIME;
 		seed >>= 3;
 	}
@@ -90,24 +106,50 @@
 	return r;
 }
 
-unsigned long fill_random_buf_percentage(struct frand_state *fs, void *buf,
-					 unsigned int percentage,
-					 unsigned int segment, unsigned int len)
+void fill_pattern(void *p, unsigned int len, char *pattern,
+		  unsigned int pattern_bytes)
 {
-	unsigned long r = __rand(fs);
+	switch (pattern_bytes) {
+	case 0:
+		assert(0);
+		break;
+	case 1:
+		memset(p, pattern[0], len);
+		break;
+	default: {
+		unsigned int i = 0, size = 0;
+		unsigned char *b = p;
+
+		while (i < len) {
+			size = pattern_bytes;
+			if (size > (len - i))
+				size = len - i;
+			memcpy(b+i, pattern, size);
+			i += size;
+		}
+		break;
+		}
+	}
+}
+
+void __fill_random_buf_percentage(unsigned long seed, void *buf,
+				  unsigned int percentage,
+				  unsigned int segment, unsigned int len,
+				  char *pattern, unsigned int pbytes)
+{
 	unsigned int this_len;
 
 	if (percentage == 100) {
-		memset(buf, 0, len);
-		return 0;
+		if (pbytes)
+			fill_pattern(buf, len, pattern, pbytes);
+		else
+			memset(buf, 0, len);
+		return;
 	}
 
 	if (segment > len)
 		segment = len;
 
-	if (sizeof(int) != sizeof(long *))
-		r *= (unsigned long) __rand(fs);
-
 	while (len) {
 		/*
 		 * Fill random chunk
@@ -116,18 +158,39 @@
 		if (this_len > len)
 			this_len = len;
 
-		__fill_random_buf(buf, this_len, r);
+		__fill_random_buf(buf, this_len, seed);
 
 		len -= this_len;
+		if (!len)
+			break;
 		buf += this_len;
 
 		if (this_len > len)
 			this_len = len;
+		else if (len - this_len <= sizeof(long))
+			this_len = len;
 
-		memset(buf, 0, this_len);
+		if (pbytes)
+			fill_pattern(buf, this_len, pattern, pbytes);
+		else
+			memset(buf, 0, this_len);
+
 		len -= this_len;
 		buf += this_len;
 	}
+}
 
+unsigned long fill_random_buf_percentage(struct frand_state *fs, void *buf,
+					 unsigned int percentage,
+					 unsigned int segment, unsigned int len,
+					 char *pattern, unsigned int pbytes)
+{
+	unsigned long r = __rand(fs);
+
+	if (sizeof(int) != sizeof(long *))
+		r *= (unsigned long) __rand(fs);
+
+	__fill_random_buf_percentage(r, buf, percentage, segment, len,
+					pattern, pbytes);
 	return r;
 }
diff --git a/lib/rand.h b/lib/rand.h
index d62ebe5..089837d 100644
--- a/lib/rand.h
+++ b/lib/rand.h
@@ -7,6 +7,14 @@
 	unsigned int s1, s2, s3;
 };
 
+static inline void frand_copy(struct frand_state *dst,
+			      struct frand_state *src)
+{
+	dst->s1 = src->s1;
+	dst->s2 = src->s2;
+	dst->s3 = src->s3;
+}
+
 static inline unsigned int __rand(struct frand_state *state)
 {
 #define TAUSWORTHE(s,a,b,c,d) ((s&c)<<d) ^ (((s <<a) ^ s)>>b)
@@ -22,6 +30,8 @@
 extern void init_rand_seed(struct frand_state *, unsigned int seed);
 extern void __fill_random_buf(void *buf, unsigned int len, unsigned long seed);
 extern unsigned long fill_random_buf(struct frand_state *, void *buf, unsigned int len);
-extern unsigned long fill_random_buf_percentage(struct frand_state *, void *buf, unsigned int percentage, unsigned int segment, unsigned int len);
+extern void __fill_random_buf_percentage(unsigned long, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
+extern unsigned long fill_random_buf_percentage(struct frand_state *, void *, unsigned int, unsigned int, unsigned int, char *, unsigned int);
+extern void fill_pattern(void *p, unsigned int len, char *pattern, unsigned int pattern_bytes);
 
 #endif
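The percentage fill helpers now take an optional fill pattern, so compressible regions can be stuffed with a user pattern instead of zeroes. A minimal sketch of a caller, with hypothetical sizes that are not part of this change:

/*
 * Illustrative only: fill a 4k buffer so that roughly 60% of it is
 * compressible, working in 512-byte segments, with no fill pattern.
 */
static void fill_example(struct frand_state *fs, void *buf)
{
	fill_random_buf_percentage(fs, buf, 60, 512, 4096, NULL, 0);
}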
diff --git a/lib/tp.c b/lib/tp.c
new file mode 100644
index 0000000..7462f5b
--- /dev/null
+++ b/lib/tp.c
@@ -0,0 +1,119 @@
+/*
+ * Basic workqueue-like code that sets up a thread and allows async
+ * processing of some sort. Could be extended to allow for multiple
+ * worker threads. But right now fio associates one of these per IO
+ * thread, so a single thread doing the work should be
+ * enough.
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <unistd.h>
+#include <errno.h>
+#include <pthread.h>
+#include <string.h>
+
+#include "../smalloc.h"
+#include "../log.h"
+#include "tp.h"
+
+static void tp_flush_work(struct flist_head *list)
+{
+	struct tp_work *work;
+
+	while (!flist_empty(list)) {
+		int prio;
+
+		work = flist_entry(list->next, struct tp_work, list);
+		flist_del(&work->list);
+
+		prio = work->prio;
+		if (nice(prio) < 0)
+			log_err("fio: nice %s\n", strerror(errno));
+
+		work->fn(work);
+
+		if (nice(prio) < 0)
+			log_err("fio: nice %s\n", strerror(errno));
+	}
+}
+
+static void *tp_thread(void *data)
+{
+	struct tp_data *tdat = data;
+	struct flist_head work_list;
+
+	INIT_FLIST_HEAD(&work_list);
+
+	while (1) {
+		pthread_mutex_lock(&tdat->lock);
+
+		if (!tdat->thread_exit && flist_empty(&tdat->work))
+			pthread_cond_wait(&tdat->cv, &tdat->lock);
+
+		if (!flist_empty(&tdat->work))
+			flist_splice_tail_init(&tdat->work, &work_list);
+
+		pthread_mutex_unlock(&tdat->lock);
+
+		if (flist_empty(&work_list)) {
+			if (tdat->thread_exit)
+				break;
+			continue;
+		}
+
+		tp_flush_work(&work_list);
+	}
+
+	return NULL;
+}
+
+void tp_queue_work(struct tp_data *tdat, struct tp_work *work)
+{
+	work->done = 0;
+
+	pthread_mutex_lock(&tdat->lock);
+	flist_add_tail(&work->list, &tdat->work);
+	pthread_mutex_unlock(&tdat->lock);
+
+	pthread_cond_signal(&tdat->cv);
+}
+
+void tp_init(struct tp_data **tdatp)
+{
+	struct tp_data *tdat;
+	int ret;
+
+	if (*tdatp)
+		return;
+
+	*tdatp = tdat = smalloc(sizeof(*tdat));
+	pthread_mutex_init(&tdat->lock, NULL);
+	INIT_FLIST_HEAD(&tdat->work);
+	pthread_cond_init(&tdat->cv, NULL);
+	pthread_cond_init(&tdat->sleep_cv, NULL);
+
+	ret = pthread_create(&tdat->thread, NULL, tp_thread, tdat);
+	if (ret)
+		log_err("fio: failed to create tp thread\n");
+}
+
+void tp_exit(struct tp_data **tdatp)
+{
+	struct tp_data *tdat = *tdatp;
+	void *ret;
+
+	if (!tdat)
+		return;
+
+	pthread_mutex_lock(&tdat->lock);
+	tdat->thread_exit = 1;
+	pthread_mutex_unlock(&tdat->lock);
+
+	pthread_cond_signal(&tdat->cv);
+
+	pthread_join(tdat->thread, &ret);
+
+	sfree(tdat);
+	*tdatp = NULL;
+}
diff --git a/lib/tp.h b/lib/tp.h
new file mode 100644
index 0000000..9147cc2
--- /dev/null
+++ b/lib/tp.h
@@ -0,0 +1,33 @@
+#ifndef FIO_TP_H
+#define FIO_TP_H
+
+#include "../flist.h"
+
+struct tp_work;
+typedef int (tp_work_fn)(struct tp_work *);
+
+struct tp_work {
+	struct flist_head list;
+	tp_work_fn *fn;
+	int wait;
+	int prio;
+	pthread_cond_t cv;
+	pthread_mutex_t lock;
+	volatile int done;
+};
+
+struct tp_data {
+	pthread_t thread;
+	pthread_cond_t cv;
+	pthread_mutex_t lock;
+	struct flist_head work;
+	volatile int thread_exit;
+	pthread_cond_t sleep_cv;
+	volatile int sleeping;
+};
+
+extern void tp_init(struct tp_data **);
+extern void tp_exit(struct tp_data **);
+extern void tp_queue_work(struct tp_data *, struct tp_work *);
+
+#endif
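For context, the tp interface boils down to three calls: tp_init() spawns the worker thread, tp_queue_work() hands it a struct tp_work whose fn callback does the processing, and tp_exit() flushes remaining work and joins the thread. A minimal sketch with hypothetical callback and caller names, not part of this change:

/* Illustrative only: queue one unit of work on the helper thread. */
static int my_work_fn(struct tp_work *work)
{
	/* ... asynchronous processing goes here ... */
	work->done = 1;
	return 0;
}

static void tp_example(void)
{
	struct tp_data *tp = NULL;
	struct tp_work work = { .fn = my_work_fn, .prio = 0 };

	tp_init(&tp);
	tp_queue_work(tp, &work);
	/* ... */
	tp_exit(&tp);	/* pending work is flushed before the join */
}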
diff --git a/lib/zipf.c b/lib/zipf.c
index 9b6ce63..c691bc5 100644
--- a/lib/zipf.c
+++ b/lib/zipf.c
@@ -11,7 +11,7 @@
 #include "../minmax.h"
 #include "../hash.h"
 
-#define ZIPF_MAX_GEN	10000000
+#define ZIPF_MAX_GEN	10000000UL
 
 static void zipf_update(struct zipf_state *zs)
 {
@@ -23,7 +23,7 @@
 	 * 10M max, that should be doable in 1-2s on even slow machines.
 	 * Precision will take a slight hit, but nothing major.
 	 */
-	to_gen = min(zs->nranges, ZIPF_MAX_GEN);
+	to_gen = min(zs->nranges, (uint64_t) ZIPF_MAX_GEN);
 
 	for (i = 0; i < to_gen; i++)
 		zs->zetan += pow(1.0 / (double) (i + 1), zs->theta);
diff --git a/libfio.c b/libfio.c
index 8af1129..57ce725 100644
--- a/libfio.c
+++ b/libfio.c
@@ -58,6 +58,7 @@
 	"Solaris",
 	"Windows",
 	"Android",
+	"DragonFly",
 };
 
 static const char *fio_arch_strings[arch_nr] = {
@@ -108,8 +109,10 @@
 	reset_io_counters(td);
 
 	close_files(td);
-	for_each_file(td, f, i)
+	for_each_file(td, f, i) {
 		fio_file_clear_done(f);
+		f->file_offset = get_start_offset(td, f);
+	}
 
 	/*
 	 * Set the same seed to get repeatable runs
@@ -187,6 +190,13 @@
 	td_set_runstate(td, old_state);
 }
 
+void fio_mark_td_terminate(struct thread_data *td)
+{
+	fio_gettime(&td->terminate_time, NULL);
+	write_barrier();
+	td->terminate = 1;
+}
+
 void fio_terminate_threads(int group_id)
 {
 	struct thread_data *td;
@@ -199,7 +209,11 @@
 		if (group_id == TERMINATE_ALL || groupid == td->groupid) {
 			dprint(FD_PROCESS, "setting terminate on %s/%d\n",
 						td->o.name, (int) td->pid);
-			td->terminate = 1;
+
+			if (td->terminate)
+				continue;
+
+			fio_mark_td_terminate(td);
 			td->o.start_delay = 0;
 
 			/*
@@ -286,6 +300,20 @@
 {
 	long ps;
 
+	/*
+	 * We need these to be properly 64-bit aligned, otherwise we
+	 * can run into problems on archs that fault on unaligned fp
+	 * access (ARM).
+	 */
+	compiletime_assert((offsetof(struct thread_stat, percentile_list) % 8) == 0, "stat percentile_list");
+	compiletime_assert((offsetof(struct thread_stat, total_run_time) % 8) == 0, "total_run_time");
+	compiletime_assert((offsetof(struct thread_stat, total_err_count) % 8) == 0, "total_err_count");
+	compiletime_assert((offsetof(struct thread_stat, latency_percentile) % 8) == 0, "stat latency_percentile");
+	compiletime_assert((offsetof(struct thread_options_pack, zipf_theta) % 8) == 0, "zipf_theta");
+	compiletime_assert((offsetof(struct thread_options_pack, pareto_h) % 8) == 0, "pareto_h");
+	compiletime_assert((offsetof(struct thread_options_pack, percentile_list) % 8) == 0, "percentile_list");
+	compiletime_assert((offsetof(struct thread_options_pack, latency_percentile) % 8) == 0, "latency_percentile");
+
 	if (endian_check()) {
 		log_err("fio: endianness settings appear wrong.\n");
 		log_err("fio: please report this to fio@vger.kernel.org\n");
diff --git a/memalign.c b/memalign.c
index 7a04ffd..cfd6e46 100644
--- a/memalign.c
+++ b/memalign.c
@@ -20,7 +20,7 @@
 
 	ptr = malloc(size + alignment + size + sizeof(*f) - 1);
 	if (ptr) {
-		ret = PTR_ALIGN(ptr, alignment);
+		ret = PTR_ALIGN(ptr, alignment - 1);
 		f = ret + size;
 		f->offset = (uintptr_t) ret - (uintptr_t) ptr;
 	}
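A note on the memalign.c change: passing 'alignment - 1' implies that PTR_ALIGN() expects a mask rather than an alignment value. A generic mask-based round-up looks like the sketch below (an illustration of the idiom, not fio's exact macro):

/*
 * Illustrative only: round p up to the next align boundary; align
 * must be a power of two.
 */
static inline void *align_up(void *p, uintptr_t align)
{
	uintptr_t mask = align - 1;

	return (void *) (((uintptr_t) p + mask) & ~mask);
}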
diff --git a/memory.c b/memory.c
index 8c06d94..23a0d94 100644
--- a/memory.c
+++ b/memory.c
@@ -63,6 +63,7 @@
 
 static int alloc_mem_shm(struct thread_data *td, unsigned int total_mem)
 {
+#ifndef CONFIG_NO_SHM
 	int flags = IPC_CREAT | S_IRUSR | S_IWUSR;
 
 	if (td->o.mem_type == MEM_SHMHUGE) {
@@ -104,22 +105,28 @@
 	}
 
 	return 0;
+#else
+	log_err("fio: shm not supported\n");
+	return 1;
+#endif
 }
 
 static void free_mem_shm(struct thread_data *td)
 {
+#ifndef CONFIG_NO_SHM
 	struct shmid_ds sbuf;
 
 	dprint(FD_MEM, "shmdt/ctl %d %p\n", td->shm_id, td->orig_buffer);
 	shmdt(td->orig_buffer);
 	shmctl(td->shm_id, IPC_RMID, &sbuf);
+#endif
 }
 
 static int alloc_mem_mmap(struct thread_data *td, size_t total_mem)
 {
 	int flags = 0;
 
-	td->mmapfd = 1;
+	td->mmapfd = -1;
 
 	if (td->o.mem_type == MEM_MMAPHUGE) {
 		unsigned long mask = td->o.hugepage_size - 1;
@@ -158,7 +165,7 @@
 	if (td->orig_buffer == MAP_FAILED) {
 		td_verror(td, errno, "mmap");
 		td->orig_buffer = NULL;
-		if (td->mmapfd != 1) {
+		if (td->mmapfd != 1 && td->mmapfd != -1) {
 			close(td->mmapfd);
 			if (td->o.mmapfile)
 				unlink(td->o.mmapfile);
@@ -176,7 +183,8 @@
 						td->orig_buffer);
 	munmap(td->orig_buffer, td->orig_buffer_size);
 	if (td->o.mmapfile) {
-		close(td->mmapfd);
+		if (td->mmapfd != -1)
+			close(td->mmapfd);
 		unlink(td->o.mmapfile);
 		free(td->o.mmapfile);
 	}
diff --git a/minmax.h b/minmax.h
index e5c2f58..97957c8 100644
--- a/minmax.h
+++ b/minmax.h
@@ -2,10 +2,19 @@
 #define FIO_MIN_MAX_H
 
 #ifndef min
-#define min(a, b)	((a) < (b) ? (a) : (b))
+#define min(x,y) ({ \
+	typeof(x) _x = (x);	\
+	typeof(y) _y = (y);	\
+	(void) (&_x == &_y);		\
+	_x < _y ? _x : _y; })
 #endif
+
 #ifndef max
-#define max(a, b)	((a) > (b) ? (a) : (b))
+#define max(x,y) ({ \
+	typeof(x) _x = (x);	\
+	typeof(y) _y = (y);	\
+	(void) (&_x == &_y);		\
+	_x > _y ? _x : _y; })
 #endif
 
 #endif
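The new min()/max() evaluate each argument exactly once, and the '(void) (&_x == &_y)' comparison makes the compiler warn when the two arguments have different types, which is why call sites such as the zipf.c hunk above now cast explicitly. A small illustration (hypothetical function, not part of this change):

static int min_example(void)
{
	int a = 3;

	/*
	 * a is incremented exactly once; the old textual macro could
	 * have evaluated it twice.
	 */
	return min(a++, 10);
}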
diff --git a/mutex.c b/mutex.c
index 9d10c2c..53f9651 100644
--- a/mutex.c
+++ b/mutex.c
@@ -18,10 +18,15 @@
 #include "fio_time.h"
 #include "gettime.h"
 
-void fio_mutex_remove(struct fio_mutex *mutex)
+void __fio_mutex_remove(struct fio_mutex *mutex)
 {
 	assert(mutex->magic == FIO_MUTEX_MAGIC);
 	pthread_cond_destroy(&mutex->cond);
+}
+
+void fio_mutex_remove(struct fio_mutex *mutex)
+{
+	__fio_mutex_remove(mutex);
 	munmap((void *) mutex, sizeof(*mutex));
 }
 
@@ -162,14 +167,19 @@
 
 void fio_mutex_up(struct fio_mutex *mutex)
 {
+	int do_wake = 0;
+
 	assert(mutex->magic == FIO_MUTEX_MAGIC);
 
 	pthread_mutex_lock(&mutex->lock);
 	read_barrier();
 	if (!mutex->value && mutex->waiters)
-		pthread_cond_signal(&mutex->cond);
+		do_wake = 1;
 	mutex->value++;
 	pthread_mutex_unlock(&mutex->lock);
+
+	if (do_wake)
+		pthread_cond_signal(&mutex->cond);
 }
 
 void fio_rwlock_write(struct fio_rwlock *lock)
diff --git a/mutex.h b/mutex.h
index 246afee..17380de 100644
--- a/mutex.h
+++ b/mutex.h
@@ -26,6 +26,7 @@
 
 extern int __fio_mutex_init(struct fio_mutex *, int);
 extern struct fio_mutex *fio_mutex_init(int);
+extern void __fio_mutex_remove(struct fio_mutex *);
 extern void fio_mutex_remove(struct fio_mutex *);
 extern void fio_mutex_up(struct fio_mutex *);
 extern void fio_mutex_down(struct fio_mutex *);
diff --git a/options.c b/options.c
index 9dcb255..ab6e399 100644
--- a/options.c
+++ b/options.c
@@ -98,11 +98,11 @@
 			if (perc > 100)
 				perc = 100;
 			else if (!perc)
-				perc = -1;
+				perc = -1U;
 		} else
-			perc = -1;
+			perc = -1U;
 
-		if (str_to_decimal(fname, &val, 1, o, 0)) {
+		if (str_to_decimal(fname, &val, 1, o, 0, 0)) {
 			log_err("fio: bssplit conversion failed\n");
 			free(bssplit);
 			return 1;
@@ -127,26 +127,29 @@
 	for (i = 0; i < o->bssplit_nr[ddir]; i++) {
 		struct bssplit *bsp = &bssplit[i];
 
-		if (bsp->perc == (unsigned char) -1)
+		if (bsp->perc == -1U)
 			perc_missing++;
 		else
 			perc += bsp->perc;
 	}
 
-	if (perc > 100) {
+	if (perc > 100 && perc_missing > 1) {
 		log_err("fio: bssplit percentages add to more than 100%%\n");
 		free(bssplit);
 		return 1;
 	}
+
 	/*
 	 * If values didn't have a percentage set, divide the remains between
 	 * them.
 	 */
 	if (perc_missing) {
+		if (perc_missing == 1 && o->bssplit_nr[ddir] == 1)
+			perc = 100;
 		for (i = 0; i < o->bssplit_nr[ddir]; i++) {
 			struct bssplit *bsp = &bssplit[i];
 
-			if (bsp->perc == (unsigned char) -1)
+			if (bsp->perc == -1U)
 				bsp->perc = (100 - perc) / perc_missing;
 		}
 	}
@@ -339,7 +342,7 @@
 	else {
 		long long val;
 
-		if (str_to_decimal(nr, &val, 1, o, 0)) {
+		if (str_to_decimal(nr, &val, 1, o, 0, 0)) {
 			log_err("fio: rw postfix parsing failed\n");
 			free(nr);
 			return 1;
@@ -450,7 +453,6 @@
 		}
 	}
 
-	td->o.cpumask_set = 1;
 	return 0;
 }
 
@@ -517,36 +519,24 @@
 	}
 
 	free(p);
-	if (!ret)
-		td->o.cpumask_set = 1;
 	return ret;
 }
 
 static int str_cpus_allowed_cb(void *data, const char *input)
 {
 	struct thread_data *td = data;
-	int ret;
 
 	if (parse_dryrun())
 		return 0;
 
-	ret = set_cpus_allowed(td, &td->o.cpumask, input);
-	if (!ret)
-		td->o.cpumask_set = 1;
-
-	return ret;
+	return set_cpus_allowed(td, &td->o.cpumask, input);
 }
 
 static int str_verify_cpus_allowed_cb(void *data, const char *input)
 {
 	struct thread_data *td = data;
-	int ret;
 
-	ret = set_cpus_allowed(td, &td->o.verify_cpumask, input);
-	if (!ret)
-		td->o.verify_cpumask_set = 1;
-
-	return ret;
+	return set_cpus_allowed(td, &td->o.verify_cpumask, input);
 }
 #endif
 
@@ -573,7 +563,6 @@
 	numa_free_nodemask(verify_bitmask);
 
 	td->o.numa_cpunodes = strdup(input);
-	td->o.numa_cpumask_set = 1;
 	return 0;
 }
 
@@ -672,7 +661,7 @@
 		}
 		td->o.numa_memnodes = strdup(nodelist);
 		numa_free_nodemask(verify_bitmask);
-                
+
 		break;
 	case MPOL_LOCAL:
 	case MPOL_DEFAULT:
@@ -680,9 +669,7 @@
 		break;
 	}
 
-	td->o.numa_memmask_set = 1;
 	return 0;
-
 out:
 	return 1;
 }
@@ -728,14 +715,14 @@
 		return 0;
 
 	if (td->o.random_distribution == FIO_RAND_DIST_ZIPF)
-		val = 1.1;
+		val = FIO_DEF_ZIPF;
 	else if (td->o.random_distribution == FIO_RAND_DIST_PARETO)
-		val = 0.2;
+		val = FIO_DEF_PARETO;
 	else
 		return 0;
 
 	nr = get_opt_postfix(str);
-	if (nr && !str_to_float(nr, &val)) {
+	if (nr && !str_to_float(nr, &val, 0)) {
 		log_err("fio: random postfix parsing failed\n");
 		free(nr);
 		return 1;
@@ -897,18 +884,6 @@
 	return ret;
 }
 
-static int str_lockfile_cb(void *data, const char fio_unused *str)
-{
-	struct thread_data *td = data;
-
-	if (td->files_index) {
-		log_err("fio: lockfile= option must precede filename=\n");
-		return 1;
-	}
-
-	return 0;
-}
-
 static int str_opendir_cb(void *data, const char fio_unused *str)
 {
 	struct thread_data *td = data;
@@ -930,6 +905,29 @@
 	uint32_t pattern_length;
 	char *loc1, *loc2;
 
+	/*
+	 * Check if it's a string input
+	 */
+	loc1 = strchr(input, '\"');
+	if (loc1) {
+		do {
+			loc1++;
+			if (*loc1 == '\0' || *loc1 == '\"')
+				break;
+
+			pattern[i] = *loc1;
+			i++;
+		} while (i < max_size);
+
+		if (!i)
+			return 1;
+
+		goto fill;
+	}
+
+	/*
+	 * No string, find out if it's decimal or hexadecimal
+	 */
 	loc1 = strstr(input, "0x");
 	loc2 = strstr(input, "0X");
 	if (loc1 || loc2)
@@ -966,6 +964,7 @@
 	 * Fill the pattern all the way to the end. This greatly reduces
 	 * the number of memcpy's we have to do when verifying the IO.
 	 */
+fill:
 	pattern_length = i;
 	while (i > 1 && i * 2 <= max_size) {
 		memcpy(&pattern[i], &pattern[0], i);
@@ -985,8 +984,8 @@
 
 	if (i == 1) {
 		/*
-		 * The code in verify_io_u_pattern assumes a single byte pattern
-		 * fills the whole verify pattern buffer.
+		 * The code in verify_io_u_pattern assumes a single byte
+		 * pattern fills the whole verify pattern buffer.
 		 */
 		memset(pattern, pattern[0], max_size);
 	}
@@ -1003,10 +1002,14 @@
 	ret = pattern_cb(td->o.buffer_pattern, MAX_PATTERN_SIZE, input,
 				&td->o.buffer_pattern_bytes);
 
-	if (!ret) {
-		td->o.refill_buffers = 0;
+	if (!ret && td->o.buffer_pattern_bytes) {
+		if (!td->o.compress_percentage)
+			td->o.refill_buffers = 0;
 		td->o.scramble_buffers = 0;
 		td->o.zero_buffers = 0;
+	} else {
+		log_err("fio: failed parsing pattern `%s`\n", input);
+		ret = 1;
 	}
 
 	return ret;
@@ -1021,6 +1024,16 @@
 	return 0;
 }
 
+static int str_dedupe_cb(void *data, unsigned long long *il)
+{
+	struct thread_data *td = data;
+
+	td->flags |= TD_F_COMPRESS;
+	td->o.dedupe_percentage = *il;
+	td->o.refill_buffers = 1;
+	return 0;
+}
+
 static int str_verify_pattern_cb(void *data, const char *input)
 {
 	struct thread_data *td = data;
@@ -1054,16 +1067,6 @@
 	return 0;
 }
 
-static int str_gtod_cpu_cb(void *data, long long *il)
-{
-	struct thread_data *td = data;
-	int val = *il;
-
-	td->o.gtod_cpu = val;
-	td->o.gtod_offload = 1;
-	return 0;
-}
-
 static int str_size_cb(void *data, unsigned long long *__val)
 {
 	struct thread_data *td = data;
@@ -1328,7 +1331,6 @@
 		.parent	= "filename",
 		.hide	= 0,
 		.def	= "none",
-		.cb	= str_lockfile_cb,
 		.category = FIO_OPT_C_FILE,
 		.group	= FIO_OPT_G_FILENAME,
 		.posval = {
@@ -1533,6 +1535,19 @@
 			    .help = "fallocate() file based engine",
 			  },
 #endif
+#ifdef CONFIG_GFAPI
+			  { .ival = "gfapi",
+			    .help = "Glusterfs libgfapi(sync) based engine"
+			  },
+			  { .ival = "gfapi_async",
+			    .help = "Glusterfs libgfapi(async) based engine"
+			  },
+#endif
+#ifdef CONFIG_LIBHDFS
+			  { .ival = "libhdfs",
+			    .help = "Hadoop Distributed Filesystem (HDFS) engine"
+			  },
+#endif
 			  { .ival = "external",
 			    .help = "Load external engine (append name)",
 			  },
@@ -1596,14 +1611,16 @@
 		.lname	= "Size",
 		.type	= FIO_OPT_STR_VAL,
 		.cb	= str_size_cb,
+		.off1	= td_var_offset(size),
 		.help	= "Total size of device or files",
 		.interval = 1024 * 1024,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
-		.name	= "io_limit",
-		.lname	= "IO Limit",
+		.name	= "io_size",
+		.alias	= "io_limit",
+		.lname	= "IO Size",
 		.type	= FIO_OPT_STR_VAL,
 		.off1	= td_var_offset(io_limit),
 		.interval = 1024 * 1024,
@@ -1673,7 +1690,7 @@
 		.lname	= "Number of IOs to perform",
 		.type	= FIO_OPT_STR_VAL,
 		.off1	= td_var_offset(number_ios),
-		.help	= "Force job completion of this number of IOs",
+		.help	= "Force job completion after this number of IOs",
 		.def	= "0",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_INVALID,
@@ -1735,6 +1752,7 @@
 		.lname	= "Block size split",
 		.type	= FIO_OPT_STR,
 		.cb	= str_bssplit_cb,
+		.off1	= td_var_offset(bssplit),
 		.help	= "Set a specific mix of block sizes",
 		.parent	= "rw",
 		.hide	= 1,
@@ -1758,7 +1776,7 @@
 		.lname	= "Block size division is seq/random (not read/write)",
 		.type	= FIO_OPT_BOOL,
 		.off1	= td_var_offset(bs_is_seq_rand),
-		.help	= "Consider any blocksize setting to be sequential,ramdom",
+		.help	= "Consider any blocksize setting to be sequential,random",
 		.def	= "0",
 		.parent = "blocksize",
 		.category = FIO_OPT_C_IO,
@@ -1789,12 +1807,8 @@
 	{
 		.name	= "use_os_rand",
 		.lname	= "Use OS random",
-		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(use_os_rand),
-		.help	= "Set to use OS random generator",
-		.def	= "0",
-		.parent = "rw",
-		.hide	= 1,
+		.type	= FIO_OPT_DEPRECATED,
+		.off1	= td_var_offset(dep_use_os_rand),
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_RANDOM,
 	},
@@ -2128,6 +2142,7 @@
 		.help	= "Only start job when this period has passed",
 		.def	= "0",
 		.is_seconds = 1,
+		.is_time = 1,
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_RUNTIME,
 	},
@@ -2140,6 +2155,7 @@
 		.help	= "Stop workload when this amount of time has passed",
 		.def	= "0",
 		.is_seconds = 1,
+		.is_time = 1,
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_RUNTIME,
 	},
@@ -2168,6 +2184,7 @@
 		.off1	= td_var_offset(ramp_time),
 		.help	= "Ramp up time before measuring performance",
 		.is_seconds = 1,
+		.is_time = 1,
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_RUNTIME,
 	},
@@ -2217,6 +2234,7 @@
 			    .oval = MEM_MALLOC,
 			    .help = "Use malloc(3) for IO buffers",
 			  },
+#ifndef CONFIG_NO_SHM
 			  { .ival = "shm",
 			    .oval = MEM_SHM,
 			    .help = "Use shared memory segments for IO buffers",
@@ -2227,6 +2245,7 @@
 			    .help = "Like shm, but use huge pages",
 			  },
 #endif
+#endif
 			  { .ival = "mmap",
 			    .oval = MEM_MMAP,
 			    .help = "Use mmap(2) (file or anon) for IO buffers",
@@ -2388,6 +2407,7 @@
 		.lname	= "Verify pattern",
 		.type	= FIO_OPT_STR,
 		.cb	= str_verify_pattern_cb,
+		.off1	= td_var_offset(verify_pattern),
 		.help	= "Fill pattern for IO buffers",
 		.parent	= "verify",
 		.hide	= 1,
@@ -2458,6 +2478,7 @@
 		.lname	= "Async verify CPUs",
 		.type	= FIO_OPT_STR,
 		.cb	= str_verify_cpus_allowed_cb,
+		.off1	= td_var_offset(verify_cpumask),
 		.help	= "Set CPUs allowed for async verify threads",
 		.parent	= "verify_async",
 		.hide	= 1,
@@ -2470,6 +2491,28 @@
 		.off1	= td_var_offset(experimental_verify),
 		.type	= FIO_OPT_BOOL,
 		.help	= "Enable experimental verification",
+		.parent	= "verify",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_VERIFY,
+	},
+	{
+		.name	= "verify_state_load",
+		.lname	= "Load verify state",
+		.off1	= td_var_offset(verify_state),
+		.type	= FIO_OPT_BOOL,
+		.help	= "Load verify termination state",
+		.parent	= "verify",
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_VERIFY,
+	},
+	{
+		.name	= "verify_state_save",
+		.lname	= "Save verify state",
+		.off1	= td_var_offset(verify_state_save),
+		.type	= FIO_OPT_BOOL,
+		.def	= "1",
+		.help	= "Save verify state on termination",
+		.parent	= "verify",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_VERIFY,
 	},
@@ -2645,6 +2688,7 @@
 		.lname	= "Read/write mix read",
 		.type	= FIO_OPT_INT,
 		.cb	= str_rwmix_read_cb,
+		.off1	= td_var_offset(rwmix[DDIR_READ]),
 		.maxval	= 100,
 		.help	= "Percentage of mixed workload that is reads",
 		.def	= "50",
@@ -2658,6 +2702,7 @@
 		.lname	= "Read/write mix write",
 		.type	= FIO_OPT_INT,
 		.cb	= str_rwmix_write_cb,
+		.off1	= td_var_offset(rwmix[DDIR_WRITE]),
 		.maxval	= 100,
 		.help	= "Percentage of mixed workload that is writes",
 		.def	= "50",
@@ -2719,6 +2764,7 @@
 		.off1	= td_var_offset(thinktime),
 		.help	= "Idle time between IO buffers (usec)",
 		.def	= "0",
+		.is_time = 1,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_THINKTIME,
 	},
@@ -2729,6 +2775,7 @@
 		.off1	= td_var_offset(thinktime_spin),
 		.help	= "Start think time by spinning this amount (usec)",
 		.def	= "0",
+		.is_time = 1,
 		.parent	= "thinktime",
 		.hide	= 1,
 		.category = FIO_OPT_C_IO,
@@ -2812,6 +2859,7 @@
 		.type	= FIO_OPT_INT,
 		.off1	= td_var_offset(max_latency),
 		.help	= "Maximum tolerated IO latency (usec)",
+		.is_time = 1,
 		.category = FIO_OPT_C_IO,
 		.group = FIO_OPT_G_LATPROF,
 	},
@@ -2821,6 +2869,7 @@
 		.type	= FIO_OPT_STR_VAL_TIME,
 		.off1	= td_var_offset(latency_target),
 		.help	= "Ramp to max queue depth supporting this latency",
+		.is_time = 1,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_LATPROF,
 	},
@@ -2830,6 +2879,7 @@
 		.type	= FIO_OPT_STR_VAL_TIME,
 		.off1	= td_var_offset(latency_window),
 		.help	= "Time to sustain latency_target",
+		.is_time = 1,
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_LATPROF,
 	},
@@ -2922,6 +2972,7 @@
 		.lname	= "CPU mask",
 		.type	= FIO_OPT_INT,
 		.cb	= str_cpumask_cb,
+		.off1	= td_var_offset(cpumask),
 		.help	= "CPU affinity mask",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -2931,6 +2982,7 @@
 		.lname	= "CPUs allowed",
 		.type	= FIO_OPT_STR,
 		.cb	= str_cpus_allowed_cb,
+		.off1	= td_var_offset(cpumask),
 		.help	= "Set CPUs allowed",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_CRED,
@@ -2962,6 +3014,7 @@
 		.name	= "numa_cpu_nodes",
 		.type	= FIO_OPT_STR,
 		.cb	= str_numa_cpunodes_cb,
+		.off1	= td_var_offset(numa_cpunodes),
 		.help	= "NUMA CPU nodes bind",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
@@ -2970,6 +3023,7 @@
 		.name	= "numa_mem_policy",
 		.type	= FIO_OPT_STR,
 		.cb	= str_numa_mpol_cb,
+		.off1	= td_var_offset(numa_memnodes),
 		.help	= "NUMA memory policy setup",
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_INVALID,
@@ -3039,6 +3093,10 @@
 		.type	= FIO_OPT_STR_SET,
 		.off1	= td_var_offset(use_thread),
 		.help	= "Use threads instead of processes",
+#ifdef CONFIG_NO_SHM
+		.def	= "1",
+		.no_warn_def = 1,
+#endif
 		.category = FIO_OPT_C_GENERAL,
 		.group	= FIO_OPT_G_PROCESS,
 	},
@@ -3080,6 +3138,38 @@
 		.group	= FIO_OPT_G_INVALID,
 	},
 	{
+		.name	= "log_offset",
+		.lname	= "Log offset of IO",
+		.type	= FIO_OPT_BOOL,
+		.off1	= td_var_offset(log_offset),
+		.help	= "Include offset of IO for each log entry",
+		.def	= "0",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+#ifdef CONFIG_ZLIB
+	{
+		.name	= "log_compression",
+		.lname	= "Log compression",
+		.type	= FIO_OPT_INT,
+		.off1	= td_var_offset(log_gz),
+		.help	= "Log in compressed chunks of this size",
+		.minval	= 32 * 1024 * 1024ULL,
+		.maxval	= 512 * 1024 * 1024ULL,
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+	{
+		.name	= "log_store_compressed",
+		.lname	= "Log store compressed",
+		.type	= FIO_OPT_BOOL,
+		.off1	= td_var_offset(log_gz_store),
+		.help	= "Store logs in a compressed format",
+		.category = FIO_OPT_C_LOG,
+		.group	= FIO_OPT_G_INVALID,
+	},
+#endif
+	{
 		.name	= "bwavgtime",
 		.lname	= "Bandwidth average time",
 		.type	= FIO_OPT_INT,
@@ -3148,6 +3238,7 @@
 		.lname	= "Buffer pattern",
 		.type	= FIO_OPT_STR,
 		.cb	= str_buffer_pattern_cb,
+		.off1	= td_var_offset(buffer_pattern),
 		.help	= "Fill pattern for IO buffers",
 		.category = FIO_OPT_C_IO,
 		.group	= FIO_OPT_G_IO_BUF,
@@ -3157,6 +3248,7 @@
 		.lname	= "Buffer compression percentage",
 		.type	= FIO_OPT_INT,
 		.cb	= str_buffer_compress_cb,
+		.off1	= td_var_offset(compress_percentage),
 		.maxval	= 100,
 		.minval	= 0,
 		.help	= "How compressible the buffer is (approximately)",
@@ -3177,6 +3269,19 @@
 		.group	= FIO_OPT_G_IO_BUF,
 	},
 	{
+		.name	= "dedupe_percentage",
+		.lname	= "Dedupe percentage",
+		.type	= FIO_OPT_INT,
+		.cb	= str_dedupe_cb,
+		.off1	= td_var_offset(dedupe_percentage),
+		.maxval	= 100,
+		.minval	= 0,
+		.help	= "Percentage of buffers that are dedupable",
+		.interval = 1,
+		.category = FIO_OPT_C_IO,
+		.group	= FIO_OPT_G_IO_BUF,
+	},
+	{
 		.name	= "clat_percentiles",
 		.lname	= "Completion latency percentiles",
 		.type	= FIO_OPT_BOOL,
@@ -3276,7 +3381,7 @@
 		.name	= "gtod_cpu",
 		.lname	= "Dedicated gettimeofday() CPU",
 		.type	= FIO_OPT_INT,
-		.cb	= str_gtod_cpu_cb,
+		.off1	= td_var_offset(gtod_cpu),
 		.help	= "Set up dedicated gettimeofday() thread on this CPU",
 		.verify	= gtod_cpu_verify,
 		.category = FIO_OPT_C_GENERAL,
@@ -3339,6 +3444,7 @@
 		.name	= "ignore_error",
 		.type	= FIO_OPT_STR,
 		.cb	= str_ignore_error_cb,
+		.off1	= td_var_offset(ignore_error_nr),
 		.help	= "Set a specific list of errors to ignore",
 		.parent	= "rw",
 		.category = FIO_OPT_C_GENERAL,
@@ -3803,6 +3909,39 @@
 	return opts_copy;
 }
 
+static void show_closest_option(const char *opt)
+{
+	int best_option, best_distance;
+	int i, distance;
+	char *name;
+
+	if (!strlen(opt))
+		return;
+
+	name = strdup(opt);
+	i = 0;
+	while (name[i] != '\0' && name[i] != '=')
+		i++;
+	name[i] = '\0';
+
+	best_option = -1;
+	best_distance = INT_MAX;
+	i = 0;
+	while (fio_options[i].name) {
+		distance = string_distance(name, fio_options[i].name);
+		if (distance < best_distance) {
+			best_distance = distance;
+			best_option = i;
+		}
+		i++;
+	}
+
+	if (best_option != -1)
+		log_err("Did you mean %s?\n", fio_options[best_option].name);
+
+	free(name);
+}
+
 int fio_options_parse(struct thread_data *td, char **opts, int num_opts,
 			int dump_cmdline)
 {
@@ -3817,6 +3956,9 @@
 		int newret = parse_option(opts_copy[i], opts[i], fio_options,
 						&o, td, dump_cmdline);
 
+		if (!newret && o)
+			fio_option_mark_set(&td->o, o);
+
 		if (opts_copy[i]) {
 			if (newret && !o) {
 				unknown++;
@@ -3838,6 +3980,7 @@
 		for (i = 0; i < num_opts; i++) {
 			struct fio_option *o = NULL;
 			int newret = 1;
+
 			if (!opts_copy[i])
 				continue;
 
@@ -3847,9 +3990,10 @@
 						      td->eo, dump_cmdline);
 
 			ret |= newret;
-			if (!o)
+			if (!o) {
 				log_err("Bad option <%s>\n", opts[i]);
-
+				show_closest_option(opts[i]);
+			}
 			free(opts_copy[i]);
 			opts_copy[i] = NULL;
 		}
@@ -3861,7 +4005,18 @@
 
 int fio_cmd_option_parse(struct thread_data *td, const char *opt, char *val)
 {
-	return parse_cmd_option(opt, val, fio_options, td);
+	int ret;
+
+	ret = parse_cmd_option(opt, val, fio_options, td);
+	if (!ret) {
+		struct fio_option *o;
+
+		o = find_option(fio_options, opt);
+		if (o)
+			fio_option_mark_set(&td->o, o);
+	}
+
+	return ret;
 }
 
 int fio_cmd_ioengine_option_parse(struct thread_data *td, const char *opt,
@@ -4023,3 +4178,60 @@
 	return find_option(fio_options, name);
 }
 
+static struct fio_option *find_next_opt(struct thread_options *o,
+					struct fio_option *from,
+					unsigned int off1)
+{
+	struct fio_option *opt;
+
+	if (!from)
+		from = &fio_options[0];
+	else
+		from++;
+
+	opt = NULL;
+	do {
+		if (off1 == from->off1) {
+			opt = from;
+			break;
+		}
+		from++;
+	} while (from->name);
+
+	return opt;
+}
+
+static int opt_is_set(struct thread_options *o, struct fio_option *opt)
+{
+	unsigned int opt_off, index, offset;
+
+	opt_off = opt - &fio_options[0];
+	index = opt_off / (8 * sizeof(uint64_t));
+	offset = opt_off & ((8 * sizeof(uint64_t)) - 1);
+	return (o->set_options[index] & (1UL << offset)) != 0;
+}
+
+int __fio_option_is_set(struct thread_options *o, unsigned int off1)
+{
+	struct fio_option *opt, *next;
+
+	next = NULL;
+	while ((opt = find_next_opt(o, next, off1)) != NULL) {
+		if (opt_is_set(o, opt))
+			return 1;
+
+		next = opt;
+	}
+
+	return 0;
+}
+
+void fio_option_mark_set(struct thread_options *o, struct fio_option *opt)
+{
+	unsigned int opt_off, index, offset;
+
+	opt_off = opt - &fio_options[0];
+	index = opt_off / (8 * sizeof(uint64_t));
+	offset = opt_off & ((8 * sizeof(uint64_t)) - 1);
+	o->set_options[index] |= 1UL << offset;
+}
diff --git a/options.h b/options.h
index de9f610..36fd35d 100644
--- a/options.h
+++ b/options.h
@@ -22,6 +22,22 @@
 
 extern struct fio_option fio_options[FIO_MAX_OPTS];
 
+extern int __fio_option_is_set(struct thread_options *, unsigned int off);
+
+#define fio_option_is_set(__td, name)					\
+({									\
+	const unsigned int off = td_var_offset(name);			\
+	int __r = __fio_option_is_set((__td), off);			\
+	if (__r == -1) {						\
+		dprint(FD_PARSE, "option %s/%u not found in map\n",	\
+				__fio_stringify(name), off);		\
+		__r = 0;						\
+	}								\
+	__r;								\
+})
+
+extern void fio_option_mark_set(struct thread_options *, struct fio_option *);
+
 static inline int o_match(struct fio_option *o, const char *opt)
 {
 	if (!strcmp(o->name, opt))
@@ -98,6 +114,7 @@
 	__FIO_OPT_G_ACT,
 	__FIO_OPT_G_LATPROF,
         __FIO_OPT_G_RBD,
+        __FIO_OPT_G_GFAPI,
 	__FIO_OPT_G_NR,
 
 	FIO_OPT_G_RATE		= (1U << __FIO_OPT_G_RATE),
@@ -128,6 +145,7 @@
 	FIO_OPT_G_ACT		= (1U << __FIO_OPT_G_ACT),
 	FIO_OPT_G_LATPROF	= (1U << __FIO_OPT_G_LATPROF),
 	FIO_OPT_G_RBD		= (1U << __FIO_OPT_G_RBD),
+	FIO_OPT_G_GFAPI		= (1U << __FIO_OPT_G_GFAPI),
 	FIO_OPT_G_INVALID	= (1U << __FIO_OPT_G_NR),
 };
 
diff --git a/os/os-android.h b/os/os-android.h
index 6b074cd..172ad9a 100644
--- a/os/os-android.h
+++ b/os/os-android.h
@@ -38,6 +38,13 @@
 
 #define OS_MAP_ANON		MAP_ANONYMOUS
 
+#ifndef POSIX_MADV_DONTNEED
+#define posix_madvise   madvise
+#define POSIX_MADV_DONTNEED MADV_DONTNEED
+#define POSIX_MADV_SEQUENTIAL	MADV_SEQUENTIAL
+#define POSIX_MADV_RANDOM	MADV_RANDOM
+#endif
+
 #ifdef MADV_REMOVE
 #define FIO_MADV_FREE	MADV_REMOVE
 #endif
diff --git a/os/os-dragonfly.h b/os/os-dragonfly.h
new file mode 100644
index 0000000..cc3de31
--- /dev/null
+++ b/os/os-dragonfly.h
@@ -0,0 +1,61 @@
+#ifndef FIO_OS_DRAGONFLY_H
+#define FIO_OS_DRAGONFLY_H
+
+#define	FIO_OS	os_dragonfly
+
+#include <errno.h>
+#include <sys/param.h>
+/* XXX hack to avoid conflicts between rbtree.h and <sys/rb.h> */
+#define	rb_node	_rb_node
+#include <sys/sysctl.h>
+#undef rb_node
+#undef rb_left
+#undef rb_right
+
+#include "../file.h"
+
+#define FIO_HAVE_ODIRECT
+#define FIO_USE_GENERIC_BDEV_SIZE
+#define FIO_USE_GENERIC_RAND
+#define FIO_USE_GENERIC_INIT_RANDOM_STATE
+#define FIO_HAVE_GETTID
+
+#undef	FIO_HAVE_CPU_AFFINITY	/* XXX notyet */
+
+#define OS_MAP_ANON		MAP_ANON
+
+#ifndef PTHREAD_STACK_MIN
+#define PTHREAD_STACK_MIN 4096
+#endif
+
+#define fio_swap16(x)	bswap16(x)
+#define fio_swap32(x)	bswap32(x)
+#define fio_swap64(x)	bswap64(x)
+
+typedef off_t off64_t;
+
+static inline int blockdev_invalidate_cache(struct fio_file *f)
+{
+	return EINVAL;
+}
+
+static inline unsigned long long os_phys_mem(void)
+{
+	int mib[2] = { CTL_HW, HW_PHYSMEM };
+	uint64_t mem;
+	size_t len = sizeof(mem);
+
+	sysctl(mib, 2, &mem, &len, NULL, 0);
+	return mem;
+}
+
+static inline int gettid(void)
+{
+	return (int) lwp_gettid();
+}
+
+#ifdef MADV_FREE
+#define FIO_MADV_FREE	MADV_FREE
+#endif
+
+#endif
diff --git a/os/os-linux.h b/os/os-linux.h
index 81d0402..e193634 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -15,6 +15,7 @@
 #include <linux/unistd.h>
 #include <linux/raw.h>
 #include <linux/major.h>
+#include <byteswap.h>
 
 #include "binject.h"
 #include "../file.h"
@@ -208,9 +209,21 @@
 #define FIO_MADV_FREE	MADV_REMOVE
 #endif
 
+#if defined(__builtin_bswap16)
+#define fio_swap16(x)	__builtin_bswap16(x)
+#else
 #define fio_swap16(x)	__bswap_16(x)
+#endif
+#if defined(__builtin_bswap32)
+#define fio_swap32(x)	__builtin_bswap32(x)
+#else
 #define fio_swap32(x)	__bswap_32(x)
+#endif
+#if defined(__builtin_bswap64)
+#define fio_swap64(x)	__builtin_bswap64(x)
+#else
 #define fio_swap64(x)	__bswap_64(x)
+#endif
 
 #define CACHE_LINE_FILE	\
 	"/sys/devices/system/cpu/cpu0/cache/index0/coherency_line_size"
diff --git a/os/os.h b/os/os.h
index b8eee66..7cb8121 100644
--- a/os/os.h
+++ b/os/os.h
@@ -21,6 +21,7 @@
 	os_solaris,
 	os_windows,
 	os_android,
+	os_dragonfly,
 
 	os_nr,
 };
@@ -45,6 +46,8 @@
 #include "os-hpux.h"
 #elif defined(WIN32)
 #include "os-windows.h"
+#elif defined (__DragonFly__)
+#include "os-dragonfly.h"
 #else
 #error "unsupported os"
 #endif
@@ -202,23 +205,29 @@
 
 #ifdef FIO_INTERNAL
 #define le16_to_cpu(val) ({			\
+	typecheck(uint16_t, val);		\
 	__le16_to_cpu(val);			\
 })
 #define le32_to_cpu(val) ({			\
+	typecheck(uint32_t, val);		\
 	__le32_to_cpu(val);			\
 })
 #define le64_to_cpu(val) ({			\
-	__le64_to_cpu(val);				\
+	typecheck(uint64_t, val);		\
+	__le64_to_cpu(val);			\
 })
 #endif
 
 #define cpu_to_le16(val) ({			\
+	typecheck(uint16_t, val);		\
 	__cpu_to_le16(val);			\
 })
 #define cpu_to_le32(val) ({			\
+	typecheck(uint32_t, val);		\
 	__cpu_to_le32(val);			\
 })
 #define cpu_to_le64(val) ({			\
+	typecheck(uint64_t, val);		\
 	__cpu_to_le64(val);			\
 })
 
diff --git a/os/windows/install.wxs b/os/windows/install.wxs
index bd89a7b..74f1d28 100755
--- a/os/windows/install.wxs
+++ b/os/windows/install.wxs
@@ -10,7 +10,7 @@
 	<Product Id="*"
 	  Codepage="1252" Language="1033"
 	  Manufacturer="fio" Name="fio"
-	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.1.9">
+	  UpgradeCode="2338A332-5511-43CF-B9BD-5C60496CCFCC" Version="2.2.6">
 		<Package
 		  Description="Flexible IO Tester"
 		  InstallerVersion="301" Keywords="Installer,MSI,Database"
diff --git a/parse.c b/parse.c
index 188f728..7912212 100644
--- a/parse.c
+++ b/parse.c
@@ -18,6 +18,10 @@
 #include "minmax.h"
 #include "lib/ieee754.h"
 
+#ifdef CONFIG_ARITHMETIC
+#include "y.tab.h"
+#endif
+
 static struct fio_option *__fio_options;
 
 static int vp_cmp(const void *p1, const void *p2)
@@ -264,34 +268,72 @@
 	return __get_mult_bytes(p, data, percent);
 }
 
+extern int evaluate_arithmetic_expression(const char *buffer, long long *ival,
+					  double *dval, double implied_units,
+					  int is_time);
+
 /*
  * Convert string into a floating number. Return 1 for success and 0 otherwise.
  */
-int str_to_float(const char *str, double *val)
+int str_to_float(const char *str, double *val, int is_time)
 {
-	return (1 == sscanf(str, "%lf", val));
+#ifdef CONFIG_ARITHMETIC
+	int rc;
+	long long ival;
+	double dval;
+
+	if (str[0] == '(') {
+		rc = evaluate_arithmetic_expression(str, &ival, &dval, 1.0, is_time);
+		if (!rc) {
+			*val = dval;
+			return 1;
+		}
+	}
+#endif
+	return 1 == sscanf(str, "%lf", val);
 }
 
 /*
  * convert string into decimal value, noting any size suffix
  */
 int str_to_decimal(const char *str, long long *val, int kilo, void *data,
-		   int is_seconds)
+		   int is_seconds, int is_time)
 {
 	int len, base;
+	int rc = 1;
+#ifdef CONFIG_ARITHMETIC
+	long long ival;
+	double dval;
+	double implied_units = 1.0;
+#endif
 
 	len = strlen(str);
 	if (!len)
 		return 1;
 
-	if (strstr(str, "0x") || strstr(str, "0X"))
-		base = 16;
-	else
-		base = 10;
+#ifdef CONFIG_ARITHMETIC
+	if (is_seconds)
+		implied_units = 1000000.0;
+	if (str[0] == '(')
+		rc = evaluate_arithmetic_expression(str, &ival, &dval, implied_units, is_time);
+	if (str[0] == '(' && !rc) {
+		if (!kilo && is_seconds)
+			*val = ival / 1000000LL;
+		else
+			*val = ival;
+	}
+#endif
 
-	*val = strtoll(str, NULL, base);
-	if (*val == LONG_MAX && errno == ERANGE)
-		return 1;
+	if (rc == 1) {
+		if (strstr(str, "0x") || strstr(str, "0X"))
+			base = 16;
+		else
+			base = 10;
+
+		*val = strtoll(str, NULL, base);
+		if (*val == LONG_MAX && errno == ERANGE)
+			return 1;
+	}
 
 	if (kilo) {
 		unsigned long long mult;
@@ -310,12 +352,12 @@
 
 int check_str_bytes(const char *p, long long *val, void *data)
 {
-	return str_to_decimal(p, val, 1, data, 0);
+	return str_to_decimal(p, val, 1, data, 0, 0);
 }
 
 int check_str_time(const char *p, long long *val, int is_seconds)
 {
-	return str_to_decimal(p, val, 0, NULL, is_seconds);
+	return str_to_decimal(p, val, 0, NULL, is_seconds, 1);
 }
 
 void strip_blank_front(char **p)
@@ -357,7 +399,7 @@
 {
 	long long __val;
 
-	if (!str_to_decimal(str, &__val, 1, data, 0)) {
+	if (!str_to_decimal(str, &__val, 1, data, 0, 0)) {
 		*val = __val;
 		return 0;
 	}
@@ -380,7 +422,7 @@
 	return 1;
 }
 
-static int opt_len(const char *str)
+static size_t opt_len(const char *str)
 {
 	char *postfix;
 
@@ -461,6 +503,10 @@
 		fio_opt_str_val_fn *fn = o->cb;
 		char tmp[128], *p;
 
+		if (!is_time && o->is_time)
+			is_time = o->is_time;
+
+		tmp[sizeof(tmp) - 1] = '\0';
 		strncpy(tmp, ptr, sizeof(tmp) - 1);
 		p = strchr(tmp, ',');
 		if (p)
@@ -564,7 +610,7 @@
 					o->maxlen);
 			return 1;
 		}
-		if (!str_to_float(ptr, &uf)) {
+		if (!str_to_float(ptr, &uf, 0)) { /* this breaks if we ever have lists of times */
 			log_err("not a floating point value: %s\n", ptr);
 			return 1;
 		}
@@ -660,6 +706,7 @@
 		char tmp[128];
 		char *p1, *p2;
 
+		tmp[sizeof(tmp) - 1] = '\0';
 		strncpy(tmp, ptr, sizeof(tmp) - 1);
 
 		/* Handle bsrange with separate read,write values: */
@@ -972,7 +1019,7 @@
  * Option match, levenshtein distance. Handy for not quite remembering what
  * the option name is.
  */
-static int string_distance(const char *s1, const char *s2)
+int string_distance(const char *s1, const char *s2)
 {
 	unsigned int s1_len = strlen(s1);
 	unsigned int s2_len = strlen(s2);
@@ -990,11 +1037,13 @@
 		q[0] = p[0] + 1;
 		for (j = 1; j <= s2_len; j++) {
 			unsigned int sub = p[j - 1];
+			unsigned int pmin;
 
 			if (s1[i - 1] != s2[j - 1])
 				sub++;
 
-			q[j] = min(p[j] + 1, min(q[j - 1] + 1, sub));
+			pmin = min(q[j - 1] + 1, sub);
+			q[j] = min(p[j] + 1, pmin);
 		}
 		r = p;
 		p = q;
@@ -1167,7 +1216,7 @@
 		o->minfp = DBL_MIN;
 		o->maxfp = DBL_MAX;
 	}
-	if (o->type == FIO_OPT_STR_SET && o->def) {
+	if (o->type == FIO_OPT_STR_SET && o->def && !o->no_warn_def) {
 		log_err("Option %s: string set option with"
 				" default will always be true\n", o->name);
 	}
@@ -1181,8 +1230,6 @@
 	if (o->type == FIO_OPT_STR || o->type == FIO_OPT_STR_STORE ||
 	    o->type == FIO_OPT_STR_MULTI)
 		return;
-	if (o->cb && (o->off1 || o->off2 || o->off3 || o->off4))
-		log_err("Option %s: both cb and offset given\n", o->name);
 }
 
 /*
diff --git a/parse.h b/parse.h
index c797b92..15f2e06 100644
--- a/parse.h
+++ b/parse.h
@@ -73,6 +73,8 @@
 	unsigned int group;		/* who to group with */
 	void *gui_data;
 	int is_seconds;			/* time value with seconds base */
+	int is_time;			/* time based value */
+	int no_warn_def;
 };
 
 typedef int (str_cb_fn)(void *, char *);
@@ -88,10 +90,12 @@
 
 extern void strip_blank_front(char **);
 extern void strip_blank_end(char *);
-extern int str_to_decimal(const char *, long long *, int, void *, int);
+extern int str_to_decimal(const char *, long long *, int, void *, int, int);
 extern int check_str_bytes(const char *p, long long *val, void *data);
 extern int check_str_time(const char *p, long long *val, int);
-extern int str_to_float(const char *str, double *val);
+extern int str_to_float(const char *str, double *val, int is_time);
+
+extern int string_distance(const char *s1, const char *s2);
 
 /*
  * Handlers for the options
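With the extra is_time argument, time-based options can flow through str_to_decimal() and, when fio is built with CONFIG_ARITHMETIC, a value starting with '(' is handed to the expression evaluator with the right implied units. A minimal sketch of a caller, mirroring check_str_time() above (hypothetical function name, not part of this change):

/*
 * Illustrative only: parse "10" or "(5*2)" as a time value given in
 * seconds: kilo=0, data=NULL, is_seconds=1, is_time=1.
 */
static int parse_runtime_str(const char *str, long long *val)
{
	return str_to_decimal(str, val, 0, NULL, 1, 1);
}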
diff --git a/server.c b/server.c
index 077dce5..c249849 100644
--- a/server.c
+++ b/server.c
@@ -24,6 +24,8 @@
 #include "server.h"
 #include "crc/crc16.h"
 #include "lib/ieee754.h"
+#include "verify.h"
+#include "smalloc.h"
 
 int fio_net_port = FIO_NET_PORT;
 
@@ -41,6 +43,7 @@
 static unsigned int has_zlib = 0;
 #endif
 static unsigned int use_zlib;
+static char me[128];
 
 struct fio_fork_item {
 	struct flist_head list;
@@ -50,6 +53,13 @@
 	pid_t pid;
 };
 
+struct cmd_reply {
+	struct fio_mutex lock;
+	void *data;
+	size_t size;
+	int error;
+};
+
 static const char *fio_server_ops[FIO_NET_CMD_NR] = {
 	"",
 	"QUIT",
@@ -67,8 +77,12 @@
 	"DISK_UTIL",
 	"SERVER_START",
 	"ADD_JOB",
-	"CMD_RUN",
-	"CMD_IOLOG",
+	"RUN",
+	"IOLOG",
+	"UPDATE_JOB",
+	"LOAD_FILE",
+	"VTRIGGER",
+	"SENDFILE",
 };
 
 const char *fio_server_op(unsigned int op)
@@ -285,14 +299,14 @@
 		/* zero-terminate text input */
 		if (cmdret->pdu_len) {
 			if (cmdret->opcode == FIO_NET_CMD_TEXT) {
-				struct cmd_text_pdu *pdu = (struct cmd_text_pdu *) cmdret->payload;
-				char *buf = (char *) pdu->buf;
+				struct cmd_text_pdu *__pdu = (struct cmd_text_pdu *) cmdret->payload;
+				char *buf = (char *) __pdu->buf;
 
-				buf[pdu->buf_len] = '\0';
+				buf[__pdu->buf_len] = '\0';
 			} else if (cmdret->opcode == FIO_NET_CMD_JOB) {
-				struct cmd_job_pdu *pdu = (struct cmd_job_pdu *) cmdret->payload;
-				char *buf = (char *) pdu->buf;
-				int len = le32_to_cpu(pdu->buf_len);
+				struct cmd_job_pdu *__pdu = (struct cmd_job_pdu *) cmdret->payload;
+				char *buf = (char *) __pdu->buf;
+				int len = le32_to_cpu(__pdu->buf_len);
 
 				buf[len] = '\0';
 			}
@@ -319,7 +333,7 @@
 
 	reply = calloc(1, sizeof(*reply));
 	INIT_FLIST_HEAD(&reply->list);
-	gettimeofday(&reply->tv, NULL);
+	fio_gettime(&reply->tv, NULL);
 	reply->saved_tag = tag;
 	reply->opcode = opcode;
 
@@ -550,11 +564,34 @@
 	fio_server_check_fork_items(conn_list);
 }
 
+static int handle_load_file_cmd(struct fio_net_cmd *cmd)
+{
+	struct cmd_load_file_pdu *pdu = (struct cmd_load_file_pdu *) cmd->payload;
+	void *file_name = pdu->file;
+	struct cmd_start_pdu spdu;
+
+	dprint(FD_NET, "server: loading local file %s\n", (char *) file_name);
+
+	pdu->name_len = le16_to_cpu(pdu->name_len);
+	pdu->client_type = le16_to_cpu(pdu->client_type);
+
+	if (parse_jobs_ini(file_name, 0, 0, pdu->client_type)) {
+		fio_net_send_quit(server_fd);
+		return -1;
+	}
+
+	spdu.jobs = cpu_to_le32(thread_number);
+	spdu.stat_outputs = cpu_to_le32(stat_number);
+	fio_net_send_cmd(server_fd, FIO_NET_CMD_START, &spdu, sizeof(spdu), NULL, NULL);
+	return 0;
+}
+
 static int handle_run_cmd(struct flist_head *job_list, struct fio_net_cmd *cmd)
 {
 	pid_t pid;
 	int ret;
 
+	fio_time_init();
 	set_genesis_time();
 
 	pid = fork();
@@ -636,6 +673,8 @@
 
 	dprint(FD_NET, "server: sending probe reply\n");
 
+	strcpy(me, (char *) pdu->server);
+
 	memset(&probe, 0, sizeof(probe));
 	gethostname((char *) probe.hostname, sizeof(probe.hostname));
 #ifdef CONFIG_BIG_ENDIAN
@@ -665,22 +704,14 @@
 static int handle_send_eta_cmd(struct fio_net_cmd *cmd)
 {
 	struct jobs_eta *je;
-	size_t size;
 	uint64_t tag = cmd->tag;
+	size_t size;
 	int i;
 
-	if (!thread_number)
+	je = get_jobs_eta(1, &size);
+	if (!je)
 		return 0;
 
-	size = sizeof(*je) + thread_number * sizeof(char) + 1;
-	je = malloc(size);
-	memset(je, 0, size);
-
-	if (!calc_thread_status(je, 1)) {
-		free(je);
-		return 0;
-	}
-
 	dprint(FD_NET, "server sending status\n");
 
 	je->nr_running		= cpu_to_le32(je->nr_running);
@@ -739,6 +770,31 @@
 	return 0;
 }
 
+static int handle_trigger_cmd(struct fio_net_cmd *cmd)
+{
+	struct cmd_vtrigger_pdu *pdu = (struct cmd_vtrigger_pdu *) cmd->payload;
+	char *buf = (char *) pdu->cmd;
+	struct all_io_list *rep;
+	size_t sz;
+
+	pdu->len = le16_to_cpu(pdu->len);
+	buf[pdu->len] = '\0';
+
+	rep = get_all_io_list(IO_LIST_ALL, &sz);
+	if (!rep) {
+		struct all_io_list state;
+
+		state.threads = cpu_to_le64((uint64_t) 0);
+		fio_net_send_cmd(server_fd, FIO_NET_CMD_VTRIGGER, &state, sizeof(state), NULL, NULL);
+	} else {
+		fio_net_send_cmd(server_fd, FIO_NET_CMD_VTRIGGER, rep, sz, NULL, NULL);
+		free(rep);
+	}
+
+	exec_trigger(buf);
+	return 0;
+}
+
 static int handle_command(struct flist_head *job_list, struct fio_net_cmd *cmd)
 {
 	int ret;
@@ -754,6 +810,9 @@
 	case FIO_NET_CMD_EXIT:
 		exit_backend = 1;
 		return -1;
+	case FIO_NET_CMD_LOAD_FILE:
+		ret = handle_load_file_cmd(cmd);
+		break;
 	case FIO_NET_CMD_JOB:
 		ret = handle_job_cmd(cmd);
 		break;
@@ -772,6 +831,35 @@
 	case FIO_NET_CMD_UPDATE_JOB:
 		ret = handle_update_job_cmd(cmd);
 		break;
+	case FIO_NET_CMD_VTRIGGER:
+		ret = handle_trigger_cmd(cmd);
+		break;
+	case FIO_NET_CMD_SENDFILE: {
+		struct cmd_sendfile_reply *in;
+		struct cmd_reply *rep;
+
+		rep = (struct cmd_reply *) (uintptr_t) cmd->tag;
+
+		in = (struct cmd_sendfile_reply *) cmd->payload;
+		in->size = le32_to_cpu(in->size);
+		in->error = le32_to_cpu(in->error);
+		if (in->error) {
+			ret = 1;
+			rep->error = in->error;
+		} else {
+			ret = 0;
+			rep->data = smalloc(in->size);
+			if (!rep->data) {
+				ret = 1;
+				rep->error = ENOMEM;
+			} else {
+				rep->size = in->size;
+				memcpy(rep->data, in->data, in->size);
+			}
+		}
+		fio_mutex_up(&rep->lock);
+		break;
+		}
 	default:
 		log_err("fio: unknown opcode: %s\n", fio_server_op(cmd->opcode));
 		ret = 1;
@@ -960,8 +1048,8 @@
 	/*
 	 * Encode to IEEE 754 for network transfer
 	 */
-	dst->mean.u.i	= __cpu_to_le64(fio_double_to_uint64(src->mean.u.f));
-	dst->S.u.i	= __cpu_to_le64(fio_double_to_uint64(src->S.u.f));
+	dst->mean.u.i	= cpu_to_le64(fio_double_to_uint64(src->mean.u.f));
+	dst->S.u.i	= cpu_to_le64(fio_double_to_uint64(src->S.u.f));
 }
 
 static void convert_gs(struct group_run_stats *dst, struct group_run_stats *src)
@@ -1020,12 +1108,13 @@
 	p.ts.minf		= cpu_to_le64(ts->minf);
 	p.ts.majf		= cpu_to_le64(ts->majf);
 	p.ts.clat_percentiles	= cpu_to_le64(ts->clat_percentiles);
+	p.ts.percentile_precision = cpu_to_le64(ts->percentile_precision);
 
 	for (i = 0; i < FIO_IO_U_LIST_MAX_LEN; i++) {
 		fio_fp64_t *src = &ts->percentile_list[i];
 		fio_fp64_t *dst = &p.ts.percentile_list[i];
 
-		dst->u.i = __cpu_to_le64(fio_double_to_uint64(src->u.f));
+		dst->u.i = cpu_to_le64(fio_double_to_uint64(src->u.f));
 	}
 
 	for (i = 0; i < FIO_IO_U_MAP_NR; i++) {
@@ -1046,6 +1135,7 @@
 	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
 		p.ts.total_io_u[i]	= cpu_to_le64(ts->total_io_u[i]);
 		p.ts.short_io_u[i]	= cpu_to_le64(ts->short_io_u[i]);
+		p.ts.drop_io_u[i]	= cpu_to_le64(ts->drop_io_u[i]);
 	}
 
 	p.ts.total_submit	= cpu_to_le64(ts->total_submit);
@@ -1066,7 +1156,7 @@
 	p.ts.latency_depth	= cpu_to_le32(ts->latency_depth);
 	p.ts.latency_target	= cpu_to_le64(ts->latency_target);
 	p.ts.latency_window	= cpu_to_le64(ts->latency_window);
-	p.ts.latency_percentile.u.i = __cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f));
+	p.ts.latency_percentile.u.i = cpu_to_le64(fio_double_to_uint64(ts->latency_percentile.u.f));
 
 	convert_gs(&p.rs, rs);
 
@@ -1088,16 +1178,16 @@
 	int i;
 
 	for (i = 0; i < 2; i++) {
-		dst->ios[i]	= cpu_to_le32(src->ios[i]);
-		dst->merges[i]	= cpu_to_le32(src->merges[i]);
+		dst->ios[i]	= cpu_to_le64(src->ios[i]);
+		dst->merges[i]	= cpu_to_le64(src->merges[i]);
 		dst->sectors[i]	= cpu_to_le64(src->sectors[i]);
-		dst->ticks[i]	= cpu_to_le32(src->ticks[i]);
+		dst->ticks[i]	= cpu_to_le64(src->ticks[i]);
 	}
 
-	dst->io_ticks		= cpu_to_le32(src->io_ticks);
-	dst->time_in_queue	= cpu_to_le32(src->time_in_queue);
+	dst->io_ticks		= cpu_to_le64(src->io_ticks);
+	dst->time_in_queue	= cpu_to_le64(src->time_in_queue);
 	dst->slavecount		= cpu_to_le32(src->slavecount);
-	dst->max_util.u.i	= __cpu_to_le64(fio_double_to_uint64(src->max_util.u.f));
+	dst->max_util.u.i	= cpu_to_le64(fio_double_to_uint64(src->max_util.u.f));
 }
 
 static void convert_dus(struct disk_util_stat *dst, struct disk_util_stat *src)
@@ -1108,14 +1198,14 @@
 	strncpy((char *) dst->name, (char *) src->name, FIO_DU_NAME_SZ - 1);
 
 	for (i = 0; i < 2; i++) {
-		dst->s.ios[i]		= cpu_to_le32(src->s.ios[i]);
-		dst->s.merges[i]	= cpu_to_le32(src->s.merges[i]);
+		dst->s.ios[i]		= cpu_to_le64(src->s.ios[i]);
+		dst->s.merges[i]	= cpu_to_le64(src->s.merges[i]);
 		dst->s.sectors[i]	= cpu_to_le64(src->s.sectors[i]);
-		dst->s.ticks[i]		= cpu_to_le32(src->s.ticks[i]);
+		dst->s.ticks[i]		= cpu_to_le64(src->s.ticks[i]);
 	}
 
-	dst->s.io_ticks		= cpu_to_le32(src->s.io_ticks);
-	dst->s.time_in_queue	= cpu_to_le32(src->s.time_in_queue);
+	dst->s.io_ticks		= cpu_to_le64(src->s.io_ticks);
+	dst->s.time_in_queue	= cpu_to_le64(src->s.time_in_queue);
 	dst->s.msec		= cpu_to_le64(src->s.msec);
 }
 
@@ -1184,11 +1274,10 @@
 	}
 
 	stream.next_in = (void *) log->log;
-	stream.avail_in = log->nr_samples * sizeof(struct io_sample);
+	stream.avail_in = log->nr_samples * log_entry_sz(log);
 
 	do {
 		unsigned int this_len, flags = 0;
-		int ret;
 
 		stream.avail_out = FIO_SERVER_MAX_FRAGMENT_PDU;
 		stream.next_out = out_pdu;
@@ -1221,8 +1310,8 @@
 	struct cmd_iolog_pdu pdu;
 	int i, ret = 0;
 
+	pdu.nr_samples = cpu_to_le64(log->nr_samples);
 	pdu.thread_number = cpu_to_le32(td->thread_number);
-	pdu.nr_samples = __cpu_to_le32(log->nr_samples);
 	pdu.log_type = cpu_to_le32(log->log_type);
 	pdu.compressed = cpu_to_le32(use_zlib);
 
@@ -1230,12 +1319,18 @@
 	pdu.name[FIO_NET_NAME_MAX - 1] = '\0';
 
 	for (i = 0; i < log->nr_samples; i++) {
-		struct io_sample *s = &log->log[i];
+		struct io_sample *s = get_sample(log, i);
 
-		s->time	= cpu_to_le64(s->time);
-		s->val	= cpu_to_le64(s->val);
-		s->ddir	= cpu_to_le32(s->ddir);
-		s->bs	= cpu_to_le32(s->bs);
+		s->time		= cpu_to_le64(s->time);
+		s->val		= cpu_to_le64(s->val);
+		s->__ddir	= cpu_to_le32(s->__ddir);
+		s->bs		= cpu_to_le32(s->bs);
+
+		if (log->log_offset) {
+			struct io_sample_offset *so = (void *) s;
+
+			so->offset = cpu_to_le64(so->offset);
+		}
 	}
 
 	/*
@@ -1253,7 +1348,7 @@
 		return fio_send_iolog_gz(&pdu, log);
 
 	return fio_send_cmd_ext_pdu(server_fd, FIO_NET_CMD_IOLOG, log->log,
-			log->nr_samples * sizeof(struct io_sample), 0, 0);
+			log->nr_samples * log_entry_sz(log), 0, 0);
 }
 
 void fio_server_send_add_job(struct thread_data *td)
@@ -1275,6 +1370,72 @@
 	fio_net_send_simple_cmd(server_fd, FIO_NET_CMD_SERVER_START, 0, NULL);
 }
 
+int fio_server_get_verify_state(const char *name, int threadnumber,
+				void **datap)
+{
+	struct thread_io_list *s;
+	struct cmd_sendfile out;
+	struct cmd_reply *rep;
+	uint64_t tag;
+	void *data;
+
+	dprint(FD_NET, "server: request verify state\n");
+
+	rep = smalloc(sizeof(*rep));
+	if (!rep) {
+		log_err("fio: smalloc pool too small\n");
+		return 1;
+	}
+
+	__fio_mutex_init(&rep->lock, FIO_MUTEX_LOCKED);
+	rep->data = NULL;
+	rep->error = 0;
+
+	verify_state_gen_name((char *) out.path, sizeof(out.path), name, me,
+				threadnumber);
+	tag = (uint64_t) (uintptr_t) rep;
+	fio_net_send_cmd(server_fd, FIO_NET_CMD_SENDFILE, &out, sizeof(out),
+				&tag, NULL);
+
+	/*
+	 * Wait for the backend to receive the reply
+	 */
+	if (fio_mutex_down_timeout(&rep->lock, 10)) {
+		log_err("fio: timed out waiting for reply\n");
+		goto fail;
+	}
+
+	if (rep->error) {
+		log_err("fio: failure on receiving state file: %s\n", strerror(rep->error));
+fail:
+		*datap = NULL;
+		sfree(rep);
+		fio_net_send_quit(server_fd);
+		return 1;
+	}
+
+	/*
+	 * The format is verify_state_hdr, then thread_io_list. Verify
+	 * the header, and the thread_io_list checksum
+	 */
+	s = rep->data + sizeof(struct verify_state_hdr);
+	if (verify_state_hdr(rep->data, s))
+		goto fail;
+
+	/*
+	 * Don't need the header from now on, copy just the thread_io_list
+	 */
+	rep->size -= sizeof(struct verify_state_hdr);
+	data = malloc(rep->size);
+	memcpy(data, s, rep->size);
+	*datap = data;
+
+	sfree(rep->data);
+	__fio_mutex_remove(&rep->lock);
+	sfree(rep);
+	return 0;
+}
+
 static int fio_init_server_ip(void)
 {
 	struct sockaddr *addr;
@@ -1707,10 +1868,9 @@
 		free(pidfile);
 		return -1;
 	} else if (pid) {
-		int ret = write_pid(pid, pidfile);
-
+		ret = write_pid(pid, pidfile);
 		free(pidfile);
-		exit(ret);
+		_exit(ret);
 	}
 
 	setsid();
diff --git a/server.h b/server.h
index 4f09f28..dc5a69e 100644
--- a/server.h
+++ b/server.h
@@ -38,7 +38,7 @@
 };
 
 enum {
-	FIO_SERVER_VER			= 34,
+	FIO_SERVER_VER			= 42,
 
 	FIO_SERVER_MAX_FRAGMENT_PDU	= 1024,
 	FIO_SERVER_MAX_CMD_MB		= 2048,
@@ -61,7 +61,10 @@
 	FIO_NET_CMD_RUN			= 16,
 	FIO_NET_CMD_IOLOG		= 17,
 	FIO_NET_CMD_UPDATE_JOB		= 18,
-	FIO_NET_CMD_NR			= 19,
+	FIO_NET_CMD_LOAD_FILE		= 19,
+	FIO_NET_CMD_VTRIGGER		= 20,
+	FIO_NET_CMD_SENDFILE		= 21,
+	FIO_NET_CMD_NR			= 22,
 
 	FIO_NET_CMD_F_MORE		= 1UL << 0,
 
@@ -76,6 +79,31 @@
 	FIO_PROBE_FLAG_ZLIB		= 1UL << 0,
 };
 
+struct cmd_sendfile {
+	uint8_t path[FIO_NET_NAME_MAX];
+};
+
+struct cmd_sendfile_reply {
+	uint32_t size;
+	uint32_t error;
+	uint8_t data[0];
+};
+
+/*
+ * Client sends this to server on VTRIGGER, server sends back a full
+ * all_io_list structure.
+ */
+struct cmd_vtrigger_pdu {
+	uint16_t len;
+	uint8_t cmd[];
+};
+
+struct cmd_load_file_pdu {
+	uint16_t name_len;
+	uint16_t client_type;
+	uint8_t file[];
+};
+
 struct cmd_ts_pdu {
 	struct thread_stat ts;
 	struct group_run_stats rs;
@@ -88,6 +116,7 @@
 
 struct cmd_client_probe_pdu {
 	uint64_t flags;
+	uint8_t server[128];
 };
 
 struct cmd_probe_reply_pdu {
@@ -143,10 +172,11 @@
 };
 
 struct cmd_iolog_pdu {
+	uint64_t nr_samples;
 	uint32_t thread_number;
-	uint32_t nr_samples;
 	uint32_t log_type;
 	uint32_t compressed;
+	uint32_t log_offset;
 	uint8_t name[FIO_NET_NAME_MAX];
 	struct io_sample samples[0];
 };
@@ -167,11 +197,7 @@
 extern void fio_server_send_gs(struct group_run_stats *);
 extern void fio_server_send_du(void);
 extern void fio_server_idle_loop(void);
-
-extern int fio_clients_connect(void);
-extern int fio_clients_send_ini(const char *);
-extern void fio_client_add_cmd_option(void *, const char *);
-extern void fio_client_add_ini_file(void *, const char *);
+extern int fio_server_get_verify_state(const char *, int, void **);
 
 extern int fio_recv_data(int sk, void *p, unsigned int len);
 extern int fio_send_data(int sk, const void *p, unsigned int len);
diff --git a/smalloc.c b/smalloc.c
index c8f1642..b460d65 100644
--- a/smalloc.c
+++ b/smalloc.c
@@ -17,6 +17,7 @@
 #include "arch/arch.h"
 #include "os/os.h"
 #include "smalloc.h"
+#include "log.h"
 
 #define SMALLOC_REDZONE		/* define to detect memory corruption */
 
@@ -24,8 +25,8 @@
 #define SMALLOC_BPI	(sizeof(unsigned int) * 8)
 #define SMALLOC_BPL	(SMALLOC_BPB * SMALLOC_BPI)
 
-#define INITIAL_SIZE	8192*1024	/* new pool size */
-#define MAX_POOLS	128		/* maximum number of pools to setup */
+#define INITIAL_SIZE	16*1024*1024	/* new pool size */
+#define MAX_POOLS	8		/* maximum number of pools to setup */
 
 #define SMALLOC_PRE_RED		0xdeadbeefU
 #define SMALLOC_POST_RED	0x5aa55aa5U
@@ -180,6 +181,7 @@
 static int add_pool(struct pool *pool, unsigned int alloc_size)
 {
 	int bitmap_blocks;
+	int mmap_flags;
 	void *ptr;
 
 #ifdef SMALLOC_REDZONE
@@ -198,8 +200,14 @@
 	pool->nr_blocks = bitmap_blocks;
 	pool->free_blocks = bitmap_blocks * SMALLOC_BPB;
 
-	ptr = mmap(NULL, alloc_size, PROT_READ|PROT_WRITE,
-			MAP_SHARED | OS_MAP_ANON, -1, 0);
+	mmap_flags = OS_MAP_ANON;
+#ifdef CONFIG_ESX
+	mmap_flags |= MAP_PRIVATE;
+#else
+	mmap_flags |= MAP_SHARED;
+#endif
+	ptr = mmap(NULL, alloc_size, PROT_READ|PROT_WRITE, mmap_flags, -1, 0);
+
 	if (ptr == MAP_FAILED)
 		goto out_fail;
 
@@ -214,7 +222,7 @@
 	nr_pools++;
 	return 0;
 out_fail:
-	fprintf(stderr, "smalloc: failed adding pool\n");
+	log_err("smalloc: failed adding pool\n");
 	if (pool->map)
 		munmap(pool->map, pool->mmap_size);
 	return 1;
@@ -222,11 +230,21 @@
 
 void sinit(void)
 {
-	int ret;
+	int i, ret;
 
 	lock = fio_rwlock_init();
-	ret = add_pool(&mp[0], INITIAL_SIZE);
-	assert(!ret);
+
+	for (i = 0; i < MAX_POOLS; i++) {
+		ret = add_pool(&mp[i], INITIAL_SIZE);
+		if (ret)
+			break;
+	}
+
+	/*
+	 * If we added at least one pool, we should be OK for most
+	 * cases.
+	 */
+	assert(i);
 }
 
 static void cleanup_pool(struct pool *pool)
@@ -276,14 +294,14 @@
 	unsigned int *postred = postred_ptr(hdr);
 
 	if (hdr->prered != SMALLOC_PRE_RED) {
-		fprintf(stderr, "smalloc pre redzone destroyed!\n");
-		fprintf(stderr, "  ptr=%p, prered=%x, expected %x\n",
+		log_err("smalloc pre redzone destroyed!\n"
+			" ptr=%p, prered=%x, expected %x\n",
 				hdr, hdr->prered, SMALLOC_PRE_RED);
 		assert(0);
 	}
 	if (*postred != SMALLOC_POST_RED) {
-		fprintf(stderr, "smalloc post redzone destroyed!\n");
-		fprintf(stderr, "  ptr=%p, postred=%x, expected %x\n",
+		log_err("smalloc post redzone destroyed!\n"
+			"  ptr=%p, postred=%x, expected %x\n",
 				hdr, *postred, SMALLOC_POST_RED);
 		assert(0);
 	}
@@ -434,16 +452,17 @@
 
 void *smalloc(size_t size)
 {
-	unsigned int i;
+	unsigned int i, end_pool;
 
 	if (size != (unsigned int) size)
 		return NULL;
 
 	global_write_lock();
 	i = last_pool;
+	end_pool = nr_pools;
 
 	do {
-		for (; i < nr_pools; i++) {
+		for (; i < end_pool; i++) {
 			void *ptr = smalloc_pool(&mp[i], size);
 
 			if (ptr) {
@@ -453,29 +472,35 @@
 			}
 		}
 		if (last_pool) {
-			last_pool = 0;
+			end_pool = last_pool;
+			last_pool = i = 0;
 			continue;
 		}
 
-		if (nr_pools + 1 > MAX_POOLS)
-			break;
-		else {
-			i = nr_pools;
-			if (add_pool(&mp[nr_pools], size))
-				goto out;
-		}
+		break;
 	} while (1);
 
-out:
 	global_write_unlock();
 	return NULL;
 }
 
+void *scalloc(size_t nmemb, size_t size)
+{
+	void *ret;
+
+	ret = smalloc(nmemb * size);
+	if (ret)
+		memset(ret, 0, nmemb * size);
+
+	return ret;
+}
+
 char *smalloc_strdup(const char *str)
 {
-	char *ptr;
+	char *ptr = NULL;
 
 	ptr = smalloc(strlen(str) + 1);
-	strcpy(ptr, str);
+	if (ptr)
+		strcpy(ptr, str);
 	return ptr;
 }
diff --git a/smalloc.h b/smalloc.h
index f9a5e41..4b551e3 100644
--- a/smalloc.h
+++ b/smalloc.h
@@ -2,6 +2,7 @@
 #define FIO_SMALLOC_H
 
 extern void *smalloc(size_t);
+extern void *scalloc(size_t, size_t);
 extern void sfree(void *);
 extern char *smalloc_strdup(const char *);
 extern void sinit(void);
diff --git a/stat.c b/stat.c
index 3adb46e..6a3610f 100644
--- a/stat.c
+++ b/stat.c
@@ -14,7 +14,7 @@
 #include "lib/getrusage.h"
 #include "idletime.h"
 
-static struct fio_mutex *stat_mutex;
+struct fio_mutex *stat_mutex;
 
 void update_rusage_stat(struct thread_data *td)
 {
@@ -263,7 +263,7 @@
 void show_group_stats(struct group_run_stats *rs)
 {
 	char *p1, *p2, *p3, *p4;
-	const char *ddir_str[] = { "   READ", "  WRITE" , "   TRIM"};
+	const char *str[] = { "   READ", "  WRITE" , "   TRIM"};
 	int i;
 
 	log_info("\nRun status group %d (all jobs):\n", rs->groupid);
@@ -281,7 +281,7 @@
 
 		log_info("%s: io=%s, aggrb=%s/s, minb=%s/s, maxb=%s/s,"
 			 " mint=%llumsec, maxt=%llumsec\n",
-				rs->unified_rw_rep ? "  MIXED" : ddir_str[i],
+				rs->unified_rw_rep ? "  MIXED" : str[i],
 				p1, p2, p3, p4,
 				(unsigned long long) rs->min_run[i],
 				(unsigned long long) rs->max_run[i]);
@@ -363,7 +363,7 @@
 static void show_ddir_status(struct group_run_stats *rs, struct thread_stat *ts,
 			     int ddir)
 {
-	const char *ddir_str[] = { "read ", "write", "trim" };
+	const char *str[] = { "read ", "write", "trim" };
 	unsigned long min, max, runt;
 	unsigned long long bw, iops;
 	double mean, dev;
@@ -386,7 +386,7 @@
 	iops_p = num2str(iops, 6, 1, 0, 0);
 
 	log_info("  %s: io=%s, bw=%s/s, iops=%s, runt=%6llumsec\n",
-				rs->unified_rw_rep ? "mixed" : ddir_str[ddir],
+				rs->unified_rw_rep ? "mixed" : str[ddir],
 				io_p, bw_p, iops_p,
 				(unsigned long long) ts->runtime[ddir]);
 
@@ -504,11 +504,9 @@
 	unsigned long runtime;
 	double io_u_dist[FIO_IO_U_MAP_NR];
 	time_t time_p;
-	char time_buf[64];
+	char time_buf[32];
 
-	if (!(ts->io_bytes[DDIR_READ] + ts->io_bytes[DDIR_WRITE] +
-	    ts->io_bytes[DDIR_TRIM]) && !(ts->total_io_u[DDIR_READ] +
-	    ts->total_io_u[DDIR_WRITE] + ts->total_io_u[DDIR_TRIM]))
+	if (!ddir_rw_sum(ts->io_bytes) && !ddir_rw_sum(ts->total_io_u))
 		return;
 
 	time(&time_p);
@@ -574,13 +572,17 @@
 					io_u_dist[3], io_u_dist[4],
 					io_u_dist[5], io_u_dist[6]);
 	log_info("     issued    : total=r=%llu/w=%llu/d=%llu,"
-				 " short=r=%llu/w=%llu/d=%llu\n",
+				 " short=r=%llu/w=%llu/d=%llu,"
+				 " drop=r=%llu/w=%llu/d=%llu\n",
 					(unsigned long long) ts->total_io_u[0],
 					(unsigned long long) ts->total_io_u[1],
 					(unsigned long long) ts->total_io_u[2],
 					(unsigned long long) ts->short_io_u[0],
 					(unsigned long long) ts->short_io_u[1],
-					(unsigned long long) ts->short_io_u[2]);
+					(unsigned long long) ts->short_io_u[2],
+					(unsigned long long) ts->drop_io_u[0],
+					(unsigned long long) ts->drop_io_u[1],
+					(unsigned long long) ts->drop_io_u[2]);
 	if (ts->continue_on_error) {
 		log_info("     errors    : total=%llu, first_error=%d/<%s>\n",
 					(unsigned long long)ts->total_err_count,
@@ -672,9 +674,9 @@
 		struct group_run_stats *rs, int ddir, struct json_object *parent)
 {
 	unsigned long min, max;
-	unsigned long long bw, iops;
+	unsigned long long bw;
 	unsigned int *ovals = NULL;
-	double mean, dev;
+	double mean, dev, iops;
 	unsigned int len, minv, maxv;
 	int i;
 	const char *ddirname[] = {"read", "write", "trim"};
@@ -691,18 +693,22 @@
 	json_object_add_value_object(parent,
 		ts->unified_rw_rep ? "mixed" : ddirname[ddir], dir_object);
 
-	iops = bw = 0;
+	bw = 0;
+	iops = 0.0;
 	if (ts->runtime[ddir]) {
 		uint64_t runt = ts->runtime[ddir];
 
 		bw = ((1000 * ts->io_bytes[ddir]) / runt) / 1024;
-		iops = (1000 * (uint64_t) ts->total_io_u[ddir]) / runt;
+		iops = (1000.0 * (uint64_t) ts->total_io_u[ddir]) / runt;
 	}
 
 	json_object_add_value_int(dir_object, "io_bytes", ts->io_bytes[ddir] >> 10);
 	json_object_add_value_int(dir_object, "bw", bw);
-	json_object_add_value_int(dir_object, "iops", iops);
+	json_object_add_value_float(dir_object, "iops", iops);
 	json_object_add_value_int(dir_object, "runtime", ts->runtime[ddir]);
+	json_object_add_value_int(dir_object, "total_ios", ts->total_io_u[ddir]);
+	json_object_add_value_int(dir_object, "short_ios", ts->short_io_u[ddir]);
+	json_object_add_value_int(dir_object, "drop_ios", ts->drop_io_u[ddir]);
 
 	if (!calc_lat(&ts->slat_stat[ddir], &min, &max, &mean, &dev)) {
 		min = max = 0;
@@ -891,8 +897,7 @@
 		log_info(";%3.2f%%", io_u_lat_m[i]);
 
 	/* disk util stats, if any */
-	if (is_backend)
-		show_disk_util(1, NULL);
+	show_disk_util(1, NULL);
 
 	/* Additional output if continue_on_error set - default off*/
 	if (ts->continue_on_error)
@@ -1072,6 +1077,10 @@
 		dst->agg[i] += src->agg[i];
 	}
 
+	if (!dst->kb_base)
+		dst->kb_base = src->kb_base;
+	if (!dst->unit_base)
+		dst->unit_base = src->unit_base;
 }
 
 void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr)
@@ -1123,9 +1132,11 @@
 		if (!dst->unified_rw_rep) {
 			dst->total_io_u[k] += src->total_io_u[k];
 			dst->short_io_u[k] += src->short_io_u[k];
+			dst->drop_io_u[k] += src->drop_io_u[k];
 		} else {
 			dst->total_io_u[0] += src->total_io_u[k];
 			dst->short_io_u[0] += src->short_io_u[k];
+			dst->drop_io_u[0] += src->drop_io_u[k];
 		}
 	}
 
@@ -1133,10 +1144,14 @@
 		int m;
 
 		for (m = 0; m < FIO_IO_U_PLAT_NR; m++) {
+			/* HACK to prevent bus error in arm GCC 4.9 */
+			dst->io_u_plat[k][m]+=1;
 			if (!dst->unified_rw_rep)
 				dst->io_u_plat[k][m] += src->io_u_plat[k][m];
 			else
 				dst->io_u_plat[0][m] += src->io_u_plat[k][m];
+			/* HACK to prevent bus error in arm GCC 4.9 */
+			dst->io_u_plat[k][m]-=1;
 		}
 	}
 
@@ -1169,7 +1184,7 @@
 	ts->groupid = -1;
 }
 
-static void __show_run_stats(void)
+void __show_run_stats(void)
 {
 	struct group_run_stats *runstats, *rs;
 	struct thread_data *td;
@@ -1179,7 +1194,6 @@
 	int unit_base_warned = 0;
 	struct json_object *root = NULL;
 	struct json_array *array = NULL;
-
 	runstats = malloc(sizeof(struct group_run_stats) * (groupid + 1));
 
 	for (i = 0; i < groupid + 1; i++)
@@ -1341,8 +1355,18 @@
 	if (output_format == FIO_OUTPUT_NORMAL)
 		log_info("\n");
 	else if (output_format == FIO_OUTPUT_JSON) {
+		char time_buf[32];
+		time_t time_p;
+
+		time(&time_p);
+		os_ctime_r((const time_t *) &time_p, time_buf,
+				sizeof(time_buf));
+		time_buf[strlen(time_buf) - 1] = '\0';
+
 		root = json_create_object();
 		json_object_add_value_string(root, "fio version", fio_version_string);
+		json_object_add_value_int(root, "timestamp", time_p);
+		json_object_add_value_string(root, "time", time_buf);
 		array = json_create_array();
 		json_object_add_value_array(root, "jobs", array);
 	}
@@ -1411,13 +1435,15 @@
 	fio_mutex_up(stat_mutex);
 }
 
-static void *__show_running_run_stats(void fio_unused *arg)
+void __show_running_run_stats(void)
 {
 	struct thread_data *td;
 	unsigned long long *rt;
 	struct timeval tv;
 	int i;
 
+	fio_mutex_down(stat_mutex);
+
 	rt = malloc(thread_number * sizeof(unsigned long long));
 	fio_gettime(&tv, NULL);
 
@@ -1438,6 +1464,8 @@
 	}
 
 	for_each_td(td, i) {
+		if (td->runstate >= TD_EXITED)
+			continue;
 		if (td->rusage_sem) {
 			td->update_rusage = 1;
 			fio_mutex_down(td->rusage_sem);
@@ -1458,31 +1486,6 @@
 
 	free(rt);
 	fio_mutex_up(stat_mutex);
-	return NULL;
-}
-
-/*
- * Called from signal handler. It _should_ be safe to just run this inline
- * in the sig handler, but we should be disturbing the system less by just
- * creating a thread to do it.
- */
-void show_running_run_stats(void)
-{
-	pthread_t thread;
-
-	fio_mutex_down(stat_mutex);
-
-	if (!pthread_create(&thread, NULL, __show_running_run_stats, NULL)) {
-		int err;
-
-		err = pthread_detach(thread);
-		if (err)
-			log_err("fio: DU thread detach failed: %s\n", strerror(err));
-
-		return;
-	}
-
-	fio_mutex_up(stat_mutex);
 }
 
 static int status_interval_init;
@@ -1563,9 +1566,10 @@
 
 static void __add_log_sample(struct io_log *iolog, unsigned long val,
 			     enum fio_ddir ddir, unsigned int bs,
-			     unsigned long t)
+			     unsigned long t, uint64_t offset)
 {
-	const int nr_samples = iolog->nr_samples;
+	uint64_t nr_samples = iolog->nr_samples;
+	struct io_sample *s;
 
 	if (iolog->disabled)
 		return;
@@ -1574,23 +1578,43 @@
 		iolog->avg_last = t;
 
 	if (iolog->nr_samples == iolog->max_samples) {
-		int new_size = sizeof(struct io_sample) * iolog->max_samples*2;
+		size_t new_size;
 		void *new_log;
 
-		new_log = realloc(iolog->log, new_size);
-		if (!new_log) {
-			log_err("fio: failed extending iolog! Will stop logging.\n");
-			iolog->disabled = 1;
-			return;
+		new_size = 2 * iolog->max_samples * log_entry_sz(iolog);
+
+		if (iolog->log_gz && (new_size > iolog->log_gz)) {
+			if (iolog_flush(iolog, 0)) {
+				log_err("fio: failed flushing iolog! Will stop logging.\n");
+				iolog->disabled = 1;
+				return;
+			}
+			nr_samples = iolog->nr_samples;
+		} else {
+			new_log = realloc(iolog->log, new_size);
+			if (!new_log) {
+				log_err("fio: failed extending iolog! Will stop logging.\n");
+				iolog->disabled = 1;
+				return;
+			}
+			iolog->log = new_log;
+			iolog->max_samples <<= 1;
 		}
-		iolog->log = new_log;
-		iolog->max_samples <<= 1;
 	}
 
-	iolog->log[nr_samples].val = val;
-	iolog->log[nr_samples].time = t;
-	iolog->log[nr_samples].ddir = ddir;
-	iolog->log[nr_samples].bs = bs;
+	s = get_sample(iolog, nr_samples);
+
+	s->val = val;
+	s->time = t;
+	io_sample_set_ddir(iolog, s, ddir);
+	s->bs = bs;
+
+	if (iolog->log_offset) {
+		struct io_sample_offset *so = (void *) s;
+
+		so->offset = offset;
+	}
+
 	iolog->nr_samples++;
 }
 
@@ -1632,6 +1656,7 @@
 	for (i = 0; i < 3; i++) {
 		ts->total_io_u[i] = 0;
 		ts->short_io_u[i] = 0;
+		ts->drop_io_u[i] = 0;
 	}
 }
 
@@ -1646,19 +1671,19 @@
 		unsigned long mr;
 
 		mr = iolog->avg_window[DDIR_READ].mean.u.f + 0.50;
-		__add_log_sample(iolog, mr, DDIR_READ, 0, elapsed);
+		__add_log_sample(iolog, mr, DDIR_READ, 0, elapsed, 0);
 	}
 	if (iolog->avg_window[DDIR_WRITE].samples) {
 		unsigned long mw;
 
 		mw = iolog->avg_window[DDIR_WRITE].mean.u.f + 0.50;
-		__add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed);
+		__add_log_sample(iolog, mw, DDIR_WRITE, 0, elapsed, 0);
 	}
 	if (iolog->avg_window[DDIR_TRIM].samples) {
 		unsigned long mw;
 
 		mw = iolog->avg_window[DDIR_TRIM].mean.u.f + 0.50;
-		__add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed);
+		__add_log_sample(iolog, mw, DDIR_TRIM, 0, elapsed, 0);
 	}
 
 	reset_io_stat(&iolog->avg_window[DDIR_READ]);
@@ -1668,7 +1693,7 @@
 
 static void add_log_sample(struct thread_data *td, struct io_log *iolog,
 			   unsigned long val, enum fio_ddir ddir,
-			   unsigned int bs)
+			   unsigned int bs, uint64_t offset)
 {
 	unsigned long elapsed, this_window;
 
@@ -1681,7 +1706,7 @@
 	 * If no time averaging, just add the log sample.
 	 */
 	if (!iolog->avg_msec) {
-		__add_log_sample(iolog, val, ddir, bs, elapsed);
+		__add_log_sample(iolog, val, ddir, bs, elapsed, offset);
 		return;
 	}
 
@@ -1730,7 +1755,7 @@
 		return;
 
 	iolog = agg_io_log[ddir];
-	__add_log_sample(iolog, val, ddir, bs, mtime_since_genesis());
+	__add_log_sample(iolog, val, ddir, bs, mtime_since_genesis(), 0);
 }
 
 static void add_clat_percentile_sample(struct thread_stat *ts,
@@ -1743,7 +1768,7 @@
 }
 
 void add_clat_sample(struct thread_data *td, enum fio_ddir ddir,
-		     unsigned long usec, unsigned int bs)
+		     unsigned long usec, unsigned int bs, uint64_t offset)
 {
 	struct thread_stat *ts = &td->ts;
 
@@ -1753,14 +1778,14 @@
 	add_stat_sample(&ts->clat_stat[ddir], usec);
 
 	if (td->clat_log)
-		add_log_sample(td, td->clat_log, usec, ddir, bs);
+		add_log_sample(td, td->clat_log, usec, ddir, bs, offset);
 
 	if (ts->clat_percentiles)
 		add_clat_percentile_sample(ts, usec, ddir);
 }
 
 void add_slat_sample(struct thread_data *td, enum fio_ddir ddir,
-		     unsigned long usec, unsigned int bs)
+		     unsigned long usec, unsigned int bs, uint64_t offset)
 {
 	struct thread_stat *ts = &td->ts;
 
@@ -1770,11 +1795,11 @@
 	add_stat_sample(&ts->slat_stat[ddir], usec);
 
 	if (td->slat_log)
-		add_log_sample(td, td->slat_log, usec, ddir, bs);
+		add_log_sample(td, td->slat_log, usec, ddir, bs, offset);
 }
 
 void add_lat_sample(struct thread_data *td, enum fio_ddir ddir,
-		    unsigned long usec, unsigned int bs)
+		    unsigned long usec, unsigned int bs, uint64_t offset)
 {
 	struct thread_stat *ts = &td->ts;
 
@@ -1784,7 +1809,7 @@
 	add_stat_sample(&ts->lat_stat[ddir], usec);
 
 	if (td->lat_log)
-		add_log_sample(td, td->lat_log, usec, ddir, bs);
+		add_log_sample(td, td->lat_log, usec, ddir, bs, offset);
 }
 
 void add_bw_sample(struct thread_data *td, enum fio_ddir ddir, unsigned int bs,
@@ -1818,7 +1843,7 @@
 		add_stat_sample(&ts->bw_stat[ddir], rate);
 
 		if (td->bw_log)
-			add_log_sample(td, td->bw_log, rate, ddir, bs);
+			add_log_sample(td, td->bw_log, rate, ddir, bs, 0);
 
 		td->stat_io_bytes[ddir] = td->this_io_bytes[ddir];
 	}
@@ -1857,7 +1882,7 @@
 		add_stat_sample(&ts->iops_stat[ddir], iops);
 
 		if (td->iops_log)
-			add_log_sample(td, td->iops_log, iops, ddir, bs);
+			add_log_sample(td, td->iops_log, iops, ddir, bs, 0);
 
 		td->stat_io_blocks[ddir] = td->this_io_blocks[ddir];
 	}
@@ -1879,3 +1904,12 @@
 	fio_mutex_down(stat_mutex);
 	fio_mutex_remove(stat_mutex);
 }
+
+/*
+ * Called from signal handler. Wake up status thread.
+ */
+void show_running_run_stats(void)
+{
+	helper_do_stat = 1;
+	pthread_cond_signal(&helper_cond);
+}
diff --git a/stat.h b/stat.h
index 3f68305..8b4416c 100644
--- a/stat.h
+++ b/stat.h
@@ -12,7 +12,7 @@
 	uint32_t unit_base;
 	uint32_t groupid;
 	uint32_t unified_rw_rep;
-};
+} __attribute__((packed));
 
 /*
  * How many depth levels to log
@@ -158,8 +158,11 @@
 	uint32_t io_u_lat_u[FIO_IO_U_LAT_U_NR];
 	uint32_t io_u_lat_m[FIO_IO_U_LAT_M_NR];
 	uint32_t io_u_plat[DDIR_RWDIR_CNT][FIO_IO_U_PLAT_NR];
+	uint32_t pad;
+
 	uint64_t total_io_u[3];
 	uint64_t short_io_u[3];
+	uint64_t drop_io_u[3];
 	uint64_t total_submit;
 	uint64_t total_complete;
 
@@ -170,7 +173,10 @@
 	/*
 	 * IO Error related stats
 	 */
-	uint16_t continue_on_error;
+	union {
+		uint16_t continue_on_error;
+		uint64_t pad2;
+	};
 	uint64_t total_err_count;
 	uint32_t first_error;
 
@@ -181,14 +187,17 @@
 	uint64_t latency_target;
 	fio_fp64_t latency_percentile;
 	uint64_t latency_window;
-};
+} __attribute__((packed));
 
 struct jobs_eta {
 	uint32_t nr_running;
 	uint32_t nr_ramp;
+
 	uint32_t nr_pending;
 	uint32_t nr_setting_up;
+
 	uint32_t files_open;
+
 	uint32_t m_rate[DDIR_RWDIR_CNT], t_rate[DDIR_RWDIR_CNT];
 	uint32_t m_iops[DDIR_RWDIR_CNT], t_iops[DDIR_RWDIR_CNT];
 	uint32_t rate[DDIR_RWDIR_CNT];
@@ -203,7 +212,11 @@
 	 */
 	uint32_t nr_threads;
 	uint8_t run_str[];
-};
+} __attribute__((packed));
+
+extern struct fio_mutex *stat_mutex;
+
+extern struct jobs_eta *get_jobs_eta(int force, size_t *size);
 
 extern void stat_init(void);
 extern void stat_exit(void);
@@ -213,6 +226,8 @@
 extern int calc_thread_status(struct jobs_eta *je, int force);
 extern void display_thread_status(struct jobs_eta *je);
 extern void show_run_stats(void);
+extern void __show_run_stats(void);
+extern void __show_running_run_stats(void);
 extern void show_running_run_stats(void);
 extern void check_for_running_stats(void);
 extern void sum_thread_stats(struct thread_stat *dst, struct thread_stat *src, int nr);
@@ -240,5 +255,10 @@
 
 	return 1;
 }
+/*
+ * Worst level condensing would be 1:5, so allow enough room for that
+ */
+#define __THREAD_RUNSTR_SZ(nr)	((nr) * 5)
+#define THREAD_RUNSTR_SZ	__THREAD_RUNSTR_SZ(thread_number)
 
 #endif
diff --git a/t/axmap.c b/t/axmap.c
index 57c585b..e32ff98 100644
--- a/t/axmap.c
+++ b/t/axmap.c
@@ -37,7 +37,7 @@
 	while (size--) {
 		uint64_t val;
 
-		if (lfsr_next(&lfsr, &val, osize)) {
+		if (lfsr_next(&lfsr, &val)) {
 			printf("lfsr: short loop\n");
 			err = 1;
 			break;
diff --git a/t/btrace2fio.c b/t/btrace2fio.c
new file mode 100644
index 0000000..d0b7e09
--- /dev/null
+++ b/t/btrace2fio.c
@@ -0,0 +1,1144 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <math.h>
+#include <assert.h>
+
+#include "../io_ddir.h"
+#include "../flist.h"
+#include "../hash.h"
+#include "../fifo.h"
+#include "../blktrace_api.h"
+#include "../os/os.h"
+#include "../log.h"
+#include "../lib/linux-dev-lookup.h"
+
+#define TRACE_FIFO_SIZE	8192
+
+static unsigned int rt_threshold = 1000000;
+static unsigned int ios_threshold = 10;
+static unsigned int rate_threshold;
+static unsigned int set_rate;
+static unsigned int max_depth = 256;
+static int output_ascii = 1;
+static char *filename;
+
+static char **add_opts;
+static int n_add_opts;
+
+/*
+ * Collapse defaults
+ */
+static unsigned int collapse_entries = 0;
+static unsigned int depth_diff = 1;
+static unsigned int random_diff = 5;
+
+struct bs {
+	unsigned int bs;
+	unsigned int nr;
+	int merges;
+};
+
+struct trace_file {
+	char *name;
+	int major, minor;
+};
+
+struct btrace_out {
+	unsigned long ios[DDIR_RWDIR_CNT];
+	unsigned long merges[DDIR_RWDIR_CNT];
+
+	uint64_t last_end[DDIR_RWDIR_CNT];
+	uint64_t seq[DDIR_RWDIR_CNT];
+
+	struct bs *bs[DDIR_RWDIR_CNT];
+	unsigned int nr_bs[DDIR_RWDIR_CNT];
+
+	int inflight;
+	unsigned int depth;
+	int depth_disabled;
+	int complete_seen;
+
+	uint64_t first_ttime[DDIR_RWDIR_CNT];
+	uint64_t last_ttime[DDIR_RWDIR_CNT];
+	uint64_t kb[DDIR_RWDIR_CNT];
+
+	uint64_t start_delay;
+};
+
+struct btrace_pid {
+	struct flist_head hash_list;
+	struct flist_head pid_list;
+	pid_t pid;
+
+	pid_t *merge_pids;
+	unsigned int nr_merge_pids;
+
+	struct trace_file *files;
+	int nr_files;
+	unsigned int last_major, last_minor;
+	int numjobs;
+	int ignore;
+
+	struct btrace_out o;
+};
+
+struct inflight {
+	struct flist_head list;
+	struct btrace_pid *p;
+	uint64_t end_sector;
+};
+
+#define PID_HASH_BITS	10
+#define PID_HASH_SIZE	(1U << PID_HASH_BITS)
+
+static struct flist_head pid_hash[PID_HASH_SIZE];
+static FLIST_HEAD(pid_list);
+
+#define INFLIGHT_HASH_BITS	8
+#define INFLIGHT_HASH_SIZE	(1U << INFLIGHT_HASH_BITS)
+static struct flist_head inflight_hash[INFLIGHT_HASH_SIZE];
+
+static uint64_t first_ttime = -1ULL;
+
+static struct inflight *inflight_find(uint64_t sector)
+{
+	struct flist_head *inflight_list;
+	struct flist_head *e;
+
+	inflight_list = &inflight_hash[hash_long(sector, INFLIGHT_HASH_BITS)];
+
+	flist_for_each(e, inflight_list) {
+		struct inflight *i = flist_entry(e, struct inflight, list);
+
+		if (i->end_sector == sector)
+			return i;
+	}
+
+	return NULL;
+}
+
+static void inflight_remove(struct inflight *i)
+{
+	struct btrace_out *o = &i->p->o;
+
+	o->inflight--;
+	assert(o->inflight >= 0);
+	flist_del(&i->list);
+	free(i);
+}
+
+static void __inflight_add(struct inflight *i)
+{
+	struct flist_head *list;
+
+	list = &inflight_hash[hash_long(i->end_sector, INFLIGHT_HASH_BITS)];
+	flist_add_tail(&i->list, list);
+}
+
+static void inflight_add(struct btrace_pid *p, uint64_t sector, uint32_t len)
+{
+	struct btrace_out *o = &p->o;
+	struct inflight *i;
+
+	i = calloc(1, sizeof(*i));
+	i->p = p;
+	o->inflight++;
+	if (!o->depth_disabled) {
+		o->depth = max((int) o->depth, o->inflight);
+		if (o->depth >= max_depth && !o->complete_seen) {
+			o->depth_disabled = 1;
+			o->depth = max_depth;
+		}
+	}
+	i->end_sector = sector + (len >> 9);
+	__inflight_add(i);
+}
+
+static void inflight_merge(struct inflight *i, int rw, unsigned int size)
+{
+	i->p->o.merges[rw]++;
+	if (size) {
+		i->end_sector += (size >> 9);
+		flist_del(&i->list);
+		__inflight_add(i);
+	}
+}
+
+/*
+ * fifo refill frontend, to avoid reading data in trace-sized bites
+ */
+static int refill_fifo(struct fifo *fifo, int fd)
+{
+	char buf[TRACE_FIFO_SIZE];
+	unsigned int total;
+	int ret;
+
+	total = sizeof(buf);
+	if (total > fifo_room(fifo))
+		total = fifo_room(fifo);
+
+	ret = read(fd, buf, total);
+	if (ret < 0) {
+		perror("read refill");
+		return -1;
+	}
+
+	if (ret > 0)
+		ret = fifo_put(fifo, buf, ret);
+
+	return ret;
+}
+
+/*
+ * Retrieve 'len' bytes from the fifo, refilling if necessary.
+ */
+static int trace_fifo_get(struct fifo *fifo, int fd, void *buf,
+			  unsigned int len)
+{
+	if (fifo_len(fifo) < len) {
+		int ret = refill_fifo(fifo, fd);
+
+		if (ret < 0)
+			return ret;
+	}
+
+	return fifo_get(fifo, buf, len);
+}
+
+/*
+ * Just discard the pdu by seeking past it.
+ */
+static int discard_pdu(struct fifo *fifo, int fd, struct blk_io_trace *t)
+{
+	if (t->pdu_len == 0)
+		return 0;
+
+	return trace_fifo_get(fifo, fd, NULL, t->pdu_len);
+}
+
+static int handle_trace_notify(struct blk_io_trace *t)
+{
+	switch (t->action) {
+	case BLK_TN_PROCESS:
+		//printf("got process notify: %x, %d\n", t->action, t->pid);
+		break;
+	case BLK_TN_TIMESTAMP:
+		//printf("got timestamp notify: %x, %d\n", t->action, t->pid);
+		break;
+	case BLK_TN_MESSAGE:
+		break;
+	default:
+		log_err("unknown trace act %x\n", t->action);
+		return 1;
+	}
+
+	return 0;
+}
+
+static void __add_bs(struct btrace_out *o, unsigned int len, int rw)
+{
+	o->bs[rw] = realloc(o->bs[rw], (o->nr_bs[rw] + 1) * sizeof(struct bs));
+	o->bs[rw][o->nr_bs[rw]].bs = len;
+	o->bs[rw][o->nr_bs[rw]].nr = 1;
+	o->nr_bs[rw]++;
+}
+
+static void add_bs(struct btrace_out *o, unsigned int len, int rw)
+{
+	struct bs *bs = o->bs[rw];
+	int i;
+
+	if (!o->nr_bs[rw]) {
+		__add_bs(o, len, rw);
+		return;
+	}
+
+	for (i = 0; i < o->nr_bs[rw]; i++) {
+		if (bs[i].bs == len) {
+			bs[i].nr++;
+			return;
+		}
+	}
+
+	__add_bs(o, len, rw);
+}
+
+#define FMINORBITS	20
+#define FMINORMASK	((1U << FMINORBITS) - 1)
+#define FMAJOR(dev)	((unsigned int) ((dev) >> FMINORBITS))
+#define FMINOR(dev)	((unsigned int) ((dev) & FMINORMASK))
+
+static int btrace_add_file(struct btrace_pid *p, uint32_t devno)
+{
+	unsigned int maj = FMAJOR(devno);
+	unsigned int min = FMINOR(devno);
+	struct trace_file *f;
+	unsigned int i;
+	char dev[256];
+
+	if (filename)
+		return 0;
+	if (p->last_major == maj && p->last_minor == min)
+		return 0;
+
+	p->last_major = maj;
+	p->last_minor = min;
+
+	/*
+	 * check for this file in our list
+	 */
+	for (i = 0; i < p->nr_files; i++) {
+		f = &p->files[i];
+
+		if (f->major == maj && f->minor == min)
+			return 0;
+	}
+
+	strcpy(dev, "/dev");
+	if (!blktrace_lookup_device(NULL, dev, maj, min)) {
+		log_err("fio: failed to find device %u/%u\n", maj, min);
+		if (!output_ascii) {
+			log_err("fio: use -d to specify device\n");
+			return 1;
+		}
+		return 0;
+	}
+
+	p->files = realloc(p->files, (p->nr_files + 1) * sizeof(*f));
+	f = &p->files[p->nr_files];
+	f->name = strdup(dev);
+	f->major = maj;
+	f->minor = min;
+	p->nr_files++;
+	return 0;
+}
+
+static int t_to_rwdir(struct blk_io_trace *t)
+{
+	if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
+		return DDIR_TRIM;
+
+	return (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
+}
+
+static int handle_trace_discard(struct blk_io_trace *t, struct btrace_pid *p)
+{
+	struct btrace_out *o = &p->o;
+
+	if (btrace_add_file(p, t->device))
+		return 1;
+
+	if (o->first_ttime[2] == -1ULL)
+		o->first_ttime[2] = t->time;
+
+	o->ios[DDIR_TRIM]++;
+	add_bs(o, t->bytes, DDIR_TRIM);
+	return 0;
+}
+
+static int handle_trace_fs(struct blk_io_trace *t, struct btrace_pid *p)
+{
+	struct btrace_out *o = &p->o;
+	int rw;
+
+	if (btrace_add_file(p, t->device))
+		return 1;
+
+	first_ttime = min(first_ttime, (uint64_t) t->time);
+
+	rw = (t->action & BLK_TC_ACT(BLK_TC_WRITE)) != 0;
+
+	if (o->first_ttime[rw] == -1ULL)
+		o->first_ttime[rw] = t->time;
+
+	add_bs(o, t->bytes, rw);
+	o->ios[rw]++;
+
+	if (t->sector == o->last_end[rw] || o->last_end[rw] == -1ULL)
+		o->seq[rw]++;
+
+	o->last_end[rw] = t->sector + (t->bytes >> 9);
+	return 0;
+}
+
+static int handle_queue_trace(struct blk_io_trace *t, struct btrace_pid *p)
+{
+	if (t->action & BLK_TC_ACT(BLK_TC_NOTIFY))
+		return handle_trace_notify(t);
+	else if (t->action & BLK_TC_ACT(BLK_TC_DISCARD))
+		return handle_trace_discard(t, p);
+	else
+		return handle_trace_fs(t, p);
+}
+
+static int handle_trace(struct blk_io_trace *t, struct btrace_pid *p)
+{
+	unsigned int act = t->action & 0xffff;
+	int ret = 0;
+
+	if (act == __BLK_TA_QUEUE) {
+		inflight_add(p, t->sector, t->bytes);
+		ret = handle_queue_trace(t, p);
+	} else if (act == __BLK_TA_BACKMERGE) {
+		struct inflight *i;
+
+		i = inflight_find(t->sector + (t->bytes >> 9));
+		if (i)
+			inflight_remove(i);
+
+		i = inflight_find(t->sector);
+		if (i)
+			inflight_merge(i, t_to_rwdir(t), t->bytes);
+	} else if (act == __BLK_TA_FRONTMERGE) {
+		struct inflight *i;
+
+		i = inflight_find(t->sector + (t->bytes >> 9));
+		if (i)
+			inflight_remove(i);
+
+		i = inflight_find(t->sector);
+		if (i)
+			inflight_merge(i, t_to_rwdir(t), 0);
+	} else if (act == __BLK_TA_COMPLETE) {
+		struct inflight *i;
+
+		i = inflight_find(t->sector + (t->bytes >> 9));
+		if (i) {
+			i->p->o.kb[t_to_rwdir(t)] += (t->bytes >> 10);
+			i->p->o.complete_seen = 1;
+			inflight_remove(i);
+		}
+	}
+
+	return ret;
+}
+
+static void byteswap_trace(struct blk_io_trace *t)
+{
+	t->magic = fio_swap32(t->magic);
+	t->sequence = fio_swap32(t->sequence);
+	t->time = fio_swap64(t->time);
+	t->sector = fio_swap64(t->sector);
+	t->bytes = fio_swap32(t->bytes);
+	t->action = fio_swap32(t->action);
+	t->pid = fio_swap32(t->pid);
+	t->device = fio_swap32(t->device);
+	t->cpu = fio_swap32(t->cpu);
+	t->error = fio_swap16(t->error);
+	t->pdu_len = fio_swap16(t->pdu_len);
+}
+
+static struct btrace_pid *pid_hash_find(pid_t pid, struct flist_head *list)
+{
+	struct flist_head *e;
+	struct btrace_pid *p;
+
+	flist_for_each(e, list) {
+		p = flist_entry(e, struct btrace_pid, hash_list);
+		if (p->pid == pid)
+			return p;
+	}
+
+	return NULL;
+}
+
+static struct btrace_pid *pid_hash_get(pid_t pid)
+{
+	struct flist_head *hash_list;
+	struct btrace_pid *p;
+
+	hash_list = &pid_hash[hash_long(pid, PID_HASH_BITS)];
+
+	p = pid_hash_find(pid, hash_list);
+	if (!p) {
+		int i;
+
+		p = calloc(1, sizeof(*p));
+
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			p->o.first_ttime[i] = -1ULL;
+			p->o.last_ttime[i] = -1ULL;
+			p->o.last_end[i] = -1ULL;
+		}
+
+		p->pid = pid;
+		p->numjobs = 1;
+		flist_add_tail(&p->hash_list, hash_list);
+		flist_add_tail(&p->pid_list, &pid_list);
+	}
+
+	return p;
+}
+
+/*
+ * Load a blktrace file by reading all the blk_io_trace entries, and storing
+ * them as io_pieces like the fio text version would do.
+ */
+static int load_blktrace(const char *fname, int need_swap)
+{
+	struct btrace_pid *p;
+	unsigned long traces;
+	struct blk_io_trace t;
+	struct fifo *fifo;
+	int fd, ret = 0;
+
+	fd = open(fname, O_RDONLY);
+	if (fd < 0) {
+		perror("open trace file");
+		return 1;
+	}
+
+	fifo = fifo_alloc(TRACE_FIFO_SIZE);
+
+	traces = 0;
+	do {
+		ret = trace_fifo_get(fifo, fd, &t, sizeof(t));
+		if (ret < 0)
+			goto err;
+		else if (!ret)
+			break;
+		else if (ret < (int) sizeof(t)) {
+			log_err("fio: short fifo get\n");
+			break;
+		}
+
+		if (need_swap)
+			byteswap_trace(&t);
+
+		if ((t.magic & 0xffffff00) != BLK_IO_TRACE_MAGIC) {
+			log_err("fio: bad magic in blktrace data: %x\n", t.magic);
+			goto err;
+		}
+		if ((t.magic & 0xff) != BLK_IO_TRACE_VERSION) {
+			log_err("fio: bad blktrace version %d\n", t.magic & 0xff);
+			goto err;
+		}
+		ret = discard_pdu(fifo, fd, &t);
+		if (ret < 0) {
+			log_err("blktrace lseek\n");
+			goto err;
+		} else if (t.pdu_len != ret) {
+			log_err("fio: discarded %d of %d\n", ret, t.pdu_len);
+			goto err;
+		}
+
+		p = pid_hash_get(t.pid);
+		ret = handle_trace(&t, p);
+		if (ret)
+			break;
+		p->o.last_ttime[t_to_rwdir(&t)] = t.time;
+		traces++;
+	} while (1);
+
+	fifo_free(fifo);
+	close(fd);
+
+	if (ret)
+		return ret;
+
+	if (output_ascii)
+		printf("Traces loaded: %lu\n", traces);
+
+	return 0;
+err:
+	close(fd);
+	fifo_free(fifo);
+	return 1;
+}
+
+static int bs_cmp(const void *ba, const void *bb)
+{
+	const struct bs *bsa = ba;
+	const struct bs *bsb = bb;
+
+	return bsb->nr - bsa->nr;
+}
+
+static unsigned long o_to_kb_rate(struct btrace_out *o, int rw)
+{
+	uint64_t usec = (o->last_ttime[rw] - o->first_ttime[rw]) / 1000ULL;
+	uint64_t val;
+
+	if (!usec)
+		return 0;
+
+	usec /= 1000;
+	if (!usec)
+		return 0;
+
+	val = o->kb[rw] * 1000ULL;
+	return val / usec;
+}
+
+static uint64_t o_first_ttime(struct btrace_out *o)
+{
+	uint64_t first;
+
+	first = min(o->first_ttime[0], o->first_ttime[1]);
+	return min(first, o->first_ttime[2]);
+}
+
+static uint64_t o_longest_ttime(struct btrace_out *o)
+{
+	uint64_t ret = 0;
+	int i;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		uint64_t diff;
+
+		diff = o->last_ttime[i] - o->first_ttime[i];
+		ret = max(diff, ret);
+	}
+
+	return ret;
+}
+
+static void __output_p_ascii(struct btrace_pid *p, unsigned long *ios)
+{
+	const char *msg[] = { "reads", "writes", "trims" };
+	struct btrace_out *o = &p->o;
+	unsigned long total, usec;
+	int i, j;
+
+	printf("[pid:\t%u", p->pid);
+	if (p->nr_merge_pids)
+		for (i = 0; i < p->nr_merge_pids; i++)
+			printf(", %u", p->merge_pids[i]);
+	printf("]\n");
+
+	total = ddir_rw_sum(o->ios);
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		float perc;
+
+		if (!o->ios[i])
+			continue;
+
+		ios[i] += o->ios[i] + o->merges[i];
+		printf("%s\n", msg[i]);
+		perc = ((float) o->ios[i] * 100.0) / (float) total;
+		printf("\tios:    %lu (perc=%3.2f%%)\n", o->ios[i], perc);
+		perc = ((float) o->merges[i] * 100.0) / (float) total;
+		printf("\tmerges: %lu (perc=%3.2f%%)\n", o->merges[i], perc);
+		perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
+		printf("\tseq:    %lu (perc=%3.2f%%)\n", (unsigned long) o->seq[i], perc);
+		printf("\trate:   %lu KB/sec\n", o_to_kb_rate(o, i));
+
+		for (j = 0; j < o->nr_bs[i]; j++) {
+			struct bs *bs = &o->bs[i][j];
+
+			perc = (((float) bs->nr * 100.0) / (float) o->ios[i]);
+			printf("\tbs=%u, perc=%3.2f%%\n", bs->bs, perc);
+		}
+	}
+
+	printf("depth:\t%u\n", o->depth);
+	usec = o_longest_ttime(o) / 1000ULL;
+	printf("usec:\t%lu (delay=%llu)\n", usec, (unsigned long long) o->start_delay);
+
+	printf("files:\t");
+	for (i = 0; i < p->nr_files; i++)
+		printf("%s,", p->files[i].name);
+	printf("\n");
+
+	printf("\n");
+}
+
+static int __output_p_fio(struct btrace_pid *p, unsigned long *ios)
+{
+	struct btrace_out *o = &p->o;
+	unsigned long total;
+	unsigned long long time;
+	float perc;
+	int i, j;
+
+	if ((o->ios[0] + o->ios[1]) && o->ios[2]) {
+		log_err("fio: trace has both read/write and trim\n");
+		return 1;
+	}
+	if (!p->nr_files) {
+		log_err("fio: no devices found\n");
+		return 1;
+	}
+
+	printf("[pid%u", p->pid);
+	if (p->nr_merge_pids)
+		for (i = 0; i < p->nr_merge_pids; i++)
+			printf(",pid%u", p->merge_pids[i]);
+	printf("]\n");
+
+	printf("numjobs=%u\n", p->numjobs);
+	printf("direct=1\n");
+	if (o->depth == 1)
+		printf("ioengine=sync\n");
+	else
+		printf("ioengine=libaio\niodepth=%u\n", o->depth);
+
+	if (o->ios[0] && !o->ios[1])
+		printf("rw=randread\n");
+	else if (!o->ios[0] && o->ios[1])
+		printf("rw=randwrite\n");
+	else if (o->ios[2])
+		printf("rw=randtrim\n");
+	else {
+		printf("rw=randrw\n");
+		total = ddir_rw_sum(o->ios);
+		perc = ((float) o->ios[0] * 100.0) / (float) total;
+		printf("rwmixread=%u\n", (int) floor(perc + 0.50));
+	}
+
+	printf("percentage_random=");
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if (o->seq[i] && o->ios[i]) {
+			perc = ((float) o->seq[i] * 100.0) / (float) o->ios[i];
+			if (perc >= 99.0)
+				perc = 100.0;
+		} else
+			perc = 100.0;
+
+		if (i)
+			printf(",");
+		perc = 100.0 - perc;
+		printf("%u", (int) floor(perc + 0.5));
+	}
+	printf("\n");
+
+	printf("filename=");
+	for (i = 0; i < p->nr_files; i++) {
+		if (i)
+			printf(":");
+		printf("%s", p->files[i].name);
+	}
+	printf("\n");
+
+	if (o->start_delay / 1000000ULL)
+		printf("startdelay=%llus\n", o->start_delay / 1000000ULL);
+
+	time = o_longest_ttime(o);
+	time = (time + 1000000000ULL - 1) / 1000000000ULL;
+	printf("runtime=%llus\n", time);
+
+	printf("bssplit=");
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+
+		if (i && o->nr_bs[i - 1] && o->nr_bs[i])
+			printf(",");
+
+		for (j = 0; j < o->nr_bs[i]; j++) {
+			struct bs *bs = &o->bs[i][j];
+
+			perc = (((float) bs->nr * 100.0) / (float) o->ios[i]);
+			if (perc < 1.00)
+				continue;
+			if (j)
+				printf(":");
+			if (j + 1 == o->nr_bs[i])
+				printf("%u/", bs->bs);
+			else
+				printf("%u/%u", bs->bs, (int) floor(perc + 0.5));
+		}
+	}
+	printf("\n");
+
+	if (set_rate) {
+		printf("rate=");
+		for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+			unsigned long rate;
+
+			rate = o_to_kb_rate(o, i);
+			if (i)
+				printf(",");
+			if (rate)
+				printf("%luk", rate);
+		}
+		printf("\n");
+	}
+
+	if (n_add_opts)
+		for (i = 0; i < n_add_opts; i++)
+			printf("%s\n", add_opts[i]);
+
+	printf("\n");
+	return 0;
+}
+
+static int __output_p(struct btrace_pid *p, unsigned long *ios)
+{
+	struct btrace_out *o = &p->o;
+	int i, ret = 0;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if (o->nr_bs[i] <= 1)
+			continue;
+		qsort(o->bs[i], o->nr_bs[i], sizeof(struct bs), bs_cmp);
+	}
+
+	if (filename) {
+		p->files = malloc(sizeof(struct trace_file));
+		p->nr_files++;
+		p->files[0].name = filename;
+	}
+
+	if (output_ascii)
+		__output_p_ascii(p, ios);
+	else
+		ret = __output_p_fio(p, ios);
+
+	return ret;
+}
+
+static void remove_ddir(struct btrace_out *o, int rw)
+{
+	o->ios[rw] = 0;
+}
+
+static int prune_entry(struct btrace_out *o)
+{
+	unsigned long rate;
+	uint64_t time;
+	int i;
+
+	if (ddir_rw_sum(o->ios) < ios_threshold)
+		return 1;
+
+	time = o_longest_ttime(o) / 1000ULL;
+	if (time < rt_threshold)
+		return 1;
+
+	rate = 0;
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		unsigned long this_rate;
+
+		this_rate = o_to_kb_rate(o, i);
+		if (this_rate < rate_threshold) {
+			remove_ddir(o, i);
+			this_rate = 0;
+		}
+		rate += this_rate;
+	}
+
+	if (rate < rate_threshold)
+		return 1;
+
+	return 0;
+}
+
+static int entry_cmp(void *priv, struct flist_head *a, struct flist_head *b)
+{
+	struct btrace_pid *pa = flist_entry(a, struct btrace_pid, pid_list);
+	struct btrace_pid *pb = flist_entry(b, struct btrace_pid, pid_list);
+
+	return ddir_rw_sum(pb->o.ios) - ddir_rw_sum(pa->o.ios);
+}
+
+static void free_p(struct btrace_pid *p)
+{
+	struct btrace_out *o = &p->o;
+	int i;
+
+	for (i = 0; i < p->nr_files; i++) {
+		if (p->files[i].name && p->files[i].name != filename)
+			free(p->files[i].name);
+	}
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++)
+		free(o->bs[i]);
+
+	free(p->files);
+	flist_del(&p->pid_list);
+	flist_del(&p->hash_list);
+	free(p);
+}
+
+static int entries_close(struct btrace_pid *pida, struct btrace_pid *pidb)
+{
+	float perca, percb, fdiff;
+	int i, idiff;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		if ((pida->o.ios[i] && !pidb->o.ios[i]) ||
+		    (pidb->o.ios[i] && !pida->o.ios[i]))
+			return 0;
+		if (pida->o.ios[i] && pidb->o.ios[i]) {
+			perca = ((float) pida->o.seq[i] * 100.0) / (float) pida->o.ios[i];
+			percb = ((float) pidb->o.seq[i] * 100.0) / (float) pidb->o.ios[i];
+			fdiff = perca - percb;
+			if (fabs(fdiff) > random_diff)
+				return 0;
+		}
+
+		idiff = pida->o.depth - pidb->o.depth;
+		if (abs(idiff) > depth_diff)
+			return 0;
+	}
+
+	return 1;
+}
+
+static void merge_bs(struct bs **bsap, unsigned int *nr_bsap,
+		     struct bs *bsb, unsigned int nr_bsb)
+{
+	struct bs *bsa = *bsap;
+	unsigned int nr_bsa = *nr_bsap;
+	int a, b;
+
+	for (b = 0; b < nr_bsb; b++) {
+		int next, found = 0;
+
+		for (a = 0; a < nr_bsa; a++) {
+			if (bsb[b].bs != bsa[a].bs)
+				continue;
+
+			bsa[a].nr += bsb[b].nr;
+			bsa[a].merges += bsb[b].merges;
+			found = 1;
+			break;
+		}
+
+		if (found)
+			continue;
+
+		next = *nr_bsap;
+		bsa = realloc(bsa, (next + 1) * sizeof(struct bs));
+		bsa[next].bs = bsb[b].bs;
+		bsa[next].nr = bsb[b].nr;
+		(*nr_bsap)++;
+		*bsap = bsa;
+	}
+}
+
+static int merge_entries(struct btrace_pid *pida, struct btrace_pid *pidb)
+{
+	int i;
+
+	if (!entries_close(pida, pidb))
+		return 0;
+
+	pida->nr_merge_pids++;
+	pida->merge_pids = realloc(pida->merge_pids, pida->nr_merge_pids * sizeof(pid_t));
+	pida->merge_pids[pida->nr_merge_pids - 1] = pidb->pid;
+
+	for (i = 0; i < DDIR_RWDIR_CNT; i++) {
+		struct btrace_out *oa = &pida->o;
+		struct btrace_out *ob = &pidb->o;
+
+		oa->ios[i] += ob->ios[i];
+		oa->merges[i] += ob->merges[i];
+		oa->seq[i] += ob->seq[i];
+		oa->kb[i] += ob->kb[i];
+		oa->first_ttime[i] = min(oa->first_ttime[i], ob->first_ttime[i]);
+		oa->last_ttime[i] = max(oa->last_ttime[i], ob->last_ttime[i]);
+		merge_bs(&oa->bs[i], &oa->nr_bs[i], ob->bs[i], ob->nr_bs[i]);
+	}
+
+	pida->o.start_delay = min(pida->o.start_delay, pidb->o.start_delay);
+	pida->o.depth = (pida->o.depth + pidb->o.depth) / 2;
+	return 1;
+}
+
+static void check_merges(struct btrace_pid *p, struct flist_head *pid_list)
+{
+	struct flist_head *e, *tmp;
+
+	if (p->ignore)
+		return;
+
+	flist_for_each_safe(e, tmp, pid_list) {
+		struct btrace_pid *pidb;
+
+		pidb = flist_entry(e, struct btrace_pid, pid_list);
+		if (pidb == p)
+			continue;
+
+		if (merge_entries(p, pidb)) {
+			pidb->ignore = 1;
+			p->numjobs++;
+		}
+	}
+}
+
+static int output_p(void)
+{
+	unsigned long ios[DDIR_RWDIR_CNT];
+	struct flist_head *e, *tmp;
+	int depth_disabled = 0;
+	int ret = 0;
+
+	flist_for_each_safe(e, tmp, &pid_list) {
+		struct btrace_pid *p;
+
+		p = flist_entry(e, struct btrace_pid, pid_list);
+		if (prune_entry(&p->o)) {
+			free_p(p);
+			continue;
+		}
+		p->o.start_delay = (o_first_ttime(&p->o) / 1000ULL) - first_ttime;
+		depth_disabled += p->o.depth_disabled;
+	}
+
+	if (collapse_entries) {
+		struct btrace_pid *p;
+
+		flist_for_each_safe(e, tmp, &pid_list) {
+			p = flist_entry(e, struct btrace_pid, pid_list);
+			check_merges(p, &pid_list);
+		}
+
+		flist_for_each_safe(e, tmp, &pid_list) {
+			p = flist_entry(e, struct btrace_pid, pid_list);
+			if (p->ignore)
+				free_p(p);
+		}
+	}
+
+	if (depth_disabled)
+		log_err("fio: missing completion traces, depths capped at %u\n", max_depth);
+
+	memset(ios, 0, sizeof(ios));
+
+	flist_sort(NULL, &pid_list, entry_cmp);
+
+	flist_for_each(e, &pid_list) {
+		struct btrace_pid *p;
+
+		p = flist_entry(e, struct btrace_pid, pid_list);
+		ret |= __output_p(p, ios);
+		if (ret && !output_ascii)
+			break;
+	}
+
+	if (output_ascii)
+		printf("Total: reads=%lu, writes=%lu\n", ios[0], ios[1]);
+
+	return ret;
+}
+
+static int usage(char *argv[])
+{
+	log_err("%s: [options] <blktrace bin file>\n", argv[0]);
+	log_err("\t-t\tUsec threshold to ignore task\n");
+	log_err("\t-n\tNumber IOS threshold to ignore task\n");
+	log_err("\t-f\tFio job file output\n");
+	log_err("\t-d\tUse this file/device for replay\n");
+	log_err("\t-r\tIgnore jobs with less than this KB/sec rate\n");
+	log_err("\t-R\tSet rate in fio job (def=%u)\n", set_rate);
+	log_err("\t-D\tCap queue depth at this value (def=%u)\n", max_depth);
+	log_err("\t-c\tCollapse \"identical\" jobs (def=%u)\n", collapse_entries);
+	log_err("\t-u\tDepth difference for collapse (def=%u)\n", depth_diff);
+	log_err("\t-x\tRandom difference for collapse (def=%u)\n", random_diff);
+	log_err("\t-a\tAdditional fio option to add to job file\n");
+	return 1;
+}
+
+static int trace_needs_swap(const char *trace_file, int *swap)
+{
+	struct blk_io_trace t;
+	int fd, ret;
+
+	*swap = -1;
+
+	fd = open(trace_file, O_RDONLY);
+	if (fd < 0) {
+		perror("open");
+		return 1;
+	}
+
+	ret = read(fd, &t, sizeof(t));
+	if (ret < 0) {
+		close(fd);
+		perror("read");
+		return 1;
+	} else if (ret != sizeof(t)) {
+		close(fd);
+		log_err("fio: short read on trace file\n");
+		return 1;
+	}
+
+	close(fd);
+
+	if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
+		*swap = 0;
+	else {
+		/*
+		 * Maybe it needs to be endian swapped...
+		 */
+		t.magic = fio_swap32(t.magic);
+		if ((t.magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
+			*swap = 1;
+	}
+
+	if (*swap == -1) {
+		log_err("fio: blktrace appears corrupt\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+int main(int argc, char *argv[])
+{
+	int need_swap, i, c;
+
+	if (argc < 2)
+		return usage(argv);
+
+	while ((c = getopt(argc, argv, "t:n:fd:r:RD:c:u:x:a:")) != -1) {
+		switch (c) {
+		case 'R':
+			set_rate = 1;
+			break;
+		case 'r':
+			rate_threshold = atoi(optarg);
+			break;
+		case 't':
+			rt_threshold = atoi(optarg);
+			break;
+		case 'n':
+			ios_threshold = atoi(optarg);
+			break;
+		case 'f':
+			output_ascii = 0;
+			break;
+		case 'd':
+			filename = strdup(optarg);
+			break;
+		case 'D':
+			max_depth = atoi(optarg);
+			break;
+		case 'c':
+			collapse_entries = atoi(optarg);
+			break;
+		case 'u':
+			depth_diff = atoi(optarg);
+			break;
+		case 'x':
+			random_diff = atoi(optarg);
+			break;
+		case 'a':
+			add_opts = realloc(add_opts, (n_add_opts + 1) * sizeof(char *));
+			add_opts[n_add_opts] = strdup(optarg);
+			n_add_opts++;
+			break;
+		case '?':
+		default:
+			return usage(argv);
+		}
+	}
+
+	if (argc == optind)
+		return usage(argv);
+
+	if (trace_needs_swap(argv[optind], &need_swap))
+		return 1;
+
+	for (i = 0; i < PID_HASH_SIZE; i++)
+		INIT_FLIST_HEAD(&pid_hash[i]);
+	for (i = 0; i < INFLIGHT_HASH_SIZE; i++)
+		INIT_FLIST_HEAD(&inflight_hash[i]);
+
+	load_blktrace(argv[optind], need_swap);
+	first_ttime /= 1000ULL;
+
+	return output_p();
+}
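
The endianness handling in trace_needs_swap() relies on the blktrace magic word: the top 24 bits of the first trace's magic field must equal BLK_IO_TRACE_MAGIC either natively or after a 32-bit byte swap. A self-contained sketch of that check, using a local swap32() helper rather than fio's fio_swap32(); the helper names are illustrative, and the magic value is the one published in blktrace's API header:

#include <stdio.h>
#include <stdint.h>

#define BLK_IO_TRACE_MAGIC	0x65617400	/* "eat" + version nibble, from blktrace_api.h */

static uint32_t swap32(uint32_t v)
{
	return ((v & 0xff000000u) >> 24) | ((v & 0x00ff0000u) >> 8) |
	       ((v & 0x0000ff00u) << 8)  | ((v & 0x000000ffu) << 24);
}

/*
 * Returns 0 for a native-endian trace, 1 if it needs swapping,
 * -1 if the magic doesn't match either way (corrupt trace).
 */
static int magic_needs_swap(uint32_t magic)
{
	if ((magic & 0xffffff00) == BLK_IO_TRACE_MAGIC)
		return 0;
	if ((swap32(magic) & 0xffffff00) == BLK_IO_TRACE_MAGIC)
		return 1;
	return -1;
}

int main(void)
{
	printf("%d\n", magic_needs_swap(0x65617407));		/* 0: native */
	printf("%d\n", magic_needs_swap(swap32(0x65617407)));	/* 1: needs swap */
	return 0;
}
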
diff --git a/t/debug.c b/t/debug.c
new file mode 100644
index 0000000..c297d61
--- /dev/null
+++ b/t/debug.c
@@ -0,0 +1,14 @@
+#include <stdio.h>
+
+FILE *f_err;
+struct timeval *fio_tv = NULL;
+unsigned int fio_debug = 0;
+
+void __dprint(int type, const char *str, ...)
+{
+}
+
+void debug_init(void)
+{
+	f_err = stderr;
+}
diff --git a/t/debug.h b/t/debug.h
new file mode 100644
index 0000000..9d1d415
--- /dev/null
+++ b/t/debug.h
@@ -0,0 +1,6 @@
+#ifndef FIO_DEBUG_INC_H
+#define FIO_DEBUG_INC_H
+
+extern void debug_init(void);
+
+#endif
diff --git a/t/dedupe.c b/t/dedupe.c
new file mode 100644
index 0000000..5b88fcb
--- /dev/null
+++ b/t/dedupe.c
@@ -0,0 +1,599 @@
+/*
+ * Small tool to check for dedupable blocks in a file or device. Basically
+ * just scans the file or device for extents of the given size, checksums them,
+ * and orders them up.
+ */
+#include <stdio.h>
+#include <unistd.h>
+#include <inttypes.h>
+#include <assert.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/ioctl.h>
+#include <fcntl.h>
+#include <string.h>
+
+#include "../lib/rbtree.h"
+#include "../flist.h"
+#include "../log.h"
+#include "../mutex.h"
+#include "../smalloc.h"
+#include "../minmax.h"
+#include "../crc/md5.h"
+#include "../memalign.h"
+#include "../os/os.h"
+#include "../gettime.h"
+#include "../fio_time.h"
+
+#include "../lib/bloom.h"
+#include "debug.h"
+
+struct worker_thread {
+	pthread_t thread;
+
+	volatile int done;
+
+	int fd;
+	uint64_t cur_offset;
+	uint64_t size;
+
+	unsigned long items;
+	unsigned long dupes;
+	int err;
+};
+
+struct extent {
+	struct flist_head list;
+	uint64_t offset;
+};
+
+struct chunk {
+	struct rb_node rb_node;
+	uint64_t count;
+	uint32_t hash[MD5_HASH_WORDS];
+	struct flist_head extent_list[0];
+};
+
+struct item {
+	uint64_t offset;
+	uint32_t hash[MD5_HASH_WORDS];
+};
+
+static struct rb_root rb_root;
+static struct bloom *bloom;
+static struct fio_mutex *rb_lock;
+
+static unsigned int blocksize = 4096;
+static unsigned int num_threads;
+static unsigned int chunk_size = 1048576;
+static unsigned int dump_output;
+static unsigned int odirect;
+static unsigned int collision_check;
+static unsigned int print_progress = 1;
+static unsigned int use_bloom = 1;
+
+static uint64_t total_size;
+static uint64_t cur_offset;
+static struct fio_mutex *size_lock;
+
+static struct fio_file file;
+
+static uint64_t get_size(struct fio_file *f, struct stat *sb)
+{
+	uint64_t ret;
+
+	if (S_ISBLK(sb->st_mode)) {
+		unsigned long long bytes;
+
+		if (blockdev_size(f, &bytes)) {
+			log_err("dedupe: failed getting bdev size\n");
+			return 0;
+		}
+		ret = bytes;
+	} else
+		ret = sb->st_size;
+
+	return (ret & ~((uint64_t)blocksize - 1));
+}
+
+static int get_work(uint64_t *offset, uint64_t *size)
+{
+	uint64_t this_chunk;
+	int ret = 1;
+
+	fio_mutex_down(size_lock);
+
+	if (cur_offset < total_size) {
+		*offset = cur_offset;
+		this_chunk = min((uint64_t)chunk_size, total_size - cur_offset);
+		*size = this_chunk;
+		cur_offset += this_chunk;
+		ret = 0;
+	}
+
+	fio_mutex_up(size_lock);
+	return ret;
+}
+
+static int __read_block(int fd, void *buf, off_t offset, size_t count)
+{
+	ssize_t ret;
+
+	ret = pread(fd, buf, count, offset);
+	if (ret < 0) {
+		perror("pread");
+		return 1;
+	} else if (!ret)
+		return 1;
+	else if (ret != count) {
+		log_err("dedupe: short read on block\n");
+		return 1;
+	}
+
+	return 0;
+}
+
+static int read_block(int fd, void *buf, off_t offset)
+{
+	return __read_block(fd, buf, offset, blocksize);
+}
+
+static void add_item(struct chunk *c, struct item *i)
+{
+	/*
+	 * Save some memory and don't add extent items if we don't
+	 * use them.
+	 */
+	if (dump_output || collision_check) {
+		struct extent *e;
+
+		e = malloc(sizeof(*e));
+		e->offset = i->offset;
+		flist_add_tail(&e->list, &c->extent_list[0]);
+	}
+
+	c->count++;
+}
+
+static int col_check(struct chunk *c, struct item *i)
+{
+	struct extent *e;
+	char *cbuf, *ibuf;
+	int ret = 1;
+
+	cbuf = fio_memalign(blocksize, blocksize);
+	ibuf = fio_memalign(blocksize, blocksize);
+
+	e = flist_entry(c->extent_list[0].next, struct extent, list);
+	if (read_block(file.fd, cbuf, e->offset))
+		goto out;
+
+	if (read_block(file.fd, ibuf, i->offset))
+		goto out;
+
+	ret = memcmp(ibuf, cbuf, blocksize);
+out:
+	fio_memfree(cbuf, blocksize);
+	fio_memfree(ibuf, blocksize);
+	return ret;
+}
+
+static struct chunk *alloc_chunk(void)
+{
+	struct chunk *c;
+
+	if (collision_check || dump_output) {
+		c = malloc(sizeof(struct chunk) + sizeof(struct flist_head));
+		INIT_FLIST_HEAD(&c->extent_list[0]);
+	} else
+		c = malloc(sizeof(struct chunk));
+
+	return c;
+}
+
+static void insert_chunk(struct item *i)
+{
+	struct rb_node **p, *parent;
+	struct chunk *c;
+	int diff;
+
+	p = &rb_root.rb_node;
+	parent = NULL;
+	while (*p) {
+		parent = *p;
+
+		c = rb_entry(parent, struct chunk, rb_node);
+		diff = memcmp(i->hash, c->hash, sizeof(i->hash));
+		if (diff < 0)
+			p = &(*p)->rb_left;
+		else if (diff > 0)
+			p = &(*p)->rb_right;
+		else {
+			int ret;
+
+			if (!collision_check)
+				goto add;
+
+			fio_mutex_up(rb_lock);
+			ret = col_check(c, i);
+			fio_mutex_down(rb_lock);
+
+			if (!ret)
+				goto add;
+
+			p = &(*p)->rb_right;
+		}
+	}
+
+	c = alloc_chunk();
+	RB_CLEAR_NODE(&c->rb_node);
+	c->count = 0;
+	memcpy(c->hash, i->hash, sizeof(i->hash));
+	rb_link_node(&c->rb_node, parent, p);
+	rb_insert_color(&c->rb_node, &rb_root);
+add:
+	add_item(c, i);
+}
+
+static void insert_chunks(struct item *items, unsigned int nitems,
+			  uint64_t *ndupes)
+{
+	int i;
+
+	fio_mutex_down(rb_lock);
+
+	for (i = 0; i < nitems; i++) {
+		if (bloom) {
+			unsigned int s;
+			int r;
+
+			s = sizeof(items[i].hash) / sizeof(uint32_t);
+			r = bloom_set(bloom, items[i].hash, s);
+			*ndupes += r;
+		} else
+			insert_chunk(&items[i]);
+	}
+
+	fio_mutex_up(rb_lock);
+}
+
+static void crc_buf(void *buf, uint32_t *hash)
+{
+	struct fio_md5_ctx ctx = { .hash = hash };
+
+	fio_md5_init(&ctx);
+	fio_md5_update(&ctx, buf, blocksize);
+	fio_md5_final(&ctx);
+}
+
+static unsigned int read_blocks(int fd, void *buf, off_t offset, size_t size)
+{
+	if (__read_block(fd, buf, offset, size))
+		return 0;
+
+	return size / blocksize;
+}
+
+static int do_work(struct worker_thread *thread, void *buf)
+{
+	unsigned int nblocks, i;
+	off_t offset;
+	int nitems = 0;
+	uint64_t ndupes = 0;
+	struct item *items;
+
+	offset = thread->cur_offset;
+
+	nblocks = read_blocks(thread->fd, buf, offset, min(thread->size, (uint64_t)chunk_size));
+	if (!nblocks)
+		return 1;
+
+	items = malloc(sizeof(*items) * nblocks);
+
+	for (i = 0; i < nblocks; i++) {
+		void *thisptr = buf + (i * blocksize);
+
+		items[i].offset = offset;
+		crc_buf(thisptr, items[i].hash);
+		offset += blocksize;
+		nitems++;
+	}
+
+	insert_chunks(items, nitems, &ndupes);
+
+	free(items);
+	thread->items += nitems;
+	thread->dupes += ndupes;
+	return 0;
+}
+
+static void *thread_fn(void *data)
+{
+	struct worker_thread *thread = data;
+	void *buf;
+
+	buf = fio_memalign(blocksize, chunk_size);
+
+	do {
+		if (get_work(&thread->cur_offset, &thread->size)) {
+			thread->err = 1;
+			break;
+		}
+		if (do_work(thread, buf)) {
+			thread->err = 1;
+			break;
+		}
+	} while (1);
+
+	thread->done = 1;
+	fio_memfree(buf, chunk_size);
+	return NULL;
+}
+
+static void show_progress(struct worker_thread *threads, unsigned long total)
+{
+	unsigned long last_nitems = 0;
+	struct timeval last_tv;
+
+	fio_gettime(&last_tv, NULL);
+
+	while (print_progress) {
+		unsigned long this_items;
+		unsigned long nitems = 0;
+		uint64_t tdiff;
+		float perc;
+		int some_done = 0;
+		int i;
+
+		for (i = 0; i < num_threads; i++) {
+			nitems += threads[i].items;
+			some_done = threads[i].done;
+			if (some_done)
+				break;
+		}
+
+		if (some_done)
+			break;
+
+		perc = (float) nitems / (float) total;
+		perc *= 100.0;
+		this_items = nitems - last_nitems;
+		this_items *= blocksize;
+		tdiff = mtime_since_now(&last_tv);
+		if (tdiff) {
+			this_items = (this_items * 1000) / (tdiff * 1024);
+			printf("%3.2f%% done (%luKB/sec)\r", perc, this_items);
+			last_nitems = nitems;
+			fio_gettime(&last_tv, NULL);
+		} else
+			printf("%3.2f%% done\r", perc);
+		fflush(stdout);
+		usleep(250000);
+	}
+}
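
The progress line above turns newly hashed blocks into a throughput figure: blocks completed since the last update, times the block size, divided by the elapsed milliseconds and scaled to KB/sec. A worked sketch of that arithmetic; the 2560-block, 4 KiB, 250 ms figures are example numbers only:

#include <stdio.h>

int main(void)
{
	unsigned long long blocks_done = 2560;	/* example: blocks hashed since the last update */
	unsigned long long blocksize = 4096;	/* example: -b value */
	unsigned long long tdiff_msec = 250;	/* example: interval between updates */
	unsigned long long bytes = blocks_done * blocksize;
	unsigned long long kb_per_sec = (bytes * 1000) / (tdiff_msec * 1024);

	printf("%lluKB/sec\n", kb_per_sec);	/* 40960KB/sec */
	return 0;
}
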
+
+static int run_dedupe_threads(struct fio_file *f, uint64_t dev_size,
+			      uint64_t *nextents, uint64_t *nchunks)
+{
+	struct worker_thread *threads;
+	unsigned long nitems, total_items;
+	int i, err = 0;
+
+	total_size = dev_size;
+	total_items = dev_size / blocksize;
+	cur_offset = 0;
+	size_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
+
+	threads = malloc(num_threads * sizeof(struct worker_thread));
+	for (i = 0; i < num_threads; i++) {
+		memset(&threads[i], 0, sizeof(struct worker_thread));
+		threads[i].fd = f->fd;
+
+		err = pthread_create(&threads[i].thread, NULL, thread_fn, &threads[i]);
+		if (err) {
+			log_err("fio: thread startup failed\n");
+			break;
+		}
+	}
+
+	show_progress(threads, total_items);
+
+	nitems = 0;
+	*nextents = 0;
+	*nchunks = 1;
+	for (i = 0; i < num_threads; i++) {
+		void *ret;
+		pthread_join(threads[i].thread, &ret);
+		nitems += threads[i].items;
+		*nchunks += threads[i].dupes;
+	}
+
+	printf("Threads(%u): %lu items processed\n", num_threads, nitems);
+
+	*nextents = nitems;
+	*nchunks = nitems - *nchunks;
+
+	fio_mutex_remove(size_lock);
+	free(threads);
+	return err;
+}
+
+static int dedupe_check(const char *filename, uint64_t *nextents,
+			uint64_t *nchunks)
+{
+	uint64_t dev_size;
+	struct stat sb;
+	int flags;
+
+	flags = O_RDONLY;
+	if (odirect)
+		flags |= OS_O_DIRECT;
+
+	memset(&file, 0, sizeof(file));
+	file.file_name = strdup(filename);
+
+	file.fd = open(filename, flags);
+	if (file.fd == -1) {
+		perror("open");
+		goto err;
+	}
+
+	if (fstat(file.fd, &sb) < 0) {
+		perror("fstat");
+		goto err;
+	}
+
+	dev_size = get_size(&file, &sb);
+	if (!dev_size)
+		goto err;
+
+	if (use_bloom) {
+		uint64_t bloom_entries;
+
+		bloom_entries = 8 * (dev_size / blocksize);
+		bloom = bloom_new(bloom_entries);
+	}
+
+	printf("Will check <%s>, size <%llu>, using %u threads\n", filename, (unsigned long long) dev_size, num_threads);
+
+	return run_dedupe_threads(&file, dev_size, nextents, nchunks);
+err:
+	if (file.fd != -1)
+		close(file.fd);
+	free(file.file_name);
+	return 1;
+}
+
+static void show_chunk(struct chunk *c)
+{
+	struct flist_head *n;
+	struct extent *e;
+
+	printf("c hash %8x %8x %8x %8x, count %lu\n", c->hash[0], c->hash[1], c->hash[2], c->hash[3], (unsigned long) c->count);
+	flist_for_each(n, &c->extent_list[0]) {
+		e = flist_entry(n, struct extent, list);
+		printf("\toffset %llu\n", (unsigned long long) e->offset);
+	}
+}
+
+static void show_stat(uint64_t nextents, uint64_t nchunks)
+{
+	double perc, ratio;
+
+	printf("Extents=%lu, Unique extents=%lu\n", (unsigned long) nextents, (unsigned long) nchunks);
+
+	if (nchunks) {
+		ratio = (double) nextents / (double) nchunks;
+		printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);
+	} else
+		printf("De-dupe ratio: 1:infinite\n");
+
+	perc = 1.00 - ((double) nchunks / (double) nextents);
+	perc *= 100.0;
+	printf("Fio setting: dedupe_percentage=%u\n", (int) (perc + 0.50));
+
+}
+
+static void iter_rb_tree(uint64_t *nextents, uint64_t *nchunks)
+{
+	struct rb_node *n;
+
+	*nchunks = *nextents = 0;
+
+	n = rb_first(&rb_root);
+	if (!n)
+		return;
+
+	do {
+		struct chunk *c;
+
+		c = rb_entry(n, struct chunk, rb_node);
+		(*nchunks)++;
+		*nextents += c->count;
+
+		if (dump_output)
+			show_chunk(c);
+
+	} while ((n = rb_next(n)) != NULL);
+}
+
+static int usage(char *argv[])
+{
+	log_err("Check for dedupable blocks on a device/file\n\n");
+	log_err("%s: [options] <device or file>\n", argv[0]);
+	log_err("\t-b\tBlock size to use\n");
+	log_err("\t-t\tNumber of threads to use\n");
+	log_err("\t-d\tFull extent/chunk debug output\n");
+	log_err("\t-o\tUse O_DIRECT\n");
+	log_err("\t-c\tFull collision check\n");
+	log_err("\t-B\tUse probabilistic bloom filter\n");
+	log_err("\t-p\tPrint progress indicator\n");
+	return 1;
+}
+
+int main(int argc, char *argv[])
+{
+	uint64_t nextents = 0, nchunks = 0;
+	int c, ret;
+
+	debug_init();
+
+	while ((c = getopt(argc, argv, "b:t:d:o:c:p:B:")) != -1) {
+		switch (c) {
+		case 'b':
+			blocksize = atoi(optarg);
+			break;
+		case 't':
+			num_threads = atoi(optarg);
+			break;
+		case 'd':
+			dump_output = atoi(optarg);
+			break;
+		case 'o':
+			odirect = atoi(optarg);
+			break;
+		case 'c':
+			collision_check = atoi(optarg);
+			break;
+		case 'p':
+			print_progress = atoi(optarg);
+			break;
+		case 'B':
+			use_bloom = atoi(optarg);
+			break;
+		case '?':
+		default:
+			return usage(argv);
+		}
+	}
+
+	if (collision_check || dump_output)
+		use_bloom = 0;
+
+	if (!num_threads)
+		num_threads = cpus_online();
+
+	if (argc == optind)
+		return usage(argv);
+
+	sinit();
+
+	rb_root = RB_ROOT;
+	rb_lock = fio_mutex_init(FIO_MUTEX_UNLOCKED);
+
+	ret = dedupe_check(argv[optind], &nextents, &nchunks);
+
+	if (!ret) {
+		if (!bloom)
+			iter_rb_tree(&nextents, &nchunks);
+
+		show_stat(nextents, nchunks);
+	}
+
+	fio_mutex_remove(rb_lock);
+	if (bloom)
+		bloom_free(bloom);
+	scleanup();
+	return ret;
+}
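
show_stat() above reduces the scan to the two numbers fio cares about: the dedupe ratio nextents/nchunks (reported relative to 1) and dedupe_percentage, the rounded share of non-unique extents. A small worked sketch of those formulas; the 1200/800 counts are only example inputs:

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	uint64_t nextents = 1200;	/* example: total blocks scanned */
	uint64_t nchunks = 800;		/* example: unique blocks found */
	double ratio = (double) nextents / (double) nchunks;
	double perc = 100.0 * (1.0 - (double) nchunks / (double) nextents);

	printf("De-dupe ratio: 1:%3.2f\n", ratio - 1.0);		/* 1:0.50 */
	printf("dedupe_percentage=%u\n", (unsigned int)(perc + 0.50));	/* 33 */
	return 0;
}
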
diff --git a/t/jobs/t0009-f8b0bd10.fio b/t/jobs/t0009-f8b0bd10.fio
new file mode 100644
index 0000000..90e07ad
--- /dev/null
+++ b/t/jobs/t0009-f8b0bd10.fio
@@ -0,0 +1,40 @@
+# Expected result: fio verifies and runs for 1m
+# Buggy result: fio crashes with:
+# __get_io_u: Assertion `io_u->flags & IO_U_F_FREE' failed
+
+[global]
+direct=1
+ioengine=null
+size=20g
+norandommap
+randrepeat=0
+bs=4096
+iodepth=170
+#iodepth=96
+#numjobs=1
+numjobs=1
+#numjobs=24
+# number_ios=1
+# runtime=216000
+runtime=3600
+time_based=1
+group_reporting=1
+thread
+gtod_reduce=1
+iodepth_batch=4
+iodepth_batch_complete=4
+cpus_allowed=0-5
+cpus_allowed_policy=split
+rw=randwrite
+verify=crc32c-intel
+verify_backlog=1m
+do_verify=1
+verify_async=6
+verify_async_cpus=0-5
+runtime=1m
+
+[4_KiB_RR_drive_r]
+
+[4_KiB_RR_drive_s]
+
+
diff --git a/t/lfsr-test.c b/t/lfsr-test.c
index d371087..901f1a6 100644
--- a/t/lfsr-test.c
+++ b/t/lfsr-test.c
@@ -8,6 +8,8 @@
 #include <sys/stat.h>
 
 #include "../lib/lfsr.h"
+#include "../gettime.h"
+#include "../fio_time.h"
 
 void usage()
 {
@@ -25,7 +27,7 @@
 int main(int argc, char *argv[])
 {
 	int r;
-	struct timespec start, end;
+	struct timeval start, end;
 	struct fio_lfsr *fl;
 	int verify = 0;
 	unsigned int spin = 0;
@@ -65,11 +67,11 @@
 	printf("LFSR specs\n");
 	printf("==========================\n");
 	printf("Size is         %u\n", 64 - __builtin_clzl(fl->cached_bit));
-	printf("Max val is      %lu\n", fl->max_val);
-	printf("XOR-mask is     0x%lX\n", fl->xormask);
-	printf("Seed is         %lu\n", fl->last_val);
+	printf("Max val is      %lu\n", (unsigned long) fl->max_val);
+	printf("XOR-mask is     0x%lX\n", (unsigned long) fl->xormask);
+	printf("Seed is         %lu\n", (unsigned long) fl->last_val);
 	printf("Spin is         %u\n", fl->spin);
-	printf("Cycle length is %lu\n", fl->cycle_length);
+	printf("Cycle length is %lu\n", (unsigned long) fl->cycle_length);
 
 	/* Create verification table */
 	if (verify) {
@@ -86,12 +88,12 @@
 	 * negligible overhead.
 	 */
 	fprintf(stderr, "\nTest initiated... ");
-	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &start);
-	while (!lfsr_next(fl, &i, fl->max_val)) {
+	fio_gettime(&start, NULL);
+	while (!lfsr_next(fl, &i)) {
 		if (verify)
 			*(uint8_t *)(v + i) += 1;
 	}
-	clock_gettime(CLOCK_PROCESS_CPUTIME_ID, &end);
+	fio_gettime(&end, NULL);
 	fprintf(stderr, "finished.\n");
 
 
@@ -102,7 +104,8 @@
 		for (i = 0; i < numbers; i++) {
 			if (*(uint8_t *)(v + i) != 1) {
 				fprintf(stderr, "failed (%lu = %d).\n",
-						i, *(uint8_t *)(v + i));
+						(unsigned long) i,
+						*(uint8_t *)(v + i));
 				r = 1;
 				break;
 			}
@@ -112,8 +115,7 @@
 	}
 
 	/* Calculate elapsed time and mean time per number */
-	total = (end.tv_sec - start.tv_sec) * pow(10,9) +
-		end.tv_nsec - start.tv_nsec;
+	total = utime_since(&start, &end);
 	mean = total / fl->num_vals;
 
 	printf("\nTime results ");
@@ -121,7 +123,7 @@
 		printf("(slower due to verification)");
 	printf("\n==============================\n");
 	printf("Elapsed: %lf s\n", total / pow(10,9));
-	printf("Mean:    %lf ns\n", mean);
+	printf("Mean:    %lf us\n", mean);
 
 	free(v_start);
 	free(fl);
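
The lfsr-test changes above replace CLOCK_PROCESS_CPUTIME_ID nanosecond timing with fio's fio_gettime()/utime_since() pair, so total now holds microseconds. A minimal sketch of the same elapsed-microseconds measurement using plain gettimeofday(), independent of fio's helpers; usec_between is an illustrative name:

#include <stdio.h>
#include <stdint.h>
#include <sys/time.h>

static uint64_t usec_between(const struct timeval *s, const struct timeval *e)
{
	int64_t us = (int64_t)(e->tv_sec - s->tv_sec) * 1000000 +
		     (e->tv_usec - s->tv_usec);

	return us > 0 ? (uint64_t) us : 0;
}

int main(void)
{
	struct timeval start, end;

	gettimeofday(&start, NULL);
	/* ... work under test ... */
	gettimeofday(&end, NULL);

	printf("Elapsed: %lf s\n", (double) usec_between(&start, &end) / 1000000.0);
	return 0;
}
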
diff --git a/t/stest.c b/t/stest.c
index 0da8f2c..efb256e 100644
--- a/t/stest.c
+++ b/t/stest.c
@@ -4,10 +4,7 @@
 
 #include "../smalloc.h"
 #include "../flist.h"
-
-FILE *f_err;
-struct timeval *fio_tv = NULL;
-unsigned int fio_debug = 0;
+#include "debug.h"
 
 #define MAGIC1	0xa9b1c8d2
 #define MAGIC2	0xf0a1e9b3
@@ -72,9 +69,8 @@
 
 int main(int argc, char *argv[])
 {
-	f_err = stderr;
-
 	sinit();
+	debug_init();
 
 	do_rand_allocs();
 
@@ -84,7 +80,3 @@
 	scleanup();
 	return 0;
 }
-
-void __dprint(int type, const char *str, ...)
-{
-}
diff --git a/thread_options.h b/thread_options.h
index 57d84db..611f8e7 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -3,6 +3,7 @@
 
 #include "arch/arch.h"
 #include "os/os.h"
+#include "options.h"
 #include "stat.h"
 #include "gettime.h"
 #include "lib/ieee754.h"
@@ -28,10 +29,13 @@
 	uint32_t perc;
 };
 
+#define NR_OPTS_SZ	(FIO_MAX_OPTS / (8 * sizeof(uint64_t)))
+
 #define OPT_MAGIC	0x4f50544e
 
 struct thread_options {
 	int magic;
+	uint64_t set_options[NR_OPTS_SZ];
 	char *description;
 	char *name;
 	char *directory;
@@ -99,6 +103,8 @@
 	unsigned long long verify_backlog;
 	unsigned int verify_batch;
 	unsigned int experimental_verify;
+	unsigned int verify_state;
+	unsigned int verify_state_save;
 	unsigned int use_thread;
 	unsigned int unlink;
 	unsigned int do_disk_util;
@@ -106,8 +112,11 @@
 	unsigned int rand_repeatable;
 	unsigned int allrand_repeatable;
 	unsigned long long rand_seed;
-	unsigned int use_os_rand;
+	unsigned int dep_use_os_rand;
 	unsigned int log_avg_msec;
+	unsigned int log_offset;
+	unsigned int log_gz;
+	unsigned int log_gz_store;
 	unsigned int norandommap;
 	unsigned int softrandommap;
 	unsigned int bs_unaligned;
@@ -154,16 +163,12 @@
 	unsigned int new_group;
 	unsigned int numjobs;
 	os_cpu_mask_t cpumask;
-	unsigned int cpumask_set;
 	os_cpu_mask_t verify_cpumask;
-	unsigned int verify_cpumask_set;
 	unsigned int cpus_allowed_policy;
 	char *numa_cpunodes;
-	unsigned int numa_cpumask_set;
 	unsigned short numa_mem_mode;
 	unsigned int numa_mem_prefer_node;
 	char *numa_memnodes;
-	unsigned int numa_memmask_set;
 	unsigned int iolog;
 	unsigned int rwmixcycle;
 	unsigned int rwmix[DDIR_RWDIR_CNT];
@@ -181,6 +186,7 @@
 	unsigned int buffer_pattern_bytes;
 	unsigned int compress_percentage;
 	unsigned int compress_chunk;
+	unsigned int dedupe_percentage;
 	unsigned int time_based;
 	unsigned int disable_lat;
 	unsigned int disable_clat;
@@ -189,7 +195,6 @@
 	unsigned int unified_rw_rep;
 	unsigned int gtod_reduce;
 	unsigned int gtod_cpu;
-	unsigned int gtod_offload;
 	enum fio_cs clocksource;
 	unsigned int no_stall;
 	unsigned int trim_percentage;
@@ -259,6 +264,7 @@
 #define FIO_TOP_STR_MAX		256
 
 struct thread_options_pack {
+	uint64_t set_options[NR_OPTS_SZ];
 	uint8_t description[FIO_TOP_STR_MAX];
 	uint8_t name[FIO_TOP_STR_MAX];
 	uint8_t directory[FIO_TOP_STR_MAX];
@@ -326,6 +332,8 @@
 	uint64_t verify_backlog;
 	uint32_t verify_batch;
 	uint32_t experimental_verify;
+	uint32_t verify_state;
+	uint32_t verify_state_save;
 	uint32_t use_thread;
 	uint32_t unlink;
 	uint32_t do_disk_util;
@@ -333,8 +341,11 @@
 	uint32_t rand_repeatable;
 	uint32_t allrand_repeatable;
 	uint64_t rand_seed;
-	uint32_t use_os_rand;
+	uint32_t dep_use_os_rand;
 	uint32_t log_avg_msec;
+	uint32_t log_offset;
+	uint32_t log_gz;
+	uint32_t log_gz_store;
 	uint32_t norandommap;
 	uint32_t softrandommap;
 	uint32_t bs_unaligned;
@@ -342,6 +353,7 @@
 	uint32_t bs_is_seq_rand;
 
 	uint32_t random_distribution;
+	uint32_t pad;
 	fio_fp64_t zipf_theta;
 	fio_fp64_t pareto_h;
 
@@ -378,9 +390,7 @@
 	uint32_t new_group;
 	uint32_t numjobs;
 	uint8_t cpumask[FIO_TOP_STR_MAX];
-	uint32_t cpumask_set;
 	uint8_t verify_cpumask[FIO_TOP_STR_MAX];
-	uint32_t verify_cpumask_set;
 	uint32_t cpus_allowed_policy;
 	uint32_t iolog;
 	uint32_t rwmixcycle;
@@ -397,8 +407,9 @@
 	uint32_t scramble_buffers;
 	uint8_t buffer_pattern[MAX_PATTERN_SIZE];
 	uint32_t buffer_pattern_bytes;
-	unsigned int compress_percentage;
-	unsigned int compress_chunk;
+	uint32_t compress_percentage;
+	uint32_t compress_chunk;
+	uint32_t dedupe_percentage;
 	uint32_t time_based;
 	uint32_t disable_lat;
 	uint32_t disable_clat;
@@ -407,7 +418,6 @@
 	uint32_t unified_rw_rep;
 	uint32_t gtod_reduce;
 	uint32_t gtod_cpu;
-	uint32_t gtod_offload;
 	uint32_t clocksource;
 	uint32_t no_stall;
 	uint32_t trim_percentage;
@@ -416,6 +426,7 @@
 	uint64_t trim_backlog;
 	uint32_t clat_percentiles;
 	uint32_t percentile_precision;
+	uint32_t pad2;
 	fio_fp64_t percentile_list[FIO_IO_U_LIST_MAX_LEN];
 
 	uint8_t read_iolog_file[FIO_TOP_STR_MAX];
@@ -471,6 +482,7 @@
 
 	uint64_t latency_target;
 	uint64_t latency_window;
+	uint32_t pad3;
 	fio_fp64_t latency_percentile;
 } __attribute__((packed));
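
The new set_options[] members above act as a per-job bitmap with one bit per option index (NR_OPTS_SZ words of 64 bits each); the verify.c hunk further down queries it via fio_option_is_set() instead of the removed *_set flags. A minimal sketch of how such a bitmap is typically maintained; the opts_bitmap, opt_mark_set and opt_is_set names and the MAX_OPTS value are illustrative, not fio's:

#include <stdint.h>

#define MAX_OPTS	512	/* stand-in for FIO_MAX_OPTS */
#define OPTS_WORDS	(MAX_OPTS / (8 * sizeof(uint64_t)))

struct opts_bitmap {
	uint64_t set_options[OPTS_WORDS];
};

static void opt_mark_set(struct opts_bitmap *o, unsigned int idx)
{
	o->set_options[idx / 64] |= 1ULL << (idx & 63);
}

static int opt_is_set(const struct opts_bitmap *o, unsigned int idx)
{
	return (o->set_options[idx / 64] & (1ULL << (idx & 63))) != 0;
}

int main(void)
{
	struct opts_bitmap o = { { 0 } };

	opt_mark_set(&o, 70);
	return !opt_is_set(&o, 70);	/* exits 0 if the bit round-trips */
}
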
 
diff --git a/time.c b/time.c
index f3de3e7..f1833c7 100644
--- a/time.c
+++ b/time.c
@@ -9,25 +9,29 @@
 /*
  * busy looping version for the last few usec
  */
-void usec_spin(unsigned int usec)
+uint64_t usec_spin(unsigned int usec)
 {
 	struct timeval start;
+	uint64_t t;
 
 	fio_gettime(&start, NULL);
-	while (utime_since_now(&start) < usec)
+	while ((t = utime_since_now(&start)) < usec)
 		nop;
+
+	return t;
 }
 
-void usec_sleep(struct thread_data *td, unsigned long usec)
+uint64_t usec_sleep(struct thread_data *td, unsigned long usec)
 {
 	struct timespec req;
 	struct timeval tv;
+	uint64_t t = 0;
 
 	do {
 		unsigned long ts = usec;
 
 		if (usec < ns_granularity) {
-			usec_spin(usec);
+			t += usec_spin(usec);
 			break;
 		}
 
@@ -46,11 +50,19 @@
 			break;
 
 		ts = utime_since_now(&tv);
+		t += ts;
 		if (ts >= usec)
 			break;
 
 		usec -= ts;
 	} while (!td->terminate);
+
+	return t;
+}
+
+uint64_t time_since_genesis(void)
+{
+	return time_since_now(&genesis);
 }
 
 uint64_t mtime_since_genesis(void)
diff --git a/trim.c b/trim.c
index de792dc..95c433b 100644
--- a/trim.c
+++ b/trim.c
@@ -24,7 +24,7 @@
 		return 1;
 
 	assert(td->trim_entries);
-	ipo = flist_entry(td->trim_list.next, struct io_piece, trim_list);
+	ipo = flist_first_entry(&td->trim_list, struct io_piece, trim_list);
 	remove_trim_entry(td, ipo);
 
 	io_u->offset = ipo->offset;
@@ -75,13 +75,8 @@
 	if (!td->o.trim_percentage)
 		return 0;
 
-	if (td->o.use_os_rand) {
-		r = os_random_long(&td->trim_state);
-		val = (OS_RAND_MAX / 100ULL);
-	} else {
-		r = __rand(&td->__trim_state);
-		val = (FRAND_MAX / 100ULL);
-	}
+	r = __rand(&td->trim_state);
+	val = (FRAND_MAX / 100ULL);
 
 	val *= (unsigned long long) td->o.trim_percentage;
 	return r <= val;
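
With the os_random_long() path removed above, the trim decision is a single scaled comparison: __rand() yields a value in [0, FRAND_MAX], and r <= (FRAND_MAX / 100) * trim_percentage holds roughly trim_percentage percent of the time. A small sketch of the same idiom using the standard rand()/RAND_MAX pair; should_trim is an illustrative name:

#include <stdio.h>
#include <stdlib.h>

/* Returns non-zero on roughly 'pct' percent of calls, for 0 <= pct <= 100. */
static int should_trim(unsigned int pct)
{
	unsigned long long r = rand();
	unsigned long long val = ((unsigned long long) RAND_MAX / 100ULL) * pct;

	return r <= val;
}

int main(void)
{
	int i, hits = 0;

	srand(1);
	for (i = 0; i < 100000; i++)
		hits += should_trim(25);
	printf("~%d%% of calls trimmed\n", hits / 1000);	/* roughly 25 */
	return 0;
}
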
diff --git a/verify.c b/verify.c
index 282a8cf..b6793d7 100644
--- a/verify.c
+++ b/verify.c
@@ -29,62 +29,47 @@
 			 struct verify_header *hdr, unsigned int header_num,
 			 unsigned int header_len);
 
-static void fill_pattern(struct thread_data *td, void *p, unsigned int len,
-			 char *pattern, unsigned int pattern_bytes)
-{
-	switch (pattern_bytes) {
-	case 0:
-		assert(0);
-		break;
-	case 1:
-		dprint(FD_VERIFY, "fill verify pattern b=0 len=%u\n", len);
-		memset(p, pattern[0], len);
-		break;
-	default: {
-		unsigned int i = 0, size = 0;
-		unsigned char *b = p;
-
-		dprint(FD_VERIFY, "fill verify pattern b=%d len=%u\n",
-					pattern_bytes, len);
-
-		while (i < len) {
-			size = pattern_bytes;
-			if (size > (len - i))
-				size = len - i;
-			memcpy(b+i, pattern, size);
-			i += size;
-		}
-		break;
-		}
-	}
-}
-
 void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len)
 {
-	fill_pattern(td, p, len, td->o.buffer_pattern, td->o.buffer_pattern_bytes);
+	fill_pattern(p, len, td->o.buffer_pattern, td->o.buffer_pattern_bytes);
+}
+
+void __fill_buffer(struct thread_options *o, unsigned long seed, void *p,
+		   unsigned int len)
+{
+	__fill_random_buf_percentage(seed, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
+}
+
+unsigned long fill_buffer(struct thread_data *td, void *p, unsigned int len)
+{
+	struct frand_state *fs = &td->verify_state;
+	struct thread_options *o = &td->o;
+
+	return fill_random_buf_percentage(fs, p, o->compress_percentage, len, len, o->buffer_pattern, o->buffer_pattern_bytes);
 }
 
 void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len,
 			 struct io_u *io_u, unsigned long seed, int use_seed)
 {
-	if (!td->o.verify_pattern_bytes) {
+	struct thread_options *o = &td->o;
+
+	if (!o->verify_pattern_bytes) {
 		dprint(FD_VERIFY, "fill random bytes len=%u\n", len);
 
 		if (use_seed)
-			__fill_random_buf(p, len, seed);
+			__fill_buffer(o, seed, p, len);
 		else
-			io_u->rand_seed = fill_random_buf(&td->__verify_state, p, len);
+			io_u->rand_seed = fill_buffer(td, p, len);
 		return;
 	}
 
 	if (io_u->buf_filled_len >= len) {
 		dprint(FD_VERIFY, "using already filled verify pattern b=%d len=%u\n",
-			td->o.verify_pattern_bytes, len);
+			o->verify_pattern_bytes, len);
 		return;
 	}
 
-	fill_pattern(td, p, len, td->o.verify_pattern, td->o.verify_pattern_bytes);
-
+	fill_pattern(p, len, o->verify_pattern, o->verify_pattern_bytes);
 	io_u->buf_filled_len = len;
 }
 
@@ -405,13 +390,14 @@
 
 	/*
 	 * For read-only workloads, the program cannot be certain of the
-	 * last numberio written to a block. Checking of numberio will be done
-	 * only for workloads that write data.
-	 * For verify_only, numberio will be checked in the last iteration when
-	 * the correct state of numberio, that would have been written to each
-	 * block in a previous run of fio, has been reached.
+	 * last numberio written to a block. Checking of numberio will be
+	 * done only for workloads that write data.  For verify_only,
+	 * numberio will be checked in the last iteration when the correct
+	 * state of numberio, that would have been written to each block
+	 * in a previous run of fio, has been reached.
 	 */
-	if (td_write(td) || td_rw(td))
+	if ((td_write(td) || td_rw(td)) && (td_min_bs(td) == td_max_bs(td)) &&
+	    !td->o.time_based)
 		if (!td->o.verify_only || td->o.loops == 0)
 			if (vh->numberio != io_u->numberio)
 				ret = EILSEQ;
@@ -486,6 +472,7 @@
 
 	fio_sha256_init(&sha256_ctx);
 	fio_sha256_update(&sha256_ctx, p, hdr->len - hdr_size(hdr));
+	fio_sha256_final(&sha256_ctx);
 
 	if (!memcmp(vh->sha256, sha256_ctx.buf, sizeof(sha256)))
 		return 0;
@@ -511,6 +498,7 @@
 
 	fio_sha1_init(&sha1_ctx);
 	fio_sha1_update(&sha1_ctx, p, hdr->len - hdr_size(hdr));
+	fio_sha1_final(&sha1_ctx);
 
 	if (!memcmp(vh->sha1, sha1_ctx.H, sizeof(sha1)))
 		return 0;
@@ -641,6 +629,7 @@
 
 	fio_md5_init(&md5_ctx);
 	fio_md5_update(&md5_ctx, p, hdr->len - hdr_size(hdr));
+	fio_md5_final(&md5_ctx);
 
 	if (!memcmp(vh->md5_digest, md5_ctx.hash, sizeof(hash)))
 		return 0;
@@ -656,19 +645,21 @@
 /*
  * Push IO verification to a separate thread
  */
-int verify_io_u_async(struct thread_data *td, struct io_u *io_u)
+int verify_io_u_async(struct thread_data *td, struct io_u **io_u_ptr)
 {
-	if (io_u->file)
-		put_file_log(td, io_u->file);
+	struct io_u *io_u = *io_u_ptr;
 
 	pthread_mutex_lock(&td->io_u_lock);
 
+	if (io_u->file)
+		put_file_log(td, io_u->file);
+
 	if (io_u->flags & IO_U_F_IN_CUR_DEPTH) {
 		td->cur_depth--;
 		io_u->flags &= ~IO_U_F_IN_CUR_DEPTH;
 	}
 	flist_add_tail(&io_u->verify_list, &td->verify_list);
-	io_u->flags |= IO_U_F_FREE_DEF;
+	*io_u_ptr = NULL;
 	pthread_mutex_unlock(&td->io_u_lock);
 
 	pthread_cond_signal(&td->verify_cond);
@@ -709,34 +700,61 @@
 	return ret;
 }
 
-static int verify_header(struct io_u *io_u, struct verify_header *hdr)
+static int verify_header(struct io_u *io_u, struct verify_header *hdr,
+			 unsigned int hdr_num, unsigned int hdr_len)
 {
 	void *p = hdr;
 	uint32_t crc;
 
-	if (hdr->magic != FIO_HDR_MAGIC)
-		return 1;
-	if (hdr->len > io_u->buflen)
-		return 2;
-	if (hdr->rand_seed != io_u->rand_seed)
-		return 3;
+	if (hdr->magic != FIO_HDR_MAGIC) {
+		log_err("verify: bad magic header %x, wanted %x",
+			hdr->magic, FIO_HDR_MAGIC);
+		goto err;
+	}
+	if (hdr->len != hdr_len) {
+		log_err("verify: bad header length %u, wanted %u",
+			hdr->len, hdr_len);
+		goto err;
+	}
+	if (hdr->rand_seed != io_u->rand_seed) {
+		log_err("verify: bad header rand_seed %"PRIu64
+			", wanted %"PRIu64,
+			hdr->rand_seed, io_u->rand_seed);
+		goto err;
+	}
 
 	crc = fio_crc32c(p, offsetof(struct verify_header, crc32));
-	if (crc == hdr->crc32)
-		return 0;
-	log_err("fio: verify header crc %x, calculated %x\n", hdr->crc32, crc);
-	return 4;
+	if (crc != hdr->crc32) {
+		log_err("verify: bad header crc %x, calculated %x",
+			hdr->crc32, crc);
+		goto err;
+	}
+	return 0;
+
+err:
+	log_err(" at file %s offset %llu, length %u\n",
+		io_u->file->file_name,
+		io_u->offset + hdr_num * hdr_len, hdr_len);
+	return EILSEQ;
 }
 
-int verify_io_u(struct thread_data *td, struct io_u *io_u)
+int verify_io_u(struct thread_data *td, struct io_u **io_u_ptr)
 {
 	struct verify_header *hdr;
+	struct io_u *io_u = *io_u_ptr;
 	unsigned int header_size, hdr_inc, hdr_num = 0;
 	void *p;
 	int ret;
 
 	if (td->o.verify == VERIFY_NULL || io_u->ddir != DDIR_READ)
 		return 0;
+	/*
+	 * If the IO engine is faking IO (like null), then just pretend
+	 * we verified everything.
+	 */
+	if (td->io_ops->flags & FIO_FAKEIO)
+		return 0;
+
 	if (io_u->flags & IO_U_F_TRIMMED) {
 		ret = verify_trimmed_io_u(td, io_u);
 		goto done;
@@ -769,42 +787,9 @@
 		if (td->o.verifysort || (td->flags & TD_F_VER_BACKLOG))
 			io_u->rand_seed = hdr->rand_seed;
 
-		ret = verify_header(io_u, hdr);
-		switch (ret) {
-		case 0:
-			break;
-		case 1:
-			log_err("verify: bad magic header %x, wanted %x at "
-				"file %s offset %llu, length %u\n",
-				hdr->magic, FIO_HDR_MAGIC,
-				io_u->file->file_name,
-				io_u->offset + hdr_num * hdr->len, hdr->len);
-			return EILSEQ;
-			break;
-		case 2:
-			log_err("fio: verify header exceeds buffer length (%u "
-				"> %lu)\n", hdr->len, io_u->buflen);
-			return EILSEQ;
-			break;
-		case 3:
-			log_err("verify: bad header rand_seed %"PRIu64
-				", wanted %"PRIu64" at file %s offset %llu, "
-				"length %u\n",
-				hdr->rand_seed, io_u->rand_seed,
-				io_u->file->file_name,
-				io_u->offset + hdr_num * hdr->len, hdr->len);
-			return EILSEQ;
-			break;
-		case 4:
-			return EILSEQ;
-			break;
-		default:
-			log_err("verify: unknown header error at file %s "
-			"offset %llu, length %u\n",
-			io_u->file->file_name,
-			io_u->offset + hdr_num * hdr->len, hdr->len);
-			return EILSEQ;
-		}
+		ret = verify_header(io_u, hdr, hdr_num, hdr_inc);
+		if (ret)
+			return ret;
 
 		if (td->o.verify != VERIFY_NONE)
 			verify_type = td->o.verify;
@@ -861,7 +846,7 @@
 
 done:
 	if (ret && td->o.verify_fatal)
-		td->terminate = 1;
+		fio_mark_td_terminate(td);
 
 	return ret;
 }
@@ -911,6 +896,7 @@
 
 	fio_sha256_init(&sha256_ctx);
 	fio_sha256_update(&sha256_ctx, p, len);
+	fio_sha256_final(&sha256_ctx);
 }
 
 static void fill_sha1(struct verify_header *hdr, void *p, unsigned int len)
@@ -922,6 +908,7 @@
 
 	fio_sha1_init(&sha1_ctx);
 	fio_sha1_update(&sha1_ctx, p, len);
+	fio_sha1_final(&sha1_ctx);
 }
 
 static void fill_crc7(struct verify_header *hdr, void *p, unsigned int len)
@@ -968,6 +955,7 @@
 
 	fio_md5_init(&md5_ctx);
 	fio_md5_update(&md5_ctx, p, len);
+	fio_md5_final(&md5_ctx);
 }
 
 static void populate_hdr(struct thread_data *td, struct io_u *io_u,
@@ -1096,7 +1084,7 @@
 		assert(ipo->flags & IP_F_ONRB);
 		ipo->flags &= ~IP_F_ONRB;
 	} else if (!flist_empty(&td->io_hist_list)) {
-		ipo = flist_entry(td->io_hist_list.next, struct io_piece, list);
+		ipo = flist_first_entry(&td->io_hist_list, struct io_piece, list);
 
 		/*
 		 * Ensure that the associated IO has completed
@@ -1143,9 +1131,9 @@
 		dprint(FD_VERIFY, "get_next_verify: ret io_u %p\n", io_u);
 
 		if (!td->o.verify_pattern_bytes) {
-			io_u->rand_seed = __rand(&td->__verify_state);
+			io_u->rand_seed = __rand(&td->verify_state);
 			if (sizeof(int) != sizeof(long *))
-				io_u->rand_seed *= __rand(&td->__verify_state);
+				io_u->rand_seed *= __rand(&td->verify_state);
 		}
 		return 0;
 	}
@@ -1169,7 +1157,7 @@
 	struct io_u *io_u;
 	int ret = 0;
 
-	if (td->o.verify_cpumask_set &&
+	if (fio_option_is_set(&td->o, verify_cpumask) &&
 	    fio_setaffinity(td->pid, td->o.verify_cpumask)) {
 		log_err("fio: failed setting verify thread affinity\n");
 		goto done;
@@ -1201,10 +1189,12 @@
 			continue;
 
 		while (!flist_empty(&list)) {
-			io_u = flist_entry(list.next, struct io_u, verify_list);
-			flist_del(&io_u->verify_list);
+			io_u = flist_first_entry(&list, struct io_u, verify_list);
+			flist_del_init(&io_u->verify_list);
 
-			ret = verify_io_u(td, io_u);
+			io_u->flags |= IO_U_F_NO_FILE_PUT;
+			ret = verify_io_u(td, &io_u);
+
 			put_io_u(td, io_u);
 			if (!ret)
 				continue;
@@ -1219,7 +1209,7 @@
 	if (ret) {
 		td_verror(td, ret, "async_verify");
 		if (td->o.verify_fatal)
-			td->terminate = 1;
+			fio_mark_td_terminate(td);
 	}
 
 done:
@@ -1287,3 +1277,294 @@
 	free(td->verify_threads);
 	td->verify_threads = NULL;
 }
+
+struct all_io_list *get_all_io_list(int save_mask, size_t *sz)
+{
+	struct all_io_list *rep;
+	struct thread_data *td;
+	size_t depth;
+	void *next;
+	int i, nr;
+
+	compiletime_assert(sizeof(struct all_io_list) == 8, "all_io_list");
+
+	/*
+	 * Calculate reply space needed. We need one 'io_state' per thread,
+	 * and the size will vary depending on depth.
+	 */
+	depth = 0;
+	nr = 0;
+	for_each_td(td, i) {
+		if (save_mask != IO_LIST_ALL && (i + 1) != save_mask)
+			continue;
+		td->stop_io = 1;
+		td->flags |= TD_F_VSTATE_SAVED;
+		depth += td->o.iodepth;
+		nr++;
+	}
+
+	if (!nr)
+		return NULL;
+
+	*sz = sizeof(*rep);
+	*sz += nr * sizeof(struct thread_io_list);
+	*sz += depth * sizeof(uint64_t);
+	rep = malloc(*sz);
+
+	rep->threads = cpu_to_le64((uint64_t) nr);
+
+	next = &rep->state[0];
+	for_each_td(td, i) {
+		struct thread_io_list *s = next;
+		unsigned int comps;
+
+		if (save_mask != IO_LIST_ALL && (i + 1) != save_mask)
+			continue;
+
+		if (td->last_write_comp) {
+			int j, k;
+
+			if (td->io_blocks[DDIR_WRITE] < td->o.iodepth)
+				comps = td->io_blocks[DDIR_WRITE];
+			else
+				comps = td->o.iodepth;
+
+			k = td->last_write_idx - 1;
+			for (j = 0; j < comps; j++) {
+				if (k == -1)
+					k = td->o.iodepth - 1;
+				s->offsets[j] = cpu_to_le64(td->last_write_comp[k]);
+				k--;
+			}
+		} else
+			comps = 0;
+
+		s->no_comps = cpu_to_le64((uint64_t) comps);
+		s->depth = cpu_to_le64((uint64_t) td->o.iodepth);
+		s->numberio = cpu_to_le64((uint64_t) td->io_issues[DDIR_WRITE]);
+		s->index = cpu_to_le64((uint64_t) i);
+		s->rand.s[0] = cpu_to_le32(td->random_state.s1);
+		s->rand.s[1] = cpu_to_le32(td->random_state.s2);
+		s->rand.s[2] = cpu_to_le32(td->random_state.s3);
+		s->rand.s[3] = 0;
+		s->name[sizeof(s->name) - 1] = '\0';
+		strncpy((char *) s->name, td->o.name, sizeof(s->name) - 1);
+		next = io_list_next(s);
+	}
+
+	return rep;
+}
+
+static int open_state_file(const char *name, const char *prefix, int num,
+			   int for_write)
+{
+	char out[64];
+	int flags;
+	int fd;
+
+	if (for_write)
+		flags = O_CREAT | O_TRUNC | O_WRONLY | O_SYNC;
+	else
+		flags = O_RDONLY;
+
+	verify_state_gen_name(out, sizeof(out), name, prefix, num);
+
+	fd = open(out, flags, 0644);
+	if (fd == -1) {
+		perror("fio: open state file");
+		return -1;
+	}
+
+	return fd;
+}
+
+static int write_thread_list_state(struct thread_io_list *s,
+				   const char *prefix)
+{
+	struct verify_state_hdr hdr;
+	uint64_t crc;
+	ssize_t ret;
+	int fd;
+
+	fd = open_state_file((const char *) s->name, prefix, s->index, 1);
+	if (fd == -1)
+		return 1;
+
+	crc = fio_crc32c((void *)s, thread_io_list_sz(s));
+
+	hdr.version = cpu_to_le64((uint64_t) VSTATE_HDR_VERSION);
+	hdr.size = cpu_to_le64((uint64_t) thread_io_list_sz(s));
+	hdr.crc = cpu_to_le64(crc);
+	ret = write(fd, &hdr, sizeof(hdr));
+	if (ret != sizeof(hdr))
+		goto write_fail;
+
+	ret = write(fd, s, thread_io_list_sz(s));
+	if (ret != thread_io_list_sz(s)) {
+write_fail:
+		if (ret < 0)
+			perror("fio: write state file");
+		log_err("fio: failed to write state file\n");
+		ret = 1;
+	} else
+		ret = 0;
+
+	close(fd);
+	return ret;
+}
+
+void __verify_save_state(struct all_io_list *state, const char *prefix)
+{
+	struct thread_io_list *s = &state->state[0];
+	unsigned int i;
+
+	for (i = 0; i < le64_to_cpu(state->threads); i++) {
+		write_thread_list_state(s, prefix);
+		s = io_list_next(s);
+	}
+}
+
+void verify_save_state(void)
+{
+	struct all_io_list *state;
+	size_t sz;
+
+	state = get_all_io_list(IO_LIST_ALL, &sz);
+	if (state) {
+		__verify_save_state(state, "local");
+		free(state);
+	}
+}
+
+void verify_free_state(struct thread_data *td)
+{
+	if (td->vstate)
+		free(td->vstate);
+}
+
+void verify_convert_assign_state(struct thread_data *td,
+				 struct thread_io_list *s)
+{
+	int i;
+
+	s->no_comps = le64_to_cpu(s->no_comps);
+	s->depth = le64_to_cpu(s->depth);
+	s->numberio = le64_to_cpu(s->numberio);
+	for (i = 0; i < 4; i++)
+		s->rand.s[i] = le32_to_cpu(s->rand.s[i]);
+	for (i = 0; i < s->no_comps; i++)
+		s->offsets[i] = le64_to_cpu(s->offsets[i]);
+
+	td->vstate = s;
+}
+
+int verify_state_hdr(struct verify_state_hdr *hdr, struct thread_io_list *s)
+{
+	uint64_t crc;
+
+	hdr->version = le64_to_cpu(hdr->version);
+	hdr->size = le64_to_cpu(hdr->size);
+	hdr->crc = le64_to_cpu(hdr->crc);
+
+	if (hdr->version != VSTATE_HDR_VERSION)
+		return 1;
+
+	crc = fio_crc32c((void *)s, hdr->size);
+	if (crc != hdr->crc)
+		return 1;
+
+	return 0;
+}
+
+int verify_load_state(struct thread_data *td, const char *prefix)
+{
+	struct thread_io_list *s = NULL;
+	struct verify_state_hdr hdr;
+	uint64_t crc;
+	ssize_t ret;
+	int fd;
+
+	if (!td->o.verify_state)
+		return 0;
+
+	fd = open_state_file(td->o.name, prefix, td->thread_number - 1, 0);
+	if (fd == -1)
+		return 1;
+
+	ret = read(fd, &hdr, sizeof(hdr));
+	if (ret != sizeof(hdr)) {
+		if (ret < 0)
+			td_verror(td, errno, "read verify state hdr");
+		log_err("fio: failed reading verify state header\n");
+		goto err;
+	}
+
+	hdr.version = le64_to_cpu(hdr.version);
+	hdr.size = le64_to_cpu(hdr.size);
+	hdr.crc = le64_to_cpu(hdr.crc);
+
+	if (hdr.version != VSTATE_HDR_VERSION) {
+		log_err("fio: bad version in verify state header\n");
+		goto err;
+	}
+
+	s = malloc(hdr.size);
+	ret = read(fd, s, hdr.size);
+	if (ret != hdr.size) {
+		if (ret < 0)
+			td_verror(td, errno, "read verify state");
+		log_err("fio: failed reading verify state\n");
+		goto err;
+	}
+
+	crc = fio_crc32c((void *)s, hdr.size);
+	if (crc != hdr.crc) {
+		log_err("fio: verify state is corrupt\n");
+		goto err;
+	}
+
+	close(fd);
+
+	verify_convert_assign_state(td, s);
+	return 0;
+err:
+	if (s)
+		free(s);
+	close(fd);
+	return 1;
+}
+
+/*
+ * Use the loaded verify state to know when to stop doing verification
+ */
+int verify_state_should_stop(struct thread_data *td, struct io_u *io_u)
+{
+	struct thread_io_list *s = td->vstate;
+	int i;
+
+	if (!s)
+		return 0;
+
+	/*
+	 * If we're not yet within 'depth' issues of the end of the run,
+	 * continue verifying as usual. If fewer issues than 'depth' were
+	 * made in total, always do the check.
+	 */
+	if ((td->io_blocks[DDIR_READ] < s->depth ||
+	    s->numberio - td->io_blocks[DDIR_READ] > s->depth) &&
+	    s->numberio > s->depth)
+		return 0;
+
+	/*
+	 * We're in the window of having to check if this io was
+	 * completed or not. If the IO was seen as completed, then
+	 * let's verify it.
+	 */
+	for (i = 0; i < s->no_comps; i++)
+		if (io_u->offset == s->offsets[i])
+			return 0;
+
+	/*
+	 * Not found, we have to stop
+	 */
+	return 1;
+}
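
The verify-state plumbing above writes a small header (version, payload size, crc32c of the payload) followed by one thread_io_list blob per job, and refuses to load anything whose version or CRC does not match. A minimal sketch of that header-plus-checksum validation shape, assuming a toy checksum in place of fio's fio_crc32c(); the struct and function names here are illustrative:

#include <stdint.h>
#include <stddef.h>

#define STATE_HDR_VERSION	0x01	/* mirrors VSTATE_HDR_VERSION */

struct state_hdr {
	uint64_t version;
	uint64_t size;
	uint64_t crc;
};

/* Stand-in checksum; the real code runs fio_crc32c() over the payload. */
static uint64_t toy_csum(const void *buf, size_t len)
{
	const uint8_t *p = buf;
	uint64_t sum = 0;

	while (len--)
		sum = sum * 31 + *p++;
	return sum;
}

/* Returns 0 if the header matches the payload, non-zero otherwise. */
static int state_hdr_valid(const struct state_hdr *hdr,
			   const void *payload, size_t payload_len)
{
	if (hdr->version != STATE_HDR_VERSION)
		return 1;
	if (hdr->size != payload_len)
		return 1;
	return toy_csum(payload, payload_len) != hdr->crc;
}

int main(void)
{
	unsigned char payload[16] = "example payload";
	struct state_hdr hdr = {
		.version = STATE_HDR_VERSION,
		.size = sizeof(payload),
	};

	hdr.crc = toy_csum(payload, sizeof(payload));
	return state_hdr_valid(&hdr, payload, sizeof(payload));	/* 0 = valid */
}
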
diff --git a/verify.h b/verify.h
index dba7743..43de887 100644
--- a/verify.h
+++ b/verify.h
@@ -76,8 +76,8 @@
  */
 extern void populate_verify_io_u(struct thread_data *, struct io_u *);
 extern int __must_check get_next_verify(struct thread_data *td, struct io_u *);
-extern int __must_check verify_io_u(struct thread_data *, struct io_u *);
-extern int verify_io_u_async(struct thread_data *, struct io_u *);
+extern int __must_check verify_io_u(struct thread_data *, struct io_u **);
+extern int verify_io_u_async(struct thread_data *, struct io_u **);
 extern void fill_verify_pattern(struct thread_data *td, void *p, unsigned int len, struct io_u *io_u, unsigned long seed, int use_seed);
 extern void fill_buffer_pattern(struct thread_data *td, void *p, unsigned int len);
 extern void fio_verify_init(struct thread_data *td);
@@ -88,4 +88,62 @@
 extern int verify_async_init(struct thread_data *);
 extern void verify_async_exit(struct thread_data *);
 
+struct thread_rand_state {
+	uint32_t s[4];
+};
+
+/*
+ * For dumping current write state
+ */
+struct thread_io_list {
+	uint64_t no_comps;
+	uint64_t depth;
+	uint64_t numberio;
+	uint64_t index;
+	struct thread_rand_state rand;
+	uint8_t name[64];
+	uint64_t offsets[0];
+};
+
+struct all_io_list {
+	uint64_t threads;
+	struct thread_io_list state[0];
+};
+
+#define VSTATE_HDR_VERSION	0x01
+
+struct verify_state_hdr {
+	uint64_t version;
+	uint64_t size;
+	uint64_t crc;
+};
+
+#define IO_LIST_ALL		0xffffffff
+extern struct all_io_list *get_all_io_list(int, size_t *);
+extern void __verify_save_state(struct all_io_list *, const char *);
+extern void verify_save_state(void);
+extern int verify_load_state(struct thread_data *, const char *);
+extern void verify_free_state(struct thread_data *);
+extern int verify_state_should_stop(struct thread_data *, struct io_u *);
+extern void verify_convert_assign_state(struct thread_data *, struct thread_io_list *);
+extern int verify_state_hdr(struct verify_state_hdr *, struct thread_io_list *);
+
+static inline size_t thread_io_list_sz(struct thread_io_list *s)
+{
+	return sizeof(*s) + le64_to_cpu(s->depth) * sizeof(uint64_t);
+}
+
+static inline struct thread_io_list *io_list_next(struct thread_io_list *s)
+{
+	return (void *) s + thread_io_list_sz(s);
+}
+
+static inline void verify_state_gen_name(char *out, size_t size,
+					 const char *name, const char *prefix,
+					 int num)
+{
+	snprintf(out, size, "%s-%s-%d-verify.state", prefix, name, num);
+	out[size - 1] = '\0';
+}
+
 #endif
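
As a usage note for the helpers above: verify_state_gen_name() formats names as <prefix>-<name>-<num>-verify.state, and verify_save_state() passes the "local" prefix. A tiny sketch using the same format string; the "write-test" job name is only an example:

#include <stdio.h>

int main(void)
{
	char out[64];

	/* Same format as verify_state_gen_name() above. */
	snprintf(out, sizeof(out), "%s-%s-%d-verify.state", "local", "write-test", 0);
	printf("%s\n", out);	/* local-write-test-0-verify.state */
	return 0;
}
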