Merge branch 'gfio' into gfio-int

Conflicts:
	backend.c
	fio.c
	fio.h
	init.c
	libfio.c
	options.c
	thread_options.h

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/README b/README
index 317ddec..8db962b 100644
--- a/README
+++ b/README
@@ -224,115 +224,9 @@
 just write a simple job file to describe the workload. The job file format
 is in the ini style format, as that is easy to read and write for the user.
 
-The job file parameters are:
-
-	name=x		Use 'x' as the identifier for this job.
-	description=x	'x' is a text description of the job.
-	directory=x	Use 'x' as the top level directory for storing files
-	filename=x	Force the use of 'x' as the filename for all files
-			in this thread. If not given, fio will make up
-			a suitable filename based on the thread and file
-			number.
-	rw=x		'x' may be: read, randread, write, randwrite,
-			rw (read-write mix), randrw (read-write random mix)
-	rwmixcycle=x	Base cycle for switching between read and write
-			in msecs.
-	rwmixread=x	'x' percentage of rw mix ios will be reads. If
-			rwmixwrite is also given, the last of the two will
-			 be used if they don't add up to 100%.
-	rwmixwrite=x	'x' percentage of rw mix ios will be writes. See
-			rwmixread.
-	rand_repeatable=x  The sequence of random io blocks can be repeatable
-			across runs, if 'x' is 1.
-	size=x		Set file size to x bytes (x string can include k/m/g)
-	ioengine=x	'x' may be: aio/libaio/linuxaio for Linux aio,
-			posixaio for POSIX aio, solarisaio for Solaris
-			native async IO, windowsaio for Windows native async IO,
-			sync for regular read/write io,
-			psync for regular pread/pwrite io, vsync for regular
-			readv/writev (with queuing emulation) mmap for mmap'ed
-			io, syslet-rw for syslet driven read/write, splice for
-			using splice/vmsplice, sg for direct SG_IO io, net
-			for network io, rdma for RDMA io, or cpuio for a
-			cycler burner load. sg only works on Linux on
-			SCSI (or SCSI-like devices, such as usb-storage or
-			sata/libata driven) devices. Fio also has a null
-			io engine, which is mainly used for testing
-			fio itself.
-
-	iodepth=x	For async io, allow 'x' ios in flight
-	overwrite=x	If 'x', layout a write file first.
-	nrfiles=x	Spread io load over 'x' number of files per job,
-			if possible.
-	prio=x		Run io at prio X, 0-7 is the kernel allowed range
-	prioclass=x	Run io at prio class X
-	bs=x		Use 'x' for thread blocksize. May include k/m postfix.
-	bsrange=x-y	Mix thread block sizes randomly between x and y. May
-			also include k/m postfix.
-	direct=x	1 for direct IO, 0 for buffered IO
-	thinktime=x	"Think" x usec after each io
-	rate=x		Throttle rate to x KB/sec
-	ratemin=x	Quit if rate of x KB/sec can't be met
-	ratecycle=x	ratemin averaged over x msecs
-	cpumask=x	Only allow job to run on CPUs defined by mask.
-	cpus_allowed=x	Like 'cpumask', but allow text setting of CPU affinity.
-	numa_cpu_nodes=x,y-z  Allow job to run on specified NUMA nodes' CPU.
-	numa_mem_policy=m:x,y-z  Setup numa memory allocation policy.
-			'm' stands for policy, such as local, interleave,
-			bind, prefer, local. 'x, y-z' are numa node(s) for
-			memory allocation according to policy.
-	fsync=x		If writing with buffered IO, fsync after every
-			'x' blocks have been written.
-	end_fsync=x	If 'x', run fsync() after end-of-job.
-	startdelay=x	Start this thread x seconds after startup
-	runtime=x	Terminate x seconds after startup. Can include a
-			normal time suffix if not given in seconds, such as
-			'm' for minutes, 'h' for hours, and 'd' for days.
-	offset=x	Start io at offset x (x string can include k/m/g)
-	invalidate=x	Invalidate page cache for file prior to doing io
-	sync=x		Use sync writes if x and writing buffered IO.
-	mem=x		If x == malloc, use malloc for buffers. If x == shm,
-			use shared memory for buffers. If x == mmap, use
-			anonymous mmap.
-	exitall		When one thread quits, terminate the others
-	bwavgtime=x	Average bandwidth stats over an x msec window.
-	create_serialize=x	If 'x', serialize file creation.
-	create_fsync=x	If 'x', run fsync() after file creation.
-	unlink		If set, unlink files when done.
-	loops=x		Run the job 'x' number of times.
-	verify=x	If 'x' == md5, use md5 for verifies. If 'x' == crc32,
-			use crc32 for verifies. md5 is 'safer', but crc32 is
-			a lot faster. Only makes sense for writing to a file.
-			For other types of checksumming, see HOWTO.
-	stonewall	Wait for preceeding jobs to end before running.
-	numjobs=x	Create 'x' similar entries for this job
-	thread		Use pthreads instead of forked jobs
-	zonesize=x
-	zoneskip=y	Zone options must be paired. If given, the job
-			will skip y bytes for every x read/written. This
-			can be used to gauge hard drive speed over the entire
-			platter, without reading everything. Both x/y can
-			include k/m/g suffix.
-	read_iolog=x	Open and read io pattern from file 'x'. The file format
-                        is described in the HOWTO.
-	write_iolog=x	Write an iolog to file 'x' in the same format as iolog.
-			The iolog options are exclusive, if both given the
-			read iolog will be performed.  Specify a separate file
-			for each job, otherwise the iologs will be interspersed
-			and the file may be corrupt.
-	write_bw_log	Write a bandwidth log.
-	write_lat_log	Write a latency log.
-	lockmem=x	Lock down x amount of memory on the machine, to
-			simulate a machine with less memory available. x can
-			include k/m/g suffix.
-	nice=x		Run job at given nice value.
-	exec_prerun=x	Run 'x' before job io is begun.
-	exec_postrun=x	Run 'x' after job io has finished.
-	ioscheduler=x	Use ioscheduler 'x' for this job.
-	cpuload=x	For a CPU io thread, percentage of CPU time to attempt
-			to burn.
-	cpuchunks=x	Split burn cycles into pieces of x usecs.
-
+The HOWTO and the man page each contain a full list of all options, along
+with their descriptions. The --cmdhelp option also lists all options; if
+given an option name as its argument, it details that particular option.
 
 
 Client/server
diff --git a/backend.c b/backend.c
index 022122a..119c4f9 100644
--- a/backend.c
+++ b/backend.c
@@ -1103,7 +1103,7 @@
 	} else
 		td->pid = gettid();
 
-	fio_local_clock_init(td->o.use_thread);
+	fio_local_clock_init(o->use_thread);
 
 	dprint(FD_PROCESS, "jobs pid=%d started\n", (int) td->pid);
 
@@ -1173,7 +1173,7 @@
 
 #ifdef CONFIG_LIBNUMA
 	/* numa node setup */
-	if (td->o.numa_cpumask_set || td->o.numa_memmask_set) {
+	if (o->numa_cpumask_set || o->numa_memmask_set) {
 		int ret;
 
 		if (numa_available() < 0) {
@@ -1181,8 +1181,8 @@
 			goto err;
 		}
 
-		if (td->o.numa_cpumask_set) {
-			ret = numa_run_on_node_mask(td->o.numa_cpunodesmask);
+		if (o->numa_cpumask_set) {
+			ret = numa_run_on_node_mask(o->numa_cpunodesmask);
 			if (ret == -1) {
 				td_verror(td, errno, \
 					"numa_run_on_node_mask failed\n");
@@ -1190,20 +1190,20 @@
 			}
 		}
 
-		if (td->o.numa_memmask_set) {
+		if (o->numa_memmask_set) {
 
-			switch (td->o.numa_mem_mode) {
+			switch (o->numa_mem_mode) {
 			case MPOL_INTERLEAVE:
-				numa_set_interleave_mask(td->o.numa_memnodesmask);
+				numa_set_interleave_mask(o->numa_memnodesmask);
 				break;
 			case MPOL_BIND:
-				numa_set_membind(td->o.numa_memnodesmask);
+				numa_set_membind(o->numa_memnodesmask);
 				break;
 			case MPOL_LOCAL:
 				numa_set_localalloc();
 				break;
 			case MPOL_PREFERRED:
-				numa_set_preferred(td->o.numa_mem_prefer_node);
+				numa_set_preferred(o->numa_mem_prefer_node);
 				break;
 			case MPOL_DEFAULT:
 			default:
@@ -1214,6 +1214,9 @@
 	}
 #endif
 
+	if (fio_pin_memory(td))
+		goto err;
+
 	/*
 	 * May alter parameters that init_io_u() will use, so we need to
 	 * do this first.
@@ -1235,7 +1238,7 @@
 		}
 	}
 
-	if (td->o.cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
+	if (o->cgroup && cgroup_setup(td, cgroup_list, &cgroup_mnt))
 		goto err;
 
 	errno = 0;
@@ -1277,8 +1280,8 @@
 		memcpy(&td->iops_sample_time, &td->start, sizeof(td->start));
 		memcpy(&td->tv_cache, &td->start, sizeof(td->start));
 
-		if (td->o.ratemin[DDIR_READ] || td->o.ratemin[DDIR_WRITE] ||
-				td->o.ratemin[DDIR_TRIM]) {
+		if (o->ratemin[DDIR_READ] || o->ratemin[DDIR_WRITE] ||
+				o->ratemin[DDIR_TRIM]) {
 		        memcpy(&td->lastrate[DDIR_READ], &td->bw_sample_time,
 						sizeof(td->bw_sample_time));
 		        memcpy(&td->lastrate[DDIR_WRITE], &td->bw_sample_time,
@@ -1312,8 +1315,8 @@
 		if (td->error || td->terminate)
 			break;
 
-		if (!td->o.do_verify ||
-		    td->o.verify == VERIFY_NONE ||
+		if (!o->do_verify ||
+		    o->verify == VERIFY_NONE ||
 		    (td->io_ops->flags & FIO_UNIDIR))
 			continue;
 
@@ -1342,44 +1345,44 @@
 
 	fio_mutex_down(writeout_mutex);
 	if (td->bw_log) {
-		if (td->o.bw_log_file) {
+		if (o->bw_log_file) {
 			finish_log_named(td, td->bw_log,
-						td->o.bw_log_file, "bw");
+						o->bw_log_file, "bw");
 		} else
 			finish_log(td, td->bw_log, "bw");
 	}
 	if (td->lat_log) {
-		if (td->o.lat_log_file) {
+		if (o->lat_log_file) {
 			finish_log_named(td, td->lat_log,
-						td->o.lat_log_file, "lat");
+						o->lat_log_file, "lat");
 		} else
 			finish_log(td, td->lat_log, "lat");
 	}
 	if (td->slat_log) {
-		if (td->o.lat_log_file) {
+		if (o->lat_log_file) {
 			finish_log_named(td, td->slat_log,
-						td->o.lat_log_file, "slat");
+						o->lat_log_file, "slat");
 		} else
 			finish_log(td, td->slat_log, "slat");
 	}
 	if (td->clat_log) {
-		if (td->o.lat_log_file) {
+		if (o->lat_log_file) {
 			finish_log_named(td, td->clat_log,
-						td->o.lat_log_file, "clat");
+						o->lat_log_file, "clat");
 		} else
 			finish_log(td, td->clat_log, "clat");
 	}
 	if (td->iops_log) {
-		if (td->o.iops_log_file) {
+		if (o->iops_log_file) {
 			finish_log_named(td, td->iops_log,
-						td->o.iops_log_file, "iops");
+						o->iops_log_file, "iops");
 		} else
 			finish_log(td, td->iops_log, "iops");
 	}
 
 	fio_mutex_up(writeout_mutex);
-	if (td->o.exec_postrun)
-		exec_string(td->o.exec_postrun);
+	if (o->exec_postrun)
+		exec_string(o->exec_postrun);
 
 	if (exitall_on_terminate)
 		fio_terminate_threads(td->groupid);
@@ -1389,7 +1392,7 @@
 		log_info("fio: pid=%d, err=%d/%s\n", (int) td->pid, td->error,
 							td->verror);
 
-	if (td->o.verify_async)
+	if (o->verify_async)
 		verify_async_exit(td);
 
 	close_and_free_files(td);
@@ -1406,7 +1409,7 @@
 	/*
 	 * do this very late, it will log file closing as well
 	 */
-	if (td->o.write_iolog_file)
+	if (o->write_iolog_file)
 		write_iolog_close(td);
 
 	fio_mutex_remove(td->rusage_sem);
diff --git a/examples/cpuio b/examples/cpuio.fio
similarity index 100%
rename from examples/cpuio
rename to examples/cpuio.fio
diff --git a/examples/e4defrag b/examples/e4defrag.fio
similarity index 100%
rename from examples/e4defrag
rename to examples/e4defrag.fio
diff --git a/examples/e4defrag2 b/examples/e4defrag2.fio
similarity index 100%
rename from examples/e4defrag2
rename to examples/e4defrag2.fio
diff --git a/examples/enospc-pressure b/examples/enospc-pressure.fio
similarity index 100%
rename from examples/enospc-pressure
rename to examples/enospc-pressure.fio
diff --git a/examples/falloc b/examples/falloc.fio
similarity index 100%
rename from examples/falloc
rename to examples/falloc.fio
diff --git a/examples/fusion-aw-sync.ini b/examples/fusion-aw-sync.fio
similarity index 100%
rename from examples/fusion-aw-sync.ini
rename to examples/fusion-aw-sync.fio
diff --git a/examples/numa b/examples/numa.fio
similarity index 100%
rename from examples/numa
rename to examples/numa.fio
diff --git a/examples/zipf b/examples/zipf.fio
similarity index 100%
rename from examples/zipf
rename to examples/zipf.fio
diff --git a/fio.h b/fio.h
index c8c8b7a..7d478a3 100644
--- a/fio.h
+++ b/fio.h
@@ -20,7 +20,6 @@
 #include "thread_options.h"
 #include "flist.h"
 #include "fifo.h"
-#include "lib/rbtree.h"
 #include "arch/arch.h"
 #include "os/os.h"
 #include "mutex.h"
@@ -37,6 +36,7 @@
 #include "gettime.h"
 #include "lib/getopt.h"
 #include "lib/rand.h"
+#include "lib/rbtree.h"
 #include "client.h"
 #include "server.h"
 #include "stat.h"
@@ -445,8 +445,8 @@
 extern char *num2str(unsigned long, int, int, int, int);
 extern int ioengine_load(struct thread_data *);
 
-extern unsigned long page_mask;
-extern unsigned long page_size;
+extern uintptr_t page_mask;
+extern uintptr_t page_size;
 extern int initialize_fio(char *envp[]);
 
 #define FIO_GETOPT_JOB		0x89000000
diff --git a/libfio.c b/libfio.c
index 867d86e..c26d6a3 100644
--- a/libfio.c
+++ b/libfio.c
@@ -41,8 +41,8 @@
 
 unsigned long arch_flags = 0;
 
-uintptr_t page_mask;
-uintptr_t page_size;
+uintptr_t page_mask = 0;
+uintptr_t page_size = 0;
 
 static const char *fio_os_strings[os_nr] = {
 	"Invalid",
@@ -241,6 +241,10 @@
 		return 1;
 	}
 
+#if !defined(CONFIG_GETTIMEOFDAY) && !defined(CONFIG_CLOCK_GETTIME)
+#error "No available clock source!"
+#endif
+
 	arch_init(envp);
 
 	sinit();
diff --git a/thread_options.h b/thread_options.h
index 68056df..60a1b69 100644
--- a/thread_options.h
+++ b/thread_options.h
@@ -5,6 +5,7 @@
 #include "os/os.h"
 #include "stat.h"
 #include "gettime.h"
+#include "lib/ieee754.h"
 
 /*
  * What type of allocation to use for io buffers