Add support for blkio cgroups on Linux

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/HOWTO b/HOWTO
index 9b3a684..7a7d14e 100644
--- a/HOWTO
+++ b/HOWTO
@@ -994,6 +994,7 @@
 		for doing these time calls will be excluded from other
 		uses. Fio will manually clear it from the CPU mask of other
 		jobs.
+
 continue_on_error=bool	Normally fio will exit the job on the first observed
 		failure. If this option is set, fio will continue the job when
 		there is a 'non-fatal error' (EIO or EILSEQ) until the runtime
@@ -1003,6 +1004,21 @@
 		given in the stats is the first error that was hit during the
 		run.
 
+cgroup_root=str	Root of the mounted blkio cgroup file systems. This is a Linux
+		specific IO controller. If your system doesn't have it mounted,
+		you can do so with:
+
+		# mount -t cgroup -o blkio none /cgroup
+
+		The cgroup_root defaults to /cgroup. If it is mounted
+		elsewhere, please specify this option.
+
+cgroup=str	Add job to this control group. If it doesn't exist, it will
+		be created.
+
+cgroup_weight=int	Set the weight of the cgroup to this value. A job is
+		only put in a cgroup if this is set. Allowed values are in
+		the range of 100..1000, see the kernel documentation.
 
 6.0 Interpreting the output
 ---------------------------
diff --git a/Makefile b/Makefile
index 4f95a5d..ce63cfc 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,8 @@
 SCRIPTS = fio_generate_plots
 OBJS = gettime.o fio.o ioengines.o init.o stat.o log.o time.o filesetup.o \
 	eta.o verify.o memory.o io_u.o parse.o mutex.o options.o \
-	rbtree.o diskutil.o fifo.o blktrace.o smalloc.o filehash.o helpers.o
+	rbtree.o diskutil.o fifo.o blktrace.o smalloc.o filehash.o helpers.o \
+	cgroup.o
 
 OBJS += crc/crc7.o
 OBJS += crc/crc16.o
diff --git a/cgroup.c b/cgroup.c
new file mode 100644
index 0000000..15641e6
--- /dev/null
+++ b/cgroup.c
@@ -0,0 +1,184 @@
+/*
+ * Code related to setting up a blkio cgroup
+ */
+#include <stdio.h>
+#include <stdlib.h>
+#include <errno.h>
+#include <unistd.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include "fio.h"
+#include "cgroup.h"
+
+/*
+ * Return a malloc'ed "<dir>/<name>" string sized to fit, or NULL if the
+ * path could not be formatted or allocated. The caller frees the result.
+ */
+static char *cgroup_path(const char *dir, const char *name)
+{
+	int len = snprintf(NULL, 0, "%s/%s", dir, name);
+	char *path;
+
+	if (len < 0)
+		return NULL;
+
+	path = malloc(len + 1);
+	if (path)
+		snprintf(path, len + 1, "%s/%s", dir, name);
+
+	return path;
+}
+
+/*
+ * Path of the cgroup directory this job belongs to: the cgroup= option
+ * below cgroup_root, or the job name if no cgroup was given. The caller
+ * frees the result. On allocation failure the error is recorded in td
+ * and NULL is returned.
+ */
+static char *get_cgroup_root(struct thread_data *td)
+{
+	const char *group = td->o.cgroup ? td->o.cgroup : td->o.name;
+	char *root = cgroup_path(td->o.cgroup_root, group);
+
+	if (!root)
+		td_verror(td, ENOMEM, "cgroup malloc");
+
+	return root;
+}
+
+/*
+ * Write 'val' in decimal to the control file <dir>/<file>. Returns 0 on
+ * success. Returns 1 with the error recorded in td on failure, tagged
+ * with 'open_err' if the file could not be opened and 'write_err' if the
+ * write itself failed.
+ */
+static int cgroup_write_int(struct thread_data *td, const char *dir,
+			    const char *file, long val,
+			    const char *open_err, const char *write_err)
+{
+	int saved_errno = 0;
+	char *path;
+	FILE *f;
+
+	path = cgroup_path(dir, file);
+	if (!path) {
+		td_verror(td, ENOMEM, open_err);
+		return 1;
+	}
+
+	f = fopen(path, "w");
+	if (!f)
+		saved_errno = errno;
+
+	free(path);
+	if (!f) {
+		td_verror(td, saved_errno, open_err);
+		return 1;
+	}
+
+	if (fprintf(f, "%ld", val) < 0)
+		saved_errno = errno;
+
+	/*
+	 * stdio buffers the data, so a write the kernel rejects may only
+	 * be reported when the stream is flushed on close
+	 */
+	if (fclose(f) == EOF && !saved_errno)
+		saved_errno = errno;
+
+	if (saved_errno) {
+		td_verror(td, saved_errno, write_err);
+		return 1;
+	}
+
+	return 0;
+}
+
+/*
+ * Add pid to given class
+ */
+static int cgroup_add_pid(struct thread_data *td)
+{
+	char *root;
+	int ret;
+
+	root = get_cgroup_root(td);
+	if (!root)
+		return 1;
+
+	ret = cgroup_write_int(td, root, "tasks", td->pid,
+				"cgroup open tasks", "cgroup write tasks");
+	free(root);
+	return ret;
+}
+
+/*
+ * Move pid to root class
+ */
+static int cgroup_del_pid(struct thread_data *td)
+{
+	return cgroup_write_int(td, td->o.cgroup_root, "tasks", td->pid,
+				"cgroup open tasks", "cgroup write tasks");
+}
+
+/*
+ * Create the job's cgroup if it doesn't exist, set its blkio weight and
+ * move the job into it. Returns 0 on success, 1 on failure with the
+ * error recorded in td.
+ */
+int cgroup_setup(struct thread_data *td)
+{
+	char *root;
+	int ret = 1;
+
+	root = get_cgroup_root(td);
+	if (!root)
+		return 1;
+
+	/*
+	 * Create container, if it doesn't exist
+	 */
+	if (mkdir(root, 0755) < 0) {
+		int saved_errno = errno;
+
+		if (saved_errno != EEXIST) {
+			td_verror(td, saved_errno, "cgroup mkdir");
+			goto out;
+		}
+	} else {
+		/* remember that we own it, so shutdown can remove it */
+		td->o.cgroup_was_created = 1;
+	}
+
+	if (cgroup_write_int(td, root, "blkio.weight", td->o.cgroup_weight,
+				"cgroup open weight", "cgroup write weight"))
+		goto out;
+
+	ret = cgroup_add_pid(td);
+out:
+	free(root);
+	return ret;
+}
+
+/*
+ * Undo cgroup_setup(): move the job back to the root cgroup and remove
+ * the cgroup directory if this job created it. Does nothing if no
+ * cgroup_weight was set.
+ */
+void cgroup_shutdown(struct thread_data *td)
+{
+	if (!td->o.cgroup_weight)
+		return;
+
+	cgroup_del_pid(td);
+
+	if (td->o.cgroup_was_created) {
+		char *root;
+
+		root = get_cgroup_root(td);
+		if (root) {
+			rmdir(root);
+			free(root);
+		}
+	}
+}
diff --git a/cgroup.h b/cgroup.h
new file mode 100644
index 0000000..65fa3ad
--- /dev/null
+++ b/cgroup.h
@@ -0,0 +1,34 @@
+#ifndef FIO_CGROUP_H
+#define FIO_CGROUP_H
+
+/*
+ * blkio cgroup hooks. Only the struct tag is needed for the prototypes,
+ * so declare it here instead of depending on the include order.
+ */
+struct thread_data;
+
+#ifdef FIO_HAVE_CGROUPS
+
+/* Put the job in its cgroup; returns 0 on success, 1 on error */
+int cgroup_setup(struct thread_data *td);
+/* Take the job out of its cgroup again */
+void cgroup_shutdown(struct thread_data *td);
+
+#else
+
+/*
+ * FIO_HAVE_CGROUPS is not defined: setup always fails with EINVAL and
+ * shutdown has nothing to undo.
+ */
+static inline int cgroup_setup(struct thread_data *td)
+{
+	td_verror(td, EINVAL, "cgroup_setup");
+	return 1;
+}
+
+static inline void cgroup_shutdown(struct thread_data *td)
+{
+}
+
+#endif
+#endif
diff --git a/fio.1 b/fio.1
index 4445d0a..648b4e9 100644
--- a/fio.1
+++ b/fio.1
@@ -725,13 +725,31 @@
 these time calls will be excluded from other uses. Fio will manually clear it
 from the CPU mask of other jobs.
 .TP
 .BI continue_on_error \fR=\fPbool
 Normally fio will exit the job on the first observed failure. If this option is
 set, fio will continue the job when there is a 'non-fatal error'
 (\fBEIO\fR or \fBEILSEQ\fR) until the runtime is exceeded or the I/O size
 specified is completed. If this option is used, there are two more stats that
 are appended, the total error count and the first error. The error field given
 in the stats is the first error that was hit during the run.
+.TP
+.BI cgroup_root \fR=\fPstr
+Root of the mounted blkio cgroup file systems. This is a Linux
+specific IO controller. If your system doesn't have it mounted,
+you can do so with:
+
+# mount -t cgroup -o blkio none /cgroup
+
+The cgroup_root defaults to /cgroup. If it is mounted elsewhere, please specify
+this option.
+.TP
+.BI cgroup \fR=\fPstr
+Add job to this control group. If it doesn't exist, it will be created.
+.TP
+.BI cgroup_weight \fR=\fPint
+Set the weight of the cgroup to this value. A job is only put in a cgroup if
+this is set. Allowed values are in the range of 100..1000, see the
+documentation that comes with the kernel.
 .SH OUTPUT
 While running, \fBfio\fR will display the status of the created jobs.  For
 example:
diff --git a/fio.c b/fio.c
index 434b503..4bbab5a 100644
--- a/fio.c
+++ b/fio.c
@@ -39,6 +39,7 @@
 #include "smalloc.h"
 #include "verify.h"
 #include "diskutil.h"
+#include "cgroup.h"
 
 unsigned long page_mask;
 unsigned long page_size;
@@ -1075,6 +1076,9 @@
 		}
 	}
 
+	if (td->o.cgroup_weight && cgroup_setup(td))
+		goto err;
+
 	if (nice(td->o.nice) == -1) {
 		td_verror(td, errno, "nice");
 		goto err;
@@ -1204,6 +1208,7 @@
 	close_and_free_files(td);
 	close_ioengine(td);
 	cleanup_io_u(td);
+	cgroup_shutdown(td);
 
 	if (td->o.cpumask_set) {
 		int ret = fio_cpuset_exit(&td->o.cpumask);
diff --git a/fio.h b/fio.h
index 214fbd2..aa5124c 100644
--- a/fio.h
+++ b/fio.h
@@ -271,6 +271,14 @@
 	 * Benchmark profile type
 	 */
 	unsigned int profile;
+
+	/*
+	 * blkio cgroup support
+	 */
+	char *cgroup_root;
+	char *cgroup;
+	unsigned int cgroup_weight;
+	unsigned int cgroup_was_created;
 };
 
 #define FIO_VERROR_SIZE	128
diff --git a/options.c b/options.c
index ff27765..cb6337c 100644
--- a/options.c
+++ b/options.c
@@ -1727,6 +1727,28 @@
 		.help	= "Select a specific builtin performance test",
 	},
 	{
+		.name	= "cgroup_root",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= td_var_offset(cgroup_root),
+		.help	= "Root of mounted blkio cgroup",
+		.def	= "/cgroup",
+	},
+	{
+		.name	= "cgroup",
+		.type	= FIO_OPT_STR_STORE,
+		.off1	= td_var_offset(cgroup),
+		.help	= "Add job to cgroup of this name",
+	},
+	{
+		.name	= "cgroup_weight",
+		.type	= FIO_OPT_INT,
+		.off1	= td_var_offset(cgroup_weight),
+		.help	= "Use given weight for cgroup",
+		.minval = 100,
+		.maxval	= 1000,
+		.def	= "0",
+	},
+	{
 		.name = NULL,
 	},
 };
diff --git a/os/os-linux.h b/os/os-linux.h
index e4c4c3f..ac42264 100644
--- a/os/os-linux.h
+++ b/os/os-linux.h
@@ -31,6 +31,7 @@
 #define FIO_HAVE_POSIXAIO_FSYNC
 #define FIO_HAVE_PSHARED_MUTEX
 #define FIO_HAVE_CL_SIZE
+#define FIO_HAVE_CGROUPS
 
 #define OS_MAP_ANON		MAP_ANONYMOUS