Revamp file locking

Get rid of the semaphore implementation; there is no need to carry both
it and the mutex code. Add different locking modes (exclusive and
readwrite) to enable a wider range of testing. Also combine lockfile and
lockfile_batch; the latter is now a postfix option to the former.

So to enable readers-excluding-writers locking mode with a lock batch
count of 4, you would write:

lockfile=readwrite:4

instead of setting lockfile and lockfile_batch as two separate options.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/HOWTO b/HOWTO
index 3e0a31b..3d92293 100644
--- a/HOWTO
+++ b/HOWTO
@@ -219,13 +219,24 @@
 opendir=str	Tell fio to recursively add any file it can find in this
 		directory and down the file system tree.
 
-lockfile=bool	If set, fio will lock a file internally before doing IO to it.
-		This makes it safe to share file descriptors across fio
-		jobs that run at the same time.
+lockfile=str	Fio defaults to not locking any files before it does
+		IO to them. If a file or file descriptor is shared, fio
+		can serialize IO to that file to make the end result
+		consistent. This is useful for emulating real workloads that
+		share files. The lock modes are:
 
-lockfile_batch=int	Acquiring a semaphore can be quite expensive, so
-		allow a process to complete this number of IOs before releasing
-		the semaphore again. Defaults to 1.
+			none		No locking. The default.
+			exclusive	Only one thread/process may do IO,
+					excluding all others.
+			readwrite	Read-write locking on the file. Many
+					readers may access the file at the
+					same time, but writes get exclusive
+					access.
+
+		The option may be post-fixed with a lock batch number. If
+		set, then each thread/process may do that number of IOs to
+		the file before giving up the lock. Since lock acquisition is
+		expensive, batching the lock/unlocks will speed up IO.
 
 readwrite=str
 rw=str		Type of io pattern. Accepted values are:
diff --git a/Makefile b/Makefile
index 1b77b25..e0ab0b8 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
 PROGS	= fio
 SCRIPTS = fio_generate_plots
 OBJS = gettime.o fio.o ioengines.o init.o stat.o log.o time.o filesetup.o \
-	eta.o verify.o memory.o io_u.o parse.o mutex.o sem.o options.o \
+	eta.o verify.o memory.o io_u.o parse.o mutex.o options.o \
 	rbtree.o diskutil.o fifo.o blktrace.o smalloc.o filehash.o
 
 OBJS += crc/crc7.o
diff --git a/filesetup.c b/filesetup.c
index 4e2a36c..4d2017d 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -226,9 +226,33 @@
 	return ret;
 }
 
-int generic_open_file(struct thread_data *td, struct fio_file *f)
+static int file_lookup_open(struct fio_file *f, int flags)
 {
 	struct fio_file *__f;
+	int from_hash;
+
+	__f = lookup_file_hash(f->file_name);
+	if (__f) {
+		/*
+		 * racy, need the __f->lock locked
+		 */
+		f->lock = __f->lock;
+		f->lock_owner = __f->lock_owner;
+		f->lock_batch = __f->lock_batch;
+		f->lock_ddir = __f->lock_ddir;
+		f->fd = dup(__f->fd);
+		f->references++;
+		from_hash = 1;
+	} else {
+		f->fd = open(f->file_name, flags, 0600);
+		from_hash = 0;
+	}
+
+	return from_hash;
+}
+
+int generic_open_file(struct thread_data *td, struct fio_file *f)
+{
 	int is_std = 0;
 	int flags = 0;
 	int from_hash = 0;
@@ -267,16 +291,8 @@
 
 		if (is_std)
 			f->fd = dup(STDOUT_FILENO);
-		else {
-			__f = lookup_file_hash(f->file_name);
-			if (__f) {
-				f->sem = __f->sem;
-				f->fd = dup(__f->fd);
-				f->references++;
-				from_hash = 1;
-			} else
-				f->fd = open(f->file_name, flags, 0600);
-		}
+		else
+			from_hash = file_lookup_open(f, flags);
 	} else {
 		if (f->filetype == FIO_TYPE_CHAR && !read_only)
 			flags |= O_RDWR;
@@ -285,16 +301,8 @@
 
 		if (is_std)
 			f->fd = dup(STDIN_FILENO);
-		else {
-			__f = lookup_file_hash(f->file_name);
-			if (__f) {
-				f->sem = __f->sem;
-				f->fd = dup(__f->fd);
-				f->references++;
-				from_hash = 1;
-			} else
-				f->fd = open(f->file_name, flags);
-		}
+		else
+			from_hash = file_lookup_open(f, flags);
 	}
 
 	if (f->fd == -1) {
@@ -641,8 +649,19 @@
 
 	get_file_type(f);
 
-	if (td->o.lockfile)
-		f->sem = fio_sem_init(1);
+	switch (td->o.file_lock_mode) {
+	case FILE_LOCK_NONE:
+		break;
+	case FILE_LOCK_READWRITE:
+		f->lock = fio_mutex_rw_init();
+		break;
+	case FILE_LOCK_EXCLUSIVE:
+		f->lock = fio_mutex_init(1);
+		break;
+	default:
+		log_err("fio: unknown lock mode: %d\n", td->o.file_lock_mode);
+		assert(0);
+	}
 
 	td->files_index++;
 	if (f->filetype == FIO_TYPE_FILE)
@@ -682,31 +701,62 @@
 	return ret;
 }
 
-void lock_file(struct thread_data *td, struct fio_file *f)
+void lock_file(struct thread_data *td, struct fio_file *f, enum fio_ddir ddir)
 {
-	if (f && f->sem) {
-		if (f->sem_owner == td && f->sem_batch--)
-			return;
+	if (!f->lock || td->o.file_lock_mode == FILE_LOCK_NONE)
+		return;
 
-		fio_sem_down(f->sem);
-		f->sem_owner = td;
-		f->sem_batch = td->o.lockfile_batch;
+	if (f->lock_owner == td && f->lock_batch--)
+		return;
+
+	if (td->o.file_lock_mode == FILE_LOCK_READWRITE) {
+		if (ddir == DDIR_READ)
+			fio_mutex_down_read(f->lock);
+		else
+			fio_mutex_down_write(f->lock);
+	} else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE)
+		fio_mutex_down(f->lock);
+
+	f->lock_owner = td;
+	f->lock_batch = td->o.lockfile_batch;
+	f->lock_ddir = ddir;
+}
+
+void unlock_file(struct thread_data *td, struct fio_file *f)
+{
+	if (!f->lock || td->o.file_lock_mode == FILE_LOCK_NONE)
+		return;
+	if (f->lock_batch)
+		return;
+
+	if (td->o.file_lock_mode == FILE_LOCK_READWRITE) {
+		const int is_read = f->lock_ddir == DDIR_READ;
+		int val = fio_mutex_getval(f->lock);
+
+		if ((is_read && val == 1) || (!is_read && val == -1))
+			f->lock_owner = NULL;
+
+		if (is_read)
+			fio_mutex_up_read(f->lock);
+		else
+			fio_mutex_up_write(f->lock);
+	} else if (td->o.file_lock_mode == FILE_LOCK_EXCLUSIVE) {
+		int val = fio_mutex_getval(f->lock);
+
+		if (val == 0)
+			f->lock_owner = NULL;
+
+		fio_mutex_up(f->lock);
 	}
 }
 
-void unlock_file(struct fio_file *f)
+void unlock_file_all(struct thread_data *td, struct fio_file *f)
 {
-	if (f && f->sem) {
-		int sem_val;
+	if (f->lock_owner != td)
+		return;
 
-		if (f->sem_batch)
-			return;
-
-		sem_getvalue(&f->sem->sem, &sem_val);
-		if (!sem_val)
-			f->sem_owner = NULL;
-		fio_sem_up(f->sem);
-	}
+	f->lock_batch = 0;
+	unlock_file(td, f);
 }
 
 static int recurse_dir(struct thread_data *td, const char *dirname)
diff --git a/fio.h b/fio.h
index ffd3d7d..b22009b 100644
--- a/fio.h
+++ b/fio.h
@@ -22,7 +22,6 @@
 #include "arch/arch.h"
 #include "os/os.h"
 #include "mutex.h"
-#include "sem.h"
 #include "log.h"
 #include "debug.h"
 
@@ -47,6 +46,12 @@
 	TD_DDIR_RANDRW		= TD_DDIR_RW | TD_DDIR_RAND,
 };
 
+enum file_lock_mode {
+	FILE_LOCK_NONE,
+	FILE_LOCK_EXCLUSIVE,
+	FILE_LOCK_READWRITE,
+};
+
 /*
  * Use for maintaining statistics
  */
@@ -313,9 +318,10 @@
 	/*
 	 * if io is protected by a semaphore, this is set
 	 */
-	struct fio_sem *sem;
-	void *sem_owner;
-	unsigned int sem_batch;
+	struct fio_mutex *lock;
+	void *lock_owner;
+	unsigned int lock_batch;
+	enum fio_ddir lock_ddir;
 
 	/*
 	 * block map for random io
@@ -415,7 +421,7 @@
 
 	unsigned int nr_files;
 	unsigned int open_files;
-	unsigned int lockfile;
+	enum file_lock_mode file_lock_mode;
 	unsigned int lockfile_batch;
 
 	unsigned int odirect;
@@ -820,8 +826,9 @@
 extern int add_file(struct thread_data *, const char *);
 extern void get_file(struct fio_file *);
 extern int __must_check put_file(struct thread_data *, struct fio_file *);
-extern void lock_file(struct thread_data *, struct fio_file *);
-extern void unlock_file(struct fio_file *);
+extern void lock_file(struct thread_data *, struct fio_file *, enum fio_ddir);
+extern void unlock_file(struct thread_data *, struct fio_file *);
+extern void unlock_file_all(struct thread_data *, struct fio_file *);
 extern int add_dir_files(struct thread_data *, const char *);
 extern int init_random_map(struct thread_data *);
 extern void dup_files(struct thread_data *, struct thread_data *);
diff --git a/io_u.c b/io_u.c
index 04d7dcb..5a3157a 100644
--- a/io_u.c
+++ b/io_u.c
@@ -665,7 +665,6 @@
 		 * td_io_close() does a put_file() as well, so no need to
 		 * do that here.
 		 */
-		unlock_file(io_u->file);
 		io_u->file = NULL;
 		td_io_close_file(td, f);
 		f->flags |= FIO_FILE_DONE;
diff --git a/ioengines.c b/ioengines.c
index bd2eb4a..87db11c 100644
--- a/ioengines.c
+++ b/ioengines.c
@@ -168,14 +168,14 @@
 	dprint_io_u(io_u, "prep");
 	fio_ro_check(td, io_u);
 
-	lock_file(td, io_u->file);
+	lock_file(td, io_u->file, io_u->ddir);
 
 	if (td->io_ops->prep) {
 		int ret = td->io_ops->prep(td, io_u);
 
 		dprint(FD_IO, "->prep(%p)=%d\n", io_u, ret);
 		if (ret)
-			unlock_file(io_u->file);
+			unlock_file(td, io_u->file);
 		return ret;
 	}
 
@@ -232,7 +232,7 @@
 
 	ret = td->io_ops->queue(td, io_u);
 
-	unlock_file(io_u->file);
+	unlock_file(td, io_u->file);
 
 	if (ret != FIO_Q_BUSY)
 		io_u_mark_depth(td, io_u);
@@ -359,10 +359,7 @@
 	 */
 	f->flags |= FIO_FILE_CLOSING;
 
-	if (f->sem_owner == td && f->sem_batch) {
-		f->sem_batch = 0;
-		unlock_file(f);
-	}
+	unlock_file_all(td, f);
 
 	return put_file(td, f);
 }
diff --git a/mutex.c b/mutex.c
index bcc37ae..e6fb3f0 100644
--- a/mutex.c
+++ b/mutex.c
@@ -7,6 +7,7 @@
 #include <sys/mman.h>
 
 #include "mutex.h"
+#include "arch/arch.h"
 
 void fio_mutex_remove(struct fio_mutex *mutex)
 {
@@ -76,8 +77,13 @@
 void fio_mutex_down(struct fio_mutex *mutex)
 {
 	pthread_mutex_lock(&mutex->lock);
-	while (mutex->value == 0)
+
+	while (!mutex->value) {
+		mutex->waiters++;
 		pthread_cond_wait(&mutex->cond, &mutex->lock);
+		mutex->waiters--;
+	}
+
 	mutex->value--;
 	pthread_mutex_unlock(&mutex->lock);
 }
@@ -85,7 +91,8 @@
 void fio_mutex_up(struct fio_mutex *mutex)
 {
 	pthread_mutex_lock(&mutex->lock);
-	if (!mutex->value)
+	read_barrier();
+	if (!mutex->value && mutex->waiters)
 		pthread_cond_signal(&mutex->cond);
 	mutex->value++;
 	pthread_mutex_unlock(&mutex->lock);
@@ -94,8 +101,13 @@
 void fio_mutex_down_write(struct fio_mutex *mutex)
 {
 	pthread_mutex_lock(&mutex->lock);
-	while (mutex->value != 0)
+
+	while (mutex->value != 0) {
+		mutex->waiters++;
 		pthread_cond_wait(&mutex->cond, &mutex->lock);
+		mutex->waiters--;
+	}
+
 	mutex->value--;
 	pthread_mutex_unlock(&mutex->lock);
 }
@@ -103,8 +115,13 @@
 void fio_mutex_down_read(struct fio_mutex *mutex)
 {
 	pthread_mutex_lock(&mutex->lock);
-	while (mutex->value < 0)
+
+	while (mutex->value < 0) {
+		mutex->waiters++;
 		pthread_cond_wait(&mutex->cond, &mutex->lock);
+		mutex->waiters--;
+	}
+
 	mutex->value++;
 	pthread_mutex_unlock(&mutex->lock);
 }
@@ -113,7 +130,8 @@
 {
 	pthread_mutex_lock(&mutex->lock);
 	mutex->value--;
-	if (mutex->value >= 0)
+	read_barrier();
+	if (mutex->value >= 0 && mutex->waiters)
 		pthread_cond_signal(&mutex->cond);
 	pthread_mutex_unlock(&mutex->lock);
 }
@@ -122,7 +140,8 @@
 {
 	pthread_mutex_lock(&mutex->lock);
 	mutex->value++;
-	if (mutex->value >= 0)
+	read_barrier();
+	if (mutex->value >= 0 && mutex->waiters)
 		pthread_cond_signal(&mutex->cond);
 	pthread_mutex_unlock(&mutex->lock);
 }
diff --git a/mutex.h b/mutex.h
index 40cfc1e..7be0ab1 100644
--- a/mutex.h
+++ b/mutex.h
@@ -7,6 +7,7 @@
 	pthread_mutex_t lock;
 	pthread_cond_t cond;
 	int value;
+	int waiters;
 
 	int mutex_fd;
 };
@@ -25,4 +26,9 @@
 	return fio_mutex_init(0);
 }
 
+static inline int fio_mutex_getval(struct fio_mutex *mutex)
+{
+	return mutex->value;
+}
+
 #endif
diff --git a/options.c b/options.c
index 27b4fb6..bb77683 100644
--- a/options.c
+++ b/options.c
@@ -351,6 +351,18 @@
 	return 0;
 }
 
+static int str_lockfile_cb(void *data, const char *str)
+{
+	struct thread_data *td = data;
+	char *nr = get_opt_postfix(str);
+
+	td->o.lockfile_batch = 1;
+	if (nr)
+		td->o.lockfile_batch = atoi(nr);
+
+	return 0;
+}
+
 #define __stringify_1(x)	#x
 #define __stringify(x)		__stringify_1(x)
 
@@ -386,19 +398,27 @@
 	},
 	{
 		.name	= "lockfile",
-		.type	= FIO_OPT_BOOL,
-		.off1	= td_var_offset(lockfile),
+		.type	= FIO_OPT_STR,
+		.cb	= str_lockfile_cb,
+		.off1	= td_var_offset(file_lock_mode),
 		.help	= "Lock file when doing IO to it",
 		.parent	= "filename",
-		.def	= "0",
-	},
-	{
-		.name	= "lockfile_batch",
-		.type	= FIO_OPT_INT,
-		.off1	= td_var_offset(lockfile_batch),
-		.help	= "Number of IOs to allow per file lock",
-		.parent	= "lockfile",
-		.def	= "1",
+		.def	= "none",
+		.posval = {
+			  { .ival = "none",
+			    .oval = FILE_LOCK_NONE,
+			    .help = "No file locking",
+			  },
+			  { .ival = "exclusive",
+			    .oval = FILE_LOCK_EXCLUSIVE,
+			    .help = "Exclusive file lock",
+			  },
+			  {
+			    .ival = "readwrite",
+			    .oval = FILE_LOCK_READWRITE,
+			    .help = "Read vs write lock",
+			  },
+		},
 	},
 	{
 		.name	= "opendir",
diff --git a/sem.c b/sem.c
deleted file mode 100644
index 8b25ad3..0000000
--- a/sem.c
+++ /dev/null
@@ -1,37 +0,0 @@
-#include <stdio.h>
-
-#include "sem.h"
-#include "smalloc.h"
-
-void fio_sem_remove(struct fio_sem *sem)
-{
-	sfree(sem);
-}
-
-struct fio_sem *fio_sem_init(int value)
-{
-	struct fio_sem *sem;
-
-	sem = smalloc(sizeof(*sem));
-	if (!sem)
-		return NULL;
-
-	sem->sem_val = value;
-
-	if (!sem_init(&sem->sem, 1, value))
-		return sem;
-
-	perror("sem_init");
-	sfree(sem);
-	return NULL;
-}
-
-void fio_sem_down(struct fio_sem *sem)
-{
-	sem_wait(&sem->sem);
-}
-
-void fio_sem_up(struct fio_sem *sem)
-{
-	sem_post(&sem->sem);
-}
diff --git a/sem.h b/sem.h
deleted file mode 100644
index a7b4664..0000000
--- a/sem.h
+++ /dev/null
@@ -1,16 +0,0 @@
-#ifndef FIO_SEM_H
-#define FIO_SEM_H
-
-#include <semaphore.h>
-
-struct fio_sem {
-	sem_t sem;
-	int sem_val;
-};
-
-extern struct fio_sem *fio_sem_init(int);
-extern void fio_sem_remove(struct fio_sem *);
-extern void fio_sem_down(struct fio_sem *);
-extern void fio_sem_up(struct fio_sem *);
-
-#endif
diff --git a/smalloc.c b/smalloc.c
index 9a7c25b..85da781 100644
--- a/smalloc.c
+++ b/smalloc.c
@@ -268,11 +268,12 @@
 
 void sinit(void)
 {
-	int ret = add_pool(&mp[0]);
+	int ret;
 
 #ifdef MP_SAFE
 	lock = fio_mutex_rw_init();
 #endif
+	ret = add_pool(&mp[0]);
 	assert(!ret);
 }