Add a real semaphore implemtation

I've seen races where job N+1 got started before N, this breaks
for dependent jobs. So give up and implement a real semaphore
in mmap'ed shared storage.

Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/Makefile b/Makefile
index 6af29a3..63289e6 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
 PROGS	= fio
 SCRIPTS = fio_generate_plots
 OBJS = gettime.o fio.o ioengines.o init.o stat.o log.o time.o md5.o crc32.o \
-	filesetup.o eta.o verify.o memory.o io_u.o parse.o
+	filesetup.o eta.o verify.o memory.o io_u.o parse.o mutex.o
 
 OBJS += engines/cpu.o
 OBJS += engines/libaio.o
diff --git a/fio.c b/fio.c
index 72cd02b..3cf2a9b 100644
--- a/fio.c
+++ b/fio.c
@@ -46,7 +46,7 @@
 int shm_id = 0;
 int temp_stall_ts;
 
-static volatile int startup_sem;
+static struct fio_sem *startup_sem;
 static volatile int fio_abort;
 static int exit_value;
 
@@ -731,8 +731,8 @@
 		goto err;
 
 	td_set_runstate(td, TD_INITIALIZED);
-	fio_sem_up(&startup_sem);
-	fio_sem_down(&td->mutex);
+	fio_sem_up(startup_sem);
+	fio_sem_down(td->mutex);
 
 	if (!td->create_serialize && setup_files(td))
 		goto err;
@@ -930,6 +930,8 @@
 				perror("pthread_join");
 		}
 
+		fio_sem_remove(td->mutex);
+
 		(*nr_running)--;
 		(*m_rate) -= td->ratemin;
 		(*t_rate) -= td->rate;
@@ -1030,7 +1032,6 @@
 			 */
 			td_set_runstate(td, TD_CREATED);
 			map[this_jobs++] = td;
-			fio_sem_init(&startup_sem, 1);
 			nr_started++;
 
 			if (td->use_thread) {
@@ -1039,14 +1040,13 @@
 					nr_started--;
 				}
 			} else {
-				if (fork())
-					fio_sem_down(&startup_sem);
-				else {
+				if (!fork()) {
 					int ret = fork_main(shm_id, i);
 
 					exit(ret);
 				}
 			}
+			fio_sem_down(startup_sem);
 		}
 
 		/*
@@ -1101,7 +1101,7 @@
 			m_rate += td->ratemin;
 			t_rate += td->rate;
 			todo--;
-			fio_sem_up(&td->mutex);
+			fio_sem_up(td->mutex);
 		}
 
 		reap_threads(&nr_running, &t_rate, &m_rate);
@@ -1151,6 +1151,8 @@
 		setup_log(&agg_io_log[DDIR_WRITE]);
 	}
 
+	startup_sem = fio_sem_init(0);
+
 	set_genesis_time();
 
 	disk_util_timer_arm();
@@ -1165,5 +1167,6 @@
 		}
 	}
 
+	fio_sem_remove(startup_sem);
 	return exit_value;
 }
diff --git a/fio.h b/fio.h
index 7f314de..12cf3c9 100644
--- a/fio.h
+++ b/fio.h
@@ -17,6 +17,7 @@
 #include "crc32.h"
 #include "arch.h"
 #include "os.h"
+#include "mutex.h"
 
 #ifdef FIO_HAVE_SYSLET
 #include "syslet.h"
@@ -431,7 +432,7 @@
 	unsigned long long io_bytes[2];
 	unsigned long long this_io_bytes[2];
 	unsigned long long zone_bytes;
-	volatile int mutex;
+	struct fio_sem *mutex;
 
 	/*
 	 * State for random io, a bitmap of blocks done vs not done
@@ -704,30 +705,6 @@
 extern void td_io_close_file(struct thread_data *, struct fio_file *);
 
 /*
- * This is a pretty crappy semaphore implementation, but with the use that fio
- * has (just signalling start/go conditions), it doesn't have to be better.
- * Naturally this would not work for any type of contended semaphore or
- * for real locking.
- */
-static inline void fio_sem_init(volatile int *sem, int val)
-{
-	*sem = val;
-}
-
-static inline void fio_sem_down(volatile int *sem)
-{
-	while (*sem == 0)
-		usleep(10000);
-
-	(*sem)--;
-}
-
-static inline void fio_sem_up(volatile int *sem)
-{
-	(*sem)++;
-}
-
-/*
  * If logging output to a file, stderr should go to both stderr and f_err
  */
 #define log_err(args...)	do {		\
diff --git a/init.c b/init.c
index a6d8bae..c75bed2 100644
--- a/init.c
+++ b/init.c
@@ -857,7 +857,7 @@
 		f->file_offset = td->start_offset;
 	}
 		
-	fio_sem_init(&td->mutex, 0);
+	td->mutex = fio_sem_init(0);
 
 	td->ts.clat_stat[0].min_val = td->ts.clat_stat[1].min_val = ULONG_MAX;
 	td->ts.slat_stat[0].min_val = td->ts.slat_stat[1].min_val = ULONG_MAX;
diff --git a/mutex.c b/mutex.c
new file mode 100644
index 0000000..bb417c2
--- /dev/null
+++ b/mutex.c
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+
+#include "mutex.h"
+
+void fio_sem_remove(struct fio_sem *sem)
+{
+	unlink(sem->sem_name);
+	munmap(sem, sizeof(*sem));
+}
+
+struct fio_sem *fio_sem_init(int value)
+{
+	pthread_mutexattr_t attr;
+	struct fio_sem *sem;
+	char sem_name[32];
+	int fd;
+
+	sprintf(sem_name, "/tmp/.fio_lock.XXXXXX");
+	fd = mkstemp(sem_name);
+	if (fd < 0) {
+		perror("open sem");
+		return NULL;
+	}
+
+	if (ftruncate(fd, sizeof(struct fio_sem)) < 0) {
+		perror("ftruncate sem");
+		return NULL;
+	}
+
+	sem = mmap(NULL, sizeof(struct fio_sem), PROT_READ | PROT_WRITE,
+			MAP_SHARED, fd, 0);
+	if (sem == MAP_FAILED) {
+		perror("mmap sem");
+		close(fd);
+		unlink(sem_name);
+		return NULL;
+	}
+
+	close(fd);
+	sem->value = value;
+	strcpy(sem->sem_name, sem_name);
+
+	if (pthread_mutexattr_init(&attr)) {
+		perror("pthread_mutexattr_init");
+		goto err;
+	}
+	if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)) {
+		perror("pthread_mutexattr_setpshared");
+		goto err;
+	}
+	if (pthread_mutex_init(&sem->lock, &attr)) {
+		perror("pthread_mutex_init");
+		goto err;
+	}
+
+	return sem;
+err:
+	munmap(sem, sizeof(*sem));
+	unlink(sem_name);
+	return NULL;
+}
+
+void fio_sem_down(struct fio_sem *sem)
+{
+	pthread_mutex_lock(&sem->lock);
+	while (sem->value == 0)
+		pthread_cond_wait(&sem->cond, &sem->lock);
+	sem->value--;
+	pthread_mutex_unlock(&sem->lock);
+}
+
+void fio_sem_up(struct fio_sem *sem)
+{
+	pthread_mutex_lock(&sem->lock);
+	if (!sem->value)
+		pthread_cond_signal(&sem->cond);
+	sem->value++;
+	pthread_mutex_unlock(&sem->lock);
+}
diff --git a/mutex.h b/mutex.h
new file mode 100644
index 0000000..483e5f4
--- /dev/null
+++ b/mutex.h
@@ -0,0 +1,19 @@
+#ifndef FIO_MUTEX_H
+#define FIO_MUTEX_H
+
+#include <pthread.h>
+
+struct fio_sem {
+	pthread_mutex_t lock;
+	pthread_cond_t cond;
+	unsigned int value;
+
+	char sem_name[32];
+};
+
+extern struct fio_sem *fio_sem_init(int);
+extern void fio_sem_remove(struct fio_sem *);
+extern inline void fio_sem_down(struct fio_sem *);
+extern inline void fio_sem_up(struct fio_sem *sem);
+
+#endif