Add a real semaphore implemtation
I've seen races where job N+1 got started before N, this breaks
for dependent jobs. So give up and implement a real semaphore
in mmap'ed shared storage.
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
diff --git a/Makefile b/Makefile
index 6af29a3..63289e6 100644
--- a/Makefile
+++ b/Makefile
@@ -6,7 +6,7 @@
PROGS = fio
SCRIPTS = fio_generate_plots
OBJS = gettime.o fio.o ioengines.o init.o stat.o log.o time.o md5.o crc32.o \
- filesetup.o eta.o verify.o memory.o io_u.o parse.o
+ filesetup.o eta.o verify.o memory.o io_u.o parse.o mutex.o
OBJS += engines/cpu.o
OBJS += engines/libaio.o
diff --git a/fio.c b/fio.c
index 72cd02b..3cf2a9b 100644
--- a/fio.c
+++ b/fio.c
@@ -46,7 +46,7 @@
int shm_id = 0;
int temp_stall_ts;
-static volatile int startup_sem;
+static struct fio_sem *startup_sem;
static volatile int fio_abort;
static int exit_value;
@@ -731,8 +731,8 @@
goto err;
td_set_runstate(td, TD_INITIALIZED);
- fio_sem_up(&startup_sem);
- fio_sem_down(&td->mutex);
+ fio_sem_up(startup_sem);
+ fio_sem_down(td->mutex);
if (!td->create_serialize && setup_files(td))
goto err;
@@ -930,6 +930,8 @@
perror("pthread_join");
}
+ fio_sem_remove(td->mutex);
+
(*nr_running)--;
(*m_rate) -= td->ratemin;
(*t_rate) -= td->rate;
@@ -1030,7 +1032,6 @@
*/
td_set_runstate(td, TD_CREATED);
map[this_jobs++] = td;
- fio_sem_init(&startup_sem, 1);
nr_started++;
if (td->use_thread) {
@@ -1039,14 +1040,13 @@
nr_started--;
}
} else {
- if (fork())
- fio_sem_down(&startup_sem);
- else {
+ if (!fork()) {
int ret = fork_main(shm_id, i);
exit(ret);
}
}
+ fio_sem_down(startup_sem);
}
/*
@@ -1101,7 +1101,7 @@
m_rate += td->ratemin;
t_rate += td->rate;
todo--;
- fio_sem_up(&td->mutex);
+ fio_sem_up(td->mutex);
}
reap_threads(&nr_running, &t_rate, &m_rate);
@@ -1151,6 +1151,8 @@
setup_log(&agg_io_log[DDIR_WRITE]);
}
+ startup_sem = fio_sem_init(0);
+
set_genesis_time();
disk_util_timer_arm();
@@ -1165,5 +1167,6 @@
}
}
+ fio_sem_remove(startup_sem);
return exit_value;
}
diff --git a/fio.h b/fio.h
index 7f314de..12cf3c9 100644
--- a/fio.h
+++ b/fio.h
@@ -17,6 +17,7 @@
#include "crc32.h"
#include "arch.h"
#include "os.h"
+#include "mutex.h"
#ifdef FIO_HAVE_SYSLET
#include "syslet.h"
@@ -431,7 +432,7 @@
unsigned long long io_bytes[2];
unsigned long long this_io_bytes[2];
unsigned long long zone_bytes;
- volatile int mutex;
+ struct fio_sem *mutex;
/*
* State for random io, a bitmap of blocks done vs not done
@@ -704,30 +705,6 @@
extern void td_io_close_file(struct thread_data *, struct fio_file *);
/*
- * This is a pretty crappy semaphore implementation, but with the use that fio
- * has (just signalling start/go conditions), it doesn't have to be better.
- * Naturally this would not work for any type of contended semaphore or
- * for real locking.
- */
-static inline void fio_sem_init(volatile int *sem, int val)
-{
- *sem = val;
-}
-
-static inline void fio_sem_down(volatile int *sem)
-{
- while (*sem == 0)
- usleep(10000);
-
- (*sem)--;
-}
-
-static inline void fio_sem_up(volatile int *sem)
-{
- (*sem)++;
-}
-
-/*
* If logging output to a file, stderr should go to both stderr and f_err
*/
#define log_err(args...) do { \
diff --git a/init.c b/init.c
index a6d8bae..c75bed2 100644
--- a/init.c
+++ b/init.c
@@ -857,7 +857,7 @@
f->file_offset = td->start_offset;
}
- fio_sem_init(&td->mutex, 0);
+ td->mutex = fio_sem_init(0);
td->ts.clat_stat[0].min_val = td->ts.clat_stat[1].min_val = ULONG_MAX;
td->ts.slat_stat[0].min_val = td->ts.slat_stat[1].min_val = ULONG_MAX;
diff --git a/mutex.c b/mutex.c
new file mode 100644
index 0000000..bb417c2
--- /dev/null
+++ b/mutex.c
@@ -0,0 +1,85 @@
+#include <stdio.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdlib.h>
+#include <fcntl.h>
+#include <pthread.h>
+#include <sys/mman.h>
+
+#include "mutex.h"
+
+void fio_sem_remove(struct fio_sem *sem)
+{
+ unlink(sem->sem_name);
+ munmap(sem, sizeof(*sem));
+}
+
+struct fio_sem *fio_sem_init(int value)
+{
+ pthread_mutexattr_t attr;
+ struct fio_sem *sem;
+ char sem_name[32];
+ int fd;
+
+ sprintf(sem_name, "/tmp/.fio_lock.XXXXXX");
+ fd = mkstemp(sem_name);
+ if (fd < 0) {
+ perror("open sem");
+ return NULL;
+ }
+
+ if (ftruncate(fd, sizeof(struct fio_sem)) < 0) {
+ perror("ftruncate sem");
+ return NULL;
+ }
+
+ sem = mmap(NULL, sizeof(struct fio_sem), PROT_READ | PROT_WRITE,
+ MAP_SHARED, fd, 0);
+ if (sem == MAP_FAILED) {
+ perror("mmap sem");
+ close(fd);
+ unlink(sem_name);
+ return NULL;
+ }
+
+ close(fd);
+ sem->value = value;
+ strcpy(sem->sem_name, sem_name);
+
+ if (pthread_mutexattr_init(&attr)) {
+ perror("pthread_mutexattr_init");
+ goto err;
+ }
+ if (pthread_mutexattr_setpshared(&attr, PTHREAD_PROCESS_SHARED)) {
+ perror("pthread_mutexattr_setpshared");
+ goto err;
+ }
+ if (pthread_mutex_init(&sem->lock, &attr)) {
+ perror("pthread_mutex_init");
+ goto err;
+ }
+
+ return sem;
+err:
+ munmap(sem, sizeof(*sem));
+ unlink(sem_name);
+ return NULL;
+}
+
+void fio_sem_down(struct fio_sem *sem)
+{
+ pthread_mutex_lock(&sem->lock);
+ while (sem->value == 0)
+ pthread_cond_wait(&sem->cond, &sem->lock);
+ sem->value--;
+ pthread_mutex_unlock(&sem->lock);
+}
+
+void fio_sem_up(struct fio_sem *sem)
+{
+ pthread_mutex_lock(&sem->lock);
+ if (!sem->value)
+ pthread_cond_signal(&sem->cond);
+ sem->value++;
+ pthread_mutex_unlock(&sem->lock);
+}
diff --git a/mutex.h b/mutex.h
new file mode 100644
index 0000000..483e5f4
--- /dev/null
+++ b/mutex.h
@@ -0,0 +1,19 @@
+#ifndef FIO_MUTEX_H
+#define FIO_MUTEX_H
+
+#include <pthread.h>
+
+struct fio_sem {
+ pthread_mutex_t lock;
+ pthread_cond_t cond;
+ unsigned int value;
+
+ char sem_name[32];
+};
+
+extern struct fio_sem *fio_sem_init(int);
+extern void fio_sem_remove(struct fio_sem *);
+extern inline void fio_sem_down(struct fio_sem *);
+extern inline void fio_sem_up(struct fio_sem *sem);
+
+#endif