[PATCH] Introduce an extra runstate for monitoring thread startup
This way we can detect whether a thread is hanging in init, and take
it down after a timeout. This would have caught the /dev/random
problems fixed yesterday.
Also limit the Client message printed at the beginning to only show
the first and last of identical job types instead of everything
in between.
diff --git a/fio.c b/fio.c
index df28458..bedad68 100644
--- a/fio.c
+++ b/fio.c
@@ -65,6 +65,7 @@
enum {
TD_NOT_CREATED = 0,
TD_CREATED,
+ TD_INITIALIZED,
TD_RUNNING,
TD_VERIFYING,
TD_EXITED,
@@ -76,6 +77,7 @@
static sem_t startup_sem;
#define TERMINATE_ALL (-1)
+#define JOB_START_TIMEOUT (5 * 1000)
static void terminate_threads(int group_id)
{
@@ -1829,15 +1831,16 @@
}
}
+ if (init_random_state(td))
+ goto err;
+
+ td_set_runstate(td, TD_INITIALIZED);
sem_post(&startup_sem);
sem_wait(&td->mutex);
if (!td->create_serialize && setup_file(td))
goto err;
- if (init_random_state(td))
- goto err;
-
gettimeofday(&td->epoch, NULL);
while (td->loops--) {
@@ -2035,6 +2038,9 @@
case TD_CREATED:
c = 'C';
break;
+ case TD_INITIALIZED:
+ c = 'I';
+ break;
case TD_NOT_CREATED:
c = 'P';
break;
@@ -2106,7 +2112,8 @@
if (td->timeout && eta_sec > (td->timeout - elapsed))
eta_sec = td->timeout - elapsed;
- } else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED) {
+ } else if (td->runstate == TD_NOT_CREATED || td->runstate == TD_CREATED
+ || td->runstate == TD_INITIALIZED) {
int t_eta = 0, r_eta = 0;
/*
@@ -2317,6 +2324,10 @@
gettimeofday(&genesis, NULL);
while (todo) {
+ struct thread_data *map[MAX_JOBS];
+ struct timeval this_start;
+ int this_jobs = 0, left;
+
/*
* create threads (TD_NOT_CREATED -> TD_CREATED)
*/
@@ -2345,9 +2356,13 @@
if (td->stonewall && (nr_started || nr_running))
break;
+ /*
+ * Set state to created. Thread will transition
+ * to TD_INITIALIZED when it's done setting up.
+ */
td_set_runstate(td, TD_CREATED);
+ map[this_jobs++] = td;
sem_init(&startup_sem, 0, 1);
- todo--;
nr_started++;
if (td->use_thread) {
@@ -2366,12 +2381,50 @@
}
/*
- * start created threads (TD_CREATED -> TD_RUNNING)
+ * Wait for the started threads to transition to
+ * TD_INITIALIZED.
*/
+ printf("fio: Waiting for threads to initialize...\n");
+ gettimeofday(&this_start, NULL);
+ left = this_jobs;
+ while (left) {
+ if (mtime_since_now(&this_start) > JOB_START_TIMEOUT)
+ break;
+
+ usleep(100000);
+
+ for (i = 0; i < this_jobs; i++) {
+ td = map[i];
+ if (!td)
+ continue;
+ if (td->runstate == TD_INITIALIZED ||
+ td->runstate >= TD_EXITED) {
+ map[i] = NULL;
+ left--;
+ continue;
+ }
+ }
+ }
+
+ if (left) {
+ fprintf(stderr, "fio: %d jobs failed to start\n", left);
+ for (i = 0; i < this_jobs; i++) {
+ td = map[i];
+ if (!td)
+ continue;
+ kill(td->pid, SIGTERM);
+ }
+ break;
+ }
+
+ /*
+ * start created threads (TD_INITIALIZED -> TD_RUNNING)
+ */
+ printf("fio: Go for launch\n");
for (i = 0; i < thread_number; i++) {
td = &threads[i];
- if (td->runstate != TD_CREATED)
+ if (td->runstate != TD_INITIALIZED)
continue;
td_set_runstate(td, TD_RUNNING);
@@ -2379,6 +2432,7 @@
nr_started--;
m_rate += td->ratemin;
t_rate += td->rate;
+ todo--;
sem_post(&td->mutex);
}