file hash: don't close file fd on lookup/add race

On Linux, udev often has a rule that triggers blkid to read data off
the device, if the device has been opened for writing. This triggers
when the device is closed.

Fio has an internal file hash for lookup of files. When threads start,
it can happen that one thread does a hash lookup without finding the
file, but when it has opened the file and attempts to insert it into
the hash, another thread has already completed that operation. When
that race happens, fio closes the file handle and does the lookup
again. That then triggers blkid to read pages off the device. As data
in the cache is invalidated on open of the device, we know have page
cache entries for the device again.

This is a problem for unbuffered workloads, where the existance of
page cache pages slows it down due to having to check for invalidation.
The user observed problem is that fio exhibits bi-modal performance
results, depending on whether the file hash race was hit during setup
or not.

Fix this by NOT closing the file if we hit this race, but instead wait
until the file is closed after the run.

Signed-off-by: Jens Axboe <axboe@kernel.dk>
diff --git a/filesetup.c b/filesetup.c
index f4e1adc..9d3e062 100644
--- a/filesetup.c
+++ b/filesetup.c
@@ -434,6 +434,12 @@
 		ret = errno;
 
 	f->fd = -1;
+
+	if (f->shadow_fd != -1) {
+		close(f->shadow_fd);
+		f->shadow_fd = -1;
+	}
+
 	return ret;
 }
 
@@ -462,6 +468,24 @@
 	return from_hash;
 }
 
+static int file_close_shadow_fds(struct thread_data *td)
+{
+	struct fio_file *f;
+	int num_closed = 0;
+	unsigned int i;
+
+	for_each_file(td, f, i) {
+		if (f->shadow_fd == -1)
+			continue;
+
+		close(f->shadow_fd);
+		f->shadow_fd = -1;
+		num_closed++;
+	}
+
+	return num_closed;
+}
+
 int generic_open_file(struct thread_data *td, struct fio_file *f)
 {
 	int is_std = 0;
@@ -536,6 +560,8 @@
 			flags &= ~FIO_O_NOATIME;
 			goto open_again;
 		}
+		if (__e == EMFILE && file_close_shadow_fds(td))
+			goto open_again;
 
 		snprintf(buf, sizeof(buf) - 1, "open(%s)", f->file_name);
 
@@ -552,9 +578,22 @@
 			int fio_unused ret;
 
 			/*
-			 * OK to ignore, we haven't done anything with it
+			 * Stash away descriptor for later close. This is to
+			 * work-around a "feature" on Linux, where a close of
+			 * an fd that has been opened for write will trigger
+			 * udev to call blkid to check partitions, fs id, etc.
+			 * That polutes the device cache, which can slow down
+			 * unbuffered accesses.
 			 */
-			ret = generic_close_file(td, f);
+			if (f->shadow_fd == -1)
+				f->shadow_fd = f->fd;
+			else {
+				/*
+			 	 * OK to ignore, we haven't done anything
+				 * with it
+				 */
+				ret = generic_close_file(td, f);
+			}
 			goto open_again;
 		}
 	}
@@ -1029,6 +1068,7 @@
 	}
 
 	f->fd = -1;
+	f->shadow_fd = -1;
 	fio_file_reset(f);
 
 	if (td->files_size <= td->files_index) {