ext2fs: Add Direct I/O support to the ext2fs library

This adds the basic support for Direct I/O to unix_io.c, and adds a
new flag EXT2_FLAG_DIRECT_IO which can be passed to ext2fs_open() or
ext2fs_open2() to request Direct I/O support.

Note that device mapper devices in Linux don't support Direct I/O, and
in some circumstances using Direct I/O can actually make performance
*worse*!

Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
diff --git a/lib/ext2fs/unix_io.c b/lib/ext2fs/unix_io.c
index 78a1d1c..1df1fdd 100644
--- a/lib/ext2fs/unix_io.c
+++ b/lib/ext2fs/unix_io.c
@@ -17,6 +17,7 @@
 
 #define _LARGEFILE_SOURCE
 #define _LARGEFILE64_SOURCE
+#define _GNU_SOURCE
 
 #include <stdio.h>
 #include <string.h>
@@ -47,10 +48,16 @@
 #include <sys/resource.h>
 #endif
 
-#if defined(__linux__) && defined(_IO) && !defined(BLKGETSIZE)
+#if defined(__linux__) && defined(_IO) && !defined(BLKROGET)
 #define BLKROGET   _IO(0x12, 94) /* Get read-only status (0 = read_write).  */
 #endif
 
+#if defined(__linux__) && defined(_IO) && !defined(BLKSSZGET)
+#define BLKSSZGET  _IO(0x12,104)/* get block device sector size */
+#endif
+
+#undef ALIGN_DEBUG
+
 #include "ext2_fs.h"
 #include "ext2fs.h"
 
@@ -77,12 +84,17 @@
 	int	magic;
 	int	dev;
 	int	flags;
+	int	align;
 	int	access_time;
 	ext2_loff_t offset;
 	struct unix_cache cache[CACHE_SIZE];
+	void	*bounce;
 	struct struct_io_stats io_stats;
 };
 
+#define IS_ALIGNED(n, align) ((((unsigned long) n) & \
+			       ((unsigned long) ((align)-1))) == 0)
+
 static errcode_t unix_open(const char *name, int flags, io_channel *channel);
 static errcode_t unix_close(io_channel channel);
 static errcode_t unix_set_blksize(io_channel channel, int blksize);
@@ -104,14 +116,6 @@
 static errcode_t unix_write_blk64(io_channel channel, unsigned long long block,
 				int count, const void *data);
 
-/* __FreeBSD_kernel__ is defined by GNU/kFreeBSD - the FreeBSD kernel
- * does not know buffered block devices - everything is raw. */
-#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
-#define NEED_BOUNCE_BUFFER
-#else
-#undef NEED_BOUNCE_BUFFER
-#endif
-
 static struct struct_io_manager struct_unix_manager = {
 	EXT2_ET_MAGIC_IO_MANAGER,
 	"Unix I/O Manager",
@@ -121,11 +125,7 @@
 	unix_read_blk,
 	unix_write_blk,
 	unix_flush,
-#ifdef NEED_BOUNCE_BUFFER
-	0,
-#else
 	unix_write_byte,
-#endif
 	unix_set_option,
 	unix_get_stats,
 	unix_read_blk64,
@@ -153,7 +153,6 @@
 /*
  * Here are the raw I/O functions
  */
-#ifndef NEED_BOUNCE_BUFFER
 static errcode_t raw_read_blk(io_channel channel,
 			      struct unix_private_data *data,
 			      unsigned long long block,
@@ -171,12 +170,38 @@
 		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
 		goto error_out;
 	}
-	actual = read(data->dev, buf, size);
-	if (actual != size) {
-		if (actual < 0)
-			actual = 0;
-		retval = EXT2_ET_SHORT_READ;
-		goto error_out;
+	if ((data->align == 0) ||
+	    ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
+		actual = read(data->dev, buf, size);
+		if (actual != size) {
+		short_read:
+			if (actual < 0)
+				actual = 0;
+			retval = EXT2_ET_SHORT_READ;
+			goto error_out;
+		}
+		return 0;
+	}
+
+#ifdef ALIGN_DEBUG
+	printf("raw_read_blk: O_DIRECT fallback: %p %lu\n", buf,
+	       (unsigned long) size);
+#endif
+
+	/*
+	 * The buffer or size which we're trying to read isn't aligned
+	 * to the O_DIRECT rules, so we need to do this the hard way...
+	 */
+	while (size > 0) {
+		actual = read(data->dev, data->bounce, channel->block_size);
+		if (actual != channel->block_size)
+			goto short_read;
+		actual = size;
+		if (size > channel->block_size)
+			actual = channel->block_size;
+		memcpy(buf, data->bounce, actual);
+		size -= actual;
+		buf += actual;
 	}
 	return 0;
 
@@ -187,61 +212,6 @@
 					       size, actual, retval);
 	return retval;
 }
-#else /* NEED_BOUNCE_BUFFER */
-/*
- * Windows and FreeBSD block devices only allow sector alignment IO in offset and size
- */
-static errcode_t raw_read_blk(io_channel channel,
-			      struct unix_private_data *data,
-			      unsigned long block,
-			      int count, void *buf)
-{
-	errcode_t	retval;
-	size_t		size, alignsize, fragment;
-	ext2_loff_t	location;
-	int		total = 0, actual;
-#define BLOCKALIGN 512
-	char		sector[BLOCKALIGN];
-
-	size = (count < 0) ? -count : count * channel->block_size;
-	data->io_stats.bytes_read += size;
-	location = ((ext2_loff_t) block * channel->block_size) + data->offset;
-#ifdef DEBUG
-	printf("count=%d, size=%d, block=%lu, blk_size=%d, location=%llx\n",
-	 		count, size, block, channel->block_size, (long long)location);
-#endif
-	if (ext2fs_llseek(data->dev, location, SEEK_SET) != location) {
-		retval = errno ? errno : EXT2_ET_LLSEEK_FAILED;
-		goto error_out;
-	}
-	fragment = size % BLOCKALIGN;
-	alignsize = size - fragment;
-	if (alignsize) {
-		actual = read(data->dev, buf, alignsize);
-		if (actual != alignsize)
-			goto short_read;
-	}
-	if (fragment) {
-		actual = read(data->dev, sector, BLOCKALIGN);
-		if (actual != BLOCKALIGN)
-			goto short_read;
-		memcpy(buf+alignsize, sector, fragment);
-	}
-	return 0;
-
-short_read:
-	if (actual>0)
-		total += actual;
-	retval = EXT2_ET_SHORT_READ;
-
-error_out:
-	memset((char *) buf+total, 0, size-actual);
-	if (channel->read_error)
-		retval = (channel->read_error)(channel, block, count, buf,
-					       size, actual, retval);
-	return retval;
-}
-#endif
 
 static errcode_t raw_write_blk(io_channel channel,
 			       struct unix_private_data *data,
@@ -269,10 +239,43 @@
 		goto error_out;
 	}
 
-	actual = write(data->dev, buf, size);
-	if (actual != size) {
-		retval = EXT2_ET_SHORT_WRITE;
-		goto error_out;
+	if ((data->align == 0) ||
+	    ((IS_ALIGNED(buf, data->align)) && IS_ALIGNED(size, data->align))) {
+		actual = write(data->dev, buf, size);
+		if (actual != size) {
+		short_write:
+			retval = EXT2_ET_SHORT_WRITE;
+			goto error_out;
+		}
+		return 0;
+	}
+
+#ifdef ALIGN_DEBUG
+	printf("raw_write_blk: O_DIRECT fallback: %p %lu\n", buf,
+	       (unsigned long) size);
+#endif
+	/*
+	 * The buffer or size which we're trying to write isn't aligned
+	 * to the O_DIRECT rules, so we need to do this the hard way...
+	 */
+	while (size > 0) {
+		if (size < channel->block_size) {
+			actual = read(data->dev, data->bounce,
+				      channel->block_size);
+			if (actual != channel->block_size) {
+				retval = EXT2_ET_SHORT_READ;
+				goto error_out;
+			}
+		}
+		actual = size;
+		if (size > channel->block_size)
+			actual = channel->block_size;
+		memcpy(data->bounce, buf, actual);
+		actual = write(data->dev, data->bounce, channel->block_size);
+		if (actual != channel->block_size)
+			goto short_write;
+		size -= actual;
+		buf += actual;
 	}
 	return 0;
 
@@ -304,11 +307,18 @@
 		cache->in_use = 0;
 		if (cache->buf)
 			ext2fs_free_mem(&cache->buf);
-		if ((retval = ext2fs_get_mem(channel->block_size,
-					     &cache->buf)))
+		retval = ext2fs_get_memalign(channel->block_size,
+					     data->align, &cache->buf);
+		if (retval)
 			return retval;
 	}
-	return 0;
+	if (data->align) {
+		if (data->bounce)
+			ext2fs_free_mem(&data->bounce);
+		retval = ext2fs_get_memalign(channel->block_size, data->align,
+					     &data->bounce);
+	}
+	return retval;
 }
 
 /* Free the cache buffers */
@@ -325,8 +335,9 @@
 		cache->in_use = 0;
 		if (cache->buf)
 			ext2fs_free_mem(&cache->buf);
-		cache->buf = 0;
 	}
+	if (data->bounce)
+		ext2fs_free_mem(&data->bounce);
 }
 
 #ifndef NO_IO_CACHE
@@ -449,12 +460,13 @@
 	data->magic = EXT2_ET_MAGIC_UNIX_IO_CHANNEL;
 	data->io_stats.num_fields = 2;
 
-	if ((retval = alloc_cache(io, data)))
-		goto cleanup;
-
 	open_flags = (flags & IO_FLAG_RW) ? O_RDWR : O_RDONLY;
 	if (flags & IO_FLAG_EXCLUSIVE)
 		open_flags |= O_EXCL;
+	if (flags & IO_FLAG_DIRECT_IO)
+		open_flags |= O_DIRECT;
+	data->flags = flags;
+
 #ifdef HAVE_OPEN64
 	data->dev = open64(io->name, open_flags);
 #else
@@ -465,6 +477,25 @@
 		goto cleanup;
 	}
 
+#ifdef BLKSSZGET
+	if (flags & IO_FLAG_DIRECT_IO) {
+		if (ioctl(data->dev, BLKSSZGET, &data->align) != 0)
+			data->align = io->block_size;
+	}
+#endif
+
+#if defined(__CYGWIN__) || defined(__FreeBSD__) || defined(__FreeBSD_kernel__)
+	/*
+	 * Some operating systems require that the buffers be aligned,
+	 * regardless of O_DIRECT
+	 */
+	data->align = 512;
+#endif
+
+
+	if ((retval = alloc_cache(io, data)))
+		goto cleanup;
+
 #ifdef BLKROGET
 	if (flags & IO_FLAG_RW) {
 		int error;
@@ -617,6 +648,22 @@
 			cp += channel->block_size;
 			continue;
 		}
+		if (count == 1) {
+			/*
+			 * Special case where we read directly into the
+			 * cache buffer; important in the O_DIRECT case
+			 */
+			cache = reuse[0];
+			reuse_cache(channel, data, cache, block);
+			if ((retval = raw_read_blk(channel, data, block, 1,
+						   cache->buf))) {
+				cache->in_use = 0;
+				return retval;
+			}
+			memcpy(cp, cache->buf, channel->block_size);
+			return 0;
+		}
+
 		/*
 		 * Find the number of uncached blocks so we can do a
 		 * single read request
@@ -718,6 +765,13 @@
 	data = (struct unix_private_data *) channel->private_data;
 	EXT2_CHECK_MAGIC(data, EXT2_ET_MAGIC_UNIX_IO_CHANNEL);
 
+	if (data->align != 0) {
+#ifdef ALIGN_DEBUG
+		printf("unix_write_byte: O_DIRECT fallback\n");
+#endif
+		return EXT2_ET_UNIMPLEMENTED;
+	}
+
 #ifndef NO_IO_CACHE
 	/*
 	 * Flush out the cache completely