ext4: introduce reserved space Currently in ENOSPC condition when writing into unwritten space, or punching a hole, we might need to split the extent and grow extent tree. However since we can not allocate any new metadata blocks we'll have to zero out unwritten part of extent or punched out part of extent, or in the worst case return ENOSPC even though use actually does not allocate any space. Also in delalloc path we do reserve metadata and data blocks for the time we're going to write out, however metadata block reservation is very tricky especially since we expect that logical connectivity implies physical connectivity, however that might not be the case and hence we might end up allocating more metadata blocks than previously reserved. So in future, metadata reservation checks should be removed since we can not assure that we do not under reserve. And this is where reserved space comes into the picture. When mounting the file system we slice off a little bit of the file system space (2% or 4096 clusters, whichever is smaller) which can be then used for the cases mentioned above to prevent costly zeroout, or unexpected ENOSPC. The number of reserved clusters can be set via sysfs, however it can never be bigger than number of free clusters in the file system. Note that this patch fixes the failure of xfstest 274 as expected. Signed-off-by: Lukas Czerner <lczerner@redhat.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu> Reviewed-by: Carlos Maiolino <cmaiolino@redhat.com>

commit: 27dd43854227bb0e6ab70129bd21b60d396db2e7 [log] [tgz]
author: Lukas Czerner <lczerner@redhat.com> Tue Apr 09 22:11:22 2013 -0400
committer: Theodore Ts'o <tytso@mit.edu> Tue Apr 09 22:11:22 2013 -0400
tree: 14e490b9d0ac63583849c4cad4b3ad0123902c3a
parent: f45a5ef91bef7e02149a216ed6dc3fcdd8b38268 [diff] [blame]
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 968ca93..6fea87d 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c

@@ -81,6 +81,7 @@
 static void ext4_destroy_lazyinit_thread(void);
 static void ext4_unregister_li_request(struct super_block *sb);
 static void ext4_clear_request_list(void);
+static int ext4_reserve_clusters(struct ext4_sb_info *, ext4_fsblk_t);
 
 #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT23)
 static struct file_system_type ext2_fs_type = {
@@ -2382,6 +2383,17 @@
 	int offset;
 };
 
+static int parse_strtoull(const char *buf,
+		unsigned long long max, unsigned long long *value)
+{
+	int ret;
+
+	ret = kstrtoull(skip_spaces(buf), 0, value);
+	if (!ret && *value > max)
+		ret = -EINVAL;
+	return ret;
+}
+
 static int parse_strtoul(const char *buf,
 		unsigned long max, unsigned long *value)
 {
@@ -2466,6 +2478,27 @@
 	return count;
 }
 
+static ssize_t reserved_clusters_show(struct ext4_attr *a,
+				  struct ext4_sb_info *sbi, char *buf)
+{
+	return snprintf(buf, PAGE_SIZE, "%llu\n",
+		(unsigned long long) atomic64_read(&sbi->s_resv_clusters));
+}
+
+static ssize_t reserved_clusters_store(struct ext4_attr *a,
+				   struct ext4_sb_info *sbi,
+				   const char *buf, size_t count)
+{
+	unsigned long long val;
+	int ret;
+
+	if (parse_strtoull(buf, -1ULL, &val))
+		return -EINVAL;
+	ret = ext4_reserve_clusters(sbi, val);
+
+	return ret ? ret : count;
+}
+
 static ssize_t trigger_test_error(struct ext4_attr *a,
 				  struct ext4_sb_info *sbi,
 				  const char *buf, size_t count)
@@ -2503,6 +2536,7 @@
 EXT4_RO_ATTR(delayed_allocation_blocks);
 EXT4_RO_ATTR(session_write_kbytes);
 EXT4_RO_ATTR(lifetime_write_kbytes);
+EXT4_RW_ATTR(reserved_clusters);
 EXT4_ATTR_OFFSET(inode_readahead_blks, 0644, sbi_ui_show,
 		 inode_readahead_blks_store, s_inode_readahead_blks);
 EXT4_RW_ATTR_SBI_UI(inode_goal, s_inode_goal);
@@ -2520,6 +2554,7 @@
 	ATTR_LIST(delayed_allocation_blocks),
 	ATTR_LIST(session_write_kbytes),
 	ATTR_LIST(lifetime_write_kbytes),
+	ATTR_LIST(reserved_clusters),
 	ATTR_LIST(inode_readahead_blks),
 	ATTR_LIST(inode_goal),
 	ATTR_LIST(mb_stats),
@@ -3195,6 +3230,40 @@
 	return 0;
 }
 
+
+static ext4_fsblk_t ext4_calculate_resv_clusters(struct ext4_sb_info *sbi)
+{
+	ext4_fsblk_t resv_clusters;
+
+	/*
+	 * By default we reserve 2% or 4096 clusters, whichever is smaller.
+	 * This should cover the situations where we can not afford to run
+	 * out of space like for example punch hole, or converting
+	 * uninitialized extents in delalloc path. In most cases such
+	 * allocation would require 1, or 2 blocks, higher numbers are
+	 * very rare.
+	 */
+	resv_clusters = ext4_blocks_count(sbi->s_es) >> sbi->s_cluster_bits;
+
+	do_div(resv_clusters, 50);
+	resv_clusters = min_t(ext4_fsblk_t, resv_clusters, 4096);
+
+	return resv_clusters;
+}
+
+
+static int ext4_reserve_clusters(struct ext4_sb_info *sbi, ext4_fsblk_t count)
+{
+	ext4_fsblk_t clusters = ext4_blocks_count(sbi->s_es) >>
+				sbi->s_cluster_bits;
+
+	if (count >= clusters)
+		return -EINVAL;
+
+	atomic64_set(&sbi->s_resv_clusters, count);
+	return 0;
+}
+
 static int ext4_fill_super(struct super_block *sb, void *data, int silent)
 {
 	char *orig_data = kstrdup(data, GFP_KERNEL);
@@ -3918,6 +3987,13 @@
 			 "available");
 	}
 
+	err = ext4_reserve_clusters(sbi, ext4_calculate_resv_clusters(sbi));
+	if (err) {
+		ext4_msg(sb, KERN_ERR, "failed to reserve %llu clusters for "
+			 "reserved pool", ext4_calculate_resv_clusters(sbi));
+		goto failed_mount4a;
+	}
+
 	err = ext4_setup_system_zone(sb);
 	if (err) {
 		ext4_msg(sb, KERN_ERR, "failed to initialize system "
@@ -4750,9 +4826,10 @@
 	struct super_block *sb = dentry->d_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	struct ext4_super_block *es = sbi->s_es;
-	ext4_fsblk_t overhead = 0;
+	ext4_fsblk_t overhead = 0, resv_blocks;
 	u64 fsid;
 	s64 bfree;
+	resv_blocks = EXT4_C2B(sbi, atomic64_read(&sbi->s_resv_clusters));
 
 	if (!test_opt(sb, MINIX_DF))
 		overhead = sbi->s_overhead;
@@ -4764,8 +4841,9 @@
 		percpu_counter_sum_positive(&sbi->s_dirtyclusters_counter);
 	/* prevent underflow in case that few free space is available */
 	buf->f_bfree = EXT4_C2B(sbi, max_t(s64, bfree, 0));
-	buf->f_bavail = buf->f_bfree - ext4_r_blocks_count(es);
-	if (buf->f_bfree < ext4_r_blocks_count(es))
+	buf->f_bavail = buf->f_bfree -
+			(ext4_r_blocks_count(es) + resv_blocks);
+	if (buf->f_bfree < (ext4_r_blocks_count(es) + resv_blocks))
 		buf->f_bavail = 0;
 	buf->f_files = le32_to_cpu(es->s_inodes_count);
 	buf->f_ffree = percpu_counter_sum_positive(&sbi->s_freeinodes_counter);
commit	27dd43854227bb0e6ab70129bd21b60d396db2e7	[log] [tgz]
author	Lukas Czerner <lczerner@redhat.com>	Tue Apr 09 22:11:22 2013 -0400
committer	Theodore Ts'o <tytso@mit.edu>	Tue Apr 09 22:11:22 2013 -0400
tree	14e490b9d0ac63583849c4cad4b3ad0123902c3a
parent	f45a5ef91bef7e02149a216ed6dc3fcdd8b38268 [diff] [blame]