[PATCH] ext3: add extent map support

On disk extents format:
/*
* this is extent on-disk structure
* it's used at the bottom of the tree
*/
struct ext3_extent {
__le32  ee_block;       /* first logical block extent covers */
__le16  ee_len;         /* number of blocks covered by extent */
__le16  ee_start_hi;    /* high 16 bits of physical block */
__le32  ee_start;       /* low 32 bigs of physical block */
};

Signed-off-by: Alex Tomas <alex@clusterfs.com>
Signed-off-by: Dave Kleikamp <shaggy@austin.ibm.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
diff --git a/include/linux/ext4_fs.h b/include/linux/ext4_fs.h
index f582cd7..b61181aad 100644
--- a/include/linux/ext4_fs.h
+++ b/include/linux/ext4_fs.h
@@ -178,8 +178,9 @@
 #define EXT4_DIRSYNC_FL			0x00010000 /* dirsync behaviour (directories only) */
 #define EXT4_TOPDIR_FL			0x00020000 /* Top of directory hierarchies*/
 #define EXT4_RESERVED_FL		0x80000000 /* reserved for ext4 lib */
+#define EXT4_EXTENTS_FL			0x00080000 /* Inode uses extents */
 
-#define EXT4_FL_USER_VISIBLE		0x0003DFFF /* User visible flags */
+#define EXT4_FL_USER_VISIBLE		0x000BDFFF /* User visible flags */
 #define EXT4_FL_USER_MODIFIABLE		0x000380FF /* User modifiable flags */
 
 /*
@@ -384,6 +385,7 @@
 #define EXT4_MOUNT_QUOTA		0x80000 /* Some quota option set */
 #define EXT4_MOUNT_USRQUOTA		0x100000 /* "old" user quota */
 #define EXT4_MOUNT_GRPQUOTA		0x200000 /* "old" group quota */
+#define EXT4_MOUNT_EXTENTS		0x400000 /* Extents support */
 
 /* Compatibility, for having both ext2_fs.h and ext4_fs.h included at once */
 #ifndef _LINUX_EXT2_FS_H
@@ -582,11 +584,13 @@
 #define EXT4_FEATURE_INCOMPAT_RECOVER		0x0004 /* Needs recovery */
 #define EXT4_FEATURE_INCOMPAT_JOURNAL_DEV	0x0008 /* Journal device */
 #define EXT4_FEATURE_INCOMPAT_META_BG		0x0010
+#define EXT4_FEATURE_INCOMPAT_EXTENTS		0x0040 /* extents support */
 
 #define EXT4_FEATURE_COMPAT_SUPP	EXT2_FEATURE_COMPAT_EXT_ATTR
 #define EXT4_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
 					 EXT4_FEATURE_INCOMPAT_RECOVER| \
-					 EXT4_FEATURE_INCOMPAT_META_BG)
+					 EXT4_FEATURE_INCOMPAT_META_BG| \
+					 EXT4_FEATURE_INCOMPAT_EXTENTS)
 #define EXT4_FEATURE_RO_COMPAT_SUPP	(EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
 					 EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
 					 EXT4_FEATURE_RO_COMPAT_BTREE_DIR)
@@ -825,6 +829,9 @@
 extern void ext4_truncate (struct inode *);
 extern void ext4_set_inode_flags(struct inode *);
 extern void ext4_set_aops(struct inode *inode);
+extern int ext4_writepage_trans_blocks(struct inode *);
+extern int ext4_block_truncate_page(handle_t *handle, struct page *page,
+		struct address_space *mapping, loff_t from);
 
 /* ioctl.c */
 extern int ext4_ioctl (struct inode *, struct file *, unsigned int,
@@ -879,6 +886,28 @@
 extern struct inode_operations ext4_symlink_inode_operations;
 extern struct inode_operations ext4_fast_symlink_inode_operations;
 
+/* extents.c */
+extern int ext4_ext_tree_init(handle_t *handle, struct inode *);
+extern int ext4_ext_writepage_trans_blocks(struct inode *, int);
+extern int ext4_ext_get_blocks(handle_t *handle, struct inode *inode,
+			ext4_fsblk_t iblock,
+			unsigned long max_blocks, struct buffer_head *bh_result,
+			int create, int extend_disksize);
+extern void ext4_ext_truncate(struct inode *, struct page *);
+extern void ext4_ext_init(struct super_block *);
+extern void ext4_ext_release(struct super_block *);
+static inline int
+ext4_get_blocks_wrap(handle_t *handle, struct inode *inode, sector_t block,
+			unsigned long max_blocks, struct buffer_head *bh,
+			int create, int extend_disksize)
+{
+	if (EXT4_I(inode)->i_flags & EXT4_EXTENTS_FL)
+		return ext4_ext_get_blocks(handle, inode, block, max_blocks,
+					bh, create, extend_disksize);
+	return ext4_get_blocks_handle(handle, inode, block, max_blocks, bh,
+					create, extend_disksize);
+}
+
 
 #endif	/* __KERNEL__ */
 
diff --git a/include/linux/ext4_fs_extents.h b/include/linux/ext4_fs_extents.h
new file mode 100644
index 0000000..8029879
--- /dev/null
+++ b/include/linux/ext4_fs_extents.h
@@ -0,0 +1,196 @@
+/*
+ * Copyright (c) 2003-2006, Cluster File Systems, Inc, info@clusterfs.com
+ * Written by Alex Tomas <alex@clusterfs.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public Licens
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-
+ */
+
+#ifndef _LINUX_EXT4_EXTENTS
+#define _LINUX_EXT4_EXTENTS
+
+#include <linux/ext4_fs.h>
+
+/*
+ * with AGRESSIVE_TEST defined capacity of index/leaf blocks
+ * become very little, so index split, in-depth growing and
+ * other hard changes happens much more often
+ * this is for debug purposes only
+ */
+#define AGRESSIVE_TEST_
+
+/*
+ * with EXTENTS_STATS defined number of blocks and extents
+ * are collected in truncate path. they'll be showed at
+ * umount time
+ */
+#define EXTENTS_STATS__
+
+/*
+ * if CHECK_BINSEARCH defined, then results of binary search
+ * will be checked by linear search
+ */
+#define CHECK_BINSEARCH__
+
+/*
+ * if EXT_DEBUG is defined you can use 'extdebug' mount option
+ * to get lots of info what's going on
+ */
+#define EXT_DEBUG__
+#ifdef EXT_DEBUG
+#define ext_debug(a...)		printk(a)
+#else
+#define ext_debug(a...)
+#endif
+
+/*
+ * if EXT_STATS is defined then stats numbers are collected
+ * these number will be displayed at umount time
+ */
+#define EXT_STATS_
+
+
+/*
+ * ext4_inode has i_block array (60 bytes total)
+ * first 12 bytes store ext4_extent_header
+ * the remain stores array of ext4_extent
+ */
+
+/*
+ * this is extent on-disk structure
+ * it's used at the bottom of the tree
+ */
+struct ext4_extent {
+	__le32	ee_block;	/* first logical block extent covers */
+	__le16	ee_len;		/* number of blocks covered by extent */
+	__le16	ee_start_hi;	/* high 16 bits of physical block */
+	__le32	ee_start;	/* low 32 bigs of physical block */
+};
+
+/*
+ * this is index on-disk structure
+ * it's used at all the levels, but the bottom
+ */
+struct ext4_extent_idx {
+	__le32	ei_block;	/* index covers logical blocks from 'block' */
+	__le32	ei_leaf;	/* pointer to the physical block of the next *
+				 * level. leaf or next index could bet here */
+	__le16	ei_leaf_hi;	/* high 16 bits of physical block */
+	__u16	ei_unused;
+};
+
+/*
+ * each block (leaves and indexes), even inode-stored has header
+ */
+struct ext4_extent_header {
+	__le16	eh_magic;	/* probably will support different formats */
+	__le16	eh_entries;	/* number of valid entries */
+	__le16	eh_max;		/* capacity of store in entries */
+	__le16	eh_depth;	/* has tree real underlaying blocks? */
+	__le32	eh_generation;	/* generation of the tree */
+};
+
+#define EXT4_EXT_MAGIC		cpu_to_le16(0xf30a)
+
+/*
+ * array of ext4_ext_path contains path to some extent
+ * creation/lookup routines use it for traversal/splitting/etc
+ * truncate uses it to simulate recursive walking
+ */
+struct ext4_ext_path {
+	__u32				p_block;
+	__u16				p_depth;
+	struct ext4_extent		*p_ext;
+	struct ext4_extent_idx		*p_idx;
+	struct ext4_extent_header	*p_hdr;
+	struct buffer_head		*p_bh;
+};
+
+/*
+ * structure for external API
+ */
+
+#define EXT4_EXT_CACHE_NO	0
+#define EXT4_EXT_CACHE_GAP	1
+#define EXT4_EXT_CACHE_EXTENT	2
+
+/*
+ * to be called by ext4_ext_walk_space()
+ * negative retcode - error
+ * positive retcode - signal for ext4_ext_walk_space(), see below
+ * callback must return valid extent (passed or newly created)
+ */
+typedef int (*ext_prepare_callback)(struct inode *, struct ext4_ext_path *,
+					struct ext4_ext_cache *,
+					void *);
+
+#define EXT_CONTINUE	0
+#define EXT_BREAK	1
+#define EXT_REPEAT	2
+
+
+#define EXT_MAX_BLOCK	0xffffffff
+
+
+#define EXT_FIRST_EXTENT(__hdr__) \
+	((struct ext4_extent *) (((char *) (__hdr__)) +		\
+				 sizeof(struct ext4_extent_header)))
+#define EXT_FIRST_INDEX(__hdr__) \
+	((struct ext4_extent_idx *) (((char *) (__hdr__)) +	\
+				     sizeof(struct ext4_extent_header)))
+#define EXT_HAS_FREE_INDEX(__path__) \
+        (le16_to_cpu((__path__)->p_hdr->eh_entries) \
+	                             < le16_to_cpu((__path__)->p_hdr->eh_max))
+#define EXT_LAST_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_LAST_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_entries) - 1)
+#define EXT_MAX_EXTENT(__hdr__) \
+	(EXT_FIRST_EXTENT((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+#define EXT_MAX_INDEX(__hdr__) \
+	(EXT_FIRST_INDEX((__hdr__)) + le16_to_cpu((__hdr__)->eh_max) - 1)
+
+static inline struct ext4_extent_header *ext_inode_hdr(struct inode *inode)
+{
+	return (struct ext4_extent_header *) EXT4_I(inode)->i_data;
+}
+
+static inline struct ext4_extent_header *ext_block_hdr(struct buffer_head *bh)
+{
+	return (struct ext4_extent_header *) bh->b_data;
+}
+
+static inline unsigned short ext_depth(struct inode *inode)
+{
+	return le16_to_cpu(ext_inode_hdr(inode)->eh_depth);
+}
+
+static inline void ext4_ext_tree_changed(struct inode *inode)
+{
+	EXT4_I(inode)->i_ext_generation++;
+}
+
+static inline void
+ext4_ext_invalidate_cache(struct inode *inode)
+{
+	EXT4_I(inode)->i_cached_extent.ec_type = EXT4_EXT_CACHE_NO;
+}
+
+extern int ext4_extent_tree_init(handle_t *, struct inode *);
+extern int ext4_ext_calc_credits_for_insert(struct inode *, struct ext4_ext_path *);
+extern int ext4_ext_insert_extent(handle_t *, struct inode *, struct ext4_ext_path *, struct ext4_extent *);
+extern int ext4_ext_walk_space(struct inode *, unsigned long, unsigned long, ext_prepare_callback, void *);
+extern struct ext4_ext_path * ext4_ext_find_extent(struct inode *, int, struct ext4_ext_path *);
+
+#endif /* _LINUX_EXT4_EXTENTS */
+
diff --git a/include/linux/ext4_fs_i.h b/include/linux/ext4_fs_i.h
index 18a6ce9..40ce04a 100644
--- a/include/linux/ext4_fs_i.h
+++ b/include/linux/ext4_fs_i.h
@@ -65,6 +65,16 @@
 #define rsv_end rsv_window._rsv_end
 
 /*
+ * storage for cached extent
+ */
+struct ext4_ext_cache {
+	__u32	ec_start;
+	__u32	ec_block;
+	__u32	ec_len; /* must be 32bit to return holes */
+	__u32	ec_type;
+};
+
+/*
  * third extended file system inode data in memory
  */
 struct ext4_inode_info {
@@ -142,6 +152,9 @@
 	 */
 	struct mutex truncate_mutex;
 	struct inode vfs_inode;
+
+	unsigned long i_ext_generation;
+	struct ext4_ext_cache i_cached_extent;
 };
 
 #endif	/* _LINUX_EXT4_FS_I */
diff --git a/include/linux/ext4_fs_sb.h b/include/linux/ext4_fs_sb.h
index ce4856d..ce7a844 100644
--- a/include/linux/ext4_fs_sb.h
+++ b/include/linux/ext4_fs_sb.h
@@ -78,6 +78,16 @@
 	char *s_qf_names[MAXQUOTAS];		/* Names of quota files with journalled quota */
 	int s_jquota_fmt;			/* Format of quota to use */
 #endif
+
+#ifdef EXTENTS_STATS
+	/* ext4 extents stats */
+	unsigned long s_ext_min;
+	unsigned long s_ext_max;
+	unsigned long s_depth_max;
+	spinlock_t s_ext_stats_lock;
+	unsigned long s_ext_blocks;
+	unsigned long s_ext_extents;
+#endif
 };
 
 #endif	/* _LINUX_EXT4_FS_SB */
diff --git a/include/linux/ext4_jbd2.h b/include/linux/ext4_jbd2.h
index 99d3755..aa273f0 100644
--- a/include/linux/ext4_jbd2.h
+++ b/include/linux/ext4_jbd2.h
@@ -26,9 +26,14 @@
  *
  * We may have to touch one inode, one bitmap buffer, up to three
  * indirection blocks, the group and superblock summaries, and the data
- * block to complete the transaction.  */
+ * block to complete the transaction.
+ *
+ * For extents-enabled fs we may have to allocate and modify upto
+ * 5 levels of tree + root which is stored in inode. */
 
-#define EXT4_SINGLEDATA_TRANS_BLOCKS	8U
+#define EXT4_SINGLEDATA_TRANS_BLOCKS(sb)				\
+	(EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)	\
+		|| test_opt(sb, EXTENTS) ? 27U : 8U)
 
 /* Extended attribute operations touch at most two data buffers,
  * two bitmap buffers, and two group summaries, in addition to the inode
@@ -42,7 +47,7 @@
  * superblock only gets updated once, of course, so don't bother
  * counting that again for the quota updates. */
 
-#define EXT4_DATA_TRANS_BLOCKS(sb)	(EXT4_SINGLEDATA_TRANS_BLOCKS + \
+#define EXT4_DATA_TRANS_BLOCKS(sb)	(EXT4_SINGLEDATA_TRANS_BLOCKS(sb) + \
 					 EXT4_XATTR_TRANS_BLOCKS - 2 + \
 					 2*EXT4_QUOTA_TRANS_BLOCKS(sb))
 
@@ -78,9 +83,9 @@
 /* Amount of blocks needed for quota insert/delete - we do some block writes
  * but inode, sb and group updates are done only once */
 #define EXT4_QUOTA_INIT_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_INIT_ALLOC*\
-		(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_INIT_REWRITE) : 0)
+		(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_INIT_REWRITE) : 0)
 #define EXT4_QUOTA_DEL_BLOCKS(sb) (test_opt(sb, QUOTA) ? (DQUOT_DEL_ALLOC*\
-		(EXT4_SINGLEDATA_TRANS_BLOCKS-3)+3+DQUOT_DEL_REWRITE) : 0)
+		(EXT4_SINGLEDATA_TRANS_BLOCKS(sb)-3)+3+DQUOT_DEL_REWRITE) : 0)
 #else
 #define EXT4_QUOTA_TRANS_BLOCKS(sb) 0
 #define EXT4_QUOTA_INIT_BLOCKS(sb) 0