Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4

Pull ext4 updates for 3.4 from Ted Ts'o:
 "Ext4 commits for 3.3 merge window; mostly cleanups and bug fixes

  The changes to export dirty_writeback_interval are from Artem's s_dirt
  cleanup patch series.  The same is true of the change to remove the
  s_dirt helper functions which never got used by anyone in-tree.  I've
  run these changes by Al Viro, and am carrying them so that Artem can
  more easily fix up the rest of the file systems during the next merge
  window.  (Originally we had hoped to remove the use of s_dirt from
  ext4 during this merge window, but his patches had some bugs, so I
  ultimately ended up dropping them from the ext4 tree.)"

* tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (66 commits)
  vfs: remove unused superblock helpers
  mm: export dirty_writeback_interval
  ext4: remove useless s_dirt assignment
  ext4: write superblock only once on unmount
  ext4: do not mark superblock as dirty unnecessarily
  ext4: correct ext4_punch_hole return codes
  ext4: remove restrictive checks for EOFBLOCKS_FL
  ext4: always set the trimmed blocks count into len
  ext4: fix trimmed block count accounting
  ext4: fix start and len arguments handling in ext4_trim_fs()
  ext4: update s_free_{inodes,blocks}_count during online resize
  ext4: change some printk() calls to use ext4_msg() instead
  ext4: avoid output message interleaving in ext4_error_<foo>()
  ext4: remove trailing newlines from ext4_msg() and ext4_error() messages
  ext4: add no_printk argument validation, fix fallout
  ext4: remove redundant "EXT4-fs: " from uses of ext4_msg
  ext4: give more helpful error message in ext4_ext_rm_leaf()
  ext4: remove unused code from ext4_ext_map_blocks()
  ext4: rewrite punch hole to use ext4_ext_remove_space()
  jbd2: cleanup journal tail after transaction commit
  ...
diff --git a/Documentation/filesystems/ext4.txt b/Documentation/filesystems/ext4.txt
index 8c10bf3..1b7f9ac 100644
--- a/Documentation/filesystems/ext4.txt
+++ b/Documentation/filesystems/ext4.txt
@@ -144,9 +144,6 @@
 			mount the device. This will enable 'journal_checksum'
 			internally.
 
-journal=update		Update the ext4 file system's journal to the current
-			format.
-
 journal_dev=devnum	When the external journal device's major/minor numbers
 			have changed, this option allows the user to specify
 			the new journal location.  The journal device is
@@ -356,11 +353,6 @@
 			interoperability  with  older kernels which only
 			store and expect 16-bit values.
 
-resize			Allows to resize filesystem to the end of the last
-			existing block group, further resize has to be done
-			with resize2fs either online, or offline. It can be
-			used only with conjunction with remount.
-
 block_validity		These options enable/disable the in-kernel
 noblock_validity	facility for tracking filesystem metadata blocks
 			within internal data structures. This allows multi-
diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c
index f9e2cd8..4bbd07a 100644
--- a/fs/ext4/balloc.c
+++ b/fs/ext4/balloc.c
@@ -336,10 +336,10 @@
  * Return buffer_head on success or NULL in case of failure.
  */
 struct buffer_head *
-ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
 {
 	struct ext4_group_desc *desc;
-	struct buffer_head *bh = NULL;
+	struct buffer_head *bh;
 	ext4_fsblk_t bitmap_blk;
 
 	desc = ext4_get_group_desc(sb, block_group, NULL);
@@ -348,9 +348,9 @@
 	bitmap_blk = ext4_block_bitmap(sb, desc);
 	bh = sb_getblk(sb, bitmap_blk);
 	if (unlikely(!bh)) {
-		ext4_error(sb, "Cannot read block bitmap - "
-			    "block_group = %u, block_bitmap = %llu",
-			    block_group, bitmap_blk);
+		ext4_error(sb, "Cannot get buffer for block bitmap - "
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, bitmap_blk);
 		return NULL;
 	}
 
@@ -382,25 +382,50 @@
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
+	set_buffer_new(bh);
 	trace_ext4_read_block_bitmap_load(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
-		put_bh(bh);
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	return bh;
+}
+
+/* Returns 0 on success, 1 on error */
+int ext4_wait_block_bitmap(struct super_block *sb, ext4_group_t block_group,
+			   struct buffer_head *bh)
+{
+	struct ext4_group_desc *desc;
+
+	if (!buffer_new(bh))
+		return 0;
+	desc = ext4_get_group_desc(sb, block_group, NULL);
+	if (!desc)
+		return 1;
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		ext4_error(sb, "Cannot read block bitmap - "
-			    "block_group = %u, block_bitmap = %llu",
-			    block_group, bitmap_blk);
+			   "block_group = %u, block_bitmap = %llu",
+			   block_group, (unsigned long long) bh->b_blocknr);
+		return 1;
+	}
+	clear_buffer_new(bh);
+	/* Panic or remount fs read-only if block bitmap is invalid */
+	ext4_valid_block_bitmap(sb, desc, block_group, bh);
+	return 0;
+}
+
+struct buffer_head *
+ext4_read_block_bitmap(struct super_block *sb, ext4_group_t block_group)
+{
+	struct buffer_head *bh;
+
+	bh = ext4_read_block_bitmap_nowait(sb, block_group);
+	if (ext4_wait_block_bitmap(sb, block_group, bh)) {
+		put_bh(bh);
 		return NULL;
 	}
-	ext4_valid_block_bitmap(sb, desc, block_group, bh);
-	/*
-	 * file system mounted not to panic on error,
-	 * continue with corrupt bitmap
-	 */
 	return bh;
 }
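
The nowait/wait split above lets a caller start several bitmap reads and
overlap the I/O before blocking on any of them.  A minimal caller sketch
(NR and first_group are hypothetical caller-supplied values; error
handling is elided), which is essentially what the reworked mballoc
buddy-cache loader later in this series does:

	struct buffer_head *bh[NR];
	int i, err = 0;

	/* kick off all of the reads without blocking... */
	for (i = 0; i < NR; i++)
		bh[i] = ext4_read_block_bitmap_nowait(sb, first_group + i);

	/* ...then wait for each one to complete */
	for (i = 0; i < NR; i++)
		if (!bh[i] || ext4_wait_block_bitmap(sb, first_group + i, bh[i]))
			err = -EIO;	/* this group's bitmap is unusable */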
 
diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c
index 164c560..ad56866 100644
--- a/fs/ext4/dir.c
+++ b/fs/ext4/dir.c
@@ -91,17 +91,17 @@
 		return 0;
 
 	if (filp)
-		ext4_error_file(filp, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_file(filp, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 	else
-		ext4_error_inode(dir, function, line, bh ? bh->b_blocknr : 0,
+		ext4_error_inode(dir, function, line, bh->b_blocknr,
 				"bad entry in directory: %s - offset=%u(%u), "
 				"inode=%u, rec_len=%d, name_len=%d",
-				error_msg, (unsigned) (offset%bh->b_size),
+				error_msg, (unsigned) (offset % bh->b_size),
 				offset, le32_to_cpu(de->inode),
 				rlen, de->name_len);
 
@@ -425,8 +425,9 @@
 	sb = inode->i_sb;
 
 	if (!fname) {
-		printk(KERN_ERR "EXT4-fs: call_filldir: called with "
-		       "null fname?!?\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: comm %s: "
+			 "called with null fname?!?", __func__, __LINE__,
+			 inode->i_ino, current->comm);
 		return 0;
 	}
 	curr_pos = hash2pos(fname->hash, fname->minor_hash);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 513004f..ded731a 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -53,7 +53,7 @@
 		printk(KERN_DEBUG f, ## a);				\
 	} while (0)
 #else
-#define ext4_debug(f, a...)	do {} while (0)
+#define ext4_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 #define EXT4_ERROR_INODE(inode, fmt, a...) \
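
A note on the no_printk() form above: unlike the old empty do/while body,
no_printk() compiles to nothing but still runs printf-style format
checking, so a debug call with mismatched arguments is caught even when
EXT4FS_DEBUG is not defined.  A hypothetical call that is now type-checked
in all builds:

	ext4_debug("inode %lu: mapped %d blocks\n", inode->i_ino, ret);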
@@ -184,6 +184,8 @@
 #define	EXT4_IO_END_UNWRITTEN	0x0001
 #define EXT4_IO_END_ERROR	0x0002
 #define EXT4_IO_END_QUEUED	0x0004
+#define EXT4_IO_END_DIRECT	0x0008
+#define EXT4_IO_END_IN_FSYNC	0x0010
 
 struct ext4_io_page {
 	struct page	*p_page;
@@ -192,18 +194,25 @@
 
 #define MAX_IO_PAGES 128
 
+/*
+ * For converting uninitialized extents on a work queue.
+ *
+ * 'page' is only used from the writepage() path; 'pages' is only used for
+ * buffered writes; they are used to keep page references until conversion
+ * takes place.  For AIO/DIO, neither field is filled in.
+ */
 typedef struct ext4_io_end {
 	struct list_head	list;		/* per-file finished IO list */
 	struct inode		*inode;		/* file being written to */
 	unsigned int		flag;		/* unwritten or not */
-	struct page		*page;		/* page struct for buffer write */
+	struct page		*page;		/* for writepage() path */
 	loff_t			offset;		/* offset in the file */
 	ssize_t			size;		/* size of the extent */
 	struct work_struct	work;		/* data work queue */
 	struct kiocb		*iocb;		/* iocb struct for AIO */
 	int			result;		/* error value for AIO */
-	int			num_io_pages;
-	struct ext4_io_page	*pages[MAX_IO_PAGES];
+	int			num_io_pages;   /* for writepages() */
+	struct ext4_io_page	*pages[MAX_IO_PAGES]; /* for writepages() */
 } ext4_io_end_t;
 
 struct ext4_io_submit {
@@ -923,6 +932,7 @@
 #define EXT4_MOUNT_ERRORS_CONT		0x00010	/* Continue on errors */
 #define EXT4_MOUNT_ERRORS_RO		0x00020	/* Remount fs ro on errors */
 #define EXT4_MOUNT_ERRORS_PANIC		0x00040	/* Panic on errors */
+#define EXT4_MOUNT_ERRORS_MASK		0x00070
 #define EXT4_MOUNT_MINIX_DF		0x00080	/* Mimics the Minix statfs */
 #define EXT4_MOUNT_NOLOAD		0x00100	/* Don't use existing journal*/
 #define EXT4_MOUNT_DATA_FLAGS		0x00C00	/* Mode for data writes: */
@@ -941,7 +951,6 @@
 #define EXT4_MOUNT_DIOREAD_NOLOCK	0x400000 /* Enable support for dio read nolocking */
 #define EXT4_MOUNT_JOURNAL_CHECKSUM	0x800000 /* Journal checksums */
 #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT	0x1000000 /* Journal Async Commit */
-#define EXT4_MOUNT_I_VERSION            0x2000000 /* i_version support */
 #define EXT4_MOUNT_MBLK_IO_SUBMIT	0x4000000 /* multi-block io submits */
 #define EXT4_MOUNT_DELALLOC		0x8000000 /* Delalloc support */
 #define EXT4_MOUNT_DATA_ERR_ABORT	0x10000000 /* Abort on file data write */
@@ -1142,6 +1151,7 @@
 	unsigned int s_mount_opt;
 	unsigned int s_mount_opt2;
 	unsigned int s_mount_flags;
+	unsigned int s_def_mount_opt;
 	ext4_fsblk_t s_sb_block;
 	uid_t s_resuid;
 	gid_t s_resgid;
@@ -1420,8 +1430,9 @@
 #define EXT4_FEATURE_INCOMPAT_FLEX_BG		0x0200
 #define EXT4_FEATURE_INCOMPAT_EA_INODE		0x0400 /* EA in inode */
 #define EXT4_FEATURE_INCOMPAT_DIRDATA		0x1000 /* data in dirent */
-#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x2000 /* data in inode */
+#define EXT4_FEATURE_INCOMPAT_BG_USE_META_CSUM	0x2000 /* use crc32c for bg */
 #define EXT4_FEATURE_INCOMPAT_LARGEDIR		0x4000 /* >2GB or 3-lvl htree */
+#define EXT4_FEATURE_INCOMPAT_INLINEDATA	0x8000 /* data in inode */
 
 #define EXT2_FEATURE_COMPAT_SUPP	EXT4_FEATURE_COMPAT_EXT_ATTR
 #define EXT2_FEATURE_INCOMPAT_SUPP	(EXT4_FEATURE_INCOMPAT_FILETYPE| \
@@ -1794,8 +1805,14 @@
 						    ext4_group_t block_group,
 						    struct buffer_head ** bh);
 extern int ext4_should_retry_alloc(struct super_block *sb, int *retries);
-struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
-				      ext4_group_t block_group);
+
+extern struct buffer_head *ext4_read_block_bitmap_nowait(struct super_block *sb,
+						ext4_group_t block_group);
+extern int ext4_wait_block_bitmap(struct super_block *sb,
+				  ext4_group_t block_group,
+				  struct buffer_head *bh);
+extern struct buffer_head *ext4_read_block_bitmap(struct super_block *sb,
+						  ext4_group_t block_group);
 extern void ext4_init_block_bitmap(struct super_block *sb,
 				   struct buffer_head *bh,
 				   ext4_group_t group,
@@ -1841,6 +1858,7 @@
 extern void ext4_mark_bitmap_end(int start_bit, int end_bit, char *bitmap);
 extern int ext4_init_inode_table(struct super_block *sb,
 				 ext4_group_t group, int barrier);
+extern void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate);
 
 /* mballoc.c */
 extern long ext4_mb_stats;
diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h
index a52db3a..0f58b86 100644
--- a/fs/ext4/ext4_extents.h
+++ b/fs/ext4/ext4_extents.h
@@ -47,9 +47,9 @@
  */
 #define EXT_DEBUG__
 #ifdef EXT_DEBUG
-#define ext_debug(a...)		printk(a)
+#define ext_debug(fmt, ...)	printk(fmt, ##__VA_ARGS__)
 #else
-#define ext_debug(a...)
+#define ext_debug(fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 /*
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 5802fa1..83b20fc 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -104,6 +104,78 @@
 #define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb))
 #define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb))
 
+/**
+ *   struct ext4_journal_cb_entry - Base structure for callback information.
+ *
+ *   This struct is a 'seed' structure for use with your own callback
+ *   structs. If you are using callbacks you must allocate one of these,
+ *   or another struct of your own definition which has this struct
+ *   as its first element, and pass it to ext4_journal_callback_add().
+ */
+struct ext4_journal_cb_entry {
+	/* list information for other callbacks attached to the same handle */
+	struct list_head jce_list;
+
+	/*  Function to call with this callback structure */
+	void (*jce_func)(struct super_block *sb,
+			 struct ext4_journal_cb_entry *jce, int error);
+
+	/* user data goes here */
+};
+
+/**
+ * ext4_journal_callback_add: add a function to call after transaction commit
+ * @handle: active journal transaction handle to register callback on
+ * @func: callback function to call after the transaction has committed:
+ *        @sb: superblock of current filesystem for transaction
+ *        @jce: returned journal callback data
+ *        @rc: journal state at commit (0 = transaction committed properly)
+ * @jce: journal callback data (internal and function private data struct)
+ *
+ * The registered function will be called in the context of the journal thread
+ * after the transaction for which the handle was created has completed.
+ *
+ * No locks are held when the callback function is called, so it is safe to
+ * call blocking functions from within the callback, but the callback should
+ * not block or run for too long, or the filesystem will be blocked waiting for
+ * the next transaction to commit. No journaling functions can be used, or
+ * there is a risk of deadlock.
+ *
+ * There is no guaranteed calling order of multiple registered callbacks on
+ * the same transaction.
+ */
+static inline void ext4_journal_callback_add(handle_t *handle,
+			void (*func)(struct super_block *sb,
+				     struct ext4_journal_cb_entry *jce,
+				     int rc),
+			struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	/* Add the jce to transaction's private list */
+	jce->jce_func = func;
+	spin_lock(&sbi->s_md_lock);
+	list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
+/**
+ * ext4_journal_callback_del: delete a registered callback
+ * @handle: active journal transaction handle on which callback was registered
+ * @jce: registered journal callback entry to unregister
+ */
+static inline void ext4_journal_callback_del(handle_t *handle,
+					     struct ext4_journal_cb_entry *jce)
+{
+	struct ext4_sb_info *sbi =
+			EXT4_SB(handle->h_transaction->t_journal->j_private);
+
+	spin_lock(&sbi->s_md_lock);
+	list_del_init(&jce->jce_list);
+	spin_unlock(&sbi->s_md_lock);
+}
+
 int
 ext4_mark_iloc_dirty(handle_t *handle,
 		     struct inode *inode,
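
A hypothetical user of this API (the names below are invented for
illustration) embeds the entry as the first member, so the commit
callback can recover its private data with a plain cast; the mballoc
conversion later in this patch does exactly this with struct
ext4_free_data:

	struct my_commit_data {
		struct ext4_journal_cb_entry jce;	/* must stay first */
		unsigned long cookie;			/* private data follows */
	};

	static void my_commit_done(struct super_block *sb,
				   struct ext4_journal_cb_entry *jce, int rc)
	{
		/* valid only because jce is the first member */
		struct my_commit_data *data = (struct my_commit_data *)jce;

		/* rc == 0 means the transaction committed cleanly */
		pr_debug("commit done: cookie=%lu rc=%d\n", data->cookie, rc);
	}

	/* registration, while the handle is still active: */
	ext4_journal_callback_add(handle, my_commit_done, &data->jce);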
@@ -261,43 +333,45 @@
 /* super.c */
 int ext4_force_commit(struct super_block *sb);
 
-static inline int ext4_should_journal_data(struct inode *inode)
+/*
+ * Ext4 inode journal modes
+ */
+#define EXT4_INODE_JOURNAL_DATA_MODE	0x01 /* journal data mode */
+#define EXT4_INODE_ORDERED_DATA_MODE	0x02 /* ordered data mode */
+#define EXT4_INODE_WRITEBACK_DATA_MODE	0x04 /* writeback data mode */
+
+static inline int ext4_inode_journal_mode(struct inode *inode)
 {
 	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 1;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		return 1;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 1;
-	return 0;
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	/* We do not support data journalling with delayed allocation */
+	if (!S_ISREG(inode->i_mode) ||
+	    test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
+	    !test_opt(inode->i_sb, DELALLOC))
+		return EXT4_INODE_JOURNAL_DATA_MODE;	/* journal data */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+		return EXT4_INODE_ORDERED_DATA_MODE;	/* ordered */
+	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+		return EXT4_INODE_WRITEBACK_DATA_MODE;	/* writeback */
+	else
+		BUG();
+}
+
+static inline int ext4_should_journal_data(struct inode *inode)
+{
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_JOURNAL_DATA_MODE;
 }
 
 static inline int ext4_should_order_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 0;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_ORDERED_DATA_MODE;
 }
 
 static inline int ext4_should_writeback_data(struct inode *inode)
 {
-	if (EXT4_JOURNAL(inode) == NULL)
-		return 1;
-	if (!S_ISREG(inode->i_mode))
-		return 0;
-	if (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
-		return 0;
-	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		return 1;
-	return 0;
+	return ext4_inode_journal_mode(inode) & EXT4_INODE_WRITEBACK_DATA_MODE;
 }
 
 /*
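
Because the three mode values are distinct bits, each ext4_should_*()
helper above collapses to a single AND, and a caller that needs the exact
mode can switch on the return value, as the reworked ext4_set_aops() and
the delalloc write_end fallback later in this patch do.  A minimal sketch:

	switch (ext4_inode_journal_mode(inode)) {
	case EXT4_INODE_ORDERED_DATA_MODE:
		/* data=ordered: data is flushed before metadata commits */
		break;
	case EXT4_INODE_WRITEBACK_DATA_MODE:
		/* data=writeback: no data/metadata ordering guarantee */
		break;
	case EXT4_INODE_JOURNAL_DATA_MODE:
		/* data=journal: file data goes through the journal too */
		break;
	default:
		BUG();	/* exactly one mode bit is ever returned */
	}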
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 74f23c2..1421938 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -44,6 +44,14 @@
 
 #include <trace/events/ext4.h>
 
+/*
+ * used by extent splitting.
+ */
+#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
+					due to ENOSPC */
+#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
+#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
+
 static int ext4_split_extent(handle_t *handle,
 				struct inode *inode,
 				struct ext4_ext_path *path,
@@ -51,6 +59,13 @@
 				int split_flag,
 				int flags);
 
+static int ext4_split_extent_at(handle_t *handle,
+			     struct inode *inode,
+			     struct ext4_ext_path *path,
+			     ext4_lblk_t split,
+			     int split_flag,
+			     int flags);
+
 static int ext4_ext_truncate_extend_restart(handle_t *handle,
 					    struct inode *inode,
 					    int needed)
@@ -300,6 +315,8 @@
 	ext4_fsblk_t block = ext4_ext_pblock(ext);
 	int len = ext4_ext_get_actual_len(ext);
 
+	if (len == 0)
+		return 0;
 	return ext4_data_block_valid(EXT4_SB(inode->i_sb), block, len);
 }
 
@@ -2308,7 +2325,7 @@
 	struct ext4_extent *ex;
 
 	/* the header must be checked already in ext4_ext_remove_space() */
-	ext_debug("truncate since %u in leaf\n", start);
+	ext_debug("truncate since %u in leaf to %u\n", start, end);
 	if (!path[depth].p_hdr)
 		path[depth].p_hdr = ext_block_hdr(path[depth].p_bh);
 	eh = path[depth].p_hdr;
@@ -2343,14 +2360,17 @@
 		ext_debug("  border %u:%u\n", a, b);
 
 		/* If this extent is beyond the end of the hole, skip it */
-		if (end <= ex_ee_block) {
+		if (end < ex_ee_block) {
 			ex--;
 			ex_ee_block = le32_to_cpu(ex->ee_block);
 			ex_ee_len = ext4_ext_get_actual_len(ex);
 			continue;
 		} else if (b != ex_ee_block + ex_ee_len - 1) {
-			EXT4_ERROR_INODE(inode,"  bad truncate %u:%u\n",
-					 start, end);
+			EXT4_ERROR_INODE(inode,
+					 "can not handle truncate %u:%u "
+					 "on extent %u:%u",
+					 start, end, ex_ee_block,
+					 ex_ee_block + ex_ee_len - 1);
 			err = -EIO;
 			goto out;
 		} else if (a != ex_ee_block) {
@@ -2482,7 +2502,8 @@
 	return 1;
 }
 
-static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start)
+static int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
+				 ext4_lblk_t end)
 {
 	struct super_block *sb = inode->i_sb;
 	int depth = ext_depth(inode);
@@ -2491,7 +2512,7 @@
 	handle_t *handle;
 	int i, err;
 
-	ext_debug("truncate since %u\n", start);
+	ext_debug("truncate since %u to %u\n", start, end);
 
 	/* probably first extent we're gonna free will be last in block */
 	handle = ext4_journal_start(inode, depth + 1);
@@ -2504,6 +2525,61 @@
 	trace_ext4_ext_remove_space(inode, start, depth);
 
 	/*
+	 * Check if we are removing extents inside the extent tree. If that
+	 * is the case, we are going to punch a hole inside the extent tree,
+	 * so we have to check whether we need to split the extent covering
+	 * the last block to remove, so that the part of it can easily be
+	 * removed in ext4_ext_rm_leaf().
+	 */
+	if (end < EXT_MAX_BLOCKS - 1) {
+		struct ext4_extent *ex;
+		ext4_lblk_t ee_block;
+
+		/* find extent for this block */
+		path = ext4_ext_find_extent(inode, end, NULL);
+		if (IS_ERR(path)) {
+			ext4_journal_stop(handle);
+			return PTR_ERR(path);
+		}
+		depth = ext_depth(inode);
+		ex = path[depth].p_ext;
+		if (!ex)
+			goto cont;
+
+		ee_block = le32_to_cpu(ex->ee_block);
+
+		/*
+		 * See if the last block is inside the extent; if so, split
+		 * the extent at 'end' block so we can easily remove the
+		 * tail of the first part of the split extent in
+		 * ext4_ext_rm_leaf().
+		 */
+		if (end >= ee_block &&
+		    end < ee_block + ext4_ext_get_actual_len(ex) - 1) {
+			int split_flag = 0;
+
+			if (ext4_ext_is_uninitialized(ex))
+				split_flag = EXT4_EXT_MARK_UNINIT1 |
+					     EXT4_EXT_MARK_UNINIT2;
+
+			/*
+			 * Split the extent in two so that 'end' is the last
+			 * block in the first new extent
+			 */
+			err = ext4_split_extent_at(handle, inode, path,
+						end + 1, split_flag,
+						EXT4_GET_BLOCKS_PRE_IO |
+						EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+
+			if (err < 0)
+				goto out;
+		}
+		ext4_ext_drop_refs(path);
+		kfree(path);
+	}
+cont:
+
+	/*
 	 * We start scanning from right side, freeing all the blocks
 	 * after i_size and walking into the tree depth-wise.
 	 */
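
For example, punching with end = 105 when the covering extent spans
logical blocks 100-119 splits the extent at block 106, leaving one extent
for blocks 100-105 and another for blocks 106-119, so ext4_ext_rm_leaf()
can remove the tail of the first part without touching anything beyond
the hole.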
@@ -2515,6 +2591,7 @@
 	}
 	path[0].p_depth = depth;
 	path[0].p_hdr = ext_inode_hdr(inode);
+
 	if (ext4_ext_check(inode, path[0].p_hdr, depth)) {
 		err = -EIO;
 		goto out;
@@ -2526,7 +2603,7 @@
 			/* this is leaf block */
 			err = ext4_ext_rm_leaf(handle, inode, path,
 					       &partial_cluster, start,
-					       EXT_MAX_BLOCKS - 1);
+					       end);
 			/* root level has p_bh == NULL, brelse() eats this */
 			brelse(path[i].p_bh);
 			path[i].p_bh = NULL;
@@ -2651,17 +2728,17 @@
 
 	if (EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_EXTENTS)) {
 #if defined(AGGRESSIVE_TEST) || defined(CHECK_BINSEARCH) || defined(EXTENTS_STATS)
-		printk(KERN_INFO "EXT4-fs: file extents enabled");
+		printk(KERN_INFO "EXT4-fs: file extents enabled"
 #ifdef AGGRESSIVE_TEST
-		printk(", aggressive tests");
+		       ", aggressive tests"
 #endif
 #ifdef CHECK_BINSEARCH
-		printk(", check binsearch");
+		       ", check binsearch"
 #endif
 #ifdef EXTENTS_STATS
-		printk(", stats");
+		       ", stats"
 #endif
-		printk("\n");
+		       "\n");
 #endif
 #ifdef EXTENTS_STATS
 		spin_lock_init(&EXT4_SB(sb)->s_ext_stats_lock);
@@ -2709,14 +2786,6 @@
 }
 
 /*
- * used by extent splitting.
- */
-#define EXT4_EXT_MAY_ZEROOUT	0x1  /* safe to zeroout if split fails \
-					due to ENOSPC */
-#define EXT4_EXT_MARK_UNINIT1	0x2  /* mark first half uninitialized */
-#define EXT4_EXT_MARK_UNINIT2	0x4  /* mark second half uninitialized */
-
-/*
  * ext4_split_extent_at() splits an extent at given block.
  *
  * @handle: the journal handle
@@ -3224,11 +3293,13 @@
 	depth = ext_depth(inode);
 	eh = path[depth].p_hdr;
 
-	if (unlikely(!eh->eh_entries)) {
-		EXT4_ERROR_INODE(inode, "eh->eh_entries == 0 and "
-				 "EOFBLOCKS_FL set");
-		return -EIO;
-	}
+	/*
+	 * We're going to remove EOFBLOCKS_FL entirely in future so we
+	 * do not care for this case anymore. Simply remove the flag
+	 * if there are no extents.
+	 */
+	if (unlikely(!eh->eh_entries))
+		goto out;
 	last_ex = EXT_LAST_EXTENT(eh);
 	/*
 	 * We should clear the EOFBLOCKS_FL flag if we are writing the
@@ -3252,6 +3323,7 @@
 	for (i = depth-1; i >= 0; i--)
 		if (path[i].p_idx != EXT_LAST_INDEX(path[i].p_hdr))
 			return 0;
+out:
 	ext4_clear_inode_flag(inode, EXT4_INODE_EOFBLOCKS);
 	return ext4_mark_inode_dirty(handle, inode);
 }
@@ -3710,8 +3782,6 @@
 	int free_on_err = 0, err = 0, depth, ret;
 	unsigned int allocated = 0, offset = 0;
 	unsigned int allocated_clusters = 0;
-	unsigned int punched_out = 0;
-	unsigned int result = 0;
 	struct ext4_allocation_request ar;
 	ext4_io_end_t *io = EXT4_I(inode)->cur_aio_dio;
 	ext4_lblk_t cluster_offset;
@@ -3721,8 +3791,7 @@
 	trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
 
 	/* check in cache */
-	if (!(flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) &&
-		ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
+	if (ext4_ext_in_cache(inode, map->m_lblk, &newex)) {
 		if (!newex.ee_start_lo && !newex.ee_start_hi) {
 			if ((sbi->s_cluster_ratio > 1) &&
 			    ext4_find_delalloc_cluster(inode, map->m_lblk, 0))
@@ -3790,113 +3859,25 @@
 
 		/* if found extent covers block, simply return it */
 		if (in_range(map->m_lblk, ee_block, ee_len)) {
-			struct ext4_map_blocks punch_map;
-			ext4_fsblk_t partial_cluster = 0;
-
 			newblock = map->m_lblk - ee_block + ee_start;
 			/* number of remaining blocks in the extent */
 			allocated = ee_len - (map->m_lblk - ee_block);
 			ext_debug("%u fit into %u:%d -> %llu\n", map->m_lblk,
 				  ee_block, ee_len, newblock);
 
-			if ((flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) == 0) {
-				/*
-				 * Do not put uninitialized extent
-				 * in the cache
-				 */
-				if (!ext4_ext_is_uninitialized(ex)) {
-					ext4_ext_put_in_cache(inode, ee_block,
-						ee_len, ee_start);
-					goto out;
-				}
-				ret = ext4_ext_handle_uninitialized_extents(
-					handle, inode, map, path, flags,
-					allocated, newblock);
-				return ret;
-			}
-
 			/*
-			 * Punch out the map length, but only to the
-			 * end of the extent
+			 * Do not put uninitialized extent
+			 * in the cache
 			 */
-			punched_out = allocated < map->m_len ?
-				allocated : map->m_len;
-
-			/*
-			 * Sense extents need to be converted to
-			 * uninitialized, they must fit in an
-			 * uninitialized extent
-			 */
-			if (punched_out > EXT_UNINIT_MAX_LEN)
-				punched_out = EXT_UNINIT_MAX_LEN;
-
-			punch_map.m_lblk = map->m_lblk;
-			punch_map.m_pblk = newblock;
-			punch_map.m_len = punched_out;
-			punch_map.m_flags = 0;
-
-			/* Check to see if the extent needs to be split */
-			if (punch_map.m_len != ee_len ||
-				punch_map.m_lblk != ee_block) {
-
-				ret = ext4_split_extent(handle, inode,
-				path, &punch_map, 0,
-				EXT4_GET_BLOCKS_PUNCH_OUT_EXT |
-				EXT4_GET_BLOCKS_PRE_IO);
-
-				if (ret < 0) {
-					err = ret;
-					goto out2;
-				}
-				/*
-				 * find extent for the block at
-				 * the start of the hole
-				 */
-				ext4_ext_drop_refs(path);
-				kfree(path);
-
-				path = ext4_ext_find_extent(inode,
-				map->m_lblk, NULL);
-				if (IS_ERR(path)) {
-					err = PTR_ERR(path);
-					path = NULL;
-					goto out2;
-				}
-
-				depth = ext_depth(inode);
-				ex = path[depth].p_ext;
-				ee_len = ext4_ext_get_actual_len(ex);
-				ee_block = le32_to_cpu(ex->ee_block);
-				ee_start = ext4_ext_pblock(ex);
-
+			if (!ext4_ext_is_uninitialized(ex)) {
+				ext4_ext_put_in_cache(inode, ee_block,
+					ee_len, ee_start);
+				goto out;
 			}
-
-			ext4_ext_mark_uninitialized(ex);
-
-			ext4_ext_invalidate_cache(inode);
-
-			err = ext4_ext_rm_leaf(handle, inode, path,
-					       &partial_cluster, map->m_lblk,
-					       map->m_lblk + punched_out);
-
-			if (!err && path->p_hdr->eh_entries == 0) {
-				/*
-				 * Punch hole freed all of this sub tree,
-				 * so we need to correct eh_depth
-				 */
-				err = ext4_ext_get_access(handle, inode, path);
-				if (err == 0) {
-					ext_inode_hdr(inode)->eh_depth = 0;
-					ext_inode_hdr(inode)->eh_max =
-					cpu_to_le16(ext4_ext_space_root(
-						inode, 0));
-
-					err = ext4_ext_dirty(
-						handle, inode, path);
-				}
-			}
-
-			goto out2;
+			ret = ext4_ext_handle_uninitialized_extents(
+				handle, inode, map, path, flags,
+				allocated, newblock);
+			return ret;
 		}
 	}
 
@@ -4165,13 +4146,11 @@
 		ext4_ext_drop_refs(path);
 		kfree(path);
 	}
-	result = (flags & EXT4_GET_BLOCKS_PUNCH_OUT_EXT) ?
-			punched_out : allocated;
 
 	trace_ext4_ext_map_blocks_exit(inode, map->m_lblk,
-		newblock, map->m_len, err ? err : result);
+		newblock, map->m_len, err ? err : allocated);
 
-	return err ? err : result;
+	return err ? err : allocated;
 }
 
 void ext4_ext_truncate(struct inode *inode)
@@ -4228,7 +4207,7 @@
 
 	last_block = (inode->i_size + sb->s_blocksize - 1)
 			>> EXT4_BLOCK_SIZE_BITS(sb);
-	err = ext4_ext_remove_space(inode, last_block);
+	err = ext4_ext_remove_space(inode, last_block, EXT_MAX_BLOCKS - 1);
 
 	/* In a multi-transaction truncate, we only make the final
 	 * transaction synchronous.
@@ -4436,10 +4415,11 @@
 				      EXT4_GET_BLOCKS_IO_CONVERT_EXT);
 		if (ret <= 0) {
 			WARN_ON(ret <= 0);
-			printk(KERN_ERR "%s: ext4_ext_map_blocks "
-				    "returned error inode#%lu, block=%u, "
-				    "max_blocks=%u", __func__,
-				    inode->i_ino, map.m_lblk, map.m_len);
+			ext4_msg(inode->i_sb, KERN_ERR,
+				 "%s:%d: inode #%lu: block %u: len %u: "
+				 "ext4_ext_map_blocks returned %d",
+				 __func__, __LINE__, inode->i_ino, map.m_lblk,
+				 map.m_len, ret);
 		}
 		ext4_mark_inode_dirty(handle, inode);
 		ret2 = ext4_journal_stop(handle);
@@ -4705,14 +4685,12 @@
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	struct super_block *sb = inode->i_sb;
-	struct ext4_ext_cache cache_ex;
-	ext4_lblk_t first_block, last_block, num_blocks, iblock, max_blocks;
+	ext4_lblk_t first_block, stop_block;
 	struct address_space *mapping = inode->i_mapping;
-	struct ext4_map_blocks map;
 	handle_t *handle;
 	loff_t first_page, last_page, page_len;
 	loff_t first_page_offset, last_page_offset;
-	int ret, credits, blocks_released, err = 0;
+	int credits, err = 0;
 
 	/* No need to punch hole beyond i_size */
 	if (offset >= inode->i_size)
@@ -4728,10 +4706,6 @@
 		   offset;
 	}
 
-	first_block = (offset + sb->s_blocksize - 1) >>
-		EXT4_BLOCK_SIZE_BITS(sb);
-	last_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
-
 	first_page = (offset + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
 	last_page = (offset + length) >> PAGE_CACHE_SHIFT;
 
@@ -4810,7 +4784,6 @@
 		}
 	}
 
-
 	/*
 	 * If i_size is contained in the last page, we need to
 	 * unmap and zero the partial page after i_size
@@ -4830,73 +4803,22 @@
 		}
 	}
 
+	first_block = (offset + sb->s_blocksize - 1) >>
+		EXT4_BLOCK_SIZE_BITS(sb);
+	stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb);
+
 	/* If there are no blocks to remove, return now */
-	if (first_block >= last_block)
+	if (first_block >= stop_block)
 		goto out;
 
 	down_write(&EXT4_I(inode)->i_data_sem);
 	ext4_ext_invalidate_cache(inode);
 	ext4_discard_preallocations(inode);
 
-	/*
-	 * Loop over all the blocks and identify blocks
-	 * that need to be punched out
-	 */
-	iblock = first_block;
-	blocks_released = 0;
-	while (iblock < last_block) {
-		max_blocks = last_block - iblock;
-		num_blocks = 1;
-		memset(&map, 0, sizeof(map));
-		map.m_lblk = iblock;
-		map.m_len = max_blocks;
-		ret = ext4_ext_map_blocks(handle, inode, &map,
-			EXT4_GET_BLOCKS_PUNCH_OUT_EXT);
+	err = ext4_ext_remove_space(inode, first_block, stop_block - 1);
 
-		if (ret > 0) {
-			blocks_released += ret;
-			num_blocks = ret;
-		} else if (ret == 0) {
-			/*
-			 * If map blocks could not find the block,
-			 * then it is in a hole.  If the hole was
-			 * not already cached, then map blocks should
-			 * put it in the cache.  So we can get the hole
-			 * out of the cache
-			 */
-			memset(&cache_ex, 0, sizeof(cache_ex));
-			if ((ext4_ext_check_cache(inode, iblock, &cache_ex)) &&
-				!cache_ex.ec_start) {
-
-				/* The hole is cached */
-				num_blocks = cache_ex.ec_block +
-				cache_ex.ec_len - iblock;
-
-			} else {
-				/* The block could not be identified */
-				err = -EIO;
-				break;
-			}
-		} else {
-			/* Map blocks error */
-			err = ret;
-			break;
-		}
-
-		if (num_blocks == 0) {
-			/* This condition should never happen */
-			ext_debug("Block lookup failed");
-			err = -EIO;
-			break;
-		}
-
-		iblock += num_blocks;
-	}
-
-	if (blocks_released > 0) {
-		ext4_ext_invalidate_cache(inode);
-		ext4_discard_preallocations(inode);
-	}
+	ext4_ext_invalidate_cache(inode);
+	ext4_discard_preallocations(inode);
 
 	if (IS_SYNC(inode))
 		ext4_handle_sync(handle);
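
To make the block-range conversion concrete: with 4KiB blocks, punching
offset 8192, length 16384 yields first_block = (8192 + 4095) >> 12 = 2
and stop_block = 24576 >> 12 = 6, so the whole hole is handled by the
single call ext4_ext_remove_space(inode, 2, 5); truncate funnels into the
same helper with an end of EXT_MAX_BLOCKS - 1.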
diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c
index 00a2cb7..bb6c7d8 100644
--- a/fs/ext4/fsync.c
+++ b/fs/ext4/fsync.c
@@ -89,6 +89,7 @@
 		io = list_entry(ei->i_completed_io_list.next,
 				ext4_io_end_t, list);
 		list_del_init(&io->list);
+		io->flag |= EXT4_IO_END_IN_FSYNC;
 		/*
 		 * Calling ext4_end_io_nolock() to convert completed
 		 * IO to written.
@@ -108,6 +109,7 @@
 		if (ret < 0)
 			ret2 = ret;
 		spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+		io->flag &= ~EXT4_IO_END_IN_FSYNC;
 	}
 	spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 	return (ret2 < 0) ? ret2 : 0;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 25d8c97..409c2ee 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -92,6 +92,16 @@
 	return EXT4_INODES_PER_GROUP(sb);
 }
 
+void ext4_end_bitmap_read(struct buffer_head *bh, int uptodate)
+{
+	if (uptodate) {
+		set_buffer_uptodate(bh);
+		set_bitmap_uptodate(bh);
+	}
+	unlock_buffer(bh);
+	put_bh(bh);
+}
+
 /*
  * Read the inode allocation bitmap for a given block_group, reading
  * into the specified slot in the superblock's bitmap cache.
@@ -147,18 +157,18 @@
 		return bh;
 	}
 	/*
-	 * submit the buffer_head for read. We can
-	 * safely mark the bitmap as uptodate now.
-	 * We do it here so the bitmap uptodate bit
-	 * get set with buffer lock held.
+	 * submit the buffer_head for reading
 	 */
 	trace_ext4_load_inode_bitmap(sb, block_group);
-	set_bitmap_uptodate(bh);
-	if (bh_submit_read(bh) < 0) {
+	bh->b_end_io = ext4_end_bitmap_read;
+	get_bh(bh);
+	submit_bh(READ, bh);
+	wait_on_buffer(bh);
+	if (!buffer_uptodate(bh)) {
 		put_bh(bh);
 		ext4_error(sb, "Cannot read inode bitmap - "
-			    "block_group = %u, inode_bitmap = %llu",
-			    block_group, bitmap_blk);
+			   "block_group = %u, inode_bitmap = %llu",
+			   block_group, bitmap_blk);
 		return NULL;
 	}
 	return bh;
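
Note the get_bh() before submit_bh() here and in the balloc.c
counterpart: ext4_end_bitmap_read() drops one reference with put_bh()
when the I/O completes, so the extra reference keeps the buffer_head
pinned until wait_on_buffer() returns.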
@@ -194,19 +204,20 @@
 	struct ext4_sb_info *sbi;
 	int fatal = 0, err, count, cleared;
 
+	if (!sb) {
+		printk(KERN_ERR "EXT4-fs: %s:%d: inode on "
+		       "nonexistent device\n", __func__, __LINE__);
+		return;
+	}
 	if (atomic_read(&inode->i_count) > 1) {
-		printk(KERN_ERR "ext4_free_inode: inode has count=%d\n",
-		       atomic_read(&inode->i_count));
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: count=%d",
+			 __func__, __LINE__, inode->i_ino,
+			 atomic_read(&inode->i_count));
 		return;
 	}
 	if (inode->i_nlink) {
-		printk(KERN_ERR "ext4_free_inode: inode has nlink=%d\n",
-		       inode->i_nlink);
-		return;
-	}
-	if (!sb) {
-		printk(KERN_ERR "ext4_free_inode: inode on "
-		       "nonexistent device\n");
+		ext4_msg(sb, KERN_ERR, "%s:%d: inode #%lu: nlink=%d\n",
+			 __func__, __LINE__, inode->i_ino, inode->i_nlink);
 		return;
 	}
 	sbi = EXT4_SB(sb);
@@ -593,94 +604,6 @@
 }
 
 /*
- * claim the inode from the inode bitmap. If the group
- * is uninit we need to take the groups's ext4_group_lock
- * and clear the uninit flag. The inode bitmap update
- * and group desc uninit flag clear should be done
- * after holding ext4_group_lock so that ext4_read_inode_bitmap
- * doesn't race with the ext4_claim_inode
- */
-static int ext4_claim_inode(struct super_block *sb,
-			struct buffer_head *inode_bitmap_bh,
-			unsigned long ino, ext4_group_t group, umode_t mode)
-{
-	int free = 0, retval = 0, count;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_group_info *grp = ext4_get_group_info(sb, group);
-	struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
-
-	/*
-	 * We have to be sure that new inode allocation does not race with
-	 * inode table initialization, because otherwise we may end up
-	 * allocating and writing new inode right before sb_issue_zeroout
-	 * takes place and overwriting our new inode with zeroes. So we
-	 * take alloc_sem to prevent it.
-	 */
-	down_read(&grp->alloc_sem);
-	ext4_lock_group(sb, group);
-	if (ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data)) {
-		/* not a free inode */
-		retval = 1;
-		goto err_ret;
-	}
-	ino++;
-	if ((group == 0 && ino < EXT4_FIRST_INO(sb)) ||
-			ino > EXT4_INODES_PER_GROUP(sb)) {
-		ext4_unlock_group(sb, group);
-		up_read(&grp->alloc_sem);
-		ext4_error(sb, "reserved inode or inode > inodes count - "
-			   "block_group = %u, inode=%lu", group,
-			   ino + group * EXT4_INODES_PER_GROUP(sb));
-		return 1;
-	}
-	/* If we didn't allocate from within the initialized part of the inode
-	 * table then we need to initialize up to this inode. */
-	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
-
-		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
-			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
-			/* When marking the block group with
-			 * ~EXT4_BG_INODE_UNINIT we don't want to depend
-			 * on the value of bg_itable_unused even though
-			 * mke2fs could have initialized the same for us.
-			 * Instead we calculated the value below
-			 */
-
-			free = 0;
-		} else {
-			free = EXT4_INODES_PER_GROUP(sb) -
-				ext4_itable_unused_count(sb, gdp);
-		}
-
-		/*
-		 * Check the relative inode number against the last used
-		 * relative inode number in this group. if it is greater
-		 * we need to  update the bg_itable_unused count
-		 *
-		 */
-		if (ino > free)
-			ext4_itable_unused_set(sb, gdp,
-					(EXT4_INODES_PER_GROUP(sb) - ino));
-	}
-	count = ext4_free_inodes_count(sb, gdp) - 1;
-	ext4_free_inodes_set(sb, gdp, count);
-	if (S_ISDIR(mode)) {
-		count = ext4_used_dirs_count(sb, gdp) + 1;
-		ext4_used_dirs_set(sb, gdp, count);
-		if (sbi->s_log_groups_per_flex) {
-			ext4_group_t f = ext4_flex_group(sbi, group);
-
-			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
-		}
-	}
-	gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
-err_ret:
-	ext4_unlock_group(sb, group);
-	up_read(&grp->alloc_sem);
-	return retval;
-}
-
-/*
  * There are two policies for allocating an inode.  If the new inode is
  * a directory, then a forward search is made for a block group with both
  * free space and a low directory-to-inode ratio; if that fails, then of
@@ -741,6 +664,11 @@
 	if (ret2 == -1)
 		goto out;
 
+	/*
+	 * Normally we will only go through one pass of this loop,
+	 * unless we get unlucky and it turns out the group we selected
+	 * had its last inode grabbed by someone else.
+	 */
 	for (i = 0; i < ngroups; i++, ino = 0) {
 		err = -EIO;
 
@@ -757,51 +685,24 @@
 		ino = ext4_find_next_zero_bit((unsigned long *)
 					      inode_bitmap_bh->b_data,
 					      EXT4_INODES_PER_GROUP(sb), ino);
-
-		if (ino < EXT4_INODES_PER_GROUP(sb)) {
-
-			BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-							    inode_bitmap_bh);
-			if (err)
-				goto fail;
-
-			BUFFER_TRACE(group_desc_bh, "get_write_access");
-			err = ext4_journal_get_write_access(handle,
-								group_desc_bh);
-			if (err)
-				goto fail;
-			if (!ext4_claim_inode(sb, inode_bitmap_bh,
-						ino, group, mode)) {
-				/* we won it */
-				BUFFER_TRACE(inode_bitmap_bh,
-					"call ext4_handle_dirty_metadata");
-				err = ext4_handle_dirty_metadata(handle,
-								 NULL,
-							inode_bitmap_bh);
-				if (err)
-					goto fail;
-				/* zero bit is inode number 1*/
-				ino++;
-				goto got;
-			}
-			/* we lost it */
-			ext4_handle_release_buffer(handle, inode_bitmap_bh);
-			ext4_handle_release_buffer(handle, group_desc_bh);
-
-			if (++ino < EXT4_INODES_PER_GROUP(sb))
-				goto repeat_in_this_group;
+		if (ino >= EXT4_INODES_PER_GROUP(sb)) {
+			if (++group == ngroups)
+				group = 0;
+			continue;
 		}
-
-		/*
-		 * This case is possible in concurrent environment.  It is very
-		 * rare.  We cannot repeat the find_group_xxx() call because
-		 * that will simply return the same blockgroup, because the
-		 * group descriptor metadata has not yet been updated.
-		 * So we just go onto the next blockgroup.
-		 */
-		if (++group == ngroups)
-			group = 0;
+		if (group == 0 && (ino+1) < EXT4_FIRST_INO(sb)) {
+			ext4_error(sb, "reserved inode found cleared - "
+				   "inode=%lu", ino + 1);
+			continue;
+		}
+		ext4_lock_group(sb, group);
+		ret2 = ext4_test_and_set_bit(ino, inode_bitmap_bh->b_data);
+		ext4_unlock_group(sb, group);
+		ino++;		/* the inode bitmap is zero-based */
+		if (!ret2)
+			goto got; /* we grabbed the inode! */
+		if (ino < EXT4_INODES_PER_GROUP(sb))
+			goto repeat_in_this_group;
 	}
 	err = -ENOSPC;
 	goto out;
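
The claim step is now a bare ext4_test_and_set_bit() under the group
spinlock, and journal write access plus the block-group descriptor
updates are deferred until after the "got" label, so a thread that loses
the race simply rescans the bitmap rather than unwinding journal access
with ext4_handle_release_buffer() as the removed code had to.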
@@ -838,6 +739,59 @@
 		if (err)
 			goto fail;
 	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
+	BUFFER_TRACE(group_desc_bh, "get_write_access");
+	err = ext4_journal_get_write_access(handle, group_desc_bh);
+	if (err)
+		goto fail;
+
+	/* Update the relevant bg descriptor fields */
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		int free;
+		struct ext4_group_info *grp = ext4_get_group_info(sb, group);
+
+		down_read(&grp->alloc_sem); /* protect vs itable lazyinit */
+		ext4_lock_group(sb, group); /* while we modify the bg desc */
+		free = EXT4_INODES_PER_GROUP(sb) -
+			ext4_itable_unused_count(sb, gdp);
+		if (gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT)) {
+			gdp->bg_flags &= cpu_to_le16(~EXT4_BG_INODE_UNINIT);
+			free = 0;
+		}
+		/*
+		 * Check the relative inode number against the last used
+		 * relative inode number in this group. If it is greater,
+		 * we need to update the bg_itable_unused count.
+		 */
+		if (ino > free)
+			ext4_itable_unused_set(sb, gdp,
+					(EXT4_INODES_PER_GROUP(sb) - ino));
+		up_read(&grp->alloc_sem);
+	}
+	ext4_free_inodes_set(sb, gdp, ext4_free_inodes_count(sb, gdp) - 1);
+	if (S_ISDIR(mode)) {
+		ext4_used_dirs_set(sb, gdp, ext4_used_dirs_count(sb, gdp) + 1);
+		if (sbi->s_log_groups_per_flex) {
+			ext4_group_t f = ext4_flex_group(sbi, group);
+
+			atomic_inc(&sbi->s_flex_groups[f].used_dirs);
+		}
+	}
+	if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_GDT_CSUM)) {
+		gdp->bg_checksum = ext4_group_desc_csum(sbi, group, gdp);
+		ext4_unlock_group(sb, group);
+	}
+
+	BUFFER_TRACE(inode_bitmap_bh, "call ext4_handle_dirty_metadata");
+	err = ext4_handle_dirty_metadata(handle, NULL, inode_bitmap_bh);
+	if (err)
+		goto fail;
+
 	BUFFER_TRACE(group_desc_bh, "call ext4_handle_dirty_metadata");
 	err = ext4_handle_dirty_metadata(handle, NULL, group_desc_bh);
 	if (err)
@@ -1101,7 +1055,7 @@
  * where it is called from on active part of filesystem is ext4lazyinit
  * thread, so we do not need any special locks, however we have to prevent
  * inode allocation from the current group, so we take alloc_sem lock, to
- * block ext4_claim_inode until we are finished.
+ * block ext4_new_inode() until we are finished.
  */
 int ext4_init_inode_table(struct super_block *sb, ext4_group_t group,
 				 int barrier)
@@ -1149,9 +1103,9 @@
 			    sbi->s_inodes_per_block);
 
 	if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group)) {
-		ext4_error(sb, "Something is wrong with group %u\n"
-			   "Used itable blocks: %d"
-			   "itable unused count: %u\n",
+		ext4_error(sb, "Something is wrong with group %u: "
+			   "used itable blocks: %d; "
+			   "itable unused count: %u",
 			   group, used_blks,
 			   ext4_itable_unused_count(sb, gdp));
 		ret = 1;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index feaa82f..c77b0bd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -272,7 +272,7 @@
 	trace_ext4_da_update_reserve_space(inode, used, quota_claim);
 	if (unlikely(used > ei->i_reserved_data_blocks)) {
 		ext4_msg(inode->i_sb, KERN_NOTICE, "%s: ino %lu, used %d "
-			 "with only %d reserved data blocks\n",
+			 "with only %d reserved data blocks",
 			 __func__, inode->i_ino, used,
 			 ei->i_reserved_data_blocks);
 		WARN_ON(1);
@@ -1165,7 +1165,7 @@
 		 */
 		ext4_msg(inode->i_sb, KERN_NOTICE, "ext4_da_release_space: "
 			 "ino %lu, to_free %d with only %d reserved "
-			 "data blocks\n", inode->i_ino, to_free,
+			 "data blocks", inode->i_ino, to_free,
 			 ei->i_reserved_data_blocks);
 		WARN_ON(1);
 		to_free = ei->i_reserved_data_blocks;
@@ -1428,20 +1428,22 @@
 static void ext4_print_free_blocks(struct inode *inode)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
-	printk(KERN_CRIT "Total free blocks count %lld\n",
+	struct super_block *sb = inode->i_sb;
+
+	ext4_msg(sb, KERN_CRIT, "Total free blocks count %lld",
 	       EXT4_C2B(EXT4_SB(inode->i_sb),
 			ext4_count_free_clusters(inode->i_sb)));
-	printk(KERN_CRIT "Free/Dirty block details\n");
-	printk(KERN_CRIT "free_blocks=%lld\n",
+	ext4_msg(sb, KERN_CRIT, "Free/Dirty block details");
+	ext4_msg(sb, KERN_CRIT, "free_blocks=%lld",
 	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
 		percpu_counter_sum(&sbi->s_freeclusters_counter)));
-	printk(KERN_CRIT "dirty_blocks=%lld\n",
+	ext4_msg(sb, KERN_CRIT, "dirty_blocks=%lld",
 	       (long long) EXT4_C2B(EXT4_SB(inode->i_sb),
 		percpu_counter_sum(&sbi->s_dirtyclusters_counter)));
-	printk(KERN_CRIT "Block reservation details\n");
-	printk(KERN_CRIT "i_reserved_data_blocks=%u\n",
-	       EXT4_I(inode)->i_reserved_data_blocks);
-	printk(KERN_CRIT "i_reserved_meta_blocks=%u\n",
+	ext4_msg(sb, KERN_CRIT, "Block reservation details");
+	ext4_msg(sb, KERN_CRIT, "i_reserved_data_blocks=%u",
+		 EXT4_I(inode)->i_reserved_data_blocks);
+	ext4_msg(sb, KERN_CRIT, "i_reserved_meta_blocks=%u",
 	       EXT4_I(inode)->i_reserved_meta_blocks);
 	return;
 }
@@ -2482,13 +2484,14 @@
 	int write_mode = (int)(unsigned long)fsdata;
 
 	if (write_mode == FALL_BACK_TO_NONDELALLOC) {
-		if (ext4_should_order_data(inode)) {
+		switch (ext4_inode_journal_mode(inode)) {
+		case EXT4_INODE_ORDERED_DATA_MODE:
 			return ext4_ordered_write_end(file, mapping, pos,
 					len, copied, page, fsdata);
-		} else if (ext4_should_writeback_data(inode)) {
+		case EXT4_INODE_WRITEBACK_DATA_MODE:
 			return ext4_writeback_write_end(file, mapping, pos,
 					len, copied, page, fsdata);
-		} else {
+		default:
 			BUG();
 		}
 	}
@@ -2763,7 +2766,7 @@
 		goto out;
 
 	ext_debug("ext4_end_io_dio(): io_end 0x%p "
-		  "for inode %lu, iocb 0x%p, offset %llu, size %llu\n",
+		  "for inode %lu, iocb 0x%p, offset %llu, size %zd\n",
  		  iocb->private, io_end->inode->i_ino, iocb, offset,
 		  size);
 
@@ -2795,9 +2798,6 @@
 
 	/* queue the work to convert unwritten extents to written */
 	queue_work(wq, &io_end->work);
-
-	/* XXX: probably should move into the real I/O completion handler */
-	inode_dio_done(inode);
 }
 
 static void ext4_end_io_buffer_write(struct buffer_head *bh, int uptodate)
@@ -2811,8 +2811,9 @@
 		goto out;
 
 	if (!(io_end->inode->i_sb->s_flags & MS_ACTIVE)) {
-		printk("sb umounted, discard end_io request for inode %lu\n",
-			io_end->inode->i_ino);
+		ext4_msg(io_end->inode->i_sb, KERN_INFO,
+			 "sb umounted, discard end_io request for inode %lu",
+			 io_end->inode->i_ino);
 		ext4_free_io_end(io_end);
 		goto out;
 	}
@@ -2921,9 +2922,12 @@
 		iocb->private = NULL;
 		EXT4_I(inode)->cur_aio_dio = NULL;
 		if (!is_sync_kiocb(iocb)) {
-			iocb->private = ext4_init_io_end(inode, GFP_NOFS);
-			if (!iocb->private)
+			ext4_io_end_t *io_end =
+				ext4_init_io_end(inode, GFP_NOFS);
+			if (!io_end)
 				return -ENOMEM;
+			io_end->flag |= EXT4_IO_END_DIRECT;
+			iocb->private = io_end;
 			/*
 			 * we save the io structure for current async
 			 * direct IO, so that later ext4_map_blocks()
@@ -2940,7 +2944,7 @@
 					 ext4_get_block_write,
 					 ext4_end_io_dio,
 					 NULL,
-					 DIO_LOCKING | DIO_SKIP_HOLES);
+					 DIO_LOCKING);
 		if (iocb->private)
 			EXT4_I(inode)->cur_aio_dio = NULL;
 		/*
@@ -3086,18 +3090,25 @@
 
 void ext4_set_aops(struct inode *inode)
 {
-	if (ext4_should_order_data(inode) &&
-		test_opt(inode->i_sb, DELALLOC))
-		inode->i_mapping->a_ops = &ext4_da_aops;
-	else if (ext4_should_order_data(inode))
-		inode->i_mapping->a_ops = &ext4_ordered_aops;
-	else if (ext4_should_writeback_data(inode) &&
-		 test_opt(inode->i_sb, DELALLOC))
-		inode->i_mapping->a_ops = &ext4_da_aops;
-	else if (ext4_should_writeback_data(inode))
-		inode->i_mapping->a_ops = &ext4_writeback_aops;
-	else
+	switch (ext4_inode_journal_mode(inode)) {
+	case EXT4_INODE_ORDERED_DATA_MODE:
+		if (test_opt(inode->i_sb, DELALLOC))
+			inode->i_mapping->a_ops = &ext4_da_aops;
+		else
+			inode->i_mapping->a_ops = &ext4_ordered_aops;
+		break;
+	case EXT4_INODE_WRITEBACK_DATA_MODE:
+		if (test_opt(inode->i_sb, DELALLOC))
+			inode->i_mapping->a_ops = &ext4_da_aops;
+		else
+			inode->i_mapping->a_ops = &ext4_writeback_aops;
+		break;
+	case EXT4_INODE_JOURNAL_DATA_MODE:
 		inode->i_mapping->a_ops = &ext4_journalled_aops;
+		break;
+	default:
+		BUG();
+	}
 }
 
 
@@ -3329,16 +3340,16 @@
 {
 	struct inode *inode = file->f_path.dentry->d_inode;
 	if (!S_ISREG(inode->i_mode))
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 
 	if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
 		/* TODO: Add support for non extent hole punching */
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
 	if (EXT4_SB(inode->i_sb)->s_cluster_ratio > 1) {
 		/* TODO: Add support for bigalloc file systems */
-		return -ENOTSUPP;
+		return -EOPNOTSUPP;
 	}
 
 	return ext4_ext_punch_hole(file, offset, length);
@@ -3924,10 +3935,8 @@
 			ext4_update_dynamic_rev(sb);
 			EXT4_SET_RO_COMPAT_FEATURE(sb,
 					EXT4_FEATURE_RO_COMPAT_LARGE_FILE);
-			sb->s_dirt = 1;
 			ext4_handle_sync(handle);
-			err = ext4_handle_dirty_metadata(handle, NULL,
-					EXT4_SB(sb)->s_sbh);
+			err = ext4_handle_dirty_super(handle, sb);
 		}
 	}
 	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
@@ -4152,11 +4161,9 @@
 	}
 
 	if (attr->ia_valid & ATTR_SIZE) {
-		if (attr->ia_size != i_size_read(inode)) {
+		if (attr->ia_size != i_size_read(inode))
 			truncate_setsize(inode, attr->ia_size);
-			ext4_truncate(inode);
-		} else if (ext4_test_inode_flag(inode, EXT4_INODE_EOFBLOCKS))
-			ext4_truncate(inode);
+		ext4_truncate(inode);
 	}
 
 	if (!rc) {
@@ -4314,7 +4321,7 @@
 {
 	int err = 0;
 
-	if (test_opt(inode->i_sb, I_VERSION))
+	if (IS_I_VERSION(inode))
 		inode_inc_iversion(inode);
 
 	/* the do_update_inode consumes one bh->b_count */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index cb990b2..99ab428 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -21,6 +21,7 @@
  * mballoc.c contains the multiblocks allocation routines
  */
 
+#include "ext4_jbd2.h"
 #include "mballoc.h"
 #include <linux/debugfs.h>
 #include <linux/slab.h>
@@ -339,7 +340,7 @@
  */
 static struct kmem_cache *ext4_pspace_cachep;
 static struct kmem_cache *ext4_ac_cachep;
-static struct kmem_cache *ext4_free_ext_cachep;
+static struct kmem_cache *ext4_free_data_cachep;
 
 /* We create slab caches for groupinfo data structures based on the
  * superblock block size.  There will be one per mounted filesystem for
@@ -357,7 +358,8 @@
 					ext4_group_t group);
 static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap,
 						ext4_group_t group);
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn);
+static void ext4_free_data_callback(struct super_block *sb,
+				struct ext4_journal_cb_entry *jce, int rc);
 
 static inline void *mb_correct_addr_and_bit(int *bit, void *addr)
 {
@@ -425,7 +427,7 @@
 {
 	char *bb;
 
-	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 	BUG_ON(max == NULL);
 
 	if (order > e4b->bd_blkbits + 1) {
@@ -436,10 +438,10 @@
 	/* at order 0 we see each particular block */
 	if (order == 0) {
 		*max = 1 << (e4b->bd_blkbits + 3);
-		return EXT4_MB_BITMAP(e4b);
+		return e4b->bd_bitmap;
 	}
 
-	bb = EXT4_MB_BUDDY(e4b) + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
+	bb = e4b->bd_buddy + EXT4_SB(e4b->bd_sb)->s_mb_offsets[order];
 	*max = EXT4_SB(e4b->bd_sb)->s_mb_maxs[order];
 
 	return bb;
@@ -588,7 +590,7 @@
 			for (j = 0; j < (1 << order); j++) {
 				k = (i * (1 << order)) + j;
 				MB_CHECK_ASSERT(
-					!mb_test_bit(k, EXT4_MB_BITMAP(e4b)));
+					!mb_test_bit(k, e4b->bd_bitmap));
 			}
 			count++;
 		}
@@ -782,7 +784,7 @@
 	int groups_per_page;
 	int err = 0;
 	int i;
-	ext4_group_t first_group;
+	ext4_group_t first_group, group;
 	int first_block;
 	struct super_block *sb;
 	struct buffer_head *bhs;
@@ -806,24 +808,23 @@
 
 	/* allocate buffer_heads to read bitmaps */
 	if (groups_per_page > 1) {
-		err = -ENOMEM;
 		i = sizeof(struct buffer_head *) * groups_per_page;
 		bh = kzalloc(i, GFP_NOFS);
-		if (bh == NULL)
+		if (bh == NULL) {
+			err = -ENOMEM;
 			goto out;
+		}
 	} else
 		bh = &bhs;
 
 	first_group = page->index * blocks_per_page / 2;
 
 	/* read all groups the page covers into the cache */
-	for (i = 0; i < groups_per_page; i++) {
-		struct ext4_group_desc *desc;
-
-		if (first_group + i >= ngroups)
+	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+		if (group >= ngroups)
 			break;
 
-		grinfo = ext4_get_group_info(sb, first_group + i);
+		grinfo = ext4_get_group_info(sb, group);
 		/*
 		 * If page is uptodate then we came here after online resize
 		 * which added some new uninitialized group info structs, so
@@ -834,69 +835,21 @@
 			bh[i] = NULL;
 			continue;
 		}
-
-		err = -EIO;
-		desc = ext4_get_group_desc(sb, first_group + i, NULL);
-		if (desc == NULL)
+		if (!(bh[i] = ext4_read_block_bitmap_nowait(sb, group))) {
+			err = -ENOMEM;
 			goto out;
-
-		err = -ENOMEM;
-		bh[i] = sb_getblk(sb, ext4_block_bitmap(sb, desc));
-		if (bh[i] == NULL)
-			goto out;
-
-		if (bitmap_uptodate(bh[i]))
-			continue;
-
-		lock_buffer(bh[i]);
-		if (bitmap_uptodate(bh[i])) {
-			unlock_buffer(bh[i]);
-			continue;
 		}
-		ext4_lock_group(sb, first_group + i);
-		if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) {
-			ext4_init_block_bitmap(sb, bh[i],
-						first_group + i, desc);
-			set_bitmap_uptodate(bh[i]);
-			set_buffer_uptodate(bh[i]);
-			ext4_unlock_group(sb, first_group + i);
-			unlock_buffer(bh[i]);
-			continue;
-		}
-		ext4_unlock_group(sb, first_group + i);
-		if (buffer_uptodate(bh[i])) {
-			/*
-			 * if not uninit if bh is uptodate,
-			 * bitmap is also uptodate
-			 */
-			set_bitmap_uptodate(bh[i]);
-			unlock_buffer(bh[i]);
-			continue;
-		}
-		get_bh(bh[i]);
-		/*
-		 * submit the buffer_head for read. We can
-		 * safely mark the bitmap as uptodate now.
-		 * We do it here so the bitmap uptodate bit
-		 * get set with buffer lock held.
-		 */
-		set_bitmap_uptodate(bh[i]);
-		bh[i]->b_end_io = end_buffer_read_sync;
-		submit_bh(READ, bh[i]);
-		mb_debug(1, "read bitmap for group %u\n", first_group + i);
+		mb_debug(1, "read bitmap for group %u\n", group);
 	}
 
 	/* wait for I/O completion */
-	for (i = 0; i < groups_per_page; i++)
-		if (bh[i])
-			wait_on_buffer(bh[i]);
-
-	err = -EIO;
-	for (i = 0; i < groups_per_page; i++)
-		if (bh[i] && !buffer_uptodate(bh[i]))
+	for (i = 0, group = first_group; i < groups_per_page; i++, group++) {
+		if (bh[i] && ext4_wait_block_bitmap(sb, group, bh[i])) {
+			err = -EIO;
 			goto out;
+		}
+	}
 
-	err = 0;
 	first_block = page->index * blocks_per_page;
 	for (i = 0; i < blocks_per_page; i++) {
 		int group;
@@ -1250,10 +1203,10 @@
 	int order = 1;
 	void *bb;
 
-	BUG_ON(EXT4_MB_BITMAP(e4b) == EXT4_MB_BUDDY(e4b));
+	BUG_ON(e4b->bd_bitmap == e4b->bd_buddy);
 	BUG_ON(block >= (1 << (e4b->bd_blkbits + 3)));
 
-	bb = EXT4_MB_BUDDY(e4b);
+	bb = e4b->bd_buddy;
 	while (order <= e4b->bd_blkbits + 1) {
 		block = block >> 1;
 		if (!mb_test_bit(block, bb)) {
@@ -1323,9 +1276,9 @@
 
 	/* let's maintain fragments counter */
 	if (first != 0)
-		block = !mb_test_bit(first - 1, EXT4_MB_BITMAP(e4b));
+		block = !mb_test_bit(first - 1, e4b->bd_bitmap);
 	if (first + count < EXT4_SB(sb)->s_mb_maxs[0])
-		max = !mb_test_bit(first + count, EXT4_MB_BITMAP(e4b));
+		max = !mb_test_bit(first + count, e4b->bd_bitmap);
 	if (block && max)
 		e4b->bd_info->bb_fragments--;
 	else if (!block && !max)
@@ -1336,7 +1289,7 @@
 		block = first++;
 		order = 0;
 
-		if (!mb_test_bit(block, EXT4_MB_BITMAP(e4b))) {
+		if (!mb_test_bit(block, e4b->bd_bitmap)) {
 			ext4_fsblk_t blocknr;
 
 			blocknr = ext4_group_first_block_no(sb, e4b->bd_group);
@@ -1347,7 +1300,7 @@
 					      "freeing already freed block "
 					      "(bit %u)", block);
 		}
-		mb_clear_bit(block, EXT4_MB_BITMAP(e4b));
+		mb_clear_bit(block, e4b->bd_bitmap);
 		e4b->bd_info->bb_counters[order]++;
 
 		/* start of the buddy */
@@ -1429,7 +1382,7 @@
 			break;
 
 		next = (block + 1) * (1 << order);
-		if (mb_test_bit(next, EXT4_MB_BITMAP(e4b)))
+		if (mb_test_bit(next, e4b->bd_bitmap))
 			break;
 
 		order = mb_find_order_for_block(e4b, next);
@@ -1466,9 +1419,9 @@
 
 	/* let's maintain fragments counter */
 	if (start != 0)
-		mlen = !mb_test_bit(start - 1, EXT4_MB_BITMAP(e4b));
+		mlen = !mb_test_bit(start - 1, e4b->bd_bitmap);
 	if (start + len < EXT4_SB(e4b->bd_sb)->s_mb_maxs[0])
-		max = !mb_test_bit(start + len, EXT4_MB_BITMAP(e4b));
+		max = !mb_test_bit(start + len, e4b->bd_bitmap);
 	if (mlen && max)
 		e4b->bd_info->bb_fragments++;
 	else if (!mlen && !max)
@@ -1511,7 +1464,7 @@
 	}
 	mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info);
 
-	ext4_set_bits(EXT4_MB_BITMAP(e4b), ex->fe_start, len0);
+	ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0);
 	mb_check_buddy(e4b);
 
 	return ret;
@@ -1810,7 +1763,7 @@
 					struct ext4_buddy *e4b)
 {
 	struct super_block *sb = ac->ac_sb;
-	void *bitmap = EXT4_MB_BITMAP(e4b);
+	void *bitmap = e4b->bd_bitmap;
 	struct ext4_free_extent ex;
 	int i;
 	int free;
@@ -1870,7 +1823,7 @@
 {
 	struct super_block *sb = ac->ac_sb;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	void *bitmap = EXT4_MB_BITMAP(e4b);
+	void *bitmap = e4b->bd_bitmap;
 	struct ext4_free_extent ex;
 	ext4_fsblk_t first_group_block;
 	ext4_fsblk_t a;
@@ -2224,7 +2177,7 @@
 			EXT4_DESC_PER_BLOCK_BITS(sb);
 		meta_group_info = kmalloc(metalen, GFP_KERNEL);
 		if (meta_group_info == NULL) {
-			ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate mem "
+			ext4_msg(sb, KERN_ERR, "can't allocate mem "
 				 "for a buddy group");
 			goto exit_meta_group_info;
 		}
@@ -2238,7 +2191,7 @@
 
 	meta_group_info[i] = kmem_cache_alloc(cachep, GFP_KERNEL);
 	if (meta_group_info[i] == NULL) {
-		ext4_msg(sb, KERN_ERR, "EXT4-fs: can't allocate buddy mem");
+		ext4_msg(sb, KERN_ERR, "can't allocate buddy mem");
 		goto exit_group_info;
 	}
 	memset(meta_group_info[i], 0, kmem_cache_size(cachep));
@@ -2522,9 +2475,6 @@
 		proc_create_data("mb_groups", S_IRUGO, sbi->s_proc,
 				 &ext4_mb_seq_groups_fops, sb);
 
-	if (sbi->s_journal)
-		sbi->s_journal->j_commit_callback = release_blocks_on_commit;
-
 	return 0;
 
 out_free_locality_groups:
@@ -2637,58 +2587,55 @@
  * This function is called by the jbd2 layer once the commit has finished,
  * so we know we can free the blocks that were released with that commit.
  */
-static void release_blocks_on_commit(journal_t *journal, transaction_t *txn)
+static void ext4_free_data_callback(struct super_block *sb,
+				    struct ext4_journal_cb_entry *jce,
+				    int rc)
 {
-	struct super_block *sb = journal->j_private;
+	struct ext4_free_data *entry = (struct ext4_free_data *)jce;
 	struct ext4_buddy e4b;
 	struct ext4_group_info *db;
 	int err, count = 0, count2 = 0;
-	struct ext4_free_data *entry;
-	struct list_head *l, *ltmp;
 
-	list_for_each_safe(l, ltmp, &txn->t_private_list) {
-		entry = list_entry(l, struct ext4_free_data, list);
+	mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
+		 entry->efd_count, entry->efd_group, entry);
 
-		mb_debug(1, "gonna free %u blocks in group %u (0x%p):",
-			 entry->count, entry->group, entry);
+	if (test_opt(sb, DISCARD))
+		ext4_issue_discard(sb, entry->efd_group,
+				   entry->efd_start_cluster, entry->efd_count);
 
-		if (test_opt(sb, DISCARD))
-			ext4_issue_discard(sb, entry->group,
-					   entry->start_cluster, entry->count);
+	err = ext4_mb_load_buddy(sb, entry->efd_group, &e4b);
+	/* we expect to find existing buddy because it's pinned */
+	BUG_ON(err != 0);
 
-		err = ext4_mb_load_buddy(sb, entry->group, &e4b);
-		/* we expect to find existing buddy because it's pinned */
-		BUG_ON(err != 0);
 
-		db = e4b.bd_info;
-		/* there are blocks to put in buddy to make them really free */
-		count += entry->count;
-		count2++;
-		ext4_lock_group(sb, entry->group);
-		/* Take it out of per group rb tree */
-		rb_erase(&entry->node, &(db->bb_free_root));
-		mb_free_blocks(NULL, &e4b, entry->start_cluster, entry->count);
+	db = e4b.bd_info;
+	/* there are blocks to put in buddy to make them really free */
+	count += entry->efd_count;
+	count2++;
+	ext4_lock_group(sb, entry->efd_group);
+	/* Take it out of per group rb tree */
+	rb_erase(&entry->efd_node, &(db->bb_free_root));
+	mb_free_blocks(NULL, &e4b, entry->efd_start_cluster, entry->efd_count);
 
-		/*
-		 * Clear the trimmed flag for the group so that the next
-		 * ext4_trim_fs can trim it.
-		 * If the volume is mounted with -o discard, online discard
-		 * is supported and the free blocks will be trimmed online.
+	/*
+	 * Clear the trimmed flag for the group so that the next
+	 * ext4_trim_fs can trim it.
+	 * If the volume is mounted with -o discard, online discard
+	 * is supported and the free blocks will be trimmed online.
+	 */
+	if (!test_opt(sb, DISCARD))
+		EXT4_MB_GRP_CLEAR_TRIMMED(db);
+
+	if (!db->bb_free_root.rb_node) {
+		/* No more items in the per group rb tree
+		 * balance refcounts from ext4_mb_free_metadata()
 		 */
-		if (!test_opt(sb, DISCARD))
-			EXT4_MB_GRP_CLEAR_TRIMMED(db);
-
-		if (!db->bb_free_root.rb_node) {
-			/* No more items in the per group rb tree
-			 * balance refcounts from ext4_mb_free_metadata()
-			 */
-			page_cache_release(e4b.bd_buddy_page);
-			page_cache_release(e4b.bd_bitmap_page);
-		}
-		ext4_unlock_group(sb, entry->group);
-		kmem_cache_free(ext4_free_ext_cachep, entry);
-		ext4_mb_unload_buddy(&e4b);
+		page_cache_release(e4b.bd_buddy_page);
+		page_cache_release(e4b.bd_bitmap_page);
 	}
+	ext4_unlock_group(sb, entry->efd_group);
+	kmem_cache_free(ext4_free_data_cachep, entry);
+	ext4_mb_unload_buddy(&e4b);
 
 	mb_debug(1, "freed %u blocks in %u structures\n", count, count2);
 }
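
The cast on the first line of ext4_free_data_callback() works because the
journal callback entry sits at offset zero of struct ext4_free_data (see
the mballoc.h hunk below). A plausible shape for the entry itself,
reconstructed from its uses in this series rather than quoted from it:

	/* sketch: generic per-transaction commit callback entry */
	struct ext4_journal_cb_entry {
		/* chained on the transaction's t_private_list */
		struct list_head	jce_list;
		/* invoked once the transaction has committed */
		void			(*jce_func)(struct super_block *sb,
						    struct ext4_journal_cb_entry *jce,
						    int error);
	};

mballoc queues one of these per freed extent via
ext4_journal_callback_add(handle, ext4_free_data_callback,
&new_entry->efd_jce); the commit-time walk that fires them is in the
super.c hunk further down.
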
@@ -2741,9 +2688,9 @@
 		return -ENOMEM;
 	}
 
-	ext4_free_ext_cachep = KMEM_CACHE(ext4_free_data,
-					  SLAB_RECLAIM_ACCOUNT);
-	if (ext4_free_ext_cachep == NULL) {
+	ext4_free_data_cachep = KMEM_CACHE(ext4_free_data,
+					   SLAB_RECLAIM_ACCOUNT);
+	if (ext4_free_data_cachep == NULL) {
 		kmem_cache_destroy(ext4_pspace_cachep);
 		kmem_cache_destroy(ext4_ac_cachep);
 		return -ENOMEM;
@@ -2761,7 +2708,7 @@
 	rcu_barrier();
 	kmem_cache_destroy(ext4_pspace_cachep);
 	kmem_cache_destroy(ext4_ac_cachep);
-	kmem_cache_destroy(ext4_free_ext_cachep);
+	kmem_cache_destroy(ext4_free_data_cachep);
 	ext4_groupinfo_destroy_slabs();
 	ext4_remove_debugfs_entry();
 }
@@ -2815,7 +2762,7 @@
 	len = EXT4_C2B(sbi, ac->ac_b_ex.fe_len);
 	if (!ext4_data_block_valid(sbi, block, len)) {
 		ext4_error(sb, "Allocating blocks %llu-%llu which overlap "
-			   "fs metadata\n", block, block+len);
+			   "fs metadata", block, block+len);
 		/* File system mounted not to panic on error
 		 * Fix the bitmap and repeat the block allocation
 		 * We leak some of the blocks here.
@@ -2911,7 +2858,8 @@
 	struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb);
 	int bsbits, max;
 	ext4_lblk_t end;
-	loff_t size, orig_size, start_off;
+	loff_t size, start_off;
+	loff_t orig_size __maybe_unused;
 	ext4_lblk_t start;
 	struct ext4_inode_info *ei = EXT4_I(ac->ac_inode);
 	struct ext4_prealloc_space *pa;
@@ -3321,8 +3269,8 @@
 	n = rb_first(&(grp->bb_free_root));
 
 	while (n) {
-		entry = rb_entry(n, struct ext4_free_data, node);
-		ext4_set_bits(bitmap, entry->start_cluster, entry->count);
+		entry = rb_entry(n, struct ext4_free_data, efd_node);
+		ext4_set_bits(bitmap, entry->efd_start_cluster, entry->efd_count);
 		n = rb_next(n);
 	}
 	return;
@@ -3916,11 +3864,11 @@
 	    (EXT4_SB(sb)->s_mount_flags & EXT4_MF_FS_ABORTED))
 		return;
 
-	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: Can't allocate:"
+	ext4_msg(ac->ac_sb, KERN_ERR, "Can't allocate:"
 			" Allocation context details:");
-	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: status %d flags %d",
+	ext4_msg(ac->ac_sb, KERN_ERR, "status %d flags %d",
 			ac->ac_status, ac->ac_flags);
-	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: orig %lu/%lu/%lu@%lu, "
+	ext4_msg(ac->ac_sb, KERN_ERR, "orig %lu/%lu/%lu@%lu, "
 		 	"goal %lu/%lu/%lu@%lu, "
 			"best %lu/%lu/%lu@%lu cr %d",
 			(unsigned long)ac->ac_o_ex.fe_group,
@@ -3936,9 +3884,9 @@
 			(unsigned long)ac->ac_b_ex.fe_len,
 			(unsigned long)ac->ac_b_ex.fe_logical,
 			(int)ac->ac_criteria);
-	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: %lu scanned, %d found",
+	ext4_msg(ac->ac_sb, KERN_ERR, "%lu scanned, %d found",
 		 ac->ac_ex_scanned, ac->ac_found);
-	ext4_msg(ac->ac_sb, KERN_ERR, "EXT4-fs: groups: ");
+	ext4_msg(ac->ac_sb, KERN_ERR, "groups: ");
 	ngroups = ext4_get_groups_count(sb);
 	for (i = 0; i < ngroups; i++) {
 		struct ext4_group_info *grp = ext4_get_group_info(sb, i);
@@ -4428,9 +4376,9 @@
 static int can_merge(struct ext4_free_data *entry1,
 			struct ext4_free_data *entry2)
 {
-	if ((entry1->t_tid == entry2->t_tid) &&
-	    (entry1->group == entry2->group) &&
-	    ((entry1->start_cluster + entry1->count) == entry2->start_cluster))
+	if ((entry1->efd_tid == entry2->efd_tid) &&
+	    (entry1->efd_group == entry2->efd_group) &&
+	    ((entry1->efd_start_cluster + entry1->efd_count) == entry2->efd_start_cluster))
 		return 1;
 	return 0;
 }
@@ -4452,8 +4400,8 @@
 	BUG_ON(e4b->bd_bitmap_page == NULL);
 	BUG_ON(e4b->bd_buddy_page == NULL);
 
-	new_node = &new_entry->node;
-	cluster = new_entry->start_cluster;
+	new_node = &new_entry->efd_node;
+	cluster = new_entry->efd_start_cluster;
 
 	if (!*n) {
 		/* first free block extent. We need to
@@ -4466,10 +4414,10 @@
 	}
 	while (*n) {
 		parent = *n;
-		entry = rb_entry(parent, struct ext4_free_data, node);
-		if (cluster < entry->start_cluster)
+		entry = rb_entry(parent, struct ext4_free_data, efd_node);
+		if (cluster < entry->efd_start_cluster)
 			n = &(*n)->rb_left;
-		else if (cluster >= (entry->start_cluster + entry->count))
+		else if (cluster >= (entry->efd_start_cluster + entry->efd_count))
 			n = &(*n)->rb_right;
 		else {
 			ext4_grp_locked_error(sb, group, 0,
@@ -4486,34 +4434,29 @@
 	/* Now try to see whether the extent can be merged to the left and right */
 	node = rb_prev(new_node);
 	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, node);
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
 		if (can_merge(entry, new_entry)) {
-			new_entry->start_cluster = entry->start_cluster;
-			new_entry->count += entry->count;
+			new_entry->efd_start_cluster = entry->efd_start_cluster;
+			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			spin_lock(&sbi->s_md_lock);
-			list_del(&entry->list);
-			spin_unlock(&sbi->s_md_lock);
-			kmem_cache_free(ext4_free_ext_cachep, entry);
+			ext4_journal_callback_del(handle, &entry->efd_jce);
+			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
 
 	node = rb_next(new_node);
 	if (node) {
-		entry = rb_entry(node, struct ext4_free_data, node);
+		entry = rb_entry(node, struct ext4_free_data, efd_node);
 		if (can_merge(new_entry, entry)) {
-			new_entry->count += entry->count;
+			new_entry->efd_count += entry->efd_count;
 			rb_erase(node, &(db->bb_free_root));
-			spin_lock(&sbi->s_md_lock);
-			list_del(&entry->list);
-			spin_unlock(&sbi->s_md_lock);
-			kmem_cache_free(ext4_free_ext_cachep, entry);
+			ext4_journal_callback_del(handle, &entry->efd_jce);
+			kmem_cache_free(ext4_free_data_cachep, entry);
 		}
 	}
 	/* Add the extent to transaction's private list */
-	spin_lock(&sbi->s_md_lock);
-	list_add(&new_entry->list, &handle->h_transaction->t_private_list);
-	spin_unlock(&sbi->s_md_lock);
+	ext4_journal_callback_add(handle, ext4_free_data_callback,
+				  &new_entry->efd_jce);
 	return 0;
 }
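
A concrete run of the merge logic above: suppose the tree already holds an
entry with efd_group 5, efd_tid 42, efd_start_cluster 0 and efd_count 8,
and clusters 8-15 of the same group are freed in the same transaction.
can_merge() succeeds because 0 + 8 == 8, so the new entry absorbs its
neighbour (start 0, count 16), the old node is erased from the rb-tree,
and its pending commit callback is cancelled with
ext4_journal_callback_del() so the merged extent is processed exactly once.
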
 
@@ -4691,15 +4634,15 @@
 		 * blocks being freed are metadata. these blocks shouldn't
 		 * be used until this transaction is committed
 		 */
-		new_entry = kmem_cache_alloc(ext4_free_ext_cachep, GFP_NOFS);
+		new_entry = kmem_cache_alloc(ext4_free_data_cachep, GFP_NOFS);
 		if (!new_entry) {
 			err = -ENOMEM;
 			goto error_return;
 		}
-		new_entry->start_cluster = bit;
-		new_entry->group  = block_group;
-		new_entry->count = count_clusters;
-		new_entry->t_tid = handle->h_transaction->t_tid;
+		new_entry->efd_start_cluster = bit;
+		new_entry->efd_group = block_group;
+		new_entry->efd_count = count_clusters;
+		new_entry->efd_tid = handle->h_transaction->t_tid;
 
 		ext4_lock_group(sb, block_group);
 		mb_clear_bits(bitmap_bh->b_data, bit, count_clusters);
@@ -4971,11 +4914,11 @@
 	start = (e4b.bd_info->bb_first_free > start) ?
 		e4b.bd_info->bb_first_free : start;
 
-	while (start < max) {
-		start = mb_find_next_zero_bit(bitmap, max, start);
-		if (start >= max)
+	while (start <= max) {
+		start = mb_find_next_zero_bit(bitmap, max + 1, start);
+		if (start > max)
 			break;
-		next = mb_find_next_bit(bitmap, max, start);
+		next = mb_find_next_bit(bitmap, max + 1, start);
 
 		if ((next - start) >= minblocks) {
 			ext4_trim_extent(sb, start,
@@ -5027,37 +4970,36 @@
 int ext4_trim_fs(struct super_block *sb, struct fstrim_range *range)
 {
 	struct ext4_group_info *grp;
-	ext4_group_t first_group, last_group;
-	ext4_group_t group, ngroups = ext4_get_groups_count(sb);
+	ext4_group_t group, first_group, last_group;
 	ext4_grpblk_t cnt = 0, first_cluster, last_cluster;
-	uint64_t start, len, minlen, trimmed = 0;
+	uint64_t start, end, minlen, trimmed = 0;
 	ext4_fsblk_t first_data_blk =
 			le32_to_cpu(EXT4_SB(sb)->s_es->s_first_data_block);
+	ext4_fsblk_t max_blks = ext4_blocks_count(EXT4_SB(sb)->s_es);
 	int ret = 0;
 
 	start = range->start >> sb->s_blocksize_bits;
-	len = range->len >> sb->s_blocksize_bits;
+	end = start + (range->len >> sb->s_blocksize_bits) - 1;
 	minlen = range->minlen >> sb->s_blocksize_bits;
 
-	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)))
+	if (unlikely(minlen > EXT4_CLUSTERS_PER_GROUP(sb)) ||
+	    unlikely(start >= max_blks))
 		return -EINVAL;
-	if (start + len <= first_data_blk)
+	if (end >= max_blks)
+		end = max_blks - 1;
+	if (end <= first_data_blk)
 		goto out;
-	if (start < first_data_blk) {
-		len -= first_data_blk - start;
+	if (start < first_data_blk)
 		start = first_data_blk;
-	}
 
-	/* Determine first and last group to examine based on start and len */
+	/* Determine first and last group to examine based on start and end */
 	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) start,
 				     &first_group, &first_cluster);
-	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) (start + len),
+	ext4_get_group_no_and_offset(sb, (ext4_fsblk_t) end,
 				     &last_group, &last_cluster);
-	last_group = (last_group > ngroups - 1) ? ngroups - 1 : last_group;
-	last_cluster = EXT4_CLUSTERS_PER_GROUP(sb);
 
-	if (first_group > last_group)
-		return -EINVAL;
+	/* end now represents the last cluster to discard in this group */
+	end = EXT4_CLUSTERS_PER_GROUP(sb) - 1;
 
 	for (group = first_group; group <= last_group; group++) {
 		grp = ext4_get_group_info(sb, group);
@@ -5069,31 +5011,35 @@
 		}
 
 		/*
-		 * For all the groups except the last one, last block will
-		 * always be EXT4_BLOCKS_PER_GROUP(sb), so we only need to
-		 * change it for the last group in which case start +
-		 * len < EXT4_BLOCKS_PER_GROUP(sb).
+		 * For all the groups except the last one, last cluster will
+		 * always be EXT4_CLUSTERS_PER_GROUP(sb)-1, so we only need to
+		 * change it for the last group, note that last_cluster is
+		 * already computed earlier by ext4_get_group_no_and_offset()
 		 */
-		if (first_cluster + len < EXT4_CLUSTERS_PER_GROUP(sb))
-			last_cluster = first_cluster + len;
-		len -= last_cluster - first_cluster;
+		if (group == last_group)
+			end = last_cluster;
 
 		if (grp->bb_free >= minlen) {
 			cnt = ext4_trim_all_free(sb, group, first_cluster,
-						last_cluster, minlen);
+						end, minlen);
 			if (cnt < 0) {
 				ret = cnt;
 				break;
 			}
+			trimmed += cnt;
 		}
-		trimmed += cnt;
+
+		/*
+		 * For every group except the first one, we are sure
+		 * that the first cluster to discard will be cluster #0.
+		 */
 		first_cluster = 0;
 	}
-	range->len = trimmed * sb->s_blocksize;
 
 	if (!ret)
 		atomic_set(&EXT4_SB(sb)->s_last_trim_minblks, minlen);
 
 out:
+	range->len = trimmed * sb->s_blocksize;
 	return ret;
 }
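
The rewritten range handling keeps the end bound inclusive throughout.
Worked through with a 4 KiB block size (s_blocksize_bits = 12): an FITRIM
request with range->start = 1 GiB and range->len = 8 MiB yields
start = 2^30 >> 12 = 262144 and end = 262144 + (2^23 >> 12) - 1 = 264191,
exactly 2048 blocks. The inclusive convention is also why the search loop
above passes max + 1 as the bitmap limit and iterates while start <= max,
so the final cluster of a group is neither skipped nor counted twice.
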
diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h
index 47705f3..c070618 100644
--- a/fs/ext4/mballoc.h
+++ b/fs/ext4/mballoc.h
@@ -96,21 +96,23 @@
 
 
 struct ext4_free_data {
-	/* this links the free block information from group_info */
-	struct rb_node node;
+	/* MUST be the first member */
+	struct ext4_journal_cb_entry	efd_jce;
 
-	/* this links the free block information from ext4_sb_info */
-	struct list_head list;
+	/* ext4_free_data private data starts from here */
+
+	/* this links the free block information from group_info */
+	struct rb_node			efd_node;
 
 	/* group which free block extent belongs */
-	ext4_group_t group;
+	ext4_group_t			efd_group;
 
 	/* free block extent */
-	ext4_grpblk_t start_cluster;
-	ext4_grpblk_t count;
+	ext4_grpblk_t			efd_start_cluster;
+	ext4_grpblk_t			efd_count;
 
 	/* transaction which freed this extent */
-	tid_t	t_tid;
+	tid_t				efd_tid;
 };
 
 struct ext4_prealloc_space {
@@ -210,8 +212,6 @@
 	__u16 bd_blkbits;
 	ext4_group_t bd_group;
 };
-#define EXT4_MB_BITMAP(e4b)	((e4b)->bd_bitmap)
-#define EXT4_MB_BUDDY(e4b)	((e4b)->bd_buddy)
 
 static inline ext4_fsblk_t ext4_grp_offs_to_block(struct super_block *sb,
 					struct ext4_free_extent *fex)
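
Because efd_jce is the first member, a pointer to the embedded entry is
numerically identical to a pointer to the enclosing ext4_free_data, which
is what the plain cast in ext4_free_data_callback() exploits. A
layout-independent sketch of the same conversion, should the ordering
constraint ever be relaxed:

	#include <linux/kernel.h>	/* container_of() */

	struct ext4_free_data *entry =
		container_of(jce, struct ext4_free_data, efd_jce);

container_of() subtracts offsetof(struct ext4_free_data, efd_jce), which
is zero here, so it compiles down to the same cast while documenting the
relationship.
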
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index e7d6bb0..f39f80f 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -471,7 +471,7 @@
 	tmp_inode = ext4_new_inode(handle, inode->i_sb->s_root->d_inode,
 				   S_IFREG, NULL, goal, owner);
 	if (IS_ERR(tmp_inode)) {
-		retval = PTR_ERR(inode);
+		retval = PTR_ERR(tmp_inode);
 		ext4_journal_stop(handle);
 		return retval;
 	}
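
The migrate.c change fixes a classic ERR_PTR slip: the failure path tested
IS_ERR(tmp_inode) but decoded the errno from inode, a perfectly valid
pointer, so callers saw a nonsense return value. The corrected idiom as a
minimal sketch (dir abbreviates the parent-inode expression of the real
call):

	tmp_inode = ext4_new_inode(handle, dir, S_IFREG, NULL, goal, owner);
	if (IS_ERR(tmp_inode)) {
		/* decode the errno from the same pointer that was tested */
		retval = PTR_ERR(tmp_inode);
		ext4_journal_stop(handle);
		return retval;
	}

IS_ERR() and PTR_ERR() must always be applied to the same variable; mixing
them compiles silently and returns garbage.
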
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 7ea4ba4..ed6548d 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -257,8 +257,8 @@
 	 * If check_interval in MMP block is larger, use that instead of
 	 * update_interval from the superblock.
 	 */
-	if (mmp->mmp_check_interval > mmp_check_interval)
-		mmp_check_interval = mmp->mmp_check_interval;
+	if (le16_to_cpu(mmp->mmp_check_interval) > mmp_check_interval)
+		mmp_check_interval = le16_to_cpu(mmp->mmp_check_interval);
 
 	seq = le32_to_cpu(mmp->mmp_seq);
 	if (seq == EXT4_MMP_SEQ_CLEAN)
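
The mmp.c hunk is an endianness fix: mmp_check_interval lives on disk as a
little-endian __le16, so comparing the raw field against a CPU value only
happened to work on little-endian machines. The conversion idiom, sketched:

	__le16 raw = mmp->mmp_check_interval;	/* on-disk byte order */
	u16 interval = le16_to_cpu(raw);	/* CPU byte order */

	if (interval > mmp_check_interval)
		mmp_check_interval = interval;

Keeping on-disk fields typed __le16/__le32 is what lets sparse flag
exactly this kind of unconverted use.
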
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index 2043f48..349d7b3 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -468,7 +468,7 @@
 fail:
 	if (*err == ERR_BAD_DX_DIR)
 		ext4_warning(dir->i_sb,
-			     "Corrupt dir inode %ld, running e2fsck is "
+			     "Corrupt dir inode %lu, running e2fsck is "
 			     "recommended.", dir->i_ino);
 	return NULL;
 }
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index 4758518..74cd1f7 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -60,7 +60,6 @@
 static void put_io_page(struct ext4_io_page *io_page)
 {
 	if (atomic_dec_and_test(&io_page->p_count)) {
-		end_page_writeback(io_page->p_page);
 		put_page(io_page->p_page);
 		kmem_cache_free(io_page_cachep, io_page);
 	}
@@ -110,6 +109,8 @@
 	if (io->iocb)
 		aio_complete(io->iocb, io->result, 0);
 
+	if (io->flag & EXT4_IO_END_DIRECT)
+		inode_dio_done(inode);
 	/* Wake up anyone waiting on unwritten extent conversion */
 	if (atomic_dec_and_test(&EXT4_I(inode)->i_aiodio_unwritten))
 		wake_up_all(ext4_ioend_wq(io->inode));
@@ -127,12 +128,18 @@
 	unsigned long		flags;
 
 	spin_lock_irqsave(&ei->i_completed_io_lock, flags);
+	if (io->flag & EXT4_IO_END_IN_FSYNC)
+		goto requeue;
 	if (list_empty(&io->list)) {
 		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 		goto free;
 	}
 
 	if (!mutex_trylock(&inode->i_mutex)) {
+		bool was_queued;
+requeue:
+		was_queued = !!(io->flag & EXT4_IO_END_QUEUED);
+		io->flag |= EXT4_IO_END_QUEUED;
 		spin_unlock_irqrestore(&ei->i_completed_io_lock, flags);
 		/*
 		 * Requeue the work instead of waiting so that the work
@@ -145,9 +152,8 @@
 		 * yield the cpu if it sees an end_io request that has already
 		 * been requeued.
 		 */
-		if (io->flag & EXT4_IO_END_QUEUED)
+		if (was_queued)
 			yield();
-		io->flag |= EXT4_IO_END_QUEUED;
 		return;
 	}
 	list_del_init(&io->list);
@@ -227,9 +233,9 @@
 			} while (bh != head);
 		}
 
-		put_io_page(io_end->pages[i]);
+		if (atomic_read(&io_end->pages[i]->p_count) == 1)
+			end_page_writeback(io_end->pages[i]->p_page);
 	}
-	io_end->num_io_pages = 0;
 	inode = io_end->inode;
 
 	if (error) {
@@ -421,6 +427,8 @@
 	 * PageWriteback bit from the page to prevent the system from
 	 * wedging later on.
 	 */
+	if (atomic_read(&io_page->p_count) == 1)
+		end_page_writeback(page);
 	put_io_page(io_page);
 	return ret;
 }
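
The page-io.c changes move end_page_writeback() out of put_io_page() and
make it conditional on holding the last reference, that is p_count == 1
before the final put. The resulting discipline, sketched:

	/* clear PageWriteback only when ours is the final reference */
	if (atomic_read(&io_page->p_count) == 1)
		end_page_writeback(io_page->p_page);
	put_io_page(io_page);	/* drops the ref; frees the io_page at zero */

Checking before the put means the writeback bit is cleared while a
reference still pins the page, so the page cannot be reclaimed in the
window between the two calls.
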
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index f9d948f..59fa0be 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1163,8 +1163,11 @@
 	do_div(reserved_blocks, 100);
 
 	ext4_blocks_count_set(es, ext4_blocks_count(es) + blocks_count);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + free_blocks);
 	le32_add_cpu(&es->s_inodes_count, EXT4_INODES_PER_GROUP(sb) *
 		     flex_gd->count);
+	le32_add_cpu(&es->s_free_inodes_count, EXT4_INODES_PER_GROUP(sb) *
+		     flex_gd->count);
 
 	/*
 	 * We need to protect s_groups_count against other CPUs seeing
@@ -1465,6 +1468,7 @@
 	}
 
 	ext4_blocks_count_set(es, o_blocks_count + add);
+	ext4_free_blocks_count_set(es, ext4_free_blocks_count(es) + add);
 	ext4_debug("freeing blocks %llu through %llu\n", o_blocks_count,
 		   o_blocks_count + add);
 	/* We add the blocks to the bitmap and set the group need init bit */
@@ -1512,16 +1516,17 @@
 	o_blocks_count = ext4_blocks_count(es);
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT4-fs: extending last group from %llu to %llu blocks\n",
-		       o_blocks_count, n_blocks_count);
+		ext4_msg(sb, KERN_DEBUG,
+			 "extending last group from %llu to %llu blocks",
+			 o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count == 0 || n_blocks_count == o_blocks_count)
 		return 0;
 
 	if (n_blocks_count > (sector_t)(~0ULL) >> (sb->s_blocksize_bits - 9)) {
-		printk(KERN_ERR "EXT4-fs: filesystem on %s:"
-			" too large to resize to %llu blocks safely\n",
-			sb->s_id, n_blocks_count);
+		ext4_msg(sb, KERN_ERR,
+			 "filesystem too large to resize to %llu blocks safely",
+			 n_blocks_count);
 		if (sizeof(sector_t) < 8)
 			ext4_warning(sb, "CONFIG_LBDAF not enabled");
 		return -EINVAL;
@@ -1582,7 +1587,7 @@
 	ext4_fsblk_t o_blocks_count;
 	ext4_group_t o_group;
 	ext4_group_t n_group;
-	ext4_grpblk_t offset;
+	ext4_grpblk_t offset, add;
 	unsigned long n_desc_blocks;
 	unsigned long o_desc_blocks;
 	unsigned long desc_blocks;
@@ -1591,8 +1596,8 @@
 	o_blocks_count = ext4_blocks_count(es);
 
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT4-fs: resizing filesystem from %llu "
-		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+		ext4_msg(sb, KERN_DEBUG, "resizing filesystem from %llu "
+		       "to %llu blocks", o_blocks_count, n_blocks_count);
 
 	if (n_blocks_count < o_blocks_count) {
 		/* On-line shrinking not supported */
@@ -1605,7 +1610,7 @@
 		return 0;
 
 	ext4_get_group_no_and_offset(sb, n_blocks_count - 1, &n_group, &offset);
-	ext4_get_group_no_and_offset(sb, o_blocks_count, &o_group, &offset);
+	ext4_get_group_no_and_offset(sb, o_blocks_count - 1, &o_group, &offset);
 
 	n_desc_blocks = (n_group + EXT4_DESC_PER_BLOCK(sb)) /
 			EXT4_DESC_PER_BLOCK(sb);
@@ -1634,10 +1639,12 @@
 	}
 	brelse(bh);
 
-	if (offset != 0) {
-		/* extend the last group */
-		ext4_grpblk_t add;
-		add = EXT4_BLOCKS_PER_GROUP(sb) - offset;
+	/* extend the last group */
+	if (n_group == o_group)
+		add = n_blocks_count - o_blocks_count;
+	else
+		add = EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1);
+	if (add > 0) {
 		err = ext4_group_extend_no_check(sb, o_blocks_count, add);
 		if (err)
 			goto out;
@@ -1674,7 +1681,7 @@
 
 	iput(resize_inode);
 	if (test_opt(sb, DEBUG))
-		printk(KERN_DEBUG "EXT4-fs: resized filesystem from %llu "
-		       "upto %llu blocks\n", o_blocks_count, n_blocks_count);
+		ext4_msg(sb, KERN_DEBUG, "resized filesystem from %llu "
+		       "up to %llu blocks", o_blocks_count, n_blocks_count);
 	return err;
 }
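
The ext4_get_group_no_and_offset() change is an off-by-one fix:
o_blocks_count is a count, one past the last valid block, so probing it
placed o_group/offset one block too far whenever the filesystem ended
exactly on a group boundary. With o_blocks_count - 1, offset is the index
of the last existing block within its group, leaving
EXT4_BLOCKS_PER_GROUP(sb) - (offset + 1) blocks to graft onto that group.
Checking with 32768 blocks per group: offset 32767 means the group is
already full (add = 0, go straight to adding whole groups), while offset
1000 leaves add = 31767 blocks to extend first.
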
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9339009..ceebaf8 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -62,6 +62,7 @@
 
 static int ext4_load_journal(struct super_block *, struct ext4_super_block *,
 			     unsigned long journal_devnum);
+static int ext4_show_options(struct seq_file *seq, struct dentry *root);
 static int ext4_commit_super(struct super_block *sb, int sync);
 static void ext4_mark_recovery_complete(struct super_block *sb,
 					struct ext4_super_block *es);
@@ -375,7 +376,7 @@
 	if (is_handle_aborted(handle))
 		return;
 
-	printk(KERN_ERR "%s:%d: aborting transaction: %s in %s\n",
+	printk(KERN_ERR "EXT4-fs: %s:%d: aborting transaction: %s in %s\n",
 	       caller, line, errstr, err_fn);
 
 	jbd2_journal_abort_handle(handle);
@@ -431,6 +432,22 @@
 	return bdi->dev == NULL;
 }
 
+static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
+{
+	struct super_block		*sb = journal->j_private;
+	struct ext4_sb_info		*sbi = EXT4_SB(sb);
+	int				error = is_journal_aborted(journal);
+	struct ext4_journal_cb_entry	*jce, *tmp;
+
+	spin_lock(&sbi->s_md_lock);
+	list_for_each_entry_safe(jce, tmp, &txn->t_private_list, jce_list) {
+		list_del_init(&jce->jce_list);
+		spin_unlock(&sbi->s_md_lock);
+		jce->jce_func(sb, jce, error);
+		spin_lock(&sbi->s_md_lock);
+	}
+	spin_unlock(&sbi->s_md_lock);
+}
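
Note the lock choreography here: s_md_lock is dropped around each
jce_func() invocation because the callbacks (ext4_free_data_callback() in
mballoc.c) take group locks and free memory, neither of which is safe
under a spinlock. Detaching the entry with list_del_init() while still
locked is what keeps the walk coherent; the general shape, with
hypothetical names:

	spin_lock(&lock);
	list_for_each_entry_safe(item, tmp, &head, list) {
		list_del_init(&item->list);	/* detach while locked */
		spin_unlock(&lock);
		item->func(item);	/* may sleep or take other locks */
		spin_lock(&lock);
	}
	spin_unlock(&lock);

This is safe here because nothing else touches t_private_list once the
transaction is committing.
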
 
 /* Deal with the reporting of failure conditions on a filesystem such as
  * inconsistencies detected or read IO failures.
@@ -498,11 +515,16 @@
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
-	       inode->i_sb->s_id, function, line, inode->i_ino);
 	if (block)
-		printk(KERN_CONT "block %llu: ", block);
-	printk(KERN_CONT "comm %s: %pV\n", current->comm, &vaf);
+		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+		       "inode #%lu: block %llu: comm %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       block, current->comm, &vaf);
+	else
+		printk(KERN_CRIT "EXT4-fs error (device %s): %s:%d: "
+		       "inode #%lu: comm %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       current->comm, &vaf);
 	va_end(args);
 
 	ext4_handle_error(inode->i_sb);
@@ -524,15 +546,21 @@
 	path = d_path(&(file->f_path), pathname, sizeof(pathname));
 	if (IS_ERR(path))
 		path = "(unknown)";
-	printk(KERN_CRIT
-	       "EXT4-fs error (device %s): %s:%d: inode #%lu: ",
-	       inode->i_sb->s_id, function, line, inode->i_ino);
-	if (block)
-		printk(KERN_CONT "block %llu: ", block);
 	va_start(args, fmt);
 	vaf.fmt = fmt;
 	vaf.va = &args;
-	printk(KERN_CONT "comm %s: path %s: %pV\n", current->comm, path, &vaf);
+	if (block)
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+		       "block %llu: comm %s: path %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       block, current->comm, path, &vaf);
+	else
+		printk(KERN_CRIT
+		       "EXT4-fs error (device %s): %s:%d: inode #%lu: "
+		       "comm %s: path %s: %pV\n",
+		       inode->i_sb->s_id, function, line, inode->i_ino,
+		       current->comm, path, &vaf);
 	va_end(args);
 
 	ext4_handle_error(inode->i_sb);
@@ -808,9 +836,6 @@
 	destroy_workqueue(sbi->dio_unwritten_wq);
 
 	lock_super(sb);
-	if (sb->s_dirt)
-		ext4_commit_super(sb, 1);
-
 	if (sbi->s_journal) {
 		err = jbd2_journal_destroy(sbi->s_journal);
 		sbi->s_journal = NULL;
@@ -827,9 +852,12 @@
 	if (!(sb->s_flags & MS_RDONLY)) {
 		EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER);
 		es->s_state = cpu_to_le16(sbi->s_mount_state);
-		ext4_commit_super(sb, 1);
 	}
+	if (sb->s_dirt || !(sb->s_flags & MS_RDONLY))
+		ext4_commit_super(sb, 1);
+
 	if (sbi->s_proc) {
+		remove_proc_entry("options", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 	kobject_del(&sbi->s_kobj);
@@ -990,180 +1018,6 @@
 	}
 }
 
-static inline void ext4_show_quota_options(struct seq_file *seq,
-					   struct super_block *sb)
-{
-#if defined(CONFIG_QUOTA)
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-
-	if (sbi->s_jquota_fmt) {
-		char *fmtname = "";
-
-		switch (sbi->s_jquota_fmt) {
-		case QFMT_VFS_OLD:
-			fmtname = "vfsold";
-			break;
-		case QFMT_VFS_V0:
-			fmtname = "vfsv0";
-			break;
-		case QFMT_VFS_V1:
-			fmtname = "vfsv1";
-			break;
-		}
-		seq_printf(seq, ",jqfmt=%s", fmtname);
-	}
-
-	if (sbi->s_qf_names[USRQUOTA])
-		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
-
-	if (sbi->s_qf_names[GRPQUOTA])
-		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
-
-	if (test_opt(sb, USRQUOTA))
-		seq_puts(seq, ",usrquota");
-
-	if (test_opt(sb, GRPQUOTA))
-		seq_puts(seq, ",grpquota");
-#endif
-}
-
-/*
- * Show an option if
- *  - it's set to a non-default value OR
- *  - if the per-sb default is different from the global default
- */
-static int ext4_show_options(struct seq_file *seq, struct dentry *root)
-{
-	int def_errors;
-	unsigned long def_mount_opts;
-	struct super_block *sb = root->d_sb;
-	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	struct ext4_super_block *es = sbi->s_es;
-
-	def_mount_opts = le32_to_cpu(es->s_default_mount_opts);
-	def_errors     = le16_to_cpu(es->s_errors);
-
-	if (sbi->s_sb_block != 1)
-		seq_printf(seq, ",sb=%llu", sbi->s_sb_block);
-	if (test_opt(sb, MINIX_DF))
-		seq_puts(seq, ",minixdf");
-	if (test_opt(sb, GRPID) && !(def_mount_opts & EXT4_DEFM_BSDGROUPS))
-		seq_puts(seq, ",grpid");
-	if (!test_opt(sb, GRPID) && (def_mount_opts & EXT4_DEFM_BSDGROUPS))
-		seq_puts(seq, ",nogrpid");
-	if (sbi->s_resuid != EXT4_DEF_RESUID ||
-	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) {
-		seq_printf(seq, ",resuid=%u", sbi->s_resuid);
-	}
-	if (sbi->s_resgid != EXT4_DEF_RESGID ||
-	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) {
-		seq_printf(seq, ",resgid=%u", sbi->s_resgid);
-	}
-	if (test_opt(sb, ERRORS_RO)) {
-		if (def_errors == EXT4_ERRORS_PANIC ||
-		    def_errors == EXT4_ERRORS_CONTINUE) {
-			seq_puts(seq, ",errors=remount-ro");
-		}
-	}
-	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
-		seq_puts(seq, ",errors=continue");
-	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
-		seq_puts(seq, ",errors=panic");
-	if (test_opt(sb, NO_UID32) && !(def_mount_opts & EXT4_DEFM_UID16))
-		seq_puts(seq, ",nouid32");
-	if (test_opt(sb, DEBUG) && !(def_mount_opts & EXT4_DEFM_DEBUG))
-		seq_puts(seq, ",debug");
-#ifdef CONFIG_EXT4_FS_XATTR
-	if (test_opt(sb, XATTR_USER))
-		seq_puts(seq, ",user_xattr");
-	if (!test_opt(sb, XATTR_USER))
-		seq_puts(seq, ",nouser_xattr");
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-	if (test_opt(sb, POSIX_ACL) && !(def_mount_opts & EXT4_DEFM_ACL))
-		seq_puts(seq, ",acl");
-	if (!test_opt(sb, POSIX_ACL) && (def_mount_opts & EXT4_DEFM_ACL))
-		seq_puts(seq, ",noacl");
-#endif
-	if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) {
-		seq_printf(seq, ",commit=%u",
-			   (unsigned) (sbi->s_commit_interval / HZ));
-	}
-	if (sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME) {
-		seq_printf(seq, ",min_batch_time=%u",
-			   (unsigned) sbi->s_min_batch_time);
-	}
-	if (sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME) {
-		seq_printf(seq, ",max_batch_time=%u",
-			   (unsigned) sbi->s_max_batch_time);
-	}
-
-	/*
-	 * We're changing the default of barrier mount option, so
-	 * let's always display its mount state so it's clear what its
-	 * status is.
-	 */
-	seq_puts(seq, ",barrier=");
-	seq_puts(seq, test_opt(sb, BARRIER) ? "1" : "0");
-	if (test_opt(sb, JOURNAL_ASYNC_COMMIT))
-		seq_puts(seq, ",journal_async_commit");
-	else if (test_opt(sb, JOURNAL_CHECKSUM))
-		seq_puts(seq, ",journal_checksum");
-	if (test_opt(sb, I_VERSION))
-		seq_puts(seq, ",i_version");
-	if (!test_opt(sb, DELALLOC) &&
-	    !(def_mount_opts & EXT4_DEFM_NODELALLOC))
-		seq_puts(seq, ",nodelalloc");
-
-	if (!test_opt(sb, MBLK_IO_SUBMIT))
-		seq_puts(seq, ",nomblk_io_submit");
-	if (sbi->s_stripe)
-		seq_printf(seq, ",stripe=%lu", sbi->s_stripe);
-	/*
-	 * journal mode get enabled in different ways
-	 * So just print the value even if we didn't specify it
-	 */
-	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
-		seq_puts(seq, ",data=journal");
-	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
-		seq_puts(seq, ",data=ordered");
-	else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
-		seq_puts(seq, ",data=writeback");
-
-	if (sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
-		seq_printf(seq, ",inode_readahead_blks=%u",
-			   sbi->s_inode_readahead_blks);
-
-	if (test_opt(sb, DATA_ERR_ABORT))
-		seq_puts(seq, ",data_err=abort");
-
-	if (test_opt(sb, NO_AUTO_DA_ALLOC))
-		seq_puts(seq, ",noauto_da_alloc");
-
-	if (test_opt(sb, DISCARD) && !(def_mount_opts & EXT4_DEFM_DISCARD))
-		seq_puts(seq, ",discard");
-
-	if (test_opt(sb, NOLOAD))
-		seq_puts(seq, ",norecovery");
-
-	if (test_opt(sb, DIOREAD_NOLOCK))
-		seq_puts(seq, ",dioread_nolock");
-
-	if (test_opt(sb, BLOCK_VALIDITY) &&
-	    !(def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY))
-		seq_puts(seq, ",block_validity");
-
-	if (!test_opt(sb, INIT_INODE_TABLE))
-		seq_puts(seq, ",noinit_itable");
-	else if (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)
-		seq_printf(seq, ",init_itable=%u",
-			   (unsigned) sbi->s_li_wait_mult);
-
-	ext4_show_quota_options(seq, sb);
-
-	return 0;
-}
-
 static struct inode *ext4_nfs_get_inode(struct super_block *sb,
 					u64 ino, u32 generation)
 {
@@ -1316,18 +1170,17 @@
 enum {
 	Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid,
 	Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, Opt_err_ro,
-	Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov,
+	Opt_nouid32, Opt_debug, Opt_removed,
 	Opt_user_xattr, Opt_nouser_xattr, Opt_acl, Opt_noacl,
-	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload, Opt_nobh, Opt_bh,
+	Opt_auto_da_alloc, Opt_noauto_da_alloc, Opt_noload,
 	Opt_commit, Opt_min_batch_time, Opt_max_batch_time,
-	Opt_journal_update, Opt_journal_dev,
-	Opt_journal_checksum, Opt_journal_async_commit,
+	Opt_journal_dev, Opt_journal_checksum, Opt_journal_async_commit,
 	Opt_abort, Opt_data_journal, Opt_data_ordered, Opt_data_writeback,
 	Opt_data_err_abort, Opt_data_err_ignore,
 	Opt_usrjquota, Opt_grpjquota, Opt_offusrjquota, Opt_offgrpjquota,
 	Opt_jqfmt_vfsold, Opt_jqfmt_vfsv0, Opt_jqfmt_vfsv1, Opt_quota,
-	Opt_noquota, Opt_ignore, Opt_barrier, Opt_nobarrier, Opt_err,
-	Opt_resize, Opt_usrquota, Opt_grpquota, Opt_i_version,
+	Opt_noquota, Opt_barrier, Opt_nobarrier, Opt_err,
+	Opt_usrquota, Opt_grpquota, Opt_i_version,
 	Opt_stripe, Opt_delalloc, Opt_nodelalloc, Opt_mblk_io_submit,
 	Opt_nomblk_io_submit, Opt_block_validity, Opt_noblock_validity,
 	Opt_inode_readahead_blks, Opt_journal_ioprio,
@@ -1350,20 +1203,19 @@
 	{Opt_err_ro, "errors=remount-ro"},
 	{Opt_nouid32, "nouid32"},
 	{Opt_debug, "debug"},
-	{Opt_oldalloc, "oldalloc"},
-	{Opt_orlov, "orlov"},
+	{Opt_removed, "oldalloc"},
+	{Opt_removed, "orlov"},
 	{Opt_user_xattr, "user_xattr"},
 	{Opt_nouser_xattr, "nouser_xattr"},
 	{Opt_acl, "acl"},
 	{Opt_noacl, "noacl"},
-	{Opt_noload, "noload"},
 	{Opt_noload, "norecovery"},
-	{Opt_nobh, "nobh"},
-	{Opt_bh, "bh"},
+	{Opt_noload, "noload"},
+	{Opt_removed, "nobh"},
+	{Opt_removed, "bh"},
 	{Opt_commit, "commit=%u"},
 	{Opt_min_batch_time, "min_batch_time=%u"},
 	{Opt_max_batch_time, "max_batch_time=%u"},
-	{Opt_journal_update, "journal=update"},
 	{Opt_journal_dev, "journal_dev=%u"},
 	{Opt_journal_checksum, "journal_checksum"},
 	{Opt_journal_async_commit, "journal_async_commit"},
@@ -1389,7 +1241,6 @@
 	{Opt_nobarrier, "nobarrier"},
 	{Opt_i_version, "i_version"},
 	{Opt_stripe, "stripe=%u"},
-	{Opt_resize, "resize"},
 	{Opt_delalloc, "delalloc"},
 	{Opt_nodelalloc, "nodelalloc"},
 	{Opt_mblk_io_submit, "mblk_io_submit"},
@@ -1408,6 +1259,11 @@
 	{Opt_init_itable, "init_itable=%u"},
 	{Opt_init_itable, "init_itable"},
 	{Opt_noinit_itable, "noinit_itable"},
+	{Opt_removed, "check=none"},	/* mount option from ext2/3 */
+	{Opt_removed, "nocheck"},	/* mount option from ext2/3 */
+	{Opt_removed, "reservation"},	/* mount option from ext2/3 */
+	{Opt_removed, "noreservation"}, /* mount option from ext2/3 */
+	{Opt_removed, "journal=%u"},	/* mount option from ext2/3 */
 	{Opt_err, NULL},
 };
 
@@ -1496,420 +1352,273 @@
 }
 #endif
 
+#define MOPT_SET	0x0001
+#define MOPT_CLEAR	0x0002
+#define MOPT_NOSUPPORT	0x0004
+#define MOPT_EXPLICIT	0x0008
+#define MOPT_CLEAR_ERR	0x0010
+#define MOPT_GTE0	0x0020
+#ifdef CONFIG_QUOTA
+#define MOPT_Q		0
+#define MOPT_QFMT	0x0040
+#else
+#define MOPT_Q		MOPT_NOSUPPORT
+#define MOPT_QFMT	MOPT_NOSUPPORT
+#endif
+#define MOPT_DATAJ	0x0080
+
+static const struct mount_opts {
+	int	token;
+	int	mount_opt;
+	int	flags;
+} ext4_mount_opts[] = {
+	{Opt_minix_df, EXT4_MOUNT_MINIX_DF, MOPT_SET},
+	{Opt_bsd_df, EXT4_MOUNT_MINIX_DF, MOPT_CLEAR},
+	{Opt_grpid, EXT4_MOUNT_GRPID, MOPT_SET},
+	{Opt_nogrpid, EXT4_MOUNT_GRPID, MOPT_CLEAR},
+	{Opt_mblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_SET},
+	{Opt_nomblk_io_submit, EXT4_MOUNT_MBLK_IO_SUBMIT, MOPT_CLEAR},
+	{Opt_block_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_SET},
+	{Opt_noblock_validity, EXT4_MOUNT_BLOCK_VALIDITY, MOPT_CLEAR},
+	{Opt_dioread_nolock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_SET},
+	{Opt_dioread_lock, EXT4_MOUNT_DIOREAD_NOLOCK, MOPT_CLEAR},
+	{Opt_discard, EXT4_MOUNT_DISCARD, MOPT_SET},
+	{Opt_nodiscard, EXT4_MOUNT_DISCARD, MOPT_CLEAR},
+	{Opt_delalloc, EXT4_MOUNT_DELALLOC, MOPT_SET | MOPT_EXPLICIT},
+	{Opt_nodelalloc, EXT4_MOUNT_DELALLOC, MOPT_CLEAR | MOPT_EXPLICIT},
+	{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM, MOPT_SET},
+	{Opt_journal_async_commit, (EXT4_MOUNT_JOURNAL_ASYNC_COMMIT |
+				    EXT4_MOUNT_JOURNAL_CHECKSUM), MOPT_SET},
+	{Opt_noload, EXT4_MOUNT_NOLOAD, MOPT_SET},
+	{Opt_err_panic, EXT4_MOUNT_ERRORS_PANIC, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_ro, EXT4_MOUNT_ERRORS_RO, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_err_cont, EXT4_MOUNT_ERRORS_CONT, MOPT_SET | MOPT_CLEAR_ERR},
+	{Opt_data_err_abort, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_SET},
+	{Opt_data_err_ignore, EXT4_MOUNT_DATA_ERR_ABORT, MOPT_CLEAR},
+	{Opt_barrier, EXT4_MOUNT_BARRIER, MOPT_SET},
+	{Opt_nobarrier, EXT4_MOUNT_BARRIER, MOPT_CLEAR},
+	{Opt_noauto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_SET},
+	{Opt_auto_da_alloc, EXT4_MOUNT_NO_AUTO_DA_ALLOC, MOPT_CLEAR},
+	{Opt_noinit_itable, EXT4_MOUNT_INIT_INODE_TABLE, MOPT_CLEAR},
+	{Opt_commit, 0, MOPT_GTE0},
+	{Opt_max_batch_time, 0, MOPT_GTE0},
+	{Opt_min_batch_time, 0, MOPT_GTE0},
+	{Opt_inode_readahead_blks, 0, MOPT_GTE0},
+	{Opt_init_itable, 0, MOPT_GTE0},
+	{Opt_stripe, 0, MOPT_GTE0},
+	{Opt_data_journal, EXT4_MOUNT_JOURNAL_DATA, MOPT_DATAJ},
+	{Opt_data_ordered, EXT4_MOUNT_ORDERED_DATA, MOPT_DATAJ},
+	{Opt_data_writeback, EXT4_MOUNT_WRITEBACK_DATA, MOPT_DATAJ},
+#ifdef CONFIG_EXT4_FS_XATTR
+	{Opt_user_xattr, EXT4_MOUNT_XATTR_USER, MOPT_SET},
+	{Opt_nouser_xattr, EXT4_MOUNT_XATTR_USER, MOPT_CLEAR},
+#else
+	{Opt_user_xattr, 0, MOPT_NOSUPPORT},
+	{Opt_nouser_xattr, 0, MOPT_NOSUPPORT},
+#endif
+#ifdef CONFIG_EXT4_FS_POSIX_ACL
+	{Opt_acl, EXT4_MOUNT_POSIX_ACL, MOPT_SET},
+	{Opt_noacl, EXT4_MOUNT_POSIX_ACL, MOPT_CLEAR},
+#else
+	{Opt_acl, 0, MOPT_NOSUPPORT},
+	{Opt_noacl, 0, MOPT_NOSUPPORT},
+#endif
+	{Opt_nouid32, EXT4_MOUNT_NO_UID32, MOPT_SET},
+	{Opt_debug, EXT4_MOUNT_DEBUG, MOPT_SET},
+	{Opt_quota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA, MOPT_SET | MOPT_Q},
+	{Opt_usrquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA,
+							MOPT_SET | MOPT_Q},
+	{Opt_grpquota, EXT4_MOUNT_QUOTA | EXT4_MOUNT_GRPQUOTA,
+							MOPT_SET | MOPT_Q},
+	{Opt_noquota, (EXT4_MOUNT_QUOTA | EXT4_MOUNT_USRQUOTA |
+		       EXT4_MOUNT_GRPQUOTA), MOPT_CLEAR | MOPT_Q},
+	{Opt_usrjquota, 0, MOPT_Q},
+	{Opt_grpjquota, 0, MOPT_Q},
+	{Opt_offusrjquota, 0, MOPT_Q},
+	{Opt_offgrpjquota, 0, MOPT_Q},
+	{Opt_jqfmt_vfsold, QFMT_VFS_OLD, MOPT_QFMT},
+	{Opt_jqfmt_vfsv0, QFMT_VFS_V0, MOPT_QFMT},
+	{Opt_jqfmt_vfsv1, QFMT_VFS_V1, MOPT_QFMT},
+	{Opt_err, 0, 0}
+};
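
Each row binds a token to the EXT4_MOUNT_* bit it controls plus behaviour
flags, so a simple boolean option costs no dedicated parsing code. Adding
a hypothetical on/off pair (names invented for illustration, not part of
this patch) would take only two rows:

	{Opt_foo,   EXT4_MOUNT_FOO, MOPT_SET},
	{Opt_nofoo, EXT4_MOUNT_FOO, MOPT_CLEAR},

plus the matching strings in the tokens table; the generic tail of
handle_mount_opt() below then sets or clears the bit, including optional
foo=0 / foo=1 arguments.
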
+
+static int handle_mount_opt(struct super_block *sb, char *opt, int token,
+			    substring_t *args, unsigned long *journal_devnum,
+			    unsigned int *journal_ioprio, int is_remount)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	const struct mount_opts *m;
+	int arg = 0;
+
+	if (args->from && match_int(args, &arg))
+		return -1;
+	switch (token) {
+	case Opt_noacl:
+	case Opt_nouser_xattr:
+		ext4_msg(sb, KERN_WARNING, deprecated_msg, opt, "3.5");
+		break;
+	case Opt_sb:
+		return 1;	/* handled by get_sb_block() */
+	case Opt_removed:
+		ext4_msg(sb, KERN_WARNING,
+			 "Ignoring removed %s option", opt);
+		return 1;
+	case Opt_resuid:
+		sbi->s_resuid = arg;
+		return 1;
+	case Opt_resgid:
+		sbi->s_resgid = arg;
+		return 1;
+	case Opt_abort:
+		sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
+		return 1;
+	case Opt_i_version:
+		sb->s_flags |= MS_I_VERSION;
+		return 1;
+	case Opt_journal_dev:
+		if (is_remount) {
+			ext4_msg(sb, KERN_ERR,
+				 "Cannot specify journal on remount");
+			return -1;
+		}
+		*journal_devnum = arg;
+		return 1;
+	case Opt_journal_ioprio:
+		if (arg < 0 || arg > 7)
+			return -1;
+		*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg);
+		return 1;
+	}
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+		if (token != m->token)
+			continue;
+		if (args->from && (m->flags & MOPT_GTE0) && (arg < 0))
+			return -1;
+		if (m->flags & MOPT_EXPLICIT)
+			set_opt2(sb, EXPLICIT_DELALLOC);
+		if (m->flags & MOPT_CLEAR_ERR)
+			clear_opt(sb, ERRORS_MASK);
+		if (token == Opt_noquota && sb_any_quota_loaded(sb)) {
+			ext4_msg(sb, KERN_ERR, "Cannot change quota "
+				 "options when quota turned on");
+			return -1;
+		}
+
+		if (m->flags & MOPT_NOSUPPORT) {
+			ext4_msg(sb, KERN_ERR, "%s option not supported", opt);
+		} else if (token == Opt_commit) {
+			if (arg == 0)
+				arg = JBD2_DEFAULT_MAX_COMMIT_AGE;
+			sbi->s_commit_interval = HZ * arg;
+		} else if (token == Opt_max_batch_time) {
+			if (arg == 0)
+				arg = EXT4_DEF_MAX_BATCH_TIME;
+			sbi->s_max_batch_time = arg;
+		} else if (token == Opt_min_batch_time) {
+			sbi->s_min_batch_time = arg;
+		} else if (token == Opt_inode_readahead_blks) {
+			if (arg > (1 << 30))
+				return -1;
+			if (arg && !is_power_of_2(arg)) {
+				ext4_msg(sb, KERN_ERR,
+					 "inode_readahead_blks must be "
+					 "a power of 2");
+				return -1;
+			}
+			sbi->s_inode_readahead_blks = arg;
+		} else if (token == Opt_init_itable) {
+			set_opt(sb, INIT_INODE_TABLE);
+			if (!args->from)
+				arg = EXT4_DEF_LI_WAIT_MULT;
+			sbi->s_li_wait_mult = arg;
+		} else if (token == Opt_stripe) {
+			sbi->s_stripe = arg;
+		} else if (m->flags & MOPT_DATAJ) {
+			if (is_remount) {
+				if (!sbi->s_journal)
+					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
+				else if (test_opt(sb, DATA_FLAGS) !=
+					 m->mount_opt) {
+					ext4_msg(sb, KERN_ERR,
+					 "Cannot change data mode on remount");
+					return -1;
+				}
+			} else {
+				clear_opt(sb, DATA_FLAGS);
+				sbi->s_mount_opt |= m->mount_opt;
+			}
+#ifdef CONFIG_QUOTA
+		} else if (token == Opt_usrjquota) {
+			if (!set_qf_name(sb, USRQUOTA, &args[0]))
+				return -1;
+		} else if (token == Opt_grpjquota) {
+			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
+				return -1;
+		} else if (token == Opt_offusrjquota) {
+			if (!clear_qf_name(sb, USRQUOTA))
+				return -1;
+		} else if (token == Opt_offgrpjquota) {
+			if (!clear_qf_name(sb, GRPQUOTA))
+				return -1;
+		} else if (m->flags & MOPT_QFMT) {
+			if (sb_any_quota_loaded(sb) &&
+			    sbi->s_jquota_fmt != m->mount_opt) {
+				ext4_msg(sb, KERN_ERR, "Cannot "
+					 "change journaled quota options "
+					 "when quota turned on");
+				return -1;
+			}
+			sbi->s_jquota_fmt = m->mount_opt;
+#endif
+		} else {
+			if (!args->from)
+				arg = 1;
+			if (m->flags & MOPT_CLEAR)
+				arg = !arg;
+			else if (unlikely(!(m->flags & MOPT_SET))) {
+				ext4_msg(sb, KERN_WARNING,
+					 "buggy handling of option %s", opt);
+				WARN_ON(1);
+				return -1;
+			}
+			if (arg != 0)
+				sbi->s_mount_opt |= m->mount_opt;
+			else
+				sbi->s_mount_opt &= ~m->mount_opt;
+		}
+		return 1;
+	}
+	ext4_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" "
+		 "or missing value", opt);
+	return -1;
+}
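
The generic tail gives every table-driven boolean the same four spellings
for free. Tracing "nobarrier" through it: the option carries no argument,
so arg defaults to 1; the entry is MOPT_CLEAR, so arg is inverted to 0 and
the EXT4_MOUNT_BARRIER bit is cleared. "barrier=0" reaches the same state
through the MOPT_SET entry with an explicit zero argument, while plain
"barrier" or "barrier=1" sets the bit.
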
+
 static int parse_options(char *options, struct super_block *sb,
 			 unsigned long *journal_devnum,
 			 unsigned int *journal_ioprio,
-			 ext4_fsblk_t *n_blocks_count, int is_remount)
+			 int is_remount)
 {
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
 	char *p;
 	substring_t args[MAX_OPT_ARGS];
-	int data_opt = 0;
-	int option;
-#ifdef CONFIG_QUOTA
-	int qfmt;
-#endif
+	int token;
 
 	if (!options)
 		return 1;
 
 	while ((p = strsep(&options, ",")) != NULL) {
-		int token;
 		if (!*p)
 			continue;
-
 		/*
 		 * Initialize args struct so we know whether arg was
 		 * found; some options take optional arguments.
 		 */
-		args[0].to = args[0].from = NULL;
+		args[0].to = args[0].from = 0;
 		token = match_token(p, tokens, args);
-		switch (token) {
-		case Opt_bsd_df:
-			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			clear_opt(sb, MINIX_DF);
-			break;
-		case Opt_minix_df:
-			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			set_opt(sb, MINIX_DF);
-
-			break;
-		case Opt_grpid:
-			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			set_opt(sb, GRPID);
-
-			break;
-		case Opt_nogrpid:
-			ext4_msg(sb, KERN_WARNING, deprecated_msg, p, "2.6.38");
-			clear_opt(sb, GRPID);
-
-			break;
-		case Opt_resuid:
-			if (match_int(&args[0], &option))
-				return 0;
-			sbi->s_resuid = option;
-			break;
-		case Opt_resgid:
-			if (match_int(&args[0], &option))
-				return 0;
-			sbi->s_resgid = option;
-			break;
-		case Opt_sb:
-			/* handled by get_sb_block() instead of here */
-			/* *sb_block = match_int(&args[0]); */
-			break;
-		case Opt_err_panic:
-			clear_opt(sb, ERRORS_CONT);
-			clear_opt(sb, ERRORS_RO);
-			set_opt(sb, ERRORS_PANIC);
-			break;
-		case Opt_err_ro:
-			clear_opt(sb, ERRORS_CONT);
-			clear_opt(sb, ERRORS_PANIC);
-			set_opt(sb, ERRORS_RO);
-			break;
-		case Opt_err_cont:
-			clear_opt(sb, ERRORS_RO);
-			clear_opt(sb, ERRORS_PANIC);
-			set_opt(sb, ERRORS_CONT);
-			break;
-		case Opt_nouid32:
-			set_opt(sb, NO_UID32);
-			break;
-		case Opt_debug:
-			set_opt(sb, DEBUG);
-			break;
-		case Opt_oldalloc:
-			ext4_msg(sb, KERN_WARNING,
-				 "Ignoring deprecated oldalloc option");
-			break;
-		case Opt_orlov:
-			ext4_msg(sb, KERN_WARNING,
-				 "Ignoring deprecated orlov option");
-			break;
-#ifdef CONFIG_EXT4_FS_XATTR
-		case Opt_user_xattr:
-			set_opt(sb, XATTR_USER);
-			break;
-		case Opt_nouser_xattr:
-			clear_opt(sb, XATTR_USER);
-			break;
-#else
-		case Opt_user_xattr:
-		case Opt_nouser_xattr:
-			ext4_msg(sb, KERN_ERR, "(no)user_xattr options not supported");
-			break;
-#endif
-#ifdef CONFIG_EXT4_FS_POSIX_ACL
-		case Opt_acl:
-			set_opt(sb, POSIX_ACL);
-			break;
-		case Opt_noacl:
-			clear_opt(sb, POSIX_ACL);
-			break;
-#else
-		case Opt_acl:
-		case Opt_noacl:
-			ext4_msg(sb, KERN_ERR, "(no)acl options not supported");
-			break;
-#endif
-		case Opt_journal_update:
-			/* @@@ FIXME */
-			/* Eventually we will want to be able to create
-			   a journal file here.  For now, only allow the
-			   user to specify an existing inode to be the
-			   journal file. */
-			if (is_remount) {
-				ext4_msg(sb, KERN_ERR,
-					 "Cannot specify journal on remount");
-				return 0;
-			}
-			set_opt(sb, UPDATE_JOURNAL);
-			break;
-		case Opt_journal_dev:
-			if (is_remount) {
-				ext4_msg(sb, KERN_ERR,
-					"Cannot specify journal on remount");
-				return 0;
-			}
-			if (match_int(&args[0], &option))
-				return 0;
-			*journal_devnum = option;
-			break;
-		case Opt_journal_checksum:
-			set_opt(sb, JOURNAL_CHECKSUM);
-			break;
-		case Opt_journal_async_commit:
-			set_opt(sb, JOURNAL_ASYNC_COMMIT);
-			set_opt(sb, JOURNAL_CHECKSUM);
-			break;
-		case Opt_noload:
-			set_opt(sb, NOLOAD);
-			break;
-		case Opt_commit:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			if (option == 0)
-				option = JBD2_DEFAULT_MAX_COMMIT_AGE;
-			sbi->s_commit_interval = HZ * option;
-			break;
-		case Opt_max_batch_time:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			if (option == 0)
-				option = EXT4_DEF_MAX_BATCH_TIME;
-			sbi->s_max_batch_time = option;
-			break;
-		case Opt_min_batch_time:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			sbi->s_min_batch_time = option;
-			break;
-		case Opt_data_journal:
-			data_opt = EXT4_MOUNT_JOURNAL_DATA;
-			goto datacheck;
-		case Opt_data_ordered:
-			data_opt = EXT4_MOUNT_ORDERED_DATA;
-			goto datacheck;
-		case Opt_data_writeback:
-			data_opt = EXT4_MOUNT_WRITEBACK_DATA;
-		datacheck:
-			if (is_remount) {
-				if (!sbi->s_journal)
-					ext4_msg(sb, KERN_WARNING, "Remounting file system with no journal so ignoring journalled data option");
-				else if (test_opt(sb, DATA_FLAGS) != data_opt) {
-					ext4_msg(sb, KERN_ERR,
-						"Cannot change data mode on remount");
-					return 0;
-				}
-			} else {
-				clear_opt(sb, DATA_FLAGS);
-				sbi->s_mount_opt |= data_opt;
-			}
-			break;
-		case Opt_data_err_abort:
-			set_opt(sb, DATA_ERR_ABORT);
-			break;
-		case Opt_data_err_ignore:
-			clear_opt(sb, DATA_ERR_ABORT);
-			break;
-#ifdef CONFIG_QUOTA
-		case Opt_usrjquota:
-			if (!set_qf_name(sb, USRQUOTA, &args[0]))
-				return 0;
-			break;
-		case Opt_grpjquota:
-			if (!set_qf_name(sb, GRPQUOTA, &args[0]))
-				return 0;
-			break;
-		case Opt_offusrjquota:
-			if (!clear_qf_name(sb, USRQUOTA))
-				return 0;
-			break;
-		case Opt_offgrpjquota:
-			if (!clear_qf_name(sb, GRPQUOTA))
-				return 0;
-			break;
-
-		case Opt_jqfmt_vfsold:
-			qfmt = QFMT_VFS_OLD;
-			goto set_qf_format;
-		case Opt_jqfmt_vfsv0:
-			qfmt = QFMT_VFS_V0;
-			goto set_qf_format;
-		case Opt_jqfmt_vfsv1:
-			qfmt = QFMT_VFS_V1;
-set_qf_format:
-			if (sb_any_quota_loaded(sb) &&
-			    sbi->s_jquota_fmt != qfmt) {
-				ext4_msg(sb, KERN_ERR, "Cannot change "
-					"journaled quota options when "
-					"quota turned on");
-				return 0;
-			}
-			sbi->s_jquota_fmt = qfmt;
-			break;
-		case Opt_quota:
-		case Opt_usrquota:
-			set_opt(sb, QUOTA);
-			set_opt(sb, USRQUOTA);
-			break;
-		case Opt_grpquota:
-			set_opt(sb, QUOTA);
-			set_opt(sb, GRPQUOTA);
-			break;
-		case Opt_noquota:
-			if (sb_any_quota_loaded(sb)) {
-				ext4_msg(sb, KERN_ERR, "Cannot change quota "
-					"options when quota turned on");
-				return 0;
-			}
-			clear_opt(sb, QUOTA);
-			clear_opt(sb, USRQUOTA);
-			clear_opt(sb, GRPQUOTA);
-			break;
-#else
-		case Opt_quota:
-		case Opt_usrquota:
-		case Opt_grpquota:
-			ext4_msg(sb, KERN_ERR,
-				"quota options not supported");
-			break;
-		case Opt_usrjquota:
-		case Opt_grpjquota:
-		case Opt_offusrjquota:
-		case Opt_offgrpjquota:
-		case Opt_jqfmt_vfsold:
-		case Opt_jqfmt_vfsv0:
-		case Opt_jqfmt_vfsv1:
-			ext4_msg(sb, KERN_ERR,
-				"journaled quota options not supported");
-			break;
-		case Opt_noquota:
-			break;
-#endif
-		case Opt_abort:
-			sbi->s_mount_flags |= EXT4_MF_FS_ABORTED;
-			break;
-		case Opt_nobarrier:
-			clear_opt(sb, BARRIER);
-			break;
-		case Opt_barrier:
-			if (args[0].from) {
-				if (match_int(&args[0], &option))
-					return 0;
-			} else
-				option = 1;	/* No argument, default to 1 */
-			if (option)
-				set_opt(sb, BARRIER);
-			else
-				clear_opt(sb, BARRIER);
-			break;
-		case Opt_ignore:
-			break;
-		case Opt_resize:
-			if (!is_remount) {
-				ext4_msg(sb, KERN_ERR,
-					"resize option only available "
-					"for remount");
-				return 0;
-			}
-			if (match_int(&args[0], &option) != 0)
-				return 0;
-			*n_blocks_count = option;
-			break;
-		case Opt_nobh:
-			ext4_msg(sb, KERN_WARNING,
-				 "Ignoring deprecated nobh option");
-			break;
-		case Opt_bh:
-			ext4_msg(sb, KERN_WARNING,
-				 "Ignoring deprecated bh option");
-			break;
-		case Opt_i_version:
-			set_opt(sb, I_VERSION);
-			sb->s_flags |= MS_I_VERSION;
-			break;
-		case Opt_nodelalloc:
-			clear_opt(sb, DELALLOC);
-			clear_opt2(sb, EXPLICIT_DELALLOC);
-			break;
-		case Opt_mblk_io_submit:
-			set_opt(sb, MBLK_IO_SUBMIT);
-			break;
-		case Opt_nomblk_io_submit:
-			clear_opt(sb, MBLK_IO_SUBMIT);
-			break;
-		case Opt_stripe:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0)
-				return 0;
-			sbi->s_stripe = option;
-			break;
-		case Opt_delalloc:
-			set_opt(sb, DELALLOC);
-			set_opt2(sb, EXPLICIT_DELALLOC);
-			break;
-		case Opt_block_validity:
-			set_opt(sb, BLOCK_VALIDITY);
-			break;
-		case Opt_noblock_validity:
-			clear_opt(sb, BLOCK_VALIDITY);
-			break;
-		case Opt_inode_readahead_blks:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0 || option > (1 << 30))
-				return 0;
-			if (option && !is_power_of_2(option)) {
-				ext4_msg(sb, KERN_ERR,
-					 "EXT4-fs: inode_readahead_blks"
-					 " must be a power of 2");
-				return 0;
-			}
-			sbi->s_inode_readahead_blks = option;
-			break;
-		case Opt_journal_ioprio:
-			if (match_int(&args[0], &option))
-				return 0;
-			if (option < 0 || option > 7)
-				break;
-			*journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE,
-							    option);
-			break;
-		case Opt_noauto_da_alloc:
-			set_opt(sb, NO_AUTO_DA_ALLOC);
-			break;
-		case Opt_auto_da_alloc:
-			if (args[0].from) {
-				if (match_int(&args[0], &option))
-					return 0;
-			} else
-				option = 1;	/* No argument, default to 1 */
-			if (option)
-				clear_opt(sb, NO_AUTO_DA_ALLOC);
-			else
-				set_opt(sb,NO_AUTO_DA_ALLOC);
-			break;
-		case Opt_discard:
-			set_opt(sb, DISCARD);
-			break;
-		case Opt_nodiscard:
-			clear_opt(sb, DISCARD);
-			break;
-		case Opt_dioread_nolock:
-			set_opt(sb, DIOREAD_NOLOCK);
-			break;
-		case Opt_dioread_lock:
-			clear_opt(sb, DIOREAD_NOLOCK);
-			break;
-		case Opt_init_itable:
-			set_opt(sb, INIT_INODE_TABLE);
-			if (args[0].from) {
-				if (match_int(&args[0], &option))
-					return 0;
-			} else
-				option = EXT4_DEF_LI_WAIT_MULT;
-			if (option < 0)
-				return 0;
-			sbi->s_li_wait_mult = option;
-			break;
-		case Opt_noinit_itable:
-			clear_opt(sb, INIT_INODE_TABLE);
-			break;
-		default:
-			ext4_msg(sb, KERN_ERR,
-			       "Unrecognized mount option \"%s\" "
-			       "or missing value", p);
+		if (handle_mount_opt(sb, p, token, args, journal_devnum,
+				     journal_ioprio, is_remount) < 0)
 			return 0;
-		}
 	}
 #ifdef CONFIG_QUOTA
 	if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA]) {
@@ -1942,6 +1651,160 @@
 	return 1;
 }
 
+static inline void ext4_show_quota_options(struct seq_file *seq,
+					   struct super_block *sb)
+{
+#if defined(CONFIG_QUOTA)
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+
+	if (sbi->s_jquota_fmt) {
+		char *fmtname = "";
+
+		switch (sbi->s_jquota_fmt) {
+		case QFMT_VFS_OLD:
+			fmtname = "vfsold";
+			break;
+		case QFMT_VFS_V0:
+			fmtname = "vfsv0";
+			break;
+		case QFMT_VFS_V1:
+			fmtname = "vfsv1";
+			break;
+		}
+		seq_printf(seq, ",jqfmt=%s", fmtname);
+	}
+
+	if (sbi->s_qf_names[USRQUOTA])
+		seq_printf(seq, ",usrjquota=%s", sbi->s_qf_names[USRQUOTA]);
+
+	if (sbi->s_qf_names[GRPQUOTA])
+		seq_printf(seq, ",grpjquota=%s", sbi->s_qf_names[GRPQUOTA]);
+
+	if (test_opt(sb, USRQUOTA))
+		seq_puts(seq, ",usrquota");
+
+	if (test_opt(sb, GRPQUOTA))
+		seq_puts(seq, ",grpquota");
+#endif
+}
+
+static const char *token2str(int token)
+{
+	const struct match_token *t;
+
+	for (t = tokens; t->token != Opt_err; t++)
+		if (t->token == token && !strchr(t->pattern, '='))
+			break;
+	return t->pattern;
+}
+
+/*
+ * Show an option if
+ *  - it's set to a non-default value OR
+ *  - if the per-sb default is different from the global default
+ */
+static int _ext4_show_options(struct seq_file *seq, struct super_block *sb,
+			      int nodefs)
+{
+	struct ext4_sb_info *sbi = EXT4_SB(sb);
+	struct ext4_super_block *es = sbi->s_es;
+	int def_errors, def_mount_opt = nodefs ? 0 : sbi->s_def_mount_opt;
+	const struct mount_opts *m;
+	char sep = nodefs ? '\n' : ',';
+
+#define SEQ_OPTS_PUTS(str) seq_printf(seq, "%c" str, sep)
+#define SEQ_OPTS_PRINT(str, arg) seq_printf(seq, "%c" str, sep, arg)
+
+	if (sbi->s_sb_block != 1)
+		SEQ_OPTS_PRINT("sb=%llu", sbi->s_sb_block);
+
+	for (m = ext4_mount_opts; m->token != Opt_err; m++) {
+		int want_set = m->flags & MOPT_SET;
+		if (((m->flags & (MOPT_SET|MOPT_CLEAR)) == 0) ||
+		    (m->flags & MOPT_CLEAR_ERR))
+			continue;
+		if (!(m->mount_opt & (sbi->s_mount_opt ^ def_mount_opt)))
+			continue; /* skip if same as the default */
+		if ((want_set &&
+		     (sbi->s_mount_opt & m->mount_opt) != m->mount_opt) ||
+		    (!want_set && (sbi->s_mount_opt & m->mount_opt)))
+			continue; /* select Opt_noFoo vs Opt_Foo */
+		SEQ_OPTS_PRINT("%s", token2str(m->token));
+	}
+
+	if (nodefs || sbi->s_resuid != EXT4_DEF_RESUID ||
+	    le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID)
+		SEQ_OPTS_PRINT("resuid=%u", sbi->s_resuid);
+	if (nodefs || sbi->s_resgid != EXT4_DEF_RESGID ||
+	    le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID)
+		SEQ_OPTS_PRINT("resgid=%u", sbi->s_resgid);
+	def_errors = nodefs ? -1 : le16_to_cpu(es->s_errors);
+	if (test_opt(sb, ERRORS_RO) && def_errors != EXT4_ERRORS_RO)
+		SEQ_OPTS_PUTS("errors=remount-ro");
+	if (test_opt(sb, ERRORS_CONT) && def_errors != EXT4_ERRORS_CONTINUE)
+		SEQ_OPTS_PUTS("errors=continue");
+	if (test_opt(sb, ERRORS_PANIC) && def_errors != EXT4_ERRORS_PANIC)
+		SEQ_OPTS_PUTS("errors=panic");
+	if (nodefs || sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ)
+		SEQ_OPTS_PRINT("commit=%lu", sbi->s_commit_interval / HZ);
+	if (nodefs || sbi->s_min_batch_time != EXT4_DEF_MIN_BATCH_TIME)
+		SEQ_OPTS_PRINT("min_batch_time=%u", sbi->s_min_batch_time);
+	if (nodefs || sbi->s_max_batch_time != EXT4_DEF_MAX_BATCH_TIME)
+		SEQ_OPTS_PRINT("max_batch_time=%u", sbi->s_max_batch_time);
+	if (sb->s_flags & MS_I_VERSION)
+		SEQ_OPTS_PUTS("i_version");
+	if (nodefs || sbi->s_stripe)
+		SEQ_OPTS_PRINT("stripe=%lu", sbi->s_stripe);
+	if (EXT4_MOUNT_DATA_FLAGS & (sbi->s_mount_opt ^ def_mount_opt)) {
+		if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)
+			SEQ_OPTS_PUTS("data=journal");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_ORDERED_DATA)
+			SEQ_OPTS_PUTS("data=ordered");
+		else if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_WRITEBACK_DATA)
+			SEQ_OPTS_PUTS("data=writeback");
+	}
+	if (nodefs ||
+	    sbi->s_inode_readahead_blks != EXT4_DEF_INODE_READAHEAD_BLKS)
+		SEQ_OPTS_PRINT("inode_readahead_blks=%u",
+			       sbi->s_inode_readahead_blks);
+
+	if (nodefs || (test_opt(sb, INIT_INODE_TABLE) &&
+		       (sbi->s_li_wait_mult != EXT4_DEF_LI_WAIT_MULT)))
+		SEQ_OPTS_PRINT("init_itable=%u", sbi->s_li_wait_mult);
+
+	ext4_show_quota_options(seq, sb);
+	return 0;
+}
+
+static int ext4_show_options(struct seq_file *seq, struct dentry *root)
+{
+	return _ext4_show_options(seq, root->d_sb, 0);
+}
+
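+/*
+ * Dump the full option state, one option per line, for the per-filesystem
+ * "options" file under /proc/fs/ext4/.
+ */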
+static int options_seq_show(struct seq_file *seq, void *offset)
+{
+	struct super_block *sb = seq->private;
+	int rc;
+
+	seq_puts(seq, (sb->s_flags & MS_RDONLY) ? "ro" : "rw");
+	rc = _ext4_show_options(seq, sb, 1);
+	seq_puts(seq, "\n");
+	return rc;
+}
+
+static int options_open_fs(struct inode *inode, struct file *file)
+{
+	return single_open(file, options_seq_show, PDE(inode)->data);
+}
+
+static const struct file_operations ext4_seq_options_fops = {
+	.owner = THIS_MODULE,
+	.open = options_open_fs,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
 			    int read_only)
 {
@@ -2945,7 +2808,7 @@
 		ext4_clear_request_list();
 		kfree(ext4_li_info);
 		ext4_li_info = NULL;
-		printk(KERN_CRIT "EXT4: error %d creating inode table "
+		printk(KERN_CRIT "EXT4-fs: error %d creating inode table "
 				 "initialization thread\n",
 				 err);
 		return err;
@@ -3183,11 +3046,8 @@
 	set_opt(sb, INIT_INODE_TABLE);
 	if (def_mount_opts & EXT4_DEFM_DEBUG)
 		set_opt(sb, DEBUG);
-	if (def_mount_opts & EXT4_DEFM_BSDGROUPS) {
-		ext4_msg(sb, KERN_WARNING, deprecated_msg, "bsdgroups",
-			"2.6.38");
+	if (def_mount_opts & EXT4_DEFM_BSDGROUPS)
 		set_opt(sb, GRPID);
-	}
 	if (def_mount_opts & EXT4_DEFM_UID16)
 		set_opt(sb, NO_UID32);
 	/* xattr user namespace & acls are now defaulted on */
@@ -3240,13 +3100,14 @@
 	sbi->s_li_wait_mult = EXT4_DEF_LI_WAIT_MULT;
 
 	if (!parse_options((char *) sbi->s_es->s_mount_opts, sb,
-			   &journal_devnum, &journal_ioprio, NULL, 0)) {
+			   &journal_devnum, &journal_ioprio, 0)) {
 		ext4_msg(sb, KERN_WARNING,
 			 "failed to parse options in superblock: %s",
 			 sbi->s_es->s_mount_opts);
 	}
+	sbi->s_def_mount_opt = sbi->s_mount_opt;
 	if (!parse_options((char *) data, sb, &journal_devnum,
-			   &journal_ioprio, NULL, 0))
+			   &journal_ioprio, 0))
 		goto failed_mount;
 
 	if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA) {
@@ -3416,7 +3277,6 @@
 #else
 		es->s_flags |= cpu_to_le32(EXT2_FLAGS_SIGNED_HASH);
 #endif
-		sb->s_dirt = 1;
 	}
 
 	/* Handle clustersize */
@@ -3540,6 +3400,10 @@
 	if (ext4_proc_root)
 		sbi->s_proc = proc_mkdir(sb->s_id, ext4_proc_root);
 
+	if (sbi->s_proc)
+		proc_create_data("options", S_IRUGO, sbi->s_proc,
+				 &ext4_seq_options_fops, sb);
+
 	bgl_lock_init(sbi->s_blockgroup_lock);
 
 	for (i = 0; i < db_count; i++) {
@@ -3694,6 +3558,8 @@
 	}
 	set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 
+	sbi->s_journal->j_commit_callback = ext4_journal_commit_callback;
+
 	/*
 	 * The journal may have updated the bg summary counts, so we
 	 * need to update the global counters.
@@ -3861,6 +3727,7 @@
 	ext4_kvfree(sbi->s_group_desc);
 failed_mount:
 	if (sbi->s_proc) {
+		remove_proc_entry("options", sbi->s_proc);
 		remove_proc_entry(sb->s_id, ext4_proc_root);
 	}
 #ifdef CONFIG_QUOTA
@@ -4090,15 +3957,6 @@
 	if (!(journal->j_flags & JBD2_BARRIER))
 		ext4_msg(sb, KERN_INFO, "barriers disabled");
 
-	if (!really_read_only && test_opt(sb, UPDATE_JOURNAL)) {
-		err = jbd2_journal_update_format(journal);
-		if (err)  {
-			ext4_msg(sb, KERN_ERR, "error updating journal");
-			jbd2_journal_destroy(journal);
-			return err;
-		}
-	}
-
 	if (!EXT4_HAS_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER))
 		err = jbd2_journal_wipe(journal, !really_read_only);
 	if (!err) {
@@ -4385,7 +4243,6 @@
 {
 	struct ext4_super_block *es;
 	struct ext4_sb_info *sbi = EXT4_SB(sb);
-	ext4_fsblk_t n_blocks_count = 0;
 	unsigned long old_sb_flags;
 	struct ext4_mount_options old_opts;
 	int enable_quota = 0;
@@ -4418,8 +4275,7 @@
 	/*
 	 * Allow the "check" option to be passed as a remount option.
 	 */
-	if (!parse_options(data, sb, NULL, &journal_ioprio,
-			   &n_blocks_count, 1)) {
+	if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) {
 		err = -EINVAL;
 		goto restore_opts;
 	}
@@ -4437,8 +4293,7 @@
 		set_task_ioprio(sbi->s_journal->j_task, journal_ioprio);
 	}
 
-	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY) ||
-		n_blocks_count > ext4_blocks_count(es)) {
+	if ((*flags & MS_RDONLY) != (sb->s_flags & MS_RDONLY)) {
 		if (sbi->s_mount_flags & EXT4_MF_FS_ABORTED) {
 			err = -EROFS;
 			goto restore_opts;
@@ -4513,8 +4368,6 @@
 			if (sbi->s_journal)
 				ext4_clear_journal_err(sb, es);
 			sbi->s_mount_state = le16_to_cpu(es->s_state);
-			if ((err = ext4_group_extend(sb, es, n_blocks_count)))
-				goto restore_opts;
 			if (!ext4_setup_super(sb, es, 0))
 				sb->s_flags &= ~MS_RDONLY;
 			if (EXT4_HAS_INCOMPAT_FEATURE(sb,
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 93a00d8..e88748e 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -82,8 +82,8 @@
 		printk("\n"); \
 	} while (0)
 #else
-# define ea_idebug(f...)
-# define ea_bdebug(f...)
+# define ea_idebug(inode, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
+# define ea_bdebug(bh, fmt, ...)	no_printk(fmt, ##__VA_ARGS__)
 #endif
 
 static void ext4_xattr_cache_insert(struct buffer_head *);
@@ -158,13 +158,10 @@
 static inline int
 ext4_xattr_check_block(struct buffer_head *bh)
 {
-	int error;
-
 	if (BHDR(bh)->h_magic != cpu_to_le32(EXT4_XATTR_MAGIC) ||
 	    BHDR(bh)->h_blocks != cpu_to_le32(1))
 		return -EIO;
-	error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
-	return error;
+	return ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size);
 }
 
 static inline int
@@ -220,7 +217,8 @@
 	error = -ENODATA;
 	if (!EXT4_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	if (!bh)
 		goto cleanup;
@@ -363,7 +361,8 @@
 	error = 0;
 	if (!EXT4_I(inode)->i_file_acl)
 		goto cleanup;
-	ea_idebug(inode, "reading block %u", EXT4_I(inode)->i_file_acl);
+	ea_idebug(inode, "reading block %llu",
+		  (unsigned long long)EXT4_I(inode)->i_file_acl);
 	bh = sb_bread(inode->i_sb, EXT4_I(inode)->i_file_acl);
 	error = -EIO;
 	if (!bh)
@@ -487,18 +486,19 @@
 		ext4_free_blocks(handle, inode, bh, 0, 1,
 				 EXT4_FREE_BLOCKS_METADATA |
 				 EXT4_FREE_BLOCKS_FORGET);
+		unlock_buffer(bh);
 	} else {
 		le32_add_cpu(&BHDR(bh)->h_refcount, -1);
+		if (ce)
+			mb_cache_entry_release(ce);
+		unlock_buffer(bh);
 		error = ext4_handle_dirty_metadata(handle, inode, bh);
 		if (IS_SYNC(inode))
 			ext4_handle_sync(handle);
 		dquot_free_block(inode, 1);
 		ea_bdebug(bh, "refcount now=%d; releasing",
 			  le32_to_cpu(BHDR(bh)->h_refcount));
-		if (ce)
-			mb_cache_entry_release(ce);
 	}
-	unlock_buffer(bh);
 out:
 	ext4_std_error(inode->i_sb, error);
 	return;
@@ -834,7 +834,8 @@
 			if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
 				BUG_ON(block > EXT4_MAX_BLOCK_FILE_PHYS);
 
-			ea_idebug(inode, "creating block %d", block);
+			ea_idebug(inode, "creating block %llu",
+				  (unsigned long long)block);
 
 			new_bh = sb_getblk(sb, block);
 			if (!new_bh) {
diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c
index d49d202..c78841e 100644
--- a/fs/jbd2/checkpoint.c
+++ b/fs/jbd2/checkpoint.c
@@ -88,14 +88,13 @@
  * whole transaction.
  *
  * Requires j_list_lock
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __try_to_free_cp_buf(struct journal_head *jh)
 {
 	int ret = 0;
 	struct buffer_head *bh = jh2bh(jh);
 
-	if (jh->b_jlist == BJ_None && !buffer_locked(bh) &&
+	if (jh->b_transaction == NULL && !buffer_locked(bh) &&
 	    !buffer_dirty(bh) && !buffer_write_io_error(bh)) {
 		/*
 		 * Get our reference so that bh cannot be freed before
@@ -104,11 +103,8 @@
 		get_bh(bh);
 		JBUFFER_TRACE(jh, "remove from checkpoint list");
 		ret = __jbd2_journal_remove_checkpoint(jh) + 1;
-		jbd_unlock_bh_state(bh);
 		BUFFER_TRACE(bh, "release");
 		__brelse(bh);
-	} else {
-		jbd_unlock_bh_state(bh);
 	}
 	return ret;
 }
@@ -180,21 +176,6 @@
 }
 
 /*
- * We were unable to perform jbd_trylock_bh_state() inside j_list_lock.
- * The caller must restart a list walk.  Wait for someone else to run
- * jbd_unlock_bh_state().
- */
-static void jbd_sync_bh(journal_t *journal, struct buffer_head *bh)
-	__releases(journal->j_list_lock)
-{
-	get_bh(bh);
-	spin_unlock(&journal->j_list_lock);
-	jbd_lock_bh_state(bh);
-	jbd_unlock_bh_state(bh);
-	put_bh(bh);
-}
-
-/*
  * Clean up transaction's list of buffers submitted for io.
  * We wait for any pending IO to complete and remove any clean
  * buffers. Note that we take the buffers in the opposite ordering
@@ -222,15 +203,9 @@
 	while (!released && transaction->t_checkpoint_io_list) {
 		jh = transaction->t_checkpoint_io_list;
 		bh = jh2bh(jh);
-		if (!jbd_trylock_bh_state(bh)) {
-			jbd_sync_bh(journal, bh);
-			spin_lock(&journal->j_list_lock);
-			goto restart;
-		}
 		get_bh(bh);
 		if (buffer_locked(bh)) {
 			spin_unlock(&journal->j_list_lock);
-			jbd_unlock_bh_state(bh);
 			wait_on_buffer(bh);
 			/* the journal_head may have gone by now */
 			BUFFER_TRACE(bh, "brelse");
@@ -246,7 +221,6 @@
 		 * it has been written out and so we can drop it from the list
 		 */
 		released = __jbd2_journal_remove_checkpoint(jh);
-		jbd_unlock_bh_state(bh);
 		__brelse(bh);
 	}
 
@@ -266,7 +240,6 @@
 
 	for (i = 0; i < *batch_count; i++) {
 		struct buffer_head *bh = journal->j_chkpt_bhs[i];
-		clear_buffer_jwrite(bh);
 		BUFFER_TRACE(bh, "brelse");
 		__brelse(bh);
 	}
@@ -281,7 +254,6 @@
  * be written out.
  *
  * Called with j_list_lock held and drops it if 1 is returned
- * Called under jbd_lock_bh_state(jh2bh(jh)), and drops it
  */
 static int __process_buffer(journal_t *journal, struct journal_head *jh,
 			    int *batch_count, transaction_t *transaction)
@@ -292,7 +264,6 @@
 	if (buffer_locked(bh)) {
 		get_bh(bh);
 		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
 		wait_on_buffer(bh);
 		/* the journal_head may have gone by now */
 		BUFFER_TRACE(bh, "brelse");
@@ -304,7 +275,6 @@
 
 		transaction->t_chp_stats.cs_forced_to_close++;
 		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
 		if (unlikely(journal->j_flags & JBD2_UNMOUNT))
 			/*
 			 * The journal thread is dead; so starting and
@@ -323,11 +293,9 @@
 		if (unlikely(buffer_write_io_error(bh)))
 			ret = -EIO;
 		get_bh(bh);
-		J_ASSERT_JH(jh, !buffer_jbddirty(bh));
 		BUFFER_TRACE(bh, "remove from checkpoint");
 		__jbd2_journal_remove_checkpoint(jh);
 		spin_unlock(&journal->j_list_lock);
-		jbd_unlock_bh_state(bh);
 		__brelse(bh);
 	} else {
 		/*
@@ -340,10 +308,8 @@
 		BUFFER_TRACE(bh, "queue");
 		get_bh(bh);
 		J_ASSERT_BH(bh, !buffer_jwrite(bh));
-		set_buffer_jwrite(bh);
 		journal->j_chkpt_bhs[*batch_count] = bh;
 		__buffer_relink_io(jh);
-		jbd_unlock_bh_state(bh);
 		transaction->t_chp_stats.cs_written++;
 		(*batch_count)++;
 		if (*batch_count == JBD2_NR_BATCH) {
@@ -407,15 +373,7 @@
 		int retry = 0, err;
 
 		while (!retry && transaction->t_checkpoint_list) {
-			struct buffer_head *bh;
-
 			jh = transaction->t_checkpoint_list;
-			bh = jh2bh(jh);
-			if (!jbd_trylock_bh_state(bh)) {
-				jbd_sync_bh(journal, bh);
-				retry = 1;
-				break;
-			}
 			retry = __process_buffer(journal, jh, &batch_count,
 						 transaction);
 			if (retry < 0 && !result)
@@ -478,79 +436,28 @@
 
 int jbd2_cleanup_journal_tail(journal_t *journal)
 {
-	transaction_t * transaction;
 	tid_t		first_tid;
-	unsigned long	blocknr, freed;
+	unsigned long	blocknr;
 
 	if (is_journal_aborted(journal))
 		return 1;
 
-	/* OK, work out the oldest transaction remaining in the log, and
-	 * the log block it starts at.
-	 *
-	 * If the log is now empty, we need to work out which is the
-	 * next transaction ID we will write, and where it will
-	 * start. */
-
-	write_lock(&journal->j_state_lock);
-	spin_lock(&journal->j_list_lock);
-	transaction = journal->j_checkpoint_transactions;
-	if (transaction) {
-		first_tid = transaction->t_tid;
-		blocknr = transaction->t_log_start;
-	} else if ((transaction = journal->j_committing_transaction) != NULL) {
-		first_tid = transaction->t_tid;
-		blocknr = transaction->t_log_start;
-	} else if ((transaction = journal->j_running_transaction) != NULL) {
-		first_tid = transaction->t_tid;
-		blocknr = journal->j_head;
-	} else {
-		first_tid = journal->j_transaction_sequence;
-		blocknr = journal->j_head;
-	}
-	spin_unlock(&journal->j_list_lock);
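+	/* If the tail cannot be pushed any further, there is nothing to do */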
+	if (!jbd2_journal_get_log_tail(journal, &first_tid, &blocknr))
+		return 1;
 	J_ASSERT(blocknr != 0);
 
-	/* If the oldest pinned transaction is at the tail of the log
-           already then there's not much we can do right now. */
-	if (journal->j_tail_sequence == first_tid) {
-		write_unlock(&journal->j_state_lock);
-		return 1;
-	}
-
-	/* OK, update the superblock to recover the freed space.
-	 * Physical blocks come first: have we wrapped beyond the end of
-	 * the log?  */
-	freed = blocknr - journal->j_tail;
-	if (blocknr < journal->j_tail)
-		freed = freed + journal->j_last - journal->j_first;
-
-	trace_jbd2_cleanup_journal_tail(journal, first_tid, blocknr, freed);
-	jbd_debug(1,
-		  "Cleaning journal tail from %d to %d (offset %lu), "
-		  "freeing %lu\n",
-		  journal->j_tail_sequence, first_tid, blocknr, freed);
-
-	journal->j_free += freed;
-	journal->j_tail_sequence = first_tid;
-	journal->j_tail = blocknr;
-	write_unlock(&journal->j_state_lock);
-
 	/*
-	 * If there is an external journal, we need to make sure that
-	 * any data blocks that were recently written out --- perhaps
-	 * by jbd2_log_do_checkpoint() --- are flushed out before we
-	 * drop the transactions from the external journal.  It's
-	 * unlikely this will be necessary, especially with a
-	 * appropriately sized journal, but we need this to guarantee
-	 * correctness.  Fortunately jbd2_cleanup_journal_tail()
-	 * doesn't get called all that often.
+	 * We need to make sure that any blocks that were recently written out
+	 * --- perhaps by jbd2_log_do_checkpoint() --- are flushed out before
+	 * we drop the transactions from the journal. It's unlikely this will
+	 * be necessary, especially with an appropriately sized journal, but we
+	 * need this to guarantee correctness.  Fortunately
+	 * jbd2_cleanup_journal_tail() doesn't get called all that often.
 	 */
-	if ((journal->j_fs_dev != journal->j_dev) &&
-	    (journal->j_flags & JBD2_BARRIER))
+	if (journal->j_flags & JBD2_BARRIER)
 		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
-	if (!(journal->j_flags & JBD2_ABORT))
-		jbd2_journal_update_superblock(journal, 1);
+
+	__jbd2_update_log_tail(journal, first_tid, blocknr);
 	return 0;
 }
 
@@ -582,15 +489,12 @@
 	do {
 		jh = next_jh;
 		next_jh = jh->b_cpnext;
-		/* Use trylock because of the ranking */
-		if (jbd_trylock_bh_state(jh2bh(jh))) {
-			ret = __try_to_free_cp_buf(jh);
-			if (ret) {
-				freed++;
-				if (ret == 2) {
-					*released = 1;
-					return freed;
-				}
+		ret = __try_to_free_cp_buf(jh);
+		if (ret) {
+			freed++;
+			if (ret == 2) {
+				*released = 1;
+				return freed;
 			}
 		}
 		/*
@@ -673,9 +577,7 @@
  * The function can free jh and bh.
  *
  * This function is called with j_list_lock held.
- * This function is called with jbd_lock_bh_state(jh2bh(jh))
  */
-
 int __jbd2_journal_remove_checkpoint(struct journal_head *jh)
 {
 	struct transaction_chp_stats_s *stats;
@@ -722,7 +624,7 @@
 				    transaction->t_tid, stats);
 
 	__jbd2_journal_drop_transaction(journal, transaction);
-	kfree(transaction);
+	jbd2_journal_free_transaction(transaction);
 
 	/* Just in case anybody was waiting for more transactions to be
            checkpointed... */
@@ -797,5 +699,7 @@
 	J_ASSERT(journal->j_committing_transaction != transaction);
 	J_ASSERT(journal->j_running_transaction != transaction);
 
+	trace_jbd2_drop_transaction(journal, transaction);
+
 	jbd_debug(1, "Dropping transaction %d, all done\n", transaction->t_tid);
 }
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index c067a8c..17f557f 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -331,6 +331,10 @@
 	struct buffer_head *cbh = NULL; /* For transactional checksums */
 	__u32 crc32_sum = ~0;
 	struct blk_plug plug;
+	/* Tail of the journal */
+	unsigned long first_block;
+	tid_t first_tid;
+	int update_tail;
 
 	/*
 	 * First job: lock down the current transaction and wait for
@@ -340,7 +344,18 @@
 	/* Do we need to erase the effects of a prior jbd2_journal_flush? */
 	if (journal->j_flags & JBD2_FLUSHED) {
 		jbd_debug(3, "super block updated\n");
-		jbd2_journal_update_superblock(journal, 1);
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * We hold j_checkpoint_mutex so the tail cannot change under
+		 * us. We don't need any special data guarantees for writing
+		 * the sb since the journal is empty and it is OK for the
+		 * write to be flushed only with the transaction commit.
+		 */
+		jbd2_journal_update_sb_log_tail(journal,
+						journal->j_tail_sequence,
+						journal->j_tail,
+						WRITE_SYNC);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 	} else {
 		jbd_debug(3, "superblock not updated\n");
 	}
@@ -677,10 +692,30 @@
 		err = 0;
 	}
 
+	/*
+	 * Get the current oldest transaction in the log before we issue a
+	 * flush to the filesystem device. After the flush we can be sure that
+	 * blocks of all older transactions are checkpointed to persistent
+	 * storage and it is safe to update the journal start in the
+	 * superblock with the numbers we get here.
+	 */
+	update_tail =
+		jbd2_journal_get_log_tail(journal, &first_tid, &first_block);
+
 	write_lock(&journal->j_state_lock);
+	if (update_tail) {
+		long freed = first_block - journal->j_tail;
+
+		if (first_block < journal->j_tail)
+			freed += journal->j_last - journal->j_first;
+		/*
+		 * Only update the tail if doing so frees a significant
+		 * amount of space; the update costs an extra FUA
+		 * superblock write.
+		 */
+		if (freed < journal->j_maxlen / 4)
+			update_tail = 0;
+	}
 	J_ASSERT(commit_transaction->t_state == T_COMMIT);
 	commit_transaction->t_state = T_COMMIT_DFLUSH;
 	write_unlock(&journal->j_state_lock);
+
 	/* 
 	 * If the journal is not located on the file system device,
 	 * then we must flush the file system device before we issue
@@ -831,6 +866,14 @@
 	if (err)
 		jbd2_journal_abort(journal, err);
 
+	/*
+	 * The disk caches for the filesystem device are now flushed, so it is
+	 * safe to erase checkpointed transactions from the log by updating
+	 * the journal superblock.
+	 */
+	if (update_tail)
+		jbd2_update_log_tail(journal, first_tid, first_block);
+
 	/* End of a transaction!  Finally, we can do checkpoint
            processing: any buffers committed as a result of this
            transaction can be removed from any checkpoint list it was on
@@ -1048,7 +1091,7 @@
 	jbd_debug(1, "JBD2: commit %d complete, head %d\n",
 		  journal->j_commit_sequence, journal->j_tail_sequence);
 	if (to_free)
-		kfree(commit_transaction);
+		jbd2_journal_free_transaction(commit_transaction);
 
 	wake_up(&journal->j_wait_done_commit);
 }
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 839377e..98ed6db 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -71,7 +71,6 @@
 
 EXPORT_SYMBOL(jbd2_journal_init_dev);
 EXPORT_SYMBOL(jbd2_journal_init_inode);
-EXPORT_SYMBOL(jbd2_journal_update_format);
 EXPORT_SYMBOL(jbd2_journal_check_used_features);
 EXPORT_SYMBOL(jbd2_journal_check_available_features);
 EXPORT_SYMBOL(jbd2_journal_set_features);
@@ -96,7 +95,6 @@
 EXPORT_SYMBOL(jbd2_journal_begin_ordered_truncate);
 EXPORT_SYMBOL(jbd2_inode_cache);
 
-static int journal_convert_superblock_v1(journal_t *, journal_superblock_t *);
 static void __journal_abort_soft (journal_t *journal, int errno);
 static int jbd2_journal_create_slab(size_t slab_size);
 
@@ -746,6 +744,98 @@
 	return jbd2_journal_add_journal_head(bh);
 }
 
+/*
+ * Return the tid of the oldest transaction in the journal and the block in
+ * the journal where that transaction starts.
+ *
+ * If the journal is now empty, return the next transaction ID we will write
+ * and where that transaction will start.
+ *
+ * The return value is 0 if the journal tail cannot be pushed any further,
+ * 1 if it can.
+ */
+int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
+			      unsigned long *block)
+{
+	transaction_t *transaction;
+	int ret;
+
+	read_lock(&journal->j_state_lock);
+	spin_lock(&journal->j_list_lock);
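+	/*
+	 * Look for the oldest transaction in order: the checkpoint list,
+	 * then the committing transaction, then the running transaction.
+	 * If none exists, the journal is empty and the next transaction
+	 * will start at j_head with the next sequence number.
+	 */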
+	transaction = journal->j_checkpoint_transactions;
+	if (transaction) {
+		*tid = transaction->t_tid;
+		*block = transaction->t_log_start;
+	} else if ((transaction = journal->j_committing_transaction) != NULL) {
+		*tid = transaction->t_tid;
+		*block = transaction->t_log_start;
+	} else if ((transaction = journal->j_running_transaction) != NULL) {
+		*tid = transaction->t_tid;
+		*block = journal->j_head;
+	} else {
+		*tid = journal->j_transaction_sequence;
+		*block = journal->j_head;
+	}
+	ret = tid_gt(*tid, journal->j_tail_sequence);
+	spin_unlock(&journal->j_list_lock);
+	read_unlock(&journal->j_state_lock);
+
+	return ret;
+}
+
+/*
+ * Update the log tail information in the journal structure and in the
+ * on-disk journal superblock. This function does not check whether the
+ * information passed in really pushes the log tail further. It is the
+ * caller's responsibility to make sure the provided log tail information is
+ * valid (e.g. by holding j_checkpoint_mutex the whole time between computing
+ * the log tail and calling this function, as jbd2_cleanup_journal_tail() does).
+ *
+ * Requires j_checkpoint_mutex
+ */
+void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+	unsigned long freed;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+
+	/*
+	 * We cannot afford to let the write remain in the drive's caches: as
+	 * soon as we update j_tail, the next transaction can start reusing
+	 * journal space, and if we lose the sb update during a power failure
+	 * we'd replay an old transaction with possibly newly overwritten data.
+	 */
+	jbd2_journal_update_sb_log_tail(journal, tid, block, WRITE_FUA);
+	write_lock(&journal->j_state_lock);
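+	/* Compute the freed space, accounting for the log wrapping around */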
+	freed = block - journal->j_tail;
+	if (block < journal->j_tail)
+		freed += journal->j_last - journal->j_first;
+
+	trace_jbd2_update_log_tail(journal, tid, block, freed);
+	jbd_debug(1,
+		  "Cleaning journal tail from %d to %d (offset %lu), "
+		  "freeing %lu\n",
+		  journal->j_tail_sequence, tid, block, freed);
+
+	journal->j_free += freed;
+	journal->j_tail_sequence = tid;
+	journal->j_tail = block;
+	write_unlock(&journal->j_state_lock);
+}
+
+/*
+ * This is a variant of __jbd2_update_log_tail() which checks the validity of
+ * the provided log tail and takes j_checkpoint_mutex, so it is safe against
+ * races with other threads updating the log tail.
+ */
+void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block)
+{
+	mutex_lock(&journal->j_checkpoint_mutex);
+	if (tid_gt(tid, journal->j_tail_sequence))
+		__jbd2_update_log_tail(journal, tid, block);
+	mutex_unlock(&journal->j_checkpoint_mutex);
+}
+
 struct jbd2_stats_proc_session {
 	journal_t *journal;
 	struct transaction_stats_s *stats;
@@ -1114,40 +1204,45 @@
 
 	journal->j_max_transaction_buffers = journal->j_maxlen / 4;
 
-	/* Add the dynamic fields and write it to disk. */
-	jbd2_journal_update_superblock(journal, 1);
-	return jbd2_journal_start_thread(journal);
-}
-
-/**
- * void jbd2_journal_update_superblock() - Update journal sb on disk.
- * @journal: The journal to update.
- * @wait: Set to '0' if you don't want to wait for IO completion.
- *
- * Update a journal's dynamic superblock fields and write it to disk,
- * optionally waiting for the IO to complete.
- */
-void jbd2_journal_update_superblock(journal_t *journal, int wait)
-{
-	journal_superblock_t *sb = journal->j_superblock;
-	struct buffer_head *bh = journal->j_sb_buffer;
-
 	/*
 	 * As a special case, if the on-disk copy is already marked as needing
-	 * no recovery (s_start == 0) and there are no outstanding transactions
-	 * in the filesystem, then we can safely defer the superblock update
-	 * until the next commit by setting JBD2_FLUSHED.  This avoids
+	 * no recovery (s_start == 0), then we can safely defer the superblock
+	 * update until the next commit by setting JBD2_FLUSHED.  This avoids
 	 * attempting a write to a potential-readonly device.
 	 */
-	if (sb->s_start == 0 && journal->j_tail_sequence ==
-				journal->j_transaction_sequence) {
+	if (sb->s_start == 0) {
 		jbd_debug(1, "JBD2: Skipping superblock update on recovered sb "
 			"(start %ld, seq %d, errno %d)\n",
 			journal->j_tail, journal->j_tail_sequence,
 			journal->j_errno);
-		goto out;
+		journal->j_flags |= JBD2_FLUSHED;
+	} else {
+		/* Lock here to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		/*
+		 * Update the log tail information. We use WRITE_FUA since a
+		 * new transaction will start reusing journal space and we
+		 * must make sure the information about the current log tail
+		 * is on disk before that happens.
+		 */
+		jbd2_journal_update_sb_log_tail(journal,
+						journal->j_tail_sequence,
+						journal->j_tail,
+						WRITE_FUA);
+		mutex_unlock(&journal->j_checkpoint_mutex);
 	}
+	return jbd2_journal_start_thread(journal);
+}
 
+static void jbd2_write_superblock(journal_t *journal, int write_op)
+{
+	struct buffer_head *bh = journal->j_sb_buffer;
+	int ret;
+
+	trace_jbd2_write_superblock(journal, write_op);
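+	/*
+	 * With barriers disabled, cache flushes provide no guarantee anyway,
+	 * so strip FUA/FLUSH from the request.
+	 */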
+	if (!(journal->j_flags & JBD2_BARRIER))
+		write_op &= ~(REQ_FUA | REQ_FLUSH);
+	lock_buffer(bh);
 	if (buffer_write_io_error(bh)) {
 		/*
 		 * Oh, dear.  A previous attempt to write the journal
@@ -1163,48 +1258,106 @@
 		clear_buffer_write_io_error(bh);
 		set_buffer_uptodate(bh);
 	}
+	get_bh(bh);
+	bh->b_end_io = end_buffer_write_sync;
+	ret = submit_bh(write_op, bh);
+	wait_on_buffer(bh);
+	if (buffer_write_io_error(bh)) {
+		clear_buffer_write_io_error(bh);
+		set_buffer_uptodate(bh);
+		ret = -EIO;
+	}
+	if (ret) {
+		printk(KERN_ERR "JBD2: Error %d detected when updating "
+		       "journal superblock for %s.\n", ret,
+		       journal->j_devname);
+	}
+}
 
+/**
+ * jbd2_journal_update_sb_log_tail() - Update log tail in journal sb on disk.
+ * @journal: The journal to update.
+ * @tail_tid: TID of the new transaction at the tail of the log
+ * @tail_block: The first block of the transaction at the tail of the log
+ * @write_op: The I/O operation with which to write the journal sb
+ *
+ * Update a journal's superblock information about the log tail and write it to
+ * disk, waiting for the IO to complete.
+ */
+void jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid,
+				     unsigned long tail_block, int write_op)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
+	jbd_debug(1, "JBD2: updating superblock (start %lu, seq %u)\n",
+		  tail_block, tail_tid);
+
+	sb->s_sequence = cpu_to_be32(tail_tid);
+	sb->s_start    = cpu_to_be32(tail_block);
+
+	jbd2_write_superblock(journal, write_op);
+
+	/* Log is no longer empty */
+	write_lock(&journal->j_state_lock);
+	WARN_ON(!sb->s_sequence);
+	journal->j_flags &= ~JBD2_FLUSHED;
+	write_unlock(&journal->j_state_lock);
+}
+
+/**
+ * jbd2_mark_journal_empty() - Mark the on-disk journal as empty.
+ * @journal: The journal to update.
+ *
+ * Update a journal's dynamic superblock fields to show that the journal is
+ * empty.  Write the updated superblock to disk, waiting for the IO to
+ * complete.
+ */
+static void jbd2_mark_journal_empty(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	BUG_ON(!mutex_is_locked(&journal->j_checkpoint_mutex));
 	read_lock(&journal->j_state_lock);
-	jbd_debug(1, "JBD2: updating superblock (start %ld, seq %d, errno %d)\n",
-		  journal->j_tail, journal->j_tail_sequence, journal->j_errno);
+	jbd_debug(1, "JBD2: Marking journal as empty (seq %d)\n",
+		  journal->j_tail_sequence);
 
 	sb->s_sequence = cpu_to_be32(journal->j_tail_sequence);
-	sb->s_start    = cpu_to_be32(journal->j_tail);
+	sb->s_start    = cpu_to_be32(0);
+	read_unlock(&journal->j_state_lock);
+
+	jbd2_write_superblock(journal, WRITE_FUA);
+
+	/* Log is now empty; record that in the journal flags */
+	write_lock(&journal->j_state_lock);
+	journal->j_flags |= JBD2_FLUSHED;
+	write_unlock(&journal->j_state_lock);
+}
+
+
+/**
+ * jbd2_journal_update_sb_errno() - Update error in the journal.
+ * @journal: The journal to update.
+ *
+ * Update a journal's errno.  Write the updated superblock to disk, waiting
+ * for the IO to complete.
+ */
+static void jbd2_journal_update_sb_errno(journal_t *journal)
+{
+	journal_superblock_t *sb = journal->j_superblock;
+
+	read_lock(&journal->j_state_lock);
+	jbd_debug(1, "JBD2: updating superblock error (errno %d)\n",
+		  journal->j_errno);
 	sb->s_errno    = cpu_to_be32(journal->j_errno);
 	read_unlock(&journal->j_state_lock);
 
-	BUFFER_TRACE(bh, "marking dirty");
-	mark_buffer_dirty(bh);
-	if (wait) {
-		sync_dirty_buffer(bh);
-		if (buffer_write_io_error(bh)) {
-			printk(KERN_ERR "JBD2: I/O error detected "
-			       "when updating journal superblock for %s.\n",
-			       journal->j_devname);
-			clear_buffer_write_io_error(bh);
-			set_buffer_uptodate(bh);
-		}
-	} else
-		write_dirty_buffer(bh, WRITE);
-
-out:
-	/* If we have just flushed the log (by marking s_start==0), then
-	 * any future commit will have to be careful to update the
-	 * superblock again to re-record the true start of the log. */
-
-	write_lock(&journal->j_state_lock);
-	if (sb->s_start)
-		journal->j_flags &= ~JBD2_FLUSHED;
-	else
-		journal->j_flags |= JBD2_FLUSHED;
-	write_unlock(&journal->j_state_lock);
+	jbd2_write_superblock(journal, WRITE_SYNC);
 }
 
 /*
  * Read the superblock for a given journal, performing initial
  * validation of the format.
  */
-
 static int journal_get_superblock(journal_t *journal)
 {
 	struct buffer_head *bh;
@@ -1398,14 +1551,11 @@
 
 	if (journal->j_sb_buffer) {
 		if (!is_journal_aborted(journal)) {
-			/* We can now mark the journal as empty. */
-			journal->j_tail = 0;
-			journal->j_tail_sequence =
-				++journal->j_transaction_sequence;
-			jbd2_journal_update_superblock(journal, 1);
-		} else {
+			mutex_lock(&journal->j_checkpoint_mutex);
+			jbd2_mark_journal_empty(journal);
+			mutex_unlock(&journal->j_checkpoint_mutex);
+		} else
 			err = -EIO;
-		}
 		brelse(journal->j_sb_buffer);
 	}
 
@@ -1552,61 +1702,6 @@
 EXPORT_SYMBOL(jbd2_journal_clear_features);
 
 /**
- * int jbd2_journal_update_format () - Update on-disk journal structure.
- * @journal: Journal to act on.
- *
- * Given an initialised but unloaded journal struct, poke about in the
- * on-disk structure to update it to the most recent supported version.
- */
-int jbd2_journal_update_format (journal_t *journal)
-{
-	journal_superblock_t *sb;
-	int err;
-
-	err = journal_get_superblock(journal);
-	if (err)
-		return err;
-
-	sb = journal->j_superblock;
-
-	switch (be32_to_cpu(sb->s_header.h_blocktype)) {
-	case JBD2_SUPERBLOCK_V2:
-		return 0;
-	case JBD2_SUPERBLOCK_V1:
-		return journal_convert_superblock_v1(journal, sb);
-	default:
-		break;
-	}
-	return -EINVAL;
-}
-
-static int journal_convert_superblock_v1(journal_t *journal,
-					 journal_superblock_t *sb)
-{
-	int offset, blocksize;
-	struct buffer_head *bh;
-
-	printk(KERN_WARNING
-		"JBD2: Converting superblock from version 1 to 2.\n");
-
-	/* Pre-initialise new fields to zero */
-	offset = ((char *) &(sb->s_feature_compat)) - ((char *) sb);
-	blocksize = be32_to_cpu(sb->s_blocksize);
-	memset(&sb->s_feature_compat, 0, blocksize-offset);
-
-	sb->s_nr_users = cpu_to_be32(1);
-	sb->s_header.h_blocktype = cpu_to_be32(JBD2_SUPERBLOCK_V2);
-	journal->j_format_version = 2;
-
-	bh = journal->j_sb_buffer;
-	BUFFER_TRACE(bh, "marking dirty");
-	mark_buffer_dirty(bh);
-	sync_dirty_buffer(bh);
-	return 0;
-}
-
-
-/**
  * int jbd2_journal_flush () - Flush journal
  * @journal: Journal to act on.
  *
@@ -1619,7 +1714,6 @@
 {
 	int err = 0;
 	transaction_t *transaction = NULL;
-	unsigned long old_tail;
 
 	write_lock(&journal->j_state_lock);
 
@@ -1654,6 +1748,7 @@
 	if (is_journal_aborted(journal))
 		return -EIO;
 
+	mutex_lock(&journal->j_checkpoint_mutex);
 	jbd2_cleanup_journal_tail(journal);
 
 	/* Finally, mark the journal as really needing no recovery.
@@ -1661,14 +1756,9 @@
 	 * the magic code for a fully-recovered superblock.  Any future
 	 * commits of data to the journal will restore the current
 	 * s_start value. */
+	jbd2_mark_journal_empty(journal);
+	mutex_unlock(&journal->j_checkpoint_mutex);
 	write_lock(&journal->j_state_lock);
-	old_tail = journal->j_tail;
-	journal->j_tail = 0;
-	write_unlock(&journal->j_state_lock);
-	jbd2_journal_update_superblock(journal, 1);
-	write_lock(&journal->j_state_lock);
-	journal->j_tail = old_tail;
-
 	J_ASSERT(!journal->j_running_transaction);
 	J_ASSERT(!journal->j_committing_transaction);
 	J_ASSERT(!journal->j_checkpoint_transactions);
@@ -1708,8 +1798,12 @@
 		write ? "Clearing" : "Ignoring");
 
 	err = jbd2_journal_skip_recovery(journal);
-	if (write)
-		jbd2_journal_update_superblock(journal, 1);
+	if (write) {
+		/* Lock to make assertions happy... */
+		mutex_lock(&journal->j_checkpoint_mutex);
+		jbd2_mark_journal_empty(journal);
+		mutex_unlock(&journal->j_checkpoint_mutex);
+	}
 
  no_recovery:
 	return err;
@@ -1759,7 +1853,7 @@
 	__jbd2_journal_abort_hard(journal);
 
 	if (errno)
-		jbd2_journal_update_superblock(journal, 1);
+		jbd2_journal_update_sb_errno(journal);
 }
 
 /**
@@ -2017,7 +2111,7 @@
 static atomic_t nr_journal_heads = ATOMIC_INIT(0);
 #endif
 
-static int journal_init_jbd2_journal_head_cache(void)
+static int jbd2_journal_init_journal_head_cache(void)
 {
 	int retval;
 
@@ -2035,7 +2129,7 @@
 	return retval;
 }
 
-static void jbd2_journal_destroy_jbd2_journal_head_cache(void)
+static void jbd2_journal_destroy_journal_head_cache(void)
 {
 	if (jbd2_journal_head_cache) {
 		kmem_cache_destroy(jbd2_journal_head_cache);
@@ -2323,7 +2417,7 @@
 
 struct kmem_cache *jbd2_handle_cache, *jbd2_inode_cache;
 
-static int __init journal_init_handle_cache(void)
+static int __init jbd2_journal_init_handle_cache(void)
 {
 	jbd2_handle_cache = KMEM_CACHE(jbd2_journal_handle, SLAB_TEMPORARY);
 	if (jbd2_handle_cache == NULL) {
@@ -2358,17 +2452,20 @@
 
 	ret = jbd2_journal_init_revoke_caches();
 	if (ret == 0)
-		ret = journal_init_jbd2_journal_head_cache();
+		ret = jbd2_journal_init_journal_head_cache();
 	if (ret == 0)
-		ret = journal_init_handle_cache();
+		ret = jbd2_journal_init_handle_cache();
+	if (ret == 0)
+		ret = jbd2_journal_init_transaction_cache();
 	return ret;
 }
 
 static void jbd2_journal_destroy_caches(void)
 {
 	jbd2_journal_destroy_revoke_caches();
-	jbd2_journal_destroy_jbd2_journal_head_cache();
+	jbd2_journal_destroy_journal_head_cache();
 	jbd2_journal_destroy_handle_cache();
+	jbd2_journal_destroy_transaction_cache();
 	jbd2_journal_destroy_slabs();
 }
 
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index da6d7ba..c1a0335 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -21,6 +21,7 @@
 #include <linux/jbd2.h>
 #include <linux/errno.h>
 #include <linux/crc32.h>
+#include <linux/blkdev.h>
 #endif
 
 /*
@@ -265,7 +266,9 @@
 	err2 = sync_blockdev(journal->j_fs_dev);
 	if (!err)
 		err = err2;
-
+	/* Make sure all replayed data is on permanent storage */
+	if (journal->j_flags & JBD2_BARRIER)
+		blkdev_issue_flush(journal->j_fs_dev, GFP_KERNEL, NULL);
 	return err;
 }
 
diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c
index 30b2867..6973705 100644
--- a/fs/jbd2/revoke.c
+++ b/fs/jbd2/revoke.c
@@ -208,17 +208,13 @@
 	J_ASSERT(!jbd2_revoke_record_cache);
 	J_ASSERT(!jbd2_revoke_table_cache);
 
-	jbd2_revoke_record_cache = kmem_cache_create("jbd2_revoke_record",
-					   sizeof(struct jbd2_revoke_record_s),
-					   0,
-					   SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
-					   NULL);
+	jbd2_revoke_record_cache = KMEM_CACHE(jbd2_revoke_record_s,
+					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY);
 	if (!jbd2_revoke_record_cache)
 		goto record_cache_failure;
 
-	jbd2_revoke_table_cache = kmem_cache_create("jbd2_revoke_table",
-					   sizeof(struct jbd2_revoke_table_s),
-					   0, SLAB_TEMPORARY, NULL);
+	jbd2_revoke_table_cache = KMEM_CACHE(jbd2_revoke_table_s,
+					     SLAB_TEMPORARY);
 	if (!jbd2_revoke_table_cache)
 		goto table_cache_failure;
 	return 0;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e5aba56..ddcd354 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -33,6 +33,35 @@
 static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh);
 static void __jbd2_journal_unfile_buffer(struct journal_head *jh);
 
+static struct kmem_cache *transaction_cache;
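+
+/*
+ * Dedicated slab for transaction_t objects; they are allocated and freed
+ * often enough to merit their own cache, and SLAB_TEMPORARY marks them
+ * as short-lived.
+ */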
+int __init jbd2_journal_init_transaction_cache(void)
+{
+	J_ASSERT(!transaction_cache);
+	transaction_cache = kmem_cache_create("jbd2_transaction_s",
+					sizeof(transaction_t),
+					0,
+					SLAB_HWCACHE_ALIGN|SLAB_TEMPORARY,
+					NULL);
+	if (transaction_cache)
+		return 0;
+	return -ENOMEM;
+}
+
+void jbd2_journal_destroy_transaction_cache(void)
+{
+	if (transaction_cache) {
+		kmem_cache_destroy(transaction_cache);
+		transaction_cache = NULL;
+	}
+}
+
+void jbd2_journal_free_transaction(transaction_t *transaction)
+{
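+	/* Mirror kfree() semantics: NULL (or ZERO_SIZE_PTR) is a no-op */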
+	if (unlikely(ZERO_OR_NULL_PTR(transaction)))
+		return;
+	kmem_cache_free(transaction_cache, transaction);
+}
+
 /*
  * jbd2_get_transaction: obtain a new transaction_t object.
  *
@@ -133,7 +162,8 @@
 
 alloc_transaction:
 	if (!journal->j_running_transaction) {
-		new_transaction = kzalloc(sizeof(*new_transaction), gfp_mask);
+		new_transaction = kmem_cache_alloc(transaction_cache,
+						   gfp_mask | __GFP_ZERO);
 		if (!new_transaction) {
 			/*
 			 * If __GFP_FS is not present, then we may be
@@ -162,7 +192,7 @@
 	if (is_journal_aborted(journal) ||
 	    (journal->j_errno != 0 && !(journal->j_flags & JBD2_ACK_ERR))) {
 		read_unlock(&journal->j_state_lock);
-		kfree(new_transaction);
+		jbd2_journal_free_transaction(new_transaction);
 		return -EROFS;
 	}
 
@@ -284,7 +314,7 @@
 	read_unlock(&journal->j_state_lock);
 
 	lock_map_acquire(&handle->h_lockdep_map);
-	kfree(new_transaction);
+	jbd2_journal_free_transaction(new_transaction);
 	return 0;
 }
 
@@ -1549,9 +1579,9 @@
  * of these pointers, it could go bad.  Generally the caller needs to re-read
  * the pointer from the transaction_t.
  *
- * Called under j_list_lock.  The journal may not be locked.
+ * Called under j_list_lock.
  */
-void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
+static void __jbd2_journal_temp_unlink_buffer(struct journal_head *jh)
 {
 	struct journal_head **list = NULL;
 	transaction_t *transaction;
@@ -1646,10 +1676,8 @@
 	spin_lock(&journal->j_list_lock);
 	if (jh->b_cp_transaction != NULL && jh->b_transaction == NULL) {
 		/* written-back checkpointed metadata buffer */
-		if (jh->b_jlist == BJ_None) {
-			JBUFFER_TRACE(jh, "remove from checkpoint list");
-			__jbd2_journal_remove_checkpoint(jh);
-		}
+		JBUFFER_TRACE(jh, "remove from checkpoint list");
+		__jbd2_journal_remove_checkpoint(jh);
 	}
 	spin_unlock(&journal->j_list_lock);
 out:
@@ -1949,6 +1977,8 @@
 	clear_buffer_mapped(bh);
 	clear_buffer_req(bh);
 	clear_buffer_new(bh);
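+	/* Forget delayed-allocation and unwritten-extent state as well */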
+	clear_buffer_delay(bh);
+	clear_buffer_unwritten(bh);
 	bh->b_bdev = NULL;
 	return may_free;
 }
diff --git a/include/linux/fs.h b/include/linux/fs.h
index fa63f1b..c437f91 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1872,19 +1872,6 @@
 	const struct dentry_operations *dops,
 	unsigned long);
 
-static inline void sb_mark_dirty(struct super_block *sb)
-{
-	sb->s_dirt = 1;
-}
-static inline void sb_mark_clean(struct super_block *sb)
-{
-	sb->s_dirt = 0;
-}
-static inline int sb_is_dirty(struct super_block *sb)
-{
-	return sb->s_dirt;
-}
-
 /* Alas, no aliases. Too much hassle with bringing module.h everywhere */
 #define fops_get(fops) \
 	(((fops) && try_module_get((fops)->owner) ? (fops) : NULL))
diff --git a/include/linux/jbd2.h b/include/linux/jbd2.h
index 5557bae..912c30a 100644
--- a/include/linux/jbd2.h
+++ b/include/linux/jbd2.h
@@ -971,6 +971,10 @@
 /* Log buffer allocation */
 extern struct journal_head * jbd2_journal_get_descriptor_buffer(journal_t *);
 int jbd2_journal_next_log_block(journal_t *, unsigned long long *);
+int jbd2_journal_get_log_tail(journal_t *journal, tid_t *tid,
+			      unsigned long *block);
+void __jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
+void jbd2_update_log_tail(journal_t *journal, tid_t tid, unsigned long block);
 
 /* Commit management */
 extern void jbd2_journal_commit_transaction(journal_t *);
@@ -1020,6 +1024,11 @@
 /* Transaction locking */
 extern void		__wait_on_journal (journal_t *);
 
+/* Transaction cache support */
+extern void jbd2_journal_destroy_transaction_cache(void);
+extern int  jbd2_journal_init_transaction_cache(void);
+extern void jbd2_journal_free_transaction(transaction_t *);
+
 /*
  * Journal locking.
  *
@@ -1082,7 +1091,8 @@
 extern int	   jbd2_journal_recover    (journal_t *journal);
 extern int	   jbd2_journal_wipe       (journal_t *, int);
 extern int	   jbd2_journal_skip_recovery	(journal_t *);
-extern void	   jbd2_journal_update_superblock	(journal_t *, int);
+extern void	   jbd2_journal_update_sb_log_tail	(journal_t *, tid_t,
+				unsigned long, int);
 extern void	   __jbd2_journal_abort_hard	(journal_t *);
 extern void	   jbd2_journal_abort      (journal_t *, int);
 extern int	   jbd2_journal_errno      (journal_t *);
diff --git a/include/linux/journal-head.h b/include/linux/journal-head.h
index 423cb6d..c18b46f 100644
--- a/include/linux/journal-head.h
+++ b/include/linux/journal-head.h
@@ -66,6 +66,8 @@
 	 * transaction (if there is one).  Only applies to buffers on a
 	 * transaction's data or metadata journaling list.
 	 * [j_list_lock] [jbd_lock_bh_state()]
+	 * Either of these locks is enough for reading; both are needed for
+	 * changes.
 	 */
 	transaction_t *b_transaction;
 
diff --git a/include/trace/events/jbd2.h b/include/trace/events/jbd2.h
index 7596441..127993d 100644
--- a/include/trace/events/jbd2.h
+++ b/include/trace/events/jbd2.h
@@ -81,6 +81,13 @@
 	TP_ARGS(journal, commit_transaction)
 );
 
+DEFINE_EVENT(jbd2_commit, jbd2_drop_transaction,
+
+	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
+
+	TP_ARGS(journal, commit_transaction)
+);
+
 TRACE_EVENT(jbd2_end_commit,
 	TP_PROTO(journal_t *journal, transaction_t *commit_transaction),
 
@@ -200,7 +207,7 @@
 		  __entry->forced_to_close, __entry->written, __entry->dropped)
 );
 
-TRACE_EVENT(jbd2_cleanup_journal_tail,
+TRACE_EVENT(jbd2_update_log_tail,
 
 	TP_PROTO(journal_t *journal, tid_t first_tid,
 		 unsigned long block_nr, unsigned long freed),
@@ -229,6 +236,26 @@
 		  __entry->block_nr, __entry->freed)
 );
 
+TRACE_EVENT(jbd2_write_superblock,
+
+	TP_PROTO(journal_t *journal, int write_op),
+
+	TP_ARGS(journal, write_op),
+
+	TP_STRUCT__entry(
+		__field(	dev_t,  dev			)
+		__field(	  int,  write_op		)
+	),
+
+	TP_fast_assign(
+		__entry->dev		= journal->j_fs_dev->bd_dev;
+		__entry->write_op	= write_op;
+	),
+
+	TP_printk("dev %d,%d write_op %x", MAJOR(__entry->dev),
+		  MINOR(__entry->dev), __entry->write_op)
+);
+
 #endif /* _TRACE_JBD2_H */
 
 /* This part must be outside protection */
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 3fc2617..26adea8 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -95,6 +95,8 @@
  */
 unsigned int dirty_writeback_interval = 5 * 100; /* centiseconds */
 
+EXPORT_SYMBOL_GPL(dirty_writeback_interval);
+
 /*
  * The longest time for which data is allowed to remain dirty
  */