Btrfs: Add data=ordered support

This forces file data extents down the disk along with the metadata that
references them.  The current implementation is fairly simple, and just
writes out all of the dirty pages in an inode before the commit.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index d5804c5..ab9a9f8 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -5,7 +5,7 @@
 btrfs-y := super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
-	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o
+	   extent_map.o sysfs.o struct-funcs.o xattr.o acl.o ordered-data.o
 
 #btrfs-y := ctree.o disk-io.o radix-tree.o extent-tree.o print-tree.o \
 #	  root-tree.o dir-item.o hash.o file-item.o inode-item.o \
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index d1d5af4..f27e633 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -29,6 +29,7 @@
 	struct extent_map_tree extent_tree;
 	struct inode vfs_inode;
 
+	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 35c5707..43d2314 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -221,7 +221,9 @@
 		    struct extent_buffer **cow_ret)
 {
 	u64 search_start;
+	u64 header_trans;
 	int ret;
+
 	if (trans->transaction != root->fs_info->running_transaction) {
 		printk(KERN_CRIT "trans %Lu running %Lu\n", trans->transid,
 		       root->fs_info->running_transaction->transid);
@@ -232,7 +234,9 @@
 		       root->fs_info->generation);
 		WARN_ON(1);
 	}
-	if (btrfs_header_generation(buf) == trans->transid) {
+
+	header_trans = btrfs_header_generation(buf);
+	if (header_trans == trans->transid) {
 		*cow_ret = buf;
 		return 0;
 	}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9873975..b55dba5 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -16,8 +16,8 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __BTRFS__
-#define __BTRFS__
+#ifndef __BTRFS_CTREE__
+#define __BTRFS_CTREE__
 
 #include <linux/version.h>
 #include <linux/mm.h>
@@ -363,7 +363,6 @@
 	struct inode *inode;
 	struct kobject root_kobj;
 	struct completion kobj_unregister;
-	struct rw_semaphore snap_sem;
 	u64 objectid;
 	u64 last_trans;
 
@@ -1142,6 +1141,8 @@
 long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root);
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid);
 int btrfs_commit_write(struct file *file, struct page *page,
 		       unsigned from, unsigned to);
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a6170ff1..34cf1f1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -406,7 +406,6 @@
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
 	memset(&root->root_kobj, 0, sizeof(root->root_kobj));
 	init_completion(&root->kobj_unregister);
-	init_rwsem(&root->snap_sem);
 	root->defrag_running = 0;
 	root->defrag_level = 0;
 	root->root_key.objectid = objectid;
@@ -498,6 +497,21 @@
 	return root;
 }
 
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid)
+{
+	struct btrfs_root *root;
+
+	if (root_objectid == BTRFS_ROOT_TREE_OBJECTID)
+		return fs_info->tree_root;
+	if (root_objectid == BTRFS_EXTENT_TREE_OBJECTID)
+		return fs_info->extent_root;
+
+	root = radix_tree_lookup(&fs_info->fs_roots_radix,
+				 (unsigned long)root_objectid);
+	return root;
+}
+
 struct btrfs_root *btrfs_read_fs_root_no_name(struct btrfs_fs_info *fs_info,
 					      struct btrfs_key *location)
 {
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 8c3cfd0..dae9fba 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -34,6 +34,8 @@
 		      struct btrfs_root *root);
 struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root,
 					    u64 bytenr, u32 blocksize);
+struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
+					u64 root_objectid);
 struct btrfs_root *btrfs_read_fs_root(struct btrfs_fs_info *fs_info,
 				      struct btrfs_key *location,
 				      const char *name, int namelen);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index c906bb1..68137cd 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1195,7 +1195,9 @@
 			if (btrfs_buffer_uptodate(buf)) {
 				u64 transid =
 				    root->fs_info->running_transaction->transid;
-				if (btrfs_header_generation(buf) == transid) {
+				u64 header_transid =
+					btrfs_header_generation(buf);
+				if (header_transid == transid) {
 					free_extent_buffer(buf);
 					return 1;
 				}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 94c9337..0a5f4de 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,6 +34,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 
@@ -329,6 +330,7 @@
 		root->fs_info->delalloc_bytes += (end_of_last_block + 1 -
 					  start_pos) - existing_delalloc;
 		spin_unlock(&root->fs_info->delalloc_lock);
+		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -724,8 +726,6 @@
 
 	pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL);
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
-
 	mutex_lock(&inode->i_mutex);
 	first_index = pos >> PAGE_CACHE_SHIFT;
 	last_index = (pos + count) >> PAGE_CACHE_SHIFT;
@@ -804,7 +804,6 @@
 	}
 out:
 	mutex_unlock(&inode->i_mutex);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 
 out_nolock:
 	kfree(pages);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 6d0cd9a..6d6e1ac 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -135,6 +135,7 @@
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
+	btrfs_add_ordered_inode(inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -367,8 +368,8 @@
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
 	mutex_lock(&root->fs_info->fs_mutex);
-
 	memcpy(&location, &BTRFS_I(inode)->location, sizeof(location));
+
 	ret = btrfs_lookup_inode(NULL, root, path, &location, 0);
 	if (ret)
 		goto make_bad;
@@ -898,7 +899,6 @@
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
 
-	down_read(&root->snap_sem);
 	ret = -ENOMEM;
 	page = grab_cache_page(mapping, index);
 	if (!page)
@@ -917,7 +917,6 @@
 
 	unlock_page(page);
 	page_cache_release(page);
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 out:
 	return ret;
 }
@@ -1146,6 +1145,19 @@
 		args->root == BTRFS_I(inode)->root);
 }
 
+struct inode *btrfs_ilookup(struct super_block *s, u64 objectid,
+			    u64 root_objectid)
+{
+	struct btrfs_iget_args args;
+	args.ino = objectid;
+	args.root = btrfs_lookup_fs_root(btrfs_sb(s)->fs_info, root_objectid);
+
+	if (!args.root)
+		return NULL;
+
+	return ilookup5(s, objectid, btrfs_find_actor, (void *)&args);
+}
+
 struct inode *btrfs_iget_locked(struct super_block *s, u64 objectid,
 				struct btrfs_root *root)
 {
@@ -1336,7 +1348,6 @@
 
 			d_type = btrfs_filetype_table[btrfs_dir_type(leaf, di)];
 			btrfs_dir_item_key_to_cpu(leaf, di, &location);
-
 			over = filldir(dirent, name_ptr, name_len,
 				       found_key.offset,
 				       location.objectid,
@@ -2054,7 +2065,6 @@
 
 	ret = -EINVAL;
 
-	down_read(&BTRFS_I(inode)->root->snap_sem);
 	lock_page(page);
 	wait_on_page_writeback(page);
 	size = i_size_read(inode);
@@ -2075,7 +2085,6 @@
 	ret = btrfs_cow_one_page(inode, page, end);
 
 out_unlock:
-	up_read(&BTRFS_I(inode)->root->snap_sem);
 	unlock_page(page);
 out:
 	return ret;
@@ -2118,7 +2127,7 @@
 	struct btrfs_root_item root_item;
 	struct btrfs_inode_item *inode_item;
 	struct extent_buffer *leaf;
-	struct btrfs_root *new_root;
+	struct btrfs_root *new_root = root;
 	struct inode *inode;
 	struct inode *dir;
 	int ret;
@@ -2230,7 +2239,7 @@
 		goto fail;
 fail:
 	nr = trans->blocks_used;
-	err = btrfs_commit_transaction(trans, root);
+	err = btrfs_commit_transaction(trans, new_root);
 	if (err && !ret)
 		ret = err;
 fail_commit:
@@ -2253,10 +2262,6 @@
 	if (!root->ref_cows)
 		return -EINVAL;
 
-	down_write(&root->snap_sem);
-	freeze_bdev(root->fs_info->sb->s_bdev);
-	thaw_bdev(root->fs_info->sb->s_bdev, root->fs_info->sb);
-
 	mutex_lock(&root->fs_info->fs_mutex);
 	ret = btrfs_check_free_space(root, 1, 0);
 	if (ret)
@@ -2264,6 +2269,9 @@
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
+	err = btrfs_commit_transaction(trans, root);
+
+	trans = btrfs_start_transaction(root, 1);
 
 	ret = btrfs_update_inode(trans, root, root->inode);
 	if (ret)
@@ -2272,9 +2280,7 @@
 	ret = btrfs_find_free_objectid(trans, root->fs_info->tree_root,
 				       0, &objectid);
 	if (ret)
-		goto fail;
-
-	memcpy(&new_root_item, &root->root_item,
+		goto fail; memcpy(&new_root_item, &root->root_item,
 	       sizeof(new_root_item));
 
 	key.objectid = objectid;
@@ -2285,12 +2291,20 @@
 	btrfs_cow_block(trans, root, root->node, NULL, 0, &tmp);
 	free_extent_buffer(tmp);
 
+	/* write the ordered inodes to force all delayed allocations to
+	 * be filled.  Once this is done, we can copy the root
+	 */
+	mutex_lock(&root->fs_info->trans_mutex);
+	btrfs_write_ordered_inodes(trans, root);
+	mutex_unlock(&root->fs_info->trans_mutex);
+
 	btrfs_copy_root(trans, root, root->node, &tmp, objectid);
 
 	btrfs_set_root_bytenr(&new_root_item, tmp->start);
 	btrfs_set_root_level(&new_root_item, btrfs_header_level(tmp));
 	ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key,
 				&new_root_item);
+printk("new root %Lu node %Lu\n", objectid, tmp->start);
 	free_extent_buffer(tmp);
 	if (ret)
 		goto fail;
@@ -2321,7 +2335,6 @@
 		ret = err;
 fail_unlock:
 	mutex_unlock(&root->fs_info->fs_mutex);
-	up_write(&root->snap_sem);
 	btrfs_btree_balance_dirty(root, nr);
 	return ret;
 }
@@ -2608,6 +2621,7 @@
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
+	ei->ordered_trans = 0;
 	return &ei->vfs_inode;
 }
 
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
new file mode 100644
index 0000000..411aba8
--- /dev/null
+++ b/fs/btrfs/ordered-data.c
@@ -0,0 +1,221 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#include <linux/gfp.h>
+#include <linux/slab.h>
+#include "ctree.h"
+#include "transaction.h"
+#include "btrfs_inode.h"
+
+struct tree_entry {
+	u64 root_objectid;
+	u64 objectid;
+	struct rb_node rb_node;
+};
+
+/*
+ * returns > 0 if entry passed (root, objectid) is > entry,
+ * < 0 if (root, objectid) < entry and zero if they are equal
+ */
+static int comp_entry(struct tree_entry *entry, u64 root_objectid,
+		      u64 objectid)
+{
+	if (root_objectid < entry->root_objectid)
+		return -1;
+	if (root_objectid > entry->root_objectid)
+		return 1;
+	if (objectid < entry->objectid)
+		return -1;
+	if (objectid > entry->objectid)
+		return 1;
+	return 0;
+}
+
+static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
+				   u64 objectid, struct rb_node *node)
+{
+	struct rb_node ** p = &root->rb_node;
+	struct rb_node * parent = NULL;
+	struct tree_entry *entry;
+	int comp;
+
+	while(*p) {
+		parent = *p;
+		entry = rb_entry(parent, struct tree_entry, rb_node);
+
+		comp = comp_entry(entry, root_objectid, objectid);
+		if (comp < 0)
+			p = &(*p)->rb_left;
+		else if (comp > 0)
+			p = &(*p)->rb_right;
+		else
+			return parent;
+	}
+
+	rb_link_node(node, parent, p);
+	rb_insert_color(node, root);
+	return NULL;
+}
+
+static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
+				     u64 objectid, struct rb_node **prev_ret)
+{
+	struct rb_node * n = root->rb_node;
+	struct rb_node *prev = NULL;
+	struct tree_entry *entry;
+	struct tree_entry *prev_entry = NULL;
+	int comp;
+
+	while(n) {
+		entry = rb_entry(n, struct tree_entry, rb_node);
+		prev = n;
+		prev_entry = entry;
+		comp = comp_entry(entry, root_objectid, objectid);
+
+		if (comp < 0)
+			n = n->rb_left;
+		else if (comp > 0)
+			n = n->rb_right;
+		else
+			return n;
+	}
+	if (!prev_ret)
+		return NULL;
+
+	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
+		prev = rb_next(prev);
+		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	}
+	*prev_ret = prev;
+	return NULL;
+}
+
+static inline struct rb_node *tree_search(struct rb_root *root,
+					  u64 root_objectid, u64 objectid)
+{
+	struct rb_node *prev;
+	struct rb_node *ret;
+	ret = __tree_search(root, root_objectid, objectid, &prev);
+	if (!ret)
+		return prev;
+	return ret;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 root_objectid = root->root_key.objectid;
+	u64 transid = root->fs_info->running_transaction->transid;
+	struct tree_entry *entry;
+	struct rb_node *node;
+	struct btrfs_ordered_inode_tree *tree;
+
+	if (transid <= BTRFS_I(inode)->ordered_trans)
+		return 0;
+
+	tree = &root->fs_info->running_transaction->ordered_inode_tree;
+
+	read_lock(&tree->lock);
+	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
+	read_unlock(&tree->lock);
+	if (node) {
+		return 0;
+	}
+
+	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	if (!entry)
+		return -ENOMEM;
+
+	write_lock(&tree->lock);
+	entry->objectid = inode->i_ino;
+	entry->root_objectid = root_objectid;
+
+	node = tree_insert(&tree->tree, root_objectid,
+			   inode->i_ino, &entry->rb_node);
+
+	BTRFS_I(inode)->ordered_trans = transid;
+
+	write_unlock(&tree->lock);
+	if (node)
+		kfree(entry);
+	return 0;
+}
+
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+	entry = rb_entry(node, struct tree_entry, rb_node);
+
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	write_unlock(&tree->lock);
+	return 1;
+}
+
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid)
+{
+	struct tree_entry *entry;
+	struct rb_node *node;
+
+	write_lock(&tree->lock);
+	node = tree_search(&tree->tree, *root_objectid, *objectid);
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	entry = rb_entry(node, struct tree_entry, rb_node);
+	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
+		node = rb_next(node);
+		if (!node)
+			break;
+		entry = rb_entry(node, struct tree_entry, rb_node);
+	}
+	if (!node) {
+		write_unlock(&tree->lock);
+		return 0;
+	}
+
+	*root_objectid = entry->root_objectid;
+	*objectid = entry->objectid;
+	rb_erase(node, &tree->tree);
+	write_unlock(&tree->lock);
+	kfree(entry);
+	return 1;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
new file mode 100644
index 0000000..aaf9eb1
--- /dev/null
+++ b/fs/btrfs/ordered-data.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+
+#ifndef __BTRFS_ORDERED_DATA__
+#define __BTRFS_ORDERED_DATA__
+
+struct btrfs_ordered_inode_tree {
+	rwlock_t lock;
+	struct rb_root tree;
+};
+
+static inline void
+btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
+{
+	rwlock_init(&t->lock);
+	t->tree.rb_node = NULL;
+}
+
+int btrfs_add_ordered_inode(struct inode *inode);
+int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
+				       u64 *root_objectid, u64 *objectid);
+#endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index 02721ee..3ed5868 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,6 +67,7 @@
 		cur_trans->commit_done = 0;
 		cur_trans->start_time = get_seconds();
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
+		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_map_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -473,6 +474,60 @@
 	return ret;
 }
 
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root)
+{
+	struct btrfs_transaction *cur_trans = trans->transaction;
+	struct inode *inode;
+	u64 root_objectid = 0;
+	u64 objectid = 0;
+	u64 transid = trans->transid;
+	int ret;
+
+printk("write ordered trans %Lu\n", transid);
+	while(1) {
+		ret = btrfs_find_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_fdatawrite(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+	while(1) {
+		root_objectid = 0;
+		objectid = 0;
+		ret = btrfs_find_del_first_ordered_inode(
+				&cur_trans->ordered_inode_tree,
+				&root_objectid, &objectid);
+		if (!ret)
+			break;
+		mutex_unlock(&root->fs_info->trans_mutex);
+		mutex_unlock(&root->fs_info->fs_mutex);
+		inode = btrfs_ilookup(root->fs_info->sb, objectid,
+				      root_objectid);
+		if (inode) {
+			if (S_ISREG(inode->i_mode))
+				filemap_write_and_wait(inode->i_mapping);
+			iput(inode);
+		}
+		mutex_lock(&root->fs_info->fs_mutex);
+		mutex_lock(&root->fs_info->trans_mutex);
+	}
+printk("done write ordered trans %Lu\n", transid);
+	return 0;
+}
+
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root)
 {
@@ -550,10 +605,13 @@
 		mutex_lock(&root->fs_info->fs_mutex);
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
+		ret = btrfs_write_ordered_inodes(trans, root);
+
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
 	WARN_ON(cur_trans != trans->transaction);
+
 	ret = add_dirty_roots(trans, &root->fs_info->fs_roots_radix,
 			      &dirty_fs_roots);
 	BUG_ON(ret);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index eef840b..c157ddb 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -16,9 +16,10 @@
  * Boston, MA 021110-1307, USA.
  */
 
-#ifndef __TRANSACTION__
-#define __TRANSACTION__
+#ifndef __BTRFS_TRANSACTION__
+#define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
+#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -30,6 +31,7 @@
 	struct list_head list;
 	struct extent_map_tree dirty_pages;
 	unsigned long start_time;
+	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 };
@@ -90,4 +92,6 @@
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
+int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root);
 #endif