Btrfs: New data=ordered implementation

The old data=ordered code would force commit to wait until
all the data extents from the transaction were fully on disk.  This
introduced large latencies into the commit and stalled new writers
in the transaction for a long time.

The new code changes the way data allocations and extents work:

* When delayed allocation is filled, data extents are reserved, and
  the extent bit EXTENT_ORDERED is set on the entire range of the extent.
  A struct btrfs_ordered_extent is allocated and inserted into a per-inode
  rbtree to track the pending extents.

* As each page is written, EXTENT_ORDERED is cleared on the bytes corresponding
  to that page.

* When all of the bytes corresponding to a single struct btrfs_ordered_extent
  are written, the previously reserved extent is inserted into the FS
  btree and into the extent allocation trees.  The checksums for the file
  data are also updated.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 40b4e0c..8d03687 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -21,6 +21,7 @@
 
 #include "extent_map.h"
 #include "extent_io.h"
+#include "ordered-data.h"
 
 /* in memory btrfs inode */
 struct btrfs_inode {
@@ -32,9 +33,8 @@
 	struct extent_io_tree io_failure_tree;
 	struct mutex csum_mutex;
 	struct inode vfs_inode;
-	atomic_t ordered_writeback;
+	struct btrfs_ordered_inode_tree ordered_tree;
 
-	u64 ordered_trans;
 	/*
 	 * transid of the trans_handle that last modified this inode
 	 */
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index f3783db..ceebc05 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -25,6 +25,7 @@
 #include <linux/fs.h>
 #include <linux/completion.h>
 #include <linux/backing-dev.h>
+#include <linux/wait.h>
 #include <asm/kmap_types.h>
 #include "bit-radix.h"
 #include "extent_io.h"
@@ -37,6 +38,7 @@
 extern struct kmem_cache *btrfs_transaction_cachep;
 extern struct kmem_cache *btrfs_bit_radix_cachep;
 extern struct kmem_cache *btrfs_path_cachep;
+struct btrfs_ordered_sum;
 
 #define BTRFS_MAGIC "_B5RfS_M"
 
@@ -510,6 +512,7 @@
 	u64 max_inline;
 	u64 alloc_start;
 	struct btrfs_transaction *running_transaction;
+	wait_queue_head_t transaction_throttle;
 	struct btrfs_super_block super_copy;
 	struct btrfs_super_block super_for_commit;
 	struct block_device *__bdev;
@@ -541,6 +544,7 @@
 	 */
 	struct btrfs_workers workers;
 	struct btrfs_workers endio_workers;
+	struct btrfs_workers endio_write_workers;
 	struct btrfs_workers submit_workers;
 	struct task_struct *transaction_kthread;
 	struct task_struct *cleaner_kthread;
@@ -1384,6 +1388,17 @@
 		       u64 owner, u64 owner_offset,
 		       u64 empty_size, u64 hint_byte,
 		       u64 search_end, struct btrfs_key *ins, u64 data);
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins);
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data);
 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
 		  struct extent_buffer *buf);
 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root
@@ -1556,9 +1571,9 @@
 			     u64 bytenr, int mod);
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums);
+			   struct btrfs_ordered_sum *sums);
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret);
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret);
 struct btrfs_csum_item *btrfs_lookup_csum(struct btrfs_trans_handle *trans,
 					  struct btrfs_root *root,
 					  struct btrfs_path *path,
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index b01b3f4..4a5ebaf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -407,7 +407,11 @@
 	end_io_wq->error = err;
 	end_io_wq->work.func = end_workqueue_fn;
 	end_io_wq->work.flags = 0;
-	btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
+	if (bio->bi_rw & (1 << BIO_RW))
+		btrfs_queue_worker(&fs_info->endio_write_workers,
+				   &end_io_wq->work);
+	else
+		btrfs_queue_worker(&fs_info->endio_workers, &end_io_wq->work);
 
 #if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,23)
 	return 0;
@@ -1286,6 +1290,7 @@
 	mutex_init(&fs_info->transaction_kthread_mutex);
 	mutex_init(&fs_info->cleaner_mutex);
 	mutex_init(&fs_info->volume_mutex);
+	init_waitqueue_head(&fs_info->transaction_throttle);
 
 #if 0
 	ret = add_hasher(fs_info, "crc32c");
@@ -1325,9 +1330,13 @@
 	btrfs_init_workers(&fs_info->workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->submit_workers, fs_info->thread_pool_size);
 	btrfs_init_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_init_workers(&fs_info->endio_write_workers,
+			   fs_info->thread_pool_size);
 	btrfs_start_workers(&fs_info->workers, 1);
 	btrfs_start_workers(&fs_info->submit_workers, 1);
 	btrfs_start_workers(&fs_info->endio_workers, fs_info->thread_pool_size);
+	btrfs_start_workers(&fs_info->endio_write_workers,
+			    fs_info->thread_pool_size);
 
 	err = -EINVAL;
 	if (btrfs_super_num_devices(disk_super) > fs_devices->open_devices) {
@@ -1447,6 +1456,7 @@
 	extent_io_tree_empty_lru(&BTRFS_I(fs_info->btree_inode)->io_tree);
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 fail_iput:
 	iput(fs_info->btree_inode);
@@ -1702,6 +1712,7 @@
 
 	btrfs_stop_workers(&fs_info->workers);
 	btrfs_stop_workers(&fs_info->endio_workers);
+	btrfs_stop_workers(&fs_info->endio_write_workers);
 	btrfs_stop_workers(&fs_info->submit_workers);
 
 	iput(fs_info->btree_inode);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 8ebfa6b..343d110 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1895,36 +1895,17 @@
 	return ret;
 }
 
-/*
- * finds a free extent and does all the dirty work required for allocation
- * returns the key for the extent through ins, and a tree buffer for
- * the first block of the extent through buf.
- *
- * returns 0 if everything worked, non-zero otherwise.
- */
-int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
-		       struct btrfs_root *root,
-		       u64 num_bytes, u64 min_alloc_size,
-		       u64 root_objectid, u64 ref_generation,
-		       u64 owner, u64 owner_offset,
-		       u64 empty_size, u64 hint_byte,
-		       u64 search_end, struct btrfs_key *ins, u64 data)
+static int __btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
 {
 	int ret;
-	int pending_ret;
-	u64 super_used;
-	u64 root_used;
 	u64 search_start = 0;
 	u64 alloc_profile;
-	u32 sizes[2];
 	struct btrfs_fs_info *info = root->fs_info;
-	struct btrfs_root *extent_root = info->extent_root;
-	struct btrfs_extent_item *extent_item;
-	struct btrfs_extent_ref *ref;
-	struct btrfs_path *path;
-	struct btrfs_key keys[2];
-
-	maybe_lock_mutex(root);
 
 	if (data) {
 		alloc_profile = info->avail_data_alloc_bits &
@@ -1974,11 +1955,48 @@
 	}
 	if (ret) {
 		printk("allocation failed flags %Lu\n", data);
-	}
-	if (ret) {
 		BUG();
-		goto out;
 	}
+	clear_extent_dirty(&root->fs_info->free_space_cache,
+			   ins->objectid, ins->objectid + ins->offset - 1,
+			   GFP_NOFS);
+	return 0;
+}
+
+int btrfs_reserve_extent(struct btrfs_trans_handle *trans,
+				  struct btrfs_root *root,
+				  u64 num_bytes, u64 min_alloc_size,
+				  u64 empty_size, u64 hint_byte,
+				  u64 search_end, struct btrfs_key *ins,
+				  u64 data)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_reserve_extent(trans, root, num_bytes, min_alloc_size,
+				     empty_size, hint_byte, search_end, ins,
+				     data);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+
+static int __btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+					 struct btrfs_root *root,
+					 u64 root_objectid, u64 ref_generation,
+					 u64 owner, u64 owner_offset,
+					 struct btrfs_key *ins)
+{
+	int ret;
+	int pending_ret;
+	u64 super_used;
+	u64 root_used;
+	u64 num_bytes = ins->offset;
+	u32 sizes[2];
+	struct btrfs_fs_info *info = root->fs_info;
+	struct btrfs_root *extent_root = info->extent_root;
+	struct btrfs_extent_item *extent_item;
+	struct btrfs_extent_ref *ref;
+	struct btrfs_path *path;
+	struct btrfs_key keys[2];
 
 	/* block accounting for super block */
 	spin_lock_irq(&info->delalloc_lock);
@@ -1990,10 +2008,6 @@
 	root_used = btrfs_root_used(&root->root_item);
 	btrfs_set_root_used(&root->root_item, root_used + num_bytes);
 
-	clear_extent_dirty(&root->fs_info->free_space_cache,
-			   ins->objectid, ins->objectid + ins->offset - 1,
-			   GFP_NOFS);
-
 	if (root == extent_root) {
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
@@ -2001,10 +2015,6 @@
 		goto update_block;
 	}
 
-	WARN_ON(trans->alloc_exclude_nr);
-	trans->alloc_exclude_start = ins->objectid;
-	trans->alloc_exclude_nr = ins->offset;
-
 	memcpy(&keys[0], ins, sizeof(*ins));
 	keys[1].offset = hash_extent_ref(root_objectid, ref_generation,
 					 owner, owner_offset);
@@ -2054,6 +2064,51 @@
 		BUG();
 	}
 out:
+	return ret;
+}
+
+int btrfs_alloc_reserved_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_root *root,
+				u64 root_objectid, u64 ref_generation,
+				u64 owner, u64 owner_offset,
+				struct btrfs_key *ins)
+{
+	int ret;
+	maybe_lock_mutex(root);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	maybe_unlock_mutex(root);
+	return ret;
+}
+/*
+ * finds a free extent and does all the dirty work required for allocation
+ * returns the key for the extent through ins, and a tree buffer for
+ * the first block of the extent through buf.
+ *
+ * returns 0 if everything worked, non-zero otherwise.
+ */
+int btrfs_alloc_extent(struct btrfs_trans_handle *trans,
+		       struct btrfs_root *root,
+		       u64 num_bytes, u64 min_alloc_size,
+		       u64 root_objectid, u64 ref_generation,
+		       u64 owner, u64 owner_offset,
+		       u64 empty_size, u64 hint_byte,
+		       u64 search_end, struct btrfs_key *ins, u64 data)
+{
+	int ret;
+
+	maybe_lock_mutex(root);
+
+	ret = __btrfs_reserve_extent(trans, root, num_bytes,
+				     min_alloc_size, empty_size, hint_byte,
+				     search_end, ins, data);
+	BUG_ON(ret);
+	ret = __btrfs_alloc_reserved_extent(trans, root, root_objectid,
+					    ref_generation, owner,
+					    owner_offset, ins);
+	BUG_ON(ret);
+
 	maybe_unlock_mutex(root);
 	return ret;
 }
@@ -2288,8 +2343,8 @@
 			mutex_lock(&root->fs_info->alloc_mutex);
 
 			/* we've dropped the lock, double check */
-			ret = drop_snap_lookup_refcount(root, bytenr,
-						blocksize, &refs);
+			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
+						&refs);
 			BUG_ON(ret);
 			if (refs != 1) {
 				parent = path->nodes[*level];
@@ -2584,7 +2639,6 @@
 	kfree(ra);
 	trans = btrfs_start_transaction(BTRFS_I(inode)->root, 1);
 	if (trans) {
-		btrfs_add_ordered_inode(inode);
 		btrfs_end_transaction(trans, BTRFS_I(inode)->root);
 		mark_inode_dirty(inode);
 	}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 40a5f53..3f82a6e 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -793,6 +793,13 @@
 }
 EXPORT_SYMBOL(set_extent_dirty);
 
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask)
+{
+	return set_extent_bit(tree, start, end, EXTENT_ORDERED, 0, NULL, mask);
+}
+EXPORT_SYMBOL(set_extent_ordered);
+
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask)
 {
@@ -812,8 +819,8 @@
 		     gfp_t mask)
 {
 	return set_extent_bit(tree, start, end,
-			      EXTENT_DELALLOC | EXTENT_DIRTY, 0, NULL,
-			      mask);
+			      EXTENT_DELALLOC | EXTENT_DIRTY,
+			      0, NULL, mask);
 }
 EXPORT_SYMBOL(set_extent_delalloc);
 
@@ -825,6 +832,13 @@
 }
 EXPORT_SYMBOL(clear_extent_dirty);
 
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+			 gfp_t mask)
+{
+	return clear_extent_bit(tree, start, end, EXTENT_ORDERED, 1, 0, mask);
+}
+EXPORT_SYMBOL(clear_extent_ordered);
+
 int set_extent_new(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask)
 {
@@ -1395,10 +1409,9 @@
 
 		if (--bvec >= bio->bi_io_vec)
 			prefetchw(&bvec->bv_page->flags);
-
 		if (tree->ops && tree->ops->writepage_end_io_hook) {
 			ret = tree->ops->writepage_end_io_hook(page, start,
-						       end, state);
+						       end, state, uptodate);
 			if (ret)
 				uptodate = 0;
 		}
@@ -1868,9 +1881,14 @@
 			unlock_extent(tree, cur, end, GFP_NOFS);
 			break;
 		}
-
 		extent_offset = cur - em->start;
+		if (extent_map_end(em) <= cur) {
+printk("bad mapping em [%Lu %Lu] cur %Lu\n", em->start, extent_map_end(em), cur);
+		}
 		BUG_ON(extent_map_end(em) <= cur);
+		if (end < cur) {
+printk("2bad mapping end %Lu cur %Lu\n", end, cur);
+		}
 		BUG_ON(end < cur);
 
 		iosize = min(extent_map_end(em) - cur, end - cur + 1);
@@ -1976,6 +1994,7 @@
 	u64 last_byte = i_size_read(inode);
 	u64 block_start;
 	u64 iosize;
+	u64 unlock_start;
 	sector_t sector;
 	struct extent_map *em;
 	struct block_device *bdev;
@@ -1988,7 +2007,6 @@
 	u64 nr_delalloc;
 	u64 delalloc_end;
 
-
 	WARN_ON(!PageLocked(page));
 	page_offset = i_size & (PAGE_CACHE_SIZE - 1);
 	if (page->index > end_index ||
@@ -2030,6 +2048,7 @@
 		delalloc_start = delalloc_end + 1;
 	}
 	lock_extent(tree, start, page_end, GFP_NOFS);
+	unlock_start = start;
 
 	end = page_end;
 	if (test_range_bit(tree, start, page_end, EXTENT_DELALLOC, 0)) {
@@ -2038,6 +2057,11 @@
 
 	if (last_byte <= start) {
 		clear_extent_dirty(tree, start, page_end, GFP_NOFS);
+		unlock_extent(tree, start, page_end, GFP_NOFS);
+		if (tree->ops && tree->ops->writepage_end_io_hook)
+			tree->ops->writepage_end_io_hook(page, start,
+							 page_end, NULL, 1);
+		unlock_start = page_end + 1;
 		goto done;
 	}
 
@@ -2047,6 +2071,11 @@
 	while (cur <= end) {
 		if (cur >= last_byte) {
 			clear_extent_dirty(tree, cur, page_end, GFP_NOFS);
+			unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 page_end, NULL, 1);
+			unlock_start = page_end + 1;
 			break;
 		}
 		em = epd->get_extent(inode, page, page_offset, cur,
@@ -2071,8 +2100,16 @@
 		    block_start == EXTENT_MAP_INLINE) {
 			clear_extent_dirty(tree, cur,
 					   cur + iosize - 1, GFP_NOFS);
+
+			unlock_extent(tree, unlock_start, cur + iosize -1,
+				      GFP_NOFS);
+			if (tree->ops && tree->ops->writepage_end_io_hook)
+				tree->ops->writepage_end_io_hook(page, cur,
+							 cur + iosize - 1,
+							 NULL, 1);
 			cur = cur + iosize;
 			page_offset += iosize;
+			unlock_start = cur;
 			continue;
 		}
 
@@ -2119,7 +2156,8 @@
 		set_page_writeback(page);
 		end_page_writeback(page);
 	}
-	unlock_extent(tree, start, page_end, GFP_NOFS);
+	if (unlock_start <= page_end)
+		unlock_extent(tree, unlock_start, page_end, GFP_NOFS);
 	unlock_page(page);
 	return 0;
 }
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f1960da..2268a79 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -13,6 +13,8 @@
 #define EXTENT_DEFRAG (1 << 6)
 #define EXTENT_DEFRAG_DONE (1 << 7)
 #define EXTENT_BUFFER_FILLED (1 << 8)
+#define EXTENT_ORDERED (1 << 9)
+#define EXTENT_ORDERED_METADATA (1 << 10)
 #define EXTENT_IOBITS (EXTENT_LOCKED | EXTENT_WRITEBACK)
 
 /*
@@ -42,7 +44,7 @@
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
 	int (*writepage_end_io_hook)(struct page *page, u64 start, u64 end,
-				      struct extent_state *state);
+				      struct extent_state *state, int uptodate);
 	int (*set_bit_hook)(struct inode *inode, u64 start, u64 end,
 			    unsigned long old, unsigned long bits);
 	int (*clear_bit_hook)(struct inode *inode, u64 start, u64 end,
@@ -131,6 +133,8 @@
 		   int bits, int filled);
 int clear_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		      int bits, gfp_t mask);
+int clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
+		     int bits, int wake, int delete, gfp_t mask);
 int set_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
 		    int bits, gfp_t mask);
 int set_extent_uptodate(struct extent_io_tree *tree, u64 start, u64 end,
@@ -141,8 +145,14 @@
 		     gfp_t mask);
 int clear_extent_dirty(struct extent_io_tree *tree, u64 start, u64 end,
 		       gfp_t mask);
+int clear_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		       gfp_t mask);
+int clear_extent_ordered_metadata(struct extent_io_tree *tree, u64 start,
+				  u64 end, gfp_t mask);
 int set_extent_delalloc(struct extent_io_tree *tree, u64 start, u64 end,
 		     gfp_t mask);
+int set_extent_ordered(struct extent_io_tree *tree, u64 start, u64 end,
+		     gfp_t mask);
 int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
 			  u64 *start_ret, u64 *end_ret, int bits);
 struct extent_state *find_first_extent_bit_state(struct extent_io_tree *tree,
@@ -209,6 +219,8 @@
 			  unsigned long start, unsigned long len);
 int wait_on_extent_buffer_writeback(struct extent_io_tree *tree,
 				    struct extent_buffer *eb);
+int wait_on_extent_writeback(struct extent_io_tree *tree, u64 start, u64 end);
+int wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, int bits);
 int clear_extent_buffer_dirty(struct extent_io_tree *tree,
 			      struct extent_buffer *eb);
 int set_extent_buffer_dirty(struct extent_io_tree *tree,
diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c
index f5a04eb..8112327 100644
--- a/fs/btrfs/extent_map.c
+++ b/fs/btrfs/extent_map.c
@@ -206,10 +206,11 @@
 	struct extent_map *merge = NULL;
 	struct rb_node *rb;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb = tree_insert(&tree->map, em->start, &em->rb_node);
 	if (rb) {
-		merge = rb_entry(rb, struct extent_map, rb_node);
 		ret = -EEXIST;
+		free_extent_map(merge);
 		goto out;
 	}
 	atomic_inc(&em->refs);
@@ -268,6 +269,7 @@
 	struct rb_node *next = NULL;
 	u64 end = range_end(start, len);
 
+	BUG_ON(spin_trylock(&tree->lock));
 	em = tree->last;
 	if (em && end > em->start && start < extent_map_end(em))
 		goto found;
@@ -318,6 +320,7 @@
 {
 	int ret = 0;
 
+	BUG_ON(spin_trylock(&tree->lock));
 	rb_erase(&em->rb_node, &tree->map);
 	em->in_tree = 0;
 	if (tree->last == em)
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index f537eb4..345caf8 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -135,26 +135,37 @@
 }
 
 int btrfs_csum_one_bio(struct btrfs_root *root,
-		       struct bio *bio, char **sums_ret)
+		       struct bio *bio, struct btrfs_ordered_sum **sums_ret)
 {
-	u32 *sums;
+	struct btrfs_ordered_sum *sums;
+	struct btrfs_sector_sum *sector_sum;
 	char *data;
 	struct bio_vec *bvec = bio->bi_io_vec;
 	int bio_index = 0;
 
-	sums = kmalloc(bio->bi_vcnt * BTRFS_CRC32_SIZE, GFP_NOFS);
+	WARN_ON(bio->bi_vcnt <= 0);
+	sums = kzalloc(btrfs_ordered_sum_size(root, bio->bi_size), GFP_NOFS);
 	if (!sums)
 		return -ENOMEM;
-	*sums_ret = (char *)sums;
+	*sums_ret = sums;
+	sector_sum = &sums->sums;
+	sums->file_offset = page_offset(bvec->bv_page);
+	sums->len = bio->bi_size;
+	INIT_LIST_HEAD(&sums->list);
 
 	while(bio_index < bio->bi_vcnt) {
 		data = kmap_atomic(bvec->bv_page, KM_USER0);
-		*sums = ~(u32)0;
-		*sums = btrfs_csum_data(root, data + bvec->bv_offset,
-					*sums, bvec->bv_len);
+		sector_sum->sum = ~(u32)0;
+		sector_sum->sum = btrfs_csum_data(root,
+						  data + bvec->bv_offset,
+						  sector_sum->sum,
+						  bvec->bv_len);
 		kunmap_atomic(data, KM_USER0);
-		btrfs_csum_final(*sums, (char *)sums);
-		sums++;
+		btrfs_csum_final(sector_sum->sum,
+				 (char *)&sector_sum->sum);
+		sector_sum->offset = page_offset(bvec->bv_page) +
+			bvec->bv_offset;
+		sector_sum++;
 		bio_index++;
 		bvec++;
 	}
@@ -163,7 +174,7 @@
 
 int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, struct inode *inode,
-			   struct bio *bio, char *sums)
+			   struct btrfs_ordered_sum *sums)
 {
 	u64 objectid = inode->i_ino;
 	u64 offset;
@@ -171,17 +182,16 @@
 	struct btrfs_key file_key;
 	struct btrfs_key found_key;
 	u64 next_offset;
+	u64 total_bytes = 0;
 	int found_next;
 	struct btrfs_path *path;
 	struct btrfs_csum_item *item;
 	struct btrfs_csum_item *item_end;
 	struct extent_buffer *leaf = NULL;
 	u64 csum_offset;
-	u32 *sums32 = (u32 *)sums;
+	struct btrfs_sector_sum *sector_sum;
 	u32 nritems;
 	u32 ins_size;
-	int bio_index = 0;
-	struct bio_vec *bvec = bio->bi_io_vec;
 	char *eb_map;
 	char *eb_token;
 	unsigned long map_len;
@@ -189,10 +199,11 @@
 
 	path = btrfs_alloc_path();
 	BUG_ON(!path);
+	sector_sum = &sums->sums;
 again:
 	next_offset = (u64)-1;
 	found_next = 0;
-	offset = page_offset(bvec->bv_page) + bvec->bv_offset;
+	offset = sector_sum->offset;
 	file_key.objectid = objectid;
 	file_key.offset = offset;
 	btrfs_set_key_type(&file_key, BTRFS_CSUM_ITEM_KEY);
@@ -303,7 +314,7 @@
 	item_end = (struct btrfs_csum_item *)((unsigned char *)item_end +
 				      btrfs_item_size_nr(leaf, path->slots[0]));
 	eb_token = NULL;
-next_bvec:
+next_sector:
 
 	if (!eb_token ||
 	   (unsigned long)item  + BTRFS_CRC32_SIZE >= map_start + map_len) {
@@ -321,21 +332,20 @@
 	}
 	if (eb_token) {
 		memcpy(eb_token + ((unsigned long)item & (PAGE_CACHE_SIZE - 1)),
-		       sums32, BTRFS_CRC32_SIZE);
+		       &sector_sum->sum, BTRFS_CRC32_SIZE);
 	} else {
-		write_extent_buffer(leaf, sums32, (unsigned long)item,
-				    BTRFS_CRC32_SIZE);
+		write_extent_buffer(leaf, &sector_sum->sum,
+				    (unsigned long)item, BTRFS_CRC32_SIZE);
 	}
-	bio_index++;
-	bvec++;
-	sums32++;
-	if (bio_index < bio->bi_vcnt) {
+	total_bytes += root->sectorsize;
+	sector_sum++;
+	if (total_bytes < sums->len) {
 		item = (struct btrfs_csum_item *)((char *)item +
 						  BTRFS_CRC32_SIZE);
 		if (item < item_end && offset + PAGE_CACHE_SIZE ==
-		    page_offset(bvec->bv_page)) {
-			offset = page_offset(bvec->bv_page);
-			goto next_bvec;
+		    sector_sum->offset) {
+			    offset = sector_sum->offset;
+			goto next_sector;
 		}
 	}
 	if (eb_token) {
@@ -343,7 +353,7 @@
 		eb_token = NULL;
 	}
 	btrfs_mark_buffer_dirty(path->nodes[0]);
-	if (bio_index < bio->bi_vcnt) {
+	if (total_bytes < sums->len) {
 		btrfs_release_path(root, path);
 		goto again;
 	}
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 8037792..12e765f 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -34,7 +34,6 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 #include "ioctl.h"
 #include "print-tree.h"
 #include "compat.h"
@@ -273,7 +272,9 @@
 		u64 mask = root->sectorsize - 1;
 		last_pos_in_file = (isize + mask) & ~mask;
 		hole_size = (start_pos - last_pos_in_file + mask) & ~mask;
-		if (last_pos_in_file < start_pos) {
+		if (hole_size > 0) {
+			btrfs_wait_ordered_range(inode, last_pos_in_file,
+						 last_pos_in_file + hole_size);
 			err = btrfs_drop_extents(trans, root, inode,
 						 last_pos_in_file,
 						 last_pos_in_file + hole_size,
@@ -303,19 +304,17 @@
 	    inline_size > root->fs_info->max_inline ||
 	    (inline_size & (root->sectorsize -1)) == 0 ||
 	    inline_size >= BTRFS_MAX_INLINE_DATA_SIZE(root)) {
-		u64 last_end;
-
+		/* check for reserved extents on each page, we don't want
+		 * to reset the delalloc bit on things that already have
+		 * extents reserved.
+		 */
+		set_extent_delalloc(io_tree, start_pos,
+				    end_of_last_block, GFP_NOFS);
 		for (i = 0; i < num_pages; i++) {
 			struct page *p = pages[i];
 			SetPageUptodate(p);
 			set_page_dirty(p);
 		}
-		last_end = (u64)(pages[num_pages -1]->index) <<
-				PAGE_CACHE_SHIFT;
-		last_end += PAGE_CACHE_SIZE - 1;
-		set_extent_delalloc(io_tree, start_pos, end_of_last_block,
-				 GFP_NOFS);
-		btrfs_add_ordered_inode(inode);
 	} else {
 		u64 aligned_end;
 		/* step one, delete the existing extents in this range */
@@ -350,10 +349,13 @@
 	struct extent_map *split = NULL;
 	struct extent_map *split2 = NULL;
 	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	struct extent_map *tmp;
 	u64 len = end - start + 1;
+	u64 next_start;
 	int ret;
 	int testend = 1;
 
+	WARN_ON(end < start);
 	if (end == (u64)-1) {
 		len = (u64)-1;
 		testend = 0;
@@ -370,6 +372,8 @@
 			spin_unlock(&em_tree->lock);
 			break;
 		}
+		tmp = rb_entry(&em->rb_node, struct extent_map, rb_node);
+		next_start = tmp->start;
 		remove_extent_mapping(em_tree, em);
 
 		if (em->block_start < EXTENT_MAP_LAST_BYTE &&
@@ -778,37 +782,58 @@
 	struct inode *inode = fdentry(file)->d_inode;
 	int err = 0;
 	u64 start_pos;
+	u64 last_pos;
 
 	start_pos = pos & ~((u64)root->sectorsize - 1);
+	last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
 
 	memset(pages, 0, num_pages * sizeof(struct page *));
-
+again:
 	for (i = 0; i < num_pages; i++) {
 		pages[i] = grab_cache_page(inode->i_mapping, index + i);
 		if (!pages[i]) {
 			err = -ENOMEM;
 			BUG_ON(1);
 		}
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(pages[i]);
-#else
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-#endif
 		wait_on_page_writeback(pages[i]);
-		set_page_extent_mapped(pages[i]);
-		WARN_ON(!PageLocked(pages[i]));
 	}
 	if (start_pos < inode->i_size) {
-		u64 last_pos;
-		last_pos = ((u64)index + num_pages) << PAGE_CACHE_SHIFT;
+		struct btrfs_ordered_extent *ordered;
 		lock_extent(&BTRFS_I(inode)->io_tree,
 			    start_pos, last_pos - 1, GFP_NOFS);
+		ordered = btrfs_lookup_first_ordered_extent(inode, last_pos -1);
+		if (ordered &&
+		    ordered->file_offset + ordered->len > start_pos &&
+		    ordered->file_offset < last_pos) {
+			btrfs_put_ordered_extent(ordered);
+			unlock_extent(&BTRFS_I(inode)->io_tree,
+				      start_pos, last_pos - 1, GFP_NOFS);
+			for (i = 0; i < num_pages; i++) {
+				unlock_page(pages[i]);
+				page_cache_release(pages[i]);
+			}
+			btrfs_wait_ordered_range(inode, start_pos,
+						 last_pos - start_pos);
+			goto again;
+		}
+		if (ordered)
+			btrfs_put_ordered_extent(ordered);
+
 		clear_extent_bits(&BTRFS_I(inode)->io_tree, start_pos,
 				  last_pos - 1, EXTENT_DIRTY | EXTENT_DELALLOC,
 				  GFP_NOFS);
 		unlock_extent(&BTRFS_I(inode)->io_tree,
 			      start_pos, last_pos - 1, GFP_NOFS);
 	}
+	for (i = 0; i < num_pages; i++) {
+#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
+		ClearPageDirty(pages[i]);
+#else
+		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
+#endif
+		set_page_extent_mapped(pages[i]);
+		WARN_ON(!PageLocked(pages[i]));
+	}
 	return 0;
 }
 
@@ -969,13 +994,11 @@
 		     (start_pos + num_written - 1) >> PAGE_CACHE_SHIFT);
 	}
 	current->backing_dev_info = NULL;
-	btrfs_ordered_throttle(root, inode);
 	return num_written ? num_written : err;
 }
 
 int btrfs_release_file(struct inode * inode, struct file * filp)
 {
-	btrfs_del_ordered_inode(inode, 0);
 	if (filp->private_data)
 		btrfs_ioctl_trans_end(filp);
 	return 0;
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index d39433d..c5a62f0 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -43,6 +43,7 @@
 #include "ioctl.h"
 #include "print-tree.h"
 #include "volumes.h"
+#include "ordered-data.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -109,10 +110,11 @@
 	u64 num_bytes;
 	u64 cur_alloc_size;
 	u64 blocksize = root->sectorsize;
-	u64 orig_start = start;
 	u64 orig_num_bytes;
 	struct btrfs_key ins;
-	int ret;
+	struct extent_map *em;
+	struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
+	int ret = 0;
 
 	trans = btrfs_start_transaction(root, 1);
 	BUG_ON(!trans);
@@ -120,33 +122,44 @@
 
 	num_bytes = (end - start + blocksize) & ~(blocksize - 1);
 	num_bytes = max(blocksize,  num_bytes);
-	ret = btrfs_drop_extents(trans, root, inode,
-				 start, start + num_bytes, start, &alloc_hint);
 	orig_num_bytes = num_bytes;
 
 	if (alloc_hint == EXTENT_MAP_INLINE)
 		goto out;
 
 	BUG_ON(num_bytes > btrfs_super_total_bytes(&root->fs_info->super_copy));
+	btrfs_drop_extent_cache(inode, start, start + num_bytes - 1);
 
 	while(num_bytes > 0) {
 		cur_alloc_size = min(num_bytes, root->fs_info->max_extent);
-		ret = btrfs_alloc_extent(trans, root, cur_alloc_size,
-					 root->sectorsize,
-					 root->root_key.objectid,
-					 trans->transid,
-					 inode->i_ino, start, 0,
-					 alloc_hint, (u64)-1, &ins, 1);
+		ret = btrfs_reserve_extent(trans, root, cur_alloc_size,
+					   root->sectorsize, 0, 0,
+					   (u64)-1, &ins, 1);
 		if (ret) {
 			WARN_ON(1);
 			goto out;
 		}
+		em = alloc_extent_map(GFP_NOFS);
+		em->start = start;
+		em->len = ins.offset;
+		em->block_start = ins.objectid;
+		em->bdev = root->fs_info->fs_devices->latest_bdev;
+		while(1) {
+			spin_lock(&em_tree->lock);
+			ret = add_extent_mapping(em_tree, em);
+			spin_unlock(&em_tree->lock);
+			if (ret != -EEXIST) {
+				free_extent_map(em);
+				break;
+			}
+			btrfs_drop_extent_cache(inode, start,
+						start + ins.offset - 1);
+		}
+
 		cur_alloc_size = ins.offset;
-		ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
-					       start, ins.objectid, ins.offset,
-					       ins.offset, 0);
-		inode->i_blocks += ins.offset >> 9;
-		btrfs_check_file(root, inode);
+		ret = btrfs_add_ordered_extent(inode, start, ins.objectid,
+					       ins.offset);
+		BUG_ON(ret);
 		if (num_bytes < cur_alloc_size) {
 			printk("num_bytes %Lu cur_alloc %Lu\n", num_bytes,
 			       cur_alloc_size);
@@ -156,10 +169,6 @@
 		alloc_hint = ins.objectid + ins.offset;
 		start += cur_alloc_size;
 	}
-	btrfs_drop_extent_cache(inode, orig_start,
-				orig_start + orig_num_bytes - 1);
-	btrfs_add_ordered_inode(inode);
-	btrfs_update_inode(trans, root, inode);
 out:
 	btrfs_end_transaction(trans, root);
 	return ret;
@@ -341,25 +350,15 @@
 			  int mirror_num)
 {
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	struct btrfs_trans_handle *trans;
 	int ret = 0;
-	char *sums = NULL;
+	struct btrfs_ordered_sum *sums;
 
 	ret = btrfs_csum_one_bio(root, bio, &sums);
 	BUG_ON(ret);
 
-	trans = btrfs_start_transaction(root, 1);
-
-	btrfs_set_trans_block_group(trans, inode);
-	mutex_lock(&BTRFS_I(inode)->csum_mutex);
-	btrfs_csum_file_blocks(trans, root, inode, bio, sums);
-	mutex_unlock(&BTRFS_I(inode)->csum_mutex);
-
-	ret = btrfs_end_transaction(trans, root);
+	ret = btrfs_add_ordered_sum(inode, sums);
 	BUG_ON(ret);
 
-	kfree(sums);
-
 	return btrfs_map_bio(root, rw, bio, mirror_num, 1);
 }
 
@@ -369,14 +368,10 @@
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	int ret = 0;
 
-	if (!(rw & (1 << BIO_RW))) {
-		ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
-		BUG_ON(ret);
-		goto mapit;
-	}
+	ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0);
+	BUG_ON(ret);
 
-	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM)) {
+	if (!(rw & (1 << BIO_RW))) {
 		goto mapit;
 	}
 
@@ -387,6 +382,96 @@
 	return btrfs_map_bio(root, rw, bio, mirror_num, 0);
 }
 
+static int add_pending_csums(struct btrfs_trans_handle *trans,
+			     struct inode *inode, u64 file_offset,
+			     struct list_head *list)
+{
+	struct list_head *cur;
+	struct btrfs_ordered_sum *sum;
+
+	btrfs_set_trans_block_group(trans, inode);
+	while(!list_empty(list)) {
+		cur = list->next;
+		sum = list_entry(cur, struct btrfs_ordered_sum, list);
+		mutex_lock(&BTRFS_I(inode)->csum_mutex);
+		btrfs_csum_file_blocks(trans, BTRFS_I(inode)->root,
+				       inode, sum);
+		mutex_unlock(&BTRFS_I(inode)->csum_mutex);
+		list_del(&sum->list);
+		kfree(sum);
+	}
+	return 0;
+}
+
+int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end,
+				struct extent_state *state, int uptodate)
+{
+	struct inode *inode = page->mapping->host;
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct btrfs_trans_handle *trans;
+	struct btrfs_ordered_extent *ordered_extent;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	u64 alloc_hint = 0;
+	struct list_head list;
+	struct btrfs_key ins;
+	int ret;
+
+	ret = btrfs_dec_test_ordered_pending(inode, start, end - start + 1);
+	if (!ret) {
+		return 0;
+	}
+
+	trans = btrfs_start_transaction(root, 1);
+
+	ordered_extent = btrfs_lookup_ordered_extent(inode, start);
+	BUG_ON(!ordered_extent);
+
+	lock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+
+	INIT_LIST_HEAD(&list);
+
+	ins.objectid = ordered_extent->start;
+	ins.offset = ordered_extent->len;
+	ins.type = BTRFS_EXTENT_ITEM_KEY;
+	ret = btrfs_alloc_reserved_extent(trans, root, root->root_key.objectid,
+					  trans->transid, inode->i_ino,
+					  ordered_extent->file_offset, &ins);
+	BUG_ON(ret);
+	ret = btrfs_drop_extents(trans, root, inode,
+				 ordered_extent->file_offset,
+				 ordered_extent->file_offset +
+				 ordered_extent->len,
+				 ordered_extent->file_offset, &alloc_hint);
+	BUG_ON(ret);
+	ret = btrfs_insert_file_extent(trans, root, inode->i_ino,
+				       ordered_extent->file_offset,
+				       ordered_extent->start,
+				       ordered_extent->len,
+				       ordered_extent->len, 0);
+	BUG_ON(ret);
+	btrfs_drop_extent_cache(inode, ordered_extent->file_offset,
+				ordered_extent->file_offset +
+				ordered_extent->len - 1);
+	inode->i_blocks += ordered_extent->len >> 9;
+	unlock_extent(io_tree, ordered_extent->file_offset,
+		    ordered_extent->file_offset + ordered_extent->len - 1,
+		    GFP_NOFS);
+	add_pending_csums(trans, inode, ordered_extent->file_offset,
+			  &ordered_extent->list);
+
+	btrfs_remove_ordered_extent(inode, ordered_extent);
+	/* once for us */
+	btrfs_put_ordered_extent(ordered_extent);
+	/* once for the tree */
+	btrfs_put_ordered_extent(ordered_extent);
+
+	btrfs_update_inode(trans, root, inode);
+	btrfs_end_transaction(trans, root);
+	return 0;
+}
+
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
 {
 	int ret = 0;
@@ -409,7 +494,8 @@
 		if (ret == -ENOENT || ret == -EFBIG)
 			ret = 0;
 		csum = 0;
-		printk("no csum found for inode %lu start %Lu\n", inode->i_ino, start);
+		printk("no csum found for inode %lu start %Lu\n", inode->i_ino,
+		       start);
 		goto out;
 	}
 	read_extent_buffer(path->nodes[0], &csum, (unsigned long)item,
@@ -833,7 +919,6 @@
 {
 	struct btrfs_root *root;
 	struct btrfs_trans_handle *trans;
-	struct inode *inode = dentry->d_inode;
 	int ret;
 	unsigned long nr = 0;
 
@@ -849,14 +934,6 @@
 	ret = btrfs_unlink_trans(trans, root, dir, dentry);
 	nr = trans->blocks_used;
 
-	if (inode->i_nlink == 0) {
-		/* if the inode isn't linked anywhere,
-		 * we don't need to worry about
-		 * data=ordered
-		 */
-		btrfs_del_ordered_inode(inode, 1);
-	}
-
 	btrfs_end_transaction_throttle(trans, root);
 fail:
 	btrfs_btree_balance_dirty(root, nr);
@@ -931,6 +1008,7 @@
 	int extent_type = -1;
 	u64 mask = root->sectorsize - 1;
 
+	btrfs_wait_ordered_range(inode, inode->i_size & (~mask), (u64)-1);
 	btrfs_drop_extent_cache(inode, inode->i_size & (~mask), (u64)-1);
 	path = btrfs_alloc_path();
 	path->reada = -1;
@@ -1117,34 +1195,6 @@
 	return ret;
 }
 
-static int btrfs_cow_one_page(struct inode *inode, struct page *page,
-			      size_t zero_start)
-{
-	char *kaddr;
-	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
-	u64 page_start = (u64)page->index << PAGE_CACHE_SHIFT;
-	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
-	int ret = 0;
-
-	WARN_ON(!PageLocked(page));
-	set_page_extent_mapped(page);
-
-	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
-	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
-			    page_end, GFP_NOFS);
-
-	if (zero_start != PAGE_CACHE_SIZE) {
-		kaddr = kmap(page);
-		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
-		flush_dcache_page(page);
-		kunmap(page);
-	}
-	set_page_dirty(page);
-	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
-
-	return ret;
-}
-
 /*
  * taken from block_truncate_page, but does cow as it zeros out
  * any bytes left in the last page in the file.
@@ -1153,12 +1203,16 @@
 {
 	struct inode *inode = mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
 	u32 blocksize = root->sectorsize;
 	pgoff_t index = from >> PAGE_CACHE_SHIFT;
 	unsigned offset = from & (PAGE_CACHE_SIZE-1);
 	struct page *page;
 	int ret = 0;
 	u64 page_start;
+	u64 page_end;
 
 	if ((offset & (blocksize - 1)) == 0)
 		goto out;
@@ -1168,6 +1222,10 @@
 	page = grab_cache_page(mapping, index);
 	if (!page)
 		goto out;
+
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
+
 	if (!PageUptodate(page)) {
 		ret = btrfs_readpage(NULL, page);
 		lock_page(page);
@@ -1181,10 +1239,32 @@
 			goto out;
 		}
 	}
-
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
 	wait_on_page_writeback(page);
-	ret = btrfs_cow_one_page(inode, page, offset);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		page_cache_release(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
+	if (offset != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + offset, 0, PAGE_CACHE_SIZE - offset);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 	unlock_page(page);
 	page_cache_release(page);
@@ -1222,8 +1302,9 @@
 
 		btrfs_truncate_page(inode->i_mapping, inode->i_size);
 
-		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 		hole_size = block_end - hole_start;
+		btrfs_wait_ordered_range(inode, hole_start, hole_size);
+		lock_extent(io_tree, hole_start, block_end - 1, GFP_NOFS);
 
 		trans = btrfs_start_transaction(root, 1);
 		btrfs_set_trans_block_group(trans, inode);
@@ -1258,6 +1339,7 @@
 	unsigned long nr;
 	int ret;
 
+	btrfs_wait_ordered_range(inode, 0, (u64)-1);
 	truncate_inode_pages(&inode->i_data, 0);
 	if (is_bad_inode(inode)) {
 		goto no_delete;
@@ -1403,7 +1485,6 @@
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	return 0;
 }
 
@@ -1705,7 +1786,6 @@
 	extent_io_tree_init(&BTRFS_I(inode)->io_failure_tree,
 			     inode->i_mapping, GFP_NOFS);
 	mutex_init(&BTRFS_I(inode)->csum_mutex);
-	atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 	BTRFS_I(inode)->delalloc_bytes = 0;
 	BTRFS_I(inode)->root = root;
 
@@ -1930,7 +2010,6 @@
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2066,64 +2145,18 @@
 
 static int merge_extent_mapping(struct extent_map_tree *em_tree,
 				struct extent_map *existing,
-				struct extent_map *em)
+				struct extent_map *em,
+				u64 map_start, u64 map_len)
 {
 	u64 start_diff;
-	u64 new_end;
-	int ret = 0;
-	int real_blocks = existing->block_start < EXTENT_MAP_LAST_BYTE;
 
-	if (real_blocks && em->block_start >= EXTENT_MAP_LAST_BYTE)
-		goto invalid;
-
-	if (!real_blocks && em->block_start != existing->block_start)
-		goto invalid;
-
-	new_end = max(existing->start + existing->len, em->start + em->len);
-
-	if (existing->start >= em->start) {
-		if (em->start + em->len < existing->start)
-			goto invalid;
-
-		start_diff = existing->start - em->start;
-		if (real_blocks && em->block_start + start_diff !=
-		    existing->block_start)
-			goto invalid;
-
-		em->len = new_end - em->start;
-
-		remove_extent_mapping(em_tree, existing);
-		/* free for the tree */
-		free_extent_map(existing);
-		ret = add_extent_mapping(em_tree, em);
-
-	} else if (em->start > existing->start) {
-
-		if (existing->start + existing->len < em->start)
-			goto invalid;
-
-		start_diff = em->start - existing->start;
-		if (real_blocks && existing->block_start + start_diff !=
-		    em->block_start)
-			goto invalid;
-
-		remove_extent_mapping(em_tree, existing);
-		em->block_start = existing->block_start;
-		em->start = existing->start;
-		em->len = new_end - existing->start;
-		free_extent_map(existing);
-
-		ret = add_extent_mapping(em_tree, em);
-	} else {
-		goto invalid;
-	}
-	return ret;
-
-invalid:
-	printk("invalid extent map merge [%Lu %Lu %Lu] [%Lu %Lu %Lu]\n",
-	       existing->start, existing->len, existing->block_start,
-	       em->start, em->len, em->block_start);
-	return -EIO;
+	BUG_ON(map_start < em->start || map_start >= extent_map_end(em));
+	start_diff = map_start - em->start;
+	em->start = map_start;
+	em->len = map_len;
+	if (em->block_start < EXTENT_MAP_LAST_BYTE)
+		em->block_start += start_diff;
+	return add_extent_mapping(em_tree, em);
 }
 
 struct extent_map *btrfs_get_extent(struct inode *inode, struct page *page,
@@ -2170,10 +2203,9 @@
 		err = -ENOMEM;
 		goto out;
 	}
-
+	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	em->start = EXTENT_MAP_HOLE;
 	em->len = (u64)-1;
-	em->bdev = root->fs_info->fs_devices->latest_bdev;
 	ret = btrfs_lookup_file_extent(trans, root, path,
 				       objectid, start, trans != NULL);
 	if (ret < 0) {
@@ -2314,6 +2346,9 @@
 	 */
 	if (ret == -EEXIST) {
 		struct extent_map *existing;
+
+		ret = 0;
+
 		existing = lookup_extent_mapping(em_tree, start, len);
 		if (existing && (existing->start > start ||
 		    existing->start + existing->len <= start)) {
@@ -2325,7 +2360,8 @@
 							 em->len);
 			if (existing) {
 				err = merge_extent_mapping(em_tree, existing,
-							   em);
+							   em, start,
+							   root->sectorsize);
 				free_extent_map(existing);
 				if (err) {
 					free_extent_map(em);
@@ -2341,6 +2377,7 @@
 		} else {
 			free_extent_map(em);
 			em = existing;
+			err = 0;
 		}
 	}
 	spin_unlock(&em_tree->lock);
@@ -2348,8 +2385,9 @@
 	btrfs_free_path(path);
 	if (trans) {
 		ret = btrfs_end_transaction(trans, root);
-		if (!err)
+		if (!err) {
 			err = ret;
+		}
 	}
 	if (err) {
 		free_extent_map(em);
@@ -2474,8 +2512,7 @@
 	return extent_readpages(tree, mapping, pages, nr_pages,
 				btrfs_get_extent);
 }
-
-static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+static int __btrfs_releasepage(struct page *page, gfp_t gfp_flags)
 {
 	struct extent_io_tree *tree;
 	struct extent_map_tree *map;
@@ -2493,15 +2530,54 @@
 	return ret;
 }
 
+static int btrfs_releasepage(struct page *page, gfp_t gfp_flags)
+{
+	struct btrfs_ordered_extent *ordered;
+
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					      page_offset(page));
+	if (ordered) {
+		btrfs_put_ordered_extent(ordered);
+		return 0;
+	}
+	return __btrfs_releasepage(page, gfp_flags);
+}
+
 static void btrfs_invalidatepage(struct page *page, unsigned long offset)
 {
 	struct extent_io_tree *tree;
+	struct btrfs_ordered_extent *ordered;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
 
+	wait_on_page_writeback(page);
 	tree = &BTRFS_I(page->mapping->host)->io_tree;
-	extent_invalidatepage(tree, page, offset);
-	btrfs_releasepage(page, GFP_NOFS);
+	if (offset) {
+		btrfs_releasepage(page, GFP_NOFS);
+		return;
+	}
+
+	lock_extent(tree, page_start, page_end, GFP_NOFS);
+	ordered = btrfs_lookup_ordered_extent(page->mapping->host,
+					   page_offset(page));
+	if (ordered) {
+		clear_extent_bit(tree, page_start, page_end,
+				 EXTENT_DIRTY | EXTENT_DELALLOC |
+				 EXTENT_LOCKED, 1, 0, GFP_NOFS);
+		btrfs_writepage_end_io_hook(page, page_start,
+					    page_end, NULL, 1);
+		btrfs_put_ordered_extent(ordered);
+		lock_extent(tree, page_start, page_end, GFP_NOFS);
+	}
+	clear_extent_bit(tree, page_start, page_end,
+		 EXTENT_LOCKED | EXTENT_DIRTY | EXTENT_DELALLOC |
+		 EXTENT_ORDERED,
+		 1, 1, GFP_NOFS);
+	__btrfs_releasepage(page, GFP_NOFS);
+
 	if (PagePrivate(page)) {
-		invalidate_extent_lru(tree, page_offset(page), PAGE_CACHE_SIZE);
+		invalidate_extent_lru(tree, page_offset(page),
+				      PAGE_CACHE_SIZE);
 		ClearPagePrivate(page);
 		set_page_private(page, 0);
 		page_cache_release(page);
@@ -2527,35 +2603,63 @@
 {
 	struct inode *inode = fdentry(vma->vm_file)->d_inode;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
-	unsigned long end;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	struct btrfs_ordered_extent *ordered;
+	char *kaddr;
+	unsigned long zero_start;
 	loff_t size;
 	int ret;
 	u64 page_start;
+	u64 page_end;
 
 	ret = btrfs_check_free_space(root, PAGE_CACHE_SIZE, 0);
 	if (ret)
 		goto out;
 
 	ret = -EINVAL;
-
+again:
 	lock_page(page);
-	wait_on_page_writeback(page);
 	size = i_size_read(inode);
-	page_start = (u64)page->index << PAGE_CACHE_SHIFT;
+	page_start = page_offset(page);
+	page_end = page_start + PAGE_CACHE_SIZE - 1;
 
 	if ((page->mapping != inode->i_mapping) ||
-	    (page_start > size)) {
+	    (page_start >= size)) {
 		/* page got truncated out from underneath us */
 		goto out_unlock;
 	}
+	wait_on_page_writeback(page);
+
+	lock_extent(io_tree, page_start, page_end, GFP_NOFS);
+	set_page_extent_mapped(page);
+
+	ordered = btrfs_lookup_ordered_extent(inode, page_start);
+	if (ordered) {
+		unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
+		unlock_page(page);
+		btrfs_wait_ordered_extent(inode, ordered);
+		btrfs_put_ordered_extent(ordered);
+		goto again;
+	}
+
+	set_extent_delalloc(&BTRFS_I(inode)->io_tree, page_start,
+			    page_end, GFP_NOFS);
+	ret = 0;
 
 	/* page is wholly or partially inside EOF */
 	if (page_start + PAGE_CACHE_SIZE > size)
-		end = size & ~PAGE_CACHE_MASK;
+		zero_start = size & ~PAGE_CACHE_MASK;
 	else
-		end = PAGE_CACHE_SIZE;
+		zero_start = PAGE_CACHE_SIZE;
 
-	ret = btrfs_cow_one_page(inode, page, end);
+	if (zero_start != PAGE_CACHE_SIZE) {
+		kaddr = kmap(page);
+		memset(kaddr + zero_start, 0, PAGE_CACHE_SIZE - zero_start);
+		flush_dcache_page(page);
+		kunmap(page);
+	}
+	set_page_dirty(page);
+	unlock_extent(io_tree, page_start, page_end, GFP_NOFS);
 
 out_unlock:
 	unlock_page(page);
@@ -2662,15 +2766,28 @@
 	if (!ei)
 		return NULL;
 	ei->last_trans = 0;
-	ei->ordered_trans = 0;
+	btrfs_ordered_inode_tree_init(&ei->ordered_tree);
 	return &ei->vfs_inode;
 }
 
 void btrfs_destroy_inode(struct inode *inode)
 {
+	struct btrfs_ordered_extent *ordered;
 	WARN_ON(!list_empty(&inode->i_dentry));
 	WARN_ON(inode->i_data.nrpages);
 
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1);
+		if (!ordered)
+			break;
+		else {
+			printk("found ordered extent %Lu %Lu\n",
+			       ordered->file_offset, ordered->len);
+			btrfs_remove_ordered_extent(inode, ordered);
+			btrfs_put_ordered_extent(ordered);
+			btrfs_put_ordered_extent(ordered);
+		}
+	}
 	btrfs_drop_extent_cache(inode, 0, (u64)-1);
 	kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode));
 }
@@ -2869,7 +2986,6 @@
 				     inode->i_mapping, GFP_NOFS);
 		mutex_init(&BTRFS_I(inode)->csum_mutex);
 		BTRFS_I(inode)->delalloc_bytes = 0;
-		atomic_set(&BTRFS_I(inode)->ordered_writeback, 0);
 		BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops;
 	}
 	dir->i_sb->s_dirt = 1;
@@ -2921,6 +3037,20 @@
 	return err;
 }
 
+static int btrfs_set_page_dirty(struct page *page)
+{
+	struct inode *inode = page->mapping->host;
+	u64 page_start = page_offset(page);
+	u64 page_end = page_start + PAGE_CACHE_SIZE - 1;
+
+	if (!test_range_bit(&BTRFS_I(inode)->io_tree, page_start, page_end,
+			    EXTENT_DELALLOC, 0)) {
+printk("inode %lu page %Lu not delalloc\n", inode->i_ino, page_offset(page));
+WARN_ON(1);
+	}
+	return __set_page_dirty_nobuffers(page);
+}
+
 static int btrfs_permission(struct inode *inode, int mask,
 			    struct nameidata *nd)
 {
@@ -2967,6 +3097,7 @@
 	.merge_bio_hook = btrfs_merge_bio_hook,
 	.readpage_io_hook = btrfs_readpage_io_hook,
 	.readpage_end_io_hook = btrfs_readpage_end_io_hook,
+	.writepage_end_io_hook = btrfs_writepage_end_io_hook,
 	.readpage_io_failed_hook = btrfs_io_failed_hook,
 	.set_bit_hook = btrfs_set_bit_hook,
 	.clear_bit_hook = btrfs_clear_bit_hook,
@@ -2982,7 +3113,7 @@
 	.direct_IO	= btrfs_direct_IO,
 	.invalidatepage = btrfs_invalidatepage,
 	.releasepage	= btrfs_releasepage,
-	.set_page_dirty	= __set_page_dirty_nobuffers,
+	.set_page_dirty	= btrfs_set_page_dirty,
 };
 
 static struct address_space_operations btrfs_symlink_aops = {
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 254da82..6513270 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -22,48 +22,30 @@
 #include "ctree.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "extent_io.h"
 
-struct tree_entry {
-	u64 root_objectid;
-	u64 objectid;
-	struct inode *inode;
-	struct rb_node rb_node;
-};
 
-/*
- * returns > 0 if entry passed (root, objectid) is > entry,
- * < 0 if (root, objectid) < entry and zero if they are equal
- */
-static int comp_entry(struct tree_entry *entry, u64 root_objectid,
-		      u64 objectid)
+static u64 entry_end(struct btrfs_ordered_extent *entry)
 {
-	if (root_objectid < entry->root_objectid)
-		return -1;
-	if (root_objectid > entry->root_objectid)
-		return 1;
-	if (objectid < entry->objectid)
-		return -1;
-	if (objectid > entry->objectid)
-		return 1;
-	return 0;
+	if (entry->file_offset + entry->len < entry->file_offset)
+		return (u64)-1;
+	return entry->file_offset + entry->len;
 }
 
-static struct rb_node *tree_insert(struct rb_root *root, u64 root_objectid,
-				   u64 objectid, struct rb_node *node)
+static struct rb_node *tree_insert(struct rb_root *root, u64 file_offset,
+				   struct rb_node *node)
 {
 	struct rb_node ** p = &root->rb_node;
 	struct rb_node * parent = NULL;
-	struct tree_entry *entry;
-	int comp;
+	struct btrfs_ordered_extent *entry;
 
 	while(*p) {
 		parent = *p;
-		entry = rb_entry(parent, struct tree_entry, rb_node);
+		entry = rb_entry(parent, struct btrfs_ordered_extent, rb_node);
 
-		comp = comp_entry(entry, root_objectid, objectid);
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			p = &(*p)->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			p = &(*p)->rb_right;
 		else
 			return parent;
@@ -74,24 +56,23 @@
 	return NULL;
 }
 
-static struct rb_node *__tree_search(struct rb_root *root, u64 root_objectid,
-				     u64 objectid, struct rb_node **prev_ret)
+static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
+				     struct rb_node **prev_ret)
 {
 	struct rb_node * n = root->rb_node;
 	struct rb_node *prev = NULL;
-	struct tree_entry *entry;
-	struct tree_entry *prev_entry = NULL;
-	int comp;
+	struct rb_node *test;
+	struct btrfs_ordered_extent *entry;
+	struct btrfs_ordered_extent *prev_entry = NULL;
 
 	while(n) {
-		entry = rb_entry(n, struct tree_entry, rb_node);
+		entry = rb_entry(n, struct btrfs_ordered_extent, rb_node);
 		prev = n;
 		prev_entry = entry;
-		comp = comp_entry(entry, root_objectid, objectid);
 
-		if (comp < 0)
+		if (file_offset < entry->file_offset)
 			n = n->rb_left;
-		else if (comp > 0)
+		else if (file_offset >= entry_end(entry))
 			n = n->rb_right;
 		else
 			return n;
@@ -99,195 +80,329 @@
 	if (!prev_ret)
 		return NULL;
 
-	while(prev && comp_entry(prev_entry, root_objectid, objectid) >= 0) {
-		prev = rb_next(prev);
-		prev_entry = rb_entry(prev, struct tree_entry, rb_node);
+	while(prev && file_offset >= entry_end(prev_entry)) {
+		test = rb_next(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		if (file_offset < entry_end(prev_entry))
+			break;
+
+		prev = test;
+	}
+	if (prev)
+		prev_entry = rb_entry(prev, struct btrfs_ordered_extent,
+				      rb_node);
+	while(prev && file_offset < entry_end(prev_entry)) {
+		test = rb_prev(prev);
+		if (!test)
+			break;
+		prev_entry = rb_entry(test, struct btrfs_ordered_extent,
+				      rb_node);
+		prev = test;
 	}
 	*prev_ret = prev;
 	return NULL;
 }
 
-static inline struct rb_node *tree_search(struct rb_root *root,
-					  u64 root_objectid, u64 objectid)
+static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
 {
+	if (file_offset < entry->file_offset ||
+	    entry->file_offset + entry->len <= file_offset)
+		return 0;
+	return 1;
+}
+
+static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
+					  u64 file_offset)
+{
+	struct rb_root *root = &tree->tree;
 	struct rb_node *prev;
 	struct rb_node *ret;
-	ret = __tree_search(root, root_objectid, objectid, &prev);
+	struct btrfs_ordered_extent *entry;
+
+	if (tree->last) {
+		entry = rb_entry(tree->last, struct btrfs_ordered_extent,
+				 rb_node);
+		if (offset_in_entry(entry, file_offset))
+			return tree->last;
+	}
+	ret = __tree_search(root, file_offset, &prev);
 	if (!ret)
-		return prev;
+		ret = prev;
+	if (ret)
+		tree->last = ret;
 	return ret;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode)
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len)
 {
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
-	u64 transid = root->fs_info->running_transaction->transid;
-	struct tree_entry *entry;
-	struct rb_node *node;
 	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	if (transid <= BTRFS_I(inode)->ordered_trans)
-		return 0;
-
-	tree = &root->fs_info->running_transaction->ordered_inode_tree;
-
-	read_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, inode->i_ino, NULL);
-	read_unlock(&tree->lock);
-	if (node) {
-		return 0;
-	}
-
-	entry = kmalloc(sizeof(*entry), GFP_NOFS);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	entry = kzalloc(sizeof(*entry), GFP_NOFS);
 	if (!entry)
 		return -ENOMEM;
 
-	write_lock(&tree->lock);
-	entry->objectid = inode->i_ino;
-	entry->root_objectid = root_objectid;
+	mutex_lock(&tree->mutex);
+	entry->file_offset = file_offset;
+	entry->start = start;
+	entry->len = len;
 	entry->inode = inode;
+	/* one ref for the tree */
+	atomic_set(&entry->refs, 1);
+	init_waitqueue_head(&entry->wait);
+	INIT_LIST_HEAD(&entry->list);
 
-	node = tree_insert(&tree->tree, root_objectid,
-			   inode->i_ino, &entry->rb_node);
+	node = tree_insert(&tree->tree, file_offset,
+			   &entry->rb_node);
+	if (node) {
+		entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+		atomic_inc(&entry->refs);
+	}
+	set_extent_ordered(&BTRFS_I(inode)->io_tree, file_offset,
+			   entry_end(entry) - 1, GFP_NOFS);
 
-	BTRFS_I(inode)->ordered_trans = transid;
-	if (!node)
-		igrab(inode);
+	set_bit(BTRFS_ORDERED_START, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	BUG_ON(node);
+	return 0;
+}
 
-	write_unlock(&tree->lock);
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
 
-	if (node)
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, sum->file_offset);
+	if (!node) {
+search_fail:
+printk("add ordered sum failed to find a node for inode %lu offset %Lu\n", inode->i_ino, sum->file_offset);
+		node = rb_first(&tree->tree);
+		while(node) {
+			entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+			printk("entry %Lu %Lu %Lu\n", entry->file_offset, entry->file_offset + entry->len, entry->start);
+			node = rb_next(node);
+		}
+		BUG();
+	}
+	BUG_ON(!node);
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, sum->file_offset)) {
+		goto search_fail;
+	}
+
+	list_add_tail(&sum->list, &entry->list);
+	mutex_unlock(&tree->mutex);
+	return 0;
+}
+
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				   u64 file_offset, u64 io_size)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+	int ret;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	clear_extent_ordered(io_tree, file_offset, file_offset + io_size - 1,
+			     GFP_NOFS);
+	node = tree_search(tree, file_offset);
+	if (!node) {
+		ret = 1;
+		goto out;
+	}
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset)) {
+		ret = 1;
+		goto out;
+	}
+
+	ret = test_range_bit(io_tree, entry->file_offset,
+			     entry->file_offset + entry->len - 1,
+			     EXTENT_ORDERED, 0);
+	if (!test_bit(BTRFS_ORDERED_START, &entry->flags)) {
+printk("inode %lu not ready yet for extent %Lu %Lu\n", inode->i_ino, entry->file_offset, entry_end(entry));
+	}
+	if (ret == 0)
+		ret = test_and_set_bit(BTRFS_ORDERED_IO_DONE, &entry->flags);
+out:
+	mutex_unlock(&tree->mutex);
+	return ret == 0;
+}
+
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
+{
+	if (atomic_dec_and_test(&entry->refs))
 		kfree(entry);
 	return 0;
 }
 
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				   u64 *root_objectid, u64 *objectid,
-				   struct inode **inode)
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry)
 {
-	struct tree_entry *entry;
+	struct btrfs_ordered_inode_tree *tree;
 	struct rb_node *node;
 
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
-	}
-	entry = rb_entry(node, struct tree_entry, rb_node);
-
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
-	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
-	}
-
-	*root_objectid = entry->root_objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
-	*objectid = entry->objectid;
-	write_unlock(&tree->lock);
-	return 1;
-}
-
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode)
-{
-	struct tree_entry *entry;
-	struct rb_node *node;
-
-	write_lock(&tree->lock);
-	node = tree_search(&tree->tree, *root_objectid, *objectid);
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
-	}
-
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	while(comp_entry(entry, *root_objectid, *objectid) >= 0) {
-		node = rb_next(node);
-		if (!node)
-			break;
-		entry = rb_entry(node, struct tree_entry, rb_node);
-	}
-	if (!node) {
-		write_unlock(&tree->lock);
-		return 0;
-	}
-
-	*root_objectid = entry->root_objectid;
-	*objectid = entry->objectid;
-	*inode = entry->inode;
-	atomic_inc(&entry->inode->i_count);
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = &entry->rb_node;
 	rb_erase(node, &tree->tree);
-	write_unlock(&tree->lock);
-	kfree(entry);
-	return 1;
-}
-
-static void __btrfs_del_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				     struct inode *inode,
-				     u64 root_objectid, u64 objectid)
-{
-	struct tree_entry *entry;
-	struct rb_node *node;
-	struct rb_node *prev;
-
-	write_lock(&tree->lock);
-	node = __tree_search(&tree->tree, root_objectid, objectid, &prev);
-	if (!node) {
-		write_unlock(&tree->lock);
-		return;
-	}
-	rb_erase(node, &tree->tree);
-	BTRFS_I(inode)->ordered_trans = 0;
-	write_unlock(&tree->lock);
-	atomic_dec(&inode->i_count);
-	entry = rb_entry(node, struct tree_entry, rb_node);
-	kfree(entry);
-	return;
-}
-
-void btrfs_del_ordered_inode(struct inode *inode, int force)
-{
-	struct btrfs_root *root = BTRFS_I(inode)->root;
-	u64 root_objectid = root->root_key.objectid;
-
-	if (!BTRFS_I(inode)->ordered_trans) {
-		return;
-	}
-
-	if (!force && (mapping_tagged(inode->i_mapping, PAGECACHE_TAG_DIRTY) ||
-	    mapping_tagged(inode->i_mapping, PAGECACHE_TAG_WRITEBACK)))
-		return;
-
-	spin_lock(&root->fs_info->new_trans_lock);
-	if (root->fs_info->running_transaction) {
-		struct btrfs_ordered_inode_tree *tree;
-		tree = &root->fs_info->running_transaction->ordered_inode_tree;
-		 __btrfs_del_ordered_inode(tree, inode, root_objectid,
-						inode->i_ino);
-	}
-	spin_unlock(&root->fs_info->new_trans_lock);
-}
-
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode)
-{
-	struct btrfs_transaction *cur = root->fs_info->running_transaction;
-	while(cur == root->fs_info->running_transaction &&
-	      atomic_read(&BTRFS_I(inode)->ordered_writeback)) {
-#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,18)
-		congestion_wait(WRITE, HZ/20);
-#else
-		blk_congestion_wait(WRITE, HZ/20);
-#endif
-	}
+	tree->last = NULL;
+	set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags);
+	mutex_unlock(&tree->mutex);
+	wake_up(&entry->wait);
 	return 0;
 }
+
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry)
+{
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	wait_event(entry->wait,
+		   test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags));
+}
+
+static void btrfs_start_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry, int wait)
+{
+	u64 start = entry->file_offset;
+	u64 end = start + entry->len - 1;
+
+#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22)
+	do_sync_file_range(file, start, end, SYNC_FILE_RANGE_WRITE);
+#else
+	do_sync_mapping_range(inode->i_mapping, start, end,
+			      SYNC_FILE_RANGE_WRITE);
+#endif
+	if (wait)
+		wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE,
+						 &entry->flags));
+}
+
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len)
+{
+	u64 end;
+	struct btrfs_ordered_extent *ordered;
+	int found;
+	int should_wait = 0;
+
+again:
+	if (start + len < start)
+		end = (u64)-1;
+	else
+		end = start + len - 1;
+	found = 0;
+	while(1) {
+		ordered = btrfs_lookup_first_ordered_extent(inode, end);
+		if (!ordered) {
+			break;
+		}
+		if (ordered->file_offset >= start + len) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		if (ordered->file_offset + ordered->len < start) {
+			btrfs_put_ordered_extent(ordered);
+			break;
+		}
+		btrfs_start_ordered_extent(inode, ordered, should_wait);
+		found++;
+		end = ordered->file_offset;
+		btrfs_put_ordered_extent(ordered);
+		if (end == 0)
+			break;
+		end--;
+	}
+	if (should_wait && found) {
+		should_wait = 0;
+		goto again;
+	}
+}
+
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len)
+{
+	WARN_ON(1);
+	return 0;
+#if 0
+	int ret;
+	struct btrfs_ordered_inode_tree *tree;
+	struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	if (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags)) {
+		ret = -EAGAIN;
+		goto out;
+	}
+	set_extent_ordered(io_tree, start, start + len - 1, GFP_NOFS);
+	ret = 0;
+out:
+	mutex_unlock(&tree->mutex);
+	return ret;
+#endif
+}
+
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	if (!offset_in_entry(entry, file_offset))
+		entry = NULL;
+	if (entry)
+		atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
+}
+
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset)
+{
+	struct btrfs_ordered_inode_tree *tree;
+	struct rb_node *node;
+	struct btrfs_ordered_extent *entry = NULL;
+
+	tree = &BTRFS_I(inode)->ordered_tree;
+	mutex_lock(&tree->mutex);
+	node = tree_search(tree, file_offset);
+	if (!node)
+		goto out;
+
+	entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
+	atomic_inc(&entry->refs);
+out:
+	mutex_unlock(&tree->mutex);
+	return entry;
+}
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 4fa7873..33292c5 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -20,24 +20,73 @@
 #define __BTRFS_ORDERED_DATA__
 
 struct btrfs_ordered_inode_tree {
-	rwlock_t lock;
+	struct mutex mutex;	/* protects tree and last */
+	/* rb tree of struct btrfs_ordered_extent, indexed by file_offset */
 	struct rb_root tree;
+	struct rb_node *last;	/* presumably a cached search hint — confirm in tree_search() */
 };
 
+/* one checksum for one sector of file data */
+struct btrfs_sector_sum {
+	u64 offset;	/* presumably the file offset this sum covers — confirm at use sites */
+	u32 sum;	/* the checksum value */
+};
+
+/* a batch of checksums covering a contiguous file range */
+struct btrfs_ordered_sum {
+	u64 file_offset;	/* start of the range covered */
+	u64 len;		/* length in bytes of the range covered */
+	struct list_head list;
+	/*
+	 * first of a variable number of btrfs_sector_sums; the full array is
+	 * sized with btrfs_ordered_sum_size()
+	 */
+	struct btrfs_sector_sum sums;
+};
+
+/* bits for the flags field */
+#define BTRFS_ORDERED_IO_DONE 0 /* set when all the pages are written */
+#define BTRFS_ORDERED_COMPLETE 1 /* set when removed from the tree */
+#define BTRFS_ORDERED_START 2 /* set when tree setup */
+
+/* one pending data extent, tracked per inode until all its pages hit disk */
+struct btrfs_ordered_extent {
+	u64 file_offset;	/* logical offset of the extent in the file */
+	u64 start;	/* presumably the reserved disk byte — confirm at allocation site */
+	u64 len;	/* length in bytes */
+	unsigned long flags;	/* BTRFS_ORDERED_* bits above */
+	atomic_t refs;	/* reference count; freed via btrfs_put_ordered_extent() */
+	struct list_head list;
+	struct inode *inode;	/* back pointer to the owning inode */
+	wait_queue_head_t wait;	/* waiters blocked in btrfs_wait_ordered_extent() */
+	struct rb_node rb_node;	/* node in btrfs_ordered_inode_tree.tree */
+};
+
+
+
+/*
+ * Bytes needed for a btrfs_ordered_sum covering @bytes of file data:
+ * the header plus one btrfs_sector_sum per sector, rounding the byte
+ * count up to a whole number of sectors.
+ *
+ * Note: sizeof(struct btrfs_ordered_sum) already embeds one
+ * btrfs_sector_sum (the 'sums' member), so this allocates one spare
+ * slot beyond num_sectors — harmless over-allocation.
+ */
+static inline int btrfs_ordered_sum_size(struct btrfs_root *root, u64 bytes)
+{
+	unsigned long num_sectors = (bytes + root->sectorsize - 1) /
+		root->sectorsize;
+	return sizeof(struct btrfs_ordered_sum) +
+		num_sectors * sizeof(struct btrfs_sector_sum);
+}
+
+/* set up an empty per-inode tree of pending ordered extents */
 static inline void
 btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t)
 {
-	rwlock_init(&t->lock);
+	mutex_init(&t->mutex);
 	t->tree.rb_node = NULL;
+	t->last = NULL;
 }
 
-int btrfs_add_ordered_inode(struct inode *inode);
-int btrfs_find_del_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-int btrfs_find_first_ordered_inode(struct btrfs_ordered_inode_tree *tree,
-				       u64 *root_objectid, u64 *objectid,
-				       struct inode **inode);
-void btrfs_del_ordered_inode(struct inode *inode, int force);
-int btrfs_ordered_throttle(struct btrfs_root *root, struct inode *inode);
+int btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry);
+int btrfs_remove_ordered_extent(struct inode *inode,
+				struct btrfs_ordered_extent *entry);
+int btrfs_dec_test_ordered_pending(struct inode *inode,
+				       u64 file_offset, u64 io_size);
+int btrfs_add_ordered_extent(struct inode *inode, u64 file_offset,
+			     u64 start, u64 len);
+int btrfs_add_ordered_sum(struct inode *inode, struct btrfs_ordered_sum *sum);
+struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct inode *inode,
+							 u64 file_offset);
+void btrfs_wait_ordered_extent(struct inode *inode,
+			       struct btrfs_ordered_extent *entry);
+void btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len);
+struct btrfs_ordered_extent *
+btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset);
+int btrfs_add_ordered_pending(struct inode *inode,
+			      struct btrfs_ordered_extent *ordered,
+			      u64 start, u64 len);
 #endif
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index a8a3cb0..86a5acc 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -67,7 +67,6 @@
 		cur_trans->start_time = get_seconds();
 		INIT_LIST_HEAD(&cur_trans->pending_snapshots);
 		list_add_tail(&cur_trans->list, &root->fs_info->trans_list);
-		btrfs_ordered_inode_tree_init(&cur_trans->ordered_inode_tree);
 		extent_io_tree_init(&cur_trans->dirty_pages,
 				     root->fs_info->btree_inode->i_mapping,
 				     GFP_NOFS);
@@ -158,10 +157,12 @@
 		wake_up(&cur_trans->writer_wait);
 
 	if (cur_trans->in_commit && throttle) {
-		int ret;
+		DEFINE_WAIT(wait);
 		mutex_unlock(&root->fs_info->trans_mutex);
-		ret = wait_for_commit(root, cur_trans);
-		BUG_ON(ret);
+		prepare_to_wait(&root->fs_info->transaction_throttle, &wait,
+				TASK_UNINTERRUPTIBLE);
+		schedule();
+		finish_wait(&root->fs_info->transaction_throttle, &wait);
 		mutex_lock(&root->fs_info->trans_mutex);
 	}
 
@@ -486,58 +487,6 @@
 	return ret;
 }
 
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root)
-{
-	struct btrfs_transaction *cur_trans = trans->transaction;
-	struct inode *inode;
-	u64 root_objectid = 0;
-	u64 objectid = 0;
-	int ret;
-
-	atomic_inc(&root->fs_info->throttles);
-	while(1) {
-		ret = btrfs_find_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_fdatawrite(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	while(1) {
-		root_objectid = 0;
-		objectid = 0;
-		ret = btrfs_find_del_first_ordered_inode(
-				&cur_trans->ordered_inode_tree,
-				&root_objectid, &objectid, &inode);
-		if (!ret)
-			break;
-		mutex_unlock(&root->fs_info->trans_mutex);
-
-		if (S_ISREG(inode->i_mode)) {
-			atomic_inc(&BTRFS_I(inode)->ordered_writeback);
-			filemap_write_and_wait(inode->i_mapping);
-			atomic_dec(&BTRFS_I(inode)->ordered_writeback);
-		}
-		atomic_dec(&inode->i_count);
-		iput(inode);
-
-		mutex_lock(&root->fs_info->trans_mutex);
-	}
-	atomic_dec(&root->fs_info->throttles);
-	return 0;
-}
-
 static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
 				   struct btrfs_fs_info *fs_info,
 				   struct btrfs_pending_snapshot *pending)
@@ -666,6 +615,7 @@
 	extent_io_tree_init(pinned_copy,
 			     root->fs_info->btree_inode->i_mapping, GFP_NOFS);
 
+printk("commit trans %Lu\n", trans->transid);
 	trans->transaction->in_commit = 1;
 	cur_trans = trans->transaction;
 	if (cur_trans->list.prev != &root->fs_info->trans_list) {
@@ -699,8 +649,6 @@
 
 		mutex_lock(&root->fs_info->trans_mutex);
 		finish_wait(&cur_trans->writer_wait, &wait);
-		ret = btrfs_write_ordered_inodes(trans, root);
-
 	} while (cur_trans->num_writers > 1 ||
 		 (cur_trans->num_joined != joined));
 
@@ -736,6 +684,8 @@
 
 	btrfs_copy_pinned(root, pinned_copy);
 
+	wake_up(&root->fs_info->transaction_throttle);
+
 	mutex_unlock(&root->fs_info->trans_mutex);
 	ret = btrfs_write_and_wait_transaction(trans, root);
 	BUG_ON(ret);
@@ -758,6 +708,7 @@
 		list_splice_init(&dirty_fs_roots, &root->fs_info->dead_roots);
 
 	mutex_unlock(&root->fs_info->trans_mutex);
+printk("done commit trans %Lu\n", trans->transid);
 	kmem_cache_free(btrfs_trans_handle_cachep, trans);
 
 	if (root->fs_info->closing) {
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 9ccd5a5..910350c 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -19,7 +19,6 @@
 #ifndef __BTRFS_TRANSACTION__
 #define __BTRFS_TRANSACTION__
 #include "btrfs_inode.h"
-#include "ordered-data.h"
 
 struct btrfs_transaction {
 	u64 transid;
@@ -31,7 +30,6 @@
 	struct list_head list;
 	struct extent_io_tree dirty_pages;
 	unsigned long start_time;
-	struct btrfs_ordered_inode_tree ordered_inode_tree;
 	wait_queue_head_t writer_wait;
 	wait_queue_head_t commit_wait;
 	struct list_head pending_snapshots;
@@ -88,8 +86,6 @@
 int btrfs_clean_old_snapshots(struct btrfs_root *root);
 int btrfs_commit_transaction(struct btrfs_trans_handle *trans,
 			     struct btrfs_root *root);
-int btrfs_write_ordered_inodes(struct btrfs_trans_handle *trans,
-				struct btrfs_root *root);
 int btrfs_end_transaction_throttle(struct btrfs_trans_handle *trans,
 				   struct btrfs_root *root);
 #endif