Btrfs: Add support for multiple devices per filesystem

Logical block addresses are now translated to physical locations on a
device through a new chunk tree, and per-device allocations are recorded
as dev extents in a new device tree.  The super block gains a pointer to
the chunk root plus an inline sys_chunk_array so the chunk tree itself
can be located at mount time.  Block groups carry a type (data, metadata
or system) instead of the old data/mixed flag, cow-only trees are written
out at commit time via the new dirty_cowonly_roots list, and all bios are
routed through btrfs_map_bio() so they land on the correct device.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9c61609..5ac6f29 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -6,7 +6,7 @@
 	   hash.o file-item.o inode-item.o inode-map.o disk-io.o \
 	   transaction.o bit-radix.o inode.o file.o tree-defrag.o \
 	   extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
-	   extent_io.o
+	   extent_io.o volumes.o
 
 ifeq ($(CONFIG_FS_POSIX_ACL),y)
 btrfs-y += acl.o
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index ef8809c..70b6ddf 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -70,6 +70,14 @@
 	memset(p, 0, sizeof(*p));
 }
 
+static void add_root_to_dirty_list(struct btrfs_root *root)
+{
+	if (root->track_dirty && list_empty(&root->dirty_list)) {
+		list_add(&root->dirty_list,
+			 &root->fs_info->dirty_cowonly_roots);
+	}
+}
+
 int btrfs_copy_root(struct btrfs_trans_handle *trans,
 		      struct btrfs_root *root,
 		      struct extent_buffer *buf,
@@ -196,6 +204,7 @@
 					  root_gen, 0, 0, 1);
 		}
 		free_extent_buffer(buf);
+		add_root_to_dirty_list(root);
 	} else {
 		root_gen = btrfs_header_generation(parent);
 		btrfs_set_node_blockptr(parent, parent_slot,
@@ -241,7 +250,7 @@
 		return 0;
 	}
 
-	search_start = buf->start & ~((u64)BTRFS_BLOCK_GROUP_SIZE - 1);
+	search_start = buf->start & ~((u64)(1024 * 1024 * 1024) - 1);
 	ret = __btrfs_cow_block(trans, root, buf, parent,
 				 parent_slot, cow_ret, search_start, 0);
 	return ret;
@@ -724,6 +733,7 @@
 		BUG_ON(ret);
 
 		root->node = child;
+		add_root_to_dirty_list(root);
 		path->nodes[level] = NULL;
 		clean_tree_block(trans, root, mid);
 		wait_on_tree_block_writeback(root, mid);
@@ -1369,6 +1379,7 @@
 	/* the super has an extra ref to root->node */
 	free_extent_buffer(root->node);
 	root->node = c;
+	add_root_to_dirty_list(root);
 	extent_buffer_get(c);
 	path->nodes[level] = c;
 	path->slots[level] = 0;
@@ -2777,3 +2788,28 @@
 	}
 	return 0;
 }
+
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type)
+{
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int ret;
+
+	while(1) {
+		if (path->slots[0] == 0) {
+			ret = btrfs_prev_leaf(root, path);
+			if (ret != 0)
+				return ret;
+		} else {
+			path->slots[0]--;
+		}
+		leaf = path->nodes[0];
+		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
+		if (found_key.type == type)
+			return 0;
+	}
+	return 1;
+}
+
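btrfs_previous_item() replaces the open-coded find_previous_extent() that
is removed from extent-tree.c below: the caller positions a path with
btrfs_search_slot() and then steps backwards until an item of the wanted
type turns up.  A minimal sketch of the calling pattern, modeled on the
find_next_devid() helper added in volumes.c later in this patch (error
handling trimmed; root is assumed to be the chunk root):

	struct btrfs_path *path = btrfs_alloc_path();
	struct btrfs_key key;
	struct btrfs_key found_key;
	int ret;

	/* aim past the last possible key for this objectid */
	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
	key.type = BTRFS_DEV_ITEM_KEY;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret >= 0) {
		/* step back to the most recent item of the wanted type */
		ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
					  BTRFS_DEV_ITEM_KEY);
		if (ret == 0)
			btrfs_item_key_to_cpu(path->nodes[0], &found_key,
					      path->slots[0]);
	}
	btrfs_free_path(path);
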
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 92d892f..1453d99 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,12 +40,44 @@
 #define BTRFS_MAGIC "_B4RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
+
+/* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/* one per subvolume, storing files and directories */
 #define BTRFS_FS_TREE_OBJECTID 3ULL
+
+/* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
+
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 5ULL
+
+/*
+ * stores which areas of each device are in use, one dev extent item per
+ * allocation.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 6ULL
+
+/*
+ * All files have objectids higher than this.
+ */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
 
+
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -95,6 +127,81 @@
 	u64 offset;
 } __attribute__ ((__packed__));
 
+struct btrfs_mapping_tree {
+	struct extent_map_tree map_tree;
+};
+
+#define BTRFS_DEV_UUID_SIZE 16
+struct btrfs_dev_item {
+	/* the internal btrfs device id */
+	__le64 devid;
+
+	/* size of the device */
+	__le64 total_bytes;
+
+	/* bytes used */
+	__le64 bytes_used;
+
+	/* optimal io alignment for this device */
+	__le32 io_align;
+
+	/* optimal io width for this device */
+	__le32 io_width;
+
+	/* minimal io size for this device */
+	__le32 sector_size;
+
+	/* the kernel device number */
+	__le64 rdev;
+
+	/* type and info about this device */
+	__le64 type;
+
+	/* partition number, 0 for whole dev */
+	__le32 partition;
+
+	/* length of the name data at the end of the item */
+	__le16 name_len;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+	__le64 devid;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+	__le64 owner;
+	__le64 stripe_len;
+	__le64 type;
+
+	/* optimal io alignment for this chunk */
+	__le32 io_align;
+
+	/* optimal io width for this chunk */
+	__le32 io_width;
+
+	/* minimal io size for this chunk */
+	__le32 sector_size;
+
+	/* 2^16 stripes is quite a lot, a second limit is the size of a single
+	 * item in the btree
+	 */
+	__le16 num_stripes;
+	struct btrfs_stripe stripe;
+	/* additional stripes go here */
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+	BUG_ON(num_stripes == 0);
+	return sizeof(struct btrfs_chunk) +
+		sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
 #define BTRFS_FSID_SIZE 16
 /*
  * every tree block (leaf or node) starts with this header.
@@ -119,6 +226,13 @@
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -131,6 +245,7 @@
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
+	__le64 chunk_root;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -138,7 +253,10 @@
 	__le32 nodesize;
 	__le32 leafsize;
 	__le32 stripesize;
+	__le32 sys_chunk_array_size;
 	u8 root_level;
+	u8 chunk_root_level;
+	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
 /*
@@ -208,12 +326,22 @@
 	__le64 offset;
 } __attribute__ ((__packed__));
 
+/* dev extents record the space on an individual device that has been
+ * handed out to a chunk.  The owner field points back to the chunk
+ * allocation mapping tree that allocated the extent
+ */
+struct btrfs_dev_extent {
+	__le64 owner;
+	__le64 length;
+} __attribute__ ((__packed__));
+
+
 struct btrfs_inode_ref {
 	__le16 name_len;
 	/* name goes here */
 } __attribute__ ((__packed__));
 
-struct btrfs_inode_timespec {
+struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
 } __attribute__ ((__packed__));
@@ -231,13 +359,13 @@
 	__le32 uid;
 	__le32 gid;
 	__le32 mode;
-	__le32 rdev;
+	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
-	struct btrfs_inode_timespec atime;
-	struct btrfs_inode_timespec ctime;
-	struct btrfs_inode_timespec mtime;
-	struct btrfs_inode_timespec otime;
+	struct btrfs_timespec atime;
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec mtime;
+	struct btrfs_timespec otime;
 } __attribute__ ((__packed__));
 
 struct btrfs_dir_item {
@@ -290,29 +418,34 @@
 	u8 csum;
 } __attribute__ ((__packed__));
 
-/* tag for the radix tree of block groups in ram */
-#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 
 
-#define BTRFS_BLOCK_GROUP_DATA 1
-#define BTRFS_BLOCK_GROUP_MIXED 2
-
 struct btrfs_block_group_item {
 	__le64 used;
-	u8 flags;
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	int data;
-	int cached;
 	u64 pinned;
+	u64 flags;
+	int cached;
 };
+
+struct btrfs_device;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct radix_tree_root fs_roots_radix;
 
 	struct extent_io_tree free_space_cache;
@@ -321,6 +454,9 @@
 	struct extent_io_tree pending_del;
 	struct extent_io_tree extent_ins;
 
+	/* logical->physical extent mapping */
+	struct btrfs_mapping_tree mapping_tree;
+
 	u64 generation;
 	u64 last_trans_committed;
 	unsigned long mount_opt;
@@ -330,6 +466,7 @@
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
+	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
 	spinlock_t hash_lock;
@@ -350,12 +487,17 @@
 	unsigned long throttles;
 
 	u64 total_pinned;
+	struct list_head dirty_cowonly_roots;
+
+	struct list_head devices;
+	struct list_head *last_device;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
 };
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -387,14 +529,19 @@
 	u64 highest_inode;
 	u64 last_inode_alloc;
 	int ref_cows;
+	int track_dirty;
 	struct btrfs_key defrag_progress;
 	int defrag_running;
 	int defrag_level;
 	char *name;
 	int in_sysfs;
+
+	/* the dirty list is only used by non-reference counted roots */
+	struct list_head dirty_list;
 };
 
 /*
+
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -439,6 +586,10 @@
  */
 #define BTRFS_BLOCK_GROUP_ITEM_KEY 50
 
+#define BTRFS_DEV_EXTENT_KEY	75
+#define BTRFS_DEV_ITEM_KEY	76
+#define BTRFS_CHUNK_ITEM_KEY	77
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -518,13 +669,104 @@
 	s->member = cpu_to_le##bits(val);				\
 }
 
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64);
+BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32);
+BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16);
+
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline char *btrfs_device_name(struct btrfs_dev_item *d)
+{
+	return (char *)(d + 1);
+}
+
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+			 stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+						   int nr)
+{
+	unsigned long offset = (unsigned long)c;
+	offset += offsetof(struct btrfs_chunk, stripe);
+	offset += nr * sizeof(struct btrfs_stripe);
+	return (struct btrfs_stripe *)offset;
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
 /* struct btrfs_block_group_item */
 BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item,
-		   flags, 8);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+			struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objectid,
+		   struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+		   struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+			struct btrfs_block_group_item, flags, 64);
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
@@ -538,49 +780,53 @@
 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
 BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
 BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
 		   compat_flags, 16);
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, atime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, mtime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, ctime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, otime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64);
-BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
 /* struct btrfs_extent_item */
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
 /* struct btrfs_extent_ref */
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
@@ -846,8 +1092,14 @@
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+			 struct btrfs_super_block, sys_chunk_array_size, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 			 root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+			 chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+			 chunk_root_level, 8);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -1009,7 +1261,14 @@
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size);
 /* ctree.c */
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type);
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,
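
The stripe array is laid out inline after the fixed part of struct
btrfs_chunk, which is why btrfs_chunk_item_size() subtracts the one
embedded stripe and why the per-stripe accessors take an index.  As an
illustration only (this helper is hypothetical, not part of the patch),
finding the physical offset of a chunk on a particular device is just a
scan of that inline array:

static u64 chunk_offset_on_dev(struct extent_buffer *leaf,
			       struct btrfs_chunk *chunk, u64 devid)
{
	int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
	int i;

	for (i = 0; i < num_stripes; i++) {
		if (btrfs_stripe_devid_nr(leaf, chunk, i) == devid)
			return btrfs_stripe_offset_nr(leaf, chunk, i);
	}
	/* no stripe of this chunk lives on that device */
	return (u64)-1;
}

The same accessors are what print-tree.c uses below to dump chunk items.
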
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 88e21bd..8e37fa12 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -28,6 +28,7 @@
 #include "disk-io.h"
 #include "transaction.h"
 #include "btrfs_inode.h"
+#include "volumes.h"
 #include "print-tree.h"
 
 #if 0
@@ -234,6 +235,19 @@
 	return 0;
 }
 
+static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
+{
+	struct btrfs_root *root = BTRFS_I(inode)->root;
+	u64 offset;
+	offset = bio->bi_sector << 9;
+	if (offset == BTRFS_SUPER_INFO_OFFSET) {
+		bio->bi_bdev = root->fs_info->sb->s_bdev;
+		submit_bio(rw, bio);
+		return 0;
+	}
+	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
+}
+
 static int btree_writepage(struct page *page, struct writeback_control *wbc)
 {
 	struct extent_io_tree *tree;
@@ -345,6 +359,23 @@
 	return ret;
 }
 
+static int close_all_devices(struct btrfs_fs_info *fs_info)
+{
+	struct list_head *list;
+	struct list_head *next;
+	struct btrfs_device *device;
+
+	list = &fs_info->devices;
+	while(!list_empty(list)) {
+		next = list->next;
+		list_del(next);
+		device = list_entry(next, struct btrfs_device, dev_list);
+		kfree(device->name);
+		kfree(device);
+	}
+	return 0;
+}
+
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize)
 {
@@ -420,6 +451,8 @@
 	root->leafsize = leafsize;
 	root->stripesize = stripesize;
 	root->ref_cows = 0;
+	root->track_dirty = 0;
+
 	root->fs_info = fs_info;
 	root->objectid = objectid;
 	root->last_trans = 0;
@@ -427,6 +460,8 @@
 	root->last_inode_alloc = 0;
 	root->name = NULL;
 	root->in_sysfs = 0;
+
+	INIT_LIST_HEAD(&root->dirty_list);
 	memset(&root->root_key, 0, sizeof(root->root_key));
 	memset(&root->root_item, 0, sizeof(root->root_item));
 	memset(&root->defrag_progress, 0, sizeof(root->defrag_progress));
@@ -634,6 +669,10 @@
 					       GFP_NOFS);
 	struct btrfs_fs_info *fs_info = kmalloc(sizeof(*fs_info),
 						GFP_NOFS);
+	struct btrfs_root *chunk_root = kmalloc(sizeof(struct btrfs_root),
+						GFP_NOFS);
+	struct btrfs_root *dev_root = kmalloc(sizeof(struct btrfs_root),
+					      GFP_NOFS);
 	int ret;
 	int err = -EIO;
 	struct btrfs_super_block *disk_super;
@@ -657,6 +696,12 @@
 	fs_info->last_trans_committed = 0;
 	fs_info->tree_root = tree_root;
 	fs_info->extent_root = extent_root;
+	fs_info->chunk_root = chunk_root;
+	fs_info->dev_root = dev_root;
+	INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots);
+	INIT_LIST_HEAD(&fs_info->devices);
+	btrfs_mapping_init(&fs_info->mapping_tree);
+	fs_info->last_device = &fs_info->devices;
 	fs_info->sb = sb;
 	fs_info->throttles = 0;
 	fs_info->mount_opt = 0;
@@ -714,12 +759,12 @@
 		goto fail_iput;
 	}
 #endif
-	__setup_root(512, 512, 512, 512, tree_root,
+	__setup_root(4096, 4096, 4096, 4096, tree_root,
 		     fs_info, BTRFS_ROOT_TREE_OBJECTID);
 
 	fs_info->sb_buffer = read_tree_block(tree_root,
 					     BTRFS_SUPER_INFO_OFFSET,
-					     512);
+					     4096);
 
 	if (!fs_info->sb_buffer)
 		goto fail_iput;
@@ -730,6 +775,7 @@
 	read_extent_buffer(fs_info->sb_buffer, fs_info->fsid,
 			   (unsigned long)btrfs_super_fsid(fs_info->sb_buffer),
 			   BTRFS_FSID_SIZE);
+
 	disk_super = &fs_info->super_copy;
 	if (!btrfs_super_root(disk_super))
 		goto fail_sb_buffer;
@@ -753,23 +799,47 @@
 		goto fail_sb_buffer;
 	}
 
+	mutex_lock(&fs_info->fs_mutex);
+	ret = btrfs_read_sys_array(tree_root);
+	BUG_ON(ret);
+
+	blocksize = btrfs_level_size(tree_root,
+				     btrfs_super_chunk_root_level(disk_super));
+
+	__setup_root(nodesize, leafsize, sectorsize, stripesize,
+		     chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID);
+
+	chunk_root->node = read_tree_block(chunk_root,
+					   btrfs_super_chunk_root(disk_super),
+					   blocksize);
+	BUG_ON(!chunk_root->node);
+
+	ret = btrfs_read_chunk_tree(chunk_root);
+	BUG_ON(ret);
+
 	blocksize = btrfs_level_size(tree_root,
 				     btrfs_super_root_level(disk_super));
 
+
 	tree_root->node = read_tree_block(tree_root,
 					  btrfs_super_root(disk_super),
 					  blocksize);
 	if (!tree_root->node)
 		goto fail_sb_buffer;
 
-	mutex_lock(&fs_info->fs_mutex);
 
 	ret = find_and_setup_root(tree_root, fs_info,
 				  BTRFS_EXTENT_TREE_OBJECTID, extent_root);
-	if (ret) {
-		mutex_unlock(&fs_info->fs_mutex);
+	if (ret)
 		goto fail_tree_root;
-	}
+	extent_root->track_dirty = 1;
+
+	ret = find_and_setup_root(tree_root, fs_info,
+				  BTRFS_DEV_TREE_OBJECTID, dev_root);
+	dev_root->track_dirty = 1;
+
+	if (ret)
+		goto fail_extent_root;
 
 	btrfs_read_block_groups(extent_root);
 
@@ -777,7 +847,10 @@
 	mutex_unlock(&fs_info->fs_mutex);
 	return tree_root;
 
+fail_extent_root:
+	free_extent_buffer(extent_root->node);
 fail_tree_root:
+	mutex_unlock(&fs_info->fs_mutex);
 	free_extent_buffer(tree_root->node);
 fail_sb_buffer:
 	free_extent_buffer(fs_info->sb_buffer);
@@ -874,6 +947,12 @@
 	if (fs_info->tree_root->node)
 		free_extent_buffer(fs_info->tree_root->node);
 
+	if (root->fs_info->chunk_root->node)
+		free_extent_buffer(root->fs_info->chunk_root->node);
+
+	if (root->fs_info->dev_root->node)
+		free_extent_buffer(root->fs_info->dev_root->node);
+
 	free_extent_buffer(fs_info->sb_buffer);
 
 	btrfs_free_block_groups(root->fs_info);
@@ -901,8 +980,13 @@
 		kfree(hasher);
 	}
 #endif
+	close_all_devices(fs_info);
+	btrfs_mapping_tree_free(&fs_info->mapping_tree);
+
 	kfree(fs_info->extent_root);
 	kfree(fs_info->tree_root);
+	kfree(fs_info->chunk_root);
+	kfree(fs_info->dev_root);
 	return 0;
 }
 
@@ -1016,4 +1100,5 @@
 
 static struct extent_io_ops btree_extent_io_ops = {
 	.writepage_io_hook = btree_writepage_io_hook,
+	.submit_bio_hook = btree_submit_bio_hook,
 };
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
index 828f3a2..206cb48 100644
--- a/fs/btrfs/disk-io.h
+++ b/fs/btrfs/disk-io.h
@@ -20,6 +20,7 @@
 #define __DISKIO__
 
 #define BTRFS_SUPER_INFO_OFFSET (16 * 1024)
+struct btrfs_device;
 
 struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr,
 				      u32 blocksize);
@@ -65,4 +66,5 @@
 u32 btrfs_csum_data(struct btrfs_root *root, char *data, u32 seed, size_t len);
 void btrfs_csum_final(u32 crc, char *result);
 void btrfs_throttle(struct btrfs_root *root);
+int btrfs_open_device(struct btrfs_device *dev);
 #endif
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index ebfd304..2cd957d 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -24,37 +24,19 @@
 #include "disk-io.h"
 #include "print-tree.h"
 #include "transaction.h"
+#include "volumes.h"
 
-#define BLOCK_GROUP_DATA EXTENT_WRITEBACK
+#define BLOCK_GROUP_DATA     EXTENT_WRITEBACK
 #define BLOCK_GROUP_METADATA EXTENT_UPTODATE
+#define BLOCK_GROUP_SYSTEM   EXTENT_NEW
+
 #define BLOCK_GROUP_DIRTY EXTENT_DIRTY
 
 static int finish_current_insert(struct btrfs_trans_handle *trans, struct
 				 btrfs_root *extent_root);
 static int del_pending_extents(struct btrfs_trans_handle *trans, struct
 			       btrfs_root *extent_root);
-static int find_previous_extent(struct btrfs_root *root,
-				struct btrfs_path *path)
-{
-	struct btrfs_key found_key;
-	struct extent_buffer *leaf;
-	int ret;
 
-	while(1) {
-		if (path->slots[0] == 0) {
-			ret = btrfs_prev_leaf(root, path);
-			if (ret != 0)
-				return ret;
-		} else {
-			path->slots[0]--;
-		}
-		leaf = path->nodes[0];
-		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
-		if (found_key.type == BTRFS_EXTENT_ITEM_KEY)
-			return 0;
-	}
-	return 1;
-}
 
 static int cache_block_group(struct btrfs_root *root,
 			     struct btrfs_block_group_cache *block_group)
@@ -91,7 +73,7 @@
 	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 	if (ret < 0)
 		return ret;
-	ret = find_previous_extent(root, path);
+	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		return ret;
 	if (ret == 0) {
@@ -168,7 +150,8 @@
 	block_group_cache = &info->block_group_cache;
 	ret = find_first_extent_bit(block_group_cache,
 				    bytenr, &start, &end,
-				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA);
+				    BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA |
+				    BLOCK_GROUP_SYSTEM);
 	if (ret) {
 		return NULL;
 	}
@@ -182,23 +165,38 @@
 		return block_group;
 	return NULL;
 }
-static u64 noinline find_search_start(struct btrfs_root *root,
+
+static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
+{
+	if ((bits & BLOCK_GROUP_DATA) &&
+	    (cache->flags & BTRFS_BLOCK_GROUP_DATA))
+		return 1;
+	if ((bits & BLOCK_GROUP_METADATA) &&
+	     (cache->flags & BTRFS_BLOCK_GROUP_METADATA))
+		return 1;
+	if ((bits & BLOCK_GROUP_SYSTEM) &&
+	     (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM))
+		return 1;
+	return 0;
+}
+
+static int noinline find_search_start(struct btrfs_root *root,
 			      struct btrfs_block_group_cache **cache_ret,
-			      u64 search_start, int num, int data)
+			      u64 *start_ret, int num, int data)
 {
 	int ret;
 	struct btrfs_block_group_cache *cache = *cache_ret;
 	struct extent_io_tree *free_space_cache;
-	struct extent_state *state;
 	u64 last;
 	u64 start = 0;
+	u64 end = 0;
 	u64 cache_miss = 0;
 	u64 total_fs_bytes;
+	u64 search_start = *start_ret;
 	int wrapped = 0;
 
-	if (!cache) {
+	if (!cache)
 		goto out;
-	}
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
 	free_space_cache = &root->fs_info->free_space_cache;
 
@@ -208,6 +206,9 @@
 		goto out;
 
 	last = max(search_start, cache->key.objectid);
+	if (!block_group_bits(cache, data)) {
+		goto new_group;
+	}
 
 	while(1) {
 		ret = find_first_extent_bit(&root->fs_info->free_space_cache,
@@ -225,22 +226,20 @@
 				cache_miss = start;
 			continue;
 		}
-		if (data != BTRFS_BLOCK_GROUP_MIXED &&
-		    start + num > cache->key.objectid + cache->key.offset)
+		if (start + num > cache->key.objectid + cache->key.offset)
 			goto new_group;
 		if (start + num  > total_fs_bytes)
 			goto new_group;
-		return start;
+		*start_ret = start;
+		return 0;
 	}
 out:
 	cache = btrfs_lookup_block_group(root->fs_info, search_start);
 	if (!cache) {
-		printk("Unable to find block group for %Lu\n",
-		       search_start);
+		printk("Unable to find block group for %Lu\n", search_start);
 		WARN_ON(1);
-		return search_start;
 	}
-	return search_start;
+	return -ENOSPC;
 
 new_group:
 	last = cache->key.objectid + cache->key.offset;
@@ -251,7 +250,6 @@
 		if (!wrapped) {
 			wrapped = 1;
 			last = search_start;
-			data = BTRFS_BLOCK_GROUP_MIXED;
 			goto wrapped;
 		}
 		goto out;
@@ -299,7 +297,6 @@
 	int ret;
 	int full_search = 0;
 	int factor = 8;
-	int data_swap = 0;
 
 	block_group_cache = &info->block_group_cache;
 	total_fs_bytes = btrfs_super_total_bytes(&root->fs_info->super_copy);
@@ -307,19 +304,12 @@
 	if (!owner)
 		factor = 8;
 
-	if (data == BTRFS_BLOCK_GROUP_MIXED) {
-		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-		factor = 10;
-	} else if (data)
-		bit = BLOCK_GROUP_DATA;
-	else
-		bit = BLOCK_GROUP_METADATA;
+	bit = data;
 
 	if (search_start && search_start < total_fs_bytes) {
 		struct btrfs_block_group_cache *shint;
 		shint = btrfs_lookup_block_group(info, search_start);
-		if (shint && (shint->data == data ||
-			      shint->data == BTRFS_BLOCK_GROUP_MIXED)) {
+		if (shint && block_group_bits(shint, data)) {
 			used = btrfs_block_group_used(&shint->item);
 			if (used + shint->pinned <
 			    div_factor(shint->key.offset, factor)) {
@@ -327,8 +317,8 @@
 			}
 		}
 	}
-	if (hint && hint->key.objectid < total_fs_bytes &&
-	    (hint->data == data || hint->data == BTRFS_BLOCK_GROUP_MIXED)) {
+	if (hint && block_group_bits(hint, data) &&
+	    hint->key.objectid < total_fs_bytes) {
 		used = btrfs_block_group_used(&hint->item);
 		if (used + hint->pinned <
 		    div_factor(hint->key.offset, factor)) {
@@ -379,12 +369,6 @@
 		full_search = 1;
 		goto again;
 	}
-	if (!data_swap) {
-		data_swap = 1;
-		bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-		last = search_start;
-		goto again;
-	}
 found:
 	return found_group;
 }
@@ -1002,7 +986,7 @@
 static int update_block_group(struct btrfs_trans_handle *trans,
 			      struct btrfs_root *root,
 			      u64 bytenr, u64 num_bytes, int alloc,
-			      int mark_free, int data)
+			      int mark_free)
 {
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -1027,41 +1011,6 @@
 		old_val = btrfs_block_group_used(&cache->item);
 		num_bytes = min(total, cache->key.offset - byte_in_group);
 		if (alloc) {
-			if (cache->data != data &&
-			    old_val < (cache->key.offset >> 1)) {
-				int bit_to_clear;
-				int bit_to_set;
-				cache->data = data;
-				if (data) {
-					bit_to_clear = BLOCK_GROUP_METADATA;
-					bit_to_set = BLOCK_GROUP_DATA;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_MIXED;
-					cache->item.flags |=
-						BTRFS_BLOCK_GROUP_DATA;
-				} else {
-					bit_to_clear = BLOCK_GROUP_DATA;
-					bit_to_set = BLOCK_GROUP_METADATA;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_MIXED;
-					cache->item.flags &=
-						~BTRFS_BLOCK_GROUP_DATA;
-				}
-				clear_extent_bits(&info->block_group_cache,
-						  start, end, bit_to_clear,
-						  GFP_NOFS);
-				set_extent_bits(&info->block_group_cache,
-						start, end, bit_to_set,
-						GFP_NOFS);
-			} else if (cache->data != data &&
-				   cache->data != BTRFS_BLOCK_GROUP_MIXED) {
-				cache->data = BTRFS_BLOCK_GROUP_MIXED;
-				set_extent_bits(&info->block_group_cache,
-						start, end,
-						BLOCK_GROUP_DATA |
-						BLOCK_GROUP_METADATA,
-						GFP_NOFS);
-			}
 			old_val += num_bytes;
 		} else {
 			old_val -= num_bytes;
@@ -1357,7 +1306,7 @@
 			return ret;
 		}
 		ret = update_block_group(trans, root, bytenr, num_bytes, 0,
-					 mark_free, 0);
+					 mark_free);
 		BUG_ON(ret);
 	}
 	btrfs_free_path(path);
@@ -1450,38 +1399,21 @@
 				     u64 exclude_start, u64 exclude_nr,
 				     int data)
 {
-	struct btrfs_path *path;
-	struct btrfs_key key;
-	u64 hole_size = 0;
-	u64 aligned;
 	int ret;
-	int slot = 0;
-	u64 last_byte = 0;
-	u64 *last_ptr = NULL;
 	u64 orig_search_start = search_start;
-	int start_found;
-	struct extent_buffer *l;
 	struct btrfs_root * root = orig_root->fs_info->extent_root;
 	struct btrfs_fs_info *info = root->fs_info;
 	u64 total_needed = num_bytes;
-	int level;
 	struct btrfs_block_group_cache *block_group;
 	int full_scan = 0;
 	int wrapped = 0;
-	int empty_cluster;
-	u64 cached_start;
 
 	WARN_ON(num_bytes < root->sectorsize);
 	btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY);
 
-	level = btrfs_header_level(root->node);
-
-	if (num_bytes >= 32 * 1024 * 1024 && hint_byte) {
-		data = BTRFS_BLOCK_GROUP_MIXED;
-	}
-
 	if (search_end == (u64)-1)
 		search_end = btrfs_super_total_bytes(&info->super_copy);
+
 	if (hint_byte) {
 		block_group = btrfs_lookup_block_group(info, hint_byte);
 		if (!block_group)
@@ -1495,7 +1427,7 @@
 	}
 
 	total_needed += empty_size;
-	path = btrfs_alloc_path();
+
 check_failed:
 	if (!block_group) {
 		block_group = btrfs_lookup_block_group(info, search_start);
@@ -1503,135 +1435,49 @@
 			block_group = btrfs_lookup_block_group(info,
 						       orig_search_start);
 	}
-	search_start = find_search_start(root, &block_group, search_start,
-					 total_needed, data);
+	ret = find_search_start(root, &block_group, &search_start,
+				total_needed, data);
+	if (ret)
+		goto error;
+
 	search_start = stripe_align(root, search_start);
-	cached_start = search_start;
-	btrfs_init_path(path);
 	ins->objectid = search_start;
-	ins->offset = 0;
-	start_found = 0;
-	path->reada = 2;
-
-	ret = btrfs_search_slot(trans, root, ins, path, 0, 0);
-	if (ret < 0)
-		goto error;
-	ret = find_previous_extent(root, path);
-	if (ret < 0)
-		goto error;
-	l = path->nodes[0];
-	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
-	while (1) {
-		l = path->nodes[0];
-		slot = path->slots[0];
-		if (slot >= btrfs_header_nritems(l)) {
-			ret = btrfs_next_leaf(root, path);
-			if (ret == 0)
-				continue;
-			if (ret < 0)
-				goto error;
-
-			search_start = max(search_start,
-					   block_group->key.objectid);
-			if (!start_found) {
-				aligned = stripe_align(root, search_start);
-				ins->objectid = aligned;
-				if (aligned >= search_end) {
-					ret = -ENOSPC;
-					goto error;
-				}
-				ins->offset = search_end - aligned;
-				start_found = 1;
-				goto check_pending;
-			}
-			ins->objectid = stripe_align(root,
-						     last_byte > search_start ?
-						     last_byte : search_start);
-			if (search_end <= ins->objectid) {
-				ret = -ENOSPC;
-				goto error;
-			}
-			ins->offset = search_end - ins->objectid;
-			BUG_ON(ins->objectid >= search_end);
-			goto check_pending;
-		}
-		btrfs_item_key_to_cpu(l, &key, slot);
-
-		if (key.objectid >= search_start && key.objectid > last_byte &&
-		    start_found) {
-			if (last_byte < search_start)
-				last_byte = search_start;
-			aligned = stripe_align(root, last_byte);
-			hole_size = key.objectid - aligned;
-			if (key.objectid > aligned && hole_size >= num_bytes) {
-				ins->objectid = aligned;
-				ins->offset = hole_size;
-				goto check_pending;
-			}
-		}
-		if (btrfs_key_type(&key) != BTRFS_EXTENT_ITEM_KEY) {
-			if (!start_found && btrfs_key_type(&key) ==
-			    BTRFS_BLOCK_GROUP_ITEM_KEY) {
-				last_byte = key.objectid;
-				start_found = 1;
-			}
-			goto next;
-		}
-
-
-		start_found = 1;
-		last_byte = key.objectid + key.offset;
-
-		if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
-		    last_byte >= block_group->key.objectid +
-		    block_group->key.offset) {
-			btrfs_release_path(root, path);
-			search_start = block_group->key.objectid +
-				block_group->key.offset;
-			goto new_group;
-		}
-next:
-		path->slots[0]++;
-		cond_resched();
-	}
-check_pending:
-	/* we have to make sure we didn't find an extent that has already
-	 * been allocated by the map tree or the original allocation
-	 */
-	btrfs_release_path(root, path);
-	BUG_ON(ins->objectid < search_start);
+	ins->offset = num_bytes;
 
 	if (ins->objectid + num_bytes >= search_end)
 		goto enospc;
-	if (!full_scan && data != BTRFS_BLOCK_GROUP_MIXED &&
-	    ins->objectid + num_bytes > block_group->
-	    key.objectid + block_group->key.offset) {
+
+	if (ins->objectid + num_bytes >
+	    block_group->key.objectid + block_group->key.offset) {
 		search_start = block_group->key.objectid +
 			block_group->key.offset;
 		goto new_group;
 	}
+
 	if (test_range_bit(&info->extent_ins, ins->objectid,
 			   ins->objectid + num_bytes -1, EXTENT_LOCKED, 0)) {
 		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
+
 	if (test_range_bit(&info->pinned_extents, ins->objectid,
 			   ins->objectid + num_bytes -1, EXTENT_DIRTY, 0)) {
 		search_start = ins->objectid + num_bytes;
 		goto new_group;
 	}
+
 	if (exclude_nr > 0 && (ins->objectid + num_bytes > exclude_start &&
 	    ins->objectid < exclude_start + exclude_nr)) {
 		search_start = exclude_start + exclude_nr;
 		goto new_group;
 	}
-	if (!data) {
+
+	if (!(data & BLOCK_GROUP_DATA)) {
 		block_group = btrfs_lookup_block_group(info, ins->objectid);
 		if (block_group)
 			trans->block_group = block_group;
 	}
 	ins->offset = num_bytes;
-	btrfs_free_path(path);
 	return 0;
 
 new_group:
@@ -1646,7 +1492,6 @@
 			if (!full_scan)
 				total_needed -= empty_size;
 			full_scan = 1;
-			data = BTRFS_BLOCK_GROUP_MIXED;
 		} else
 			wrapped = 1;
 	}
@@ -1657,8 +1502,6 @@
 	goto check_failed;
 
 error:
-	btrfs_release_path(root, path);
-	btrfs_free_path(path);
 	return ret;
 }
 /*
@@ -1689,6 +1532,13 @@
 	struct btrfs_path *path;
 	struct btrfs_key keys[2];
 
+	if (data)
+		data = BLOCK_GROUP_DATA;
+	else if (root == root->fs_info->chunk_root)
+		data = BLOCK_GROUP_SYSTEM;
+	else
+		data = BLOCK_GROUP_METADATA;
+
 	new_hint = max(hint_byte, root->fs_info->alloc_start);
 	if (new_hint < btrfs_super_total_bytes(&info->super_copy))
 		hint_byte = new_hint;
@@ -1718,7 +1568,6 @@
 		set_extent_bits(&root->fs_info->extent_ins, ins->objectid,
 				ins->objectid + ins->offset - 1,
 				EXTENT_LOCKED, GFP_NOFS);
-		WARN_ON(data == 1);
 		goto update_block;
 	}
 
@@ -1768,8 +1617,7 @@
 	}
 
 update_block:
-	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0,
-				 data);
+	ret = update_block_group(trans, root, ins->objectid, ins->offset, 1, 0);
 	if (ret) {
 		printk("update block group failed for %Lu %Lu\n",
 		       ins->objectid, ins->offset);
@@ -2457,7 +2305,7 @@
 	if (ret < 0)
 		goto out;
 
-	ret = find_previous_extent(root, path);
+	ret = btrfs_previous_item(root, path, 0, BTRFS_EXTENT_ITEM_KEY);
 	if (ret < 0)
 		goto out;
 	if (ret == 0) {
@@ -2604,95 +2452,48 @@
 int btrfs_grow_extent_tree(struct btrfs_trans_handle *trans,
 			   struct btrfs_root *root, u64 new_size)
 {
-	struct btrfs_path *path;
-	u64 nr = 0;
-	u64 cur_byte;
-	u64 old_size;
-	unsigned long rem;
-	struct btrfs_block_group_cache *cache;
-	struct btrfs_block_group_item *item;
-	struct btrfs_fs_info *info = root->fs_info;
-	struct extent_io_tree *block_group_cache;
-	struct btrfs_key key;
-	struct extent_buffer *leaf;
-	int ret;
-	int bit;
-
-	old_size = btrfs_super_total_bytes(&info->super_copy);
-	block_group_cache = &info->block_group_cache;
-
-	root = info->extent_root;
-
-	cache = btrfs_lookup_block_group(root->fs_info, old_size - 1);
-
-	cur_byte = cache->key.objectid + cache->key.offset;
-	if (cur_byte >= new_size)
-		goto set_size;
-
-	key.offset = BTRFS_BLOCK_GROUP_SIZE;
-	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
-
-	path = btrfs_alloc_path();
-	if (!path)
-		return -ENOMEM;
-
-	while(cur_byte < new_size) {
-		key.objectid = cur_byte;
-		ret = btrfs_insert_empty_item(trans, root, path, &key,
-				        sizeof(struct btrfs_block_group_item));
-		BUG_ON(ret);
-		leaf = path->nodes[0];
-		item = btrfs_item_ptr(leaf, path->slots[0],
-				      struct btrfs_block_group_item);
-
-		btrfs_set_disk_block_group_used(leaf, item, 0);
-		div_long_long_rem(nr, 3, &rem);
-		if (rem) {
-			btrfs_set_disk_block_group_flags(leaf, item,
-						 BTRFS_BLOCK_GROUP_DATA);
-		} else {
-			btrfs_set_disk_block_group_flags(leaf, item, 0);
-		}
-		nr++;
-
-		cache = kmalloc(sizeof(*cache), GFP_NOFS);
-		BUG_ON(!cache);
-
-		read_extent_buffer(leaf, &cache->item, (unsigned long)item,
-				   sizeof(cache->item));
-
-		memcpy(&cache->key, &key, sizeof(key));
-		cache->cached = 0;
-		cache->pinned = 0;
-		cur_byte = key.objectid + key.offset;
-		btrfs_release_path(root, path);
-
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
-			bit = BLOCK_GROUP_DATA;
-			cache->data = BTRFS_BLOCK_GROUP_DATA;
-		} else {
-			bit = BLOCK_GROUP_METADATA;
-			cache->data = 0;
-		}
-
-		/* use EXTENT_LOCKED to prevent merging */
-		set_extent_bits(block_group_cache, key.objectid,
-				key.objectid + key.offset - 1,
-				bit | EXTENT_LOCKED, GFP_NOFS);
-		set_state_private(block_group_cache, key.objectid,
-				  (unsigned long)cache);
-	}
-	btrfs_free_path(path);
-set_size:
-	btrfs_set_super_total_bytes(&info->super_copy, new_size);
+	btrfs_set_super_total_bytes(&root->fs_info->super_copy, new_size);
 	return 0;
 }
 
+int find_first_block_group(struct btrfs_root *root, struct btrfs_path *path,
+			   struct btrfs_key *key)
+{
+	int ret;
+	struct btrfs_key found_key;
+	struct extent_buffer *leaf;
+	int slot;
+
+	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
+	if (ret < 0)
+		return ret;
+	while(1) {
+		slot = path->slots[0];
+		leaf = path->nodes[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+
+		if (found_key.objectid >= key->objectid &&
+		    found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
+			return 0;
+		path->slots[0]++;
+	}
+	ret = -ENOENT;
+error:
+	return ret;
+}
+
 int btrfs_read_block_groups(struct btrfs_root *root)
 {
 	struct btrfs_path *path;
 	int ret;
-	int err = 0;
 	int bit;
 	struct btrfs_block_group_cache *cache;
 	struct btrfs_fs_info *info = root->fs_info;
@@ -2702,28 +2503,28 @@
 	struct extent_buffer *leaf;
 
 	block_group_cache = &info->block_group_cache;
-
 	root = info->extent_root;
 	key.objectid = 0;
-	key.offset = BTRFS_BLOCK_GROUP_SIZE;
+	key.offset = 0;
 	btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY);
-
 	path = btrfs_alloc_path();
 	if (!path)
 		return -ENOMEM;
 
 	while(1) {
-		ret = btrfs_search_slot(NULL, info->extent_root,
-					&key, path, 0, 0);
-		if (ret != 0) {
-			err = ret;
-			break;
+		ret = find_first_block_group(root, path, &key);
+		if (ret > 0) {
+			ret = 0;
+			goto error;
 		}
+		if (ret != 0)
+			goto error;
+
 		leaf = path->nodes[0];
 		btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
 		cache = kmalloc(sizeof(*cache), GFP_NOFS);
 		if (!cache) {
-			err = -1;
-			break;
+			ret = -ENOMEM;
+			goto error;
 		}
 
@@ -2733,18 +2534,17 @@
 		memcpy(&cache->key, &found_key, sizeof(found_key));
 		cache->cached = 0;
 		cache->pinned = 0;
+
 		key.objectid = found_key.objectid + found_key.offset;
 		btrfs_release_path(root, path);
-
-		if (cache->item.flags & BTRFS_BLOCK_GROUP_MIXED) {
-			bit = BLOCK_GROUP_DATA | BLOCK_GROUP_METADATA;
-			cache->data = BTRFS_BLOCK_GROUP_MIXED;
-		} else if (cache->item.flags & BTRFS_BLOCK_GROUP_DATA) {
+		cache->flags = btrfs_block_group_flags(&cache->item);
+		bit = 0;
+		if (cache->flags & BTRFS_BLOCK_GROUP_DATA) {
 			bit = BLOCK_GROUP_DATA;
-			cache->data = BTRFS_BLOCK_GROUP_DATA;
-		} else {
+		} else if (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM) {
+			bit = BLOCK_GROUP_SYSTEM;
+		} else if (cache->flags & BTRFS_BLOCK_GROUP_METADATA) {
 			bit = BLOCK_GROUP_METADATA;
-			cache->data = 0;
 		}
 
 		/* use EXTENT_LOCKED to prevent merging */
@@ -2758,7 +2558,8 @@
 		    btrfs_super_total_bytes(&info->super_copy))
 			break;
 	}
-
+	ret = 0;
+error:
 	btrfs_free_path(path);
-	return 0;
+	return ret;
 }
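
Two parallel sets of names are now in play in this file: the on-disk
BTRFS_BLOCK_GROUP_* flags stored in the block group item, and the
in-memory BLOCK_GROUP_* bits used to tag entries in the block_group_cache
extent tree.  btrfs_read_block_groups() picks exactly one in-memory bit
per group; a compact statement of that mapping (hypothetical helper,
shown only to make the correspondence explicit):

static int block_group_state_bit(u64 flags)
{
	if (flags & BTRFS_BLOCK_GROUP_DATA)
		return BLOCK_GROUP_DATA;	/* EXTENT_WRITEBACK */
	else if (flags & BTRFS_BLOCK_GROUP_SYSTEM)
		return BLOCK_GROUP_SYSTEM;	/* EXTENT_NEW */
	else if (flags & BTRFS_BLOCK_GROUP_METADATA)
		return BLOCK_GROUP_METADATA;	/* EXTENT_UPTODATE */
	return 0;
}

block_group_bits() above goes the other way, checking a cache entry's
on-disk flags against a requested set of allocation bits.
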
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e8130c8..7e3a1eb 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1706,9 +1706,9 @@
 		WARN_ON(1);
 	}
 	if (tree->ops && tree->ops->submit_bio_hook)
-		tree->ops->submit_bio_hook(rw, bio);
-
-	submit_bio(rw, bio);
+		tree->ops->submit_bio_hook(page->mapping->host, rw, bio);
+	else
+		submit_bio(rw, bio);
 	if (bio_flagged(bio, BIO_EOPNOTSUPP))
 		ret = -EOPNOTSUPP;
 	bio_put(bio);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 9d66546..0dca893 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -28,7 +28,7 @@
 struct extent_io_ops {
 	int (*fill_delalloc)(struct inode *inode, u64 start, u64 end);
 	int (*writepage_io_hook)(struct page *page, u64 start, u64 end);
-	int (*submit_bio_hook)(int rw, struct bio *bio);
+	int (*submit_bio_hook)(struct inode *inode, int rw, struct bio *bio);
 	int (*readpage_io_hook)(struct page *page, u64 start, u64 end);
 	int (*readpage_end_io_hook)(struct page *page, u64 start, u64 end,
 				    struct extent_state *state);
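
With the inode passed in, a submit_bio_hook implementation is now
responsible for getting the bio onto the right device itself instead of
letting extent_io.c call submit_bio() unconditionally.  The two real
hooks are btree_submit_bio_hook() in disk-io.c and btrfs_submit_bio_hook()
in inode.c; the minimal shape of a conforming hook is roughly the
following sketch (example_submit_bio_hook is illustrative only):

static int example_submit_bio_hook(struct inode *inode, int rw,
				   struct bio *bio)
{
	/* hand the bio to the multi-device layer, which remaps it to a
	 * backing device before submission */
	return btrfs_map_bio(BTRFS_I(inode)->root, rw, bio);
}
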
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 315dd55..17063cd 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -40,6 +40,7 @@
 #include "btrfs_inode.h"
 #include "ioctl.h"
 #include "print-tree.h"
+#include "volumes.h"
 
 struct btrfs_iget_args {
 	u64 ino;
@@ -295,20 +296,20 @@
 	return 0;
 }
 
-int btrfs_submit_bio_hook(int rw, struct bio *bio)
+int btrfs_submit_bio_hook(struct inode *inode, int rw, struct bio *bio)
 {
-	struct bio_vec *bvec = bio->bi_io_vec;
-	struct inode *inode = bvec->bv_page->mapping->host;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_trans_handle *trans;
 	int ret = 0;
 
-	if (rw != WRITE)
-		return 0;
+	if (rw != WRITE) {
+		goto mapit;
+	}
 
 	if (btrfs_test_opt(root, NODATASUM) ||
-	    btrfs_test_flag(inode, NODATASUM))
-		return 0;
+	    btrfs_test_flag(inode, NODATASUM)) {
+		goto mapit;
+	}
 
 	mutex_lock(&root->fs_info->fs_mutex);
 	trans = btrfs_start_transaction(root, 1);
@@ -317,7 +318,8 @@
 	ret = btrfs_end_transaction(trans, root);
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->fs_mutex);
-	return ret;
+mapit:
+	return btrfs_map_bio(root, rw, bio);
 }
 
 int btrfs_readpage_io_hook(struct page *page, u64 start, u64 end)
@@ -406,7 +408,7 @@
 	struct btrfs_path *path;
 	struct extent_buffer *leaf;
 	struct btrfs_inode_item *inode_item;
-	struct btrfs_inode_timespec *tspec;
+	struct btrfs_timespec *tspec;
 	struct btrfs_root *root = BTRFS_I(inode)->root;
 	struct btrfs_key location;
 	u64 alloc_group_block;
@@ -455,7 +457,8 @@
 	BTRFS_I(inode)->flags = btrfs_inode_flags(leaf, inode_item);
 	if (!BTRFS_I(inode)->block_group) {
 		BTRFS_I(inode)->block_group = btrfs_find_block_group(root,
-						         NULL, 0, 0, 0);
+						 NULL, 0,
+						 BTRFS_BLOCK_GROUP_METADATA, 0);
 	}
 	btrfs_free_path(path);
 	inode_item = NULL;
@@ -1550,7 +1553,8 @@
 		owner = 0;
 	else
 		owner = 1;
-	group = btrfs_find_block_group(root, group, 0, 0, owner);
+	group = btrfs_find_block_group(root, group, 0,
+				       BTRFS_BLOCK_GROUP_METADATA, owner);
 	BTRFS_I(inode)->block_group = group;
 	BTRFS_I(inode)->flags = 0;
 
diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c
index da0b4dc..9c1335d 100644
--- a/fs/btrfs/print-tree.c
+++ b/fs/btrfs/print-tree.c
@@ -20,6 +20,40 @@
 #include "disk-io.h"
 #include "print-tree.h"
 
+static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk)
+{
+	int num_stripes = btrfs_chunk_num_stripes(eb, chunk);
+	int i;
+	printk("\t\tchunk owner %llu type %llu num_stripes %d\n",
+	       (unsigned long long)btrfs_chunk_owner(eb, chunk),
+	       (unsigned long long)btrfs_chunk_type(eb, chunk),
+	       num_stripes);
+	for (i = 0 ; i < num_stripes ; i++) {
+		printk("\t\t\tstripe %d devid %llu offset %llu\n", i,
+		      (unsigned long long)btrfs_stripe_devid_nr(eb, chunk, i),
+		      (unsigned long long)btrfs_stripe_offset_nr(eb, chunk, i));
+	}
+}
+static void print_dev_item(struct extent_buffer *eb,
+			   struct btrfs_dev_item *dev_item)
+{
+	char *name;
+	int name_len;
+
+	name_len = btrfs_device_name_len(eb, dev_item);
+	name = kmalloc(name_len, GFP_NOFS);
+	if (name) {
+		read_extent_buffer(eb, name,
+				   (unsigned long)btrfs_device_name(dev_item),
+				   name_len);
+	}
+	printk("\t\tdev item name %.*s devid %llu "
+	       "total_bytes %llu bytes used %llu\n", name_len, name,
+	       (unsigned long long)btrfs_device_id(eb, dev_item),
+	       (unsigned long long)btrfs_device_total_bytes(eb, dev_item),
+	       (unsigned long long)btrfs_device_bytes_used(eb, dev_item));
+	kfree(name);
+}
 void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l)
 {
 	int i;
@@ -34,6 +68,7 @@
 	struct btrfs_key key;
 	struct btrfs_key found_key;
 	struct btrfs_extent_ref *ref;
+	struct btrfs_dev_extent *dev_extent;
 	u32 type;
 
 	printk("leaf %llu total ptrs %d free space %d\n",
@@ -106,6 +141,19 @@
 			printk("\t\tblock group used %llu\n",
 			       (unsigned long long)btrfs_disk_block_group_used(l, bi));
 			break;
+		case BTRFS_CHUNK_ITEM_KEY:
+			print_chunk(l, btrfs_item_ptr(l, i, struct btrfs_chunk));
+			break;
+		case BTRFS_DEV_ITEM_KEY:
+			print_dev_item(l, btrfs_item_ptr(l, i,
+					struct btrfs_dev_item));
+			break;
+		case BTRFS_DEV_EXTENT_KEY:
+			dev_extent = btrfs_item_ptr(l, i,
+						    struct btrfs_dev_extent);
+			printk("\t\tdev extent owner %llu length %llu\n",
+			       (unsigned long long)btrfs_dev_extent_owner(l, dev_extent),
+			       (unsigned long long)btrfs_dev_extent_length(l, dev_extent));
 		};
 	}
 }
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index e9a0983..5e9f692 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -198,29 +198,42 @@
 	return werr;
 }
 
+static int update_cowonly_root(struct btrfs_trans_handle *trans,
+			       struct btrfs_root *root)
+{
+	int ret;
+	u64 old_root_bytenr;
+	struct btrfs_root *tree_root = root->fs_info->tree_root;
+
+	btrfs_write_dirty_block_groups(trans, root);
+	while(1) {
+		old_root_bytenr = btrfs_root_bytenr(&root->root_item);
+		if (old_root_bytenr == root->node->start)
+			break;
+		btrfs_set_root_bytenr(&root->root_item,
+				       root->node->start);
+		btrfs_set_root_level(&root->root_item,
+				     btrfs_header_level(root->node));
+		ret = btrfs_update_root(trans, tree_root,
+					&root->root_key,
+					&root->root_item);
+		BUG_ON(ret);
+		btrfs_write_dirty_block_groups(trans, root);
+	}
+	return 0;
+}
+
 int btrfs_commit_tree_roots(struct btrfs_trans_handle *trans,
 			    struct btrfs_root *root)
 {
-	int ret;
-	u64 old_extent_block;
 	struct btrfs_fs_info *fs_info = root->fs_info;
-	struct btrfs_root *tree_root = fs_info->tree_root;
-	struct btrfs_root *extent_root = fs_info->extent_root;
+	struct list_head *next;
 
-	btrfs_write_dirty_block_groups(trans, extent_root);
-	while(1) {
-		old_extent_block = btrfs_root_bytenr(&extent_root->root_item);
-		if (old_extent_block == extent_root->node->start)
-			break;
-		btrfs_set_root_bytenr(&extent_root->root_item,
-				      extent_root->node->start);
-		btrfs_set_root_level(&extent_root->root_item,
-				     btrfs_header_level(extent_root->node));
-		ret = btrfs_update_root(trans, tree_root,
-					&extent_root->root_key,
-					&extent_root->root_item);
-		BUG_ON(ret);
-		btrfs_write_dirty_block_groups(trans, extent_root);
+	while(!list_empty(&fs_info->dirty_cowonly_roots)) {
+		next = fs_info->dirty_cowonly_roots.next;
+		list_del_init(next);
+		root = list_entry(next, struct btrfs_root, dirty_list);
+		update_cowonly_root(trans, root);
 	}
 	return 0;
 }
@@ -616,6 +629,7 @@
 	unsigned long timeout = 1;
 	struct btrfs_transaction *cur_trans;
 	struct btrfs_transaction *prev_trans = NULL;
+	struct btrfs_root *chunk_root = root->fs_info->chunk_root;
 	struct list_head dirty_fs_roots;
 	struct extent_io_tree *pinned_copy;
 	DEFINE_WAIT(wait);
@@ -714,6 +728,10 @@
 	btrfs_set_super_root_level(&root->fs_info->super_copy,
 			   btrfs_header_level(root->fs_info->tree_root->node));
 
+	btrfs_set_super_chunk_root(&root->fs_info->super_copy,
+				   chunk_root->node->start);
+	btrfs_set_super_chunk_root_level(&root->fs_info->super_copy,
+					 btrfs_header_level(chunk_root->node));
 	write_extent_buffer(root->fs_info->sb_buffer,
 			    &root->fs_info->super_copy, 0,
 			    sizeof(root->fs_info->super_copy));
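
The dirty_cowonly_roots machinery above replaces the old extent-root
special case: any tree that is copied on write but not reference counted
opts in by setting track_dirty at setup time, is queued by
add_root_to_dirty_list() in ctree.c whenever its root node is replaced,
and is written out by the loop in update_cowonly_root() until the
root_item points at the final block (writing the dirty block groups can
COW the root again, hence the retry).  A sketch of how a new cow-only
root would opt in, mirroring what open_ctree() does for the extent and
device roots (objectid and new_root are placeholders):

	ret = find_and_setup_root(tree_root, fs_info, objectid, new_root);
	if (ret)
		goto fail;
	/* let the commit code rewrite this root's pointer whenever
	 * its root node changes */
	new_root->track_dirty = 1;
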
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
new file mode 100644
index 0000000..90a8d45
--- /dev/null
+++ b/fs/btrfs/volumes.c
@@ -0,0 +1,852 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 021110-1307, USA.
+ */
+#include <linux/sched.h>
+#include <linux/bio.h>
+#include "ctree.h"
+#include "extent_map.h"
+#include "disk-io.h"
+#include "transaction.h"
+#include "print-tree.h"
+#include "volumes.h"
+
+struct map_lookup {
+	struct btrfs_device *dev;
+	u64 physical;
+};
+
+/*
+ * this uses a pretty simple search, the expectation is that it is
+ * called very infrequently and that a given device has a small number
+ * of extents
+ */
+static int find_free_dev_extent(struct btrfs_trans_handle *trans,
+				struct btrfs_device *device,
+				struct btrfs_path *path,
+				u64 num_bytes, u64 *start)
+{
+	struct btrfs_key key;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *dev_extent = NULL;
+	u64 hole_size = 0;
+	u64 last_byte = 0;
+	u64 search_start = 0;
+	u64 search_end = device->total_bytes;
+	int ret;
+	int slot = 0;
+	int start_found;
+	struct extent_buffer *l;
+
+	start_found = 0;
+	path->reada = 2;
+
+	/* FIXME use last free of some kind */
+
+	key.objectid = device->devid;
+	key.offset = search_start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+	ret = btrfs_previous_item(root, path, 0, key.type);
+	if (ret < 0)
+		goto error;
+	l = path->nodes[0];
+	btrfs_item_key_to_cpu(l, &key, path->slots[0]);
+	while (1) {
+		l = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(l)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+no_more_items:
+			if (!start_found) {
+				if (search_start >= search_end) {
+					ret = -ENOSPC;
+					goto error;
+				}
+				*start = search_start;
+				start_found = 1;
+				goto check_pending;
+			}
+			*start = last_byte > search_start ?
+				last_byte : search_start;
+			if (search_end <= *start) {
+				ret = -ENOSPC;
+				goto error;
+			}
+			goto check_pending;
+		}
+		btrfs_item_key_to_cpu(l, &key, slot);
+
+		if (key.objectid < device->devid)
+			goto next;
+
+		if (key.objectid > device->devid)
+			goto no_more_items;
+
+		if (key.offset >= search_start && key.offset > last_byte &&
+		    start_found) {
+			if (last_byte < search_start)
+				last_byte = search_start;
+			hole_size = key.offset - last_byte;
+			if (key.offset > last_byte &&
+			    hole_size >= num_bytes) {
+				*start = last_byte;
+				goto check_pending;
+			}
+		}
+		if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) {
+			goto next;
+		}
+
+		start_found = 1;
+		dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
+		last_byte = key.offset + btrfs_dev_extent_length(l, dev_extent);
+next:
+		path->slots[0]++;
+		cond_resched();
+	}
+check_pending:
+	/* we have to make sure we didn't find an extent that has already
+	 * been allocated by the map tree or the original allocation
+	 */
+	btrfs_release_path(root, path);
+	BUG_ON(*start < search_start);
+
+	if (*start + num_bytes >= search_end) {
+		ret = -ENOSPC;
+		goto error;
+	}
+	/* check for pending inserts here */
+	return 0;
+
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root = device->dev_root;
+	struct btrfs_dev_extent *extent;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_free_dev_extent(trans, device, path, num_bytes, start);
+	if (ret)
+		goto err;
+
+	key.objectid = device->devid;
+	key.offset = *start;
+	key.type = BTRFS_DEV_EXTENT_KEY;
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*extent));
+	BUG_ON(ret);
+
+	leaf = path->nodes[0];
+	extent = btrfs_item_ptr(leaf, path->slots[0],
+				struct btrfs_dev_extent);
+	btrfs_set_dev_extent_owner(leaf, extent, owner);
+	btrfs_set_dev_extent_length(leaf, extent, num_bytes);
+	btrfs_mark_buffer_dirty(leaf);
+err:
+	btrfs_free_path(path);
+	return ret;
+}
+
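+/*
+ * find the logical address where the next chunk should start by
+ * looking at the highest existing chunk item in the chunk tree
+ * (its objectid plus its length).
+ */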
+static int find_next_chunk(struct btrfs_root *root, u64 *objectid)
+{
+	struct btrfs_path *path;
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	path = btrfs_alloc_path();
+	BUG_ON(!path);
+
+	key.objectid = (u64)-1;
+	key.offset = (u64)-1;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, 0, BTRFS_CHUNK_ITEM_KEY);
+	if (ret) {
+		*objectid = 0;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.objectid + found_key.offset;
+	}
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
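+/*
+ * round-robin through the list of devices, starting just after
+ * 'last' and skipping the list head.
+ */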
+static struct btrfs_device *next_device(struct list_head *head,
+					struct list_head *last)
+{
+	struct list_head *next = last->next;
+	struct btrfs_device *dev;
+
+	if (list_empty(head))
+		return NULL;
+
+	if (next == head)
+		next = next->next;
+
+	dev = list_entry(next, struct btrfs_device, dev_list);
+	return dev;
+}
+
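+/*
+ * devids are handed out in increasing order.  Find the highest
+ * existing dev item and return one past it, or 1 if there are
+ * no dev items yet.
+ */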
+static int find_next_devid(struct btrfs_root *root, struct btrfs_path *path,
+			   u64 *objectid)
+{
+	int ret;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = (u64)-1;
+
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	if (ret < 0)
+		goto error;
+
+	BUG_ON(ret == 0);
+
+	ret = btrfs_previous_item(root, path, BTRFS_DEV_ITEMS_OBJECTID,
+				  BTRFS_DEV_ITEM_KEY);
+	if (ret) {
+		*objectid = 1;
+	} else {
+		btrfs_item_key_to_cpu(path->nodes[0], &found_key,
+				      path->slots[0]);
+		*objectid = found_key.offset + 1;
+	}
+	ret = 0;
+error:
+	btrfs_release_path(root, path);
+	return ret;
+}
+
+/*
+ * the device information is stored in the chunk root
+ * the btrfs_device struct should be fully filled in
+ */
+int btrfs_add_device(struct btrfs_trans_handle *trans,
+		     struct btrfs_root *root,
+		     struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	unsigned long ptr;
+	u64 free_devid;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	ret = find_next_devid(root, path, &free_devid);
+	if (ret)
+		goto out;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = free_devid;
+
+	ret = btrfs_insert_empty_item(trans, root, path, &key,
+				      sizeof(*dev_item) + device->name_len);
+	if (ret)
+		goto out;
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
+	btrfs_set_device_partition(leaf, dev_item, device->partition);
+	btrfs_set_device_name_len(leaf, dev_item, device->name_len);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	write_extent_buffer(leaf, device->name, ptr, device->name_len);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	write_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+	btrfs_mark_buffer_dirty(leaf);
+	ret = 0;
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
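+
+/*
+ * write the current in-memory fields of a btrfs_device back into
+ * its dev item in the chunk tree (total_bytes, bytes_used, etc).
+ */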
+int btrfs_update_device(struct btrfs_trans_handle *trans,
+			struct btrfs_device *device)
+{
+	int ret;
+	struct btrfs_path *path;
+	struct btrfs_root *root;
+	struct btrfs_dev_item *dev_item;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+
+	root = device->dev_root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.type = BTRFS_DEV_ITEM_KEY;
+	key.offset = device->devid;
+
+	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
+	if (ret < 0)
+		goto out;
+
+	if (ret > 0) {
+		ret = -ENOENT;
+		goto out;
+	}
+
+	leaf = path->nodes[0];
+	dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
+
+	btrfs_set_device_id(leaf, dev_item, device->devid);
+	btrfs_set_device_type(leaf, dev_item, device->type);
+	btrfs_set_device_io_align(leaf, dev_item, device->io_align);
+	btrfs_set_device_io_width(leaf, dev_item, device->io_width);
+	btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
+	btrfs_set_device_rdev(leaf, dev_item, device->rdev);
+	btrfs_set_device_partition(leaf, dev_item, device->partition);
+	btrfs_set_device_total_bytes(leaf, dev_item, device->total_bytes);
+	btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used);
+	btrfs_mark_buffer_dirty(leaf);
+
+out:
+	btrfs_free_path(path);
+	return ret;
+}
+
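+/*
+ * append a key + chunk item to the sys_chunk_array in the super block
+ * copy, so system chunks can be found before the chunk tree is read.
+ */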
+int btrfs_add_system_chunk(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root,
+			   struct btrfs_key *key,
+			   struct btrfs_chunk *chunk, int item_size)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct btrfs_disk_key disk_key;
+	u32 array_size;
+	u8 *ptr;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+	if (array_size + item_size + sizeof(disk_key)
+	    > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE)
+		return -EFBIG;
+
+	ptr = super_copy->sys_chunk_array + array_size;
+	btrfs_cpu_key_to_disk(&disk_key, key);
+	memcpy(ptr, &disk_key, sizeof(disk_key));
+	ptr += sizeof(disk_key);
+	memcpy(ptr, chunk, item_size);
+	item_size += sizeof(disk_key);
+	btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
+	return 0;
+}
+
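+/*
+ * allocate a new chunk: pick the next logical offset, grab a dev
+ * extent on the next device in the round-robin list, insert the
+ * chunk item into the chunk tree and cache the logical -> physical
+ * mapping in the extent map tree.
+ */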
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type)
+{
+	u64 dev_offset;
+	struct btrfs_root *chunk_root = extent_root->fs_info->chunk_root;
+	struct btrfs_stripe *stripes;
+	struct btrfs_device *device = NULL;
+	struct btrfs_chunk *chunk;
+	struct list_head *dev_list = &extent_root->fs_info->devices;
+	struct list_head *last_dev = extent_root->fs_info->last_device;
+	struct extent_map_tree *em_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 physical;
+	u64 calc_size = 1024 * 1024 * 1024;
+	int num_stripes;
+	int ret;
+	int index = 0;
+	struct btrfs_key key;
+
+
+	ret = find_next_chunk(chunk_root, &key.objectid);
+	if (ret)
+		return ret;
+
+	num_stripes = 1;
+	chunk = kmalloc(btrfs_chunk_item_size(num_stripes), GFP_NOFS);
+	if (!chunk)
+		return -ENOMEM;
+
+	stripes = &chunk->stripe;
+
+	*num_bytes = calc_size;
+	while(index < num_stripes) {
+		device = next_device(dev_list, last_dev);
+		BUG_ON(!device);
+		last_dev = &device->dev_list;
+		extent_root->fs_info->last_device = last_dev;
+
+		ret = btrfs_alloc_dev_extent(trans, device,
+					     key.objectid,
+					     calc_size, &dev_offset);
+		BUG_ON(ret);
+
+		device->bytes_used += calc_size;
+		ret = btrfs_update_device(trans, device);
+		BUG_ON(ret);
+
+		btrfs_set_stack_stripe_devid(stripes + index, device->devid);
+		btrfs_set_stack_stripe_offset(stripes + index, dev_offset);
+		physical = dev_offset;
+		index++;
+	}
+
+	/* key.objectid was set above */
+	key.offset = *num_bytes;
+	key.type = BTRFS_CHUNK_ITEM_KEY;
+	btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
+	btrfs_set_stack_chunk_stripe_len(chunk, 64 * 1024);
+	btrfs_set_stack_chunk_type(chunk, type);
+	btrfs_set_stack_chunk_num_stripes(chunk, num_stripes);
+	btrfs_set_stack_chunk_io_align(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_io_width(chunk, extent_root->sectorsize);
+	btrfs_set_stack_chunk_sector_size(chunk, extent_root->sectorsize);
+
+	ret = btrfs_insert_item(trans, chunk_root, &key, chunk,
+				btrfs_chunk_item_size(num_stripes));
+	BUG_ON(ret);
+	kfree(chunk);
+	*start = key.objectid;
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kmalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;
+	em->start = key.objectid;
+	em->len = key.offset;
+	em->block_start = 0;
+
+	map->physical = physical;
+	map->dev = device;
+
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+
+	em_tree = &extent_root->fs_info->mapping_tree.map_tree;
+	spin_lock(&em_tree->lock);
+	ret = add_extent_mapping(em_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&em_tree->lock);
+	free_extent_map(em);
+	return ret;
+}
+
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree)
+{
+	extent_map_tree_init(&tree->map_tree, GFP_NOFS);
+}
+
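+/*
+ * drop every cached chunk mapping, freeing the map_lookup hidden in
+ * em->bdev along with the extent_map itself.
+ */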
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree)
+{
+	struct extent_map *em;
+
+	while(1) {
+		spin_lock(&tree->map_tree.lock);
+		em = lookup_extent_mapping(&tree->map_tree, 0, (u64)-1);
+		if (em)
+			remove_extent_mapping(&tree->map_tree, em);
+		spin_unlock(&tree->map_tree.lock);
+		if (!em)
+			break;
+		kfree(em->bdev);
+		/* once for us */
+		free_extent_map(em);
+		/* once for the tree */
+		free_extent_map(em);
+	}
+}
+
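+/*
+ * translate a logical byte offset into a physical offset on a specific
+ * device using the cached chunk mappings.  *length is set to the
+ * number of bytes left in the chunk from that offset.
+ */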
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev)
+{
+	struct extent_map *em;
+	struct map_lookup *map;
+	struct extent_map_tree *em_tree = &map_tree->map_tree;
+	u64 offset;
+
+
+	spin_lock(&em_tree->lock);
+	em = lookup_extent_mapping(em_tree, logical, *length);
+	BUG_ON(!em);
+
+	BUG_ON(em->start > logical || em->start + em->len < logical);
+	map = (struct map_lookup *)em->bdev;
+	offset = logical - em->start;
+	*phys = map->physical + offset;
+	*length = em->len - offset;
+	*dev = map->dev;
+	free_extent_map(em);
+	spin_unlock(&em_tree->lock);
+	return 0;
+}
+
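+/*
+ * remap a bio that was built against logical block numbers onto the
+ * correct device and physical sector, then submit it.
+ */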
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio)
+{
+	struct btrfs_mapping_tree *map_tree;
+	struct btrfs_device *dev;
+	u64 logical = (u64)bio->bi_sector << 9;
+	u64 physical;
+	u64 length = 0;
+	u64 map_length;
+	struct bio_vec *bvec;
+	int i;
+	int ret;
+
+	bio_for_each_segment(bvec, bio, i) {
+		length += bvec->bv_len;
+	}
+	map_tree = &root->fs_info->mapping_tree;
+	map_length = length;
+	ret = btrfs_map_block(map_tree, logical, &physical, &map_length, &dev);
+	BUG_ON(map_length < length);
+	bio->bi_sector = physical >> 9;
+	bio->bi_bdev = dev->bdev;
+	submit_bio(rw, bio);
+	return 0;
+}
+
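+/* walk the per-fs device list looking for a matching devid */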
+struct btrfs_device *btrfs_find_device(struct btrfs_root *root, u64 devid)
+{
+	struct btrfs_device *dev;
+	struct list_head *cur = root->fs_info->devices.next;
+	struct list_head *head = &root->fs_info->devices;
+
+	while(cur != head) {
+		dev = list_entry(cur, struct btrfs_device, dev_list);
+		if (dev->devid == devid)
+			return dev;
+		cur = cur->next;
+	}
+	return NULL;
+}
+
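+/*
+ * turn a chunk item read off disk into an extent_map / map_lookup
+ * pair and cache it in the mapping tree, unless that logical range
+ * is already mapped.
+ */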
+static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key,
+			  struct extent_buffer *leaf,
+			  struct btrfs_chunk *chunk)
+{
+	struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree;
+	struct map_lookup *map;
+	struct extent_map *em;
+	u64 logical;
+	u64 length;
+	u64 devid;
+	int ret;
+
+	logical = key->objectid;
+	length = key->offset;
+	spin_lock(&map_tree->map_tree.lock);
+	em = lookup_extent_mapping(&map_tree->map_tree, logical, 1);
+
+	/* already mapped? */
+	if (em && em->start <= logical && em->start + em->len > logical) {
+		free_extent_map(em);
+		spin_unlock(&map_tree->map_tree.lock);
+		return 0;
+	} else if (em) {
+		free_extent_map(em);
+	}
+	spin_unlock(&map_tree->map_tree.lock);
+
+	em = alloc_extent_map(GFP_NOFS);
+	if (!em)
+		return -ENOMEM;
+	map = kzalloc(sizeof(*map), GFP_NOFS);
+	if (!map) {
+		free_extent_map(em);
+		return -ENOMEM;
+	}
+
+	em->bdev = (struct block_device *)map;
+	em->start = logical;
+	em->len = length;
+	em->block_start = 0;
+
+	map->physical = btrfs_stripe_offset_nr(leaf, chunk, 0);
+	devid = btrfs_stripe_devid_nr(leaf, chunk, 0);
+	map->dev = btrfs_find_device(root, devid);
+	if (!map->dev) {
+		kfree(map);
+		free_extent_map(em);
+		return -EIO;
+	}
+
+	spin_lock(&map_tree->map_tree.lock);
+	ret = add_extent_mapping(&map_tree->map_tree, em);
+	BUG_ON(ret);
+	spin_unlock(&map_tree->map_tree.lock);
+	free_extent_map(em);
+
+	return 0;
+}
+
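+/*
+ * copy the fields of an on-disk dev item into the in-memory
+ * struct btrfs_device, including the uuid and a NULL terminated
+ * copy of the device name.
+ */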
+static int fill_device_from_item(struct extent_buffer *leaf,
+				 struct btrfs_dev_item *dev_item,
+				 struct btrfs_device *device)
+{
+	unsigned long ptr;
+	char *name;
+
+	device->devid = btrfs_device_id(leaf, dev_item);
+	device->total_bytes = btrfs_device_total_bytes(leaf, dev_item);
+	device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
+	device->type = btrfs_device_type(leaf, dev_item);
+	device->io_align = btrfs_device_io_align(leaf, dev_item);
+	device->io_width = btrfs_device_io_width(leaf, dev_item);
+	device->sector_size = btrfs_device_sector_size(leaf, dev_item);
+	device->rdev = btrfs_device_rdev(leaf, dev_item);
+	device->partition = btrfs_device_partition(leaf, dev_item);
+	device->name_len = btrfs_device_name_len(leaf, dev_item);
+
+	ptr = (unsigned long)btrfs_device_uuid(dev_item);
+	read_extent_buffer(leaf, device->uuid, ptr, BTRFS_DEV_UUID_SIZE);
+
+	name = kmalloc(device->name_len + 1, GFP_NOFS);
+	if (!name)
+		return -ENOMEM;
+	device->name = name;
+	ptr = (unsigned long)btrfs_device_name(dev_item);
+	read_extent_buffer(leaf, name, ptr, device->name_len);
+	name[device->name_len] = '\0';
+	return 0;
+}
+
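+/*
+ * create the in-memory struct btrfs_device for a dev item found at
+ * mount time and add it to the per-fs device list, unless a device
+ * with the same devid is already present.
+ */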
+static int read_one_dev(struct btrfs_root *root, struct btrfs_key *key,
+			struct extent_buffer *leaf,
+			struct btrfs_dev_item *dev_item)
+{
+	struct btrfs_device *device;
+	u64 devid;
+	int ret;
+
+	devid = btrfs_device_id(leaf, dev_item);
+	if (btrfs_find_device(root, devid))
+		return 0;
+
+	device = kmalloc(sizeof(*device), GFP_NOFS);
+	if (!device)
+		return -ENOMEM;
+
+	ret = fill_device_from_item(leaf, dev_item, device);
+	if (ret) {
+		kfree(device);
+		return ret;
+	}
+	device->dev_root = root->fs_info->dev_root;
+	device->bdev = root->fs_info->sb->s_bdev;
+	list_add(&device->dev_list, &root->fs_info->devices);
+	memcpy(&device->dev_key, key, sizeof(*key));
+	ret = 0;
+#if 0
+	ret = btrfs_open_device(device);
+	if (ret) {
+		kfree(device);
+	}
+#endif
+	return ret;
+}
+
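+/*
+ * parse the dev items and chunk items packed into the super block's
+ * sys_chunk_array so the chunk tree itself can be mapped and read.
+ */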
+int btrfs_read_sys_array(struct btrfs_root *root)
+{
+	struct btrfs_super_block *super_copy = &root->fs_info->super_copy;
+	struct extent_buffer *sb = root->fs_info->sb_buffer;
+	struct btrfs_disk_key *disk_key;
+	struct btrfs_dev_item *dev_item;
+	struct btrfs_chunk *chunk;
+	struct btrfs_key key;
+	u32 num_stripes;
+	u32 array_size;
+	u32 len = 0;
+	u8 *ptr;
+	unsigned long sb_ptr;
+	u32 cur;
+	int ret;
+	int dev_only = 1;
+
+	array_size = btrfs_super_sys_array_size(super_copy);
+
+	/*
+	 * we do this loop twice, once for the device items and
+	 * once for all of the chunks.  This way there are device
+	 * structs filled in for every chunk
+	 */
+again:
+	ptr = super_copy->sys_chunk_array;
+	sb_ptr = offsetof(struct btrfs_super_block, sys_chunk_array);
+	cur = 0;
+
+	while (cur < array_size) {
+		disk_key = (struct btrfs_disk_key *)ptr;
+		btrfs_disk_key_to_cpu(&key, disk_key);
+
+		len = sizeof(*disk_key);
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID &&
+		    key.type == BTRFS_DEV_ITEM_KEY) {
+			dev_item = (struct btrfs_dev_item *)sb_ptr;
+			if (dev_only) {
+				ret = read_one_dev(root, &key, sb, dev_item);
+				BUG_ON(ret);
+			}
+			len = sizeof(*dev_item);
+			len += btrfs_device_name_len(sb, dev_item);
+		} else if (key.type == BTRFS_CHUNK_ITEM_KEY) {
+
+			chunk = (struct btrfs_chunk *)sb_ptr;
+			if (!dev_only) {
+				ret = read_one_chunk(root, &key, sb, chunk);
+				BUG_ON(ret);
+			}
+			num_stripes = btrfs_chunk_num_stripes(sb, chunk);
+			len = btrfs_chunk_item_size(num_stripes);
+		} else {
+			BUG();
+		}
+		ptr += len;
+		sb_ptr += len;
+		cur += len;
+	}
+	if (dev_only == 1) {
+		dev_only = 0;
+		goto again;
+	}
+	return 0;
+}
+
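+/*
+ * read every dev item and chunk item out of the chunk tree and build
+ * the in-memory device list and chunk mappings.
+ */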
+int btrfs_read_chunk_tree(struct btrfs_root *root)
+{
+	struct btrfs_path *path;
+	struct extent_buffer *leaf;
+	struct btrfs_key key;
+	struct btrfs_key found_key;
+	int ret;
+	int slot;
+
+	root = root->fs_info->chunk_root;
+
+	path = btrfs_alloc_path();
+	if (!path)
+		return -ENOMEM;
+
+	/* first we search for all of the device items, and then we
+	 * read in all of the chunk items.  This way we can create chunk
+	 * mappings that reference all of the devices that are found
+	 */
+	key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
+	key.offset = 0;
+	key.type = 0;
+again:
+	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
+	while(1) {
+		leaf = path->nodes[0];
+		slot = path->slots[0];
+		if (slot >= btrfs_header_nritems(leaf)) {
+			ret = btrfs_next_leaf(root, path);
+			if (ret == 0)
+				continue;
+			if (ret < 0)
+				goto error;
+			break;
+		}
+		btrfs_item_key_to_cpu(leaf, &found_key, slot);
+		if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+			if (found_key.objectid != BTRFS_DEV_ITEMS_OBJECTID)
+				break;
+			if (found_key.type == BTRFS_DEV_ITEM_KEY) {
+				struct btrfs_dev_item *dev_item;
+				dev_item = btrfs_item_ptr(leaf, slot,
+						  struct btrfs_dev_item);
+				ret = read_one_dev(root, &found_key, leaf,
+						   dev_item);
+				BUG_ON(ret);
+			}
+		} else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
+			struct btrfs_chunk *chunk;
+			chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
+			ret = read_one_chunk(root, &found_key, leaf, chunk);
+		}
+		path->slots[0]++;
+	}
+	if (key.objectid == BTRFS_DEV_ITEMS_OBJECTID) {
+		key.objectid = 0;
+		btrfs_release_path(root, path);
+		goto again;
+	}
+
+	ret = 0;
+error:
+	btrfs_free_path(path);
+	return ret;
+}
+
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
new file mode 100644
index 0000000..4a47dcb
--- /dev/null
+++ b/fs/btrfs/volumes.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (C) 2007 Oracle.  All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public
+ * License v2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public
+ * License along with this program; if not, write to the
+ * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
+ * Boston, MA 02111-1307, USA.
+ */
+
+#ifndef __BTRFS_VOLUMES_
+#define __BTRFS_VOLUMES_
+struct btrfs_device {
+	struct list_head dev_list;
+	struct btrfs_root *dev_root;
+	struct btrfs_key dev_key;
+
+	struct block_device *bdev;
+
+	/* the internal btrfs device id */
+	u64 devid;
+
+	/* size of the device */
+	u64 total_bytes;
+
+	/* bytes used */
+	u64 bytes_used;
+
+	/* optimal io alignment for this device */
+	u32 io_align;
+
+	/* optimal io width for this device */
+	u32 io_width;
+
+	/* minimal io size for this device */
+	u32 sector_size;
+
+	/* the kernel device number */
+	u64 rdev;
+
+	/* type and info about this device */
+	u64 type;
+
+	/* partition number, 0 for whole dev */
+	int partition;
+
+	/* length of the name data at the end of the item */
+	int name_len;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_DEV_UUID_SIZE];
+
+	char *name;
+};
+
+int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
+			   struct btrfs_device *device,
+			   u64 owner, u64 num_bytes, u64 *start);
+int btrfs_map_block(struct btrfs_mapping_tree *map_tree,
+		    u64 logical, u64 *phys, u64 *length,
+		    struct btrfs_device **dev);
+int btrfs_read_sys_array(struct btrfs_root *root);
+int btrfs_read_chunk_tree(struct btrfs_root *root);
+int btrfs_alloc_chunk(struct btrfs_trans_handle *trans,
+		      struct btrfs_root *extent_root, u64 *start,
+		      u64 *num_bytes, u32 type);
+void btrfs_mapping_init(struct btrfs_mapping_tree *tree);
+void btrfs_mapping_tree_free(struct btrfs_mapping_tree *tree);
+int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio);
+#endif