Btrfs: Add support for multiple devices per filesystem

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 92d892f..1453d99 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -40,12 +40,44 @@
 #define BTRFS_MAGIC "_B4RfS_M"
 
 #define BTRFS_MAX_LEVEL 8
+
+/* holds pointers to all of the tree roots */
 #define BTRFS_ROOT_TREE_OBJECTID 1ULL
+
+/* stores information about which extents are in use, and reference counts */
 #define BTRFS_EXTENT_TREE_OBJECTID 2ULL
+
+/* one per subvolume, storing files and directories */
 #define BTRFS_FS_TREE_OBJECTID 3ULL
+
+/* directory objectid inside the root tree */
 #define BTRFS_ROOT_TREE_DIR_OBJECTID 4ULL
+
+
+/*
+ * chunk tree stores translations from logical -> physical block numbering
+ * the super block points to the chunk tree
+ */
+#define BTRFS_CHUNK_TREE_OBJECTID 5ULL
+
+/*
+ * stores information about which areas of a given device are in use.
+ * one per device.  The tree of tree roots points to the device tree
+ */
+#define BTRFS_DEV_TREE_OBJECTID 6ULL
+
+/*
+ * All files have objectids higher than this.
+ */
 #define BTRFS_FIRST_FREE_OBJECTID 256ULL
 
+
+/*
+ * the device items go into the chunk tree.  The key is in the form
+ * [ 1 BTRFS_DEV_ITEM_KEY device_id ]
+ */
+#define BTRFS_DEV_ITEMS_OBJECTID 1ULL
+
 /*
  * we can actually store much bigger names, but lets not confuse the rest
  * of linux
@@ -95,6 +127,81 @@
 	u64 offset;
 } __attribute__ ((__packed__));
 
+struct btrfs_mapping_tree {
+	struct extent_map_tree map_tree;
+};
+
+#define BTRFS_DEV_UUID_SIZE 16
+struct btrfs_dev_item {
+	/* the internal btrfs device id */
+	__le64 devid;
+
+	/* size of the device */
+	__le64 total_bytes;
+
+	/* bytes used */
+	__le64 bytes_used;
+
+	/* optimal io alignment for this device */
+	__le32 io_align;
+
+	/* optimal io width for this device */
+	__le32 io_width;
+
+	/* minimal io size for this device */
+	__le32 sector_size;
+
+	/* the kernel device number */
+	__le64 rdev;
+
+	/* type and info about this device */
+	__le64 type;
+
+	/* partition number, 0 for whole dev */
+	__le32 partition;
+
+	/* length of the name data at the end of the item */
+	__le16 name_len;
+
+	/* physical drive uuid (or lvm uuid) */
+	u8 uuid[BTRFS_DEV_UUID_SIZE];
+	/* name goes here */
+} __attribute__ ((__packed__));
+
+struct btrfs_stripe {
+	__le64 devid;
+	__le64 offset;
+} __attribute__ ((__packed__));
+
+struct btrfs_chunk {
+	__le64 owner;
+	__le64 stripe_len;
+	__le64 type;
+
+	/* optimal io alignment for this chunk */
+	__le32 io_align;
+
+	/* optimal io width for this chunk */
+	__le32 io_width;
+
+	/* minimal io size for this chunk */
+	__le32 sector_size;
+
+	/* 2^16 stripes is quite a lot, a second limit is the size of a single
+	 * item in the btree
+	 */
+	__le16 num_stripes;
+	struct btrfs_stripe stripe;
+	/* additional stripes go here */
+} __attribute__ ((__packed__));
+
+static inline unsigned long btrfs_chunk_item_size(int num_stripes)
+{
+	BUG_ON(num_stripes == 0);
+	return sizeof(struct btrfs_chunk) +
+		sizeof(struct btrfs_stripe) * (num_stripes - 1);
+}
+
 #define BTRFS_FSID_SIZE 16
 /*
  * every tree block (leaf or node) starts with this header.
@@ -119,6 +226,13 @@
 					sizeof(struct btrfs_item) - \
 					sizeof(struct btrfs_file_extent_item))
 
+
+/*
+ * this is a very generous portion of the super block, giving us
+ * room to translate 14 chunks with 3 stripes each.
+ */
+#define BTRFS_SYSTEM_CHUNK_ARRAY_SIZE 2048
+
 /*
  * the super block basically lists the main trees of the FS
  * it currently lacks any block count etc etc
@@ -131,6 +245,7 @@
 	__le64 magic;
 	__le64 generation;
 	__le64 root;
+	__le64 chunk_root;
 	__le64 total_bytes;
 	__le64 bytes_used;
 	__le64 root_dir_objectid;
@@ -138,7 +253,10 @@
 	__le32 nodesize;
 	__le32 leafsize;
 	__le32 stripesize;
+	__le32 sys_chunk_array_size;
 	u8 root_level;
+	u8 chunk_root_level;
+	u8 sys_chunk_array[BTRFS_SYSTEM_CHUNK_ARRAY_SIZE];
 } __attribute__ ((__packed__));
 
 /*
@@ -208,12 +326,22 @@
 	__le64 offset;
 } __attribute__ ((__packed__));
 
+/* dev extents record free space on individual devices.  The owner
+ * field points back to the chunk allocation mapping tree that allocated
+ * the extent
+ */
+struct btrfs_dev_extent {
+	__le64 owner;
+	__le64 length;
+} __attribute__ ((__packed__));
+
+
 struct btrfs_inode_ref {
 	__le16 name_len;
 	/* name goes here */
 } __attribute__ ((__packed__));
 
-struct btrfs_inode_timespec {
+struct btrfs_timespec {
 	__le64 sec;
 	__le32 nsec;
 } __attribute__ ((__packed__));
@@ -231,13 +359,13 @@
 	__le32 uid;
 	__le32 gid;
 	__le32 mode;
-	__le32 rdev;
+	__le64 rdev;
 	__le16 flags;
 	__le16 compat_flags;
-	struct btrfs_inode_timespec atime;
-	struct btrfs_inode_timespec ctime;
-	struct btrfs_inode_timespec mtime;
-	struct btrfs_inode_timespec otime;
+	struct btrfs_timespec atime;
+	struct btrfs_timespec ctime;
+	struct btrfs_timespec mtime;
+	struct btrfs_timespec otime;
 } __attribute__ ((__packed__));
 
 struct btrfs_dir_item {
@@ -290,29 +418,34 @@
 	u8 csum;
 } __attribute__ ((__packed__));
 
-/* tag for the radix tree of block groups in ram */
-#define BTRFS_BLOCK_GROUP_SIZE (256 * 1024 * 1024)
+/* different types of block groups (and chunks) */
+#define BTRFS_BLOCK_GROUP_DATA     (1 << 0)
+#define BTRFS_BLOCK_GROUP_SYSTEM   (1 << 1)
+#define BTRFS_BLOCK_GROUP_METADATA (1 << 2)
 
 
-#define BTRFS_BLOCK_GROUP_DATA 1
-#define BTRFS_BLOCK_GROUP_MIXED 2
-
 struct btrfs_block_group_item {
 	__le64 used;
-	u8 flags;
+	__le64 chunk_tree;
+	__le64 chunk_objectid;
+	__le64 flags;
 } __attribute__ ((__packed__));
 
 struct btrfs_block_group_cache {
 	struct btrfs_key key;
 	struct btrfs_block_group_item item;
-	int data;
-	int cached;
 	u64 pinned;
+	u64 flags;
+	int cached;
 };
+
+struct btrfs_device;
 struct btrfs_fs_info {
 	u8 fsid[BTRFS_FSID_SIZE];
 	struct btrfs_root *extent_root;
 	struct btrfs_root *tree_root;
+	struct btrfs_root *chunk_root;
+	struct btrfs_root *dev_root;
 	struct radix_tree_root fs_roots_radix;
 
 	struct extent_io_tree free_space_cache;
@@ -321,6 +454,9 @@
 	struct extent_io_tree pending_del;
 	struct extent_io_tree extent_ins;
 
+	/* logical->physical extent mapping */
+	struct btrfs_mapping_tree mapping_tree;
+
 	u64 generation;
 	u64 last_trans_committed;
 	unsigned long mount_opt;
@@ -330,6 +466,7 @@
 	struct btrfs_transaction *running_transaction;
 	struct btrfs_super_block super_copy;
 	struct extent_buffer *sb_buffer;
+	struct block_device *__bdev;
 	struct super_block *sb;
 	struct inode *btree_inode;
 	spinlock_t hash_lock;
@@ -350,12 +487,17 @@
 	unsigned long throttles;
 
 	u64 total_pinned;
+	struct list_head dirty_cowonly_roots;
+
+	struct list_head devices;
+	struct list_head *last_device;
 	spinlock_t delalloc_lock;
 	spinlock_t new_trans_lock;
 	u64 delalloc_bytes;
 	u64 last_alloc;
 	u64 last_data_alloc;
 };
+
 /*
  * in ram representation of the tree.  extent_root is used for all allocations
  * and for the extent tree extent_root root.
@@ -387,14 +529,19 @@
 	u64 highest_inode;
 	u64 last_inode_alloc;
 	int ref_cows;
+	int track_dirty;
 	struct btrfs_key defrag_progress;
 	int defrag_running;
 	int defrag_level;
 	char *name;
 	int in_sysfs;
+
+	/* the dirty list is only used by non-reference counted roots */
+	struct list_head dirty_list;
 };
 
 /*
+
  * inode items have the data typically returned from stat and store other
  * info about object characteristics.  There is one for every file and dir in
  * the FS
@@ -439,6 +586,10 @@
  */
 #define BTRFS_BLOCK_GROUP_ITEM_KEY 50
 
+#define BTRFS_DEV_EXTENT_KEY	75
+#define BTRFS_DEV_ITEM_KEY	76
+#define BTRFS_CHUNK_ITEM_KEY	77
+
 /*
  * string items are for debugging.  They just store a short string of
  * data in the FS
@@ -518,13 +669,104 @@
 	s->member = cpu_to_le##bits(val);				\
 }
 
+BTRFS_SETGET_FUNCS(device_type, struct btrfs_dev_item, type, 64);
+BTRFS_SETGET_FUNCS(device_total_bytes, struct btrfs_dev_item, total_bytes, 64);
+BTRFS_SETGET_FUNCS(device_bytes_used, struct btrfs_dev_item, bytes_used, 64);
+BTRFS_SETGET_FUNCS(device_io_align, struct btrfs_dev_item, io_align, 32);
+BTRFS_SETGET_FUNCS(device_io_width, struct btrfs_dev_item, io_width, 32);
+BTRFS_SETGET_FUNCS(device_sector_size, struct btrfs_dev_item, sector_size, 32);
+BTRFS_SETGET_FUNCS(device_id, struct btrfs_dev_item, devid, 64);
+BTRFS_SETGET_FUNCS(device_rdev, struct btrfs_dev_item, rdev, 64);
+BTRFS_SETGET_FUNCS(device_partition, struct btrfs_dev_item, partition, 32);
+BTRFS_SETGET_FUNCS(device_name_len, struct btrfs_dev_item, name_len, 16);
+
+static inline char *btrfs_device_uuid(struct btrfs_dev_item *d)
+{
+	return (char *)d + offsetof(struct btrfs_dev_item, uuid);
+}
+
+static inline char *btrfs_device_name(struct btrfs_dev_item *d)
+{
+	return (char *)(d + 1);
+}
+
+BTRFS_SETGET_FUNCS(chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_FUNCS(chunk_stripe_len, struct btrfs_chunk, stripe_len, 64);
+BTRFS_SETGET_FUNCS(chunk_io_align, struct btrfs_chunk, io_align, 32);
+BTRFS_SETGET_FUNCS(chunk_io_width, struct btrfs_chunk, io_width, 32);
+BTRFS_SETGET_FUNCS(chunk_sector_size, struct btrfs_chunk, sector_size, 32);
+BTRFS_SETGET_FUNCS(chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_FUNCS(chunk_num_stripes, struct btrfs_chunk, num_stripes, 16);
+BTRFS_SETGET_FUNCS(stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_FUNCS(stripe_offset, struct btrfs_stripe, offset, 64);
+
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_owner, struct btrfs_chunk, owner, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_stripe_len, struct btrfs_chunk,
+			 stripe_len, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_align, struct btrfs_chunk,
+			 io_align, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_io_width, struct btrfs_chunk,
+			 io_width, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_sector_size, struct btrfs_chunk,
+			 sector_size, 32);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_type, struct btrfs_chunk, type, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_chunk_num_stripes, struct btrfs_chunk,
+			 num_stripes, 16);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_devid, struct btrfs_stripe, devid, 64);
+BTRFS_SETGET_STACK_FUNCS(stack_stripe_offset, struct btrfs_stripe, offset, 64);
+
+static inline struct btrfs_stripe *btrfs_stripe_nr(struct btrfs_chunk *c,
+						   int nr)
+{
+	unsigned long offset = (unsigned long)c;
+	offset += offsetof(struct btrfs_chunk, stripe);
+	offset += nr * sizeof(struct btrfs_stripe);
+	return (struct btrfs_stripe *)offset;
+}
+
+static inline u64 btrfs_stripe_offset_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_offset(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_offset_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_offset(eb, btrfs_stripe_nr(c, nr), val);
+}
+
+static inline u64 btrfs_stripe_devid_nr(struct extent_buffer *eb,
+					 struct btrfs_chunk *c, int nr)
+{
+	return btrfs_stripe_devid(eb, btrfs_stripe_nr(c, nr));
+}
+
+static inline void btrfs_set_stripe_devid_nr(struct extent_buffer *eb,
+					     struct btrfs_chunk *c, int nr,
+					     u64 val)
+{
+	btrfs_set_stripe_devid(eb, btrfs_stripe_nr(c, nr), val);
+}
+
 /* struct btrfs_block_group_item */
 BTRFS_SETGET_STACK_FUNCS(block_group_used, struct btrfs_block_group_item,
 			 used, 64);
 BTRFS_SETGET_FUNCS(disk_block_group_used, struct btrfs_block_group_item,
 			 used, 64);
-BTRFS_SETGET_FUNCS(disk_block_group_flags, struct btrfs_block_group_item,
-		   flags, 8);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_tree, struct btrfs_block_group_item,
+			 chunk_tree, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_chunk_objectid,
+			struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_chunk_objecitd,
+		   struct btrfs_block_group_item, chunk_objectid, 64);
+BTRFS_SETGET_FUNCS(disk_block_group_flags,
+		   struct btrfs_block_group_item, flags, 64);
+BTRFS_SETGET_STACK_FUNCS(block_group_flags,
+			struct btrfs_block_group_item, flags, 64);
 
 /* struct btrfs_inode_ref */
 BTRFS_SETGET_FUNCS(inode_ref_name_len, struct btrfs_inode_ref, name_len, 16);
@@ -538,49 +780,53 @@
 BTRFS_SETGET_FUNCS(inode_uid, struct btrfs_inode_item, uid, 32);
 BTRFS_SETGET_FUNCS(inode_gid, struct btrfs_inode_item, gid, 32);
 BTRFS_SETGET_FUNCS(inode_mode, struct btrfs_inode_item, mode, 32);
-BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 32);
+BTRFS_SETGET_FUNCS(inode_rdev, struct btrfs_inode_item, rdev, 64);
 BTRFS_SETGET_FUNCS(inode_flags, struct btrfs_inode_item, flags, 16);
 BTRFS_SETGET_FUNCS(inode_compat_flags, struct btrfs_inode_item,
 		   compat_flags, 16);
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_atime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, atime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_mtime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, mtime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_ctime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, ctime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-static inline struct btrfs_inode_timespec *
+static inline struct btrfs_timespec *
 btrfs_inode_otime(struct btrfs_inode_item *inode_item)
 {
 	unsigned long ptr = (unsigned long)inode_item;
 	ptr += offsetof(struct btrfs_inode_item, otime);
-	return (struct btrfs_inode_timespec *)ptr;
+	return (struct btrfs_timespec *)ptr;
 }
 
-BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_inode_timespec, sec, 64);
-BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_inode_timespec, nsec, 32);
+BTRFS_SETGET_FUNCS(timespec_sec, struct btrfs_timespec, sec, 64);
+BTRFS_SETGET_FUNCS(timespec_nsec, struct btrfs_timespec, nsec, 32);
 
 /* struct btrfs_extent_item */
 BTRFS_SETGET_FUNCS(extent_refs, struct btrfs_extent_item, refs, 32);
 
+/* struct btrfs_dev_extent */
+BTRFS_SETGET_FUNCS(dev_extent_owner, struct btrfs_dev_extent, owner, 64);
+BTRFS_SETGET_FUNCS(dev_extent_length, struct btrfs_dev_extent, length, 64);
+
 /* struct btrfs_extent_ref */
 BTRFS_SETGET_FUNCS(ref_root, struct btrfs_extent_ref, root, 64);
 BTRFS_SETGET_FUNCS(ref_generation, struct btrfs_extent_ref, generation, 64);
@@ -846,8 +1092,14 @@
 BTRFS_SETGET_STACK_FUNCS(super_generation, struct btrfs_super_block,
 			 generation, 64);
 BTRFS_SETGET_STACK_FUNCS(super_root, struct btrfs_super_block, root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_sys_array_size,
+			 struct btrfs_super_block, sys_chunk_array_size, 32);
 BTRFS_SETGET_STACK_FUNCS(super_root_level, struct btrfs_super_block,
 			 root_level, 8);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root, struct btrfs_super_block,
+			 chunk_root, 64);
+BTRFS_SETGET_STACK_FUNCS(super_chunk_root_level, struct btrfs_super_block,
+			 chunk_root_level, 64);
 BTRFS_SETGET_STACK_FUNCS(super_total_bytes, struct btrfs_super_block,
 			 total_bytes, 64);
 BTRFS_SETGET_STACK_FUNCS(super_bytes_used, struct btrfs_super_block,
@@ -1009,7 +1261,14 @@
 				    struct btrfs_root *root);
 int btrfs_free_block_groups(struct btrfs_fs_info *info);
 int btrfs_read_block_groups(struct btrfs_root *root);
+int btrfs_make_block_group(struct btrfs_trans_handle *trans,
+			   struct btrfs_root *root, u64 bytes_used,
+			   u64 type, u64 chunk_tree, u64 chunk_objectid,
+			   u64 size);
 /* ctree.c */
+int btrfs_previous_item(struct btrfs_root *root,
+			struct btrfs_path *path, u64 min_objectid,
+			int type);
 int btrfs_cow_block(struct btrfs_trans_handle *trans,
 		    struct btrfs_root *root, struct extent_buffer *buf,
 		    struct extent_buffer *parent, int parent_slot,