Btrfs: Add backing store, memory management

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index 9f84c08..6336021 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -1,7 +1,12 @@
 
-ctree: ctree.o
-	gcc -g -O2 -Wall -o ctree ctree.c
+CFLAGS= -g -Wall
+
+.c.o:
+	$(CC) $(CFLAGS) -c $<
+
+ctree: ctree.o disk-io.h ctree.h disk-io.o radix-tree.o radix-tree.h
+	gcc $(CFLAGS) -o ctree ctree.o disk-io.o radix-tree.o
 
 clean:
-	rm ctree ctree.o
+	rm ctree *.o
 
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 4bf5e92..6f0522f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -1,68 +1,25 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include "kerncompat.h"
-
-#define BLOCKSIZE 4096
-
-struct key {
-	u64 objectid;
-	u32 flags;
-	u64 offset;
-} __attribute__ ((__packed__));
-
-struct header {
-	u64 fsid[2]; /* FS specific uuid */
-	u64 blocknum;
-	u64 parentid;
-	u32 csum;
-	u32 ham;
-	u16 nritems;
-	u16 flags;
-} __attribute__ ((__packed__));
-
-#define NODEPTRS_PER_BLOCK ((BLOCKSIZE - sizeof(struct header)) / \
-			    (sizeof(struct key) + sizeof(u64)))
-
-#define LEVEL_BITS 3
-#define MAX_LEVEL (1 << LEVEL_BITS)
-#define node_level(f) ((f) & (MAX_LEVEL-1))
-#define is_leaf(f) (node_level(f) == 0)
-
-struct ctree_root {
-	struct node *node;
-};
-
-struct item {
-	struct key key;
-	u16 offset;
-	u16 size;
-} __attribute__ ((__packed__));
-
-#define LEAF_DATA_SIZE (BLOCKSIZE - sizeof(struct header))
-struct leaf {
-	struct header header;
-	union {
-		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
-		u8 data[BLOCKSIZE-sizeof(struct header)];
-	};
-} __attribute__ ((__packed__));
-
-struct node {
-	struct header header;
-	struct key keys[NODEPTRS_PER_BLOCK];
-	u64 blockptrs[NODEPTRS_PER_BLOCK];
-} __attribute__ ((__packed__));
-
-struct ctree_path {
-	struct node *nodes[MAX_LEVEL];
-	int slots[MAX_LEVEL];
-};
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
 
 static inline void init_path(struct ctree_path *p)
 {
 	memset(p, 0, sizeof(*p));
 }
 
+static void release_path(struct ctree_root *root, struct ctree_path *p)
+{
+	int i;
+	for (i = 0; i < MAX_LEVEL; i++) {
+		if (!p->nodes[i])
+			break;
+		tree_block_release(root, p->nodes[i]);
+	}
+}
+
 static inline unsigned int leaf_data_end(struct leaf *leaf)
 {
 	unsigned int nr = leaf->header.nritems;
@@ -135,26 +92,25 @@
 	return -1;
 }
 
-void *read_block(u64 blocknum)
-{
-	return (void *)blocknum;
-}
-
 int search_slot(struct ctree_root *root, struct key *key, struct ctree_path *p)
 {
-	struct node *c = root->node;
+	struct tree_buffer *b = root->node;
+	struct node *c;
+
 	int slot;
 	int ret;
 	int level;
-	while (c) {
+	b->count++;
+	while (b) {
+		c = &b->node;
 		level = node_level(c->header.flags);
-		p->nodes[level] = c;
+		p->nodes[level] = b;
 		ret = bin_search(c, key, &slot);
 		if (!is_leaf(c->header.flags)) {
 			if (ret && slot > 0)
 				slot -= 1;
 			p->slots[level] = slot;
-			c = read_block(c->blockptrs[slot]);
+			b = read_tree_block(root, c->blockptrs[slot]);
 			continue;
 		} else {
 			p->slots[level] = slot;
@@ -164,17 +120,20 @@
 	return -1;
 }
 
-static void fixup_low_keys(struct ctree_path *path, struct key *key,
-			     int level)
+static void fixup_low_keys(struct ctree_root *root,
+			   struct ctree_path *path, struct key *key,
+			   int level)
 {
 	int i;
 	/* adjust the pointers going up the tree */
 	for (i = level; i < MAX_LEVEL; i++) {
-		struct node *t = path->nodes[i];
+		struct node *t;
 		int tslot = path->slots[i];
-		if (!t)
+		if (!path->nodes[i])
 			break;
+		t = &path->nodes[i]->node;
 		memcpy(t->keys + tslot, key, sizeof(*key));
+		write_tree_block(root, path->nodes[i]);
 		if (tslot != 0)
 			break;
 	}
@@ -190,27 +149,34 @@
 	int nritems;
 	/* need a new root */
 	if (!path->nodes[level]) {
-		c = malloc(sizeof(struct node));
+		struct tree_buffer *t;
+		t = alloc_free_block(root);
+		c = &t->node;
 		memset(c, 0, sizeof(c));
 		c->header.nritems = 2;
 		c->header.flags = node_level(level);
-		lower = path->nodes[level-1];
+		c->header.blocknr = t->blocknr;
+		lower = &path->nodes[level-1]->node;
 		if (is_leaf(lower->header.flags))
 			lower_key = &((struct leaf *)lower)->items[0].key;
 		else
 			lower_key = lower->keys;
 		memcpy(c->keys, lower_key, sizeof(struct key));
 		memcpy(c->keys + 1, key, sizeof(struct key));
-		c->blockptrs[0] = (u64)lower;
+		c->blockptrs[0] = path->nodes[level-1]->blocknr;
 		c->blockptrs[1] = blocknr;
-		root->node = c;
-		path->nodes[level] = c;
+		/* the path has an extra ref to root->node */
+		tree_block_release(root, root->node);
+		root->node = t;
+		t->count++;
+		write_tree_block(root, t);
+		path->nodes[level] = t;
 		path->slots[level] = 0;
 		if (c->keys[1].objectid == 0)
 			BUG();
 		return 0;
 	}
-	lower = path->nodes[level];
+	lower = &path->nodes[level]->node;
 	nritems = lower->header.nritems;
 	if (slot > nritems)
 		BUG();
@@ -227,6 +193,7 @@
 	lower->header.nritems++;
 	if (lower->keys[1].objectid == 0)
 			BUG();
+	write_tree_block(root, path->nodes[level]);
 	return 0;
 }
 
@@ -238,6 +205,8 @@
 	int push_items = 0;
 	int left_nritems;
 	int right_nritems;
+	struct tree_buffer *t;
+	struct tree_buffer *right_buf;
 
 	if (level == MAX_LEVEL - 1 || path->nodes[level + 1] == 0)
 		return 1;
@@ -245,13 +214,18 @@
 	if (slot == 0)
 		return 1;
 
-	left = read_block(path->nodes[level + 1]->blockptrs[slot - 1]);
-	right = path->nodes[level];
+	t = read_tree_block(root,
+		            path->nodes[level + 1]->node.blockptrs[slot - 1]);
+	left = &t->node;
+	right_buf = path->nodes[level];
+	right = &right_buf->node;
 	left_nritems = left->header.nritems;
 	right_nritems = right->header.nritems;
 	push_items = NODEPTRS_PER_BLOCK - (left_nritems + 1);
-	if (push_items <= 0)
+	if (push_items <= 0) {
+		tree_block_release(root, t);
 		return 1;
+	}
 
 	if (right_nritems < push_items)
 		push_items = right_nritems;
@@ -267,15 +241,20 @@
 	left->header.nritems += push_items;
 
 	/* adjust the pointers going up the tree */
-	fixup_low_keys(path, right->keys, level + 1);
+	fixup_low_keys(root, path, right->keys, level + 1);
+
+	write_tree_block(root, t);
+	write_tree_block(root, right_buf);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[level] < push_items) {
 		path->slots[level] += left_nritems;
-		path->nodes[level] = (struct node*)left;
+		tree_block_release(root, path->nodes[level]);
+		path->nodes[level] = t;
 		path->slots[level + 1] -= 1;
 	} else {
 		path->slots[level] -= push_items;
+		tree_block_release(root, t);
 	}
 	return 0;
 }
@@ -283,6 +262,8 @@
 int push_node_right(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
+	struct tree_buffer *t;
+	struct tree_buffer *src_buffer;
 	struct node *dst;
 	struct node *src;
 	int push_items = 0;
@@ -295,16 +276,21 @@
 	if (slot == NODEPTRS_PER_BLOCK - 1)
 		return 1;
 
-	if (slot >= path->nodes[level + 1]->header.nritems -1)
+	if (slot >= path->nodes[level + 1]->node.header.nritems -1)
 		return 1;
 
-	dst = read_block(path->nodes[level + 1]->blockptrs[slot + 1]);
-	src = path->nodes[level];
+	t = read_tree_block(root,
+			    path->nodes[level + 1]->node.blockptrs[slot + 1]);
+	dst = &t->node;
+	src_buffer = path->nodes[level];
+	src = &src_buffer->node;
 	dst_nritems = dst->header.nritems;
 	src_nritems = src->header.nritems;
 	push_items = NODEPTRS_PER_BLOCK - (dst_nritems + 1);
-	if (push_items <= 0)
+	if (push_items <= 0) {
+		tree_block_release(root, t);
 		return 1;
+	}
 
 	if (src_nritems < push_items)
 		push_items = src_nritems;
@@ -322,13 +308,21 @@
 	dst->header.nritems += push_items;
 
 	/* adjust the pointers going up the tree */
-	memcpy(path->nodes[level + 1]->keys + path->slots[level + 1] + 1,
+	memcpy(path->nodes[level + 1]->node.keys + path->slots[level + 1] + 1,
 		dst->keys, sizeof(struct key));
+
+	write_tree_block(root, path->nodes[level + 1]);
+	write_tree_block(root, t);
+	write_tree_block(root, src_buffer);
+
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[level] >= src->header.nritems) {
 		path->slots[level] -= src->header.nritems;
-		path->nodes[level] = (struct node*)dst;
+		tree_block_release(root, path->nodes[level]);
+		path->nodes[level] = t;
 		path->slots[level + 1] += 1;
+	} else {
+		tree_block_release(root, t);
 	}
 	return 0;
 }
@@ -337,15 +331,18 @@
 		struct ctree_path *path, struct key *key,
 		u64 blocknr, int level)
 {
-	struct node *c = path->nodes[level];
+	struct tree_buffer *t = path->nodes[level];
+	struct node *c = &path->nodes[level]->node;
 	struct node *b;
-	struct node *bal[MAX_LEVEL];
+	struct tree_buffer *b_buffer;
+	struct tree_buffer *bal[MAX_LEVEL];
 	int bal_level = level;
 	int mid;
 	int bal_start = -1;
 
 	memset(bal, 0, ARRAY_SIZE(bal));
-	while(c && c->header.nritems == NODEPTRS_PER_BLOCK) {
+	while(t && t->node.header.nritems == NODEPTRS_PER_BLOCK) {
+		c = &t->node;
 		if (push_node_left(root, path,
 		   node_level(c->header.flags)) == 0)
 			break;
@@ -355,8 +352,10 @@
 		bal_start = bal_level;
 		if (bal_level == MAX_LEVEL - 1)
 			BUG();
-		b = malloc(sizeof(struct node));
+		b_buffer = alloc_free_block(root);
+		b = &b_buffer->node;
 		b->header.flags = c->header.flags;
+		b->header.blocknr = b_buffer->blocknr;
 		mid = (c->header.nritems + 1) / 2;
 		memcpy(b->keys, c->keys + mid,
 			(c->header.nritems - mid) * sizeof(struct key));
@@ -364,21 +363,28 @@
 			(c->header.nritems - mid) * sizeof(u64));
 		b->header.nritems = c->header.nritems - mid;
 		c->header.nritems = mid;
-		bal[bal_level] = b;
+
+		write_tree_block(root, t);
+		write_tree_block(root, b_buffer);
+
+		bal[bal_level] = b_buffer;
 		if (bal_level == MAX_LEVEL - 1)
 			break;
 		bal_level += 1;
-		c = path->nodes[bal_level];
+		t = path->nodes[bal_level];
 	}
 	while(bal_start > 0) {
-		b = bal[bal_start];
-		c = path->nodes[bal_start];
-		__insert_ptr(root, path, b->keys, (u64)b,
+		b_buffer = bal[bal_start];
+		c = &path->nodes[bal_start]->node;
+		__insert_ptr(root, path, b_buffer->node.keys, b_buffer->blocknr,
 				path->slots[bal_start + 1] + 1, bal_start + 1);
 		if (path->slots[bal_start] >= c->header.nritems) {
 			path->slots[bal_start] -= c->header.nritems;
-			path->nodes[bal_start] = b;
+			tree_block_release(root, path->nodes[bal_start]);
+			path->nodes[bal_start] = b_buffer;
 			path->slots[bal_start + 1] += 1;
+		} else {
+			tree_block_release(root, b_buffer);
 		}
 		bal_start--;
 		if (!bal[bal_start])
@@ -404,7 +410,9 @@
 int push_leaf_left(struct ctree_root *root, struct ctree_path *path,
 		   int data_size)
 {
-	struct leaf *right = (struct leaf *)path->nodes[0];
+	struct tree_buffer *right_buf = path->nodes[0];
+	struct leaf *right = &right_buf->leaf;
+	struct tree_buffer *t;
 	struct leaf *left;
 	int slot;
 	int i;
@@ -421,9 +429,11 @@
 	if (!path->nodes[1]) {
 		return 1;
 	}
-	left = read_block(path->nodes[1]->blockptrs[slot - 1]);
+	t = read_tree_block(root, path->nodes[1]->node.blockptrs[slot - 1]);
+	left = &t->leaf;
 	free_space = leaf_free_space(left);
 	if (free_space < data_size + sizeof(struct item)) {
+		tree_block_release(root, t);
 		return 1;
 	}
 	for (i = 0; i < right->header.nritems; i++) {
@@ -436,6 +446,7 @@
 		push_space += item->size + sizeof(*item);
 	}
 	if (push_items == 0) {
+		tree_block_release(root, t);
 		return 1;
 	}
 	/* push data from right to left */
@@ -446,6 +457,8 @@
 		right->data + right->items[push_items - 1].offset,
 		push_space);
 	old_left_nritems = left->header.nritems;
+	BUG_ON(old_left_nritems < 0);
+
 	for(i = old_left_nritems; i < old_left_nritems + push_items; i++) {
 		left->items[i].offset -= LEAF_DATA_SIZE -
 			left->items[old_left_nritems -1].offset;
@@ -460,30 +473,40 @@
 		(right->header.nritems - push_items) * sizeof(struct item));
 	right->header.nritems -= push_items;
 	push_space = LEAF_DATA_SIZE;
+
 	for (i = 0; i < right->header.nritems; i++) {
 		right->items[i].offset = push_space - right->items[i].size;
 		push_space = right->items[i].offset;
 	}
-	fixup_low_keys(path, &right->items[0].key, 1);
+
+	write_tree_block(root, t);
+	write_tree_block(root, right_buf);
+
+	fixup_low_keys(root, path, &right->items[0].key, 1);
 
 	/* then fixup the leaf pointer in the path */
 	if (path->slots[0] < push_items) {
 		path->slots[0] += old_left_nritems;
-		path->nodes[0] = (struct node*)left;
+		tree_block_release(root, path->nodes[0]);
+		path->nodes[0] = t;
 		path->slots[1] -= 1;
 	} else {
+		tree_block_release(root, t);
 		path->slots[0] -= push_items;
 	}
+	BUG_ON(path->slots[0] < 0);
 	return 0;
 }
 
 int split_leaf(struct ctree_root *root, struct ctree_path *path, int data_size)
 {
-	struct leaf *l = (struct leaf *)path->nodes[0];
-	int nritems = l->header.nritems;
-	int mid = (nritems + 1)/ 2;
-	int slot = path->slots[0];
+	struct tree_buffer *l_buf = path->nodes[0];
+	struct leaf *l = &l_buf->leaf;
+	int nritems;
+	int mid;
+	int slot;
 	struct leaf *right;
+	struct tree_buffer *right_buffer;
 	int space_needed = data_size + sizeof(struct item);
 	int data_copy_size;
 	int rt_data_off;
@@ -491,9 +514,19 @@
 	int ret;
 
 	if (push_leaf_left(root, path, data_size) == 0) {
-		return 0;
+		l_buf = path->nodes[0];
+		l = &l_buf->leaf;
+		if (leaf_free_space(l) >= sizeof(struct item) + data_size)
+			return 0;
 	}
-	right = malloc(sizeof(struct leaf));
+	slot = path->slots[0];
+	nritems = l->header.nritems;
+	mid = (nritems + 1)/ 2;
+
+	right_buffer = alloc_free_block(root);
+	BUG_ON(!right_buffer);
+	BUG_ON(mid == nritems);
+	right = &right_buffer->leaf;
 	memset(right, 0, sizeof(*right));
 	if (mid <= slot) {
 		if (leaf_space_used(l, mid, nritems - mid) + space_needed >
@@ -505,6 +538,8 @@
 			BUG();
 	}
 	right->header.nritems = nritems - mid;
+	right->header.blocknr = right_buffer->blocknr;
+	right->header.flags = node_level(0);
 	data_copy_size = l->items[mid].offset + l->items[mid].size -
 			 leaf_data_end(l);
 	memcpy(right->items, l->items + mid,
@@ -518,12 +553,20 @@
 	}
 	l->header.nritems = mid;
 	ret = insert_ptr(root, path, &right->items[0].key,
-			  (u64)right, 1);
+			  right_buffer->blocknr, 1);
+
+	write_tree_block(root, right_buffer);
+	write_tree_block(root, l_buf);
+
+	BUG_ON(path->slots[0] != slot);
 	if (mid <= slot) {
-		path->nodes[0] = (struct node *)right;
+		tree_block_release(root, path->nodes[0]);
+		path->nodes[0] = right_buffer;
 		path->slots[0] -= mid;
 		path->slots[1] += 1;
-	}
+	} else
+		tree_block_release(root, right_buffer);
+	BUG_ON(path->slots[0] < 0);
 	return ret;
 }
 
@@ -532,28 +575,48 @@
 {
 	int ret;
 	int slot;
+	int slot_orig;
 	struct leaf *leaf;
+	struct tree_buffer *leaf_buf;
 	unsigned int nritems;
 	unsigned int data_end;
 	struct ctree_path path;
 
+	if (!root->node) {
+		struct tree_buffer *t;
+		t = alloc_free_block(root);
+		BUG_ON(!t);
+		t->node.header.nritems = 0;
+		t->node.header.flags = node_level(0);
+		t->node.header.blocknr = t->blocknr;
+		root->node = t;
+		write_tree_block(root, t);
+	}
 	init_path(&path);
 	ret = search_slot(root, key, &path);
-	if (ret == 0)
+	if (ret == 0) {
+		release_path(root, &path);
 		return -EEXIST;
+	}
 
-	leaf = (struct leaf *)path.nodes[0];
-	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
+	slot_orig = path.slots[0];
+	leaf_buf = path.nodes[0];
+	leaf = &leaf_buf->leaf;
+	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size) {
 		split_leaf(root, &path, data_size);
-	leaf = (struct leaf *)path.nodes[0];
+		leaf_buf = path.nodes[0];
+		leaf = &path.nodes[0]->leaf;
+	}
 	nritems = leaf->header.nritems;
 	data_end = leaf_data_end(leaf);
+
 	if (leaf_free_space(leaf) <  sizeof(struct item) + data_size)
 		BUG();
 
 	slot = path.slots[0];
+	BUG_ON(slot < 0);
 	if (slot == 0)
-		fixup_low_keys(&path, key, 1);
+		fixup_low_keys(root, &path, key, 1);
 	if (slot != nritems) {
 		int i;
 		unsigned int old_data = leaf->items[slot].offset +
@@ -580,21 +643,25 @@
 	leaf->items[slot].size = data_size;
 	memcpy(leaf->data + data_end - data_size, data, data_size);
 	leaf->header.nritems += 1;
+	write_tree_block(root, leaf_buf);
 	if (leaf_free_space(leaf) < 0)
 		BUG();
+	release_path(root, &path);
 	return 0;
 }
 
 int del_ptr(struct ctree_root *root, struct ctree_path *path, int level)
 {
 	int slot;
+	struct tree_buffer *t;
 	struct node *node;
 	int nritems;
 
 	while(1) {
-		node = path->nodes[level];
-		if (!node)
+		t = path->nodes[level];
+		if (!t)
 			break;
+		node = &t->node;
 		slot = path->slots[level];
 		nritems = node->header.nritems;
 
@@ -606,28 +673,34 @@
 				sizeof(u64) * (nritems - slot - 1));
 		}
 		node->header.nritems--;
+		write_tree_block(root, t);
 		if (node->header.nritems != 0) {
 			int tslot;
 			if (slot == 0)
-				fixup_low_keys(path, node->keys, level + 1);
+				fixup_low_keys(root, path, node->keys,
+					       level + 1);
 			tslot = path->slots[level+1];
+			t->count++;
 			push_node_left(root, path, level);
 			if (node->header.nritems) {
 				push_node_right(root, path, level);
 			}
-			if (node->header.nritems)
+			if (node->header.nritems) {
+				tree_block_release(root, t);
 				break;
+			}
+			tree_block_release(root, t);
 			path->slots[level+1] = tslot;
 		}
-		if (node == root->node) {
-			printf("root is now null!\n");
-			root->node = NULL;
+		if (t == root->node) {
+			/* just turn the root into a leaf and break */
+			root->node->node.header.flags = node_level(0);
+			write_tree_block(root, t);
 			break;
 		}
 		level++;
 		if (!path->nodes[level])
 			BUG();
-		free(node);
 	}
 	return 0;
 }
@@ -636,10 +709,12 @@
 {
 	int slot;
 	struct leaf *leaf;
+	struct tree_buffer *leaf_buf;
 	int doff;
 	int dsize;
 
-	leaf = (struct leaf *)path->nodes[0];
+	leaf_buf = path->nodes[0];
+	leaf = &leaf_buf->leaf;
 	slot = path->slots[0];
 	doff = leaf->items[slot].offset;
 	dsize = leaf->items[slot].size;
@@ -658,14 +733,15 @@
 	}
 	leaf->header.nritems -= 1;
 	if (leaf->header.nritems == 0) {
-		if (leaf == (struct leaf *)root->node)
-			root->node = NULL;
-		else
+		if (leaf_buf == root->node) {
+			leaf->header.flags = node_level(0);
+			write_tree_block(root, leaf_buf);
+		} else
 			del_ptr(root, path, 1);
-		free(leaf);
 	} else {
 		if (slot == 0)
-			fixup_low_keys(path, &leaf->items[0].key, 1);
+			fixup_low_keys(root, path, &leaf->items[0].key, 1);
+		write_tree_block(root, leaf_buf);
 		if (leaf_space_used(leaf, 0, leaf->header.nritems) <
 		    LEAF_DATA_SIZE / 4) {
 			/* push_leaf_left fixes the path.
@@ -673,12 +749,13 @@
 			 * for possible call to del_ptr below
 			 */
 			slot = path->slots[1];
+			leaf_buf->count++;
 			push_leaf_left(root, path, 1);
 			if (leaf->header.nritems == 0) {
-				free(leaf);
 				path->slots[1] = slot;
 				del_ptr(root, path, 1);
 			}
+			tree_block_release(root, leaf_buf);
 		}
 	}
 	return 0;
@@ -689,7 +766,7 @@
 	int i;
 	int nr = l->header.nritems;
 	struct item *item;
-	printf("leaf %p total ptrs %d free space %d\n", l, nr,
+	printf("leaf %lu total ptrs %d free space %d\n", l->header.blocknr, nr,
 	       leaf_free_space(l));
 	fflush(stdout);
 	for (i = 0 ; i < nr ; i++) {
@@ -703,38 +780,45 @@
 		fflush(stdout);
 	}
 }
-void print_tree(struct node *c)
+void print_tree(struct ctree_root *root, struct tree_buffer *t)
 {
 	int i;
 	int nr;
+	struct node *c;
 
-	if (!c)
+	if (!t)
 		return;
+	c = &t->node;
 	nr = c->header.nritems;
+	if (c->header.blocknr != t->blocknr)
+		BUG();
 	if (is_leaf(c->header.flags)) {
 		print_leaf((struct leaf *)c);
 		return;
 	}
-	printf("node %p level %d total ptrs %d free spc %lu\n", c,
+	printf("node %lu level %d total ptrs %d free spc %lu\n", t->blocknr,
 	        node_level(c->header.flags), c->header.nritems,
 		NODEPTRS_PER_BLOCK - c->header.nritems);
 	fflush(stdout);
 	for (i = 0; i < nr; i++) {
-		printf("\tkey %d (%lu %u %lu) block %lx\n",
+		printf("\tkey %d (%lu %u %lu) block %lu\n",
 		       i,
 		       c->keys[i].objectid, c->keys[i].flags, c->keys[i].offset,
 		       c->blockptrs[i]);
 		fflush(stdout);
 	}
 	for (i = 0; i < nr; i++) {
-		struct node *next = read_block(c->blockptrs[i]);
+		struct tree_buffer *next_buf = read_tree_block(root,
+							    c->blockptrs[i]);
+		struct node *next = &next_buf->node;
 		if (is_leaf(next->header.flags) &&
 		    node_level(c->header.flags) != 1)
 			BUG();
 		if (node_level(next->header.flags) !=
 			node_level(c->header.flags) - 1)
 			BUG();
-		print_tree(next);
+		print_tree(root, next_buf);
+		tree_block_release(root, next_buf);
 	}
 
 }
@@ -746,23 +830,24 @@
 }
 
 int main() {
-	struct leaf *first_node = malloc(sizeof(struct leaf));
-	struct ctree_root root;
+	struct ctree_root *root;
 	struct key ins;
 	struct key last = { (u64)-1, 0, 0};
 	char *buf;
 	int i;
 	int num;
 	int ret;
-	int run_size = 100000;
+	int run_size = 1000000;
 	int max_key = 100000000;
 	int tree_size = 0;
 	struct ctree_path path;
 
+	radix_tree_init();
+
+
+	root = open_ctree("dbfile");
 
 	srand(55);
-	root.node = (struct node *)first_node;
-	memset(first_node, 0, sizeof(*first_node));
 	for (i = 0; i < run_size; i++) {
 		buf = malloc(64);
 		num = next_key(i, max_key);
@@ -772,39 +857,46 @@
 		ins.objectid = num;
 		ins.offset = 0;
 		ins.flags = 0;
-		ret = insert_item(&root, &ins, buf, strlen(buf));
+		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 	}
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("starting search\n");
 	srand(55);
 	for (i = 0; i < run_size; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret) {
-			print_tree(root.node);
+			print_tree(root, root->node);
 			printf("unable to find %d\n", num);
 			exit(1);
 		}
+		release_path(root, &path);
 	}
-	printf("node %p level %d total ptrs %d free spc %lu\n", root.node,
-	        node_level(root.node->header.flags), root.node->header.nritems,
-		NODEPTRS_PER_BLOCK - root.node->header.nritems);
-	// print_tree(root.node);
-	printf("all searches good\n");
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("node %p level %d total ptrs %d free spc %lu\n", root->node,
+	        node_level(root->node->node.header.flags),
+		root->node->node.header.nritems,
+		NODEPTRS_PER_BLOCK - root->node->node.header.nritems);
+	printf("all searches good, deleting some items\n");
 	i = 0;
 	srand(55);
 	for (i = 0 ; i < run_size/4; i++) {
 		num = next_key(i, max_key);
 		ins.objectid = num;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret)
 			continue;
-		ret = del_item(&root, &path);
+		ret = del_item(root, &path);
 		if (ret != 0)
 			BUG();
+		release_path(root, &path);
 		tree_size--;
 	}
 	srand(128);
@@ -813,38 +905,58 @@
 		num = next_key(i, max_key);
 		sprintf(buf, "string-%d", num);
 		ins.objectid = num;
-		ret = insert_item(&root, &ins, buf, strlen(buf));
+		ret = insert_item(root, &ins, buf, strlen(buf));
 		if (!ret)
 			tree_size++;
 	}
-	while(root.node) {
+	close_ctree(root);
+	root = open_ctree("dbfile");
+	printf("starting search2\n");
+	srand(128);
+	for (i = 0; i < run_size; i++) {
+		num = next_key(i, max_key);
+		ins.objectid = num;
+		init_path(&path);
+		ret = search_slot(root, &ins, &path);
+		if (ret) {
+			print_tree(root, root->node);
+			printf("unable to find %d\n", num);
+			exit(1);
+		}
+		release_path(root, &path);
+	}
+	printf("starting big long delete run\n");
+	while(root->node && root->node->node.header.nritems > 0) {
 		struct leaf *leaf;
 		int slot;
 		ins.objectid = (u64)-1;
 		init_path(&path);
-		ret = search_slot(&root, &ins, &path);
+		ret = search_slot(root, &ins, &path);
 		if (ret == 0)
 			BUG();
 
-		leaf = (struct leaf *)(path.nodes[0]);
+		leaf = &path.nodes[0]->leaf;
 		slot = path.slots[0];
 		if (slot != leaf->header.nritems)
 			BUG();
 		while(path.slots[0] > 0) {
 			path.slots[0] -= 1;
 			slot = path.slots[0];
-			leaf = (struct leaf *)(path.nodes[0]);
+			leaf = &path.nodes[0]->leaf;
 
 			if (comp_keys(&last, &leaf->items[slot].key) <= 0)
 				BUG();
 			memcpy(&last, &leaf->items[slot].key, sizeof(last));
-			ret = del_item(&root, &path);
-			if (ret != 0)
+			ret = del_item(root, &path);
+			if (ret != 0) {
+				printf("del_item returned %d\n", ret);
 				BUG();
+			}
 			tree_size--;
 		}
+		release_path(root, &path);
 	}
-	print_tree(root.node);
+	close_ctree(root);
 	printf("tree size is now %d\n", tree_size);
 	return 0;
 }
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
new file mode 100644
index 0000000..586bf18
--- /dev/null
+++ b/fs/btrfs/ctree.h
@@ -0,0 +1,62 @@
+#ifndef __CTREE__
+#define __CTREE__
+
+#define CTREE_BLOCKSIZE 4096
+
+struct key {
+	u64 objectid;
+	u32 flags;
+	u64 offset;
+} __attribute__ ((__packed__));
+
+struct header {
+	u64 fsid[2]; /* FS specific uuid */
+	u64 blocknr;
+	u64 parentid;
+	u32 csum;
+	u32 ham;
+	u16 nritems;
+	u16 flags;
+} __attribute__ ((__packed__));
+
+#define NODEPTRS_PER_BLOCK ((CTREE_BLOCKSIZE - sizeof(struct header)) / \
+			    (sizeof(struct key) + sizeof(u64)))
+
+#define LEVEL_BITS 3
+#define MAX_LEVEL (1 << LEVEL_BITS)
+#define node_level(f) ((f) & (MAX_LEVEL-1))
+#define is_leaf(f) (node_level(f) == 0)
+
+struct tree_buffer;
+struct ctree_root {
+	struct tree_buffer *node;
+	int fp;
+	struct radix_tree_root cache_radix;
+};
+
+struct item {
+	struct key key;
+	u16 offset;
+	u16 size;
+} __attribute__ ((__packed__));
+
+#define LEAF_DATA_SIZE (CTREE_BLOCKSIZE - sizeof(struct header))
+struct leaf {
+	struct header header;
+	union {
+		struct item items[LEAF_DATA_SIZE/sizeof(struct item)];
+		u8 data[CTREE_BLOCKSIZE-sizeof(struct header)];
+	};
+} __attribute__ ((__packed__));
+
+struct node {
+	struct header header;
+	struct key keys[NODEPTRS_PER_BLOCK];
+	u64 blockptrs[NODEPTRS_PER_BLOCK];
+} __attribute__ ((__packed__));
+
+struct ctree_path {
+	struct tree_buffer *nodes[MAX_LEVEL];
+	int slots[MAX_LEVEL];
+};
+#endif
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
new file mode 100644
index 0000000..8d51a07
--- /dev/null
+++ b/fs/btrfs/disk-io.c
@@ -0,0 +1,174 @@
+#define _XOPEN_SOURCE 500
+#include <stdio.h>
+#include <stdlib.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include "kerncompat.h"
+#include "radix-tree.h"
+#include "ctree.h"
+#include "disk-io.h"
+
+static int allocated_blocks = 0;
+
+struct ctree_header {
+	u64 root_block;
+} __attribute__ ((__packed__));
+
+static int get_free_block(struct ctree_root *root, u64 *block)
+{
+	struct stat st;
+	int ret;
+
+	st.st_size = 0;
+	ret = fstat(root->fp, &st);
+	if (st.st_size > sizeof(struct ctree_header)) {
+		*block = (st.st_size -
+			sizeof(struct ctree_header)) / CTREE_BLOCKSIZE;
+	} else {
+		*block = 0;
+	}
+	ret = ftruncate(root->fp, sizeof(struct ctree_header) + (*block + 1) *
+			CTREE_BLOCKSIZE);
+	return ret;
+}
+
+struct tree_buffer *alloc_tree_block(struct ctree_root *root, u64 blocknr)
+{
+	struct tree_buffer *buf;
+	int ret;
+	buf = malloc(sizeof(struct tree_buffer));
+	if (!buf)
+		return buf;
+	allocated_blocks++;
+	buf->blocknr = blocknr;
+	buf->count = 1;
+	radix_tree_preload(GFP_KERNEL);
+	ret = radix_tree_insert(&root->cache_radix, blocknr, buf);
+	radix_tree_preload_end();
+	if (ret) {
+		free(buf);
+		return NULL;
+	}
+	return buf;
+}
+
+struct tree_buffer *alloc_free_block(struct ctree_root *root)
+{
+	u64 free_block;
+	int ret;
+	struct tree_buffer * buf;
+	ret = get_free_block(root, &free_block);
+	if (ret) {
+		BUG();
+		return NULL;
+	}
+	buf = alloc_tree_block(root, free_block);
+	if (!buf)
+		BUG();
+	return buf;
+}
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr)
+{
+	loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+	struct tree_buffer *buf;
+	int ret;
+
+	buf = radix_tree_lookup(&root->cache_radix, blocknr);
+	if (buf) {
+		buf->count++;
+		if (buf->blocknr != blocknr)
+			BUG();
+		if (buf->blocknr != buf->node.header.blocknr)
+			BUG();
+		return buf;
+	}
+	buf = alloc_tree_block(root, blocknr);
+	if (!buf)
+		return NULL;
+	ret = pread(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+	if (ret != CTREE_BLOCKSIZE) {
+		free(buf);
+		return NULL;
+	}
+	if (buf->blocknr != buf->node.header.blocknr)
+		BUG();
+	return buf;
+}
+
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf)
+{
+	u64 blocknr = buf->blocknr;
+	loff_t offset = blocknr * CTREE_BLOCKSIZE + sizeof(struct ctree_header);
+	int ret;
+
+	if (buf->blocknr != buf->node.header.blocknr)
+		BUG();
+	ret = pwrite(root->fp, &buf->node, CTREE_BLOCKSIZE, offset);
+	if (ret != CTREE_BLOCKSIZE)
+		return ret;
+	if (buf == root->node)
+		return update_root_block(root);
+	return 0;
+}
+
+struct ctree_root *open_ctree(char *filename)
+{
+	struct ctree_root *root = malloc(sizeof(struct ctree_root));
+	int fp;
+	u64 root_block;
+	int ret;
+
+	fp = open(filename, O_CREAT | O_RDWR);
+	if (fp < 0) {
+		free(root);
+		return NULL;
+	}
+	root->fp = fp;
+	INIT_RADIX_TREE(&root->cache_radix, GFP_KERNEL);
+	ret = pread(fp, &root_block, sizeof(u64), 0);
+	if (ret == sizeof(u64)) {
+		printf("reading root node at block %lu\n", root_block);
+		root->node = read_tree_block(root, root_block);
+	} else
+		root->node = NULL;
+	return root;
+}
+
+int close_ctree(struct ctree_root *root)
+{
+	close(root->fp);
+	if (root->node)
+		tree_block_release(root, root->node);
+	free(root);
+	printf("on close %d blocks are allocated\n", allocated_blocks);
+	return 0;
+}
+
+int update_root_block(struct ctree_root *root)
+{
+	int ret;
+	u64 root_block = root->node->blocknr;
+
+	ret = pwrite(root->fp, &root_block, sizeof(u64), 0);
+	if (ret != sizeof(u64))
+		return ret;
+	return 0;
+}
+
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf)
+{
+	buf->count--;
+	if (buf->count == 0) {
+		if (!radix_tree_lookup(&root->cache_radix, buf->blocknr))
+			BUG();
+		radix_tree_delete(&root->cache_radix, buf->blocknr);
+		memset(buf, 0, sizeof(*buf));
+		free(buf);
+		BUG_ON(allocated_blocks == 0);
+		allocated_blocks--;
+	}
+}
+
diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h
new file mode 100644
index 0000000..ee95fa0
--- /dev/null
+++ b/fs/btrfs/disk-io.h
@@ -0,0 +1,21 @@
+#ifndef __DISKIO__
+#define __DISKIO__
+
+struct tree_buffer {
+	u64 blocknr;
+	int count;
+	union {
+		struct node node;
+		struct leaf leaf;
+	};
+};
+
+struct tree_buffer *read_tree_block(struct ctree_root *root, u64 blocknr);
+int write_tree_block(struct ctree_root *root, struct tree_buffer *buf);
+struct ctree_root *open_ctree(char *filename);
+int close_ctree(struct ctree_root *root);
+void tree_block_release(struct ctree_root *root, struct tree_buffer *buf);
+struct tree_buffer *alloc_free_block(struct ctree_root *root);
+int update_root_block(struct ctree_root *root);
+
+#endif
diff --git a/fs/btrfs/kerncompat.h b/fs/btrfs/kerncompat.h
index 3a4bb4d..347ca06 100644
--- a/fs/btrfs/kerncompat.h
+++ b/fs/btrfs/kerncompat.h
@@ -6,6 +6,7 @@
 #define BITS_PER_LONG 64
 #define __GFP_BITS_SHIFT 20
 #define __GFP_BITS_MASK ((int)((1 << __GFP_BITS_SHIFT) - 1))
+#define GFP_KERNEL 0
 #define __read_mostly
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 #define __force
diff --git a/fs/btrfs/radix-tree.c b/fs/btrfs/radix-tree.c
new file mode 100644
index 0000000..baa25ca
--- /dev/null
+++ b/fs/btrfs/radix-tree.c
@@ -0,0 +1,836 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ * Copyright (C) 2005 SGI, Christoph Lameter <clameter@sgi.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+
+#include "kerncompat.h"
+#include "radix-tree.h"
+#ifdef __KERNEL__
+#define RADIX_TREE_MAP_SHIFT	(CONFIG_BASE_SMALL ? 4 : 6)
+#else
+#define RADIX_TREE_MAP_SHIFT	3	/* For more stressful testing */
+#endif
+
+#define RADIX_TREE_MAP_SIZE	(1UL << RADIX_TREE_MAP_SHIFT)
+#define RADIX_TREE_MAP_MASK	(RADIX_TREE_MAP_SIZE-1)
+
+#define RADIX_TREE_TAG_LONGS	\
+	((RADIX_TREE_MAP_SIZE + BITS_PER_LONG - 1) / BITS_PER_LONG)
+
+struct radix_tree_node {
+	unsigned int	count;
+	void		*slots[RADIX_TREE_MAP_SIZE];
+	unsigned long	tags[RADIX_TREE_MAX_TAGS][RADIX_TREE_TAG_LONGS];
+};
+
+struct radix_tree_path {
+	struct radix_tree_node *node;
+	int offset;
+};
+
+#define RADIX_TREE_INDEX_BITS  (8 /* CHAR_BIT */ * sizeof(unsigned long))
+#define RADIX_TREE_MAX_PATH (RADIX_TREE_INDEX_BITS/RADIX_TREE_MAP_SHIFT + 2)
+
+static unsigned long height_to_maxindex[RADIX_TREE_MAX_PATH] __read_mostly;
+
+/*
+ * Per-cpu pool of preloaded nodes
+ */
+struct radix_tree_preload {
+	int nr;
+	struct radix_tree_node *nodes[RADIX_TREE_MAX_PATH];
+};
+struct radix_tree_preload radix_tree_preloads = { 0, };
+
+static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
+{
+	return root->gfp_mask & __GFP_BITS_MASK;
+}
+
+static int internal_nodes = 0;
+/*
+ * This assumes that the caller has performed appropriate preallocation, and
+ * that the caller has pinned this thread of control to the current CPU.
+ */
+static struct radix_tree_node *
+radix_tree_node_alloc(struct radix_tree_root *root)
+{
+	struct radix_tree_node *ret;
+	ret = malloc(sizeof(struct radix_tree_node));
+	if (ret) {
+		memset(ret, 0, sizeof(struct radix_tree_node));
+		internal_nodes++;
+	}
+	return ret;
+}
+
+static inline void
+radix_tree_node_free(struct radix_tree_node *node)
+{
+	internal_nodes--;
+	free(node);
+}
+
+/*
+ * Load up this CPU's radix_tree_node buffer with sufficient objects to
+ * ensure that the addition of a single element in the tree cannot fail.  On
+ * success, return zero, with preemption disabled.  On error, return -ENOMEM
+ * with preemption not disabled.
+ */
+int radix_tree_preload(gfp_t gfp_mask)
+{
+	struct radix_tree_preload *rtp;
+	struct radix_tree_node *node;
+	int ret = -ENOMEM;
+
+	preempt_disable();
+	rtp = &__get_cpu_var(radix_tree_preloads);
+	while (rtp->nr < ARRAY_SIZE(rtp->nodes)) {
+		preempt_enable();
+		node = radix_tree_node_alloc(NULL);
+		if (node == NULL)
+			goto out;
+		preempt_disable();
+		rtp = &__get_cpu_var(radix_tree_preloads);
+		if (rtp->nr < ARRAY_SIZE(rtp->nodes))
+			rtp->nodes[rtp->nr++] = node;
+		else
+			radix_tree_node_free(node);
+	}
+	ret = 0;
+out:
+	return ret;
+}
+
+static inline void tag_set(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__set_bit(offset, node->tags[tag]);
+}
+
+static inline void tag_clear(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	__clear_bit(offset, node->tags[tag]);
+}
+
+static inline int tag_get(struct radix_tree_node *node, unsigned int tag,
+		int offset)
+{
+	return test_bit(offset, node->tags[tag]);
+}
+
+static inline void root_tag_set(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask |= (__force gfp_t)(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+
+static inline void root_tag_clear(struct radix_tree_root *root, unsigned int tag)
+{
+	root->gfp_mask &= (__force gfp_t)~(1 << (tag + __GFP_BITS_SHIFT));
+}
+
+static inline void root_tag_clear_all(struct radix_tree_root *root)
+{
+	root->gfp_mask &= __GFP_BITS_MASK;
+}
+
+static inline int root_tag_get(struct radix_tree_root *root, unsigned int tag)
+{
+	return (__force unsigned)root->gfp_mask & (1 << (tag + __GFP_BITS_SHIFT));
+}
+
+/*
+ * Returns 1 if any slot in the node has this tag set.
+ * Otherwise returns 0.
+ */
+static inline int any_tag_set(struct radix_tree_node *node, unsigned int tag)
+{
+	int idx;
+	for (idx = 0; idx < RADIX_TREE_TAG_LONGS; idx++) {
+		if (node->tags[tag][idx])
+			return 1;
+	}
+	return 0;
+}
+
+/*
+ *	Return the maximum key which can be store into a
+ *	radix tree with height HEIGHT.
+ */
+static inline unsigned long radix_tree_maxindex(unsigned int height)
+{
+	return height_to_maxindex[height];
+}
+
+/*
+ *	Extend a radix tree so it can store key @index.
+ */
+static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_node *node;
+	unsigned int height;
+	int tag;
+
+	/* Figure out what the height should be.  */
+	height = root->height + 1;
+	while (index > radix_tree_maxindex(height))
+		height++;
+
+	if (root->rnode == NULL) {
+		root->height = height;
+		goto out;
+	}
+
+	do {
+		if (!(node = radix_tree_node_alloc(root)))
+			return -ENOMEM;
+
+		/* Increase the height.  */
+		node->slots[0] = root->rnode;
+
+		/* Propagate the aggregated tag info into the new root */
+		for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+			if (root_tag_get(root, tag))
+				tag_set(node, tag, 0);
+		}
+
+		node->count = 1;
+		root->rnode = node;
+		root->height++;
+	} while (height > root->height);
+out:
+	return 0;
+}
+
+/**
+ *	radix_tree_insert    -    insert into a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@item:		item to insert
+ *
+ *	Insert an item into the radix tree at position @index.
+ */
+int radix_tree_insert(struct radix_tree_root *root,
+			unsigned long index, void *item)
+{
+	struct radix_tree_node *node = NULL, *slot;
+	unsigned int height, shift;
+	int offset;
+	int error;
+
+	/* Make sure the tree is high enough.  */
+	if (index > radix_tree_maxindex(root->height)) {
+		error = radix_tree_extend(root, index);
+		if (error)
+			return error;
+	}
+
+	slot = root->rnode;
+	height = root->height;
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+
+	offset = 0;			/* uninitialised var warning */
+	while (height > 0) {
+		if (slot == NULL) {
+			/* Have to add a child node.  */
+			if (!(slot = radix_tree_node_alloc(root)))
+				return -ENOMEM;
+			if (node) {
+				node->slots[offset] = slot;
+				node->count++;
+			} else
+				root->rnode = slot;
+		}
+
+		/* Go a level down */
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		node = slot;
+		slot = node->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	if (slot != NULL)
+		return -EEXIST;
+
+	if (node) {
+		node->count++;
+		node->slots[offset] = item;
+		BUG_ON(tag_get(node, 0, offset));
+		BUG_ON(tag_get(node, 1, offset));
+	} else {
+		root->rnode = item;
+		BUG_ON(root_tag_get(root, 0));
+		BUG_ON(root_tag_get(root, 1));
+	}
+
+	return 0;
+}
+
+static inline void **__lookup_slot(struct radix_tree_root *root,
+				   unsigned long index)
+{
+	unsigned int height, shift;
+	struct radix_tree_node **slot;
+
+	height = root->height;
+
+	if (index > radix_tree_maxindex(height))
+		return NULL;
+
+	if (height == 0 && root->rnode)
+		return (void **)&root->rnode;
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = &root->rnode;
+
+	while (height > 0) {
+		if (*slot == NULL)
+			return NULL;
+
+		slot = (struct radix_tree_node **)
+			((*slot)->slots +
+				((index >> shift) & RADIX_TREE_MAP_MASK));
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	return (void **)slot;
+}
+
+/**
+ *	radix_tree_lookup_slot    -    lookup a slot in a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Lookup the slot corresponding to the position @index in the radix tree
+ *	@root. This is useful for update-if-exists operations.
+ */
+void **radix_tree_lookup_slot(struct radix_tree_root *root, unsigned long index)
+{
+	return __lookup_slot(root, index);
+}
+
+/**
+ *	radix_tree_lookup    -    perform lookup operation on a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Lookup the item at the position @index in the radix tree @root.
+ */
+void *radix_tree_lookup(struct radix_tree_root *root, unsigned long index)
+{
+	void **slot;
+
+	slot = __lookup_slot(root, index);
+	return slot != NULL ? *slot : NULL;
+}
+
+/**
+ *	radix_tree_tag_set - set a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Set the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  From
+ *	the root all the way down to the leaf node.
+ *
+ *	Returns the address of the tagged item.   Setting a tag on a not-present
+ *	item is a bug.
+ */
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node *slot;
+
+	height = root->height;
+	BUG_ON(index > radix_tree_maxindex(height));
+
+	slot = root->rnode;
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+
+	while (height > 0) {
+		int offset;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		if (!tag_get(slot, tag, offset))
+			tag_set(slot, tag, offset);
+		slot = slot->slots[offset];
+		BUG_ON(slot == NULL);
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	/* set the root's tag bit */
+	if (slot && !root_tag_get(root, tag))
+		root_tag_set(root, tag);
+
+	return slot;
+}
+
+/**
+ *	radix_tree_tag_clear - clear a tag on a radix tree node
+ *	@root:		radix tree root
+ *	@index:		index key
+ *	@tag: 		tag index
+ *
+ *	Clear the search tag (which must be < RADIX_TREE_MAX_TAGS)
+ *	corresponding to @index in the radix tree.  If
+ *	this causes the leaf node to have no tags set then clear the tag in the
+ *	next-to-leaf node, etc.
+ *
+ *	Returns the address of the tagged item on success, else NULL.  ie:
+ *	has the same return value and semantics as radix_tree_lookup().
+ */
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_node *slot = NULL;
+	unsigned int height, shift;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	pathp->node = NULL;
+	slot = root->rnode;
+
+	while (height > 0) {
+		int offset;
+
+		if (slot == NULL)
+			goto out;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp[1].offset = offset;
+		pathp[1].node = slot;
+		slot = slot->slots[offset];
+		pathp++;
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+
+	if (slot == NULL)
+		goto out;
+
+	while (pathp->node) {
+		if (!tag_get(pathp->node, tag, pathp->offset))
+			goto out;
+		tag_clear(pathp->node, tag, pathp->offset);
+		if (any_tag_set(pathp->node, tag))
+			goto out;
+		pathp--;
+	}
+
+	/* clear the root's tag bit */
+	if (root_tag_get(root, tag))
+		root_tag_clear(root, tag);
+
+out:
+	return slot;
+}
+
+#ifndef __KERNEL__	/* Only the test harness uses this at present */
+/**
+ * radix_tree_tag_get - get a tag on a radix tree node
+ * @root:		radix tree root
+ * @index:		index key
+ * @tag: 		tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ * Return values:
+ *
+ *  0: tag not present or not set
+ *  1: tag set
+ */
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag)
+{
+	unsigned int height, shift;
+	struct radix_tree_node *slot;
+	int saw_unset_tag = 0;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		return 0;
+
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	if (height == 0)
+		return 1;
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	for ( ; ; ) {
+		int offset;
+
+		if (slot == NULL)
+			return 0;
+
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		/*
+		 * This is just a debug check.  Later, we can bale as soon as
+		 * we see an unset tag.
+		 */
+		if (!tag_get(slot, tag, offset))
+			saw_unset_tag = 1;
+		if (height == 1) {
+			int ret = tag_get(slot, tag, offset);
+
+			BUG_ON(ret && saw_unset_tag);
+			return !!ret;
+		}
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	}
+}
+#endif
+
+static unsigned int
+__lookup(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift, height;
+	struct radix_tree_node *slot;
+	unsigned long i;
+
+	height = root->height;
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
+		goto out;
+	}
+
+	shift = (height-1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	for ( ; height > 1; height--) {
+
+		for (i = (index >> shift) & RADIX_TREE_MAP_MASK ;
+				i < RADIX_TREE_MAP_SIZE; i++) {
+			if (slot->slots[i] != NULL)
+				break;
+			index &= ~((1UL << shift) - 1);
+			index += 1UL << shift;
+			if (index == 0)
+				goto out;	/* 32-bit wraparound */
+		}
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	}
+
+	/* Bottom level: grab some items */
+	for (i = index & RADIX_TREE_MAP_MASK; i < RADIX_TREE_MAP_SIZE; i++) {
+		index++;
+		if (slot->slots[i]) {
+			results[nr_found++] = slot->slots[i];
+			if (nr_found == max_items)
+				goto out;
+		}
+	}
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup - perform multiple lookup on a radix tree
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *
+ *	Performs an index-ascending scan of the tree for present items.  Places
+ *	them at *@results and returns the number of items which were placed at
+ *	*@results.
+ *
+ *	The implementation is naive.
+ */
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup(root, results + ret, cur_index,
+					max_items - ret, &next_index);
+		ret += nr_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+	return ret;
+}
+
+/*
+ * FIXME: the two tag_get()s here should use find_next_bit() instead of
+ * open-coding the search.
+ */
+static unsigned int
+__lookup_tag(struct radix_tree_root *root, void **results, unsigned long index,
+	unsigned int max_items, unsigned long *next_index, unsigned int tag)
+{
+	unsigned int nr_found = 0;
+	unsigned int shift;
+	unsigned int height = root->height;
+	struct radix_tree_node *slot;
+
+	if (height == 0) {
+		if (root->rnode && index == 0)
+			results[nr_found++] = root->rnode;
+		goto out;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	slot = root->rnode;
+
+	do {
+		unsigned long i = (index >> shift) & RADIX_TREE_MAP_MASK;
+
+		for ( ; i < RADIX_TREE_MAP_SIZE; i++) {
+			if (tag_get(slot, tag, i)) {
+				BUG_ON(slot->slots[i] == NULL);
+				break;
+			}
+			index &= ~((1UL << shift) - 1);
+			index += 1UL << shift;
+			if (index == 0)
+				goto out;	/* 32-bit wraparound */
+		}
+		if (i == RADIX_TREE_MAP_SIZE)
+			goto out;
+		height--;
+		if (height == 0) {	/* Bottom level: grab some items */
+			unsigned long j = index & RADIX_TREE_MAP_MASK;
+
+			for ( ; j < RADIX_TREE_MAP_SIZE; j++) {
+				index++;
+				if (tag_get(slot, tag, j)) {
+					BUG_ON(slot->slots[j] == NULL);
+					results[nr_found++] = slot->slots[j];
+					if (nr_found == max_items)
+						goto out;
+				}
+			}
+		}
+		shift -= RADIX_TREE_MAP_SHIFT;
+		slot = slot->slots[i];
+	} while (height > 0);
+out:
+	*next_index = index;
+	return nr_found;
+}
+
+/**
+ *	radix_tree_gang_lookup_tag - perform multiple lookup on a radix tree
+ *	                             based on a tag
+ *	@root:		radix tree root
+ *	@results:	where the results of the lookup are placed
+ *	@first_index:	start the lookup from this key
+ *	@max_items:	place up to this many items at *results
+ *	@tag:		the tag index (< RADIX_TREE_MAX_TAGS)
+ *
+ *	Performs an index-ascending scan of the tree for present items which
+ *	have the tag indexed by @tag set.  Places the items at *@results and
+ *	returns the number of items which were placed at *@results.
+ */
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag)
+{
+	const unsigned long max_index = radix_tree_maxindex(root->height);
+	unsigned long cur_index = first_index;
+	unsigned int ret = 0;
+
+	/* check the root's tag bit */
+	if (!root_tag_get(root, tag))
+		return 0;
+
+	while (ret < max_items) {
+		unsigned int nr_found;
+		unsigned long next_index;	/* Index of next search */
+
+		if (cur_index > max_index)
+			break;
+		nr_found = __lookup_tag(root, results + ret, cur_index,
+					max_items - ret, &next_index, tag);
+		ret += nr_found;
+		if (next_index == 0)
+			break;
+		cur_index = next_index;
+	}
+	return ret;
+}
+
+/**
+ *	radix_tree_shrink    -    shrink height of a radix tree to minimal
+ *	@root		radix tree root
+ */
+static inline void radix_tree_shrink(struct radix_tree_root *root)
+{
+	/* try to shrink tree height */
+	while (root->height > 0 &&
+			root->rnode->count == 1 &&
+			root->rnode->slots[0]) {
+		struct radix_tree_node *to_free = root->rnode;
+
+		root->rnode = to_free->slots[0];
+		root->height--;
+		/* must only free zeroed nodes into the slab */
+		tag_clear(to_free, 0, 0);
+		tag_clear(to_free, 1, 0);
+		to_free->slots[0] = NULL;
+		to_free->count = 0;
+		radix_tree_node_free(to_free);
+	}
+}
+
+/**
+ *	radix_tree_delete    -    delete an item from a radix tree
+ *	@root:		radix tree root
+ *	@index:		index key
+ *
+ *	Remove the item at @index from the radix tree rooted at @root.
+ *
+ *	Returns the address of the deleted item, or NULL if it was not present.
+ */
+void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
+{
+	struct radix_tree_path path[RADIX_TREE_MAX_PATH], *pathp = path;
+	struct radix_tree_node *slot = NULL;
+	unsigned int height, shift;
+	int tag;
+	int offset;
+
+	height = root->height;
+	if (index > radix_tree_maxindex(height))
+		goto out;
+
+	slot = root->rnode;
+	if (height == 0 && root->rnode) {
+		root_tag_clear_all(root);
+		root->rnode = NULL;
+		goto out;
+	}
+
+	shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
+	pathp->node = NULL;
+
+	do {
+		if (slot == NULL)
+			goto out;
+
+		pathp++;
+		offset = (index >> shift) & RADIX_TREE_MAP_MASK;
+		pathp->offset = offset;
+		pathp->node = slot;
+		slot = slot->slots[offset];
+		shift -= RADIX_TREE_MAP_SHIFT;
+		height--;
+	} while (height > 0);
+
+	if (slot == NULL)
+		goto out;
+
+	/*
+	 * Clear all tags associated with the just-deleted item
+	 */
+	for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
+		if (tag_get(pathp->node, tag, pathp->offset))
+			radix_tree_tag_clear(root, index, tag);
+	}
+
+	/* Now free the nodes we do not need anymore */
+	while (pathp->node) {
+		pathp->node->slots[pathp->offset] = NULL;
+		pathp->node->count--;
+
+		if (pathp->node->count) {
+			if (pathp->node == root->rnode)
+				radix_tree_shrink(root);
+			goto out;
+		}
+
+		/* Node with zero slots in use so free it */
+		radix_tree_node_free(pathp->node);
+
+		pathp--;
+	}
+	root_tag_clear_all(root);
+	root->height = 0;
+	root->rnode = NULL;
+
+out:
+	return slot;
+}
+
+/**
+ *	radix_tree_tagged - test whether any items in the tree are tagged
+ *	@root:		radix tree root
+ *	@tag:		tag to test
+ */
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag)
+{
+	return root_tag_get(root, tag);
+}
+
+static unsigned long __maxindex(unsigned int height)
+{
+	unsigned int tmp = height * RADIX_TREE_MAP_SHIFT;
+	unsigned long index = (~0UL >> (RADIX_TREE_INDEX_BITS - tmp - 1)) >> 1;
+
+	if (tmp >= RADIX_TREE_INDEX_BITS)
+		index = ~0UL;
+	return index;
+}
+
+static void radix_tree_init_maxindex(void)
+{
+	unsigned int i;
+
+	for (i = 0; i < ARRAY_SIZE(height_to_maxindex); i++)
+		height_to_maxindex[i] = __maxindex(i);
+}
+
+void radix_tree_init(void)
+{
+	radix_tree_init_maxindex();
+}
diff --git a/fs/btrfs/radix-tree.h b/fs/btrfs/radix-tree.h
new file mode 100644
index 0000000..c3ce881
--- /dev/null
+++ b/fs/btrfs/radix-tree.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (C) 2001 Momchil Velikov
+ * Portions Copyright (C) 2001 Christoph Hellwig
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2, or (at
+ * your option) any later version.
+ * 
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ * 
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
+ */
+#ifndef _LINUX_RADIX_TREE_H
+#define _LINUX_RADIX_TREE_H
+
+#define RADIX_TREE_MAX_TAGS 2
+
+/* root tags are stored in gfp_mask, shifted by __GFP_BITS_SHIFT */
+struct radix_tree_root {
+	unsigned int		height;
+	gfp_t			gfp_mask;
+	struct radix_tree_node	*rnode;
+};
+
+#define RADIX_TREE_INIT(mask)	{					\
+	.height = 0,							\
+	.gfp_mask = (mask),						\
+	.rnode = NULL,							\
+}
+
+#define RADIX_TREE(name, mask) \
+	struct radix_tree_root name = RADIX_TREE_INIT(mask)
+
+#define INIT_RADIX_TREE(root, mask)					\
+do {									\
+	(root)->height = 0;						\
+	(root)->gfp_mask = (mask);					\
+	(root)->rnode = NULL;						\
+} while (0)
+
+int radix_tree_insert(struct radix_tree_root *, unsigned long, void *);
+void *radix_tree_lookup(struct radix_tree_root *, unsigned long);
+void **radix_tree_lookup_slot(struct radix_tree_root *, unsigned long);
+void *radix_tree_delete(struct radix_tree_root *, unsigned long);
+unsigned int
+radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
+			unsigned long first_index, unsigned int max_items);
+int radix_tree_preload(gfp_t gfp_mask);
+void radix_tree_init(void);
+void *radix_tree_tag_set(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+void *radix_tree_tag_clear(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+int radix_tree_tag_get(struct radix_tree_root *root,
+			unsigned long index, unsigned int tag);
+unsigned int
+radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
+		unsigned long first_index, unsigned int max_items,
+		unsigned int tag);
+int radix_tree_tagged(struct radix_tree_root *root, unsigned int tag);
+
+static inline void radix_tree_preload_end(void)
+{
+	preempt_enable();
+}
+
+#endif /* _LINUX_RADIX_TREE_H */