Btrfs: Improve and clean up locking done by walk_down_tree

While dropping snapshots, walk_down_tree does most of the work of checking
reference counts and limiting tree traversal to just the blocks that
we are freeing.

It dropped and held the allocation mutex in strange and confusing ways.
This commit changes it to hold the mutex only while actually freeing a block.

The rest of the checks around reference counts should be safe without the lock
because we only allow one process in btrfs_drop_snapshot at a time.  Other
processes dropping a block's reference count should not drop it to 1 because
their tree roots already have an extra ref on the block.

Signed-off-by: Chris Mason <chris.mason@oracle.com>
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index b003b43..58bceee 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -2333,8 +2333,6 @@
 	leaf_owner = btrfs_header_owner(leaf);
 	leaf_generation = btrfs_header_generation(leaf);
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
-
 	for (i = 0; i < nritems; i++) {
 		u64 disk_bytenr;
 		cond_resched();
@@ -2362,8 +2360,6 @@
 		mutex_unlock(&root->fs_info->alloc_mutex);
 		BUG_ON(ret);
 	}
-
-	mutex_lock(&root->fs_info->alloc_mutex);
 	return 0;
 }
 
@@ -2375,7 +2371,6 @@
 	int ret;
 	struct btrfs_extent_info *info = ref->extents;
 
-	mutex_unlock(&root->fs_info->alloc_mutex);
 	for (i = 0; i < ref->nritems; i++) {
 		mutex_lock(&root->fs_info->alloc_mutex);
 		ret = __btrfs_free_extent(trans, root,
@@ -2386,7 +2381,6 @@
 		BUG_ON(ret);
 		info++;
 	}
-	mutex_lock(&root->fs_info->alloc_mutex);
 
 	return 0;
 }
@@ -2440,10 +2434,39 @@
 			      u32 *refs)
 {
 	int ret;
-	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	ret = lookup_extent_ref(NULL, root, start, len, refs);
+	BUG_ON(ret);
+
+#if 0 // some debugging code in case we see problems here
+	/* if the refs count is one, it won't get increased again.  But
+	 * if the ref count is > 1, someone may be decreasing it at
+	 * the same time we are.
+	 */
+	if (*refs != 1) {
+		struct extent_buffer *eb = NULL;
+		eb = btrfs_find_create_tree_block(root, start, len);
+		if (eb)
+			btrfs_tree_lock(eb);
+
+		mutex_lock(&root->fs_info->alloc_mutex);
+		ret = lookup_extent_ref(NULL, root, start, len, refs);
+		BUG_ON(ret);
+		mutex_unlock(&root->fs_info->alloc_mutex);
+
+		if (eb) {
+			btrfs_tree_unlock(eb);
+			free_extent_buffer(eb);
+		}
+		if (*refs == 1) {
+			printk("block %llu went down to one during drop_snap\n",
+			       (unsigned long long)start);
+		}
+
+	}
+#endif
+
 	cond_resched();
-	mutex_lock(&root->fs_info->alloc_mutex);
 	return ret;
 }
 
@@ -2467,8 +2490,6 @@
 	int ret;
 	u32 refs;
 
-	mutex_lock(&root->fs_info->alloc_mutex);
-
 	WARN_ON(*level < 0);
 	WARN_ON(*level >= BTRFS_MAX_LEVEL);
 	ret = drop_snap_lookup_refcount(root, path->nodes[*level]->start,
@@ -2507,13 +2528,21 @@
 			root_owner = btrfs_header_owner(parent);
 			root_gen = btrfs_header_generation(parent);
 			path->slots[*level]++;
+
+			mutex_lock(&root->fs_info->alloc_mutex);
 			ret = __btrfs_free_extent(trans, root, bytenr,
 						blocksize, root_owner,
 						root_gen, 0, 0, 1);
 			BUG_ON(ret);
+			mutex_unlock(&root->fs_info->alloc_mutex);
 			continue;
 		}
-
+		/*
+		 * at this point, we have a single ref, and since the
+		 * only place referencing this extent is a dead root
+		 * the reference count should never go higher.
+		 * So, we don't need to check it again
+		 */
 		if (*level == 1) {
 			struct btrfs_key key;
 			btrfs_node_key_to_cpu(cur, &key, path->slots[*level]);
@@ -2533,33 +2562,23 @@
 		next = btrfs_find_tree_block(root, bytenr, blocksize);
 		if (!next || !btrfs_buffer_uptodate(next, ptr_gen)) {
 			free_extent_buffer(next);
-			mutex_unlock(&root->fs_info->alloc_mutex);
 
 			if (path->slots[*level] == 0)
 				reada_walk_down(root, cur, path->slots[*level]);
 			next = read_tree_block(root, bytenr, blocksize,
 					       ptr_gen);
 			cond_resched();
-			mutex_lock(&root->fs_info->alloc_mutex);
-
-			/* we've dropped the lock, double check */
+#if 0
+			/*
+			 * this is a debugging check and can go away
+			 * the ref should never go all the way down to 1
+			 * at this point
+			 */
 			ret = lookup_extent_ref(NULL, root, bytenr, blocksize,
 						&refs);
 			BUG_ON(ret);
-			if (refs != 1) {
-				parent = path->nodes[*level];
-				root_owner = btrfs_header_owner(parent);
-				root_gen = btrfs_header_generation(parent);
-
-				path->slots[*level]++;
-				free_extent_buffer(next);
-				ret = __btrfs_free_extent(trans, root, bytenr,
-							blocksize,
-							root_owner,
-							root_gen, 0, 0, 1);
-				BUG_ON(ret);
-				continue;
-			}
+			WARN_ON(refs != 1);
+#endif
 		}
 		WARN_ON(*level <= 0);
 		if (path->nodes[*level-1])
@@ -2584,6 +2603,8 @@
 	root_owner = btrfs_header_owner(parent);
 	root_gen = btrfs_header_generation(parent);
 
+
+	mutex_lock(&root->fs_info->alloc_mutex);
 	ret = __btrfs_free_extent(trans, root, bytenr, blocksize,
 				  root_owner, root_gen, 0, 0, 1);
 	free_extent_buffer(path->nodes[*level]);
@@ -2591,6 +2612,7 @@
 	*level += 1;
 	BUG_ON(ret);
 	mutex_unlock(&root->fs_info->alloc_mutex);
+
 	cond_resched();
 	return 0;
 }
@@ -2834,6 +2856,11 @@
 		}
 		set_page_extent_mapped(page);
 
+		/*
+		 * make sure page_mkwrite is called for this page if userland
+		 * wants to change it from mmap
+		 */
+		clear_page_dirty_for_io(page);
 
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index c78f184..8915f2d 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -338,6 +338,13 @@
 		btrfs_drop_extent_cache(inode, start_pos, aligned_end - 1);
 		BUG_ON(err);
 		mutex_unlock(&BTRFS_I(inode)->extent_mutex);
+
+		/*
+		 * an ugly way to do all the prop accounting around
+		 * the page bits and mapping tags
+		 */
+		set_page_writeback(pages[0]);
+		end_page_writeback(pages[0]);
 		did_inline = 1;
 	}
 	if (end_pos > isize) {
@@ -833,11 +840,7 @@
 			      start_pos, last_pos - 1, GFP_NOFS);
 	}
 	for (i = 0; i < num_pages; i++) {
-#if LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,18)
-		ClearPageDirty(pages[i]);
-#else
-		cancel_dirty_page(pages[i], PAGE_CACHE_SIZE);
-#endif
+		clear_page_dirty_for_io(pages[i]);
 		set_page_extent_mapped(pages[i]);
 		WARN_ON(!PageLocked(pages[i]));
 	}
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index 5e62774..224da28 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -268,6 +268,12 @@
 		}
 		set_page_extent_mapped(page);
 
+		/*
+		 * this makes sure page_mkwrite is called on the
+		 * page if it is dirtied again later
+		 */
+		clear_page_dirty_for_io(page);
+
 		set_extent_delalloc(io_tree, page_start,
 				    page_end, GFP_NOFS);