Blame - fs/btrfs/tree-log.c - kernel/msm-4.19

blob: 9c462fbd60fac14ad17f192afb08c37794295d36 [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
Christoph Hellwig	b295086	2008-12-02 09:54:17 -0500	[diff] [blame]	26	#include "tree-log.h"
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	27
				28	/* magic values for the inode_only field in btrfs_log_inode:
				29	*
				30	* LOG_INODE_ALL means to log everything
				31	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				32	* during log replay
				33	*/
				34	#define LOG_INODE_ALL 0
				35	#define LOG_INODE_EXISTS 1
				36
				37	/*
				38	* stages for the tree walking. The first
				39	* stage (0) is to only pin down the blocks we find
				40	* the second stage (1) is to make sure that all the inodes
				41	* we find in the log are created in the subvolume.
				42	*
				43	* The last stage is to deal with directories and links and extents
				44	* and all the other fun semantics
				45	*/
				46	#define LOG_WALK_PIN_ONLY 0
				47	#define LOG_WALK_REPLAY_INODES 1
				48	#define LOG_WALK_REPLAY_ALL 2
				49
				50	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				51	struct btrfs_root root, struct inode inode,
				52	int inode_only);
Yan Zheng	ec051c0	2009-01-05 15:43:42 -0500	[diff] [blame]	53	static int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				54	struct btrfs_root *root,
				55	struct btrfs_path *path, u64 objectid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	56
				57	/*
				58	* tree logging is a special write ahead log used to make sure that
				59	* fsyncs and O_SYNCs can happen without doing full tree commits.
				60	*
				61	* Full tree commits are expensive because they require commonly
				62	* modified blocks to be recowed, creating many dirty pages in the
				63	* extent tree and a 4x-6x higher write load than ext3.
				64	*
				65	* Instead of doing a tree commit on every fsync, we use the
				66	* key ranges and transaction ids to find items for a given file or directory
				67	* that have changed in this transaction. Those items are copied into
				68	* a special tree (one per subvolume root), that tree is written to disk
				69	* and then the fsync is considered complete.
				70	*
				71	* After a crash, items are copied out of the log-tree back into the
				72	* subvolume tree. Any file data extents found are recorded in the extent
				73	* allocation tree, and the log-tree freed.
				74	*
				75	* The log tree is read three times: once to pin down all the extents it is
				76	* using in ram, once to create all the inodes logged in the tree,
				77	* and once to do all the other items.
				78	*/
				79
				80	/*
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	81	* start a sub transaction and setup the log tree
				82	* this increments the log tree writer count to make the people
				83	* syncing the tree wait for us to finish
				84	*/
				85	static int start_log_trans(struct btrfs_trans_handle *trans,
				86	struct btrfs_root *root)
				87	{
				88	int ret;
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	89
				90	mutex_lock(&root->log_mutex);
				91	if (root->log_root) {
				92	root->log_batch++;
				93	atomic_inc(&root->log_writers);
				94	mutex_unlock(&root->log_mutex);
				95	return 0;
				96	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	97	mutex_lock(&root->fs_info->tree_log_mutex);
				98	if (!root->fs_info->log_root_tree) {
				99	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				100	BUG_ON(ret);
				101	}
				102	if (!root->log_root) {
				103	ret = btrfs_add_log_tree(trans, root);
				104	BUG_ON(ret);
				105	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	106	mutex_unlock(&root->fs_info->tree_log_mutex);
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	107	root->log_batch++;
				108	atomic_inc(&root->log_writers);
				109	mutex_unlock(&root->log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	110	return 0;
				111	}
				112
				113	/*
				114	* returns 0 if there was a log transaction running and we were able
				115	* to join, or returns -ENOENT if there were not transactions
				116	* in progress
				117	*/
				118	static int join_running_log_trans(struct btrfs_root *root)
				119	{
				120	int ret = -ENOENT;
				121
				122	smp_mb();
				123	if (!root->log_root)
				124	return -ENOENT;
				125
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	126	mutex_lock(&root->log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	127	if (root->log_root) {
				128	ret = 0;
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	129	atomic_inc(&root->log_writers);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	130	}
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	131	mutex_unlock(&root->log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	132	return ret;
				133	}
				134
				135	/*
				136	* indicate we're done making changes to the log tree
				137	* and wake up anyone waiting to do a sync
				138	*/
				139	static int end_log_trans(struct btrfs_root *root)
				140	{
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	141	if (atomic_dec_and_test(&root->log_writers)) {
				142	smp_mb();
				143	if (waitqueue_active(&root->log_writer_wait))
				144	wake_up(&root->log_writer_wait);
				145	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	146	return 0;
				147	}
				148
				149
				150	/*
				151	* the walk control struct is used to pass state down the chain when
				152	* processing the log tree. The stage field tells us which part
				153	* of the log tree processing we are currently doing. The others
				154	* are state fields used for that specific part
				155	*/
				156	struct walk_control {
				157	/* should we free the extent on disk when done? This is used
				158	* at transaction commit time while freeing a log tree
				159	*/
				160	int free;
				161
				162	/* should we write out the extent buffer? This is used
				163	* while flushing the log tree to disk during a sync
				164	*/
				165	int write;
				166
				167	/* should we wait for the extent buffer io to finish? Also used
				168	* while flushing the log tree to disk for a sync
				169	*/
				170	int wait;
				171
				172	/* pin only walk, we record which extents on disk belong to the
				173	* log trees
				174	*/
				175	int pin;
				176
				177	/* what stage of the replay code we're currently in */
				178	int stage;
				179
				180	/* the root we are currently replaying */
				181	struct btrfs_root *replay_dest;
				182
				183	/* the trans handle for the current replay */
				184	struct btrfs_trans_handle *trans;
				185
				186	/* the function that gets used to process blocks we find in the
				187	* tree. Note the extent_buffer might not be up to date when it is
				188	* passed in, and it must be checked or read if you need the data
				189	* inside it
				190	*/
				191	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				192	struct walk_control *wc, u64 gen);
				193	};
				194
				195	/*
				196	* process_func used to pin down extents, write them or wait on them
				197	*/
				198	static int process_one_buffer(struct btrfs_root *log,
				199	struct extent_buffer *eb,
				200	struct walk_control *wc, u64 gen)
				201	{
				202	if (wc->pin) {
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	203	mutex_lock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	204	btrfs_update_pinned_extents(log->fs_info->extent_root,
				205	eb->start, eb->len, 1);
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	206	mutex_unlock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	207	}
				208
				209	if (btrfs_buffer_uptodate(eb, gen)) {
				210	if (wc->write)
				211	btrfs_write_tree_block(eb);
				212	if (wc->wait)
				213	btrfs_wait_tree_block_writeback(eb);
				214	}
				215	return 0;
				216	}
				217
				218	/*
				219	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				220	* to the src data we are copying out.
				221	*
				222	* root is the tree we are copying into, and path is a scratch
				223	* path for use in this function (it should be released on entry and
				224	* will be released on exit).
				225	*
				226	* If the key is already in the destination tree the existing item is
				227	* overwritten. If the existing item isn't big enough, it is extended.
				228	* If it is too large, it is truncated.
				229	*
				230	* If the key isn't in the destination yet, a new item is inserted.
				231	*/
				232	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				233	struct btrfs_root *root,
				234	struct btrfs_path *path,
				235	struct extent_buffer *eb, int slot,
				236	struct btrfs_key *key)
				237	{
				238	int ret;
				239	u32 item_size;
				240	u64 saved_i_size = 0;
				241	int save_old_i_size = 0;
				242	unsigned long src_ptr;
				243	unsigned long dst_ptr;
				244	int overwrite_root = 0;
				245
				246	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				247	overwrite_root = 1;
				248
				249	item_size = btrfs_item_size_nr(eb, slot);
				250	src_ptr = btrfs_item_ptr_offset(eb, slot);
				251
				252	/* look for the key in the destination tree */
				253	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				254	if (ret == 0) {
				255	char *src_copy;
				256	char *dst_copy;
				257	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				258	path->slots[0]);
				259	if (dst_size != item_size)
				260	goto insert;
				261
				262	if (item_size == 0) {
				263	btrfs_release_path(root, path);
				264	return 0;
				265	}
				266	dst_copy = kmalloc(item_size, GFP_NOFS);
				267	src_copy = kmalloc(item_size, GFP_NOFS);
				268
				269	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				270
				271	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				272	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				273	item_size);
				274	ret = memcmp(dst_copy, src_copy, item_size);
				275
				276	kfree(dst_copy);
				277	kfree(src_copy);
				278	/*
				279	* they have the same contents, just return, this saves
				280	* us from cowing blocks in the destination tree and doing
				281	* extra writes that may not have been done by a previous
				282	* sync
				283	*/
				284	if (ret == 0) {
				285	btrfs_release_path(root, path);
				286	return 0;
				287	}
				288
				289	}
				290	insert:
				291	btrfs_release_path(root, path);
				292	/* try to insert the key into the destination tree */
				293	ret = btrfs_insert_empty_item(trans, root, path,
				294	key, item_size);
				295
				296	/* make sure any existing item is the correct size */
				297	if (ret == -EEXIST) {
				298	u32 found_size;
				299	found_size = btrfs_item_size_nr(path->nodes[0],
				300	path->slots[0]);
				301	if (found_size > item_size) {
				302	btrfs_truncate_item(trans, root, path, item_size, 1);
				303	} else if (found_size < item_size) {
Yan Zheng	87b29b2	2008-12-17 10:21:48 -0500	[diff] [blame]	304	ret = btrfs_extend_item(trans, root, path,
				305	item_size - found_size);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	306	BUG_ON(ret);
				307	}
				308	} else if (ret) {
				309	BUG();
				310	}
				311	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				312	path->slots[0]);
				313
				314	/* don't overwrite an existing inode if the generation number
				315	* was logged as zero. This is done when the tree logging code
				316	* is just logging an inode to make sure it exists after recovery.
				317	*
				318	* Also, don't overwrite i_size on directories during replay.
				319	* log replay inserts and removes directory items based on the
				320	* state of the tree found in the subvolume, and i_size is modified
				321	* as it goes
				322	*/
				323	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				324	struct btrfs_inode_item *src_item;
				325	struct btrfs_inode_item *dst_item;
				326
				327	src_item = (struct btrfs_inode_item *)src_ptr;
				328	dst_item = (struct btrfs_inode_item *)dst_ptr;
				329
				330	if (btrfs_inode_generation(eb, src_item) == 0)
				331	goto no_copy;
				332
				333	if (overwrite_root &&
				334	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				335	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				336	save_old_i_size = 1;
				337	saved_i_size = btrfs_inode_size(path->nodes[0],
				338	dst_item);
				339	}
				340	}
				341
				342	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				343	src_ptr, item_size);
				344
				345	if (save_old_i_size) {
				346	struct btrfs_inode_item *dst_item;
				347	dst_item = (struct btrfs_inode_item *)dst_ptr;
				348	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				349	}
				350
				351	/* make sure the generation is filled in */
				352	if (key->type == BTRFS_INODE_ITEM_KEY) {
				353	struct btrfs_inode_item *dst_item;
				354	dst_item = (struct btrfs_inode_item *)dst_ptr;
				355	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				356	btrfs_set_inode_generation(path->nodes[0], dst_item,
				357	trans->transid);
				358	}
				359	}
				360	no_copy:
				361	btrfs_mark_buffer_dirty(path->nodes[0]);
				362	btrfs_release_path(root, path);
				363	return 0;
				364	}
				365
				366	/*
				367	* simple helper to read an inode off the disk from a given root
				368	* This can only be called for subvolume roots and not for the log
				369	*/
				370	static noinline struct inode read_one_inode(struct btrfs_root root,
				371	u64 objectid)
				372	{
				373	struct inode *inode;
				374	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				375	if (inode->i_state & I_NEW) {
				376	BTRFS_I(inode)->root = root;
				377	BTRFS_I(inode)->location.objectid = objectid;
				378	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				379	BTRFS_I(inode)->location.offset = 0;
				380	btrfs_read_locked_inode(inode);
				381	unlock_new_inode(inode);
				382
				383	}
				384	if (is_bad_inode(inode)) {
				385	iput(inode);
				386	inode = NULL;
				387	}
				388	return inode;
				389	}
				390
				391	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				392	* subvolume 'root'. path is released on entry and should be released
				393	* on exit.
				394	*
				395	* extents in the log tree have not been allocated out of the extent
				396	* tree yet. So, this completes the allocation, taking a reference
				397	* as required if the extent already exists or creating a new extent
				398	* if it isn't in the extent allocation tree yet.
				399	*
				400	* The extent is inserted into the file, dropping any existing extents
				401	* from the file that overlap the new one.
				402	*/
				403	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				404	struct btrfs_root *root,
				405	struct btrfs_path *path,
				406	struct extent_buffer *eb, int slot,
				407	struct btrfs_key *key)
				408	{
				409	int found_type;
				410	u64 mask = root->sectorsize - 1;
				411	u64 extent_end;
				412	u64 alloc_hint;
				413	u64 start = key->offset;
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	414	u64 saved_nbytes;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	415	struct btrfs_file_extent_item *item;
				416	struct inode *inode = NULL;
				417	unsigned long size;
				418	int ret = 0;
				419
				420	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				421	found_type = btrfs_file_extent_type(eb, item);
				422
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	423	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				424	found_type == BTRFS_FILE_EXTENT_PREALLOC)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	425	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				426	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Chris Mason	c8b9781	2008-10-29 14:49:59 -0400	[diff] [blame]	427	size = btrfs_file_extent_inline_len(eb, item);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	428	extent_end = (start + size + mask) & ~mask;
				429	} else {
				430	ret = 0;
				431	goto out;
				432	}
				433
				434	inode = read_one_inode(root, key->objectid);
				435	if (!inode) {
				436	ret = -EIO;
				437	goto out;
				438	}
				439
				440	/*
				441	* first check to see if we already have this extent in the
				442	* file. This must be done before the btrfs_drop_extents run
				443	* so we don't try to drop this extent.
				444	*/
				445	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				446	start, 0);
				447
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	448	if (ret == 0 &&
				449	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				450	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	451	struct btrfs_file_extent_item cmp1;
				452	struct btrfs_file_extent_item cmp2;
				453	struct btrfs_file_extent_item *existing;
				454	struct extent_buffer *leaf;
				455
				456	leaf = path->nodes[0];
				457	existing = btrfs_item_ptr(leaf, path->slots[0],
				458	struct btrfs_file_extent_item);
				459
				460	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				461	sizeof(cmp1));
				462	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				463	sizeof(cmp2));
				464
				465	/*
				466	* we already have a pointer to this exact extent,
				467	* we don't have to do anything
				468	*/
				469	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				470	btrfs_release_path(root, path);
				471	goto out;
				472	}
				473	}
				474	btrfs_release_path(root, path);
				475
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	476	saved_nbytes = inode_get_bytes(inode);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	477	/* drop any overlapping extents */
				478	ret = btrfs_drop_extents(trans, root, inode,
				479	start, extent_end, start, &alloc_hint);
				480	BUG_ON(ret);
				481
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	482	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				483	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
				484	unsigned long dest_offset;
				485	struct btrfs_key ins;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	486
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	487	ret = btrfs_insert_empty_item(trans, root, path, key,
				488	sizeof(*item));
				489	BUG_ON(ret);
				490	dest_offset = btrfs_item_ptr_offset(path->nodes[0],
				491	path->slots[0]);
				492	copy_extent_buffer(path->nodes[0], eb, dest_offset,
				493	(unsigned long)item, sizeof(*item));
				494
				495	ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				496	ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				497	ins.type = BTRFS_EXTENT_ITEM_KEY;
				498
				499	if (ins.objectid > 0) {
				500	u64 csum_start;
				501	u64 csum_end;
				502	LIST_HEAD(ordered_sums);
				503	/*
				504	* is this extent already allocated in the extent
				505	* allocation tree? If so, just add a reference
				506	*/
				507	ret = btrfs_lookup_extent(root, ins.objectid,
				508	ins.offset);
				509	if (ret == 0) {
				510	ret = btrfs_inc_extent_ref(trans, root,
				511	ins.objectid, ins.offset,
				512	path->nodes[0]->start,
				513	root->root_key.objectid,
				514	trans->transid, key->objectid);
				515	} else {
				516	/*
				517	* insert the extent pointer in the extent
				518	* allocation tree
				519	*/
				520	ret = btrfs_alloc_logged_extent(trans, root,
				521	path->nodes[0]->start,
				522	root->root_key.objectid,
				523	trans->transid, key->objectid,
				524	&ins);
				525	BUG_ON(ret);
				526	}
				527	btrfs_release_path(root, path);
				528
				529	if (btrfs_file_extent_compression(eb, item)) {
				530	csum_start = ins.objectid;
				531	csum_end = csum_start + ins.offset;
				532	} else {
				533	csum_start = ins.objectid +
				534	btrfs_file_extent_offset(eb, item);
				535	csum_end = csum_start +
				536	btrfs_file_extent_num_bytes(eb, item);
				537	}
				538
				539	ret = btrfs_lookup_csums_range(root->log_root,
				540	csum_start, csum_end - 1,
				541	&ordered_sums);
				542	BUG_ON(ret);
				543	while (!list_empty(&ordered_sums)) {
				544	struct btrfs_ordered_sum *sums;
				545	sums = list_entry(ordered_sums.next,
				546	struct btrfs_ordered_sum,
				547	list);
				548	ret = btrfs_csum_file_blocks(trans,
				549	root->fs_info->csum_root,
				550	sums);
				551	BUG_ON(ret);
				552	list_del(&sums->list);
				553	kfree(sums);
				554	}
				555	} else {
				556	btrfs_release_path(root, path);
				557	}
				558	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				559	/* inline extents are easy, we just overwrite them */
				560	ret = overwrite_item(trans, root, path, eb, slot, key);
				561	BUG_ON(ret);
				562	}
				563
				564	inode_set_bytes(inode, saved_nbytes);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	565	btrfs_update_inode(trans, root, inode);
				566	out:
				567	if (inode)
				568	iput(inode);
				569	return ret;
				570	}
				571
				572	/*
				573	* when cleaning up conflicts between the directory names in the
				574	* subvolume, directory names in the log and directory names in the
				575	* inode back references, we may have to unlink inodes from directories.
				576	*
				577	* This is a helper function to do the unlink of a specific directory
				578	* item
				579	*/
				580	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				581	struct btrfs_root *root,
				582	struct btrfs_path *path,
				583	struct inode *dir,
				584	struct btrfs_dir_item *di)
				585	{
				586	struct inode *inode;
				587	char *name;
				588	int name_len;
				589	struct extent_buffer *leaf;
				590	struct btrfs_key location;
				591	int ret;
				592
				593	leaf = path->nodes[0];
				594
				595	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				596	name_len = btrfs_dir_name_len(leaf, di);
				597	name = kmalloc(name_len, GFP_NOFS);
				598	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				599	btrfs_release_path(root, path);
				600
				601	inode = read_one_inode(root, location.objectid);
				602	BUG_ON(!inode);
				603
Yan Zheng	ec051c0	2009-01-05 15:43:42 -0500	[diff] [blame]	604	ret = link_to_fixup_dir(trans, root, path, location.objectid);
				605	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	606	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
Yan Zheng	ec051c0	2009-01-05 15:43:42 -0500	[diff] [blame]	607	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	608	kfree(name);
				609
				610	iput(inode);
				611	return ret;
				612	}
				613
				614	/*
				615	* helper function to see if a given name and sequence number found
				616	* in an inode back reference are already in a directory and correctly
				617	* point to this inode
				618	*/
				619	static noinline int inode_in_dir(struct btrfs_root *root,
				620	struct btrfs_path *path,
				621	u64 dirid, u64 objectid, u64 index,
				622	const char *name, int name_len)
				623	{
				624	struct btrfs_dir_item *di;
				625	struct btrfs_key location;
				626	int match = 0;
				627
				628	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				629	index, name, name_len, 0);
				630	if (di && !IS_ERR(di)) {
				631	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				632	if (location.objectid != objectid)
				633	goto out;
				634	} else
				635	goto out;
				636	btrfs_release_path(root, path);
				637
				638	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				639	if (di && !IS_ERR(di)) {
				640	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				641	if (location.objectid != objectid)
				642	goto out;
				643	} else
				644	goto out;
				645	match = 1;
				646	out:
				647	btrfs_release_path(root, path);
				648	return match;
				649	}
				650
				651	/*
				652	* helper function to check a log tree for a named back reference in
				653	* an inode. This is used to decide if a back reference that is
				654	* found in the subvolume conflicts with what we find in the log.
				655	*
				656	* inode backreferences may have multiple refs in a single item,
				657	* during replay we process one reference at a time, and we don't
				658	* want to delete valid links to a file from the subvolume if that
				659	* link is also in the log.
				660	*/
				661	static noinline int backref_in_log(struct btrfs_root *log,
				662	struct btrfs_key *key,
				663	char *name, int namelen)
				664	{
				665	struct btrfs_path *path;
				666	struct btrfs_inode_ref *ref;
				667	unsigned long ptr;
				668	unsigned long ptr_end;
				669	unsigned long name_ptr;
				670	int found_name_len;
				671	int item_size;
				672	int ret;
				673	int match = 0;
				674
				675	path = btrfs_alloc_path();
				676	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				677	if (ret != 0)
				678	goto out;
				679
				680	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				681	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				682	ptr_end = ptr + item_size;
				683	while (ptr < ptr_end) {
				684	ref = (struct btrfs_inode_ref *)ptr;
				685	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				686	if (found_name_len == namelen) {
				687	name_ptr = (unsigned long)(ref + 1);
				688	ret = memcmp_extent_buffer(path->nodes[0], name,
				689	name_ptr, namelen);
				690	if (ret == 0) {
				691	match = 1;
				692	goto out;
				693	}
				694	}
				695	ptr = (unsigned long)(ref + 1) + found_name_len;
				696	}
				697	out:
				698	btrfs_free_path(path);
				699	return match;
				700	}
				701
				702
				703	/*
				704	* replay one inode back reference item found in the log tree.
				705	* eb, slot and key refer to the buffer and key found in the log tree.
				706	* root is the destination we are replaying into, and path is for temp
				707	* use by this function. (it should be released on return).
				708	*/
				709	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				710	struct btrfs_root *root,
				711	struct btrfs_root *log,
				712	struct btrfs_path *path,
				713	struct extent_buffer *eb, int slot,
				714	struct btrfs_key *key)
				715	{
				716	struct inode *dir;
				717	int ret;
				718	struct btrfs_key location;
				719	struct btrfs_inode_ref *ref;
				720	struct btrfs_dir_item *di;
				721	struct inode *inode;
				722	char *name;
				723	int namelen;
				724	unsigned long ref_ptr;
				725	unsigned long ref_end;
				726
				727	location.objectid = key->objectid;
				728	location.type = BTRFS_INODE_ITEM_KEY;
				729	location.offset = 0;
				730
				731	/*
				732	* it is possible that we didn't log all the parent directories
				733	* for a given inode. If we don't find the dir, just don't
				734	* copy the back ref in. The link count fixup code will take
				735	* care of the rest
				736	*/
				737	dir = read_one_inode(root, key->offset);
				738	if (!dir)
				739	return -ENOENT;
				740
				741	inode = read_one_inode(root, key->objectid);
				742	BUG_ON(!dir);
				743
				744	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				745	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				746
				747	again:
				748	ref = (struct btrfs_inode_ref *)ref_ptr;
				749
				750	namelen = btrfs_inode_ref_name_len(eb, ref);
				751	name = kmalloc(namelen, GFP_NOFS);
				752	BUG_ON(!name);
				753
				754	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				755
				756	/* if we already have a perfect match, we're done */
				757	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				758	btrfs_inode_ref_index(eb, ref),
				759	name, namelen)) {
				760	goto out;
				761	}
				762
				763	/*
				764	* look for a conflicting back reference in the metadata.
				765	* if we find one we have to unlink that name of the file
				766	* before we add our new link. Later on, we overwrite any
				767	* existing back reference, and we don't want to create
				768	* dangling pointers in the directory.
				769	*/
				770	conflict_again:
				771	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				772	if (ret == 0) {
				773	char *victim_name;
				774	int victim_name_len;
				775	struct btrfs_inode_ref *victim_ref;
				776	unsigned long ptr;
				777	unsigned long ptr_end;
				778	struct extent_buffer *leaf = path->nodes[0];
				779
				780	/* are we trying to overwrite a back ref for the root directory
				781	* if so, just jump out, we're done
				782	*/
				783	if (key->objectid == key->offset)
				784	goto out_nowrite;
				785
				786	/* check all the names in this back reference to see
				787	* if they are in the log. if so, we allow them to stay
				788	* otherwise they must be unlinked as a conflict
				789	*/
				790	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				791	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	792	while (ptr < ptr_end) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	793	victim_ref = (struct btrfs_inode_ref *)ptr;
				794	victim_name_len = btrfs_inode_ref_name_len(leaf,
				795	victim_ref);
				796	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				797	BUG_ON(!victim_name);
				798
				799	read_extent_buffer(leaf, victim_name,
				800	(unsigned long)(victim_ref + 1),
				801	victim_name_len);
				802
				803	if (!backref_in_log(log, key, victim_name,
				804	victim_name_len)) {
				805	btrfs_inc_nlink(inode);
				806	btrfs_release_path(root, path);
				807	ret = btrfs_unlink_inode(trans, root, dir,
				808	inode, victim_name,
				809	victim_name_len);
				810	kfree(victim_name);
				811	btrfs_release_path(root, path);
				812	goto conflict_again;
				813	}
				814	kfree(victim_name);
				815	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				816	}
				817	BUG_ON(ret);
				818	}
				819	btrfs_release_path(root, path);
				820
				821	/* look for a conflicting sequence number */
				822	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				823	btrfs_inode_ref_index(eb, ref),
				824	name, namelen, 0);
				825	if (di && !IS_ERR(di)) {
				826	ret = drop_one_dir_item(trans, root, path, dir, di);
				827	BUG_ON(ret);
				828	}
				829	btrfs_release_path(root, path);
				830
				831
				832	/* look for a conflicting name */
				833	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				834	name, namelen, 0);
				835	if (di && !IS_ERR(di)) {
				836	ret = drop_one_dir_item(trans, root, path, dir, di);
				837	BUG_ON(ret);
				838	}
				839	btrfs_release_path(root, path);
				840
				841	/* insert our name */
				842	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				843	btrfs_inode_ref_index(eb, ref));
				844	BUG_ON(ret);
				845
				846	btrfs_update_inode(trans, root, inode);
				847
				848	out:
				849	ref_ptr = (unsigned long)(ref + 1) + namelen;
				850	kfree(name);
				851	if (ref_ptr < ref_end)
				852	goto again;
				853
				854	/* finally write the back reference in the inode */
				855	ret = overwrite_item(trans, root, path, eb, slot, key);
				856	BUG_ON(ret);
				857
				858	out_nowrite:
				859	btrfs_release_path(root, path);
				860	iput(dir);
				861	iput(inode);
				862	return 0;
				863	}
				864
				865	/*
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	866	* There are a few corners where the link count of the file can't
				867	* be properly maintained during replay. So, instead of adding
				868	* lots of complexity to the log code, we just scan the backrefs
				869	* for any file that has been through replay.
				870	*
				871	* The scan will update the link count on the inode to reflect the
				872	* number of back refs found. If it goes down to zero, the iput
				873	* will free the inode.
				874	*/
				875	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				876	struct btrfs_root *root,
				877	struct inode *inode)
				878	{
				879	struct btrfs_path *path;
				880	int ret;
				881	struct btrfs_key key;
				882	u64 nlink = 0;
				883	unsigned long ptr;
				884	unsigned long ptr_end;
				885	int name_len;
				886
				887	key.objectid = inode->i_ino;
				888	key.type = BTRFS_INODE_REF_KEY;
				889	key.offset = (u64)-1;
				890
				891	path = btrfs_alloc_path();
				892
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	893	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	894	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				895	if (ret < 0)
				896	break;
				897	if (ret > 0) {
				898	if (path->slots[0] == 0)
				899	break;
				900	path->slots[0]--;
				901	}
				902	btrfs_item_key_to_cpu(path->nodes[0], &key,
				903	path->slots[0]);
				904	if (key.objectid != inode->i_ino \|\|
				905	key.type != BTRFS_INODE_REF_KEY)
				906	break;
				907	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				908	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				909	path->slots[0]);
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	910	while (ptr < ptr_end) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	911	struct btrfs_inode_ref *ref;
				912
				913	ref = (struct btrfs_inode_ref *)ptr;
				914	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				915	ref);
				916	ptr = (unsigned long)(ref + 1) + name_len;
				917	nlink++;
				918	}
				919
				920	if (key.offset == 0)
				921	break;
				922	key.offset--;
				923	btrfs_release_path(root, path);
				924	}
				925	btrfs_free_path(path);
				926	if (nlink != inode->i_nlink) {
				927	inode->i_nlink = nlink;
				928	btrfs_update_inode(trans, root, inode);
				929	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	930	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	931
				932	return 0;
				933	}
				934
				935	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				936	struct btrfs_root *root,
				937	struct btrfs_path *path)
				938	{
				939	int ret;
				940	struct btrfs_key key;
				941	struct inode *inode;
				942
				943	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				944	key.type = BTRFS_ORPHAN_ITEM_KEY;
				945	key.offset = (u64)-1;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	946	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	947	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				948	if (ret < 0)
				949	break;
				950
				951	if (ret == 1) {
				952	if (path->slots[0] == 0)
				953	break;
				954	path->slots[0]--;
				955	}
				956
				957	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				958	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				959	key.type != BTRFS_ORPHAN_ITEM_KEY)
				960	break;
				961
				962	ret = btrfs_del_item(trans, root, path);
				963	BUG_ON(ret);
				964
				965	btrfs_release_path(root, path);
				966	inode = read_one_inode(root, key.offset);
				967	BUG_ON(!inode);
				968
				969	ret = fixup_inode_link_count(trans, root, inode);
				970	BUG_ON(ret);
				971
				972	iput(inode);
				973
				974	if (key.offset == 0)
				975	break;
				976	key.offset--;
				977	}
				978	btrfs_release_path(root, path);
				979	return 0;
				980	}
				981
				982
				983	/*
				984	* record a given inode in the fixup dir so we can check its link
				985	* count when replay is done. The link count is incremented here
				986	* so the inode won't go away until we check it
				987	*/
				988	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				989	struct btrfs_root *root,
				990	struct btrfs_path *path,
				991	u64 objectid)
				992	{
				993	struct btrfs_key key;
				994	int ret = 0;
				995	struct inode *inode;
				996
				997	inode = read_one_inode(root, objectid);
				998	BUG_ON(!inode);
				999
				1000	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1001	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1002	key.offset = objectid;
				1003
				1004	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1005
				1006	btrfs_release_path(root, path);
				1007	if (ret == 0) {
				1008	btrfs_inc_nlink(inode);
				1009	btrfs_update_inode(trans, root, inode);
				1010	} else if (ret == -EEXIST) {
				1011	ret = 0;
				1012	} else {
				1013	BUG();
				1014	}
				1015	iput(inode);
				1016
				1017	return ret;
				1018	}
				1019
				1020	/*
				1021	* when replaying the log for a directory, we only insert names
				1022	* for inodes that actually exist. This means an fsync on a directory
				1023	* does not implicitly fsync all the new files in it
				1024	*/
				1025	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1026	struct btrfs_root *root,
				1027	struct btrfs_path *path,
				1028	u64 dirid, u64 index,
				1029	char *name, int name_len, u8 type,
				1030	struct btrfs_key *location)
				1031	{
				1032	struct inode *inode;
				1033	struct inode *dir;
				1034	int ret;
				1035
				1036	inode = read_one_inode(root, location->objectid);
				1037	if (!inode)
				1038	return -ENOENT;
				1039
				1040	dir = read_one_inode(root, dirid);
				1041	if (!dir) {
				1042	iput(inode);
				1043	return -EIO;
				1044	}
				1045	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1046
				1047	/* FIXME, put inode into FIXUP list */
				1048
				1049	iput(inode);
				1050	iput(dir);
				1051	return ret;
				1052	}
				1053
				1054	/*
				1055	* take a single entry in a log directory item and replay it into
				1056	* the subvolume.
				1057	*
				1058	* if a conflicting item exists in the subdirectory already,
				1059	* the inode it points to is unlinked and put into the link count
				1060	* fix up tree.
				1061	*
				1062	* If a name from the log points to a file or directory that does
				1063	* not exist in the FS, it is skipped. fsyncs on directories
				1064	* do not force down inodes inside that directory, just changes to the
				1065	* names or unlinks in a directory.
				1066	*/
				1067	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1068	struct btrfs_root *root,
				1069	struct btrfs_path *path,
				1070	struct extent_buffer *eb,
				1071	struct btrfs_dir_item *di,
				1072	struct btrfs_key *key)
				1073	{
				1074	char *name;
				1075	int name_len;
				1076	struct btrfs_dir_item *dst_di;
				1077	struct btrfs_key found_key;
				1078	struct btrfs_key log_key;
				1079	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1080	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1081	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1082	int ret;
				1083
				1084	dir = read_one_inode(root, key->objectid);
				1085	BUG_ON(!dir);
				1086
				1087	name_len = btrfs_dir_name_len(eb, di);
				1088	name = kmalloc(name_len, GFP_NOFS);
				1089	log_type = btrfs_dir_type(eb, di);
				1090	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1091	name_len);
				1092
				1093	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1094	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1095	if (exists == 0)
				1096	exists = 1;
				1097	else
				1098	exists = 0;
				1099	btrfs_release_path(root, path);
				1100
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1101	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1102	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1103	name, name_len, 1);
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1104	} else if (key->type == BTRFS_DIR_INDEX_KEY) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1105	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1106	key->objectid,
				1107	key->offset, name,
				1108	name_len, 1);
				1109	} else {
				1110	BUG();
				1111	}
				1112	if (!dst_di \|\| IS_ERR(dst_di)) {
				1113	/* we need a sequence number to insert, so we only
				1114	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1115	*/
				1116	if (key->type != BTRFS_DIR_INDEX_KEY)
				1117	goto out;
				1118	goto insert;
				1119	}
				1120
				1121	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1122	/* the existing item matches the logged item */
				1123	if (found_key.objectid == log_key.objectid &&
				1124	found_key.type == log_key.type &&
				1125	found_key.offset == log_key.offset &&
				1126	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1127	goto out;
				1128	}
				1129
				1130	/*
				1131	* don't drop the conflicting directory entry if the inode
				1132	* for the new entry doesn't exist
				1133	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1134	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1135	goto out;
				1136
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1137	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1138	BUG_ON(ret);
				1139
				1140	if (key->type == BTRFS_DIR_INDEX_KEY)
				1141	goto insert;
				1142	out:
				1143	btrfs_release_path(root, path);
				1144	kfree(name);
				1145	iput(dir);
				1146	return 0;
				1147
				1148	insert:
				1149	btrfs_release_path(root, path);
				1150	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1151	name, name_len, log_type, &log_key);
				1152
				1153	if (ret && ret != -ENOENT)
				1154	BUG();
				1155	goto out;
				1156	}
				1157
				1158	/*
				1159	* find all the names in a directory item and reconcile them into
				1160	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1161	* one name in a directory item, but the same code gets used for
				1162	* both directory index types
				1163	*/
				1164	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1165	struct btrfs_root *root,
				1166	struct btrfs_path *path,
				1167	struct extent_buffer *eb, int slot,
				1168	struct btrfs_key *key)
				1169	{
				1170	int ret;
				1171	u32 item_size = btrfs_item_size_nr(eb, slot);
				1172	struct btrfs_dir_item *di;
				1173	int name_len;
				1174	unsigned long ptr;
				1175	unsigned long ptr_end;
				1176
				1177	ptr = btrfs_item_ptr_offset(eb, slot);
				1178	ptr_end = ptr + item_size;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1179	while (ptr < ptr_end) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1180	di = (struct btrfs_dir_item *)ptr;
				1181	name_len = btrfs_dir_name_len(eb, di);
				1182	ret = replay_one_name(trans, root, path, eb, di, key);
				1183	BUG_ON(ret);
				1184	ptr = (unsigned long)(di + 1);
				1185	ptr += name_len;
				1186	}
				1187	return 0;
				1188	}
				1189
				1190	/*
				1191	* directory replay has two parts. There are the standard directory
				1192	* items in the log copied from the subvolume, and range items
				1193	* created in the log while the subvolume was logged.
				1194	*
				1195	* The range items tell us which parts of the key space the log
				1196	* is authoritative for. During replay, if a key in the subvolume
				1197	* directory is in a logged range item, but not actually in the log
				1198	* that means it was deleted from the directory before the fsync
				1199	* and should be removed.
				1200	*/
				1201	static noinline int find_dir_range(struct btrfs_root *root,
				1202	struct btrfs_path *path,
				1203	u64 dirid, int key_type,
				1204	u64 start_ret, u64 end_ret)
				1205	{
				1206	struct btrfs_key key;
				1207	u64 found_end;
				1208	struct btrfs_dir_log_item *item;
				1209	int ret;
				1210	int nritems;
				1211
				1212	if (*start_ret == (u64)-1)
				1213	return 1;
				1214
				1215	key.objectid = dirid;
				1216	key.type = key_type;
				1217	key.offset = *start_ret;
				1218
				1219	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1220	if (ret < 0)
				1221	goto out;
				1222	if (ret > 0) {
				1223	if (path->slots[0] == 0)
				1224	goto out;
				1225	path->slots[0]--;
				1226	}
				1227	if (ret != 0)
				1228	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1229
				1230	if (key.type != key_type \|\| key.objectid != dirid) {
				1231	ret = 1;
				1232	goto next;
				1233	}
				1234	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1235	struct btrfs_dir_log_item);
				1236	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1237
				1238	if (start_ret >= key.offset && start_ret <= found_end) {
				1239	ret = 0;
				1240	*start_ret = key.offset;
				1241	*end_ret = found_end;
				1242	goto out;
				1243	}
				1244	ret = 1;
				1245	next:
				1246	/* check the next slot in the tree to see if it is a valid item */
				1247	nritems = btrfs_header_nritems(path->nodes[0]);
				1248	if (path->slots[0] >= nritems) {
				1249	ret = btrfs_next_leaf(root, path);
				1250	if (ret)
				1251	goto out;
				1252	} else {
				1253	path->slots[0]++;
				1254	}
				1255
				1256	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1257
				1258	if (key.type != key_type \|\| key.objectid != dirid) {
				1259	ret = 1;
				1260	goto out;
				1261	}
				1262	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1263	struct btrfs_dir_log_item);
				1264	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1265	*start_ret = key.offset;
				1266	*end_ret = found_end;
				1267	ret = 0;
				1268	out:
				1269	btrfs_release_path(root, path);
				1270	return ret;
				1271	}
				1272
				1273	/*
				1274	* this looks for a given directory item in the log. If the directory
				1275	* item is not in the log, the item is removed and the inode it points
				1276	* to is unlinked
				1277	*/
				1278	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1279	struct btrfs_root *root,
				1280	struct btrfs_root *log,
				1281	struct btrfs_path *path,
				1282	struct btrfs_path *log_path,
				1283	struct inode *dir,
				1284	struct btrfs_key *dir_key)
				1285	{
				1286	int ret;
				1287	struct extent_buffer *eb;
				1288	int slot;
				1289	u32 item_size;
				1290	struct btrfs_dir_item *di;
				1291	struct btrfs_dir_item *log_di;
				1292	int name_len;
				1293	unsigned long ptr;
				1294	unsigned long ptr_end;
				1295	char *name;
				1296	struct inode *inode;
				1297	struct btrfs_key location;
				1298
				1299	again:
				1300	eb = path->nodes[0];
				1301	slot = path->slots[0];
				1302	item_size = btrfs_item_size_nr(eb, slot);
				1303	ptr = btrfs_item_ptr_offset(eb, slot);
				1304	ptr_end = ptr + item_size;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1305	while (ptr < ptr_end) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1306	di = (struct btrfs_dir_item *)ptr;
				1307	name_len = btrfs_dir_name_len(eb, di);
				1308	name = kmalloc(name_len, GFP_NOFS);
				1309	if (!name) {
				1310	ret = -ENOMEM;
				1311	goto out;
				1312	}
				1313	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1314	name_len);
				1315	log_di = NULL;
				1316	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1317	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1318	dir_key->objectid,
				1319	name, name_len, 0);
				1320	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1321	log_di = btrfs_lookup_dir_index_item(trans, log,
				1322	log_path,
				1323	dir_key->objectid,
				1324	dir_key->offset,
				1325	name, name_len, 0);
				1326	}
				1327	if (!log_di \|\| IS_ERR(log_di)) {
				1328	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1329	btrfs_release_path(root, path);
				1330	btrfs_release_path(log, log_path);
				1331	inode = read_one_inode(root, location.objectid);
				1332	BUG_ON(!inode);
				1333
				1334	ret = link_to_fixup_dir(trans, root,
				1335	path, location.objectid);
				1336	BUG_ON(ret);
				1337	btrfs_inc_nlink(inode);
				1338	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1339	name, name_len);
				1340	BUG_ON(ret);
				1341	kfree(name);
				1342	iput(inode);
				1343
				1344	/* there might still be more names under this key
				1345	* check and repeat if required
				1346	*/
				1347	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1348	0, 0);
				1349	if (ret == 0)
				1350	goto again;
				1351	ret = 0;
				1352	goto out;
				1353	}
				1354	btrfs_release_path(log, log_path);
				1355	kfree(name);
				1356
				1357	ptr = (unsigned long)(di + 1);
				1358	ptr += name_len;
				1359	}
				1360	ret = 0;
				1361	out:
				1362	btrfs_release_path(root, path);
				1363	btrfs_release_path(log, log_path);
				1364	return ret;
				1365	}
				1366
				1367	/*
				1368	* deletion replay happens before we copy any new directory items
				1369	* out of the log or out of backreferences from inodes. It
				1370	* scans the log to find ranges of keys that log is authoritative for,
				1371	* and then scans the directory to find items in those ranges that are
				1372	* not present in the log.
				1373	*
				1374	* Anything we don't find in the log is unlinked and removed from the
				1375	* directory.
				1376	*/
				1377	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1378	struct btrfs_root *root,
				1379	struct btrfs_root *log,
				1380	struct btrfs_path *path,
				1381	u64 dirid)
				1382	{
				1383	u64 range_start;
				1384	u64 range_end;
				1385	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1386	int ret = 0;
				1387	struct btrfs_key dir_key;
				1388	struct btrfs_key found_key;
				1389	struct btrfs_path *log_path;
				1390	struct inode *dir;
				1391
				1392	dir_key.objectid = dirid;
				1393	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1394	log_path = btrfs_alloc_path();
				1395	if (!log_path)
				1396	return -ENOMEM;
				1397
				1398	dir = read_one_inode(root, dirid);
				1399	/* it isn't an error if the inode isn't there, that can happen
				1400	* because we replay the deletes before we copy in the inode item
				1401	* from the log
				1402	*/
				1403	if (!dir) {
				1404	btrfs_free_path(log_path);
				1405	return 0;
				1406	}
				1407	again:
				1408	range_start = 0;
				1409	range_end = 0;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1410	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1411	ret = find_dir_range(log, path, dirid, key_type,
				1412	&range_start, &range_end);
				1413	if (ret != 0)
				1414	break;
				1415
				1416	dir_key.offset = range_start;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1417	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1418	int nritems;
				1419	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1420	0, 0);
				1421	if (ret < 0)
				1422	goto out;
				1423
				1424	nritems = btrfs_header_nritems(path->nodes[0]);
				1425	if (path->slots[0] >= nritems) {
				1426	ret = btrfs_next_leaf(root, path);
				1427	if (ret)
				1428	break;
				1429	}
				1430	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1431	path->slots[0]);
				1432	if (found_key.objectid != dirid \|\|
				1433	found_key.type != dir_key.type)
				1434	goto next_type;
				1435
				1436	if (found_key.offset > range_end)
				1437	break;
				1438
				1439	ret = check_item_in_log(trans, root, log, path,
				1440	log_path, dir, &found_key);
				1441	BUG_ON(ret);
				1442	if (found_key.offset == (u64)-1)
				1443	break;
				1444	dir_key.offset = found_key.offset + 1;
				1445	}
				1446	btrfs_release_path(root, path);
				1447	if (range_end == (u64)-1)
				1448	break;
				1449	range_start = range_end + 1;
				1450	}
				1451
				1452	next_type:
				1453	ret = 0;
				1454	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1455	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1456	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1457	btrfs_release_path(root, path);
				1458	goto again;
				1459	}
				1460	out:
				1461	btrfs_release_path(root, path);
				1462	btrfs_free_path(log_path);
				1463	iput(dir);
				1464	return ret;
				1465	}
				1466
				1467	/*
				1468	* the process_func used to replay items from the log tree. This
				1469	* gets called in two different stages. The first stage just looks
				1470	* for inodes and makes sure they are all copied into the subvolume.
				1471	*
				1472	* The second stage copies all the other item types from the log into
				1473	* the subvolume. The two stage approach is slower, but gets rid of
				1474	* lots of complexity around inodes referencing other inodes that exist
				1475	* only in the log (references come from either directory items or inode
				1476	* back refs).
				1477	*/
				1478	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1479	struct walk_control *wc, u64 gen)
				1480	{
				1481	int nritems;
				1482	struct btrfs_path *path;
				1483	struct btrfs_root *root = wc->replay_dest;
				1484	struct btrfs_key key;
				1485	u32 item_size;
				1486	int level;
				1487	int i;
				1488	int ret;
				1489
				1490	btrfs_read_buffer(eb, gen);
				1491
				1492	level = btrfs_header_level(eb);
				1493
				1494	if (level != 0)
				1495	return 0;
				1496
				1497	path = btrfs_alloc_path();
				1498	BUG_ON(!path);
				1499
				1500	nritems = btrfs_header_nritems(eb);
				1501	for (i = 0; i < nritems; i++) {
				1502	btrfs_item_key_to_cpu(eb, &key, i);
				1503	item_size = btrfs_item_size_nr(eb, i);
				1504
				1505	/* inode keys are done during the first stage */
				1506	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1507	wc->stage == LOG_WALK_REPLAY_INODES) {
				1508	struct inode *inode;
				1509	struct btrfs_inode_item *inode_item;
				1510	u32 mode;
				1511
				1512	inode_item = btrfs_item_ptr(eb, i,
				1513	struct btrfs_inode_item);
				1514	mode = btrfs_inode_mode(eb, inode_item);
				1515	if (S_ISDIR(mode)) {
				1516	ret = replay_dir_deletes(wc->trans,
				1517	root, log, path, key.objectid);
				1518	BUG_ON(ret);
				1519	}
				1520	ret = overwrite_item(wc->trans, root, path,
				1521	eb, i, &key);
				1522	BUG_ON(ret);
				1523
				1524	/* for regular files, truncate away
				1525	* extents past the new EOF
				1526	*/
				1527	if (S_ISREG(mode)) {
				1528	inode = read_one_inode(root,
				1529	key.objectid);
				1530	BUG_ON(!inode);
				1531
				1532	ret = btrfs_truncate_inode_items(wc->trans,
				1533	root, inode, inode->i_size,
				1534	BTRFS_EXTENT_DATA_KEY);
				1535	BUG_ON(ret);
				1536	iput(inode);
				1537	}
				1538	ret = link_to_fixup_dir(wc->trans, root,
				1539	path, key.objectid);
				1540	BUG_ON(ret);
				1541	}
				1542	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1543	continue;
				1544
				1545	/* these keys are simply copied */
				1546	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1547	ret = overwrite_item(wc->trans, root, path,
				1548	eb, i, &key);
				1549	BUG_ON(ret);
				1550	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1551	ret = add_inode_ref(wc->trans, root, log, path,
				1552	eb, i, &key);
				1553	BUG_ON(ret && ret != -ENOENT);
				1554	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1555	ret = replay_one_extent(wc->trans, root, path,
				1556	eb, i, &key);
				1557	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1558	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1559	key.type == BTRFS_DIR_INDEX_KEY) {
				1560	ret = replay_one_dir_item(wc->trans, root, path,
				1561	eb, i, &key);
				1562	BUG_ON(ret);
				1563	}
				1564	}
				1565	btrfs_free_path(path);
				1566	return 0;
				1567	}
				1568
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1569	static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1570	struct btrfs_root *root,
				1571	struct btrfs_path path, int level,
				1572	struct walk_control *wc)
				1573	{
				1574	u64 root_owner;
				1575	u64 root_gen;
				1576	u64 bytenr;
				1577	u64 ptr_gen;
				1578	struct extent_buffer *next;
				1579	struct extent_buffer *cur;
				1580	struct extent_buffer *parent;
				1581	u32 blocksize;
				1582	int ret = 0;
				1583
				1584	WARN_ON(*level < 0);
				1585	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1586
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1587	while (*level > 0) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1588	WARN_ON(*level < 0);
				1589	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1590	cur = path->nodes[*level];
				1591
				1592	if (btrfs_header_level(cur) != *level)
				1593	WARN_ON(1);
				1594
				1595	if (path->slots[*level] >=
				1596	btrfs_header_nritems(cur))
				1597	break;
				1598
				1599	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1600	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1601	blocksize = btrfs_level_size(root, *level - 1);
				1602
				1603	parent = path->nodes[*level];
				1604	root_owner = btrfs_header_owner(parent);
				1605	root_gen = btrfs_header_generation(parent);
				1606
				1607	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1608
				1609	wc->process_func(root, next, wc, ptr_gen);
				1610
				1611	if (*level == 1) {
				1612	path->slots[*level]++;
				1613	if (wc->free) {
				1614	btrfs_read_buffer(next, ptr_gen);
				1615
				1616	btrfs_tree_lock(next);
				1617	clean_tree_block(trans, root, next);
Chris Mason	b4ce94d	2009-02-04 09:25:08 -0500	[diff] [blame]	1618	btrfs_set_lock_blocking(next);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1619	btrfs_wait_tree_block_writeback(next);
				1620	btrfs_tree_unlock(next);
				1621
				1622	ret = btrfs_drop_leaf_ref(trans, root, next);
				1623	BUG_ON(ret);
				1624
				1625	WARN_ON(root_owner !=
				1626	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1627	ret = btrfs_free_reserved_extent(root,
				1628	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1629	BUG_ON(ret);
				1630	}
				1631	free_extent_buffer(next);
				1632	continue;
				1633	}
				1634	btrfs_read_buffer(next, ptr_gen);
				1635
				1636	WARN_ON(*level <= 0);
				1637	if (path->nodes[*level-1])
				1638	free_extent_buffer(path->nodes[*level-1]);
				1639	path->nodes[*level-1] = next;
				1640	*level = btrfs_header_level(next);
				1641	path->slots[*level] = 0;
				1642	cond_resched();
				1643	}
				1644	WARN_ON(*level < 0);
				1645	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1646
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1647	if (path->nodes[*level] == root->node)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1648	parent = path->nodes[*level];
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1649	else
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1650	parent = path->nodes[*level + 1];
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1651
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1652	bytenr = path->nodes[*level]->start;
				1653
				1654	blocksize = btrfs_level_size(root, *level);
				1655	root_owner = btrfs_header_owner(parent);
				1656	root_gen = btrfs_header_generation(parent);
				1657
				1658	wc->process_func(root, path->nodes[*level], wc,
				1659	btrfs_header_generation(path->nodes[*level]));
				1660
				1661	if (wc->free) {
				1662	next = path->nodes[*level];
				1663	btrfs_tree_lock(next);
				1664	clean_tree_block(trans, root, next);
Chris Mason	b4ce94d	2009-02-04 09:25:08 -0500	[diff] [blame]	1665	btrfs_set_lock_blocking(next);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1666	btrfs_wait_tree_block_writeback(next);
				1667	btrfs_tree_unlock(next);
				1668
				1669	if (*level == 0) {
				1670	ret = btrfs_drop_leaf_ref(trans, root, next);
				1671	BUG_ON(ret);
				1672	}
				1673	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1674	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1675	BUG_ON(ret);
				1676	}
				1677	free_extent_buffer(path->nodes[*level]);
				1678	path->nodes[*level] = NULL;
				1679	*level += 1;
				1680
				1681	cond_resched();
				1682	return 0;
				1683	}
				1684
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1685	static noinline int walk_up_log_tree(struct btrfs_trans_handle *trans,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1686	struct btrfs_root *root,
				1687	struct btrfs_path path, int level,
				1688	struct walk_control *wc)
				1689	{
				1690	u64 root_owner;
				1691	u64 root_gen;
				1692	int i;
				1693	int slot;
				1694	int ret;
				1695
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1696	for (i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1697	slot = path->slots[i];
				1698	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1699	struct extent_buffer *node;
				1700	node = path->nodes[i];
				1701	path->slots[i]++;
				1702	*level = i;
				1703	WARN_ON(*level == 0);
				1704	return 0;
				1705	} else {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	1706	struct extent_buffer *parent;
				1707	if (path->nodes[*level] == root->node)
				1708	parent = path->nodes[*level];
				1709	else
				1710	parent = path->nodes[*level + 1];
				1711
				1712	root_owner = btrfs_header_owner(parent);
				1713	root_gen = btrfs_header_generation(parent);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1714	wc->process_func(root, path->nodes[*level], wc,
				1715	btrfs_header_generation(path->nodes[*level]));
				1716	if (wc->free) {
				1717	struct extent_buffer *next;
				1718
				1719	next = path->nodes[*level];
				1720
				1721	btrfs_tree_lock(next);
				1722	clean_tree_block(trans, root, next);
Chris Mason	b4ce94d	2009-02-04 09:25:08 -0500	[diff] [blame]	1723	btrfs_set_lock_blocking(next);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1724	btrfs_wait_tree_block_writeback(next);
				1725	btrfs_tree_unlock(next);
				1726
				1727	if (*level == 0) {
				1728	ret = btrfs_drop_leaf_ref(trans, root,
				1729	next);
				1730	BUG_ON(ret);
				1731	}
				1732
				1733	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1734	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1735	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1736	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1737	BUG_ON(ret);
				1738	}
				1739	free_extent_buffer(path->nodes[*level]);
				1740	path->nodes[*level] = NULL;
				1741	*level = i + 1;
				1742	}
				1743	}
				1744	return 1;
				1745	}
				1746
				1747	/*
				1748	* drop the reference count on the tree rooted at 'snap'. This traverses
				1749	* the tree freeing any blocks that have a ref count of zero after being
				1750	* decremented.
				1751	*/
				1752	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1753	struct btrfs_root log, struct walk_control wc)
				1754	{
				1755	int ret = 0;
				1756	int wret;
				1757	int level;
				1758	struct btrfs_path *path;
				1759	int i;
				1760	int orig_level;
				1761
				1762	path = btrfs_alloc_path();
				1763	BUG_ON(!path);
				1764
				1765	level = btrfs_header_level(log->node);
				1766	orig_level = level;
				1767	path->nodes[level] = log->node;
				1768	extent_buffer_get(log->node);
				1769	path->slots[level] = 0;
				1770
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1771	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1772	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1773	if (wret > 0)
				1774	break;
				1775	if (wret < 0)
				1776	ret = wret;
				1777
				1778	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1779	if (wret > 0)
				1780	break;
				1781	if (wret < 0)
				1782	ret = wret;
				1783	}
				1784
				1785	/* was the root node processed? if not, catch it here */
				1786	if (path->nodes[orig_level]) {
				1787	wc->process_func(log, path->nodes[orig_level], wc,
				1788	btrfs_header_generation(path->nodes[orig_level]));
				1789	if (wc->free) {
				1790	struct extent_buffer *next;
				1791
				1792	next = path->nodes[orig_level];
				1793
				1794	btrfs_tree_lock(next);
				1795	clean_tree_block(trans, log, next);
Chris Mason	b4ce94d	2009-02-04 09:25:08 -0500	[diff] [blame]	1796	btrfs_set_lock_blocking(next);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1797	btrfs_wait_tree_block_writeback(next);
				1798	btrfs_tree_unlock(next);
				1799
				1800	if (orig_level == 0) {
				1801	ret = btrfs_drop_leaf_ref(trans, log,
				1802	next);
				1803	BUG_ON(ret);
				1804	}
				1805	WARN_ON(log->root_key.objectid !=
				1806	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1807	ret = btrfs_free_reserved_extent(log, next->start,
				1808	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1809	BUG_ON(ret);
				1810	}
				1811	}
				1812
				1813	for (i = 0; i <= orig_level; i++) {
				1814	if (path->nodes[i]) {
				1815	free_extent_buffer(path->nodes[i]);
				1816	path->nodes[i] = NULL;
				1817	}
				1818	}
				1819	btrfs_free_path(path);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1820	return ret;
				1821	}
				1822
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1823	/*
				1824	* helper function to update the item for a given subvolumes log root
				1825	* in the tree of log roots
				1826	*/
				1827	static int update_log_root(struct btrfs_trans_handle *trans,
				1828	struct btrfs_root *log)
				1829	{
				1830	int ret;
				1831
				1832	if (log->log_transid == 1) {
				1833	/* insert root item on the first sync */
				1834	ret = btrfs_insert_root(trans, log->fs_info->log_root_tree,
				1835	&log->root_key, &log->root_item);
				1836	} else {
				1837	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				1838	&log->root_key, &log->root_item);
				1839	}
				1840	return ret;
				1841	}
				1842
				1843	static int wait_log_commit(struct btrfs_root *root, unsigned long transid)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1844	{
				1845	DEFINE_WAIT(wait);
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1846	int index = transid % 2;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1847
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1848	/*
				1849	* we only allow two pending log transactions at a time,
				1850	* so we know that if ours is more than 2 older than the
				1851	* current transaction, we're done
				1852	*/
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1853	do {
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1854	prepare_to_wait(&root->log_commit_wait[index],
				1855	&wait, TASK_UNINTERRUPTIBLE);
				1856	mutex_unlock(&root->log_mutex);
				1857	if (root->log_transid < transid + 2 &&
				1858	atomic_read(&root->log_commit[index]))
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1859	schedule();
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1860	finish_wait(&root->log_commit_wait[index], &wait);
				1861	mutex_lock(&root->log_mutex);
				1862	} while (root->log_transid < transid + 2 &&
				1863	atomic_read(&root->log_commit[index]));
				1864	return 0;
				1865	}
				1866
				1867	static int wait_for_writer(struct btrfs_root *root)
				1868	{
				1869	DEFINE_WAIT(wait);
				1870	while (atomic_read(&root->log_writers)) {
				1871	prepare_to_wait(&root->log_writer_wait,
				1872	&wait, TASK_UNINTERRUPTIBLE);
				1873	mutex_unlock(&root->log_mutex);
				1874	if (atomic_read(&root->log_writers))
				1875	schedule();
				1876	mutex_lock(&root->log_mutex);
				1877	finish_wait(&root->log_writer_wait, &wait);
				1878	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1879	return 0;
				1880	}
				1881
				1882	/*
				1883	* btrfs_sync_log does sends a given tree log down to the disk and
				1884	* updates the super blocks to record it. When this call is done,
				1885	* you know that any inodes previously logged are safely on disk
				1886	*/
				1887	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1888	struct btrfs_root *root)
				1889	{
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1890	int index1;
				1891	int index2;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1892	int ret;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1893	struct btrfs_root *log = root->log_root;
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1894	struct btrfs_root *log_root_tree = root->fs_info->log_root_tree;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1895
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1896	mutex_lock(&root->log_mutex);
				1897	index1 = root->log_transid % 2;
				1898	if (atomic_read(&root->log_commit[index1])) {
				1899	wait_log_commit(root, root->log_transid);
				1900	mutex_unlock(&root->log_mutex);
				1901	return 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1902	}
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1903	atomic_set(&root->log_commit[index1], 1);
				1904
				1905	/* wait for previous tree log sync to complete */
				1906	if (atomic_read(&root->log_commit[(index1 + 1) % 2]))
				1907	wait_log_commit(root, root->log_transid - 1);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1908
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	1909	while (1) {
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1910	unsigned long batch = root->log_batch;
				1911	mutex_unlock(&root->log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1912	schedule_timeout_uninterruptible(1);
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1913	mutex_lock(&root->log_mutex);
				1914	wait_for_writer(root);
				1915	if (batch == root->log_batch)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1916	break;
				1917	}
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1918
				1919	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1920	BUG_ON(ret);
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1921
				1922	btrfs_set_root_bytenr(&log->root_item, log->node->start);
				1923	btrfs_set_root_generation(&log->root_item, trans->transid);
				1924	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				1925
				1926	root->log_batch = 0;
				1927	root->log_transid++;
				1928	log->log_transid = root->log_transid;
				1929	smp_mb();
				1930	/*
				1931	* log tree has been flushed to disk, new modifications of
				1932	* the log will be written to new positions. so it's safe to
				1933	* allow log writers to go in.
				1934	*/
				1935	mutex_unlock(&root->log_mutex);
				1936
				1937	mutex_lock(&log_root_tree->log_mutex);
				1938	log_root_tree->log_batch++;
				1939	atomic_inc(&log_root_tree->log_writers);
				1940	mutex_unlock(&log_root_tree->log_mutex);
				1941
				1942	ret = update_log_root(trans, log);
				1943	BUG_ON(ret);
				1944
				1945	mutex_lock(&log_root_tree->log_mutex);
				1946	if (atomic_dec_and_test(&log_root_tree->log_writers)) {
				1947	smp_mb();
				1948	if (waitqueue_active(&log_root_tree->log_writer_wait))
				1949	wake_up(&log_root_tree->log_writer_wait);
				1950	}
				1951
				1952	index2 = log_root_tree->log_transid % 2;
				1953	if (atomic_read(&log_root_tree->log_commit[index2])) {
				1954	wait_log_commit(log_root_tree, log_root_tree->log_transid);
				1955	mutex_unlock(&log_root_tree->log_mutex);
				1956	goto out;
				1957	}
				1958	atomic_set(&log_root_tree->log_commit[index2], 1);
				1959
				1960	if (atomic_read(&log_root_tree->log_commit[(index2 + 1) % 2]))
				1961	wait_log_commit(log_root_tree, log_root_tree->log_transid - 1);
				1962
				1963	wait_for_writer(log_root_tree);
				1964
				1965	ret = btrfs_write_and_wait_marked_extents(log_root_tree,
				1966	&log_root_tree->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1967	BUG_ON(ret);
				1968
				1969	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1970	log_root_tree->node->start);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1971	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1972	btrfs_header_level(log_root_tree->node));
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1973
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1974	log_root_tree->log_batch = 0;
				1975	log_root_tree->log_transid++;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1976	smp_mb();
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1977
				1978	mutex_unlock(&log_root_tree->log_mutex);
				1979
				1980	/*
				1981	* nobody else is going to jump in and write the the ctree
				1982	* super here because the log_commit atomic below is protecting
				1983	* us. We must be called with a transaction handle pinning
				1984	* the running transaction open, so a full commit can't hop
				1985	* in and cause problems either.
				1986	*/
				1987	write_ctree_super(trans, root->fs_info->tree_root, 2);
				1988
				1989	atomic_set(&log_root_tree->log_commit[index2], 0);
				1990	smp_mb();
				1991	if (waitqueue_active(&log_root_tree->log_commit_wait[index2]))
				1992	wake_up(&log_root_tree->log_commit_wait[index2]);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1993	out:
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	1994	atomic_set(&root->log_commit[index1], 0);
				1995	smp_mb();
				1996	if (waitqueue_active(&root->log_commit_wait[index1]))
				1997	wake_up(&root->log_commit_wait[index1]);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1998	return 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1999	}
				2000
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2001	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2002	* at commit time of the full transaction
				2003	*/
				2004	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2005	{
				2006	int ret;
				2007	struct btrfs_root *log;
				2008	struct key;
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2009	u64 start;
				2010	u64 end;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2011	struct walk_control wc = {
				2012	.free = 1,
				2013	.process_func = process_one_buffer
				2014	};
				2015
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	2016	if (!root->log_root \|\| root->fs_info->log_root_recovering)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2017	return 0;
				2018
				2019	log = root->log_root;
				2020	ret = walk_log_tree(trans, log, &wc);
				2021	BUG_ON(ret);
				2022
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2023	while (1) {
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2024	ret = find_first_extent_bit(&log->dirty_log_pages,
				2025	0, &start, &end, EXTENT_DIRTY);
				2026	if (ret)
				2027	break;
				2028
				2029	clear_extent_dirty(&log->dirty_log_pages,
				2030	start, end, GFP_NOFS);
				2031	}
				2032
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	2033	if (log->log_transid > 0) {
				2034	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2035	&log->root_key);
				2036	BUG_ON(ret);
				2037	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2038	root->log_root = NULL;
Yan Zheng	7237f18	2009-01-21 12:54:03 -0500	[diff] [blame]	2039	free_extent_buffer(log->node);
				2040	kfree(log);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2041	return 0;
				2042	}
				2043
				2044	/*
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2045	* If both a file and directory are logged, and unlinks or renames are
				2046	* mixed in, we have a few interesting corners:
				2047	*
				2048	* create file X in dir Y
				2049	* link file X to X.link in dir Y
				2050	* fsync file X
				2051	* unlink file X but leave X.link
				2052	* fsync dir Y
				2053	*
				2054	* After a crash we would expect only X.link to exist. But file X
				2055	* didn't get fsync'd again so the log has back refs for X and X.link.
				2056	*
				2057	* We solve this by removing directory entries and inode backrefs from the
				2058	* log when a file that was logged in the current transaction is
				2059	* unlinked. Any later fsync will include the updated log entries, and
				2060	* we'll be able to reconstruct the proper directory items from backrefs.
				2061	*
				2062	* This optimizations allows us to avoid relogging the entire inode
				2063	* or the entire directory.
				2064	*/
				2065	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2066	struct btrfs_root *root,
				2067	const char *name, int name_len,
				2068	struct inode *dir, u64 index)
				2069	{
				2070	struct btrfs_root *log;
				2071	struct btrfs_dir_item *di;
				2072	struct btrfs_path *path;
				2073	int ret;
				2074	int bytes_del = 0;
				2075
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2076	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2077	return 0;
				2078
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2079	ret = join_running_log_trans(root);
				2080	if (ret)
				2081	return 0;
				2082
				2083	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2084
				2085	log = root->log_root;
				2086	path = btrfs_alloc_path();
				2087	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2088	name, name_len, -1);
				2089	if (di && !IS_ERR(di)) {
				2090	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2091	bytes_del += name_len;
				2092	BUG_ON(ret);
				2093	}
				2094	btrfs_release_path(log, path);
				2095	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2096	index, name, name_len, -1);
				2097	if (di && !IS_ERR(di)) {
				2098	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2099	bytes_del += name_len;
				2100	BUG_ON(ret);
				2101	}
				2102
				2103	/* update the directory size in the log to reflect the names
				2104	* we have removed
				2105	*/
				2106	if (bytes_del) {
				2107	struct btrfs_key key;
				2108
				2109	key.objectid = dir->i_ino;
				2110	key.offset = 0;
				2111	key.type = BTRFS_INODE_ITEM_KEY;
				2112	btrfs_release_path(log, path);
				2113
				2114	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2115	if (ret == 0) {
				2116	struct btrfs_inode_item *item;
				2117	u64 i_size;
				2118
				2119	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2120	struct btrfs_inode_item);
				2121	i_size = btrfs_inode_size(path->nodes[0], item);
				2122	if (i_size > bytes_del)
				2123	i_size -= bytes_del;
				2124	else
				2125	i_size = 0;
				2126	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2127	btrfs_mark_buffer_dirty(path->nodes[0]);
				2128	} else
				2129	ret = 0;
				2130	btrfs_release_path(log, path);
				2131	}
				2132
				2133	btrfs_free_path(path);
				2134	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2135	end_log_trans(root);
				2136
				2137	return 0;
				2138	}
				2139
				2140	/* see comments for btrfs_del_dir_entries_in_log */
				2141	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2142	struct btrfs_root *root,
				2143	const char *name, int name_len,
				2144	struct inode *inode, u64 dirid)
				2145	{
				2146	struct btrfs_root *log;
				2147	u64 index;
				2148	int ret;
				2149
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2150	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2151	return 0;
				2152
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2153	ret = join_running_log_trans(root);
				2154	if (ret)
				2155	return 0;
				2156	log = root->log_root;
				2157	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2158
				2159	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2160	dirid, &index);
				2161	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2162	end_log_trans(root);
				2163
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2164	return ret;
				2165	}
				2166
				2167	/*
				2168	* creates a range item in the log for 'dirid'. first_offset and
				2169	* last_offset tell us which parts of the key space the log should
				2170	* be considered authoritative for.
				2171	*/
				2172	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2173	struct btrfs_root *log,
				2174	struct btrfs_path *path,
				2175	int key_type, u64 dirid,
				2176	u64 first_offset, u64 last_offset)
				2177	{
				2178	int ret;
				2179	struct btrfs_key key;
				2180	struct btrfs_dir_log_item *item;
				2181
				2182	key.objectid = dirid;
				2183	key.offset = first_offset;
				2184	if (key_type == BTRFS_DIR_ITEM_KEY)
				2185	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2186	else
				2187	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2188	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2189	BUG_ON(ret);
				2190
				2191	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2192	struct btrfs_dir_log_item);
				2193	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2194	btrfs_mark_buffer_dirty(path->nodes[0]);
				2195	btrfs_release_path(log, path);
				2196	return 0;
				2197	}
				2198
				2199	/*
				2200	* log all the items included in the current transaction for a given
				2201	* directory. This also creates the range items in the log tree required
				2202	* to replay anything deleted before the fsync
				2203	*/
				2204	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2205	struct btrfs_root root, struct inode inode,
				2206	struct btrfs_path *path,
				2207	struct btrfs_path *dst_path, int key_type,
				2208	u64 min_offset, u64 *last_offset_ret)
				2209	{
				2210	struct btrfs_key min_key;
				2211	struct btrfs_key max_key;
				2212	struct btrfs_root *log = root->log_root;
				2213	struct extent_buffer *src;
				2214	int ret;
				2215	int i;
				2216	int nritems;
				2217	u64 first_offset = min_offset;
				2218	u64 last_offset = (u64)-1;
				2219
				2220	log = root->log_root;
				2221	max_key.objectid = inode->i_ino;
				2222	max_key.offset = (u64)-1;
				2223	max_key.type = key_type;
				2224
				2225	min_key.objectid = inode->i_ino;
				2226	min_key.type = key_type;
				2227	min_key.offset = min_offset;
				2228
				2229	path->keep_locks = 1;
				2230
				2231	ret = btrfs_search_forward(root, &min_key, &max_key,
				2232	path, 0, trans->transid);
				2233
				2234	/*
				2235	* we didn't find anything from this transaction, see if there
				2236	* is anything at all
				2237	*/
				2238	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2239	min_key.type != key_type) {
				2240	min_key.objectid = inode->i_ino;
				2241	min_key.type = key_type;
				2242	min_key.offset = (u64)-1;
				2243	btrfs_release_path(root, path);
				2244	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2245	if (ret < 0) {
				2246	btrfs_release_path(root, path);
				2247	return ret;
				2248	}
				2249	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2250
				2251	/* if ret == 0 there are items for this type,
				2252	* create a range to tell us the last key of this type.
				2253	* otherwise, there are no items in this directory after
				2254	* *min_offset, and we create a range to indicate that.
				2255	*/
				2256	if (ret == 0) {
				2257	struct btrfs_key tmp;
				2258	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2259	path->slots[0]);
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2260	if (key_type == tmp.type)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2261	first_offset = max(min_offset, tmp.offset) + 1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2262	}
				2263	goto done;
				2264	}
				2265
				2266	/* go backward to find any previous key */
				2267	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2268	if (ret == 0) {
				2269	struct btrfs_key tmp;
				2270	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2271	if (key_type == tmp.type) {
				2272	first_offset = tmp.offset;
				2273	ret = overwrite_item(trans, log, dst_path,
				2274	path->nodes[0], path->slots[0],
				2275	&tmp);
				2276	}
				2277	}
				2278	btrfs_release_path(root, path);
				2279
				2280	/* find the first key from this transaction again */
				2281	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2282	if (ret != 0) {
				2283	WARN_ON(1);
				2284	goto done;
				2285	}
				2286
				2287	/*
				2288	* we have a block from this transaction, log every item in it
				2289	* from our directory
				2290	*/
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2291	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2292	struct btrfs_key tmp;
				2293	src = path->nodes[0];
				2294	nritems = btrfs_header_nritems(src);
				2295	for (i = path->slots[0]; i < nritems; i++) {
				2296	btrfs_item_key_to_cpu(src, &min_key, i);
				2297
				2298	if (min_key.objectid != inode->i_ino \|\|
				2299	min_key.type != key_type)
				2300	goto done;
				2301	ret = overwrite_item(trans, log, dst_path, src, i,
				2302	&min_key);
				2303	BUG_ON(ret);
				2304	}
				2305	path->slots[0] = nritems;
				2306
				2307	/*
				2308	* look ahead to the next item and see if it is also
				2309	* from this directory and from this transaction
				2310	*/
				2311	ret = btrfs_next_leaf(root, path);
				2312	if (ret == 1) {
				2313	last_offset = (u64)-1;
				2314	goto done;
				2315	}
				2316	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2317	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2318	last_offset = (u64)-1;
				2319	goto done;
				2320	}
				2321	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2322	ret = overwrite_item(trans, log, dst_path,
				2323	path->nodes[0], path->slots[0],
				2324	&tmp);
				2325
				2326	BUG_ON(ret);
				2327	last_offset = tmp.offset;
				2328	goto done;
				2329	}
				2330	}
				2331	done:
				2332	*last_offset_ret = last_offset;
				2333	btrfs_release_path(root, path);
				2334	btrfs_release_path(log, dst_path);
				2335
				2336	/* insert the log range keys to indicate where the log is valid */
				2337	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2338	first_offset, last_offset);
				2339	BUG_ON(ret);
				2340	return 0;
				2341	}
				2342
				2343	/*
				2344	* logging directories is very similar to logging inodes, We find all the items
				2345	* from the current transaction and write them to the log.
				2346	*
				2347	* The recovery code scans the directory in the subvolume, and if it finds a
				2348	* key in the range logged that is not present in the log tree, then it means
				2349	* that dir entry was unlinked during the transaction.
				2350	*
				2351	* In order for that scan to work, we must include one key smaller than
				2352	* the smallest logged by this transaction and one key larger than the largest
				2353	* key logged by this transaction.
				2354	*/
				2355	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2356	struct btrfs_root root, struct inode inode,
				2357	struct btrfs_path *path,
				2358	struct btrfs_path *dst_path)
				2359	{
				2360	u64 min_key;
				2361	u64 max_key;
				2362	int ret;
				2363	int key_type = BTRFS_DIR_ITEM_KEY;
				2364
				2365	again:
				2366	min_key = 0;
				2367	max_key = 0;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2368	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2369	ret = log_dir_items(trans, root, inode, path,
				2370	dst_path, key_type, min_key,
				2371	&max_key);
				2372	BUG_ON(ret);
				2373	if (max_key == (u64)-1)
				2374	break;
				2375	min_key = max_key + 1;
				2376	}
				2377
				2378	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2379	key_type = BTRFS_DIR_INDEX_KEY;
				2380	goto again;
				2381	}
				2382	return 0;
				2383	}
				2384
				2385	/*
				2386	* a helper function to drop items from the log before we relog an
				2387	* inode. max_key_type indicates the highest item type to remove.
				2388	* This cannot be run for file data extents because it does not
				2389	* free the extents they point to.
				2390	*/
				2391	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2392	struct btrfs_root *log,
				2393	struct btrfs_path *path,
				2394	u64 objectid, int max_key_type)
				2395	{
				2396	int ret;
				2397	struct btrfs_key key;
				2398	struct btrfs_key found_key;
				2399
				2400	key.objectid = objectid;
				2401	key.type = max_key_type;
				2402	key.offset = (u64)-1;
				2403
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2404	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2405	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2406
				2407	if (ret != 1)
				2408	break;
				2409
				2410	if (path->slots[0] == 0)
				2411	break;
				2412
				2413	path->slots[0]--;
				2414	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2415	path->slots[0]);
				2416
				2417	if (found_key.objectid != objectid)
				2418	break;
				2419
				2420	ret = btrfs_del_item(trans, log, path);
				2421	BUG_ON(ret);
				2422	btrfs_release_path(log, path);
				2423	}
				2424	btrfs_release_path(log, path);
				2425	return 0;
				2426	}
				2427
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2428	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2429	struct btrfs_root *log,
				2430	struct btrfs_path *dst_path,
				2431	struct extent_buffer *src,
				2432	int start_slot, int nr, int inode_only)
				2433	{
				2434	unsigned long src_offset;
				2435	unsigned long dst_offset;
				2436	struct btrfs_file_extent_item *extent;
				2437	struct btrfs_inode_item *inode_item;
				2438	int ret;
				2439	struct btrfs_key *ins_keys;
				2440	u32 *ins_sizes;
				2441	char *ins_data;
				2442	int i;
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2443	struct list_head ordered_sums;
				2444
				2445	INIT_LIST_HEAD(&ordered_sums);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2446
				2447	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2448	nr * sizeof(u32), GFP_NOFS);
				2449	ins_sizes = (u32 *)ins_data;
				2450	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2451
				2452	for (i = 0; i < nr; i++) {
				2453	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2454	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2455	}
				2456	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2457	ins_keys, ins_sizes, nr);
				2458	BUG_ON(ret);
				2459
				2460	for (i = 0; i < nr; i++) {
				2461	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2462	dst_path->slots[0]);
				2463
				2464	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2465
				2466	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2467	src_offset, ins_sizes[i]);
				2468
				2469	if (inode_only == LOG_INODE_EXISTS &&
				2470	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2471	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2472	dst_path->slots[0],
				2473	struct btrfs_inode_item);
				2474	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2475
				2476	/* set the generation to zero so the recover code
				2477	* can tell the difference between an logging
				2478	* just to say 'this inode exists' and a logging
				2479	* to say 'update this inode with these values'
				2480	*/
				2481	btrfs_set_inode_generation(dst_path->nodes[0],
				2482	inode_item, 0);
				2483	}
				2484	/* take a reference on file data extents so that truncates
				2485	* or deletes of this inode don't have to relog the inode
				2486	* again
				2487	*/
				2488	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2489	int found_type;
				2490	extent = btrfs_item_ptr(src, start_slot + i,
				2491	struct btrfs_file_extent_item);
				2492
				2493	found_type = btrfs_file_extent_type(src, extent);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame]	2494	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				2495	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2496	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2497	extent);
				2498	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2499	extent);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2500	u64 cs = btrfs_file_extent_offset(src, extent);
				2501	u64 cl = btrfs_file_extent_num_bytes(src,
				2502	extent);;
Chris Mason	580afd7	2008-12-08 19:15:39 -0500	[diff] [blame]	2503	if (btrfs_file_extent_compression(src,
				2504	extent)) {
				2505	cs = 0;
				2506	cl = dl;
				2507	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2508	/* ds == 0 is a hole */
				2509	if (ds != 0) {
				2510	ret = btrfs_inc_extent_ref(trans, log,
				2511	ds, dl,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2512	dst_path->nodes[0]->start,
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2513	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2514	trans->transid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	2515	ins_keys[i].objectid);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2516	BUG_ON(ret);
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	2517	ret = btrfs_lookup_csums_range(
				2518	log->fs_info->csum_root,
				2519	ds + cs, ds + cs + cl - 1,
				2520	&ordered_sums);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2521	BUG_ON(ret);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2522	}
				2523	}
				2524	}
				2525	dst_path->slots[0]++;
				2526	}
				2527
				2528	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2529	btrfs_release_path(log, dst_path);
				2530	kfree(ins_data);
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2531
				2532	/*
				2533	* we have to do this after the loop above to avoid changing the
				2534	* log tree while trying to change the log tree.
				2535	*/
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2536	while (!list_empty(&ordered_sums)) {
Chris Mason	d20f704	2008-12-08 16:58:54 -0500	[diff] [blame]	2537	struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next,
				2538	struct btrfs_ordered_sum,
				2539	list);
				2540	ret = btrfs_csum_file_blocks(trans, log, sums);
				2541	BUG_ON(ret);
				2542	list_del(&sums->list);
				2543	kfree(sums);
				2544	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2545	return 0;
				2546	}
				2547
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2548	/* log a single inode in the tree log.
				2549	* At least one parent directory for this inode must exist in the tree
				2550	* or be logged already.
				2551	*
				2552	* Any items from this inode changed by the current transaction are copied
				2553	* to the log tree. An extra reference is taken on any extents in this
				2554	* file, allowing us to avoid a whole pile of corner cases around logging
				2555	* blocks that have been removed from the tree.
				2556	*
				2557	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2558	* does.
				2559	*
				2560	* This handles both files and directories.
				2561	*/
				2562	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2563	struct btrfs_root root, struct inode inode,
				2564	int inode_only)
				2565	{
				2566	struct btrfs_path *path;
				2567	struct btrfs_path *dst_path;
				2568	struct btrfs_key min_key;
				2569	struct btrfs_key max_key;
				2570	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2571	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2572	u32 size;
				2573	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2574	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2575	int ins_start_slot = 0;
				2576	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2577
				2578	log = root->log_root;
				2579
				2580	path = btrfs_alloc_path();
				2581	dst_path = btrfs_alloc_path();
				2582
				2583	min_key.objectid = inode->i_ino;
				2584	min_key.type = BTRFS_INODE_ITEM_KEY;
				2585	min_key.offset = 0;
				2586
				2587	max_key.objectid = inode->i_ino;
				2588	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2589	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2590	else
				2591	max_key.type = (u8)-1;
				2592	max_key.offset = (u64)-1;
				2593
				2594	/*
				2595	* if this inode has already been logged and we're in inode_only
				2596	* mode, we don't want to delete the things that have already
				2597	* been written to the log.
				2598	*
				2599	* But, if the inode has been through an inode_only log,
				2600	* the logged_trans field is not set. This allows us to catch
				2601	* any new names for this inode in the backrefs by logging it
				2602	* again
				2603	*/
				2604	if (inode_only == LOG_INODE_EXISTS &&
				2605	BTRFS_I(inode)->logged_trans == trans->transid) {
				2606	btrfs_free_path(path);
				2607	btrfs_free_path(dst_path);
				2608	goto out;
				2609	}
				2610	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2611
				2612	/*
				2613	* a brute force approach to making sure we get the most uptodate
				2614	* copies of everything.
				2615	*/
				2616	if (S_ISDIR(inode->i_mode)) {
				2617	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2618
				2619	if (inode_only == LOG_INODE_EXISTS)
				2620	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2621	ret = drop_objectid_items(trans, log, path,
				2622	inode->i_ino, max_key_type);
				2623	} else {
				2624	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2625	}
				2626	BUG_ON(ret);
				2627	path->keep_locks = 1;
				2628
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2629	while (1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2630	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2631	ret = btrfs_search_forward(root, &min_key, &max_key,
				2632	path, 0, trans->transid);
				2633	if (ret != 0)
				2634	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2635	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2636	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2637	if (min_key.objectid != inode->i_ino)
				2638	break;
				2639	if (min_key.type > max_key.type)
				2640	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2641
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2642	src = path->nodes[0];
				2643	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2644	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2645	ins_nr++;
				2646	goto next_slot;
				2647	} else if (!ins_nr) {
				2648	ins_start_slot = path->slots[0];
				2649	ins_nr = 1;
				2650	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2651	}
				2652
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2653	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2654	ins_nr, inode_only);
				2655	BUG_ON(ret);
				2656	ins_nr = 1;
				2657	ins_start_slot = path->slots[0];
				2658	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2659
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2660	nritems = btrfs_header_nritems(path->nodes[0]);
				2661	path->slots[0]++;
				2662	if (path->slots[0] < nritems) {
				2663	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2664	path->slots[0]);
				2665	goto again;
				2666	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2667	if (ins_nr) {
				2668	ret = copy_items(trans, log, dst_path, src,
				2669	ins_start_slot,
				2670	ins_nr, inode_only);
				2671	BUG_ON(ret);
				2672	ins_nr = 0;
				2673	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2674	btrfs_release_path(root, path);
				2675
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2676	if (min_key.offset < (u64)-1)
				2677	min_key.offset++;
				2678	else if (min_key.type < (u8)-1)
				2679	min_key.type++;
				2680	else if (min_key.objectid < (u64)-1)
				2681	min_key.objectid++;
				2682	else
				2683	break;
				2684	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2685	if (ins_nr) {
				2686	ret = copy_items(trans, log, dst_path, src,
				2687	ins_start_slot,
				2688	ins_nr, inode_only);
				2689	BUG_ON(ret);
				2690	ins_nr = 0;
				2691	}
				2692	WARN_ON(ins_nr);
Chris Mason	9623f9a	2008-09-11 17:42:42 -0400	[diff] [blame]	2693	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2694	btrfs_release_path(root, path);
				2695	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2696	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2697	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2698	BUG_ON(ret);
				2699	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2700	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2701	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2702
				2703	btrfs_free_path(path);
				2704	btrfs_free_path(dst_path);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2705	out:
				2706	return 0;
				2707	}
				2708
				2709	int btrfs_log_inode(struct btrfs_trans_handle *trans,
				2710	struct btrfs_root root, struct inode inode,
				2711	int inode_only)
				2712	{
				2713	int ret;
				2714
				2715	start_log_trans(trans, root);
				2716	ret = __btrfs_log_inode(trans, root, inode, inode_only);
				2717	end_log_trans(root);
				2718	return ret;
				2719	}
				2720
				2721	/*
				2722	* helper function around btrfs_log_inode to make sure newly created
				2723	* parent directories also end up in the log. A minimal inode and backref
				2724	* only logging is done of any parent directories that are older than
				2725	* the last committed transaction
				2726	*/
				2727	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2728	struct btrfs_root root, struct dentry dentry)
				2729	{
				2730	int inode_only = LOG_INODE_ALL;
				2731	struct super_block *sb;
				2732	int ret;
				2733
				2734	start_log_trans(trans, root);
				2735	sb = dentry->d_inode->i_sb;
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2736	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2737	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2738	inode_only);
				2739	BUG_ON(ret);
				2740	inode_only = LOG_INODE_EXISTS;
				2741
				2742	dentry = dentry->d_parent;
				2743	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2744	break;
				2745
				2746	if (BTRFS_I(dentry->d_inode)->generation <=
				2747	root->fs_info->last_trans_committed)
				2748	break;
				2749	}
				2750	end_log_trans(root);
				2751	return 0;
				2752	}
				2753
				2754	/*
				2755	* it is not safe to log dentry if the chunk root has added new
				2756	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2757	* If this returns 1, you must commit the transaction to safely get your
				2758	* data on disk.
				2759	*/
				2760	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2761	struct btrfs_root root, struct dentry dentry)
				2762	{
				2763	u64 gen;
				2764	gen = root->fs_info->last_trans_new_blockgroup;
				2765	if (gen > root->fs_info->last_trans_committed)
				2766	return 1;
				2767	else
				2768	return btrfs_log_dentry(trans, root, dentry);
				2769	}
				2770
				2771	/*
				2772	* should be called during mount to recover any replay any log trees
				2773	* from the FS
				2774	*/
				2775	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2776	{
				2777	int ret;
				2778	struct btrfs_path *path;
				2779	struct btrfs_trans_handle *trans;
				2780	struct btrfs_key key;
				2781	struct btrfs_key found_key;
				2782	struct btrfs_key tmp_key;
				2783	struct btrfs_root *log;
				2784	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2785	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2786	struct walk_control wc = {
				2787	.process_func = process_one_buffer,
				2788	.stage = 0,
				2789	};
				2790
				2791	fs_info->log_root_recovering = 1;
				2792	path = btrfs_alloc_path();
				2793	BUG_ON(!path);
				2794
				2795	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2796
				2797	wc.trans = trans;
				2798	wc.pin = 1;
				2799
				2800	walk_log_tree(trans, log_root_tree, &wc);
				2801
				2802	again:
				2803	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2804	key.offset = (u64)-1;
				2805	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2806
Chris Mason	d397712	2009-01-05 21:25:51 -0500	[diff] [blame]	2807	while (1) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2808	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2809	if (ret < 0)
				2810	break;
				2811	if (ret > 0) {
				2812	if (path->slots[0] == 0)
				2813	break;
				2814	path->slots[0]--;
				2815	}
				2816	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2817	path->slots[0]);
				2818	btrfs_release_path(log_root_tree, path);
				2819	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2820	break;
				2821
				2822	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2823	&found_key);
				2824	BUG_ON(!log);
				2825
				2826
				2827	tmp_key.objectid = found_key.offset;
				2828	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2829	tmp_key.offset = (u64)-1;
				2830
				2831	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2832	BUG_ON(!wc.replay_dest);
				2833
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	2834	wc.replay_dest->log_root = log;
Yan Zheng	2456242	2009-02-12 14:14:53 -0500	[diff] [blame]	2835	mutex_lock(&fs_info->trans_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2836	btrfs_record_root_in_trans(wc.replay_dest);
Yan Zheng	2456242	2009-02-12 14:14:53 -0500	[diff] [blame]	2837	mutex_unlock(&fs_info->trans_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2838	ret = walk_log_tree(trans, log, &wc);
				2839	BUG_ON(ret);
				2840
				2841	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2842	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2843	path);
				2844	BUG_ON(ret);
				2845	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2846	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2847	if (ret == 0) {
				2848	wc.replay_dest->highest_inode = highest_inode;
				2849	wc.replay_dest->last_inode_alloc = highest_inode;
				2850	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2851
				2852	key.offset = found_key.offset - 1;
Yan Zheng	07d400a	2009-01-06 11:42:00 -0500	[diff] [blame]	2853	wc.replay_dest->log_root = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2854	free_extent_buffer(log->node);
				2855	kfree(log);
				2856
				2857	if (found_key.offset == 0)
				2858	break;
				2859	}
				2860	btrfs_release_path(log_root_tree, path);
				2861
				2862	/* step one is to pin it all, step two is to replay just inodes */
				2863	if (wc.pin) {
				2864	wc.pin = 0;
				2865	wc.process_func = replay_one_buffer;
				2866	wc.stage = LOG_WALK_REPLAY_INODES;
				2867	goto again;
				2868	}
				2869	/* step three is to replay everything */
				2870	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2871	wc.stage++;
				2872	goto again;
				2873	}
				2874
				2875	btrfs_free_path(path);
				2876
				2877	free_extent_buffer(log_root_tree->node);
				2878	log_root_tree->log_root = NULL;
				2879	fs_info->log_root_recovering = 0;
				2880
				2881	/* step 4: commit the transaction, which also unpins the blocks */
				2882	btrfs_commit_transaction(trans, fs_info->tree_root);
				2883
				2884	kfree(log_root_tree);
				2885	return 0;
				2886	}