Blame - fs/btrfs/tree-log.c - fp2-dev/kernel/msm

blob: 835daed5561f5c96b8f9cb244c20f1975ff3f73b [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
				26
				27	/* magic values for the inode_only field in btrfs_log_inode:
				28	*
				29	* LOG_INODE_ALL means to log everything
				30	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				31	* during log replay
				32	*/
				33	#define LOG_INODE_ALL 0
				34	#define LOG_INODE_EXISTS 1
				35
				36	/*
				37	* stages for the tree walking. The first
				38	* stage (0) is to only pin down the blocks we find
				39	* the second stage (1) is to make sure that all the inodes
				40	* we find in the log are created in the subvolume.
				41	*
				42	* The last stage is to deal with directories and links and extents
				43	* and all the other fun semantics
				44	*/
				45	#define LOG_WALK_PIN_ONLY 0
				46	#define LOG_WALK_REPLAY_INODES 1
				47	#define LOG_WALK_REPLAY_ALL 2
				48
				49	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				50	struct btrfs_root root, struct inode inode,
				51	int inode_only);
				52
				53	/*
				54	* tree logging is a special write ahead log used to make sure that
				55	* fsyncs and O_SYNCs can happen without doing full tree commits.
				56	*
				57	* Full tree commits are expensive because they require commonly
				58	* modified blocks to be recowed, creating many dirty pages in the
				59	* extent tree and a 4x-6x higher write load than ext3.
				60	*
				61	* Instead of doing a tree commit on every fsync, we use the
				62	* key ranges and transaction ids to find items for a given file or directory
				63	* that have changed in this transaction. Those items are copied into
				64	* a special tree (one per subvolume root), that tree is written to disk
				65	* and then the fsync is considered complete.
				66	*
				67	* After a crash, items are copied out of the log-tree back into the
				68	* subvolume tree. Any file data extents found are recorded in the extent
				69	* allocation tree, and the log-tree freed.
				70	*
				71	* The log tree is read three times: once to pin down all the extents it is
				72	* using in ram, once to create all the inodes logged in the tree,
				73	* and once to do all the other items.
				74	*/
				75
				76	/*
				77	* btrfs_add_log_tree adds a new per-subvolume log tree into the
				78	* tree of log tree roots. This must be called with a tree log transaction
				79	* running (see start_log_trans).
				80	*/
				81	int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
				82	struct btrfs_root *root)
				83	{
				84	struct btrfs_key key;
				85	struct btrfs_root_item root_item;
				86	struct btrfs_inode_item *inode_item;
				87	struct extent_buffer *leaf;
				88	struct btrfs_root *new_root = root;
				89	int ret;
				90	u64 objectid = root->root_key.objectid;
				91
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	92	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	93	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	94	trans->transid, 0, 0, 0);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	95	if (IS_ERR(leaf)) {
				96	ret = PTR_ERR(leaf);
				97	return ret;
				98	}
				99
				100	btrfs_set_header_nritems(leaf, 0);
				101	btrfs_set_header_level(leaf, 0);
				102	btrfs_set_header_bytenr(leaf, leaf->start);
				103	btrfs_set_header_generation(leaf, trans->transid);
				104	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				105
				106	write_extent_buffer(leaf, root->fs_info->fsid,
				107	(unsigned long)btrfs_header_fsid(leaf),
				108	BTRFS_FSID_SIZE);
				109	btrfs_mark_buffer_dirty(leaf);
				110
				111	inode_item = &root_item.inode;
				112	memset(inode_item, 0, sizeof(*inode_item));
				113	inode_item->generation = cpu_to_le64(1);
				114	inode_item->size = cpu_to_le64(3);
				115	inode_item->nlink = cpu_to_le32(1);
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	116	inode_item->nbytes = cpu_to_le64(root->leafsize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	117	inode_item->mode = cpu_to_le32(S_IFDIR \| 0755);
				118
				119	btrfs_set_root_bytenr(&root_item, leaf->start);
				120	btrfs_set_root_level(&root_item, 0);
				121	btrfs_set_root_refs(&root_item, 0);
				122	btrfs_set_root_used(&root_item, 0);
				123
				124	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
				125	root_item.drop_level = 0;
				126
				127	btrfs_tree_unlock(leaf);
				128	free_extent_buffer(leaf);
				129	leaf = NULL;
				130
				131	btrfs_set_root_dirid(&root_item, 0);
				132
				133	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				134	key.offset = objectid;
				135	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				136	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
				137	&root_item);
				138	if (ret)
				139	goto fail;
				140
				141	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
				142	&key);
				143	BUG_ON(!new_root);
				144
				145	WARN_ON(root->log_root);
				146	root->log_root = new_root;
				147
				148	/*
				149	* log trees do not get reference counted because they go away
				150	* before a real commit is actually done. They do store pointers
				151	* to file data extents, and those reference counts still get
				152	* updated (along with back refs to the log tree).
				153	*/
				154	new_root->ref_cows = 0;
				155	new_root->last_trans = trans->transid;
				156	fail:
				157	return ret;
				158	}
				159
				160	/*
				161	* start a sub transaction and setup the log tree
				162	* this increments the log tree writer count to make the people
				163	* syncing the tree wait for us to finish
				164	*/
				165	static int start_log_trans(struct btrfs_trans_handle *trans,
				166	struct btrfs_root *root)
				167	{
				168	int ret;
				169	mutex_lock(&root->fs_info->tree_log_mutex);
				170	if (!root->fs_info->log_root_tree) {
				171	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				172	BUG_ON(ret);
				173	}
				174	if (!root->log_root) {
				175	ret = btrfs_add_log_tree(trans, root);
				176	BUG_ON(ret);
				177	}
				178	atomic_inc(&root->fs_info->tree_log_writers);
				179	root->fs_info->tree_log_batch++;
				180	mutex_unlock(&root->fs_info->tree_log_mutex);
				181	return 0;
				182	}
				183
				184	/*
				185	* returns 0 if there was a log transaction running and we were able
				186	* to join, or returns -ENOENT if there were not transactions
				187	* in progress
				188	*/
				189	static int join_running_log_trans(struct btrfs_root *root)
				190	{
				191	int ret = -ENOENT;
				192
				193	smp_mb();
				194	if (!root->log_root)
				195	return -ENOENT;
				196
				197	mutex_lock(&root->fs_info->tree_log_mutex);
				198	if (root->log_root) {
				199	ret = 0;
				200	atomic_inc(&root->fs_info->tree_log_writers);
				201	root->fs_info->tree_log_batch++;
				202	}
				203	mutex_unlock(&root->fs_info->tree_log_mutex);
				204	return ret;
				205	}
				206
				207	/*
				208	* indicate we're done making changes to the log tree
				209	* and wake up anyone waiting to do a sync
				210	*/
				211	static int end_log_trans(struct btrfs_root *root)
				212	{
				213	atomic_dec(&root->fs_info->tree_log_writers);
				214	smp_mb();
				215	if (waitqueue_active(&root->fs_info->tree_log_wait))
				216	wake_up(&root->fs_info->tree_log_wait);
				217	return 0;
				218	}
				219
				220
				221	/*
				222	* the walk control struct is used to pass state down the chain when
				223	* processing the log tree. The stage field tells us which part
				224	* of the log tree processing we are currently doing. The others
				225	* are state fields used for that specific part
				226	*/
				227	struct walk_control {
				228	/* should we free the extent on disk when done? This is used
				229	* at transaction commit time while freeing a log tree
				230	*/
				231	int free;
				232
				233	/* should we write out the extent buffer? This is used
				234	* while flushing the log tree to disk during a sync
				235	*/
				236	int write;
				237
				238	/* should we wait for the extent buffer io to finish? Also used
				239	* while flushing the log tree to disk for a sync
				240	*/
				241	int wait;
				242
				243	/* pin only walk, we record which extents on disk belong to the
				244	* log trees
				245	*/
				246	int pin;
				247
				248	/* what stage of the replay code we're currently in */
				249	int stage;
				250
				251	/* the root we are currently replaying */
				252	struct btrfs_root *replay_dest;
				253
				254	/* the trans handle for the current replay */
				255	struct btrfs_trans_handle *trans;
				256
				257	/* the function that gets used to process blocks we find in the
				258	* tree. Note the extent_buffer might not be up to date when it is
				259	* passed in, and it must be checked or read if you need the data
				260	* inside it
				261	*/
				262	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				263	struct walk_control *wc, u64 gen);
				264	};
				265
				266	/*
				267	* process_func used to pin down extents, write them or wait on them
				268	*/
				269	static int process_one_buffer(struct btrfs_root *log,
				270	struct extent_buffer *eb,
				271	struct walk_control *wc, u64 gen)
				272	{
				273	if (wc->pin) {
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame^]	274	mutex_lock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	275	btrfs_update_pinned_extents(log->fs_info->extent_root,
				276	eb->start, eb->len, 1);
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame^]	277	mutex_unlock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	278	}
				279
				280	if (btrfs_buffer_uptodate(eb, gen)) {
				281	if (wc->write)
				282	btrfs_write_tree_block(eb);
				283	if (wc->wait)
				284	btrfs_wait_tree_block_writeback(eb);
				285	}
				286	return 0;
				287	}
				288
				289	/*
				290	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				291	* to the src data we are copying out.
				292	*
				293	* root is the tree we are copying into, and path is a scratch
				294	* path for use in this function (it should be released on entry and
				295	* will be released on exit).
				296	*
				297	* If the key is already in the destination tree the existing item is
				298	* overwritten. If the existing item isn't big enough, it is extended.
				299	* If it is too large, it is truncated.
				300	*
				301	* If the key isn't in the destination yet, a new item is inserted.
				302	*/
				303	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				304	struct btrfs_root *root,
				305	struct btrfs_path *path,
				306	struct extent_buffer *eb, int slot,
				307	struct btrfs_key *key)
				308	{
				309	int ret;
				310	u32 item_size;
				311	u64 saved_i_size = 0;
				312	int save_old_i_size = 0;
				313	unsigned long src_ptr;
				314	unsigned long dst_ptr;
				315	int overwrite_root = 0;
				316
				317	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				318	overwrite_root = 1;
				319
				320	item_size = btrfs_item_size_nr(eb, slot);
				321	src_ptr = btrfs_item_ptr_offset(eb, slot);
				322
				323	/* look for the key in the destination tree */
				324	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				325	if (ret == 0) {
				326	char *src_copy;
				327	char *dst_copy;
				328	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				329	path->slots[0]);
				330	if (dst_size != item_size)
				331	goto insert;
				332
				333	if (item_size == 0) {
				334	btrfs_release_path(root, path);
				335	return 0;
				336	}
				337	dst_copy = kmalloc(item_size, GFP_NOFS);
				338	src_copy = kmalloc(item_size, GFP_NOFS);
				339
				340	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				341
				342	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				343	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				344	item_size);
				345	ret = memcmp(dst_copy, src_copy, item_size);
				346
				347	kfree(dst_copy);
				348	kfree(src_copy);
				349	/*
				350	* they have the same contents, just return, this saves
				351	* us from cowing blocks in the destination tree and doing
				352	* extra writes that may not have been done by a previous
				353	* sync
				354	*/
				355	if (ret == 0) {
				356	btrfs_release_path(root, path);
				357	return 0;
				358	}
				359
				360	}
				361	insert:
				362	btrfs_release_path(root, path);
				363	/* try to insert the key into the destination tree */
				364	ret = btrfs_insert_empty_item(trans, root, path,
				365	key, item_size);
				366
				367	/* make sure any existing item is the correct size */
				368	if (ret == -EEXIST) {
				369	u32 found_size;
				370	found_size = btrfs_item_size_nr(path->nodes[0],
				371	path->slots[0]);
				372	if (found_size > item_size) {
				373	btrfs_truncate_item(trans, root, path, item_size, 1);
				374	} else if (found_size < item_size) {
				375	ret = btrfs_del_item(trans, root,
				376	path);
				377	BUG_ON(ret);
				378
				379	btrfs_release_path(root, path);
				380	ret = btrfs_insert_empty_item(trans,
				381	root, path, key, item_size);
				382	BUG_ON(ret);
				383	}
				384	} else if (ret) {
				385	BUG();
				386	}
				387	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				388	path->slots[0]);
				389
				390	/* don't overwrite an existing inode if the generation number
				391	* was logged as zero. This is done when the tree logging code
				392	* is just logging an inode to make sure it exists after recovery.
				393	*
				394	* Also, don't overwrite i_size on directories during replay.
				395	* log replay inserts and removes directory items based on the
				396	* state of the tree found in the subvolume, and i_size is modified
				397	* as it goes
				398	*/
				399	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				400	struct btrfs_inode_item *src_item;
				401	struct btrfs_inode_item *dst_item;
				402
				403	src_item = (struct btrfs_inode_item *)src_ptr;
				404	dst_item = (struct btrfs_inode_item *)dst_ptr;
				405
				406	if (btrfs_inode_generation(eb, src_item) == 0)
				407	goto no_copy;
				408
				409	if (overwrite_root &&
				410	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				411	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				412	save_old_i_size = 1;
				413	saved_i_size = btrfs_inode_size(path->nodes[0],
				414	dst_item);
				415	}
				416	}
				417
				418	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				419	src_ptr, item_size);
				420
				421	if (save_old_i_size) {
				422	struct btrfs_inode_item *dst_item;
				423	dst_item = (struct btrfs_inode_item *)dst_ptr;
				424	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				425	}
				426
				427	/* make sure the generation is filled in */
				428	if (key->type == BTRFS_INODE_ITEM_KEY) {
				429	struct btrfs_inode_item *dst_item;
				430	dst_item = (struct btrfs_inode_item *)dst_ptr;
				431	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				432	btrfs_set_inode_generation(path->nodes[0], dst_item,
				433	trans->transid);
				434	}
				435	}
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	436
				437	if (overwrite_root &&
				438	key->type == BTRFS_EXTENT_DATA_KEY) {
				439	int extent_type;
				440	struct btrfs_file_extent_item *fi;
				441
				442	fi = (struct btrfs_file_extent_item *)dst_ptr;
				443	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
				444	if (extent_type == BTRFS_FILE_EXTENT_REG) {
				445	struct btrfs_key ins;
				446	ins.objectid = btrfs_file_extent_disk_bytenr(
				447	path->nodes[0], fi);
				448	ins.offset = btrfs_file_extent_disk_num_bytes(
				449	path->nodes[0], fi);
				450	ins.type = BTRFS_EXTENT_ITEM_KEY;
				451
				452	/*
				453	* is this extent already allocated in the extent
				454	* allocation tree? If so, just add a reference
				455	*/
				456	ret = btrfs_lookup_extent(root, ins.objectid,
				457	ins.offset);
				458	if (ret == 0) {
				459	ret = btrfs_inc_extent_ref(trans, root,
				460	ins.objectid, ins.offset,
				461	path->nodes[0]->start,
				462	root->root_key.objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	463	trans->transid, key->objectid);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	464	} else {
				465	/*
				466	* insert the extent pointer in the extent
				467	* allocation tree
				468	*/
				469	ret = btrfs_alloc_logged_extent(trans, root,
				470	path->nodes[0]->start,
				471	root->root_key.objectid,
				472	trans->transid, key->objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	473	&ins);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	474	BUG_ON(ret);
				475	}
				476	}
				477	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	478	no_copy:
				479	btrfs_mark_buffer_dirty(path->nodes[0]);
				480	btrfs_release_path(root, path);
				481	return 0;
				482	}
				483
				484	/*
				485	* simple helper to read an inode off the disk from a given root
				486	* This can only be called for subvolume roots and not for the log
				487	*/
				488	static noinline struct inode read_one_inode(struct btrfs_root root,
				489	u64 objectid)
				490	{
				491	struct inode *inode;
				492	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				493	if (inode->i_state & I_NEW) {
				494	BTRFS_I(inode)->root = root;
				495	BTRFS_I(inode)->location.objectid = objectid;
				496	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				497	BTRFS_I(inode)->location.offset = 0;
				498	btrfs_read_locked_inode(inode);
				499	unlock_new_inode(inode);
				500
				501	}
				502	if (is_bad_inode(inode)) {
				503	iput(inode);
				504	inode = NULL;
				505	}
				506	return inode;
				507	}
				508
				509	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				510	* subvolume 'root'. path is released on entry and should be released
				511	* on exit.
				512	*
				513	* extents in the log tree have not been allocated out of the extent
				514	* tree yet. So, this completes the allocation, taking a reference
				515	* as required if the extent already exists or creating a new extent
				516	* if it isn't in the extent allocation tree yet.
				517	*
				518	* The extent is inserted into the file, dropping any existing extents
				519	* from the file that overlap the new one.
				520	*/
				521	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				522	struct btrfs_root *root,
				523	struct btrfs_path *path,
				524	struct extent_buffer *eb, int slot,
				525	struct btrfs_key *key)
				526	{
				527	int found_type;
				528	u64 mask = root->sectorsize - 1;
				529	u64 extent_end;
				530	u64 alloc_hint;
				531	u64 start = key->offset;
				532	struct btrfs_file_extent_item *item;
				533	struct inode *inode = NULL;
				534	unsigned long size;
				535	int ret = 0;
				536
				537	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				538	found_type = btrfs_file_extent_type(eb, item);
				539
				540	if (found_type == BTRFS_FILE_EXTENT_REG)
				541	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				542	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Chris Mason	c8b9781	2008-10-29 14:49:59 -0400	[diff] [blame]	543	size = btrfs_file_extent_inline_len(eb, item);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	544	extent_end = (start + size + mask) & ~mask;
				545	} else {
				546	ret = 0;
				547	goto out;
				548	}
				549
				550	inode = read_one_inode(root, key->objectid);
				551	if (!inode) {
				552	ret = -EIO;
				553	goto out;
				554	}
				555
				556	/*
				557	* first check to see if we already have this extent in the
				558	* file. This must be done before the btrfs_drop_extents run
				559	* so we don't try to drop this extent.
				560	*/
				561	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				562	start, 0);
				563
				564	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
				565	struct btrfs_file_extent_item cmp1;
				566	struct btrfs_file_extent_item cmp2;
				567	struct btrfs_file_extent_item *existing;
				568	struct extent_buffer *leaf;
				569
				570	leaf = path->nodes[0];
				571	existing = btrfs_item_ptr(leaf, path->slots[0],
				572	struct btrfs_file_extent_item);
				573
				574	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				575	sizeof(cmp1));
				576	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				577	sizeof(cmp2));
				578
				579	/*
				580	* we already have a pointer to this exact extent,
				581	* we don't have to do anything
				582	*/
				583	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				584	btrfs_release_path(root, path);
				585	goto out;
				586	}
				587	}
				588	btrfs_release_path(root, path);
				589
				590	/* drop any overlapping extents */
				591	ret = btrfs_drop_extents(trans, root, inode,
				592	start, extent_end, start, &alloc_hint);
				593	BUG_ON(ret);
				594
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	595	/* insert the extent */
				596	ret = overwrite_item(trans, root, path, eb, slot, key);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	597	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	598
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	599	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
				600	inode_add_bytes(inode, extent_end - start);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	601	btrfs_update_inode(trans, root, inode);
				602	out:
				603	if (inode)
				604	iput(inode);
				605	return ret;
				606	}
				607
				608	/*
				609	* when cleaning up conflicts between the directory names in the
				610	* subvolume, directory names in the log and directory names in the
				611	* inode back references, we may have to unlink inodes from directories.
				612	*
				613	* This is a helper function to do the unlink of a specific directory
				614	* item
				615	*/
				616	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				617	struct btrfs_root *root,
				618	struct btrfs_path *path,
				619	struct inode *dir,
				620	struct btrfs_dir_item *di)
				621	{
				622	struct inode *inode;
				623	char *name;
				624	int name_len;
				625	struct extent_buffer *leaf;
				626	struct btrfs_key location;
				627	int ret;
				628
				629	leaf = path->nodes[0];
				630
				631	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				632	name_len = btrfs_dir_name_len(leaf, di);
				633	name = kmalloc(name_len, GFP_NOFS);
				634	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				635	btrfs_release_path(root, path);
				636
				637	inode = read_one_inode(root, location.objectid);
				638	BUG_ON(!inode);
				639
				640	btrfs_inc_nlink(inode);
				641	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
				642	kfree(name);
				643
				644	iput(inode);
				645	return ret;
				646	}
				647
				648	/*
				649	* helper function to see if a given name and sequence number found
				650	* in an inode back reference are already in a directory and correctly
				651	* point to this inode
				652	*/
				653	static noinline int inode_in_dir(struct btrfs_root *root,
				654	struct btrfs_path *path,
				655	u64 dirid, u64 objectid, u64 index,
				656	const char *name, int name_len)
				657	{
				658	struct btrfs_dir_item *di;
				659	struct btrfs_key location;
				660	int match = 0;
				661
				662	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				663	index, name, name_len, 0);
				664	if (di && !IS_ERR(di)) {
				665	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				666	if (location.objectid != objectid)
				667	goto out;
				668	} else
				669	goto out;
				670	btrfs_release_path(root, path);
				671
				672	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				673	if (di && !IS_ERR(di)) {
				674	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				675	if (location.objectid != objectid)
				676	goto out;
				677	} else
				678	goto out;
				679	match = 1;
				680	out:
				681	btrfs_release_path(root, path);
				682	return match;
				683	}
				684
				685	/*
				686	* helper function to check a log tree for a named back reference in
				687	* an inode. This is used to decide if a back reference that is
				688	* found in the subvolume conflicts with what we find in the log.
				689	*
				690	* inode backreferences may have multiple refs in a single item,
				691	* during replay we process one reference at a time, and we don't
				692	* want to delete valid links to a file from the subvolume if that
				693	* link is also in the log.
				694	*/
				695	static noinline int backref_in_log(struct btrfs_root *log,
				696	struct btrfs_key *key,
				697	char *name, int namelen)
				698	{
				699	struct btrfs_path *path;
				700	struct btrfs_inode_ref *ref;
				701	unsigned long ptr;
				702	unsigned long ptr_end;
				703	unsigned long name_ptr;
				704	int found_name_len;
				705	int item_size;
				706	int ret;
				707	int match = 0;
				708
				709	path = btrfs_alloc_path();
				710	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				711	if (ret != 0)
				712	goto out;
				713
				714	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				715	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				716	ptr_end = ptr + item_size;
				717	while (ptr < ptr_end) {
				718	ref = (struct btrfs_inode_ref *)ptr;
				719	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				720	if (found_name_len == namelen) {
				721	name_ptr = (unsigned long)(ref + 1);
				722	ret = memcmp_extent_buffer(path->nodes[0], name,
				723	name_ptr, namelen);
				724	if (ret == 0) {
				725	match = 1;
				726	goto out;
				727	}
				728	}
				729	ptr = (unsigned long)(ref + 1) + found_name_len;
				730	}
				731	out:
				732	btrfs_free_path(path);
				733	return match;
				734	}
				735
				736
				737	/*
				738	* replay one inode back reference item found in the log tree.
				739	* eb, slot and key refer to the buffer and key found in the log tree.
				740	* root is the destination we are replaying into, and path is for temp
				741	* use by this function. (it should be released on return).
				742	*/
				743	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				744	struct btrfs_root *root,
				745	struct btrfs_root *log,
				746	struct btrfs_path *path,
				747	struct extent_buffer *eb, int slot,
				748	struct btrfs_key *key)
				749	{
				750	struct inode *dir;
				751	int ret;
				752	struct btrfs_key location;
				753	struct btrfs_inode_ref *ref;
				754	struct btrfs_dir_item *di;
				755	struct inode *inode;
				756	char *name;
				757	int namelen;
				758	unsigned long ref_ptr;
				759	unsigned long ref_end;
				760
				761	location.objectid = key->objectid;
				762	location.type = BTRFS_INODE_ITEM_KEY;
				763	location.offset = 0;
				764
				765	/*
				766	* it is possible that we didn't log all the parent directories
				767	* for a given inode. If we don't find the dir, just don't
				768	* copy the back ref in. The link count fixup code will take
				769	* care of the rest
				770	*/
				771	dir = read_one_inode(root, key->offset);
				772	if (!dir)
				773	return -ENOENT;
				774
				775	inode = read_one_inode(root, key->objectid);
				776	BUG_ON(!dir);
				777
				778	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				779	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				780
				781	again:
				782	ref = (struct btrfs_inode_ref *)ref_ptr;
				783
				784	namelen = btrfs_inode_ref_name_len(eb, ref);
				785	name = kmalloc(namelen, GFP_NOFS);
				786	BUG_ON(!name);
				787
				788	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				789
				790	/* if we already have a perfect match, we're done */
				791	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				792	btrfs_inode_ref_index(eb, ref),
				793	name, namelen)) {
				794	goto out;
				795	}
				796
				797	/*
				798	* look for a conflicting back reference in the metadata.
				799	* if we find one we have to unlink that name of the file
				800	* before we add our new link. Later on, we overwrite any
				801	* existing back reference, and we don't want to create
				802	* dangling pointers in the directory.
				803	*/
				804	conflict_again:
				805	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				806	if (ret == 0) {
				807	char *victim_name;
				808	int victim_name_len;
				809	struct btrfs_inode_ref *victim_ref;
				810	unsigned long ptr;
				811	unsigned long ptr_end;
				812	struct extent_buffer *leaf = path->nodes[0];
				813
				814	/* are we trying to overwrite a back ref for the root directory
				815	* if so, just jump out, we're done
				816	*/
				817	if (key->objectid == key->offset)
				818	goto out_nowrite;
				819
				820	/* check all the names in this back reference to see
				821	* if they are in the log. if so, we allow them to stay
				822	* otherwise they must be unlinked as a conflict
				823	*/
				824	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				825	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				826	while(ptr < ptr_end) {
				827	victim_ref = (struct btrfs_inode_ref *)ptr;
				828	victim_name_len = btrfs_inode_ref_name_len(leaf,
				829	victim_ref);
				830	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				831	BUG_ON(!victim_name);
				832
				833	read_extent_buffer(leaf, victim_name,
				834	(unsigned long)(victim_ref + 1),
				835	victim_name_len);
				836
				837	if (!backref_in_log(log, key, victim_name,
				838	victim_name_len)) {
				839	btrfs_inc_nlink(inode);
				840	btrfs_release_path(root, path);
				841	ret = btrfs_unlink_inode(trans, root, dir,
				842	inode, victim_name,
				843	victim_name_len);
				844	kfree(victim_name);
				845	btrfs_release_path(root, path);
				846	goto conflict_again;
				847	}
				848	kfree(victim_name);
				849	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				850	}
				851	BUG_ON(ret);
				852	}
				853	btrfs_release_path(root, path);
				854
				855	/* look for a conflicting sequence number */
				856	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				857	btrfs_inode_ref_index(eb, ref),
				858	name, namelen, 0);
				859	if (di && !IS_ERR(di)) {
				860	ret = drop_one_dir_item(trans, root, path, dir, di);
				861	BUG_ON(ret);
				862	}
				863	btrfs_release_path(root, path);
				864
				865
				866	/* look for a conflicting name */
				867	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				868	name, namelen, 0);
				869	if (di && !IS_ERR(di)) {
				870	ret = drop_one_dir_item(trans, root, path, dir, di);
				871	BUG_ON(ret);
				872	}
				873	btrfs_release_path(root, path);
				874
				875	/* insert our name */
				876	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				877	btrfs_inode_ref_index(eb, ref));
				878	BUG_ON(ret);
				879
				880	btrfs_update_inode(trans, root, inode);
				881
				882	out:
				883	ref_ptr = (unsigned long)(ref + 1) + namelen;
				884	kfree(name);
				885	if (ref_ptr < ref_end)
				886	goto again;
				887
				888	/* finally write the back reference in the inode */
				889	ret = overwrite_item(trans, root, path, eb, slot, key);
				890	BUG_ON(ret);
				891
				892	out_nowrite:
				893	btrfs_release_path(root, path);
				894	iput(dir);
				895	iput(inode);
				896	return 0;
				897	}
				898
				899	/*
				900	* replay one csum item from the log tree into the subvolume 'root'
				901	* eb, slot and key all refer to the log tree
				902	* path is for temp use by this function and should be released on return
				903	*
				904	* This copies the checksums out of the log tree and inserts them into
				905	* the subvolume. Any existing checksums for this range in the file
				906	* are overwritten, and new items are added where required.
				907	*
				908	* We keep this simple by reusing the btrfs_ordered_sum code from
				909	* the data=ordered mode. This basically means making a copy
				910	* of all the checksums in ram, which we have to do anyway for kmap
				911	* rules.
				912	*
				913	* The copy is then sent down to btrfs_csum_file_blocks, which
				914	* does all the hard work of finding existing items in the file
				915	* or adding new ones.
				916	*/
				917	static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
				918	struct btrfs_root *root,
				919	struct btrfs_path *path,
				920	struct extent_buffer *eb, int slot,
				921	struct btrfs_key *key)
				922	{
				923	int ret;
				924	u32 item_size = btrfs_item_size_nr(eb, slot);
				925	u64 cur_offset;
				926	unsigned long file_bytes;
				927	struct btrfs_ordered_sum *sums;
				928	struct btrfs_sector_sum *sector_sum;
				929	struct inode *inode;
				930	unsigned long ptr;
				931
				932	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
				933	inode = read_one_inode(root, key->objectid);
				934	if (!inode) {
				935	return -EIO;
				936	}
				937
				938	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
				939	if (!sums) {
				940	iput(inode);
				941	return -ENOMEM;
				942	}
				943
				944	INIT_LIST_HEAD(&sums->list);
				945	sums->len = file_bytes;
				946	sums->file_offset = key->offset;
				947
				948	/*
				949	* copy all the sums into the ordered sum struct
				950	*/
				951	sector_sum = sums->sums;
				952	cur_offset = key->offset;
				953	ptr = btrfs_item_ptr_offset(eb, slot);
				954	while(item_size > 0) {
				955	sector_sum->offset = cur_offset;
				956	read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
				957	sector_sum++;
				958	item_size -= BTRFS_CRC32_SIZE;
				959	ptr += BTRFS_CRC32_SIZE;
				960	cur_offset += root->sectorsize;
				961	}
				962
				963	/* let btrfs_csum_file_blocks add them into the file */
				964	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
				965	BUG_ON(ret);
				966	kfree(sums);
				967	iput(inode);
				968
				969	return 0;
				970	}
				971	/*
				972	* There are a few corners where the link count of the file can't
				973	* be properly maintained during replay. So, instead of adding
				974	* lots of complexity to the log code, we just scan the backrefs
				975	* for any file that has been through replay.
				976	*
				977	* The scan will update the link count on the inode to reflect the
				978	* number of back refs found. If it goes down to zero, the iput
				979	* will free the inode.
				980	*/
				981	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				982	struct btrfs_root *root,
				983	struct inode *inode)
				984	{
				985	struct btrfs_path *path;
				986	int ret;
				987	struct btrfs_key key;
				988	u64 nlink = 0;
				989	unsigned long ptr;
				990	unsigned long ptr_end;
				991	int name_len;
				992
				993	key.objectid = inode->i_ino;
				994	key.type = BTRFS_INODE_REF_KEY;
				995	key.offset = (u64)-1;
				996
				997	path = btrfs_alloc_path();
				998
				999	while(1) {
				1000	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1001	if (ret < 0)
				1002	break;
				1003	if (ret > 0) {
				1004	if (path->slots[0] == 0)
				1005	break;
				1006	path->slots[0]--;
				1007	}
				1008	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1009	path->slots[0]);
				1010	if (key.objectid != inode->i_ino \|\|
				1011	key.type != BTRFS_INODE_REF_KEY)
				1012	break;
				1013	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1014	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1015	path->slots[0]);
				1016	while(ptr < ptr_end) {
				1017	struct btrfs_inode_ref *ref;
				1018
				1019	ref = (struct btrfs_inode_ref *)ptr;
				1020	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1021	ref);
				1022	ptr = (unsigned long)(ref + 1) + name_len;
				1023	nlink++;
				1024	}
				1025
				1026	if (key.offset == 0)
				1027	break;
				1028	key.offset--;
				1029	btrfs_release_path(root, path);
				1030	}
				1031	btrfs_free_path(path);
				1032	if (nlink != inode->i_nlink) {
				1033	inode->i_nlink = nlink;
				1034	btrfs_update_inode(trans, root, inode);
				1035	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	1036	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1037
				1038	return 0;
				1039	}
				1040
				1041	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1042	struct btrfs_root *root,
				1043	struct btrfs_path *path)
				1044	{
				1045	int ret;
				1046	struct btrfs_key key;
				1047	struct inode *inode;
				1048
				1049	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1050	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1051	key.offset = (u64)-1;
				1052	while(1) {
				1053	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1054	if (ret < 0)
				1055	break;
				1056
				1057	if (ret == 1) {
				1058	if (path->slots[0] == 0)
				1059	break;
				1060	path->slots[0]--;
				1061	}
				1062
				1063	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1064	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1065	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1066	break;
				1067
				1068	ret = btrfs_del_item(trans, root, path);
				1069	BUG_ON(ret);
				1070
				1071	btrfs_release_path(root, path);
				1072	inode = read_one_inode(root, key.offset);
				1073	BUG_ON(!inode);
				1074
				1075	ret = fixup_inode_link_count(trans, root, inode);
				1076	BUG_ON(ret);
				1077
				1078	iput(inode);
				1079
				1080	if (key.offset == 0)
				1081	break;
				1082	key.offset--;
				1083	}
				1084	btrfs_release_path(root, path);
				1085	return 0;
				1086	}
				1087
				1088
				1089	/*
				1090	* record a given inode in the fixup dir so we can check its link
				1091	* count when replay is done. The link count is incremented here
				1092	* so the inode won't go away until we check it
				1093	*/
				1094	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1095	struct btrfs_root *root,
				1096	struct btrfs_path *path,
				1097	u64 objectid)
				1098	{
				1099	struct btrfs_key key;
				1100	int ret = 0;
				1101	struct inode *inode;
				1102
				1103	inode = read_one_inode(root, objectid);
				1104	BUG_ON(!inode);
				1105
				1106	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1107	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1108	key.offset = objectid;
				1109
				1110	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1111
				1112	btrfs_release_path(root, path);
				1113	if (ret == 0) {
				1114	btrfs_inc_nlink(inode);
				1115	btrfs_update_inode(trans, root, inode);
				1116	} else if (ret == -EEXIST) {
				1117	ret = 0;
				1118	} else {
				1119	BUG();
				1120	}
				1121	iput(inode);
				1122
				1123	return ret;
				1124	}
				1125
				1126	/*
				1127	* when replaying the log for a directory, we only insert names
				1128	* for inodes that actually exist. This means an fsync on a directory
				1129	* does not implicitly fsync all the new files in it
				1130	*/
				1131	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1132	struct btrfs_root *root,
				1133	struct btrfs_path *path,
				1134	u64 dirid, u64 index,
				1135	char *name, int name_len, u8 type,
				1136	struct btrfs_key *location)
				1137	{
				1138	struct inode *inode;
				1139	struct inode *dir;
				1140	int ret;
				1141
				1142	inode = read_one_inode(root, location->objectid);
				1143	if (!inode)
				1144	return -ENOENT;
				1145
				1146	dir = read_one_inode(root, dirid);
				1147	if (!dir) {
				1148	iput(inode);
				1149	return -EIO;
				1150	}
				1151	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1152
				1153	/* FIXME, put inode into FIXUP list */
				1154
				1155	iput(inode);
				1156	iput(dir);
				1157	return ret;
				1158	}
				1159
				1160	/*
				1161	* take a single entry in a log directory item and replay it into
				1162	* the subvolume.
				1163	*
				1164	* if a conflicting item exists in the subdirectory already,
				1165	* the inode it points to is unlinked and put into the link count
				1166	* fix up tree.
				1167	*
				1168	* If a name from the log points to a file or directory that does
				1169	* not exist in the FS, it is skipped. fsyncs on directories
				1170	* do not force down inodes inside that directory, just changes to the
				1171	* names or unlinks in a directory.
				1172	*/
				1173	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1174	struct btrfs_root *root,
				1175	struct btrfs_path *path,
				1176	struct extent_buffer *eb,
				1177	struct btrfs_dir_item *di,
				1178	struct btrfs_key *key)
				1179	{
				1180	char *name;
				1181	int name_len;
				1182	struct btrfs_dir_item *dst_di;
				1183	struct btrfs_key found_key;
				1184	struct btrfs_key log_key;
				1185	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1186	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1187	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1188	int ret;
				1189
				1190	dir = read_one_inode(root, key->objectid);
				1191	BUG_ON(!dir);
				1192
				1193	name_len = btrfs_dir_name_len(eb, di);
				1194	name = kmalloc(name_len, GFP_NOFS);
				1195	log_type = btrfs_dir_type(eb, di);
				1196	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1197	name_len);
				1198
				1199	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1200	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1201	if (exists == 0)
				1202	exists = 1;
				1203	else
				1204	exists = 0;
				1205	btrfs_release_path(root, path);
				1206
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1207	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1208	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1209	name, name_len, 1);
				1210	}
				1211	else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1212	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1213	key->objectid,
				1214	key->offset, name,
				1215	name_len, 1);
				1216	} else {
				1217	BUG();
				1218	}
				1219	if (!dst_di \|\| IS_ERR(dst_di)) {
				1220	/* we need a sequence number to insert, so we only
				1221	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1222	*/
				1223	if (key->type != BTRFS_DIR_INDEX_KEY)
				1224	goto out;
				1225	goto insert;
				1226	}
				1227
				1228	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1229	/* the existing item matches the logged item */
				1230	if (found_key.objectid == log_key.objectid &&
				1231	found_key.type == log_key.type &&
				1232	found_key.offset == log_key.offset &&
				1233	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1234	goto out;
				1235	}
				1236
				1237	/*
				1238	* don't drop the conflicting directory entry if the inode
				1239	* for the new entry doesn't exist
				1240	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1241	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1242	goto out;
				1243
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1244	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1245	BUG_ON(ret);
				1246
				1247	if (key->type == BTRFS_DIR_INDEX_KEY)
				1248	goto insert;
				1249	out:
				1250	btrfs_release_path(root, path);
				1251	kfree(name);
				1252	iput(dir);
				1253	return 0;
				1254
				1255	insert:
				1256	btrfs_release_path(root, path);
				1257	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1258	name, name_len, log_type, &log_key);
				1259
				1260	if (ret && ret != -ENOENT)
				1261	BUG();
				1262	goto out;
				1263	}
				1264
				1265	/*
				1266	* find all the names in a directory item and reconcile them into
				1267	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1268	* one name in a directory item, but the same code gets used for
				1269	* both directory index types
				1270	*/
				1271	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1272	struct btrfs_root *root,
				1273	struct btrfs_path *path,
				1274	struct extent_buffer *eb, int slot,
				1275	struct btrfs_key *key)
				1276	{
				1277	int ret;
				1278	u32 item_size = btrfs_item_size_nr(eb, slot);
				1279	struct btrfs_dir_item *di;
				1280	int name_len;
				1281	unsigned long ptr;
				1282	unsigned long ptr_end;
				1283
				1284	ptr = btrfs_item_ptr_offset(eb, slot);
				1285	ptr_end = ptr + item_size;
				1286	while(ptr < ptr_end) {
				1287	di = (struct btrfs_dir_item *)ptr;
				1288	name_len = btrfs_dir_name_len(eb, di);
				1289	ret = replay_one_name(trans, root, path, eb, di, key);
				1290	BUG_ON(ret);
				1291	ptr = (unsigned long)(di + 1);
				1292	ptr += name_len;
				1293	}
				1294	return 0;
				1295	}
				1296
				1297	/*
				1298	* directory replay has two parts. There are the standard directory
				1299	* items in the log copied from the subvolume, and range items
				1300	* created in the log while the subvolume was logged.
				1301	*
				1302	* The range items tell us which parts of the key space the log
				1303	* is authoritative for. During replay, if a key in the subvolume
				1304	* directory is in a logged range item, but not actually in the log
				1305	* that means it was deleted from the directory before the fsync
				1306	* and should be removed.
				1307	*/
				1308	static noinline int find_dir_range(struct btrfs_root *root,
				1309	struct btrfs_path *path,
				1310	u64 dirid, int key_type,
				1311	u64 start_ret, u64 end_ret)
				1312	{
				1313	struct btrfs_key key;
				1314	u64 found_end;
				1315	struct btrfs_dir_log_item *item;
				1316	int ret;
				1317	int nritems;
				1318
				1319	if (*start_ret == (u64)-1)
				1320	return 1;
				1321
				1322	key.objectid = dirid;
				1323	key.type = key_type;
				1324	key.offset = *start_ret;
				1325
				1326	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1327	if (ret < 0)
				1328	goto out;
				1329	if (ret > 0) {
				1330	if (path->slots[0] == 0)
				1331	goto out;
				1332	path->slots[0]--;
				1333	}
				1334	if (ret != 0)
				1335	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1336
				1337	if (key.type != key_type \|\| key.objectid != dirid) {
				1338	ret = 1;
				1339	goto next;
				1340	}
				1341	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1342	struct btrfs_dir_log_item);
				1343	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1344
				1345	if (start_ret >= key.offset && start_ret <= found_end) {
				1346	ret = 0;
				1347	*start_ret = key.offset;
				1348	*end_ret = found_end;
				1349	goto out;
				1350	}
				1351	ret = 1;
				1352	next:
				1353	/* check the next slot in the tree to see if it is a valid item */
				1354	nritems = btrfs_header_nritems(path->nodes[0]);
				1355	if (path->slots[0] >= nritems) {
				1356	ret = btrfs_next_leaf(root, path);
				1357	if (ret)
				1358	goto out;
				1359	} else {
				1360	path->slots[0]++;
				1361	}
				1362
				1363	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1364
				1365	if (key.type != key_type \|\| key.objectid != dirid) {
				1366	ret = 1;
				1367	goto out;
				1368	}
				1369	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1370	struct btrfs_dir_log_item);
				1371	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1372	*start_ret = key.offset;
				1373	*end_ret = found_end;
				1374	ret = 0;
				1375	out:
				1376	btrfs_release_path(root, path);
				1377	return ret;
				1378	}
				1379
				1380	/*
				1381	* this looks for a given directory item in the log. If the directory
				1382	* item is not in the log, the item is removed and the inode it points
				1383	* to is unlinked
				1384	*/
				1385	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1386	struct btrfs_root *root,
				1387	struct btrfs_root *log,
				1388	struct btrfs_path *path,
				1389	struct btrfs_path *log_path,
				1390	struct inode *dir,
				1391	struct btrfs_key *dir_key)
				1392	{
				1393	int ret;
				1394	struct extent_buffer *eb;
				1395	int slot;
				1396	u32 item_size;
				1397	struct btrfs_dir_item *di;
				1398	struct btrfs_dir_item *log_di;
				1399	int name_len;
				1400	unsigned long ptr;
				1401	unsigned long ptr_end;
				1402	char *name;
				1403	struct inode *inode;
				1404	struct btrfs_key location;
				1405
				1406	again:
				1407	eb = path->nodes[0];
				1408	slot = path->slots[0];
				1409	item_size = btrfs_item_size_nr(eb, slot);
				1410	ptr = btrfs_item_ptr_offset(eb, slot);
				1411	ptr_end = ptr + item_size;
				1412	while(ptr < ptr_end) {
				1413	di = (struct btrfs_dir_item *)ptr;
				1414	name_len = btrfs_dir_name_len(eb, di);
				1415	name = kmalloc(name_len, GFP_NOFS);
				1416	if (!name) {
				1417	ret = -ENOMEM;
				1418	goto out;
				1419	}
				1420	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1421	name_len);
				1422	log_di = NULL;
				1423	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1424	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1425	dir_key->objectid,
				1426	name, name_len, 0);
				1427	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1428	log_di = btrfs_lookup_dir_index_item(trans, log,
				1429	log_path,
				1430	dir_key->objectid,
				1431	dir_key->offset,
				1432	name, name_len, 0);
				1433	}
				1434	if (!log_di \|\| IS_ERR(log_di)) {
				1435	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1436	btrfs_release_path(root, path);
				1437	btrfs_release_path(log, log_path);
				1438	inode = read_one_inode(root, location.objectid);
				1439	BUG_ON(!inode);
				1440
				1441	ret = link_to_fixup_dir(trans, root,
				1442	path, location.objectid);
				1443	BUG_ON(ret);
				1444	btrfs_inc_nlink(inode);
				1445	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1446	name, name_len);
				1447	BUG_ON(ret);
				1448	kfree(name);
				1449	iput(inode);
				1450
				1451	/* there might still be more names under this key
				1452	* check and repeat if required
				1453	*/
				1454	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1455	0, 0);
				1456	if (ret == 0)
				1457	goto again;
				1458	ret = 0;
				1459	goto out;
				1460	}
				1461	btrfs_release_path(log, log_path);
				1462	kfree(name);
				1463
				1464	ptr = (unsigned long)(di + 1);
				1465	ptr += name_len;
				1466	}
				1467	ret = 0;
				1468	out:
				1469	btrfs_release_path(root, path);
				1470	btrfs_release_path(log, log_path);
				1471	return ret;
				1472	}
				1473
				1474	/*
				1475	* deletion replay happens before we copy any new directory items
				1476	* out of the log or out of backreferences from inodes. It
				1477	* scans the log to find ranges of keys that log is authoritative for,
				1478	* and then scans the directory to find items in those ranges that are
				1479	* not present in the log.
				1480	*
				1481	* Anything we don't find in the log is unlinked and removed from the
				1482	* directory.
				1483	*/
				1484	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1485	struct btrfs_root *root,
				1486	struct btrfs_root *log,
				1487	struct btrfs_path *path,
				1488	u64 dirid)
				1489	{
				1490	u64 range_start;
				1491	u64 range_end;
				1492	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1493	int ret = 0;
				1494	struct btrfs_key dir_key;
				1495	struct btrfs_key found_key;
				1496	struct btrfs_path *log_path;
				1497	struct inode *dir;
				1498
				1499	dir_key.objectid = dirid;
				1500	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1501	log_path = btrfs_alloc_path();
				1502	if (!log_path)
				1503	return -ENOMEM;
				1504
				1505	dir = read_one_inode(root, dirid);
				1506	/* it isn't an error if the inode isn't there, that can happen
				1507	* because we replay the deletes before we copy in the inode item
				1508	* from the log
				1509	*/
				1510	if (!dir) {
				1511	btrfs_free_path(log_path);
				1512	return 0;
				1513	}
				1514	again:
				1515	range_start = 0;
				1516	range_end = 0;
				1517	while(1) {
				1518	ret = find_dir_range(log, path, dirid, key_type,
				1519	&range_start, &range_end);
				1520	if (ret != 0)
				1521	break;
				1522
				1523	dir_key.offset = range_start;
				1524	while(1) {
				1525	int nritems;
				1526	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1527	0, 0);
				1528	if (ret < 0)
				1529	goto out;
				1530
				1531	nritems = btrfs_header_nritems(path->nodes[0]);
				1532	if (path->slots[0] >= nritems) {
				1533	ret = btrfs_next_leaf(root, path);
				1534	if (ret)
				1535	break;
				1536	}
				1537	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1538	path->slots[0]);
				1539	if (found_key.objectid != dirid \|\|
				1540	found_key.type != dir_key.type)
				1541	goto next_type;
				1542
				1543	if (found_key.offset > range_end)
				1544	break;
				1545
				1546	ret = check_item_in_log(trans, root, log, path,
				1547	log_path, dir, &found_key);
				1548	BUG_ON(ret);
				1549	if (found_key.offset == (u64)-1)
				1550	break;
				1551	dir_key.offset = found_key.offset + 1;
				1552	}
				1553	btrfs_release_path(root, path);
				1554	if (range_end == (u64)-1)
				1555	break;
				1556	range_start = range_end + 1;
				1557	}
				1558
				1559	next_type:
				1560	ret = 0;
				1561	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1562	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1563	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1564	btrfs_release_path(root, path);
				1565	goto again;
				1566	}
				1567	out:
				1568	btrfs_release_path(root, path);
				1569	btrfs_free_path(log_path);
				1570	iput(dir);
				1571	return ret;
				1572	}
				1573
				1574	/*
				1575	* the process_func used to replay items from the log tree. This
				1576	* gets called in two different stages. The first stage just looks
				1577	* for inodes and makes sure they are all copied into the subvolume.
				1578	*
				1579	* The second stage copies all the other item types from the log into
				1580	* the subvolume. The two stage approach is slower, but gets rid of
				1581	* lots of complexity around inodes referencing other inodes that exist
				1582	* only in the log (references come from either directory items or inode
				1583	* back refs).
				1584	*/
				1585	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1586	struct walk_control *wc, u64 gen)
				1587	{
				1588	int nritems;
				1589	struct btrfs_path *path;
				1590	struct btrfs_root *root = wc->replay_dest;
				1591	struct btrfs_key key;
				1592	u32 item_size;
				1593	int level;
				1594	int i;
				1595	int ret;
				1596
				1597	btrfs_read_buffer(eb, gen);
				1598
				1599	level = btrfs_header_level(eb);
				1600
				1601	if (level != 0)
				1602	return 0;
				1603
				1604	path = btrfs_alloc_path();
				1605	BUG_ON(!path);
				1606
				1607	nritems = btrfs_header_nritems(eb);
				1608	for (i = 0; i < nritems; i++) {
				1609	btrfs_item_key_to_cpu(eb, &key, i);
				1610	item_size = btrfs_item_size_nr(eb, i);
				1611
				1612	/* inode keys are done during the first stage */
				1613	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1614	wc->stage == LOG_WALK_REPLAY_INODES) {
				1615	struct inode *inode;
				1616	struct btrfs_inode_item *inode_item;
				1617	u32 mode;
				1618
				1619	inode_item = btrfs_item_ptr(eb, i,
				1620	struct btrfs_inode_item);
				1621	mode = btrfs_inode_mode(eb, inode_item);
				1622	if (S_ISDIR(mode)) {
				1623	ret = replay_dir_deletes(wc->trans,
				1624	root, log, path, key.objectid);
				1625	BUG_ON(ret);
				1626	}
				1627	ret = overwrite_item(wc->trans, root, path,
				1628	eb, i, &key);
				1629	BUG_ON(ret);
				1630
				1631	/* for regular files, truncate away
				1632	* extents past the new EOF
				1633	*/
				1634	if (S_ISREG(mode)) {
				1635	inode = read_one_inode(root,
				1636	key.objectid);
				1637	BUG_ON(!inode);
				1638
				1639	ret = btrfs_truncate_inode_items(wc->trans,
				1640	root, inode, inode->i_size,
				1641	BTRFS_EXTENT_DATA_KEY);
				1642	BUG_ON(ret);
				1643	iput(inode);
				1644	}
				1645	ret = link_to_fixup_dir(wc->trans, root,
				1646	path, key.objectid);
				1647	BUG_ON(ret);
				1648	}
				1649	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1650	continue;
				1651
				1652	/* these keys are simply copied */
				1653	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1654	ret = overwrite_item(wc->trans, root, path,
				1655	eb, i, &key);
				1656	BUG_ON(ret);
				1657	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1658	ret = add_inode_ref(wc->trans, root, log, path,
				1659	eb, i, &key);
				1660	BUG_ON(ret && ret != -ENOENT);
				1661	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1662	ret = replay_one_extent(wc->trans, root, path,
				1663	eb, i, &key);
				1664	BUG_ON(ret);
				1665	} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
				1666	ret = replay_one_csum(wc->trans, root, path,
				1667	eb, i, &key);
				1668	BUG_ON(ret);
				1669	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1670	key.type == BTRFS_DIR_INDEX_KEY) {
				1671	ret = replay_one_dir_item(wc->trans, root, path,
				1672	eb, i, &key);
				1673	BUG_ON(ret);
				1674	}
				1675	}
				1676	btrfs_free_path(path);
				1677	return 0;
				1678	}
				1679
				1680	static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
				1681	struct btrfs_root *root,
				1682	struct btrfs_path path, int level,
				1683	struct walk_control *wc)
				1684	{
				1685	u64 root_owner;
				1686	u64 root_gen;
				1687	u64 bytenr;
				1688	u64 ptr_gen;
				1689	struct extent_buffer *next;
				1690	struct extent_buffer *cur;
				1691	struct extent_buffer *parent;
				1692	u32 blocksize;
				1693	int ret = 0;
				1694
				1695	WARN_ON(*level < 0);
				1696	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1697
				1698	while(*level > 0) {
				1699	WARN_ON(*level < 0);
				1700	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1701	cur = path->nodes[*level];
				1702
				1703	if (btrfs_header_level(cur) != *level)
				1704	WARN_ON(1);
				1705
				1706	if (path->slots[*level] >=
				1707	btrfs_header_nritems(cur))
				1708	break;
				1709
				1710	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1711	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1712	blocksize = btrfs_level_size(root, *level - 1);
				1713
				1714	parent = path->nodes[*level];
				1715	root_owner = btrfs_header_owner(parent);
				1716	root_gen = btrfs_header_generation(parent);
				1717
				1718	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1719
				1720	wc->process_func(root, next, wc, ptr_gen);
				1721
				1722	if (*level == 1) {
				1723	path->slots[*level]++;
				1724	if (wc->free) {
				1725	btrfs_read_buffer(next, ptr_gen);
				1726
				1727	btrfs_tree_lock(next);
				1728	clean_tree_block(trans, root, next);
				1729	btrfs_wait_tree_block_writeback(next);
				1730	btrfs_tree_unlock(next);
				1731
				1732	ret = btrfs_drop_leaf_ref(trans, root, next);
				1733	BUG_ON(ret);
				1734
				1735	WARN_ON(root_owner !=
				1736	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1737	ret = btrfs_free_reserved_extent(root,
				1738	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1739	BUG_ON(ret);
				1740	}
				1741	free_extent_buffer(next);
				1742	continue;
				1743	}
				1744	btrfs_read_buffer(next, ptr_gen);
				1745
				1746	WARN_ON(*level <= 0);
				1747	if (path->nodes[*level-1])
				1748	free_extent_buffer(path->nodes[*level-1]);
				1749	path->nodes[*level-1] = next;
				1750	*level = btrfs_header_level(next);
				1751	path->slots[*level] = 0;
				1752	cond_resched();
				1753	}
				1754	WARN_ON(*level < 0);
				1755	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1756
				1757	if (path->nodes[*level] == root->node) {
				1758	parent = path->nodes[*level];
				1759	} else {
				1760	parent = path->nodes[*level + 1];
				1761	}
				1762	bytenr = path->nodes[*level]->start;
				1763
				1764	blocksize = btrfs_level_size(root, *level);
				1765	root_owner = btrfs_header_owner(parent);
				1766	root_gen = btrfs_header_generation(parent);
				1767
				1768	wc->process_func(root, path->nodes[*level], wc,
				1769	btrfs_header_generation(path->nodes[*level]));
				1770
				1771	if (wc->free) {
				1772	next = path->nodes[*level];
				1773	btrfs_tree_lock(next);
				1774	clean_tree_block(trans, root, next);
				1775	btrfs_wait_tree_block_writeback(next);
				1776	btrfs_tree_unlock(next);
				1777
				1778	if (*level == 0) {
				1779	ret = btrfs_drop_leaf_ref(trans, root, next);
				1780	BUG_ON(ret);
				1781	}
				1782	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1783	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1784	BUG_ON(ret);
				1785	}
				1786	free_extent_buffer(path->nodes[*level]);
				1787	path->nodes[*level] = NULL;
				1788	*level += 1;
				1789
				1790	cond_resched();
				1791	return 0;
				1792	}
				1793
				1794	static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
				1795	struct btrfs_root *root,
				1796	struct btrfs_path path, int level,
				1797	struct walk_control *wc)
				1798	{
				1799	u64 root_owner;
				1800	u64 root_gen;
				1801	int i;
				1802	int slot;
				1803	int ret;
				1804
				1805	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				1806	slot = path->slots[i];
				1807	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1808	struct extent_buffer *node;
				1809	node = path->nodes[i];
				1810	path->slots[i]++;
				1811	*level = i;
				1812	WARN_ON(*level == 0);
				1813	return 0;
				1814	} else {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	1815	struct extent_buffer *parent;
				1816	if (path->nodes[*level] == root->node)
				1817	parent = path->nodes[*level];
				1818	else
				1819	parent = path->nodes[*level + 1];
				1820
				1821	root_owner = btrfs_header_owner(parent);
				1822	root_gen = btrfs_header_generation(parent);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1823	wc->process_func(root, path->nodes[*level], wc,
				1824	btrfs_header_generation(path->nodes[*level]));
				1825	if (wc->free) {
				1826	struct extent_buffer *next;
				1827
				1828	next = path->nodes[*level];
				1829
				1830	btrfs_tree_lock(next);
				1831	clean_tree_block(trans, root, next);
				1832	btrfs_wait_tree_block_writeback(next);
				1833	btrfs_tree_unlock(next);
				1834
				1835	if (*level == 0) {
				1836	ret = btrfs_drop_leaf_ref(trans, root,
				1837	next);
				1838	BUG_ON(ret);
				1839	}
				1840
				1841	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1842	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1843	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1844	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1845	BUG_ON(ret);
				1846	}
				1847	free_extent_buffer(path->nodes[*level]);
				1848	path->nodes[*level] = NULL;
				1849	*level = i + 1;
				1850	}
				1851	}
				1852	return 1;
				1853	}
				1854
				1855	/*
				1856	* drop the reference count on the tree rooted at 'snap'. This traverses
				1857	* the tree freeing any blocks that have a ref count of zero after being
				1858	* decremented.
				1859	*/
				1860	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1861	struct btrfs_root log, struct walk_control wc)
				1862	{
				1863	int ret = 0;
				1864	int wret;
				1865	int level;
				1866	struct btrfs_path *path;
				1867	int i;
				1868	int orig_level;
				1869
				1870	path = btrfs_alloc_path();
				1871	BUG_ON(!path);
				1872
				1873	level = btrfs_header_level(log->node);
				1874	orig_level = level;
				1875	path->nodes[level] = log->node;
				1876	extent_buffer_get(log->node);
				1877	path->slots[level] = 0;
				1878
				1879	while(1) {
				1880	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1881	if (wret > 0)
				1882	break;
				1883	if (wret < 0)
				1884	ret = wret;
				1885
				1886	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1887	if (wret > 0)
				1888	break;
				1889	if (wret < 0)
				1890	ret = wret;
				1891	}
				1892
				1893	/* was the root node processed? if not, catch it here */
				1894	if (path->nodes[orig_level]) {
				1895	wc->process_func(log, path->nodes[orig_level], wc,
				1896	btrfs_header_generation(path->nodes[orig_level]));
				1897	if (wc->free) {
				1898	struct extent_buffer *next;
				1899
				1900	next = path->nodes[orig_level];
				1901
				1902	btrfs_tree_lock(next);
				1903	clean_tree_block(trans, log, next);
				1904	btrfs_wait_tree_block_writeback(next);
				1905	btrfs_tree_unlock(next);
				1906
				1907	if (orig_level == 0) {
				1908	ret = btrfs_drop_leaf_ref(trans, log,
				1909	next);
				1910	BUG_ON(ret);
				1911	}
				1912	WARN_ON(log->root_key.objectid !=
				1913	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1914	ret = btrfs_free_reserved_extent(log, next->start,
				1915	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1916	BUG_ON(ret);
				1917	}
				1918	}
				1919
				1920	for (i = 0; i <= orig_level; i++) {
				1921	if (path->nodes[i]) {
				1922	free_extent_buffer(path->nodes[i]);
				1923	path->nodes[i] = NULL;
				1924	}
				1925	}
				1926	btrfs_free_path(path);
				1927	if (wc->free)
				1928	free_extent_buffer(log->node);
				1929	return ret;
				1930	}
				1931
				1932	int wait_log_commit(struct btrfs_root *log)
				1933	{
				1934	DEFINE_WAIT(wait);
				1935	u64 transid = log->fs_info->tree_log_transid;
				1936
				1937	do {
				1938	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1939	TASK_UNINTERRUPTIBLE);
				1940	mutex_unlock(&log->fs_info->tree_log_mutex);
				1941	if (atomic_read(&log->fs_info->tree_log_commit))
				1942	schedule();
				1943	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1944	mutex_lock(&log->fs_info->tree_log_mutex);
				1945	} while(transid == log->fs_info->tree_log_transid &&
				1946	atomic_read(&log->fs_info->tree_log_commit));
				1947	return 0;
				1948	}
				1949
				1950	/*
				1951	* btrfs_sync_log does sends a given tree log down to the disk and
				1952	* updates the super blocks to record it. When this call is done,
				1953	* you know that any inodes previously logged are safely on disk
				1954	*/
				1955	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1956	struct btrfs_root *root)
				1957	{
				1958	int ret;
				1959	unsigned long batch;
				1960	struct btrfs_root *log = root->log_root;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1961
				1962	mutex_lock(&log->fs_info->tree_log_mutex);
				1963	if (atomic_read(&log->fs_info->tree_log_commit)) {
				1964	wait_log_commit(log);
				1965	goto out;
				1966	}
				1967	atomic_set(&log->fs_info->tree_log_commit, 1);
				1968
				1969	while(1) {
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	1970	batch = log->fs_info->tree_log_batch;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1971	mutex_unlock(&log->fs_info->tree_log_mutex);
				1972	schedule_timeout_uninterruptible(1);
				1973	mutex_lock(&log->fs_info->tree_log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1974
				1975	while(atomic_read(&log->fs_info->tree_log_writers)) {
				1976	DEFINE_WAIT(wait);
				1977	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1978	TASK_UNINTERRUPTIBLE);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1979	mutex_unlock(&log->fs_info->tree_log_mutex);
				1980	if (atomic_read(&log->fs_info->tree_log_writers))
				1981	schedule();
				1982	mutex_lock(&log->fs_info->tree_log_mutex);
				1983	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1984	}
				1985	if (batch == log->fs_info->tree_log_batch)
				1986	break;
				1987	}
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1988
				1989	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1990	BUG_ON(ret);
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1991	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
				1992	&root->fs_info->log_root_tree->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1993	BUG_ON(ret);
				1994
				1995	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
				1996	log->fs_info->log_root_tree->node->start);
				1997	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
				1998	btrfs_header_level(log->fs_info->log_root_tree->node));
				1999
				2000	write_ctree_super(trans, log->fs_info->tree_root);
				2001	log->fs_info->tree_log_transid++;
				2002	log->fs_info->tree_log_batch = 0;
				2003	atomic_set(&log->fs_info->tree_log_commit, 0);
				2004	smp_mb();
				2005	if (waitqueue_active(&log->fs_info->tree_log_wait))
				2006	wake_up(&log->fs_info->tree_log_wait);
				2007	out:
				2008	mutex_unlock(&log->fs_info->tree_log_mutex);
				2009	return 0;
				2010
				2011	}
				2012
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2013	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2014	* at commit time of the full transaction
				2015	*/
				2016	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2017	{
				2018	int ret;
				2019	struct btrfs_root *log;
				2020	struct key;
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2021	u64 start;
				2022	u64 end;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2023	struct walk_control wc = {
				2024	.free = 1,
				2025	.process_func = process_one_buffer
				2026	};
				2027
				2028	if (!root->log_root)
				2029	return 0;
				2030
				2031	log = root->log_root;
				2032	ret = walk_log_tree(trans, log, &wc);
				2033	BUG_ON(ret);
				2034
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2035	while(1) {
				2036	ret = find_first_extent_bit(&log->dirty_log_pages,
				2037	0, &start, &end, EXTENT_DIRTY);
				2038	if (ret)
				2039	break;
				2040
				2041	clear_extent_dirty(&log->dirty_log_pages,
				2042	start, end, GFP_NOFS);
				2043	}
				2044
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2045	log = root->log_root;
				2046	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2047	&log->root_key);
				2048	BUG_ON(ret);
				2049	root->log_root = NULL;
				2050	kfree(root->log_root);
				2051	return 0;
				2052	}
				2053
				2054	/*
				2055	* helper function to update the item for a given subvolumes log root
				2056	* in the tree of log roots
				2057	*/
				2058	static int update_log_root(struct btrfs_trans_handle *trans,
				2059	struct btrfs_root *log)
				2060	{
				2061	u64 bytenr = btrfs_root_bytenr(&log->root_item);
				2062	int ret;
				2063
				2064	if (log->node->start == bytenr)
				2065	return 0;
				2066
				2067	btrfs_set_root_bytenr(&log->root_item, log->node->start);
				2068	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				2069	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				2070	&log->root_key, &log->root_item);
				2071	BUG_ON(ret);
				2072	return ret;
				2073	}
				2074
				2075	/*
				2076	* If both a file and directory are logged, and unlinks or renames are
				2077	* mixed in, we have a few interesting corners:
				2078	*
				2079	* create file X in dir Y
				2080	* link file X to X.link in dir Y
				2081	* fsync file X
				2082	* unlink file X but leave X.link
				2083	* fsync dir Y
				2084	*
				2085	* After a crash we would expect only X.link to exist. But file X
				2086	* didn't get fsync'd again so the log has back refs for X and X.link.
				2087	*
				2088	* We solve this by removing directory entries and inode backrefs from the
				2089	* log when a file that was logged in the current transaction is
				2090	* unlinked. Any later fsync will include the updated log entries, and
				2091	* we'll be able to reconstruct the proper directory items from backrefs.
				2092	*
				2093	* This optimizations allows us to avoid relogging the entire inode
				2094	* or the entire directory.
				2095	*/
				2096	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2097	struct btrfs_root *root,
				2098	const char *name, int name_len,
				2099	struct inode *dir, u64 index)
				2100	{
				2101	struct btrfs_root *log;
				2102	struct btrfs_dir_item *di;
				2103	struct btrfs_path *path;
				2104	int ret;
				2105	int bytes_del = 0;
				2106
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2107	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2108	return 0;
				2109
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2110	ret = join_running_log_trans(root);
				2111	if (ret)
				2112	return 0;
				2113
				2114	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2115
				2116	log = root->log_root;
				2117	path = btrfs_alloc_path();
				2118	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2119	name, name_len, -1);
				2120	if (di && !IS_ERR(di)) {
				2121	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2122	bytes_del += name_len;
				2123	BUG_ON(ret);
				2124	}
				2125	btrfs_release_path(log, path);
				2126	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2127	index, name, name_len, -1);
				2128	if (di && !IS_ERR(di)) {
				2129	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2130	bytes_del += name_len;
				2131	BUG_ON(ret);
				2132	}
				2133
				2134	/* update the directory size in the log to reflect the names
				2135	* we have removed
				2136	*/
				2137	if (bytes_del) {
				2138	struct btrfs_key key;
				2139
				2140	key.objectid = dir->i_ino;
				2141	key.offset = 0;
				2142	key.type = BTRFS_INODE_ITEM_KEY;
				2143	btrfs_release_path(log, path);
				2144
				2145	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2146	if (ret == 0) {
				2147	struct btrfs_inode_item *item;
				2148	u64 i_size;
				2149
				2150	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2151	struct btrfs_inode_item);
				2152	i_size = btrfs_inode_size(path->nodes[0], item);
				2153	if (i_size > bytes_del)
				2154	i_size -= bytes_del;
				2155	else
				2156	i_size = 0;
				2157	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2158	btrfs_mark_buffer_dirty(path->nodes[0]);
				2159	} else
				2160	ret = 0;
				2161	btrfs_release_path(log, path);
				2162	}
				2163
				2164	btrfs_free_path(path);
				2165	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2166	end_log_trans(root);
				2167
				2168	return 0;
				2169	}
				2170
				2171	/* see comments for btrfs_del_dir_entries_in_log */
				2172	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2173	struct btrfs_root *root,
				2174	const char *name, int name_len,
				2175	struct inode *inode, u64 dirid)
				2176	{
				2177	struct btrfs_root *log;
				2178	u64 index;
				2179	int ret;
				2180
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2181	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2182	return 0;
				2183
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2184	ret = join_running_log_trans(root);
				2185	if (ret)
				2186	return 0;
				2187	log = root->log_root;
				2188	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2189
				2190	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2191	dirid, &index);
				2192	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2193	end_log_trans(root);
				2194
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2195	return ret;
				2196	}
				2197
				2198	/*
				2199	* creates a range item in the log for 'dirid'. first_offset and
				2200	* last_offset tell us which parts of the key space the log should
				2201	* be considered authoritative for.
				2202	*/
				2203	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2204	struct btrfs_root *log,
				2205	struct btrfs_path *path,
				2206	int key_type, u64 dirid,
				2207	u64 first_offset, u64 last_offset)
				2208	{
				2209	int ret;
				2210	struct btrfs_key key;
				2211	struct btrfs_dir_log_item *item;
				2212
				2213	key.objectid = dirid;
				2214	key.offset = first_offset;
				2215	if (key_type == BTRFS_DIR_ITEM_KEY)
				2216	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2217	else
				2218	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2219	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2220	BUG_ON(ret);
				2221
				2222	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2223	struct btrfs_dir_log_item);
				2224	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2225	btrfs_mark_buffer_dirty(path->nodes[0]);
				2226	btrfs_release_path(log, path);
				2227	return 0;
				2228	}
				2229
				2230	/*
				2231	* log all the items included in the current transaction for a given
				2232	* directory. This also creates the range items in the log tree required
				2233	* to replay anything deleted before the fsync
				2234	*/
				2235	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2236	struct btrfs_root root, struct inode inode,
				2237	struct btrfs_path *path,
				2238	struct btrfs_path *dst_path, int key_type,
				2239	u64 min_offset, u64 *last_offset_ret)
				2240	{
				2241	struct btrfs_key min_key;
				2242	struct btrfs_key max_key;
				2243	struct btrfs_root *log = root->log_root;
				2244	struct extent_buffer *src;
				2245	int ret;
				2246	int i;
				2247	int nritems;
				2248	u64 first_offset = min_offset;
				2249	u64 last_offset = (u64)-1;
				2250
				2251	log = root->log_root;
				2252	max_key.objectid = inode->i_ino;
				2253	max_key.offset = (u64)-1;
				2254	max_key.type = key_type;
				2255
				2256	min_key.objectid = inode->i_ino;
				2257	min_key.type = key_type;
				2258	min_key.offset = min_offset;
				2259
				2260	path->keep_locks = 1;
				2261
				2262	ret = btrfs_search_forward(root, &min_key, &max_key,
				2263	path, 0, trans->transid);
				2264
				2265	/*
				2266	* we didn't find anything from this transaction, see if there
				2267	* is anything at all
				2268	*/
				2269	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2270	min_key.type != key_type) {
				2271	min_key.objectid = inode->i_ino;
				2272	min_key.type = key_type;
				2273	min_key.offset = (u64)-1;
				2274	btrfs_release_path(root, path);
				2275	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2276	if (ret < 0) {
				2277	btrfs_release_path(root, path);
				2278	return ret;
				2279	}
				2280	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2281
				2282	/* if ret == 0 there are items for this type,
				2283	* create a range to tell us the last key of this type.
				2284	* otherwise, there are no items in this directory after
				2285	* *min_offset, and we create a range to indicate that.
				2286	*/
				2287	if (ret == 0) {
				2288	struct btrfs_key tmp;
				2289	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2290	path->slots[0]);
				2291	if (key_type == tmp.type) {
				2292	first_offset = max(min_offset, tmp.offset) + 1;
				2293	}
				2294	}
				2295	goto done;
				2296	}
				2297
				2298	/* go backward to find any previous key */
				2299	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2300	if (ret == 0) {
				2301	struct btrfs_key tmp;
				2302	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2303	if (key_type == tmp.type) {
				2304	first_offset = tmp.offset;
				2305	ret = overwrite_item(trans, log, dst_path,
				2306	path->nodes[0], path->slots[0],
				2307	&tmp);
				2308	}
				2309	}
				2310	btrfs_release_path(root, path);
				2311
				2312	/* find the first key from this transaction again */
				2313	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2314	if (ret != 0) {
				2315	WARN_ON(1);
				2316	goto done;
				2317	}
				2318
				2319	/*
				2320	* we have a block from this transaction, log every item in it
				2321	* from our directory
				2322	*/
				2323	while(1) {
				2324	struct btrfs_key tmp;
				2325	src = path->nodes[0];
				2326	nritems = btrfs_header_nritems(src);
				2327	for (i = path->slots[0]; i < nritems; i++) {
				2328	btrfs_item_key_to_cpu(src, &min_key, i);
				2329
				2330	if (min_key.objectid != inode->i_ino \|\|
				2331	min_key.type != key_type)
				2332	goto done;
				2333	ret = overwrite_item(trans, log, dst_path, src, i,
				2334	&min_key);
				2335	BUG_ON(ret);
				2336	}
				2337	path->slots[0] = nritems;
				2338
				2339	/*
				2340	* look ahead to the next item and see if it is also
				2341	* from this directory and from this transaction
				2342	*/
				2343	ret = btrfs_next_leaf(root, path);
				2344	if (ret == 1) {
				2345	last_offset = (u64)-1;
				2346	goto done;
				2347	}
				2348	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2349	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2350	last_offset = (u64)-1;
				2351	goto done;
				2352	}
				2353	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2354	ret = overwrite_item(trans, log, dst_path,
				2355	path->nodes[0], path->slots[0],
				2356	&tmp);
				2357
				2358	BUG_ON(ret);
				2359	last_offset = tmp.offset;
				2360	goto done;
				2361	}
				2362	}
				2363	done:
				2364	*last_offset_ret = last_offset;
				2365	btrfs_release_path(root, path);
				2366	btrfs_release_path(log, dst_path);
				2367
				2368	/* insert the log range keys to indicate where the log is valid */
				2369	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2370	first_offset, last_offset);
				2371	BUG_ON(ret);
				2372	return 0;
				2373	}
				2374
				2375	/*
				2376	* logging directories is very similar to logging inodes, We find all the items
				2377	* from the current transaction and write them to the log.
				2378	*
				2379	* The recovery code scans the directory in the subvolume, and if it finds a
				2380	* key in the range logged that is not present in the log tree, then it means
				2381	* that dir entry was unlinked during the transaction.
				2382	*
				2383	* In order for that scan to work, we must include one key smaller than
				2384	* the smallest logged by this transaction and one key larger than the largest
				2385	* key logged by this transaction.
				2386	*/
				2387	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2388	struct btrfs_root root, struct inode inode,
				2389	struct btrfs_path *path,
				2390	struct btrfs_path *dst_path)
				2391	{
				2392	u64 min_key;
				2393	u64 max_key;
				2394	int ret;
				2395	int key_type = BTRFS_DIR_ITEM_KEY;
				2396
				2397	again:
				2398	min_key = 0;
				2399	max_key = 0;
				2400	while(1) {
				2401	ret = log_dir_items(trans, root, inode, path,
				2402	dst_path, key_type, min_key,
				2403	&max_key);
				2404	BUG_ON(ret);
				2405	if (max_key == (u64)-1)
				2406	break;
				2407	min_key = max_key + 1;
				2408	}
				2409
				2410	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2411	key_type = BTRFS_DIR_INDEX_KEY;
				2412	goto again;
				2413	}
				2414	return 0;
				2415	}
				2416
				2417	/*
				2418	* a helper function to drop items from the log before we relog an
				2419	* inode. max_key_type indicates the highest item type to remove.
				2420	* This cannot be run for file data extents because it does not
				2421	* free the extents they point to.
				2422	*/
				2423	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2424	struct btrfs_root *log,
				2425	struct btrfs_path *path,
				2426	u64 objectid, int max_key_type)
				2427	{
				2428	int ret;
				2429	struct btrfs_key key;
				2430	struct btrfs_key found_key;
				2431
				2432	key.objectid = objectid;
				2433	key.type = max_key_type;
				2434	key.offset = (u64)-1;
				2435
				2436	while(1) {
				2437	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2438
				2439	if (ret != 1)
				2440	break;
				2441
				2442	if (path->slots[0] == 0)
				2443	break;
				2444
				2445	path->slots[0]--;
				2446	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2447	path->slots[0]);
				2448
				2449	if (found_key.objectid != objectid)
				2450	break;
				2451
				2452	ret = btrfs_del_item(trans, log, path);
				2453	BUG_ON(ret);
				2454	btrfs_release_path(log, path);
				2455	}
				2456	btrfs_release_path(log, path);
				2457	return 0;
				2458	}
				2459
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2460	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2461	struct btrfs_root *log,
				2462	struct btrfs_path *dst_path,
				2463	struct extent_buffer *src,
				2464	int start_slot, int nr, int inode_only)
				2465	{
				2466	unsigned long src_offset;
				2467	unsigned long dst_offset;
				2468	struct btrfs_file_extent_item *extent;
				2469	struct btrfs_inode_item *inode_item;
				2470	int ret;
				2471	struct btrfs_key *ins_keys;
				2472	u32 *ins_sizes;
				2473	char *ins_data;
				2474	int i;
				2475
				2476	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2477	nr * sizeof(u32), GFP_NOFS);
				2478	ins_sizes = (u32 *)ins_data;
				2479	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2480
				2481	for (i = 0; i < nr; i++) {
				2482	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2483	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2484	}
				2485	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2486	ins_keys, ins_sizes, nr);
				2487	BUG_ON(ret);
				2488
				2489	for (i = 0; i < nr; i++) {
				2490	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2491	dst_path->slots[0]);
				2492
				2493	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2494
				2495	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2496	src_offset, ins_sizes[i]);
				2497
				2498	if (inode_only == LOG_INODE_EXISTS &&
				2499	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2500	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2501	dst_path->slots[0],
				2502	struct btrfs_inode_item);
				2503	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2504
				2505	/* set the generation to zero so the recover code
				2506	* can tell the difference between an logging
				2507	* just to say 'this inode exists' and a logging
				2508	* to say 'update this inode with these values'
				2509	*/
				2510	btrfs_set_inode_generation(dst_path->nodes[0],
				2511	inode_item, 0);
				2512	}
				2513	/* take a reference on file data extents so that truncates
				2514	* or deletes of this inode don't have to relog the inode
				2515	* again
				2516	*/
				2517	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2518	int found_type;
				2519	extent = btrfs_item_ptr(src, start_slot + i,
				2520	struct btrfs_file_extent_item);
				2521
				2522	found_type = btrfs_file_extent_type(src, extent);
				2523	if (found_type == BTRFS_FILE_EXTENT_REG) {
				2524	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2525	extent);
				2526	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2527	extent);
				2528	/* ds == 0 is a hole */
				2529	if (ds != 0) {
				2530	ret = btrfs_inc_extent_ref(trans, log,
				2531	ds, dl,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2532	dst_path->nodes[0]->start,
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2533	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2534	trans->transid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	2535	ins_keys[i].objectid);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2536	BUG_ON(ret);
				2537	}
				2538	}
				2539	}
				2540	dst_path->slots[0]++;
				2541	}
				2542
				2543	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2544	btrfs_release_path(log, dst_path);
				2545	kfree(ins_data);
				2546	return 0;
				2547	}
				2548
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2549	/* log a single inode in the tree log.
				2550	* At least one parent directory for this inode must exist in the tree
				2551	* or be logged already.
				2552	*
				2553	* Any items from this inode changed by the current transaction are copied
				2554	* to the log tree. An extra reference is taken on any extents in this
				2555	* file, allowing us to avoid a whole pile of corner cases around logging
				2556	* blocks that have been removed from the tree.
				2557	*
				2558	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2559	* does.
				2560	*
				2561	* This handles both files and directories.
				2562	*/
				2563	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2564	struct btrfs_root root, struct inode inode,
				2565	int inode_only)
				2566	{
				2567	struct btrfs_path *path;
				2568	struct btrfs_path *dst_path;
				2569	struct btrfs_key min_key;
				2570	struct btrfs_key max_key;
				2571	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2572	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2573	u32 size;
				2574	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2575	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2576	int ins_start_slot = 0;
				2577	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2578
				2579	log = root->log_root;
				2580
				2581	path = btrfs_alloc_path();
				2582	dst_path = btrfs_alloc_path();
				2583
				2584	min_key.objectid = inode->i_ino;
				2585	min_key.type = BTRFS_INODE_ITEM_KEY;
				2586	min_key.offset = 0;
				2587
				2588	max_key.objectid = inode->i_ino;
				2589	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2590	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2591	else
				2592	max_key.type = (u8)-1;
				2593	max_key.offset = (u64)-1;
				2594
				2595	/*
				2596	* if this inode has already been logged and we're in inode_only
				2597	* mode, we don't want to delete the things that have already
				2598	* been written to the log.
				2599	*
				2600	* But, if the inode has been through an inode_only log,
				2601	* the logged_trans field is not set. This allows us to catch
				2602	* any new names for this inode in the backrefs by logging it
				2603	* again
				2604	*/
				2605	if (inode_only == LOG_INODE_EXISTS &&
				2606	BTRFS_I(inode)->logged_trans == trans->transid) {
				2607	btrfs_free_path(path);
				2608	btrfs_free_path(dst_path);
				2609	goto out;
				2610	}
				2611	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2612
				2613	/*
				2614	* a brute force approach to making sure we get the most uptodate
				2615	* copies of everything.
				2616	*/
				2617	if (S_ISDIR(inode->i_mode)) {
				2618	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2619
				2620	if (inode_only == LOG_INODE_EXISTS)
				2621	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2622	ret = drop_objectid_items(trans, log, path,
				2623	inode->i_ino, max_key_type);
				2624	} else {
				2625	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2626	}
				2627	BUG_ON(ret);
				2628	path->keep_locks = 1;
				2629
				2630	while(1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2631	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2632	ret = btrfs_search_forward(root, &min_key, &max_key,
				2633	path, 0, trans->transid);
				2634	if (ret != 0)
				2635	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2636	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2637	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2638	if (min_key.objectid != inode->i_ino)
				2639	break;
				2640	if (min_key.type > max_key.type)
				2641	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2642
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2643	src = path->nodes[0];
				2644	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2645	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2646	ins_nr++;
				2647	goto next_slot;
				2648	} else if (!ins_nr) {
				2649	ins_start_slot = path->slots[0];
				2650	ins_nr = 1;
				2651	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2652	}
				2653
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2654	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2655	ins_nr, inode_only);
				2656	BUG_ON(ret);
				2657	ins_nr = 1;
				2658	ins_start_slot = path->slots[0];
				2659	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2660
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2661	nritems = btrfs_header_nritems(path->nodes[0]);
				2662	path->slots[0]++;
				2663	if (path->slots[0] < nritems) {
				2664	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2665	path->slots[0]);
				2666	goto again;
				2667	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2668	if (ins_nr) {
				2669	ret = copy_items(trans, log, dst_path, src,
				2670	ins_start_slot,
				2671	ins_nr, inode_only);
				2672	BUG_ON(ret);
				2673	ins_nr = 0;
				2674	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2675	btrfs_release_path(root, path);
				2676
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2677	if (min_key.offset < (u64)-1)
				2678	min_key.offset++;
				2679	else if (min_key.type < (u8)-1)
				2680	min_key.type++;
				2681	else if (min_key.objectid < (u64)-1)
				2682	min_key.objectid++;
				2683	else
				2684	break;
				2685	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2686	if (ins_nr) {
				2687	ret = copy_items(trans, log, dst_path, src,
				2688	ins_start_slot,
				2689	ins_nr, inode_only);
				2690	BUG_ON(ret);
				2691	ins_nr = 0;
				2692	}
				2693	WARN_ON(ins_nr);
Chris Mason	9623f9a	2008-09-11 17:42:42 -0400	[diff] [blame]	2694	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2695	btrfs_release_path(root, path);
				2696	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2697	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2698	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2699	BUG_ON(ret);
				2700	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2701	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2702	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2703
				2704	btrfs_free_path(path);
				2705	btrfs_free_path(dst_path);
				2706
				2707	mutex_lock(&root->fs_info->tree_log_mutex);
				2708	ret = update_log_root(trans, log);
				2709	BUG_ON(ret);
				2710	mutex_unlock(&root->fs_info->tree_log_mutex);
				2711	out:
				2712	return 0;
				2713	}
				2714
				/*
				 * Log a single inode inside its own log transaction.
				 *
				 * The call to __btrfs_log_inode() is bracketed by start_log_trans()
				 * and end_log_trans() on @root.  @inode_only is forwarded unchanged;
				 * it is one of LOG_INODE_ALL or LOG_INODE_EXISTS.
				 *
				 * Returns whatever __btrfs_log_inode() returned.
				 */
				int btrfs_log_inode(struct btrfs_trans_handle *trans,
						    struct btrfs_root *root, struct inode *inode,
						    int inode_only)
				{
					int ret;

					start_log_trans(trans, root);
					ret = __btrfs_log_inode(trans, root, inode, inode_only);
					end_log_trans(root);
					return ret;
				}
				2726
				2727	/*
				2728	* helper function around btrfs_log_inode to make sure newly created
				2729	* parent directories also end up in the log. A minimal inode and backref
				2730	* only logging is done of any parent directories that are older than
				2731	* the last committed transaction
				2732	*/
				2733	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2734	struct btrfs_root root, struct dentry dentry)
				2735	{
				2736	int inode_only = LOG_INODE_ALL;
				2737	struct super_block *sb;
				2738	int ret;
				2739
				2740	start_log_trans(trans, root);
				2741	sb = dentry->d_inode->i_sb;
				2742	while(1) {
				2743	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2744	inode_only);
				2745	BUG_ON(ret);
				2746	inode_only = LOG_INODE_EXISTS;
				2747
				2748	dentry = dentry->d_parent;
				2749	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2750	break;
				2751
				2752	if (BTRFS_I(dentry->d_inode)->generation <=
				2753	root->fs_info->last_trans_committed)
				2754	break;
				2755	}
				2756	end_log_trans(root);
				2757	return 0;
				2758	}
				2759
				2760	/*
				2761	* it is not safe to log dentry if the chunk root has added new
				2762	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2763	* If this returns 1, you must commit the transaction to safely get your
				2764	* data on disk.
				2765	*/
				2766	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2767	struct btrfs_root root, struct dentry dentry)
				2768	{
				2769	u64 gen;
				2770	gen = root->fs_info->last_trans_new_blockgroup;
				2771	if (gen > root->fs_info->last_trans_committed)
				2772	return 1;
				2773	else
				2774	return btrfs_log_dentry(trans, root, dentry);
				2775	}
				2776
				2777	/*
				2778	* should be called during mount to recover any replay any log trees
				2779	* from the FS
				2780	*/
				2781	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2782	{
				2783	int ret;
				2784	struct btrfs_path *path;
				2785	struct btrfs_trans_handle *trans;
				2786	struct btrfs_key key;
				2787	struct btrfs_key found_key;
				2788	struct btrfs_key tmp_key;
				2789	struct btrfs_root *log;
				2790	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2791	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2792	struct walk_control wc = {
				2793	.process_func = process_one_buffer,
				2794	.stage = 0,
				2795	};
				2796
				2797	fs_info->log_root_recovering = 1;
				2798	path = btrfs_alloc_path();
				2799	BUG_ON(!path);
				2800
				2801	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2802
				2803	wc.trans = trans;
				2804	wc.pin = 1;
				2805
				2806	walk_log_tree(trans, log_root_tree, &wc);
				2807
				2808	again:
				2809	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2810	key.offset = (u64)-1;
				2811	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2812
				2813	while(1) {
				2814	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2815	if (ret < 0)
				2816	break;
				2817	if (ret > 0) {
				2818	if (path->slots[0] == 0)
				2819	break;
				2820	path->slots[0]--;
				2821	}
				2822	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2823	path->slots[0]);
				2824	btrfs_release_path(log_root_tree, path);
				2825	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2826	break;
				2827
				2828	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2829	&found_key);
				2830	BUG_ON(!log);
				2831
				2832
				2833	tmp_key.objectid = found_key.offset;
				2834	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2835	tmp_key.offset = (u64)-1;
				2836
				2837	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				2838
				2839	BUG_ON(!wc.replay_dest);
				2840
				2841	btrfs_record_root_in_trans(wc.replay_dest);
				2842	ret = walk_log_tree(trans, log, &wc);
				2843	BUG_ON(ret);
				2844
				2845	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2846	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2847	path);
				2848	BUG_ON(ret);
				2849	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2850	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2851	if (ret == 0) {
				2852	wc.replay_dest->highest_inode = highest_inode;
				2853	wc.replay_dest->last_inode_alloc = highest_inode;
				2854	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2855
				2856	key.offset = found_key.offset - 1;
				2857	free_extent_buffer(log->node);
				2858	kfree(log);
				2859
				2860	if (found_key.offset == 0)
				2861	break;
				2862	}
				2863	btrfs_release_path(log_root_tree, path);
				2864
				2865	/* step one is to pin it all, step two is to replay just inodes */
				2866	if (wc.pin) {
				2867	wc.pin = 0;
				2868	wc.process_func = replay_one_buffer;
				2869	wc.stage = LOG_WALK_REPLAY_INODES;
				2870	goto again;
				2871	}
				2872	/* step three is to replay everything */
				2873	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2874	wc.stage++;
				2875	goto again;
				2876	}
				2877
				2878	btrfs_free_path(path);
				2879
				2880	free_extent_buffer(log_root_tree->node);
				2881	log_root_tree->log_root = NULL;
				2882	fs_info->log_root_recovering = 0;
				2883
				2884	/* step 4: commit the transaction, which also unpins the blocks */
				2885	btrfs_commit_transaction(trans, fs_info->tree_root);
				2886
				2887	kfree(log_root_tree);
				2888	return 0;
				2889	}