Blame - fs/btrfs/tree-log.c - fp2-dev/kernel/msm

blob: be4fc30a30e48a6a15b750d9c932caea4ca35a19 [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
				26
				27	/* magic values for the inode_only field in btrfs_log_inode:
				28	*
				29	* LOG_INODE_ALL means to log everything
				30	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				31	* during log replay
				32	*/
				33	#define LOG_INODE_ALL 0
				34	#define LOG_INODE_EXISTS 1
				35
				36	/*
				37	* stages for the tree walking. The first
				38	* stage (0) is to only pin down the blocks we find
				39	* the second stage (1) is to make sure that all the inodes
				40	* we find in the log are created in the subvolume.
				41	*
				42	* The last stage is to deal with directories and links and extents
				43	* and all the other fun semantics
				44	*/
				45	#define LOG_WALK_PIN_ONLY 0
				46	#define LOG_WALK_REPLAY_INODES 1
				47	#define LOG_WALK_REPLAY_ALL 2
				48
				/* forward declaration; inode_only takes one of the LOG_INODE_* values */
				static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
							     struct btrfs_root *root, struct inode *inode,
							     int inode_only);
				52
				53	/*
				54	* tree logging is a special write ahead log used to make sure that
				55	* fsyncs and O_SYNCs can happen without doing full tree commits.
				56	*
				57	* Full tree commits are expensive because they require commonly
				58	* modified blocks to be recowed, creating many dirty pages in the
				59	* extent tree and a 4x-6x higher write load than ext3.
				60	*
				61	* Instead of doing a tree commit on every fsync, we use the
				62	* key ranges and transaction ids to find items for a given file or directory
				63	* that have changed in this transaction. Those items are copied into
				64	* a special tree (one per subvolume root), that tree is written to disk
				65	* and then the fsync is considered complete.
				66	*
				67	* After a crash, items are copied out of the log-tree back into the
				68	* subvolume tree. Any file data extents found are recorded in the extent
				69	* allocation tree, and the log-tree freed.
				70	*
				71	* The log tree is read three times, once to pin down all the extents it is
				72	* using in ram, once to create all the inodes logged in the tree
				73	* and once to do all the other items.
				74	*/
				75
				76	/*
				77	* btrfs_add_log_tree adds a new per-subvolume log tree into the
				78	* tree of log tree roots. This must be called with a tree log transaction
				79	* running (see start_log_trans).
				80	*/
				81	int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
				82	struct btrfs_root *root)
				83	{
				84	struct btrfs_key key;
				85	struct btrfs_root_item root_item;
				86	struct btrfs_inode_item *inode_item;
				87	struct extent_buffer *leaf;
				88	struct btrfs_root *new_root = root;
				89	int ret;
				90	u64 objectid = root->root_key.objectid;
				91
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	92	leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	93	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	94	trans->transid, 0, 0, 0);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	95	if (IS_ERR(leaf)) {
				96	ret = PTR_ERR(leaf);
				97	return ret;
				98	}
				99
				100	btrfs_set_header_nritems(leaf, 0);
				101	btrfs_set_header_level(leaf, 0);
				102	btrfs_set_header_bytenr(leaf, leaf->start);
				103	btrfs_set_header_generation(leaf, trans->transid);
				104	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				105
				106	write_extent_buffer(leaf, root->fs_info->fsid,
				107	(unsigned long)btrfs_header_fsid(leaf),
				108	BTRFS_FSID_SIZE);
				109	btrfs_mark_buffer_dirty(leaf);
				110
				111	inode_item = &root_item.inode;
				112	memset(inode_item, 0, sizeof(*inode_item));
				113	inode_item->generation = cpu_to_le64(1);
				114	inode_item->size = cpu_to_le64(3);
				115	inode_item->nlink = cpu_to_le32(1);
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	116	inode_item->nbytes = cpu_to_le64(root->leafsize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	117	inode_item->mode = cpu_to_le32(S_IFDIR \| 0755);
				118
				119	btrfs_set_root_bytenr(&root_item, leaf->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	120	btrfs_set_root_generation(&root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	121	btrfs_set_root_level(&root_item, 0);
				122	btrfs_set_root_refs(&root_item, 0);
				123	btrfs_set_root_used(&root_item, 0);
				124
				125	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
				126	root_item.drop_level = 0;
				127
				128	btrfs_tree_unlock(leaf);
				129	free_extent_buffer(leaf);
				130	leaf = NULL;
				131
				132	btrfs_set_root_dirid(&root_item, 0);
				133
				134	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				135	key.offset = objectid;
				136	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				137	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
				138	&root_item);
				139	if (ret)
				140	goto fail;
				141
				142	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
				143	&key);
				144	BUG_ON(!new_root);
				145
				146	WARN_ON(root->log_root);
				147	root->log_root = new_root;
				148
				149	/*
				150	* log trees do not get reference counted because they go away
				151	* before a real commit is actually done. They do store pointers
				152	* to file data extents, and those reference counts still get
				153	* updated (along with back refs to the log tree).
				154	*/
				155	new_root->ref_cows = 0;
				156	new_root->last_trans = trans->transid;
				157	fail:
				158	return ret;
				159	}
				160
				161	/*
				162	* start a sub transaction and setup the log tree
				163	* this increments the log tree writer count to make the people
				164	* syncing the tree wait for us to finish
				165	*/
				166	static int start_log_trans(struct btrfs_trans_handle *trans,
				167	struct btrfs_root *root)
				168	{
				169	int ret;
				170	mutex_lock(&root->fs_info->tree_log_mutex);
				171	if (!root->fs_info->log_root_tree) {
				172	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				173	BUG_ON(ret);
				174	}
				175	if (!root->log_root) {
				176	ret = btrfs_add_log_tree(trans, root);
				177	BUG_ON(ret);
				178	}
				179	atomic_inc(&root->fs_info->tree_log_writers);
				180	root->fs_info->tree_log_batch++;
				181	mutex_unlock(&root->fs_info->tree_log_mutex);
				182	return 0;
				183	}
				184
				185	/*
				186	* returns 0 if there was a log transaction running and we were able
				187	* to join, or returns -ENOENT if there were not transactions
				188	* in progress
				189	*/
				190	static int join_running_log_trans(struct btrfs_root *root)
				191	{
				192	int ret = -ENOENT;
				193
				194	smp_mb();
				195	if (!root->log_root)
				196	return -ENOENT;
				197
				198	mutex_lock(&root->fs_info->tree_log_mutex);
				199	if (root->log_root) {
				200	ret = 0;
				201	atomic_inc(&root->fs_info->tree_log_writers);
				202	root->fs_info->tree_log_batch++;
				203	}
				204	mutex_unlock(&root->fs_info->tree_log_mutex);
				205	return ret;
				206	}
				207
				208	/*
				209	* indicate we're done making changes to the log tree
				210	* and wake up anyone waiting to do a sync
				211	*/
				212	static int end_log_trans(struct btrfs_root *root)
				213	{
				214	atomic_dec(&root->fs_info->tree_log_writers);
				215	smp_mb();
				216	if (waitqueue_active(&root->fs_info->tree_log_wait))
				217	wake_up(&root->fs_info->tree_log_wait);
				218	return 0;
				219	}
				220
				221
				222	/*
				223	* the walk control struct is used to pass state down the chain when
				224	* processing the log tree. The stage field tells us which part
				225	* of the log tree processing we are currently doing. The others
				226	* are state fields used for that specific part
				227	*/
				228	struct walk_control {
				229	/* should we free the extent on disk when done? This is used
				230	* at transaction commit time while freeing a log tree
				231	*/
				232	int free;
				233
				234	/* should we write out the extent buffer? This is used
				235	* while flushing the log tree to disk during a sync
				236	*/
				237	int write;
				238
				239	/* should we wait for the extent buffer io to finish? Also used
				240	* while flushing the log tree to disk for a sync
				241	*/
				242	int wait;
				243
				244	/* pin only walk, we record which extents on disk belong to the
				245	* log trees
				246	*/
				247	int pin;
				248
				249	/* what stage of the replay code we're currently in */
				250	int stage;
				251
				252	/* the root we are currently replaying */
				253	struct btrfs_root *replay_dest;
				254
				255	/* the trans handle for the current replay */
				256	struct btrfs_trans_handle *trans;
				257
				258	/* the function that gets used to process blocks we find in the
				259	* tree. Note the extent_buffer might not be up to date when it is
				260	* passed in, and it must be checked or read if you need the data
				261	* inside it
				262	*/
				263	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				264	struct walk_control *wc, u64 gen);
				265	};
				266
				267	/*
				268	* process_func used to pin down extents, write them or wait on them
				269	*/
				270	static int process_one_buffer(struct btrfs_root *log,
				271	struct extent_buffer *eb,
				272	struct walk_control *wc, u64 gen)
				273	{
				274	if (wc->pin) {
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	275	mutex_lock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	276	btrfs_update_pinned_extents(log->fs_info->extent_root,
				277	eb->start, eb->len, 1);
Josef Bacik	2517920	2008-10-29 14:49:05 -0400	[diff] [blame]	278	mutex_unlock(&log->fs_info->pinned_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	279	}
				280
				281	if (btrfs_buffer_uptodate(eb, gen)) {
				282	if (wc->write)
				283	btrfs_write_tree_block(eb);
				284	if (wc->wait)
				285	btrfs_wait_tree_block_writeback(eb);
				286	}
				287	return 0;
				288	}
				289
				290	/*
				291	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				292	* to the src data we are copying out.
				293	*
				294	* root is the tree we are copying into, and path is a scratch
				295	* path for use in this function (it should be released on entry and
				296	* will be released on exit).
				297	*
				298	* If the key is already in the destination tree the existing item is
				299	* overwritten. If the existing item isn't big enough, it is extended.
				300	* If it is too large, it is truncated.
				301	*
				302	* If the key isn't in the destination yet, a new item is inserted.
				303	*/
				304	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				305	struct btrfs_root *root,
				306	struct btrfs_path *path,
				307	struct extent_buffer *eb, int slot,
				308	struct btrfs_key *key)
				309	{
				310	int ret;
				311	u32 item_size;
				312	u64 saved_i_size = 0;
				313	int save_old_i_size = 0;
				314	unsigned long src_ptr;
				315	unsigned long dst_ptr;
				316	int overwrite_root = 0;
				317
				318	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				319	overwrite_root = 1;
				320
				321	item_size = btrfs_item_size_nr(eb, slot);
				322	src_ptr = btrfs_item_ptr_offset(eb, slot);
				323
				324	/* look for the key in the destination tree */
				325	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				326	if (ret == 0) {
				327	char *src_copy;
				328	char *dst_copy;
				329	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				330	path->slots[0]);
				331	if (dst_size != item_size)
				332	goto insert;
				333
				334	if (item_size == 0) {
				335	btrfs_release_path(root, path);
				336	return 0;
				337	}
				338	dst_copy = kmalloc(item_size, GFP_NOFS);
				339	src_copy = kmalloc(item_size, GFP_NOFS);
				340
				341	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				342
				343	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				344	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				345	item_size);
				346	ret = memcmp(dst_copy, src_copy, item_size);
				347
				348	kfree(dst_copy);
				349	kfree(src_copy);
				350	/*
				351	* they have the same contents, just return, this saves
				352	* us from cowing blocks in the destination tree and doing
				353	* extra writes that may not have been done by a previous
				354	* sync
				355	*/
				356	if (ret == 0) {
				357	btrfs_release_path(root, path);
				358	return 0;
				359	}
				360
				361	}
				362	insert:
				363	btrfs_release_path(root, path);
				364	/* try to insert the key into the destination tree */
				365	ret = btrfs_insert_empty_item(trans, root, path,
				366	key, item_size);
				367
				368	/* make sure any existing item is the correct size */
				369	if (ret == -EEXIST) {
				370	u32 found_size;
				371	found_size = btrfs_item_size_nr(path->nodes[0],
				372	path->slots[0]);
				373	if (found_size > item_size) {
				374	btrfs_truncate_item(trans, root, path, item_size, 1);
				375	} else if (found_size < item_size) {
				376	ret = btrfs_del_item(trans, root,
				377	path);
				378	BUG_ON(ret);
				379
				380	btrfs_release_path(root, path);
				381	ret = btrfs_insert_empty_item(trans,
				382	root, path, key, item_size);
				383	BUG_ON(ret);
				384	}
				385	} else if (ret) {
				386	BUG();
				387	}
				388	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				389	path->slots[0]);
				390
				391	/* don't overwrite an existing inode if the generation number
				392	* was logged as zero. This is done when the tree logging code
				393	* is just logging an inode to make sure it exists after recovery.
				394	*
				395	* Also, don't overwrite i_size on directories during replay.
				396	* log replay inserts and removes directory items based on the
				397	* state of the tree found in the subvolume, and i_size is modified
				398	* as it goes
				399	*/
				400	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				401	struct btrfs_inode_item *src_item;
				402	struct btrfs_inode_item *dst_item;
				403
				404	src_item = (struct btrfs_inode_item *)src_ptr;
				405	dst_item = (struct btrfs_inode_item *)dst_ptr;
				406
				407	if (btrfs_inode_generation(eb, src_item) == 0)
				408	goto no_copy;
				409
				410	if (overwrite_root &&
				411	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				412	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				413	save_old_i_size = 1;
				414	saved_i_size = btrfs_inode_size(path->nodes[0],
				415	dst_item);
				416	}
				417	}
				418
				419	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				420	src_ptr, item_size);
				421
				422	if (save_old_i_size) {
				423	struct btrfs_inode_item *dst_item;
				424	dst_item = (struct btrfs_inode_item *)dst_ptr;
				425	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				426	}
				427
				428	/* make sure the generation is filled in */
				429	if (key->type == BTRFS_INODE_ITEM_KEY) {
				430	struct btrfs_inode_item *dst_item;
				431	dst_item = (struct btrfs_inode_item *)dst_ptr;
				432	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				433	btrfs_set_inode_generation(path->nodes[0], dst_item,
				434	trans->transid);
				435	}
				436	}
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	437
				438	if (overwrite_root &&
				439	key->type == BTRFS_EXTENT_DATA_KEY) {
				440	int extent_type;
				441	struct btrfs_file_extent_item *fi;
				442
				443	fi = (struct btrfs_file_extent_item *)dst_ptr;
				444	extent_type = btrfs_file_extent_type(path->nodes[0], fi);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame^]	445	if (extent_type == BTRFS_FILE_EXTENT_REG \|\|
				446	extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	447	struct btrfs_key ins;
				448	ins.objectid = btrfs_file_extent_disk_bytenr(
				449	path->nodes[0], fi);
				450	ins.offset = btrfs_file_extent_disk_num_bytes(
				451	path->nodes[0], fi);
				452	ins.type = BTRFS_EXTENT_ITEM_KEY;
				453
				454	/*
				455	* is this extent already allocated in the extent
				456	* allocation tree? If so, just add a reference
				457	*/
				458	ret = btrfs_lookup_extent(root, ins.objectid,
				459	ins.offset);
				460	if (ret == 0) {
				461	ret = btrfs_inc_extent_ref(trans, root,
				462	ins.objectid, ins.offset,
				463	path->nodes[0]->start,
				464	root->root_key.objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	465	trans->transid, key->objectid);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	466	} else {
				467	/*
				468	* insert the extent pointer in the extent
				469	* allocation tree
				470	*/
				471	ret = btrfs_alloc_logged_extent(trans, root,
				472	path->nodes[0]->start,
				473	root->root_key.objectid,
				474	trans->transid, key->objectid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	475	&ins);
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	476	BUG_ON(ret);
				477	}
				478	}
				479	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	480	no_copy:
				481	btrfs_mark_buffer_dirty(path->nodes[0]);
				482	btrfs_release_path(root, path);
				483	return 0;
				484	}
				485
				486	/*
				487	* simple helper to read an inode off the disk from a given root
				488	* This can only be called for subvolume roots and not for the log
				489	*/
				490	static noinline struct inode read_one_inode(struct btrfs_root root,
				491	u64 objectid)
				492	{
				493	struct inode *inode;
				494	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				495	if (inode->i_state & I_NEW) {
				496	BTRFS_I(inode)->root = root;
				497	BTRFS_I(inode)->location.objectid = objectid;
				498	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				499	BTRFS_I(inode)->location.offset = 0;
				500	btrfs_read_locked_inode(inode);
				501	unlock_new_inode(inode);
				502
				503	}
				504	if (is_bad_inode(inode)) {
				505	iput(inode);
				506	inode = NULL;
				507	}
				508	return inode;
				509	}
				510
				511	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				512	* subvolume 'root'. path is released on entry and should be released
				513	* on exit.
				514	*
				515	* extents in the log tree have not been allocated out of the extent
				516	* tree yet. So, this completes the allocation, taking a reference
				517	* as required if the extent already exists or creating a new extent
				518	* if it isn't in the extent allocation tree yet.
				519	*
				520	* The extent is inserted into the file, dropping any existing extents
				521	* from the file that overlap the new one.
				522	*/
				523	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				524	struct btrfs_root *root,
				525	struct btrfs_path *path,
				526	struct extent_buffer *eb, int slot,
				527	struct btrfs_key *key)
				528	{
				529	int found_type;
				530	u64 mask = root->sectorsize - 1;
				531	u64 extent_end;
				532	u64 alloc_hint;
				533	u64 start = key->offset;
				534	struct btrfs_file_extent_item *item;
				535	struct inode *inode = NULL;
				536	unsigned long size;
				537	int ret = 0;
				538
				539	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				540	found_type = btrfs_file_extent_type(eb, item);
				541
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame^]	542	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				543	found_type == BTRFS_FILE_EXTENT_PREALLOC)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	544	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				545	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
Chris Mason	c8b9781	2008-10-29 14:49:59 -0400	[diff] [blame]	546	size = btrfs_file_extent_inline_len(eb, item);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	547	extent_end = (start + size + mask) & ~mask;
				548	} else {
				549	ret = 0;
				550	goto out;
				551	}
				552
				553	inode = read_one_inode(root, key->objectid);
				554	if (!inode) {
				555	ret = -EIO;
				556	goto out;
				557	}
				558
				559	/*
				560	* first check to see if we already have this extent in the
				561	* file. This must be done before the btrfs_drop_extents run
				562	* so we don't try to drop this extent.
				563	*/
				564	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				565	start, 0);
				566
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame^]	567	if (ret == 0 &&
				568	(found_type == BTRFS_FILE_EXTENT_REG \|\|
				569	found_type == BTRFS_FILE_EXTENT_PREALLOC)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	570	struct btrfs_file_extent_item cmp1;
				571	struct btrfs_file_extent_item cmp2;
				572	struct btrfs_file_extent_item *existing;
				573	struct extent_buffer *leaf;
				574
				575	leaf = path->nodes[0];
				576	existing = btrfs_item_ptr(leaf, path->slots[0],
				577	struct btrfs_file_extent_item);
				578
				579	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				580	sizeof(cmp1));
				581	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				582	sizeof(cmp2));
				583
				584	/*
				585	* we already have a pointer to this exact extent,
				586	* we don't have to do anything
				587	*/
				588	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				589	btrfs_release_path(root, path);
				590	goto out;
				591	}
				592	}
				593	btrfs_release_path(root, path);
				594
				595	/* drop any overlapping extents */
				596	ret = btrfs_drop_extents(trans, root, inode,
				597	start, extent_end, start, &alloc_hint);
				598	BUG_ON(ret);
				599
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	600	/* insert the extent */
				601	ret = overwrite_item(trans, root, path, eb, slot, key);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	602	BUG_ON(ret);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	603
Yan Zheng	a76a3cd	2008-10-09 11:46:29 -0400	[diff] [blame]	604	/* btrfs_drop_extents changes i_bytes & i_blocks, update it here */
				605	inode_add_bytes(inode, extent_end - start);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	606	btrfs_update_inode(trans, root, inode);
				607	out:
				608	if (inode)
				609	iput(inode);
				610	return ret;
				611	}
				612
				613	/*
				614	* when cleaning up conflicts between the directory names in the
				615	* subvolume, directory names in the log and directory names in the
				616	* inode back references, we may have to unlink inodes from directories.
				617	*
				618	* This is a helper function to do the unlink of a specific directory
				619	* item
				620	*/
				621	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				622	struct btrfs_root *root,
				623	struct btrfs_path *path,
				624	struct inode *dir,
				625	struct btrfs_dir_item *di)
				626	{
				627	struct inode *inode;
				628	char *name;
				629	int name_len;
				630	struct extent_buffer *leaf;
				631	struct btrfs_key location;
				632	int ret;
				633
				634	leaf = path->nodes[0];
				635
				636	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				637	name_len = btrfs_dir_name_len(leaf, di);
				638	name = kmalloc(name_len, GFP_NOFS);
				639	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				640	btrfs_release_path(root, path);
				641
				642	inode = read_one_inode(root, location.objectid);
				643	BUG_ON(!inode);
				644
				645	btrfs_inc_nlink(inode);
				646	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
				647	kfree(name);
				648
				649	iput(inode);
				650	return ret;
				651	}
				652
				653	/*
				654	* helper function to see if a given name and sequence number found
				655	* in an inode back reference are already in a directory and correctly
				656	* point to this inode
				657	*/
				658	static noinline int inode_in_dir(struct btrfs_root *root,
				659	struct btrfs_path *path,
				660	u64 dirid, u64 objectid, u64 index,
				661	const char *name, int name_len)
				662	{
				663	struct btrfs_dir_item *di;
				664	struct btrfs_key location;
				665	int match = 0;
				666
				667	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				668	index, name, name_len, 0);
				669	if (di && !IS_ERR(di)) {
				670	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				671	if (location.objectid != objectid)
				672	goto out;
				673	} else
				674	goto out;
				675	btrfs_release_path(root, path);
				676
				677	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				678	if (di && !IS_ERR(di)) {
				679	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				680	if (location.objectid != objectid)
				681	goto out;
				682	} else
				683	goto out;
				684	match = 1;
				685	out:
				686	btrfs_release_path(root, path);
				687	return match;
				688	}
				689
				690	/*
				691	* helper function to check a log tree for a named back reference in
				692	* an inode. This is used to decide if a back reference that is
				693	* found in the subvolume conflicts with what we find in the log.
				694	*
				695	* inode backreferences may have multiple refs in a single item,
				696	* during replay we process one reference at a time, and we don't
				697	* want to delete valid links to a file from the subvolume if that
				698	* link is also in the log.
				699	*/
				700	static noinline int backref_in_log(struct btrfs_root *log,
				701	struct btrfs_key *key,
				702	char *name, int namelen)
				703	{
				704	struct btrfs_path *path;
				705	struct btrfs_inode_ref *ref;
				706	unsigned long ptr;
				707	unsigned long ptr_end;
				708	unsigned long name_ptr;
				709	int found_name_len;
				710	int item_size;
				711	int ret;
				712	int match = 0;
				713
				714	path = btrfs_alloc_path();
				715	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				716	if (ret != 0)
				717	goto out;
				718
				719	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				720	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				721	ptr_end = ptr + item_size;
				722	while (ptr < ptr_end) {
				723	ref = (struct btrfs_inode_ref *)ptr;
				724	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				725	if (found_name_len == namelen) {
				726	name_ptr = (unsigned long)(ref + 1);
				727	ret = memcmp_extent_buffer(path->nodes[0], name,
				728	name_ptr, namelen);
				729	if (ret == 0) {
				730	match = 1;
				731	goto out;
				732	}
				733	}
				734	ptr = (unsigned long)(ref + 1) + found_name_len;
				735	}
				736	out:
				737	btrfs_free_path(path);
				738	return match;
				739	}
				740
				741
				742	/*
				743	* replay one inode back reference item found in the log tree.
				744	* eb, slot and key refer to the buffer and key found in the log tree.
				745	* root is the destination we are replaying into, and path is for temp
				746	* use by this function. (it should be released on return).
				747	*/
				748	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				749	struct btrfs_root *root,
				750	struct btrfs_root *log,
				751	struct btrfs_path *path,
				752	struct extent_buffer *eb, int slot,
				753	struct btrfs_key *key)
				754	{
				755	struct inode *dir;
				756	int ret;
				757	struct btrfs_key location;
				758	struct btrfs_inode_ref *ref;
				759	struct btrfs_dir_item *di;
				760	struct inode *inode;
				761	char *name;
				762	int namelen;
				763	unsigned long ref_ptr;
				764	unsigned long ref_end;
				765
				766	location.objectid = key->objectid;
				767	location.type = BTRFS_INODE_ITEM_KEY;
				768	location.offset = 0;
				769
				770	/*
				771	* it is possible that we didn't log all the parent directories
				772	* for a given inode. If we don't find the dir, just don't
				773	* copy the back ref in. The link count fixup code will take
				774	* care of the rest
				775	*/
				776	dir = read_one_inode(root, key->offset);
				777	if (!dir)
				778	return -ENOENT;
				779
				780	inode = read_one_inode(root, key->objectid);
				781	BUG_ON(!dir);
				782
				783	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				784	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				785
				786	again:
				787	ref = (struct btrfs_inode_ref *)ref_ptr;
				788
				789	namelen = btrfs_inode_ref_name_len(eb, ref);
				790	name = kmalloc(namelen, GFP_NOFS);
				791	BUG_ON(!name);
				792
				793	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				794
				795	/* if we already have a perfect match, we're done */
				796	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				797	btrfs_inode_ref_index(eb, ref),
				798	name, namelen)) {
				799	goto out;
				800	}
				801
				802	/*
				803	* look for a conflicting back reference in the metadata.
				804	* if we find one we have to unlink that name of the file
				805	* before we add our new link. Later on, we overwrite any
				806	* existing back reference, and we don't want to create
				807	* dangling pointers in the directory.
				808	*/
				809	conflict_again:
				810	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				811	if (ret == 0) {
				812	char *victim_name;
				813	int victim_name_len;
				814	struct btrfs_inode_ref *victim_ref;
				815	unsigned long ptr;
				816	unsigned long ptr_end;
				817	struct extent_buffer *leaf = path->nodes[0];
				818
				819	/* are we trying to overwrite a back ref for the root directory
				820	* if so, just jump out, we're done
				821	*/
				822	if (key->objectid == key->offset)
				823	goto out_nowrite;
				824
				825	/* check all the names in this back reference to see
				826	* if they are in the log. if so, we allow them to stay
				827	* otherwise they must be unlinked as a conflict
				828	*/
				829	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				830	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				831	while(ptr < ptr_end) {
				832	victim_ref = (struct btrfs_inode_ref *)ptr;
				833	victim_name_len = btrfs_inode_ref_name_len(leaf,
				834	victim_ref);
				835	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				836	BUG_ON(!victim_name);
				837
				838	read_extent_buffer(leaf, victim_name,
				839	(unsigned long)(victim_ref + 1),
				840	victim_name_len);
				841
				842	if (!backref_in_log(log, key, victim_name,
				843	victim_name_len)) {
				844	btrfs_inc_nlink(inode);
				845	btrfs_release_path(root, path);
				846	ret = btrfs_unlink_inode(trans, root, dir,
				847	inode, victim_name,
				848	victim_name_len);
				849	kfree(victim_name);
				850	btrfs_release_path(root, path);
				851	goto conflict_again;
				852	}
				853	kfree(victim_name);
				854	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				855	}
				856	BUG_ON(ret);
				857	}
				858	btrfs_release_path(root, path);
				859
				860	/* look for a conflicting sequence number */
				861	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				862	btrfs_inode_ref_index(eb, ref),
				863	name, namelen, 0);
				864	if (di && !IS_ERR(di)) {
				865	ret = drop_one_dir_item(trans, root, path, dir, di);
				866	BUG_ON(ret);
				867	}
				868	btrfs_release_path(root, path);
				869
				870
				871	/* look for a conflicting name */
				872	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				873	name, namelen, 0);
				874	if (di && !IS_ERR(di)) {
				875	ret = drop_one_dir_item(trans, root, path, dir, di);
				876	BUG_ON(ret);
				877	}
				878	btrfs_release_path(root, path);
				879
				880	/* insert our name */
				881	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				882	btrfs_inode_ref_index(eb, ref));
				883	BUG_ON(ret);
				884
				885	btrfs_update_inode(trans, root, inode);
				886
				887	out:
				888	ref_ptr = (unsigned long)(ref + 1) + namelen;
				889	kfree(name);
				890	if (ref_ptr < ref_end)
				891	goto again;
				892
				893	/* finally write the back reference in the inode */
				894	ret = overwrite_item(trans, root, path, eb, slot, key);
				895	BUG_ON(ret);
				896
				897	out_nowrite:
				898	btrfs_release_path(root, path);
				899	iput(dir);
				900	iput(inode);
				901	return 0;
				902	}
				903
				904	/*
				905	* replay one csum item from the log tree into the subvolume 'root'
				906	* eb, slot and key all refer to the log tree
				907	* path is for temp use by this function and should be released on return
				908	*
				909	* This copies the checksums out of the log tree and inserts them into
				910	* the subvolume. Any existing checksums for this range in the file
				911	* are overwritten, and new items are added where required.
				912	*
				913	* We keep this simple by reusing the btrfs_ordered_sum code from
				914	* the data=ordered mode. This basically means making a copy
				915	* of all the checksums in ram, which we have to do anyway for kmap
				916	* rules.
				917	*
				918	* The copy is then sent down to btrfs_csum_file_blocks, which
				919	* does all the hard work of finding existing items in the file
				920	* or adding new ones.
				921	*/
				922	static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
				923	struct btrfs_root *root,
				924	struct btrfs_path *path,
				925	struct extent_buffer *eb, int slot,
				926	struct btrfs_key *key)
				927	{
				928	int ret;
				929	u32 item_size = btrfs_item_size_nr(eb, slot);
				930	u64 cur_offset;
				931	unsigned long file_bytes;
				932	struct btrfs_ordered_sum *sums;
				933	struct btrfs_sector_sum *sector_sum;
				934	struct inode *inode;
				935	unsigned long ptr;
				936
				937	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
				938	inode = read_one_inode(root, key->objectid);
				939	if (!inode) {
				940	return -EIO;
				941	}
				942
				943	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
				944	if (!sums) {
				945	iput(inode);
				946	return -ENOMEM;
				947	}
				948
				949	INIT_LIST_HEAD(&sums->list);
				950	sums->len = file_bytes;
				951	sums->file_offset = key->offset;
				952
				953	/*
				954	* copy all the sums into the ordered sum struct
				955	*/
				956	sector_sum = sums->sums;
				957	cur_offset = key->offset;
				958	ptr = btrfs_item_ptr_offset(eb, slot);
				959	while(item_size > 0) {
				960	sector_sum->offset = cur_offset;
				961	read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
				962	sector_sum++;
				963	item_size -= BTRFS_CRC32_SIZE;
				964	ptr += BTRFS_CRC32_SIZE;
				965	cur_offset += root->sectorsize;
				966	}
				967
				968	/* let btrfs_csum_file_blocks add them into the file */
				969	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
				970	BUG_ON(ret);
				971	kfree(sums);
				972	iput(inode);
				973
				974	return 0;
				975	}
				976	/*
				977	* There are a few corners where the link count of the file can't
				978	* be properly maintained during replay. So, instead of adding
				979	* lots of complexity to the log code, we just scan the backrefs
				980	* for any file that has been through replay.
				981	*
				982	* The scan will update the link count on the inode to reflect the
				983	* number of back refs found. If it goes down to zero, the iput
				984	* will free the inode.
				985	*/
				986	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				987	struct btrfs_root *root,
				988	struct inode *inode)
				989	{
				990	struct btrfs_path *path;
				991	int ret;
				992	struct btrfs_key key;
				993	u64 nlink = 0;
				994	unsigned long ptr;
				995	unsigned long ptr_end;
				996	int name_len;
				997
				998	key.objectid = inode->i_ino;
				999	key.type = BTRFS_INODE_REF_KEY;
				1000	key.offset = (u64)-1;
				1001
				1002	path = btrfs_alloc_path();
				1003
				1004	while(1) {
				1005	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1006	if (ret < 0)
				1007	break;
				1008	if (ret > 0) {
				1009	if (path->slots[0] == 0)
				1010	break;
				1011	path->slots[0]--;
				1012	}
				1013	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1014	path->slots[0]);
				1015	if (key.objectid != inode->i_ino \|\|
				1016	key.type != BTRFS_INODE_REF_KEY)
				1017	break;
				1018	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1019	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1020	path->slots[0]);
				1021	while(ptr < ptr_end) {
				1022	struct btrfs_inode_ref *ref;
				1023
				1024	ref = (struct btrfs_inode_ref *)ptr;
				1025	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1026	ref);
				1027	ptr = (unsigned long)(ref + 1) + name_len;
				1028	nlink++;
				1029	}
				1030
				1031	if (key.offset == 0)
				1032	break;
				1033	key.offset--;
				1034	btrfs_release_path(root, path);
				1035	}
				1036	btrfs_free_path(path);
				1037	if (nlink != inode->i_nlink) {
				1038	inode->i_nlink = nlink;
				1039	btrfs_update_inode(trans, root, inode);
				1040	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	1041	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1042
				1043	return 0;
				1044	}
				1045
				1046	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1047	struct btrfs_root *root,
				1048	struct btrfs_path *path)
				1049	{
				1050	int ret;
				1051	struct btrfs_key key;
				1052	struct inode *inode;
				1053
				1054	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1055	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1056	key.offset = (u64)-1;
				1057	while(1) {
				1058	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1059	if (ret < 0)
				1060	break;
				1061
				1062	if (ret == 1) {
				1063	if (path->slots[0] == 0)
				1064	break;
				1065	path->slots[0]--;
				1066	}
				1067
				1068	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1069	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1070	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1071	break;
				1072
				1073	ret = btrfs_del_item(trans, root, path);
				1074	BUG_ON(ret);
				1075
				1076	btrfs_release_path(root, path);
				1077	inode = read_one_inode(root, key.offset);
				1078	BUG_ON(!inode);
				1079
				1080	ret = fixup_inode_link_count(trans, root, inode);
				1081	BUG_ON(ret);
				1082
				1083	iput(inode);
				1084
				1085	if (key.offset == 0)
				1086	break;
				1087	key.offset--;
				1088	}
				1089	btrfs_release_path(root, path);
				1090	return 0;
				1091	}
				1092
				1093
				1094	/*
				1095	* record a given inode in the fixup dir so we can check its link
				1096	* count when replay is done. The link count is incremented here
				1097	* so the inode won't go away until we check it
				1098	*/
				1099	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1100	struct btrfs_root *root,
				1101	struct btrfs_path *path,
				1102	u64 objectid)
				1103	{
				1104	struct btrfs_key key;
				1105	int ret = 0;
				1106	struct inode *inode;
				1107
				1108	inode = read_one_inode(root, objectid);
				1109	BUG_ON(!inode);
				1110
				1111	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1112	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1113	key.offset = objectid;
				1114
				1115	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1116
				1117	btrfs_release_path(root, path);
				1118	if (ret == 0) {
				1119	btrfs_inc_nlink(inode);
				1120	btrfs_update_inode(trans, root, inode);
				1121	} else if (ret == -EEXIST) {
				1122	ret = 0;
				1123	} else {
				1124	BUG();
				1125	}
				1126	iput(inode);
				1127
				1128	return ret;
				1129	}
				1130
				1131	/*
				1132	* when replaying the log for a directory, we only insert names
				1133	* for inodes that actually exist. This means an fsync on a directory
				1134	* does not implicitly fsync all the new files in it
				1135	*/
				1136	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1137	struct btrfs_root *root,
				1138	struct btrfs_path *path,
				1139	u64 dirid, u64 index,
				1140	char *name, int name_len, u8 type,
				1141	struct btrfs_key *location)
				1142	{
				1143	struct inode *inode;
				1144	struct inode *dir;
				1145	int ret;
				1146
				1147	inode = read_one_inode(root, location->objectid);
				1148	if (!inode)
				1149	return -ENOENT;
				1150
				1151	dir = read_one_inode(root, dirid);
				1152	if (!dir) {
				1153	iput(inode);
				1154	return -EIO;
				1155	}
				1156	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1157
				1158	/* FIXME, put inode into FIXUP list */
				1159
				1160	iput(inode);
				1161	iput(dir);
				1162	return ret;
				1163	}
				1164
				1165	/*
				1166	* take a single entry in a log directory item and replay it into
				1167	* the subvolume.
				1168	*
				1169	* if a conflicting item exists in the subdirectory already,
				1170	* the inode it points to is unlinked and put into the link count
				1171	* fix up tree.
				1172	*
				1173	* If a name from the log points to a file or directory that does
				1174	* not exist in the FS, it is skipped. fsyncs on directories
				1175	* do not force down inodes inside that directory, just changes to the
				1176	* names or unlinks in a directory.
				1177	*/
				1178	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1179	struct btrfs_root *root,
				1180	struct btrfs_path *path,
				1181	struct extent_buffer *eb,
				1182	struct btrfs_dir_item *di,
				1183	struct btrfs_key *key)
				1184	{
				1185	char *name;
				1186	int name_len;
				1187	struct btrfs_dir_item *dst_di;
				1188	struct btrfs_key found_key;
				1189	struct btrfs_key log_key;
				1190	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1191	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1192	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1193	int ret;
				1194
				1195	dir = read_one_inode(root, key->objectid);
				1196	BUG_ON(!dir);
				1197
				1198	name_len = btrfs_dir_name_len(eb, di);
				1199	name = kmalloc(name_len, GFP_NOFS);
				1200	log_type = btrfs_dir_type(eb, di);
				1201	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1202	name_len);
				1203
				1204	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1205	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1206	if (exists == 0)
				1207	exists = 1;
				1208	else
				1209	exists = 0;
				1210	btrfs_release_path(root, path);
				1211
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1212	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1213	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1214	name, name_len, 1);
				1215	}
				1216	else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1217	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1218	key->objectid,
				1219	key->offset, name,
				1220	name_len, 1);
				1221	} else {
				1222	BUG();
				1223	}
				1224	if (!dst_di \|\| IS_ERR(dst_di)) {
				1225	/* we need a sequence number to insert, so we only
				1226	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1227	*/
				1228	if (key->type != BTRFS_DIR_INDEX_KEY)
				1229	goto out;
				1230	goto insert;
				1231	}
				1232
				1233	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1234	/* the existing item matches the logged item */
				1235	if (found_key.objectid == log_key.objectid &&
				1236	found_key.type == log_key.type &&
				1237	found_key.offset == log_key.offset &&
				1238	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1239	goto out;
				1240	}
				1241
				1242	/*
				1243	* don't drop the conflicting directory entry if the inode
				1244	* for the new entry doesn't exist
				1245	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1246	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1247	goto out;
				1248
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1249	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1250	BUG_ON(ret);
				1251
				1252	if (key->type == BTRFS_DIR_INDEX_KEY)
				1253	goto insert;
				1254	out:
				1255	btrfs_release_path(root, path);
				1256	kfree(name);
				1257	iput(dir);
				1258	return 0;
				1259
				1260	insert:
				1261	btrfs_release_path(root, path);
				1262	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1263	name, name_len, log_type, &log_key);
				1264
				1265	if (ret && ret != -ENOENT)
				1266	BUG();
				1267	goto out;
				1268	}
				1269
				1270	/*
				1271	* find all the names in a directory item and reconcile them into
				1272	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1273	* one name in a directory item, but the same code gets used for
				1274	* both directory index types
				1275	*/
				1276	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1277	struct btrfs_root *root,
				1278	struct btrfs_path *path,
				1279	struct extent_buffer *eb, int slot,
				1280	struct btrfs_key *key)
				1281	{
				1282	int ret;
				1283	u32 item_size = btrfs_item_size_nr(eb, slot);
				1284	struct btrfs_dir_item *di;
				1285	int name_len;
				1286	unsigned long ptr;
				1287	unsigned long ptr_end;
				1288
				1289	ptr = btrfs_item_ptr_offset(eb, slot);
				1290	ptr_end = ptr + item_size;
				1291	while(ptr < ptr_end) {
				1292	di = (struct btrfs_dir_item *)ptr;
				1293	name_len = btrfs_dir_name_len(eb, di);
				1294	ret = replay_one_name(trans, root, path, eb, di, key);
				1295	BUG_ON(ret);
				1296	ptr = (unsigned long)(di + 1);
				1297	ptr += name_len;
				1298	}
				1299	return 0;
				1300	}
				1301
				1302	/*
				1303	* directory replay has two parts. There are the standard directory
				1304	* items in the log copied from the subvolume, and range items
				1305	* created in the log while the subvolume was logged.
				1306	*
				1307	* The range items tell us which parts of the key space the log
				1308	* is authoritative for. During replay, if a key in the subvolume
				1309	* directory is in a logged range item, but not actually in the log
				1310	* that means it was deleted from the directory before the fsync
				1311	* and should be removed.
				1312	*/
				1313	static noinline int find_dir_range(struct btrfs_root *root,
				1314	struct btrfs_path *path,
				1315	u64 dirid, int key_type,
				1316	u64 start_ret, u64 end_ret)
				1317	{
				1318	struct btrfs_key key;
				1319	u64 found_end;
				1320	struct btrfs_dir_log_item *item;
				1321	int ret;
				1322	int nritems;
				1323
				1324	if (*start_ret == (u64)-1)
				1325	return 1;
				1326
				1327	key.objectid = dirid;
				1328	key.type = key_type;
				1329	key.offset = *start_ret;
				1330
				1331	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1332	if (ret < 0)
				1333	goto out;
				1334	if (ret > 0) {
				1335	if (path->slots[0] == 0)
				1336	goto out;
				1337	path->slots[0]--;
				1338	}
				1339	if (ret != 0)
				1340	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1341
				1342	if (key.type != key_type \|\| key.objectid != dirid) {
				1343	ret = 1;
				1344	goto next;
				1345	}
				1346	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1347	struct btrfs_dir_log_item);
				1348	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1349
				1350	if (start_ret >= key.offset && start_ret <= found_end) {
				1351	ret = 0;
				1352	*start_ret = key.offset;
				1353	*end_ret = found_end;
				1354	goto out;
				1355	}
				1356	ret = 1;
				1357	next:
				1358	/* check the next slot in the tree to see if it is a valid item */
				1359	nritems = btrfs_header_nritems(path->nodes[0]);
				1360	if (path->slots[0] >= nritems) {
				1361	ret = btrfs_next_leaf(root, path);
				1362	if (ret)
				1363	goto out;
				1364	} else {
				1365	path->slots[0]++;
				1366	}
				1367
				1368	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1369
				1370	if (key.type != key_type \|\| key.objectid != dirid) {
				1371	ret = 1;
				1372	goto out;
				1373	}
				1374	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1375	struct btrfs_dir_log_item);
				1376	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1377	*start_ret = key.offset;
				1378	*end_ret = found_end;
				1379	ret = 0;
				1380	out:
				1381	btrfs_release_path(root, path);
				1382	return ret;
				1383	}
				1384
				1385	/*
				1386	* this looks for a given directory item in the log. If the directory
				1387	* item is not in the log, the item is removed and the inode it points
				1388	* to is unlinked
				1389	*/
				1390	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1391	struct btrfs_root *root,
				1392	struct btrfs_root *log,
				1393	struct btrfs_path *path,
				1394	struct btrfs_path *log_path,
				1395	struct inode *dir,
				1396	struct btrfs_key *dir_key)
				1397	{
				1398	int ret;
				1399	struct extent_buffer *eb;
				1400	int slot;
				1401	u32 item_size;
				1402	struct btrfs_dir_item *di;
				1403	struct btrfs_dir_item *log_di;
				1404	int name_len;
				1405	unsigned long ptr;
				1406	unsigned long ptr_end;
				1407	char *name;
				1408	struct inode *inode;
				1409	struct btrfs_key location;
				1410
				1411	again:
				1412	eb = path->nodes[0];
				1413	slot = path->slots[0];
				1414	item_size = btrfs_item_size_nr(eb, slot);
				1415	ptr = btrfs_item_ptr_offset(eb, slot);
				1416	ptr_end = ptr + item_size;
				1417	while(ptr < ptr_end) {
				1418	di = (struct btrfs_dir_item *)ptr;
				1419	name_len = btrfs_dir_name_len(eb, di);
				1420	name = kmalloc(name_len, GFP_NOFS);
				1421	if (!name) {
				1422	ret = -ENOMEM;
				1423	goto out;
				1424	}
				1425	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1426	name_len);
				1427	log_di = NULL;
				1428	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1429	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1430	dir_key->objectid,
				1431	name, name_len, 0);
				1432	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1433	log_di = btrfs_lookup_dir_index_item(trans, log,
				1434	log_path,
				1435	dir_key->objectid,
				1436	dir_key->offset,
				1437	name, name_len, 0);
				1438	}
				1439	if (!log_di \|\| IS_ERR(log_di)) {
				1440	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1441	btrfs_release_path(root, path);
				1442	btrfs_release_path(log, log_path);
				1443	inode = read_one_inode(root, location.objectid);
				1444	BUG_ON(!inode);
				1445
				1446	ret = link_to_fixup_dir(trans, root,
				1447	path, location.objectid);
				1448	BUG_ON(ret);
				1449	btrfs_inc_nlink(inode);
				1450	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1451	name, name_len);
				1452	BUG_ON(ret);
				1453	kfree(name);
				1454	iput(inode);
				1455
				1456	/* there might still be more names under this key
				1457	* check and repeat if required
				1458	*/
				1459	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1460	0, 0);
				1461	if (ret == 0)
				1462	goto again;
				1463	ret = 0;
				1464	goto out;
				1465	}
				1466	btrfs_release_path(log, log_path);
				1467	kfree(name);
				1468
				1469	ptr = (unsigned long)(di + 1);
				1470	ptr += name_len;
				1471	}
				1472	ret = 0;
				1473	out:
				1474	btrfs_release_path(root, path);
				1475	btrfs_release_path(log, log_path);
				1476	return ret;
				1477	}
				1478
				1479	/*
				1480	* deletion replay happens before we copy any new directory items
				1481	* out of the log or out of backreferences from inodes. It
				1482	* scans the log to find ranges of keys that log is authoritative for,
				1483	* and then scans the directory to find items in those ranges that are
				1484	* not present in the log.
				1485	*
				1486	* Anything we don't find in the log is unlinked and removed from the
				1487	* directory.
				1488	*/
				1489	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1490	struct btrfs_root *root,
				1491	struct btrfs_root *log,
				1492	struct btrfs_path *path,
				1493	u64 dirid)
				1494	{
				1495	u64 range_start;
				1496	u64 range_end;
				1497	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1498	int ret = 0;
				1499	struct btrfs_key dir_key;
				1500	struct btrfs_key found_key;
				1501	struct btrfs_path *log_path;
				1502	struct inode *dir;
				1503
				1504	dir_key.objectid = dirid;
				1505	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1506	log_path = btrfs_alloc_path();
				1507	if (!log_path)
				1508	return -ENOMEM;
				1509
				1510	dir = read_one_inode(root, dirid);
				1511	/* it isn't an error if the inode isn't there, that can happen
				1512	* because we replay the deletes before we copy in the inode item
				1513	* from the log
				1514	*/
				1515	if (!dir) {
				1516	btrfs_free_path(log_path);
				1517	return 0;
				1518	}
				1519	again:
				1520	range_start = 0;
				1521	range_end = 0;
				1522	while(1) {
				1523	ret = find_dir_range(log, path, dirid, key_type,
				1524	&range_start, &range_end);
				1525	if (ret != 0)
				1526	break;
				1527
				1528	dir_key.offset = range_start;
				1529	while(1) {
				1530	int nritems;
				1531	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1532	0, 0);
				1533	if (ret < 0)
				1534	goto out;
				1535
				1536	nritems = btrfs_header_nritems(path->nodes[0]);
				1537	if (path->slots[0] >= nritems) {
				1538	ret = btrfs_next_leaf(root, path);
				1539	if (ret)
				1540	break;
				1541	}
				1542	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1543	path->slots[0]);
				1544	if (found_key.objectid != dirid \|\|
				1545	found_key.type != dir_key.type)
				1546	goto next_type;
				1547
				1548	if (found_key.offset > range_end)
				1549	break;
				1550
				1551	ret = check_item_in_log(trans, root, log, path,
				1552	log_path, dir, &found_key);
				1553	BUG_ON(ret);
				1554	if (found_key.offset == (u64)-1)
				1555	break;
				1556	dir_key.offset = found_key.offset + 1;
				1557	}
				1558	btrfs_release_path(root, path);
				1559	if (range_end == (u64)-1)
				1560	break;
				1561	range_start = range_end + 1;
				1562	}
				1563
				1564	next_type:
				1565	ret = 0;
				1566	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1567	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1568	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1569	btrfs_release_path(root, path);
				1570	goto again;
				1571	}
				1572	out:
				1573	btrfs_release_path(root, path);
				1574	btrfs_free_path(log_path);
				1575	iput(dir);
				1576	return ret;
				1577	}
				1578
				1579	/*
				1580	* the process_func used to replay items from the log tree. This
				1581	* gets called in two different stages. The first stage just looks
				1582	* for inodes and makes sure they are all copied into the subvolume.
				1583	*
				1584	* The second stage copies all the other item types from the log into
				1585	* the subvolume. The two stage approach is slower, but gets rid of
				1586	* lots of complexity around inodes referencing other inodes that exist
				1587	* only in the log (references come from either directory items or inode
				1588	* back refs).
				1589	*/
				1590	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1591	struct walk_control *wc, u64 gen)
				1592	{
				1593	int nritems;
				1594	struct btrfs_path *path;
				1595	struct btrfs_root *root = wc->replay_dest;
				1596	struct btrfs_key key;
				1597	u32 item_size;
				1598	int level;
				1599	int i;
				1600	int ret;
				1601
				1602	btrfs_read_buffer(eb, gen);
				1603
				1604	level = btrfs_header_level(eb);
				1605
				1606	if (level != 0)
				1607	return 0;
				1608
				1609	path = btrfs_alloc_path();
				1610	BUG_ON(!path);
				1611
				1612	nritems = btrfs_header_nritems(eb);
				1613	for (i = 0; i < nritems; i++) {
				1614	btrfs_item_key_to_cpu(eb, &key, i);
				1615	item_size = btrfs_item_size_nr(eb, i);
				1616
				1617	/* inode keys are done during the first stage */
				1618	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1619	wc->stage == LOG_WALK_REPLAY_INODES) {
				1620	struct inode *inode;
				1621	struct btrfs_inode_item *inode_item;
				1622	u32 mode;
				1623
				1624	inode_item = btrfs_item_ptr(eb, i,
				1625	struct btrfs_inode_item);
				1626	mode = btrfs_inode_mode(eb, inode_item);
				1627	if (S_ISDIR(mode)) {
				1628	ret = replay_dir_deletes(wc->trans,
				1629	root, log, path, key.objectid);
				1630	BUG_ON(ret);
				1631	}
				1632	ret = overwrite_item(wc->trans, root, path,
				1633	eb, i, &key);
				1634	BUG_ON(ret);
				1635
				1636	/* for regular files, truncate away
				1637	* extents past the new EOF
				1638	*/
				1639	if (S_ISREG(mode)) {
				1640	inode = read_one_inode(root,
				1641	key.objectid);
				1642	BUG_ON(!inode);
				1643
				1644	ret = btrfs_truncate_inode_items(wc->trans,
				1645	root, inode, inode->i_size,
				1646	BTRFS_EXTENT_DATA_KEY);
				1647	BUG_ON(ret);
				1648	iput(inode);
				1649	}
				1650	ret = link_to_fixup_dir(wc->trans, root,
				1651	path, key.objectid);
				1652	BUG_ON(ret);
				1653	}
				1654	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1655	continue;
				1656
				1657	/* these keys are simply copied */
				1658	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1659	ret = overwrite_item(wc->trans, root, path,
				1660	eb, i, &key);
				1661	BUG_ON(ret);
				1662	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1663	ret = add_inode_ref(wc->trans, root, log, path,
				1664	eb, i, &key);
				1665	BUG_ON(ret && ret != -ENOENT);
				1666	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1667	ret = replay_one_extent(wc->trans, root, path,
				1668	eb, i, &key);
				1669	BUG_ON(ret);
				1670	} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
				1671	ret = replay_one_csum(wc->trans, root, path,
				1672	eb, i, &key);
				1673	BUG_ON(ret);
				1674	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1675	key.type == BTRFS_DIR_INDEX_KEY) {
				1676	ret = replay_one_dir_item(wc->trans, root, path,
				1677	eb, i, &key);
				1678	BUG_ON(ret);
				1679	}
				1680	}
				1681	btrfs_free_path(path);
				1682	return 0;
				1683	}
				1684
				1685	static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
				1686	struct btrfs_root *root,
				1687	struct btrfs_path path, int level,
				1688	struct walk_control *wc)
				1689	{
				1690	u64 root_owner;
				1691	u64 root_gen;
				1692	u64 bytenr;
				1693	u64 ptr_gen;
				1694	struct extent_buffer *next;
				1695	struct extent_buffer *cur;
				1696	struct extent_buffer *parent;
				1697	u32 blocksize;
				1698	int ret = 0;
				1699
				1700	WARN_ON(*level < 0);
				1701	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1702
				1703	while(*level > 0) {
				1704	WARN_ON(*level < 0);
				1705	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1706	cur = path->nodes[*level];
				1707
				1708	if (btrfs_header_level(cur) != *level)
				1709	WARN_ON(1);
				1710
				1711	if (path->slots[*level] >=
				1712	btrfs_header_nritems(cur))
				1713	break;
				1714
				1715	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1716	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1717	blocksize = btrfs_level_size(root, *level - 1);
				1718
				1719	parent = path->nodes[*level];
				1720	root_owner = btrfs_header_owner(parent);
				1721	root_gen = btrfs_header_generation(parent);
				1722
				1723	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1724
				1725	wc->process_func(root, next, wc, ptr_gen);
				1726
				1727	if (*level == 1) {
				1728	path->slots[*level]++;
				1729	if (wc->free) {
				1730	btrfs_read_buffer(next, ptr_gen);
				1731
				1732	btrfs_tree_lock(next);
				1733	clean_tree_block(trans, root, next);
				1734	btrfs_wait_tree_block_writeback(next);
				1735	btrfs_tree_unlock(next);
				1736
				1737	ret = btrfs_drop_leaf_ref(trans, root, next);
				1738	BUG_ON(ret);
				1739
				1740	WARN_ON(root_owner !=
				1741	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1742	ret = btrfs_free_reserved_extent(root,
				1743	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1744	BUG_ON(ret);
				1745	}
				1746	free_extent_buffer(next);
				1747	continue;
				1748	}
				1749	btrfs_read_buffer(next, ptr_gen);
				1750
				1751	WARN_ON(*level <= 0);
				1752	if (path->nodes[*level-1])
				1753	free_extent_buffer(path->nodes[*level-1]);
				1754	path->nodes[*level-1] = next;
				1755	*level = btrfs_header_level(next);
				1756	path->slots[*level] = 0;
				1757	cond_resched();
				1758	}
				1759	WARN_ON(*level < 0);
				1760	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1761
				1762	if (path->nodes[*level] == root->node) {
				1763	parent = path->nodes[*level];
				1764	} else {
				1765	parent = path->nodes[*level + 1];
				1766	}
				1767	bytenr = path->nodes[*level]->start;
				1768
				1769	blocksize = btrfs_level_size(root, *level);
				1770	root_owner = btrfs_header_owner(parent);
				1771	root_gen = btrfs_header_generation(parent);
				1772
				1773	wc->process_func(root, path->nodes[*level], wc,
				1774	btrfs_header_generation(path->nodes[*level]));
				1775
				1776	if (wc->free) {
				1777	next = path->nodes[*level];
				1778	btrfs_tree_lock(next);
				1779	clean_tree_block(trans, root, next);
				1780	btrfs_wait_tree_block_writeback(next);
				1781	btrfs_tree_unlock(next);
				1782
				1783	if (*level == 0) {
				1784	ret = btrfs_drop_leaf_ref(trans, root, next);
				1785	BUG_ON(ret);
				1786	}
				1787	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1788	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1789	BUG_ON(ret);
				1790	}
				1791	free_extent_buffer(path->nodes[*level]);
				1792	path->nodes[*level] = NULL;
				1793	*level += 1;
				1794
				1795	cond_resched();
				1796	return 0;
				1797	}
				1798
				1799	static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
				1800	struct btrfs_root *root,
				1801	struct btrfs_path path, int level,
				1802	struct walk_control *wc)
				1803	{
				1804	u64 root_owner;
				1805	u64 root_gen;
				1806	int i;
				1807	int slot;
				1808	int ret;
				1809
				1810	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				1811	slot = path->slots[i];
				1812	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1813	struct extent_buffer *node;
				1814	node = path->nodes[i];
				1815	path->slots[i]++;
				1816	*level = i;
				1817	WARN_ON(*level == 0);
				1818	return 0;
				1819	} else {
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	1820	struct extent_buffer *parent;
				1821	if (path->nodes[*level] == root->node)
				1822	parent = path->nodes[*level];
				1823	else
				1824	parent = path->nodes[*level + 1];
				1825
				1826	root_owner = btrfs_header_owner(parent);
				1827	root_gen = btrfs_header_generation(parent);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1828	wc->process_func(root, path->nodes[*level], wc,
				1829	btrfs_header_generation(path->nodes[*level]));
				1830	if (wc->free) {
				1831	struct extent_buffer *next;
				1832
				1833	next = path->nodes[*level];
				1834
				1835	btrfs_tree_lock(next);
				1836	clean_tree_block(trans, root, next);
				1837	btrfs_wait_tree_block_writeback(next);
				1838	btrfs_tree_unlock(next);
				1839
				1840	if (*level == 0) {
				1841	ret = btrfs_drop_leaf_ref(trans, root,
				1842	next);
				1843	BUG_ON(ret);
				1844	}
				1845
				1846	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1847	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1848	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1849	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1850	BUG_ON(ret);
				1851	}
				1852	free_extent_buffer(path->nodes[*level]);
				1853	path->nodes[*level] = NULL;
				1854	*level = i + 1;
				1855	}
				1856	}
				1857	return 1;
				1858	}
				1859
				1860	/*
				1861	* drop the reference count on the tree rooted at 'snap'. This traverses
				1862	* the tree freeing any blocks that have a ref count of zero after being
				1863	* decremented.
				1864	*/
				1865	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1866	struct btrfs_root log, struct walk_control wc)
				1867	{
				1868	int ret = 0;
				1869	int wret;
				1870	int level;
				1871	struct btrfs_path *path;
				1872	int i;
				1873	int orig_level;
				1874
				1875	path = btrfs_alloc_path();
				1876	BUG_ON(!path);
				1877
				1878	level = btrfs_header_level(log->node);
				1879	orig_level = level;
				1880	path->nodes[level] = log->node;
				1881	extent_buffer_get(log->node);
				1882	path->slots[level] = 0;
				1883
				1884	while(1) {
				1885	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1886	if (wret > 0)
				1887	break;
				1888	if (wret < 0)
				1889	ret = wret;
				1890
				1891	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1892	if (wret > 0)
				1893	break;
				1894	if (wret < 0)
				1895	ret = wret;
				1896	}
				1897
				1898	/* was the root node processed? if not, catch it here */
				1899	if (path->nodes[orig_level]) {
				1900	wc->process_func(log, path->nodes[orig_level], wc,
				1901	btrfs_header_generation(path->nodes[orig_level]));
				1902	if (wc->free) {
				1903	struct extent_buffer *next;
				1904
				1905	next = path->nodes[orig_level];
				1906
				1907	btrfs_tree_lock(next);
				1908	clean_tree_block(trans, log, next);
				1909	btrfs_wait_tree_block_writeback(next);
				1910	btrfs_tree_unlock(next);
				1911
				1912	if (orig_level == 0) {
				1913	ret = btrfs_drop_leaf_ref(trans, log,
				1914	next);
				1915	BUG_ON(ret);
				1916	}
				1917	WARN_ON(log->root_key.objectid !=
				1918	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1919	ret = btrfs_free_reserved_extent(log, next->start,
				1920	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1921	BUG_ON(ret);
				1922	}
				1923	}
				1924
				1925	for (i = 0; i <= orig_level; i++) {
				1926	if (path->nodes[i]) {
				1927	free_extent_buffer(path->nodes[i]);
				1928	path->nodes[i] = NULL;
				1929	}
				1930	}
				1931	btrfs_free_path(path);
				1932	if (wc->free)
				1933	free_extent_buffer(log->node);
				1934	return ret;
				1935	}
				1936
				1937	int wait_log_commit(struct btrfs_root *log)
				1938	{
				1939	DEFINE_WAIT(wait);
				1940	u64 transid = log->fs_info->tree_log_transid;
				1941
				1942	do {
				1943	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1944	TASK_UNINTERRUPTIBLE);
				1945	mutex_unlock(&log->fs_info->tree_log_mutex);
				1946	if (atomic_read(&log->fs_info->tree_log_commit))
				1947	schedule();
				1948	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1949	mutex_lock(&log->fs_info->tree_log_mutex);
				1950	} while(transid == log->fs_info->tree_log_transid &&
				1951	atomic_read(&log->fs_info->tree_log_commit));
				1952	return 0;
				1953	}
				1954
				1955	/*
				1956	* btrfs_sync_log does sends a given tree log down to the disk and
				1957	* updates the super blocks to record it. When this call is done,
				1958	* you know that any inodes previously logged are safely on disk
				1959	*/
				1960	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1961	struct btrfs_root *root)
				1962	{
				1963	int ret;
				1964	unsigned long batch;
				1965	struct btrfs_root *log = root->log_root;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1966
				1967	mutex_lock(&log->fs_info->tree_log_mutex);
				1968	if (atomic_read(&log->fs_info->tree_log_commit)) {
				1969	wait_log_commit(log);
				1970	goto out;
				1971	}
				1972	atomic_set(&log->fs_info->tree_log_commit, 1);
				1973
				1974	while(1) {
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	1975	batch = log->fs_info->tree_log_batch;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1976	mutex_unlock(&log->fs_info->tree_log_mutex);
				1977	schedule_timeout_uninterruptible(1);
				1978	mutex_lock(&log->fs_info->tree_log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1979
				1980	while(atomic_read(&log->fs_info->tree_log_writers)) {
				1981	DEFINE_WAIT(wait);
				1982	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1983	TASK_UNINTERRUPTIBLE);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1984	mutex_unlock(&log->fs_info->tree_log_mutex);
				1985	if (atomic_read(&log->fs_info->tree_log_writers))
				1986	schedule();
				1987	mutex_lock(&log->fs_info->tree_log_mutex);
				1988	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1989	}
				1990	if (batch == log->fs_info->tree_log_batch)
				1991	break;
				1992	}
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1993
				1994	ret = btrfs_write_and_wait_marked_extents(log, &log->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1995	BUG_ON(ret);
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	1996	ret = btrfs_write_and_wait_marked_extents(root->fs_info->log_root_tree,
				1997	&root->fs_info->log_root_tree->dirty_log_pages);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1998	BUG_ON(ret);
				1999
				2000	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
				2001	log->fs_info->log_root_tree->node->start);
				2002	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
				2003	btrfs_header_level(log->fs_info->log_root_tree->node));
				2004
				2005	write_ctree_super(trans, log->fs_info->tree_root);
				2006	log->fs_info->tree_log_transid++;
				2007	log->fs_info->tree_log_batch = 0;
				2008	atomic_set(&log->fs_info->tree_log_commit, 0);
				2009	smp_mb();
				2010	if (waitqueue_active(&log->fs_info->tree_log_wait))
				2011	wake_up(&log->fs_info->tree_log_wait);
				2012	out:
				2013	mutex_unlock(&log->fs_info->tree_log_mutex);
				2014	return 0;
				2015
				2016	}
				2017
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2018	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2019	* at commit time of the full transaction
				2020	*/
				2021	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2022	{
				2023	int ret;
				2024	struct btrfs_root *log;
				2025	struct key;
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2026	u64 start;
				2027	u64 end;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2028	struct walk_control wc = {
				2029	.free = 1,
				2030	.process_func = process_one_buffer
				2031	};
				2032
				2033	if (!root->log_root)
				2034	return 0;
				2035
				2036	log = root->log_root;
				2037	ret = walk_log_tree(trans, log, &wc);
				2038	BUG_ON(ret);
				2039
Chris Mason	d0c803c	2008-09-11 16:17:57 -0400	[diff] [blame]	2040	while(1) {
				2041	ret = find_first_extent_bit(&log->dirty_log_pages,
				2042	0, &start, &end, EXTENT_DIRTY);
				2043	if (ret)
				2044	break;
				2045
				2046	clear_extent_dirty(&log->dirty_log_pages,
				2047	start, end, GFP_NOFS);
				2048	}
				2049
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2050	log = root->log_root;
				2051	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2052	&log->root_key);
				2053	BUG_ON(ret);
				2054	root->log_root = NULL;
				2055	kfree(root->log_root);
				2056	return 0;
				2057	}
				2058
				2059	/*
				2060	* helper function to update the item for a given subvolumes log root
				2061	* in the tree of log roots
				2062	*/
				2063	static int update_log_root(struct btrfs_trans_handle *trans,
				2064	struct btrfs_root *log)
				2065	{
				2066	u64 bytenr = btrfs_root_bytenr(&log->root_item);
				2067	int ret;
				2068
				2069	if (log->node->start == bytenr)
				2070	return 0;
				2071
				2072	btrfs_set_root_bytenr(&log->root_item, log->node->start);
Yan Zheng	84234f3	2008-10-29 14:49:05 -0400	[diff] [blame]	2073	btrfs_set_root_generation(&log->root_item, trans->transid);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2074	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				2075	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				2076	&log->root_key, &log->root_item);
				2077	BUG_ON(ret);
				2078	return ret;
				2079	}
				2080
				2081	/*
				2082	* If both a file and directory are logged, and unlinks or renames are
				2083	* mixed in, we have a few interesting corners:
				2084	*
				2085	* create file X in dir Y
				2086	* link file X to X.link in dir Y
				2087	* fsync file X
				2088	* unlink file X but leave X.link
				2089	* fsync dir Y
				2090	*
				2091	* After a crash we would expect only X.link to exist. But file X
				2092	* didn't get fsync'd again so the log has back refs for X and X.link.
				2093	*
				2094	* We solve this by removing directory entries and inode backrefs from the
				2095	* log when a file that was logged in the current transaction is
				2096	* unlinked. Any later fsync will include the updated log entries, and
				2097	* we'll be able to reconstruct the proper directory items from backrefs.
				2098	*
				2099	* This optimizations allows us to avoid relogging the entire inode
				2100	* or the entire directory.
				2101	*/
				2102	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2103	struct btrfs_root *root,
				2104	const char *name, int name_len,
				2105	struct inode *dir, u64 index)
				2106	{
				2107	struct btrfs_root *log;
				2108	struct btrfs_dir_item *di;
				2109	struct btrfs_path *path;
				2110	int ret;
				2111	int bytes_del = 0;
				2112
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2113	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2114	return 0;
				2115
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2116	ret = join_running_log_trans(root);
				2117	if (ret)
				2118	return 0;
				2119
				2120	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2121
				2122	log = root->log_root;
				2123	path = btrfs_alloc_path();
				2124	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2125	name, name_len, -1);
				2126	if (di && !IS_ERR(di)) {
				2127	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2128	bytes_del += name_len;
				2129	BUG_ON(ret);
				2130	}
				2131	btrfs_release_path(log, path);
				2132	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2133	index, name, name_len, -1);
				2134	if (di && !IS_ERR(di)) {
				2135	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2136	bytes_del += name_len;
				2137	BUG_ON(ret);
				2138	}
				2139
				2140	/* update the directory size in the log to reflect the names
				2141	* we have removed
				2142	*/
				2143	if (bytes_del) {
				2144	struct btrfs_key key;
				2145
				2146	key.objectid = dir->i_ino;
				2147	key.offset = 0;
				2148	key.type = BTRFS_INODE_ITEM_KEY;
				2149	btrfs_release_path(log, path);
				2150
				2151	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2152	if (ret == 0) {
				2153	struct btrfs_inode_item *item;
				2154	u64 i_size;
				2155
				2156	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2157	struct btrfs_inode_item);
				2158	i_size = btrfs_inode_size(path->nodes[0], item);
				2159	if (i_size > bytes_del)
				2160	i_size -= bytes_del;
				2161	else
				2162	i_size = 0;
				2163	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2164	btrfs_mark_buffer_dirty(path->nodes[0]);
				2165	} else
				2166	ret = 0;
				2167	btrfs_release_path(log, path);
				2168	}
				2169
				2170	btrfs_free_path(path);
				2171	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2172	end_log_trans(root);
				2173
				2174	return 0;
				2175	}
				2176
				2177	/* see comments for btrfs_del_dir_entries_in_log */
				2178	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2179	struct btrfs_root *root,
				2180	const char *name, int name_len,
				2181	struct inode *inode, u64 dirid)
				2182	{
				2183	struct btrfs_root *log;
				2184	u64 index;
				2185	int ret;
				2186
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2187	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2188	return 0;
				2189
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2190	ret = join_running_log_trans(root);
				2191	if (ret)
				2192	return 0;
				2193	log = root->log_root;
				2194	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2195
				2196	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2197	dirid, &index);
				2198	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2199	end_log_trans(root);
				2200
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2201	return ret;
				2202	}
				2203
				2204	/*
				2205	* creates a range item in the log for 'dirid'. first_offset and
				2206	* last_offset tell us which parts of the key space the log should
				2207	* be considered authoritative for.
				2208	*/
				2209	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2210	struct btrfs_root *log,
				2211	struct btrfs_path *path,
				2212	int key_type, u64 dirid,
				2213	u64 first_offset, u64 last_offset)
				2214	{
				2215	int ret;
				2216	struct btrfs_key key;
				2217	struct btrfs_dir_log_item *item;
				2218
				2219	key.objectid = dirid;
				2220	key.offset = first_offset;
				2221	if (key_type == BTRFS_DIR_ITEM_KEY)
				2222	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2223	else
				2224	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2225	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2226	BUG_ON(ret);
				2227
				2228	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2229	struct btrfs_dir_log_item);
				2230	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2231	btrfs_mark_buffer_dirty(path->nodes[0]);
				2232	btrfs_release_path(log, path);
				2233	return 0;
				2234	}
				2235
				2236	/*
				2237	* log all the items included in the current transaction for a given
				2238	* directory. This also creates the range items in the log tree required
				2239	* to replay anything deleted before the fsync
				2240	*/
				2241	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2242	struct btrfs_root root, struct inode inode,
				2243	struct btrfs_path *path,
				2244	struct btrfs_path *dst_path, int key_type,
				2245	u64 min_offset, u64 *last_offset_ret)
				2246	{
				2247	struct btrfs_key min_key;
				2248	struct btrfs_key max_key;
				2249	struct btrfs_root *log = root->log_root;
				2250	struct extent_buffer *src;
				2251	int ret;
				2252	int i;
				2253	int nritems;
				2254	u64 first_offset = min_offset;
				2255	u64 last_offset = (u64)-1;
				2256
				2257	log = root->log_root;
				2258	max_key.objectid = inode->i_ino;
				2259	max_key.offset = (u64)-1;
				2260	max_key.type = key_type;
				2261
				2262	min_key.objectid = inode->i_ino;
				2263	min_key.type = key_type;
				2264	min_key.offset = min_offset;
				2265
				2266	path->keep_locks = 1;
				2267
				2268	ret = btrfs_search_forward(root, &min_key, &max_key,
				2269	path, 0, trans->transid);
				2270
				2271	/*
				2272	* we didn't find anything from this transaction, see if there
				2273	* is anything at all
				2274	*/
				2275	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2276	min_key.type != key_type) {
				2277	min_key.objectid = inode->i_ino;
				2278	min_key.type = key_type;
				2279	min_key.offset = (u64)-1;
				2280	btrfs_release_path(root, path);
				2281	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2282	if (ret < 0) {
				2283	btrfs_release_path(root, path);
				2284	return ret;
				2285	}
				2286	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2287
				2288	/* if ret == 0 there are items for this type,
				2289	* create a range to tell us the last key of this type.
				2290	* otherwise, there are no items in this directory after
				2291	* *min_offset, and we create a range to indicate that.
				2292	*/
				2293	if (ret == 0) {
				2294	struct btrfs_key tmp;
				2295	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2296	path->slots[0]);
				2297	if (key_type == tmp.type) {
				2298	first_offset = max(min_offset, tmp.offset) + 1;
				2299	}
				2300	}
				2301	goto done;
				2302	}
				2303
				2304	/* go backward to find any previous key */
				2305	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2306	if (ret == 0) {
				2307	struct btrfs_key tmp;
				2308	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2309	if (key_type == tmp.type) {
				2310	first_offset = tmp.offset;
				2311	ret = overwrite_item(trans, log, dst_path,
				2312	path->nodes[0], path->slots[0],
				2313	&tmp);
				2314	}
				2315	}
				2316	btrfs_release_path(root, path);
				2317
				2318	/* find the first key from this transaction again */
				2319	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2320	if (ret != 0) {
				2321	WARN_ON(1);
				2322	goto done;
				2323	}
				2324
				2325	/*
				2326	* we have a block from this transaction, log every item in it
				2327	* from our directory
				2328	*/
				2329	while(1) {
				2330	struct btrfs_key tmp;
				2331	src = path->nodes[0];
				2332	nritems = btrfs_header_nritems(src);
				2333	for (i = path->slots[0]; i < nritems; i++) {
				2334	btrfs_item_key_to_cpu(src, &min_key, i);
				2335
				2336	if (min_key.objectid != inode->i_ino \|\|
				2337	min_key.type != key_type)
				2338	goto done;
				2339	ret = overwrite_item(trans, log, dst_path, src, i,
				2340	&min_key);
				2341	BUG_ON(ret);
				2342	}
				2343	path->slots[0] = nritems;
				2344
				2345	/*
				2346	* look ahead to the next item and see if it is also
				2347	* from this directory and from this transaction
				2348	*/
				2349	ret = btrfs_next_leaf(root, path);
				2350	if (ret == 1) {
				2351	last_offset = (u64)-1;
				2352	goto done;
				2353	}
				2354	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2355	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2356	last_offset = (u64)-1;
				2357	goto done;
				2358	}
				2359	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2360	ret = overwrite_item(trans, log, dst_path,
				2361	path->nodes[0], path->slots[0],
				2362	&tmp);
				2363
				2364	BUG_ON(ret);
				2365	last_offset = tmp.offset;
				2366	goto done;
				2367	}
				2368	}
				2369	done:
				2370	*last_offset_ret = last_offset;
				2371	btrfs_release_path(root, path);
				2372	btrfs_release_path(log, dst_path);
				2373
				2374	/* insert the log range keys to indicate where the log is valid */
				2375	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2376	first_offset, last_offset);
				2377	BUG_ON(ret);
				2378	return 0;
				2379	}
				2380
				2381	/*
				2382	* logging directories is very similar to logging inodes, We find all the items
				2383	* from the current transaction and write them to the log.
				2384	*
				2385	* The recovery code scans the directory in the subvolume, and if it finds a
				2386	* key in the range logged that is not present in the log tree, then it means
				2387	* that dir entry was unlinked during the transaction.
				2388	*
				2389	* In order for that scan to work, we must include one key smaller than
				2390	* the smallest logged by this transaction and one key larger than the largest
				2391	* key logged by this transaction.
				2392	*/
				2393	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2394	struct btrfs_root root, struct inode inode,
				2395	struct btrfs_path *path,
				2396	struct btrfs_path *dst_path)
				2397	{
				2398	u64 min_key;
				2399	u64 max_key;
				2400	int ret;
				2401	int key_type = BTRFS_DIR_ITEM_KEY;
				2402
				2403	again:
				2404	min_key = 0;
				2405	max_key = 0;
				2406	while(1) {
				2407	ret = log_dir_items(trans, root, inode, path,
				2408	dst_path, key_type, min_key,
				2409	&max_key);
				2410	BUG_ON(ret);
				2411	if (max_key == (u64)-1)
				2412	break;
				2413	min_key = max_key + 1;
				2414	}
				2415
				2416	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2417	key_type = BTRFS_DIR_INDEX_KEY;
				2418	goto again;
				2419	}
				2420	return 0;
				2421	}
				2422
				2423	/*
				2424	* a helper function to drop items from the log before we relog an
				2425	* inode. max_key_type indicates the highest item type to remove.
				2426	* This cannot be run for file data extents because it does not
				2427	* free the extents they point to.
				2428	*/
				2429	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2430	struct btrfs_root *log,
				2431	struct btrfs_path *path,
				2432	u64 objectid, int max_key_type)
				2433	{
				2434	int ret;
				2435	struct btrfs_key key;
				2436	struct btrfs_key found_key;
				2437
				2438	key.objectid = objectid;
				2439	key.type = max_key_type;
				2440	key.offset = (u64)-1;
				2441
				2442	while(1) {
				2443	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2444
				2445	if (ret != 1)
				2446	break;
				2447
				2448	if (path->slots[0] == 0)
				2449	break;
				2450
				2451	path->slots[0]--;
				2452	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2453	path->slots[0]);
				2454
				2455	if (found_key.objectid != objectid)
				2456	break;
				2457
				2458	ret = btrfs_del_item(trans, log, path);
				2459	BUG_ON(ret);
				2460	btrfs_release_path(log, path);
				2461	}
				2462	btrfs_release_path(log, path);
				2463	return 0;
				2464	}
				2465
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2466	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2467	struct btrfs_root *log,
				2468	struct btrfs_path *dst_path,
				2469	struct extent_buffer *src,
				2470	int start_slot, int nr, int inode_only)
				2471	{
				2472	unsigned long src_offset;
				2473	unsigned long dst_offset;
				2474	struct btrfs_file_extent_item *extent;
				2475	struct btrfs_inode_item *inode_item;
				2476	int ret;
				2477	struct btrfs_key *ins_keys;
				2478	u32 *ins_sizes;
				2479	char *ins_data;
				2480	int i;
				2481
				2482	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2483	nr * sizeof(u32), GFP_NOFS);
				2484	ins_sizes = (u32 *)ins_data;
				2485	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2486
				2487	for (i = 0; i < nr; i++) {
				2488	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2489	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2490	}
				2491	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2492	ins_keys, ins_sizes, nr);
				2493	BUG_ON(ret);
				2494
				2495	for (i = 0; i < nr; i++) {
				2496	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2497	dst_path->slots[0]);
				2498
				2499	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2500
				2501	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2502	src_offset, ins_sizes[i]);
				2503
				2504	if (inode_only == LOG_INODE_EXISTS &&
				2505	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2506	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2507	dst_path->slots[0],
				2508	struct btrfs_inode_item);
				2509	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2510
				2511	/* set the generation to zero so the recover code
				2512	* can tell the difference between an logging
				2513	* just to say 'this inode exists' and a logging
				2514	* to say 'update this inode with these values'
				2515	*/
				2516	btrfs_set_inode_generation(dst_path->nodes[0],
				2517	inode_item, 0);
				2518	}
				2519	/* take a reference on file data extents so that truncates
				2520	* or deletes of this inode don't have to relog the inode
				2521	* again
				2522	*/
				2523	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2524	int found_type;
				2525	extent = btrfs_item_ptr(src, start_slot + i,
				2526	struct btrfs_file_extent_item);
				2527
				2528	found_type = btrfs_file_extent_type(src, extent);
Yan Zheng	d899e05	2008-10-30 14:25:28 -0400	[diff] [blame^]	2529	if (found_type == BTRFS_FILE_EXTENT_REG \|\|
				2530	found_type == BTRFS_FILE_EXTENT_PREALLOC) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2531	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2532	extent);
				2533	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2534	extent);
				2535	/* ds == 0 is a hole */
				2536	if (ds != 0) {
				2537	ret = btrfs_inc_extent_ref(trans, log,
				2538	ds, dl,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2539	dst_path->nodes[0]->start,
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2540	BTRFS_TREE_LOG_OBJECTID,
Zheng Yan	31840ae	2008-09-23 13:14:14 -0400	[diff] [blame]	2541	trans->transid,
Yan Zheng	3bb1a1b	2008-10-09 11:46:24 -0400	[diff] [blame]	2542	ins_keys[i].objectid);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2543	BUG_ON(ret);
				2544	}
				2545	}
				2546	}
				2547	dst_path->slots[0]++;
				2548	}
				2549
				2550	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2551	btrfs_release_path(log, dst_path);
				2552	kfree(ins_data);
				2553	return 0;
				2554	}
				2555
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2556	/* log a single inode in the tree log.
				2557	* At least one parent directory for this inode must exist in the tree
				2558	* or be logged already.
				2559	*
				2560	* Any items from this inode changed by the current transaction are copied
				2561	* to the log tree. An extra reference is taken on any extents in this
				2562	* file, allowing us to avoid a whole pile of corner cases around logging
				2563	* blocks that have been removed from the tree.
				2564	*
				2565	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2566	* does.
				2567	*
				2568	* This handles both files and directories.
				2569	*/
				2570	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2571	struct btrfs_root root, struct inode inode,
				2572	int inode_only)
				2573	{
				2574	struct btrfs_path *path;
				2575	struct btrfs_path *dst_path;
				2576	struct btrfs_key min_key;
				2577	struct btrfs_key max_key;
				2578	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2579	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2580	u32 size;
				2581	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2582	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2583	int ins_start_slot = 0;
				2584	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2585
				2586	log = root->log_root;
				2587
				2588	path = btrfs_alloc_path();
				2589	dst_path = btrfs_alloc_path();
				2590
				2591	min_key.objectid = inode->i_ino;
				2592	min_key.type = BTRFS_INODE_ITEM_KEY;
				2593	min_key.offset = 0;
				2594
				2595	max_key.objectid = inode->i_ino;
				2596	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2597	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2598	else
				2599	max_key.type = (u8)-1;
				2600	max_key.offset = (u64)-1;
				2601
				2602	/*
				2603	* if this inode has already been logged and we're in inode_only
				2604	* mode, we don't want to delete the things that have already
				2605	* been written to the log.
				2606	*
				2607	* But, if the inode has been through an inode_only log,
				2608	* the logged_trans field is not set. This allows us to catch
				2609	* any new names for this inode in the backrefs by logging it
				2610	* again
				2611	*/
				2612	if (inode_only == LOG_INODE_EXISTS &&
				2613	BTRFS_I(inode)->logged_trans == trans->transid) {
				2614	btrfs_free_path(path);
				2615	btrfs_free_path(dst_path);
				2616	goto out;
				2617	}
				2618	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2619
				2620	/*
				2621	* a brute force approach to making sure we get the most uptodate
				2622	* copies of everything.
				2623	*/
				2624	if (S_ISDIR(inode->i_mode)) {
				2625	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2626
				2627	if (inode_only == LOG_INODE_EXISTS)
				2628	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2629	ret = drop_objectid_items(trans, log, path,
				2630	inode->i_ino, max_key_type);
				2631	} else {
				2632	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2633	}
				2634	BUG_ON(ret);
				2635	path->keep_locks = 1;
				2636
				2637	while(1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2638	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2639	ret = btrfs_search_forward(root, &min_key, &max_key,
				2640	path, 0, trans->transid);
				2641	if (ret != 0)
				2642	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2643	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2644	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2645	if (min_key.objectid != inode->i_ino)
				2646	break;
				2647	if (min_key.type > max_key.type)
				2648	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2649
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2650	src = path->nodes[0];
				2651	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2652	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2653	ins_nr++;
				2654	goto next_slot;
				2655	} else if (!ins_nr) {
				2656	ins_start_slot = path->slots[0];
				2657	ins_nr = 1;
				2658	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2659	}
				2660
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2661	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2662	ins_nr, inode_only);
				2663	BUG_ON(ret);
				2664	ins_nr = 1;
				2665	ins_start_slot = path->slots[0];
				2666	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2667
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2668	nritems = btrfs_header_nritems(path->nodes[0]);
				2669	path->slots[0]++;
				2670	if (path->slots[0] < nritems) {
				2671	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2672	path->slots[0]);
				2673	goto again;
				2674	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2675	if (ins_nr) {
				2676	ret = copy_items(trans, log, dst_path, src,
				2677	ins_start_slot,
				2678	ins_nr, inode_only);
				2679	BUG_ON(ret);
				2680	ins_nr = 0;
				2681	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2682	btrfs_release_path(root, path);
				2683
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2684	if (min_key.offset < (u64)-1)
				2685	min_key.offset++;
				2686	else if (min_key.type < (u8)-1)
				2687	min_key.type++;
				2688	else if (min_key.objectid < (u64)-1)
				2689	min_key.objectid++;
				2690	else
				2691	break;
				2692	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame]	2693	if (ins_nr) {
				2694	ret = copy_items(trans, log, dst_path, src,
				2695	ins_start_slot,
				2696	ins_nr, inode_only);
				2697	BUG_ON(ret);
				2698	ins_nr = 0;
				2699	}
				2700	WARN_ON(ins_nr);
Chris Mason	9623f9a	2008-09-11 17:42:42 -0400	[diff] [blame]	2701	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode)) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2702	btrfs_release_path(root, path);
				2703	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2704	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2705	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2706	BUG_ON(ret);
				2707	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2708	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2709	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2710
				2711	btrfs_free_path(path);
				2712	btrfs_free_path(dst_path);
				2713
				2714	mutex_lock(&root->fs_info->tree_log_mutex);
				2715	ret = update_log_root(trans, log);
				2716	BUG_ON(ret);
				2717	mutex_unlock(&root->fs_info->tree_log_mutex);
				2718	out:
				2719	return 0;
				2720	}
				2721
				2722	int btrfs_log_inode(struct btrfs_trans_handle *trans,
				2723	struct btrfs_root root, struct inode inode,
				2724	int inode_only)
				2725	{
				2726	int ret;
				2727
				2728	start_log_trans(trans, root);
				2729	ret = __btrfs_log_inode(trans, root, inode, inode_only);
				2730	end_log_trans(root);
				2731	return ret;
				2732	}
				2733
				2734	/*
				2735	* helper function around btrfs_log_inode to make sure newly created
				2736	* parent directories also end up in the log. A minimal inode and backref
				2737	* only logging is done of any parent directories that are older than
				2738	* the last committed transaction
				2739	*/
				2740	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2741	struct btrfs_root root, struct dentry dentry)
				2742	{
				2743	int inode_only = LOG_INODE_ALL;
				2744	struct super_block *sb;
				2745	int ret;
				2746
				2747	start_log_trans(trans, root);
				2748	sb = dentry->d_inode->i_sb;
				2749	while(1) {
				2750	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2751	inode_only);
				2752	BUG_ON(ret);
				2753	inode_only = LOG_INODE_EXISTS;
				2754
				2755	dentry = dentry->d_parent;
				2756	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2757	break;
				2758
				2759	if (BTRFS_I(dentry->d_inode)->generation <=
				2760	root->fs_info->last_trans_committed)
				2761	break;
				2762	}
				2763	end_log_trans(root);
				2764	return 0;
				2765	}
				2766
				2767	/*
				2768	* it is not safe to log dentry if the chunk root has added new
				2769	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2770	* If this returns 1, you must commit the transaction to safely get your
				2771	* data on disk.
				2772	*/
				2773	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2774	struct btrfs_root root, struct dentry dentry)
				2775	{
				2776	u64 gen;
				2777	gen = root->fs_info->last_trans_new_blockgroup;
				2778	if (gen > root->fs_info->last_trans_committed)
				2779	return 1;
				2780	else
				2781	return btrfs_log_dentry(trans, root, dentry);
				2782	}
				2783
				2784	/*
				2785	* should be called during mount to recover any replay any log trees
				2786	* from the FS
				2787	*/
				2788	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2789	{
				2790	int ret;
				2791	struct btrfs_path *path;
				2792	struct btrfs_trans_handle *trans;
				2793	struct btrfs_key key;
				2794	struct btrfs_key found_key;
				2795	struct btrfs_key tmp_key;
				2796	struct btrfs_root *log;
				2797	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2798	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2799	struct walk_control wc = {
				2800	.process_func = process_one_buffer,
				2801	.stage = 0,
				2802	};
				2803
				2804	fs_info->log_root_recovering = 1;
				2805	path = btrfs_alloc_path();
				2806	BUG_ON(!path);
				2807
				2808	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2809
				2810	wc.trans = trans;
				2811	wc.pin = 1;
				2812
				2813	walk_log_tree(trans, log_root_tree, &wc);
				2814
				2815	again:
				2816	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2817	key.offset = (u64)-1;
				2818	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2819
				2820	while(1) {
				2821	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2822	if (ret < 0)
				2823	break;
				2824	if (ret > 0) {
				2825	if (path->slots[0] == 0)
				2826	break;
				2827	path->slots[0]--;
				2828	}
				2829	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2830	path->slots[0]);
				2831	btrfs_release_path(log_root_tree, path);
				2832	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2833	break;
				2834
				2835	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2836	&found_key);
				2837	BUG_ON(!log);
				2838
				2839
				2840	tmp_key.objectid = found_key.offset;
				2841	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2842	tmp_key.offset = (u64)-1;
				2843
				2844	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				2845
				2846	BUG_ON(!wc.replay_dest);
				2847
				2848	btrfs_record_root_in_trans(wc.replay_dest);
				2849	ret = walk_log_tree(trans, log, &wc);
				2850	BUG_ON(ret);
				2851
				2852	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2853	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2854	path);
				2855	BUG_ON(ret);
				2856	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2857	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2858	if (ret == 0) {
				2859	wc.replay_dest->highest_inode = highest_inode;
				2860	wc.replay_dest->last_inode_alloc = highest_inode;
				2861	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2862
				2863	key.offset = found_key.offset - 1;
				2864	free_extent_buffer(log->node);
				2865	kfree(log);
				2866
				2867	if (found_key.offset == 0)
				2868	break;
				2869	}
				2870	btrfs_release_path(log_root_tree, path);
				2871
				2872	/* step one is to pin it all, step two is to replay just inodes */
				2873	if (wc.pin) {
				2874	wc.pin = 0;
				2875	wc.process_func = replay_one_buffer;
				2876	wc.stage = LOG_WALK_REPLAY_INODES;
				2877	goto again;
				2878	}
				2879	/* step three is to replay everything */
				2880	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2881	wc.stage++;
				2882	goto again;
				2883	}
				2884
				2885	btrfs_free_path(path);
				2886
				2887	free_extent_buffer(log_root_tree->node);
				2888	log_root_tree->log_root = NULL;
				2889	fs_info->log_root_recovering = 0;
				2890
				2891	/* step 4: commit the transaction, which also unpins the blocks */
				2892	btrfs_commit_transaction(trans, fs_info->tree_root);
				2893
				2894	kfree(log_root_tree);
				2895	return 0;
				2896	}