Blame - fs/btrfs/tree-log.c - kernel/msm-4.9

blob: ae96451bc223dfe7350887afbd802866a27f0f48 [file] [log] [blame]

Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1	/*
				2	* Copyright (C) 2008 Oracle. All rights reserved.
				3	*
				4	* This program is free software; you can redistribute it and/or
				5	* modify it under the terms of the GNU General Public
				6	* License v2 as published by the Free Software Foundation.
				7	*
				8	* This program is distributed in the hope that it will be useful,
				9	* but WITHOUT ANY WARRANTY; without even the implied warranty of
				10	* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
				11	* General Public License for more details.
				12	*
				13	* You should have received a copy of the GNU General Public
				14	* License along with this program; if not, write to the
				15	* Free Software Foundation, Inc., 59 Temple Place - Suite 330,
				16	* Boston, MA 021110-1307, USA.
				17	*/
				18
				19	#include <linux/sched.h>
				20	#include "ctree.h"
				21	#include "transaction.h"
				22	#include "disk-io.h"
				23	#include "locking.h"
				24	#include "print-tree.h"
				25	#include "compat.h"
				26
				27	/* magic values for the inode_only argument of __btrfs_log_inode:
				28	*
				29	* LOG_INODE_ALL means to log everything
				30	* LOG_INODE_EXISTS means to log just enough to recreate the inode
				31	* during log replay
				32	*/
				33	#define LOG_INODE_ALL 0
				34	#define LOG_INODE_EXISTS 1
				35
				36	/*
				37	* stages for the tree walking. The first stage
				38	* (LOG_WALK_PIN_ONLY) only pins down the blocks we find.
				39	* The second stage (LOG_WALK_REPLAY_INODES) makes sure that all the
				40	* inodes we find in the log are created in the subvolume.
				41	*
				42	* The last stage (LOG_WALK_REPLAY_ALL) deals with directories, links,
				43	* extents and all the other fun semantics.
				44	*/
				45	#define LOG_WALK_PIN_ONLY 0
				46	#define LOG_WALK_REPLAY_INODES 1
				47	#define LOG_WALK_REPLAY_ALL 2
				48
				/*
				 * forward declaration.  inode_only is one of the LOG_INODE_* values
				 * defined above.
				 */
				static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
							     struct btrfs_root *root, struct inode *inode,
							     int inode_only);
				52
				53	/*
				54	* tree logging is a special write ahead log used to make sure that
				55	* fsyncs and O_SYNCs can happen without doing full tree commits.
				56	*
				57	* Full tree commits are expensive because they require commonly
				58	* modified blocks to be recowed, creating many dirty pages in the
				59	* extent tree and a 4x-6x higher write load than ext3.
				60	*
				61	* Instead of doing a tree commit on every fsync, we use the
				62	* key ranges and transaction ids to find items for a given file or directory
				63	* that have changed in this transaction. Those items are copied into
				64	* a special tree (one per subvolume root), that tree is written to disk
				65	* and then the fsync is considered complete.
				66	*
				67	* After a crash, items are copied out of the log-tree back into the
				68	* subvolume tree. Any file data extents found are recorded in the extent
				69	* allocation tree, and the log-tree freed.
				70	*
				71	* The log tree is read three times: once to pin down all the extents it is
				72	* using in ram, once to create all the inodes logged in the tree
				73	* and once to do all the other items.
				74	*/
				75
				76	/*
				77	* btrfs_add_log_tree adds a new per-subvolume log tree into the
				78	* tree of log tree roots. This must be called with a tree log transaction
				79	* running (see start_log_trans).
				80	*/
				81	int btrfs_add_log_tree(struct btrfs_trans_handle *trans,
				82	struct btrfs_root *root)
				83	{
				84	struct btrfs_key key;
				85	struct btrfs_root_item root_item;
				86	struct btrfs_inode_item *inode_item;
				87	struct extent_buffer *leaf;
				88	struct btrfs_root *new_root = root;
				89	int ret;
				90	u64 objectid = root->root_key.objectid;
				91
				92	leaf = btrfs_alloc_free_block(trans, root, root->leafsize,
				93	BTRFS_TREE_LOG_OBJECTID,
				94	0, 0, 0, 0, 0);
				95	if (IS_ERR(leaf)) {
				96	ret = PTR_ERR(leaf);
				97	return ret;
				98	}
				99
				100	btrfs_set_header_nritems(leaf, 0);
				101	btrfs_set_header_level(leaf, 0);
				102	btrfs_set_header_bytenr(leaf, leaf->start);
				103	btrfs_set_header_generation(leaf, trans->transid);
				104	btrfs_set_header_owner(leaf, BTRFS_TREE_LOG_OBJECTID);
				105
				106	write_extent_buffer(leaf, root->fs_info->fsid,
				107	(unsigned long)btrfs_header_fsid(leaf),
				108	BTRFS_FSID_SIZE);
				109	btrfs_mark_buffer_dirty(leaf);
				110
				111	inode_item = &root_item.inode;
				112	memset(inode_item, 0, sizeof(*inode_item));
				113	inode_item->generation = cpu_to_le64(1);
				114	inode_item->size = cpu_to_le64(3);
				115	inode_item->nlink = cpu_to_le32(1);
				116	inode_item->nblocks = cpu_to_le64(1);
				117	inode_item->mode = cpu_to_le32(S_IFDIR \| 0755);
				118
				119	btrfs_set_root_bytenr(&root_item, leaf->start);
				120	btrfs_set_root_level(&root_item, 0);
				121	btrfs_set_root_refs(&root_item, 0);
				122	btrfs_set_root_used(&root_item, 0);
				123
				124	memset(&root_item.drop_progress, 0, sizeof(root_item.drop_progress));
				125	root_item.drop_level = 0;
				126
				127	btrfs_tree_unlock(leaf);
				128	free_extent_buffer(leaf);
				129	leaf = NULL;
				130
				131	btrfs_set_root_dirid(&root_item, 0);
				132
				133	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				134	key.offset = objectid;
				135	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				136	ret = btrfs_insert_root(trans, root->fs_info->log_root_tree, &key,
				137	&root_item);
				138	if (ret)
				139	goto fail;
				140
				141	new_root = btrfs_read_fs_root_no_radix(root->fs_info->log_root_tree,
				142	&key);
				143	BUG_ON(!new_root);
				144
				145	WARN_ON(root->log_root);
				146	root->log_root = new_root;
				147
				148	/*
				149	* log trees do not get reference counted because they go away
				150	* before a real commit is actually done. They do store pointers
				151	* to file data extents, and those reference counts still get
				152	* updated (along with back refs to the log tree).
				153	*/
				154	new_root->ref_cows = 0;
				155	new_root->last_trans = trans->transid;
				156	fail:
				157	return ret;
				158	}
				159
				160	/*
				161	* start a sub transaction and setup the log tree
				162	* this increments the log tree writer count to make the people
				163	* syncing the tree wait for us to finish
				164	*/
				165	static int start_log_trans(struct btrfs_trans_handle *trans,
				166	struct btrfs_root *root)
				167	{
				168	int ret;
				169	mutex_lock(&root->fs_info->tree_log_mutex);
				170	if (!root->fs_info->log_root_tree) {
				171	ret = btrfs_init_log_root_tree(trans, root->fs_info);
				172	BUG_ON(ret);
				173	}
				174	if (!root->log_root) {
				175	ret = btrfs_add_log_tree(trans, root);
				176	BUG_ON(ret);
				177	}
				178	atomic_inc(&root->fs_info->tree_log_writers);
				179	root->fs_info->tree_log_batch++;
				180	mutex_unlock(&root->fs_info->tree_log_mutex);
				181	return 0;
				182	}
				183
				184	/*
				185	* returns 0 if there was a log transaction running and we were able
				186	* to join, or returns -ENOENT if there were not transactions
				187	* in progress
				188	*/
				189	static int join_running_log_trans(struct btrfs_root *root)
				190	{
				191	int ret = -ENOENT;
				192
				193	smp_mb();
				194	if (!root->log_root)
				195	return -ENOENT;
				196
				197	mutex_lock(&root->fs_info->tree_log_mutex);
				198	if (root->log_root) {
				199	ret = 0;
				200	atomic_inc(&root->fs_info->tree_log_writers);
				201	root->fs_info->tree_log_batch++;
				202	}
				203	mutex_unlock(&root->fs_info->tree_log_mutex);
				204	return ret;
				205	}
				206
				207	/*
				208	* indicate we're done making changes to the log tree
				209	* and wake up anyone waiting to do a sync
				210	*/
				211	static int end_log_trans(struct btrfs_root *root)
				212	{
				213	atomic_dec(&root->fs_info->tree_log_writers);
				214	smp_mb();
				215	if (waitqueue_active(&root->fs_info->tree_log_wait))
				216	wake_up(&root->fs_info->tree_log_wait);
				217	return 0;
				218	}
				219
				220
				221	/*
				222	* the walk control struct is used to pass state down the chain when
				223	* processing the log tree. The stage field tells us which part
				224	* of the log tree processing we are currently doing. The others
				225	* are state fields used for that specific part
				226	*/
				227	struct walk_control {
				228	/* should we free the extent on disk when done? This is used
				229	* at transaction commit time while freeing a log tree
				230	*/
				231	int free;
				232
				233	/* should we write out the extent buffer? This is used
				234	* while flushing the log tree to disk during a sync
				235	*/
				236	int write;
				237
				238	/* should we wait for the extent buffer io to finish? Also used
				239	* while flushing the log tree to disk for a sync
				240	*/
				241	int wait;
				242
				243	/* pin only walk, we record which extents on disk belong to the
				244	* log trees
				245	*/
				246	int pin;
				247
				248	/* what stage of the replay code we're currently in */
				249	int stage;
				250
				251	/* the root we are currently replaying */
				252	struct btrfs_root *replay_dest;
				253
				254	/* the trans handle for the current replay */
				255	struct btrfs_trans_handle *trans;
				256
				257	/* the function that gets used to process blocks we find in the
				258	* tree. Note the extent_buffer might not be up to date when it is
				259	* passed in, and it must be checked or read if you need the data
				260	* inside it
				261	*/
				262	int (process_func)(struct btrfs_root log, struct extent_buffer *eb,
				263	struct walk_control *wc, u64 gen);
				264	};
				265
				266	/*
				267	* process_func used to pin down extents, write them or wait on them
				268	*/
				269	static int process_one_buffer(struct btrfs_root *log,
				270	struct extent_buffer *eb,
				271	struct walk_control *wc, u64 gen)
				272	{
				273	if (wc->pin) {
				274	mutex_lock(&log->fs_info->alloc_mutex);
				275	btrfs_update_pinned_extents(log->fs_info->extent_root,
				276	eb->start, eb->len, 1);
				277	mutex_unlock(&log->fs_info->alloc_mutex);
				278	}
				279
				280	if (btrfs_buffer_uptodate(eb, gen)) {
				281	if (wc->write)
				282	btrfs_write_tree_block(eb);
				283	if (wc->wait)
				284	btrfs_wait_tree_block_writeback(eb);
				285	}
				286	return 0;
				287	}
				288
				289	/*
				290	* Item overwrite used by replay and tree logging. eb, slot and key all refer
				291	* to the src data we are copying out.
				292	*
				293	* root is the tree we are copying into, and path is a scratch
				294	* path for use in this function (it should be released on entry and
				295	* will be released on exit).
				296	*
				297	* If the key is already in the destination tree the existing item is
				298	* overwritten. If the existing item isn't big enough, it is extended.
				299	* If it is too large, it is truncated.
				300	*
				301	* If the key isn't in the destination yet, a new item is inserted.
				302	*/
				303	static noinline int overwrite_item(struct btrfs_trans_handle *trans,
				304	struct btrfs_root *root,
				305	struct btrfs_path *path,
				306	struct extent_buffer *eb, int slot,
				307	struct btrfs_key *key)
				308	{
				309	int ret;
				310	u32 item_size;
				311	u64 saved_i_size = 0;
				312	int save_old_i_size = 0;
				313	unsigned long src_ptr;
				314	unsigned long dst_ptr;
				315	int overwrite_root = 0;
				316
				317	if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				318	overwrite_root = 1;
				319
				320	item_size = btrfs_item_size_nr(eb, slot);
				321	src_ptr = btrfs_item_ptr_offset(eb, slot);
				322
				323	/* look for the key in the destination tree */
				324	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				325	if (ret == 0) {
				326	char *src_copy;
				327	char *dst_copy;
				328	u32 dst_size = btrfs_item_size_nr(path->nodes[0],
				329	path->slots[0]);
				330	if (dst_size != item_size)
				331	goto insert;
				332
				333	if (item_size == 0) {
				334	btrfs_release_path(root, path);
				335	return 0;
				336	}
				337	dst_copy = kmalloc(item_size, GFP_NOFS);
				338	src_copy = kmalloc(item_size, GFP_NOFS);
				339
				340	read_extent_buffer(eb, src_copy, src_ptr, item_size);
				341
				342	dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				343	read_extent_buffer(path->nodes[0], dst_copy, dst_ptr,
				344	item_size);
				345	ret = memcmp(dst_copy, src_copy, item_size);
				346
				347	kfree(dst_copy);
				348	kfree(src_copy);
				349	/*
				350	* they have the same contents, just return, this saves
				351	* us from cowing blocks in the destination tree and doing
				352	* extra writes that may not have been done by a previous
				353	* sync
				354	*/
				355	if (ret == 0) {
				356	btrfs_release_path(root, path);
				357	return 0;
				358	}
				359
				360	}
				361	insert:
				362	btrfs_release_path(root, path);
				363	/* try to insert the key into the destination tree */
				364	ret = btrfs_insert_empty_item(trans, root, path,
				365	key, item_size);
				366
				367	/* make sure any existing item is the correct size */
				368	if (ret == -EEXIST) {
				369	u32 found_size;
				370	found_size = btrfs_item_size_nr(path->nodes[0],
				371	path->slots[0]);
				372	if (found_size > item_size) {
				373	btrfs_truncate_item(trans, root, path, item_size, 1);
				374	} else if (found_size < item_size) {
				375	ret = btrfs_del_item(trans, root,
				376	path);
				377	BUG_ON(ret);
				378
				379	btrfs_release_path(root, path);
				380	ret = btrfs_insert_empty_item(trans,
				381	root, path, key, item_size);
				382	BUG_ON(ret);
				383	}
				384	} else if (ret) {
				385	BUG();
				386	}
				387	dst_ptr = btrfs_item_ptr_offset(path->nodes[0],
				388	path->slots[0]);
				389
				390	/* don't overwrite an existing inode if the generation number
				391	* was logged as zero. This is done when the tree logging code
				392	* is just logging an inode to make sure it exists after recovery.
				393	*
				394	* Also, don't overwrite i_size on directories during replay.
				395	* log replay inserts and removes directory items based on the
				396	* state of the tree found in the subvolume, and i_size is modified
				397	* as it goes
				398	*/
				399	if (key->type == BTRFS_INODE_ITEM_KEY && ret == -EEXIST) {
				400	struct btrfs_inode_item *src_item;
				401	struct btrfs_inode_item *dst_item;
				402
				403	src_item = (struct btrfs_inode_item *)src_ptr;
				404	dst_item = (struct btrfs_inode_item *)dst_ptr;
				405
				406	if (btrfs_inode_generation(eb, src_item) == 0)
				407	goto no_copy;
				408
				409	if (overwrite_root &&
				410	S_ISDIR(btrfs_inode_mode(eb, src_item)) &&
				411	S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) {
				412	save_old_i_size = 1;
				413	saved_i_size = btrfs_inode_size(path->nodes[0],
				414	dst_item);
				415	}
				416	}
				417
				418	copy_extent_buffer(path->nodes[0], eb, dst_ptr,
				419	src_ptr, item_size);
				420
				421	if (save_old_i_size) {
				422	struct btrfs_inode_item *dst_item;
				423	dst_item = (struct btrfs_inode_item *)dst_ptr;
				424	btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size);
				425	}
				426
				427	/* make sure the generation is filled in */
				428	if (key->type == BTRFS_INODE_ITEM_KEY) {
				429	struct btrfs_inode_item *dst_item;
				430	dst_item = (struct btrfs_inode_item *)dst_ptr;
				431	if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) {
				432	btrfs_set_inode_generation(path->nodes[0], dst_item,
				433	trans->transid);
				434	}
				435	}
				436	no_copy:
				437	btrfs_mark_buffer_dirty(path->nodes[0]);
				438	btrfs_release_path(root, path);
				439	return 0;
				440	}
				441
				442	/*
				443	* simple helper to read an inode off the disk from a given root
				444	* This can only be called for subvolume roots and not for the log
				445	*/
				446	static noinline struct inode read_one_inode(struct btrfs_root root,
				447	u64 objectid)
				448	{
				449	struct inode *inode;
				450	inode = btrfs_iget_locked(root->fs_info->sb, objectid, root);
				451	if (inode->i_state & I_NEW) {
				452	BTRFS_I(inode)->root = root;
				453	BTRFS_I(inode)->location.objectid = objectid;
				454	BTRFS_I(inode)->location.type = BTRFS_INODE_ITEM_KEY;
				455	BTRFS_I(inode)->location.offset = 0;
				456	btrfs_read_locked_inode(inode);
				457	unlock_new_inode(inode);
				458
				459	}
				460	if (is_bad_inode(inode)) {
				461	iput(inode);
				462	inode = NULL;
				463	}
				464	return inode;
				465	}
				466
				467	/* replays a single extent in 'eb' at 'slot' with 'key' into the
				468	* subvolume 'root'. path is released on entry and should be released
				469	* on exit.
				470	*
				471	* extents in the log tree have not been allocated out of the extent
				472	* tree yet. So, this completes the allocation, taking a reference
				473	* as required if the extent already exists or creating a new extent
				474	* if it isn't in the extent allocation tree yet.
				475	*
				476	* The extent is inserted into the file, dropping any existing extents
				477	* from the file that overlap the new one.
				478	*/
				479	static noinline int replay_one_extent(struct btrfs_trans_handle *trans,
				480	struct btrfs_root *root,
				481	struct btrfs_path *path,
				482	struct extent_buffer *eb, int slot,
				483	struct btrfs_key *key)
				484	{
				485	int found_type;
				486	u64 mask = root->sectorsize - 1;
				487	u64 extent_end;
				488	u64 alloc_hint;
				489	u64 start = key->offset;
				490	struct btrfs_file_extent_item *item;
				491	struct inode *inode = NULL;
				492	unsigned long size;
				493	int ret = 0;
				494
				495	item = btrfs_item_ptr(eb, slot, struct btrfs_file_extent_item);
				496	found_type = btrfs_file_extent_type(eb, item);
				497
				498	if (found_type == BTRFS_FILE_EXTENT_REG)
				499	extent_end = start + btrfs_file_extent_num_bytes(eb, item);
				500	else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				501	size = btrfs_file_extent_inline_len(eb,
				502	btrfs_item_nr(eb, slot));
				503	extent_end = (start + size + mask) & ~mask;
				504	} else {
				505	ret = 0;
				506	goto out;
				507	}
				508
				509	inode = read_one_inode(root, key->objectid);
				510	if (!inode) {
				511	ret = -EIO;
				512	goto out;
				513	}
				514
				515	/*
				516	* first check to see if we already have this extent in the
				517	* file. This must be done before the btrfs_drop_extents run
				518	* so we don't try to drop this extent.
				519	*/
				520	ret = btrfs_lookup_file_extent(trans, root, path, inode->i_ino,
				521	start, 0);
				522
				523	if (ret == 0 && found_type == BTRFS_FILE_EXTENT_REG) {
				524	struct btrfs_file_extent_item cmp1;
				525	struct btrfs_file_extent_item cmp2;
				526	struct btrfs_file_extent_item *existing;
				527	struct extent_buffer *leaf;
				528
				529	leaf = path->nodes[0];
				530	existing = btrfs_item_ptr(leaf, path->slots[0],
				531	struct btrfs_file_extent_item);
				532
				533	read_extent_buffer(eb, &cmp1, (unsigned long)item,
				534	sizeof(cmp1));
				535	read_extent_buffer(leaf, &cmp2, (unsigned long)existing,
				536	sizeof(cmp2));
				537
				538	/*
				539	* we already have a pointer to this exact extent,
				540	* we don't have to do anything
				541	*/
				542	if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) {
				543	btrfs_release_path(root, path);
				544	goto out;
				545	}
				546	}
				547	btrfs_release_path(root, path);
				548
				549	/* drop any overlapping extents */
				550	ret = btrfs_drop_extents(trans, root, inode,
				551	start, extent_end, start, &alloc_hint);
				552	BUG_ON(ret);
				553
				554	BUG_ON(ret);
				555	if (found_type == BTRFS_FILE_EXTENT_REG) {
				556	struct btrfs_key ins;
				557
				558	ins.objectid = btrfs_file_extent_disk_bytenr(eb, item);
				559	ins.offset = btrfs_file_extent_disk_num_bytes(eb, item);
				560	ins.type = BTRFS_EXTENT_ITEM_KEY;
				561
				562	/* insert the extent pointer in the file */
				563	ret = overwrite_item(trans, root, path, eb, slot, key);
				564	BUG_ON(ret);
				565
				566	/*
				567	* is this extent already allocated in the extent
				568	* allocation tree? If so, just add a reference
				569	*/
				570	ret = btrfs_lookup_extent(root, path, ins.objectid, ins.offset);
				571	btrfs_release_path(root, path);
				572	if (ret == 0) {
				573	ret = btrfs_inc_extent_ref(trans, root,
				574	ins.objectid, ins.offset,
				575	root->root_key.objectid,
				576	trans->transid, key->objectid, start);
				577	} else {
				578	/*
				579	* insert the extent pointer in the extent
				580	* allocation tree
				581	*/
				582	ret = btrfs_alloc_logged_extent(trans, root,
				583	root->root_key.objectid,
				584	trans->transid, key->objectid,
				585	start, &ins);
				586	BUG_ON(ret);
				587	}
				588	} else if (found_type == BTRFS_FILE_EXTENT_INLINE) {
				589	/* inline extents are easy, we just overwrite them */
				590	ret = overwrite_item(trans, root, path, eb, slot, key);
				591	BUG_ON(ret);
				592	}
				593	/* btrfs_drop_extents changes i_blocks, update it here */
				594	inode->i_blocks += (extent_end - start) >> 9;
				595	btrfs_update_inode(trans, root, inode);
				596	out:
				597	if (inode)
				598	iput(inode);
				599	return ret;
				600	}
				601
				602	/*
				603	* when cleaning up conflicts between the directory names in the
				604	* subvolume, directory names in the log and directory names in the
				605	* inode back references, we may have to unlink inodes from directories.
				606	*
				607	* This is a helper function to do the unlink of a specific directory
				608	* item
				609	*/
				610	static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans,
				611	struct btrfs_root *root,
				612	struct btrfs_path *path,
				613	struct inode *dir,
				614	struct btrfs_dir_item *di)
				615	{
				616	struct inode *inode;
				617	char *name;
				618	int name_len;
				619	struct extent_buffer *leaf;
				620	struct btrfs_key location;
				621	int ret;
				622
				623	leaf = path->nodes[0];
				624
				625	btrfs_dir_item_key_to_cpu(leaf, di, &location);
				626	name_len = btrfs_dir_name_len(leaf, di);
				627	name = kmalloc(name_len, GFP_NOFS);
				628	read_extent_buffer(leaf, name, (unsigned long)(di + 1), name_len);
				629	btrfs_release_path(root, path);
				630
				631	inode = read_one_inode(root, location.objectid);
				632	BUG_ON(!inode);
				633
				634	btrfs_inc_nlink(inode);
				635	ret = btrfs_unlink_inode(trans, root, dir, inode, name, name_len);
				636	kfree(name);
				637
				638	iput(inode);
				639	return ret;
				640	}
				641
				642	/*
				643	* helper function to see if a given name and sequence number found
				644	* in an inode back reference are already in a directory and correctly
				645	* point to this inode
				646	*/
				647	static noinline int inode_in_dir(struct btrfs_root *root,
				648	struct btrfs_path *path,
				649	u64 dirid, u64 objectid, u64 index,
				650	const char *name, int name_len)
				651	{
				652	struct btrfs_dir_item *di;
				653	struct btrfs_key location;
				654	int match = 0;
				655
				656	di = btrfs_lookup_dir_index_item(NULL, root, path, dirid,
				657	index, name, name_len, 0);
				658	if (di && !IS_ERR(di)) {
				659	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				660	if (location.objectid != objectid)
				661	goto out;
				662	} else
				663	goto out;
				664	btrfs_release_path(root, path);
				665
				666	di = btrfs_lookup_dir_item(NULL, root, path, dirid, name, name_len, 0);
				667	if (di && !IS_ERR(di)) {
				668	btrfs_dir_item_key_to_cpu(path->nodes[0], di, &location);
				669	if (location.objectid != objectid)
				670	goto out;
				671	} else
				672	goto out;
				673	match = 1;
				674	out:
				675	btrfs_release_path(root, path);
				676	return match;
				677	}
				678
				679	/*
				680	* helper function to check a log tree for a named back reference in
				681	* an inode. This is used to decide if a back reference that is
				682	* found in the subvolume conflicts with what we find in the log.
				683	*
				684	* inode backreferences may have multiple refs in a single item,
				685	* during replay we process one reference at a time, and we don't
				686	* want to delete valid links to a file from the subvolume if that
				687	* link is also in the log.
				688	*/
				689	static noinline int backref_in_log(struct btrfs_root *log,
				690	struct btrfs_key *key,
				691	char *name, int namelen)
				692	{
				693	struct btrfs_path *path;
				694	struct btrfs_inode_ref *ref;
				695	unsigned long ptr;
				696	unsigned long ptr_end;
				697	unsigned long name_ptr;
				698	int found_name_len;
				699	int item_size;
				700	int ret;
				701	int match = 0;
				702
				703	path = btrfs_alloc_path();
				704	ret = btrfs_search_slot(NULL, log, key, path, 0, 0);
				705	if (ret != 0)
				706	goto out;
				707
				708	item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
				709	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				710	ptr_end = ptr + item_size;
				711	while (ptr < ptr_end) {
				712	ref = (struct btrfs_inode_ref *)ptr;
				713	found_name_len = btrfs_inode_ref_name_len(path->nodes[0], ref);
				714	if (found_name_len == namelen) {
				715	name_ptr = (unsigned long)(ref + 1);
				716	ret = memcmp_extent_buffer(path->nodes[0], name,
				717	name_ptr, namelen);
				718	if (ret == 0) {
				719	match = 1;
				720	goto out;
				721	}
				722	}
				723	ptr = (unsigned long)(ref + 1) + found_name_len;
				724	}
				725	out:
				726	btrfs_free_path(path);
				727	return match;
				728	}
				729
				730
				731	/*
				732	* replay one inode back reference item found in the log tree.
				733	* eb, slot and key refer to the buffer and key found in the log tree.
				734	* root is the destination we are replaying into, and path is for temp
				735	* use by this function. (it should be released on return).
				736	*/
				737	static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
				738	struct btrfs_root *root,
				739	struct btrfs_root *log,
				740	struct btrfs_path *path,
				741	struct extent_buffer *eb, int slot,
				742	struct btrfs_key *key)
				743	{
				744	struct inode *dir;
				745	int ret;
				746	struct btrfs_key location;
				747	struct btrfs_inode_ref *ref;
				748	struct btrfs_dir_item *di;
				749	struct inode *inode;
				750	char *name;
				751	int namelen;
				752	unsigned long ref_ptr;
				753	unsigned long ref_end;
				754
				755	location.objectid = key->objectid;
				756	location.type = BTRFS_INODE_ITEM_KEY;
				757	location.offset = 0;
				758
				759	/*
				760	* it is possible that we didn't log all the parent directories
				761	* for a given inode. If we don't find the dir, just don't
				762	* copy the back ref in. The link count fixup code will take
				763	* care of the rest
				764	*/
				765	dir = read_one_inode(root, key->offset);
				766	if (!dir)
				767	return -ENOENT;
				768
				769	inode = read_one_inode(root, key->objectid);
				770	BUG_ON(!dir);
				771
				772	ref_ptr = btrfs_item_ptr_offset(eb, slot);
				773	ref_end = ref_ptr + btrfs_item_size_nr(eb, slot);
				774
				775	again:
				776	ref = (struct btrfs_inode_ref *)ref_ptr;
				777
				778	namelen = btrfs_inode_ref_name_len(eb, ref);
				779	name = kmalloc(namelen, GFP_NOFS);
				780	BUG_ON(!name);
				781
				782	read_extent_buffer(eb, name, (unsigned long)(ref + 1), namelen);
				783
				784	/* if we already have a perfect match, we're done */
				785	if (inode_in_dir(root, path, dir->i_ino, inode->i_ino,
				786	btrfs_inode_ref_index(eb, ref),
				787	name, namelen)) {
				788	goto out;
				789	}
				790
				791	/*
				792	* look for a conflicting back reference in the metadata.
				793	* if we find one we have to unlink that name of the file
				794	* before we add our new link. Later on, we overwrite any
				795	* existing back reference, and we don't want to create
				796	* dangling pointers in the directory.
				797	*/
				798	conflict_again:
				799	ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
				800	if (ret == 0) {
				801	char *victim_name;
				802	int victim_name_len;
				803	struct btrfs_inode_ref *victim_ref;
				804	unsigned long ptr;
				805	unsigned long ptr_end;
				806	struct extent_buffer *leaf = path->nodes[0];
				807
				808	/* are we trying to overwrite a back ref for the root directory
				809	* if so, just jump out, we're done
				810	*/
				811	if (key->objectid == key->offset)
				812	goto out_nowrite;
				813
				814	/* check all the names in this back reference to see
				815	* if they are in the log. if so, we allow them to stay
				816	* otherwise they must be unlinked as a conflict
				817	*/
				818	ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
				819	ptr_end = ptr + btrfs_item_size_nr(leaf, path->slots[0]);
				820	while(ptr < ptr_end) {
				821	victim_ref = (struct btrfs_inode_ref *)ptr;
				822	victim_name_len = btrfs_inode_ref_name_len(leaf,
				823	victim_ref);
				824	victim_name = kmalloc(victim_name_len, GFP_NOFS);
				825	BUG_ON(!victim_name);
				826
				827	read_extent_buffer(leaf, victim_name,
				828	(unsigned long)(victim_ref + 1),
				829	victim_name_len);
				830
				831	if (!backref_in_log(log, key, victim_name,
				832	victim_name_len)) {
				833	btrfs_inc_nlink(inode);
				834	btrfs_release_path(root, path);
				835	ret = btrfs_unlink_inode(trans, root, dir,
				836	inode, victim_name,
				837	victim_name_len);
				838	kfree(victim_name);
				839	btrfs_release_path(root, path);
				840	goto conflict_again;
				841	}
				842	kfree(victim_name);
				843	ptr = (unsigned long)(victim_ref + 1) + victim_name_len;
				844	}
				845	BUG_ON(ret);
				846	}
				847	btrfs_release_path(root, path);
				848
				849	/* look for a conflicting sequence number */
				850	di = btrfs_lookup_dir_index_item(trans, root, path, dir->i_ino,
				851	btrfs_inode_ref_index(eb, ref),
				852	name, namelen, 0);
				853	if (di && !IS_ERR(di)) {
				854	ret = drop_one_dir_item(trans, root, path, dir, di);
				855	BUG_ON(ret);
				856	}
				857	btrfs_release_path(root, path);
				858
				859
				860	/* look for a conflicting name */
				861	di = btrfs_lookup_dir_item(trans, root, path, dir->i_ino,
				862	name, namelen, 0);
				863	if (di && !IS_ERR(di)) {
				864	ret = drop_one_dir_item(trans, root, path, dir, di);
				865	BUG_ON(ret);
				866	}
				867	btrfs_release_path(root, path);
				868
				869	/* insert our name */
				870	ret = btrfs_add_link(trans, dir, inode, name, namelen, 0,
				871	btrfs_inode_ref_index(eb, ref));
				872	BUG_ON(ret);
				873
				874	btrfs_update_inode(trans, root, inode);
				875
				876	out:
				877	ref_ptr = (unsigned long)(ref + 1) + namelen;
				878	kfree(name);
				879	if (ref_ptr < ref_end)
				880	goto again;
				881
				882	/* finally write the back reference in the inode */
				883	ret = overwrite_item(trans, root, path, eb, slot, key);
				884	BUG_ON(ret);
				885
				886	out_nowrite:
				887	btrfs_release_path(root, path);
				888	iput(dir);
				889	iput(inode);
				890	return 0;
				891	}
				892
				893	/*
				894	* replay one csum item from the log tree into the subvolume 'root'
				895	* eb, slot and key all refer to the log tree
				896	* path is for temp use by this function and should be released on return
				897	*
				898	* This copies the checksums out of the log tree and inserts them into
				899	* the subvolume. Any existing checksums for this range in the file
				900	* are overwritten, and new items are added where required.
				901	*
				902	* We keep this simple by reusing the btrfs_ordered_sum code from
				903	* the data=ordered mode. This basically means making a copy
				904	* of all the checksums in ram, which we have to do anyway for kmap
				905	* rules.
				906	*
				907	* The copy is then sent down to btrfs_csum_file_blocks, which
				908	* does all the hard work of finding existing items in the file
				909	* or adding new ones.
				910	*/
				911	static noinline int replay_one_csum(struct btrfs_trans_handle *trans,
				912	struct btrfs_root *root,
				913	struct btrfs_path *path,
				914	struct extent_buffer *eb, int slot,
				915	struct btrfs_key *key)
				916	{
				917	int ret;
				918	u32 item_size = btrfs_item_size_nr(eb, slot);
				919	u64 cur_offset;
				920	unsigned long file_bytes;
				921	struct btrfs_ordered_sum *sums;
				922	struct btrfs_sector_sum *sector_sum;
				923	struct inode *inode;
				924	unsigned long ptr;
				925
				926	file_bytes = (item_size / BTRFS_CRC32_SIZE) * root->sectorsize;
				927	inode = read_one_inode(root, key->objectid);
				928	if (!inode) {
				929	return -EIO;
				930	}
				931
				932	sums = kzalloc(btrfs_ordered_sum_size(root, file_bytes), GFP_NOFS);
				933	if (!sums) {
				934	iput(inode);
				935	return -ENOMEM;
				936	}
				937
				938	INIT_LIST_HEAD(&sums->list);
				939	sums->len = file_bytes;
				940	sums->file_offset = key->offset;
				941
				942	/*
				943	* copy all the sums into the ordered sum struct
				944	*/
				945	sector_sum = sums->sums;
				946	cur_offset = key->offset;
				947	ptr = btrfs_item_ptr_offset(eb, slot);
				948	while(item_size > 0) {
				949	sector_sum->offset = cur_offset;
				950	read_extent_buffer(eb, &sector_sum->sum, ptr, BTRFS_CRC32_SIZE);
				951	sector_sum++;
				952	item_size -= BTRFS_CRC32_SIZE;
				953	ptr += BTRFS_CRC32_SIZE;
				954	cur_offset += root->sectorsize;
				955	}
				956
				957	/* let btrfs_csum_file_blocks add them into the file */
				958	ret = btrfs_csum_file_blocks(trans, root, inode, sums);
				959	BUG_ON(ret);
				960	kfree(sums);
				961	iput(inode);
				962
				963	return 0;
				964	}
				965	/*
				966	* There are a few corners where the link count of the file can't
				967	* be properly maintained during replay. So, instead of adding
				968	* lots of complexity to the log code, we just scan the backrefs
				969	* for any file that has been through replay.
				970	*
				971	* The scan will update the link count on the inode to reflect the
				972	* number of back refs found. If it goes down to zero, the iput
				973	* will free the inode.
				974	*/
				975	static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
				976	struct btrfs_root *root,
				977	struct inode *inode)
				978	{
				979	struct btrfs_path *path;
				980	int ret;
				981	struct btrfs_key key;
				982	u64 nlink = 0;
				983	unsigned long ptr;
				984	unsigned long ptr_end;
				985	int name_len;
				986
				987	key.objectid = inode->i_ino;
				988	key.type = BTRFS_INODE_REF_KEY;
				989	key.offset = (u64)-1;
				990
				991	path = btrfs_alloc_path();
				992
				993	while(1) {
				994	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				995	if (ret < 0)
				996	break;
				997	if (ret > 0) {
				998	if (path->slots[0] == 0)
				999	break;
				1000	path->slots[0]--;
				1001	}
				1002	btrfs_item_key_to_cpu(path->nodes[0], &key,
				1003	path->slots[0]);
				1004	if (key.objectid != inode->i_ino \|\|
				1005	key.type != BTRFS_INODE_REF_KEY)
				1006	break;
				1007	ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]);
				1008	ptr_end = ptr + btrfs_item_size_nr(path->nodes[0],
				1009	path->slots[0]);
				1010	while(ptr < ptr_end) {
				1011	struct btrfs_inode_ref *ref;
				1012
				1013	ref = (struct btrfs_inode_ref *)ptr;
				1014	name_len = btrfs_inode_ref_name_len(path->nodes[0],
				1015	ref);
				1016	ptr = (unsigned long)(ref + 1) + name_len;
				1017	nlink++;
				1018	}
				1019
				1020	if (key.offset == 0)
				1021	break;
				1022	key.offset--;
				1023	btrfs_release_path(root, path);
				1024	}
				1025	btrfs_free_path(path);
				1026	if (nlink != inode->i_nlink) {
				1027	inode->i_nlink = nlink;
				1028	btrfs_update_inode(trans, root, inode);
				1029	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	1030	BTRFS_I(inode)->index_cnt = (u64)-1;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1031
				1032	return 0;
				1033	}
				1034
				1035	static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
				1036	struct btrfs_root *root,
				1037	struct btrfs_path *path)
				1038	{
				1039	int ret;
				1040	struct btrfs_key key;
				1041	struct inode *inode;
				1042
				1043	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1044	key.type = BTRFS_ORPHAN_ITEM_KEY;
				1045	key.offset = (u64)-1;
				1046	while(1) {
				1047	ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
				1048	if (ret < 0)
				1049	break;
				1050
				1051	if (ret == 1) {
				1052	if (path->slots[0] == 0)
				1053	break;
				1054	path->slots[0]--;
				1055	}
				1056
				1057	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1058	if (key.objectid != BTRFS_TREE_LOG_FIXUP_OBJECTID \|\|
				1059	key.type != BTRFS_ORPHAN_ITEM_KEY)
				1060	break;
				1061
				1062	ret = btrfs_del_item(trans, root, path);
				1063	BUG_ON(ret);
				1064
				1065	btrfs_release_path(root, path);
				1066	inode = read_one_inode(root, key.offset);
				1067	BUG_ON(!inode);
				1068
				1069	ret = fixup_inode_link_count(trans, root, inode);
				1070	BUG_ON(ret);
				1071
				1072	iput(inode);
				1073
				1074	if (key.offset == 0)
				1075	break;
				1076	key.offset--;
				1077	}
				1078	btrfs_release_path(root, path);
				1079	return 0;
				1080	}
				1081
				1082
				1083	/*
				1084	* record a given inode in the fixup dir so we can check its link
				1085	* count when replay is done. The link count is incremented here
				1086	* so the inode won't go away until we check it
				1087	*/
				1088	static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
				1089	struct btrfs_root *root,
				1090	struct btrfs_path *path,
				1091	u64 objectid)
				1092	{
				1093	struct btrfs_key key;
				1094	int ret = 0;
				1095	struct inode *inode;
				1096
				1097	inode = read_one_inode(root, objectid);
				1098	BUG_ON(!inode);
				1099
				1100	key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID;
				1101	btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY);
				1102	key.offset = objectid;
				1103
				1104	ret = btrfs_insert_empty_item(trans, root, path, &key, 0);
				1105
				1106	btrfs_release_path(root, path);
				1107	if (ret == 0) {
				1108	btrfs_inc_nlink(inode);
				1109	btrfs_update_inode(trans, root, inode);
				1110	} else if (ret == -EEXIST) {
				1111	ret = 0;
				1112	} else {
				1113	BUG();
				1114	}
				1115	iput(inode);
				1116
				1117	return ret;
				1118	}
				1119
				1120	/*
				1121	* when replaying the log for a directory, we only insert names
				1122	* for inodes that actually exist. This means an fsync on a directory
				1123	* does not implicitly fsync all the new files in it
				1124	*/
				1125	static noinline int insert_one_name(struct btrfs_trans_handle *trans,
				1126	struct btrfs_root *root,
				1127	struct btrfs_path *path,
				1128	u64 dirid, u64 index,
				1129	char *name, int name_len, u8 type,
				1130	struct btrfs_key *location)
				1131	{
				1132	struct inode *inode;
				1133	struct inode *dir;
				1134	int ret;
				1135
				1136	inode = read_one_inode(root, location->objectid);
				1137	if (!inode)
				1138	return -ENOENT;
				1139
				1140	dir = read_one_inode(root, dirid);
				1141	if (!dir) {
				1142	iput(inode);
				1143	return -EIO;
				1144	}
				1145	ret = btrfs_add_link(trans, dir, inode, name, name_len, 1, index);
				1146
				1147	/* FIXME, put inode into FIXUP list */
				1148
				1149	iput(inode);
				1150	iput(dir);
				1151	return ret;
				1152	}
				1153
				1154	/*
				1155	* take a single entry in a log directory item and replay it into
				1156	* the subvolume.
				1157	*
				1158	* if a conflicting item exists in the subdirectory already,
				1159	* the inode it points to is unlinked and put into the link count
				1160	* fix up tree.
				1161	*
				1162	* If a name from the log points to a file or directory that does
				1163	* not exist in the FS, it is skipped. fsyncs on directories
				1164	* do not force down inodes inside that directory, just changes to the
				1165	* names or unlinks in a directory.
				1166	*/
				1167	static noinline int replay_one_name(struct btrfs_trans_handle *trans,
				1168	struct btrfs_root *root,
				1169	struct btrfs_path *path,
				1170	struct extent_buffer *eb,
				1171	struct btrfs_dir_item *di,
				1172	struct btrfs_key *key)
				1173	{
				1174	char *name;
				1175	int name_len;
				1176	struct btrfs_dir_item *dst_di;
				1177	struct btrfs_key found_key;
				1178	struct btrfs_key log_key;
				1179	struct inode *dir;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1180	u8 log_type;
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1181	int exists;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1182	int ret;
				1183
				1184	dir = read_one_inode(root, key->objectid);
				1185	BUG_ON(!dir);
				1186
				1187	name_len = btrfs_dir_name_len(eb, di);
				1188	name = kmalloc(name_len, GFP_NOFS);
				1189	log_type = btrfs_dir_type(eb, di);
				1190	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1191	name_len);
				1192
				1193	btrfs_dir_item_key_to_cpu(eb, di, &log_key);
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1194	exists = btrfs_lookup_inode(trans, root, path, &log_key, 0);
				1195	if (exists == 0)
				1196	exists = 1;
				1197	else
				1198	exists = 0;
				1199	btrfs_release_path(root, path);
				1200
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1201	if (key->type == BTRFS_DIR_ITEM_KEY) {
				1202	dst_di = btrfs_lookup_dir_item(trans, root, path, key->objectid,
				1203	name, name_len, 1);
				1204	}
				1205	else if (key->type == BTRFS_DIR_INDEX_KEY) {
				1206	dst_di = btrfs_lookup_dir_index_item(trans, root, path,
				1207	key->objectid,
				1208	key->offset, name,
				1209	name_len, 1);
				1210	} else {
				1211	BUG();
				1212	}
				1213	if (!dst_di \|\| IS_ERR(dst_di)) {
				1214	/* we need a sequence number to insert, so we only
				1215	* do inserts for the BTRFS_DIR_INDEX_KEY types
				1216	*/
				1217	if (key->type != BTRFS_DIR_INDEX_KEY)
				1218	goto out;
				1219	goto insert;
				1220	}
				1221
				1222	btrfs_dir_item_key_to_cpu(path->nodes[0], dst_di, &found_key);
				1223	/* the existing item matches the logged item */
				1224	if (found_key.objectid == log_key.objectid &&
				1225	found_key.type == log_key.type &&
				1226	found_key.offset == log_key.offset &&
				1227	btrfs_dir_type(path->nodes[0], dst_di) == log_type) {
				1228	goto out;
				1229	}
				1230
				1231	/*
				1232	* don't drop the conflicting directory entry if the inode
				1233	* for the new entry doesn't exist
				1234	*/
Chris Mason	4bef084	2008-09-08 11:18:08 -0400	[diff] [blame]	1235	if (!exists)
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1236	goto out;
				1237
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1238	ret = drop_one_dir_item(trans, root, path, dir, dst_di);
				1239	BUG_ON(ret);
				1240
				1241	if (key->type == BTRFS_DIR_INDEX_KEY)
				1242	goto insert;
				1243	out:
				1244	btrfs_release_path(root, path);
				1245	kfree(name);
				1246	iput(dir);
				1247	return 0;
				1248
				1249	insert:
				1250	btrfs_release_path(root, path);
				1251	ret = insert_one_name(trans, root, path, key->objectid, key->offset,
				1252	name, name_len, log_type, &log_key);
				1253
				1254	if (ret && ret != -ENOENT)
				1255	BUG();
				1256	goto out;
				1257	}
				1258
				1259	/*
				1260	* find all the names in a directory item and reconcile them into
				1261	* the subvolume. Only BTRFS_DIR_ITEM_KEY types will have more than
				1262	* one name in a directory item, but the same code gets used for
				1263	* both directory index types
				1264	*/
				1265	static noinline int replay_one_dir_item(struct btrfs_trans_handle *trans,
				1266	struct btrfs_root *root,
				1267	struct btrfs_path *path,
				1268	struct extent_buffer *eb, int slot,
				1269	struct btrfs_key *key)
				1270	{
				1271	int ret;
				1272	u32 item_size = btrfs_item_size_nr(eb, slot);
				1273	struct btrfs_dir_item *di;
				1274	int name_len;
				1275	unsigned long ptr;
				1276	unsigned long ptr_end;
				1277
				1278	ptr = btrfs_item_ptr_offset(eb, slot);
				1279	ptr_end = ptr + item_size;
				1280	while(ptr < ptr_end) {
				1281	di = (struct btrfs_dir_item *)ptr;
				1282	name_len = btrfs_dir_name_len(eb, di);
				1283	ret = replay_one_name(trans, root, path, eb, di, key);
				1284	BUG_ON(ret);
				1285	ptr = (unsigned long)(di + 1);
				1286	ptr += name_len;
				1287	}
				1288	return 0;
				1289	}
				1290
				1291	/*
				1292	* directory replay has two parts. There are the standard directory
				1293	* items in the log copied from the subvolume, and range items
				1294	* created in the log while the subvolume was logged.
				1295	*
				1296	* The range items tell us which parts of the key space the log
				1297	* is authoritative for. During replay, if a key in the subvolume
				1298	* directory is in a logged range item, but not actually in the log
				1299	* that means it was deleted from the directory before the fsync
				1300	* and should be removed.
				1301	*/
				1302	static noinline int find_dir_range(struct btrfs_root *root,
				1303	struct btrfs_path *path,
				1304	u64 dirid, int key_type,
				1305	u64 start_ret, u64 end_ret)
				1306	{
				1307	struct btrfs_key key;
				1308	u64 found_end;
				1309	struct btrfs_dir_log_item *item;
				1310	int ret;
				1311	int nritems;
				1312
				1313	if (*start_ret == (u64)-1)
				1314	return 1;
				1315
				1316	key.objectid = dirid;
				1317	key.type = key_type;
				1318	key.offset = *start_ret;
				1319
				1320	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
				1321	if (ret < 0)
				1322	goto out;
				1323	if (ret > 0) {
				1324	if (path->slots[0] == 0)
				1325	goto out;
				1326	path->slots[0]--;
				1327	}
				1328	if (ret != 0)
				1329	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1330
				1331	if (key.type != key_type \|\| key.objectid != dirid) {
				1332	ret = 1;
				1333	goto next;
				1334	}
				1335	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1336	struct btrfs_dir_log_item);
				1337	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1338
				1339	if (start_ret >= key.offset && start_ret <= found_end) {
				1340	ret = 0;
				1341	*start_ret = key.offset;
				1342	*end_ret = found_end;
				1343	goto out;
				1344	}
				1345	ret = 1;
				1346	next:
				1347	/* check the next slot in the tree to see if it is a valid item */
				1348	nritems = btrfs_header_nritems(path->nodes[0]);
				1349	if (path->slots[0] >= nritems) {
				1350	ret = btrfs_next_leaf(root, path);
				1351	if (ret)
				1352	goto out;
				1353	} else {
				1354	path->slots[0]++;
				1355	}
				1356
				1357	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
				1358
				1359	if (key.type != key_type \|\| key.objectid != dirid) {
				1360	ret = 1;
				1361	goto out;
				1362	}
				1363	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				1364	struct btrfs_dir_log_item);
				1365	found_end = btrfs_dir_log_end(path->nodes[0], item);
				1366	*start_ret = key.offset;
				1367	*end_ret = found_end;
				1368	ret = 0;
				1369	out:
				1370	btrfs_release_path(root, path);
				1371	return ret;
				1372	}
				1373
				1374	/*
				1375	* this looks for a given directory item in the log. If the directory
				1376	* item is not in the log, the item is removed and the inode it points
				1377	* to is unlinked
				1378	*/
				1379	static noinline int check_item_in_log(struct btrfs_trans_handle *trans,
				1380	struct btrfs_root *root,
				1381	struct btrfs_root *log,
				1382	struct btrfs_path *path,
				1383	struct btrfs_path *log_path,
				1384	struct inode *dir,
				1385	struct btrfs_key *dir_key)
				1386	{
				1387	int ret;
				1388	struct extent_buffer *eb;
				1389	int slot;
				1390	u32 item_size;
				1391	struct btrfs_dir_item *di;
				1392	struct btrfs_dir_item *log_di;
				1393	int name_len;
				1394	unsigned long ptr;
				1395	unsigned long ptr_end;
				1396	char *name;
				1397	struct inode *inode;
				1398	struct btrfs_key location;
				1399
				1400	again:
				1401	eb = path->nodes[0];
				1402	slot = path->slots[0];
				1403	item_size = btrfs_item_size_nr(eb, slot);
				1404	ptr = btrfs_item_ptr_offset(eb, slot);
				1405	ptr_end = ptr + item_size;
				1406	while(ptr < ptr_end) {
				1407	di = (struct btrfs_dir_item *)ptr;
				1408	name_len = btrfs_dir_name_len(eb, di);
				1409	name = kmalloc(name_len, GFP_NOFS);
				1410	if (!name) {
				1411	ret = -ENOMEM;
				1412	goto out;
				1413	}
				1414	read_extent_buffer(eb, name, (unsigned long)(di + 1),
				1415	name_len);
				1416	log_di = NULL;
				1417	if (dir_key->type == BTRFS_DIR_ITEM_KEY) {
				1418	log_di = btrfs_lookup_dir_item(trans, log, log_path,
				1419	dir_key->objectid,
				1420	name, name_len, 0);
				1421	} else if (dir_key->type == BTRFS_DIR_INDEX_KEY) {
				1422	log_di = btrfs_lookup_dir_index_item(trans, log,
				1423	log_path,
				1424	dir_key->objectid,
				1425	dir_key->offset,
				1426	name, name_len, 0);
				1427	}
				1428	if (!log_di \|\| IS_ERR(log_di)) {
				1429	btrfs_dir_item_key_to_cpu(eb, di, &location);
				1430	btrfs_release_path(root, path);
				1431	btrfs_release_path(log, log_path);
				1432	inode = read_one_inode(root, location.objectid);
				1433	BUG_ON(!inode);
				1434
				1435	ret = link_to_fixup_dir(trans, root,
				1436	path, location.objectid);
				1437	BUG_ON(ret);
				1438	btrfs_inc_nlink(inode);
				1439	ret = btrfs_unlink_inode(trans, root, dir, inode,
				1440	name, name_len);
				1441	BUG_ON(ret);
				1442	kfree(name);
				1443	iput(inode);
				1444
				1445	/* there might still be more names under this key
				1446	* check and repeat if required
				1447	*/
				1448	ret = btrfs_search_slot(NULL, root, dir_key, path,
				1449	0, 0);
				1450	if (ret == 0)
				1451	goto again;
				1452	ret = 0;
				1453	goto out;
				1454	}
				1455	btrfs_release_path(log, log_path);
				1456	kfree(name);
				1457
				1458	ptr = (unsigned long)(di + 1);
				1459	ptr += name_len;
				1460	}
				1461	ret = 0;
				1462	out:
				1463	btrfs_release_path(root, path);
				1464	btrfs_release_path(log, log_path);
				1465	return ret;
				1466	}
				1467
				1468	/*
				1469	* deletion replay happens before we copy any new directory items
				1470	* out of the log or out of backreferences from inodes. It
				1471	* scans the log to find ranges of keys that log is authoritative for,
				1472	* and then scans the directory to find items in those ranges that are
				1473	* not present in the log.
				1474	*
				1475	* Anything we don't find in the log is unlinked and removed from the
				1476	* directory.
				1477	*/
				1478	static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans,
				1479	struct btrfs_root *root,
				1480	struct btrfs_root *log,
				1481	struct btrfs_path *path,
				1482	u64 dirid)
				1483	{
				1484	u64 range_start;
				1485	u64 range_end;
				1486	int key_type = BTRFS_DIR_LOG_ITEM_KEY;
				1487	int ret = 0;
				1488	struct btrfs_key dir_key;
				1489	struct btrfs_key found_key;
				1490	struct btrfs_path *log_path;
				1491	struct inode *dir;
				1492
				1493	dir_key.objectid = dirid;
				1494	dir_key.type = BTRFS_DIR_ITEM_KEY;
				1495	log_path = btrfs_alloc_path();
				1496	if (!log_path)
				1497	return -ENOMEM;
				1498
				1499	dir = read_one_inode(root, dirid);
				1500	/* it isn't an error if the inode isn't there, that can happen
				1501	* because we replay the deletes before we copy in the inode item
				1502	* from the log
				1503	*/
				1504	if (!dir) {
				1505	btrfs_free_path(log_path);
				1506	return 0;
				1507	}
				1508	again:
				1509	range_start = 0;
				1510	range_end = 0;
				1511	while(1) {
				1512	ret = find_dir_range(log, path, dirid, key_type,
				1513	&range_start, &range_end);
				1514	if (ret != 0)
				1515	break;
				1516
				1517	dir_key.offset = range_start;
				1518	while(1) {
				1519	int nritems;
				1520	ret = btrfs_search_slot(NULL, root, &dir_key, path,
				1521	0, 0);
				1522	if (ret < 0)
				1523	goto out;
				1524
				1525	nritems = btrfs_header_nritems(path->nodes[0]);
				1526	if (path->slots[0] >= nritems) {
				1527	ret = btrfs_next_leaf(root, path);
				1528	if (ret)
				1529	break;
				1530	}
				1531	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				1532	path->slots[0]);
				1533	if (found_key.objectid != dirid \|\|
				1534	found_key.type != dir_key.type)
				1535	goto next_type;
				1536
				1537	if (found_key.offset > range_end)
				1538	break;
				1539
				1540	ret = check_item_in_log(trans, root, log, path,
				1541	log_path, dir, &found_key);
				1542	BUG_ON(ret);
				1543	if (found_key.offset == (u64)-1)
				1544	break;
				1545	dir_key.offset = found_key.offset + 1;
				1546	}
				1547	btrfs_release_path(root, path);
				1548	if (range_end == (u64)-1)
				1549	break;
				1550	range_start = range_end + 1;
				1551	}
				1552
				1553	next_type:
				1554	ret = 0;
				1555	if (key_type == BTRFS_DIR_LOG_ITEM_KEY) {
				1556	key_type = BTRFS_DIR_LOG_INDEX_KEY;
				1557	dir_key.type = BTRFS_DIR_INDEX_KEY;
				1558	btrfs_release_path(root, path);
				1559	goto again;
				1560	}
				1561	out:
				1562	btrfs_release_path(root, path);
				1563	btrfs_free_path(log_path);
				1564	iput(dir);
				1565	return ret;
				1566	}
				1567
				1568	/*
				1569	* the process_func used to replay items from the log tree. This
				1570	* gets called in two different stages. The first stage just looks
				1571	* for inodes and makes sure they are all copied into the subvolume.
				1572	*
				1573	* The second stage copies all the other item types from the log into
				1574	* the subvolume. The two stage approach is slower, but gets rid of
				1575	* lots of complexity around inodes referencing other inodes that exist
				1576	* only in the log (references come from either directory items or inode
				1577	* back refs).
				1578	*/
				1579	static int replay_one_buffer(struct btrfs_root log, struct extent_buffer eb,
				1580	struct walk_control *wc, u64 gen)
				1581	{
				1582	int nritems;
				1583	struct btrfs_path *path;
				1584	struct btrfs_root *root = wc->replay_dest;
				1585	struct btrfs_key key;
				1586	u32 item_size;
				1587	int level;
				1588	int i;
				1589	int ret;
				1590
				1591	btrfs_read_buffer(eb, gen);
				1592
				1593	level = btrfs_header_level(eb);
				1594
				1595	if (level != 0)
				1596	return 0;
				1597
				1598	path = btrfs_alloc_path();
				1599	BUG_ON(!path);
				1600
				1601	nritems = btrfs_header_nritems(eb);
				1602	for (i = 0; i < nritems; i++) {
				1603	btrfs_item_key_to_cpu(eb, &key, i);
				1604	item_size = btrfs_item_size_nr(eb, i);
				1605
				1606	/* inode keys are done during the first stage */
				1607	if (key.type == BTRFS_INODE_ITEM_KEY &&
				1608	wc->stage == LOG_WALK_REPLAY_INODES) {
				1609	struct inode *inode;
				1610	struct btrfs_inode_item *inode_item;
				1611	u32 mode;
				1612
				1613	inode_item = btrfs_item_ptr(eb, i,
				1614	struct btrfs_inode_item);
				1615	mode = btrfs_inode_mode(eb, inode_item);
				1616	if (S_ISDIR(mode)) {
				1617	ret = replay_dir_deletes(wc->trans,
				1618	root, log, path, key.objectid);
				1619	BUG_ON(ret);
				1620	}
				1621	ret = overwrite_item(wc->trans, root, path,
				1622	eb, i, &key);
				1623	BUG_ON(ret);
				1624
				1625	/* for regular files, truncate away
				1626	* extents past the new EOF
				1627	*/
				1628	if (S_ISREG(mode)) {
				1629	inode = read_one_inode(root,
				1630	key.objectid);
				1631	BUG_ON(!inode);
				1632
				1633	ret = btrfs_truncate_inode_items(wc->trans,
				1634	root, inode, inode->i_size,
				1635	BTRFS_EXTENT_DATA_KEY);
				1636	BUG_ON(ret);
				1637	iput(inode);
				1638	}
				1639	ret = link_to_fixup_dir(wc->trans, root,
				1640	path, key.objectid);
				1641	BUG_ON(ret);
				1642	}
				1643	if (wc->stage < LOG_WALK_REPLAY_ALL)
				1644	continue;
				1645
				1646	/* these keys are simply copied */
				1647	if (key.type == BTRFS_XATTR_ITEM_KEY) {
				1648	ret = overwrite_item(wc->trans, root, path,
				1649	eb, i, &key);
				1650	BUG_ON(ret);
				1651	} else if (key.type == BTRFS_INODE_REF_KEY) {
				1652	ret = add_inode_ref(wc->trans, root, log, path,
				1653	eb, i, &key);
				1654	BUG_ON(ret && ret != -ENOENT);
				1655	} else if (key.type == BTRFS_EXTENT_DATA_KEY) {
				1656	ret = replay_one_extent(wc->trans, root, path,
				1657	eb, i, &key);
				1658	BUG_ON(ret);
				1659	} else if (key.type == BTRFS_CSUM_ITEM_KEY) {
				1660	ret = replay_one_csum(wc->trans, root, path,
				1661	eb, i, &key);
				1662	BUG_ON(ret);
				1663	} else if (key.type == BTRFS_DIR_ITEM_KEY \|\|
				1664	key.type == BTRFS_DIR_INDEX_KEY) {
				1665	ret = replay_one_dir_item(wc->trans, root, path,
				1666	eb, i, &key);
				1667	BUG_ON(ret);
				1668	}
				1669	}
				1670	btrfs_free_path(path);
				1671	return 0;
				1672	}
				1673
				1674	static int noinline walk_down_log_tree(struct btrfs_trans_handle *trans,
				1675	struct btrfs_root *root,
				1676	struct btrfs_path path, int level,
				1677	struct walk_control *wc)
				1678	{
				1679	u64 root_owner;
				1680	u64 root_gen;
				1681	u64 bytenr;
				1682	u64 ptr_gen;
				1683	struct extent_buffer *next;
				1684	struct extent_buffer *cur;
				1685	struct extent_buffer *parent;
				1686	u32 blocksize;
				1687	int ret = 0;
				1688
				1689	WARN_ON(*level < 0);
				1690	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1691
				1692	while(*level > 0) {
				1693	WARN_ON(*level < 0);
				1694	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1695	cur = path->nodes[*level];
				1696
				1697	if (btrfs_header_level(cur) != *level)
				1698	WARN_ON(1);
				1699
				1700	if (path->slots[*level] >=
				1701	btrfs_header_nritems(cur))
				1702	break;
				1703
				1704	bytenr = btrfs_node_blockptr(cur, path->slots[*level]);
				1705	ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]);
				1706	blocksize = btrfs_level_size(root, *level - 1);
				1707
				1708	parent = path->nodes[*level];
				1709	root_owner = btrfs_header_owner(parent);
				1710	root_gen = btrfs_header_generation(parent);
				1711
				1712	next = btrfs_find_create_tree_block(root, bytenr, blocksize);
				1713
				1714	wc->process_func(root, next, wc, ptr_gen);
				1715
				1716	if (*level == 1) {
				1717	path->slots[*level]++;
				1718	if (wc->free) {
				1719	btrfs_read_buffer(next, ptr_gen);
				1720
				1721	btrfs_tree_lock(next);
				1722	clean_tree_block(trans, root, next);
				1723	btrfs_wait_tree_block_writeback(next);
				1724	btrfs_tree_unlock(next);
				1725
				1726	ret = btrfs_drop_leaf_ref(trans, root, next);
				1727	BUG_ON(ret);
				1728
				1729	WARN_ON(root_owner !=
				1730	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1731	ret = btrfs_free_reserved_extent(root,
				1732	bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1733	BUG_ON(ret);
				1734	}
				1735	free_extent_buffer(next);
				1736	continue;
				1737	}
				1738	btrfs_read_buffer(next, ptr_gen);
				1739
				1740	WARN_ON(*level <= 0);
				1741	if (path->nodes[*level-1])
				1742	free_extent_buffer(path->nodes[*level-1]);
				1743	path->nodes[*level-1] = next;
				1744	*level = btrfs_header_level(next);
				1745	path->slots[*level] = 0;
				1746	cond_resched();
				1747	}
				1748	WARN_ON(*level < 0);
				1749	WARN_ON(*level >= BTRFS_MAX_LEVEL);
				1750
				1751	if (path->nodes[*level] == root->node) {
				1752	parent = path->nodes[*level];
				1753	} else {
				1754	parent = path->nodes[*level + 1];
				1755	}
				1756	bytenr = path->nodes[*level]->start;
				1757
				1758	blocksize = btrfs_level_size(root, *level);
				1759	root_owner = btrfs_header_owner(parent);
				1760	root_gen = btrfs_header_generation(parent);
				1761
				1762	wc->process_func(root, path->nodes[*level], wc,
				1763	btrfs_header_generation(path->nodes[*level]));
				1764
				1765	if (wc->free) {
				1766	next = path->nodes[*level];
				1767	btrfs_tree_lock(next);
				1768	clean_tree_block(trans, root, next);
				1769	btrfs_wait_tree_block_writeback(next);
				1770	btrfs_tree_unlock(next);
				1771
				1772	if (*level == 0) {
				1773	ret = btrfs_drop_leaf_ref(trans, root, next);
				1774	BUG_ON(ret);
				1775	}
				1776	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1777	ret = btrfs_free_reserved_extent(root, bytenr, blocksize);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1778	BUG_ON(ret);
				1779	}
				1780	free_extent_buffer(path->nodes[*level]);
				1781	path->nodes[*level] = NULL;
				1782	*level += 1;
				1783
				1784	cond_resched();
				1785	return 0;
				1786	}
				1787
				1788	static int noinline walk_up_log_tree(struct btrfs_trans_handle *trans,
				1789	struct btrfs_root *root,
				1790	struct btrfs_path path, int level,
				1791	struct walk_control *wc)
				1792	{
				1793	u64 root_owner;
				1794	u64 root_gen;
				1795	int i;
				1796	int slot;
				1797	int ret;
				1798
				1799	for(i = *level; i < BTRFS_MAX_LEVEL - 1 && path->nodes[i]; i++) {
				1800	slot = path->slots[i];
				1801	if (slot < btrfs_header_nritems(path->nodes[i]) - 1) {
				1802	struct extent_buffer *node;
				1803	node = path->nodes[i];
				1804	path->slots[i]++;
				1805	*level = i;
				1806	WARN_ON(*level == 0);
				1807	return 0;
				1808	} else {
				1809	if (path->nodes[*level] == root->node) {
				1810	root_owner = root->root_key.objectid;
				1811	root_gen =
				1812	btrfs_header_generation(path->nodes[*level]);
				1813	} else {
				1814	struct extent_buffer *node;
				1815	node = path->nodes[*level + 1];
				1816	root_owner = btrfs_header_owner(node);
				1817	root_gen = btrfs_header_generation(node);
				1818	}
				1819	wc->process_func(root, path->nodes[*level], wc,
				1820	btrfs_header_generation(path->nodes[*level]));
				1821	if (wc->free) {
				1822	struct extent_buffer *next;
				1823
				1824	next = path->nodes[*level];
				1825
				1826	btrfs_tree_lock(next);
				1827	clean_tree_block(trans, root, next);
				1828	btrfs_wait_tree_block_writeback(next);
				1829	btrfs_tree_unlock(next);
				1830
				1831	if (*level == 0) {
				1832	ret = btrfs_drop_leaf_ref(trans, root,
				1833	next);
				1834	BUG_ON(ret);
				1835	}
				1836
				1837	WARN_ON(root_owner != BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1838	ret = btrfs_free_reserved_extent(root,
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1839	path->nodes[*level]->start,
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1840	path->nodes[*level]->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1841	BUG_ON(ret);
				1842	}
				1843	free_extent_buffer(path->nodes[*level]);
				1844	path->nodes[*level] = NULL;
				1845	*level = i + 1;
				1846	}
				1847	}
				1848	return 1;
				1849	}
				1850
				1851	/*
				1852	* drop the reference count on the tree rooted at 'snap'. This traverses
				1853	* the tree freeing any blocks that have a ref count of zero after being
				1854	* decremented.
				1855	*/
				1856	static int walk_log_tree(struct btrfs_trans_handle *trans,
				1857	struct btrfs_root log, struct walk_control wc)
				1858	{
				1859	int ret = 0;
				1860	int wret;
				1861	int level;
				1862	struct btrfs_path *path;
				1863	int i;
				1864	int orig_level;
				1865
				1866	path = btrfs_alloc_path();
				1867	BUG_ON(!path);
				1868
				1869	level = btrfs_header_level(log->node);
				1870	orig_level = level;
				1871	path->nodes[level] = log->node;
				1872	extent_buffer_get(log->node);
				1873	path->slots[level] = 0;
				1874
				1875	while(1) {
				1876	wret = walk_down_log_tree(trans, log, path, &level, wc);
				1877	if (wret > 0)
				1878	break;
				1879	if (wret < 0)
				1880	ret = wret;
				1881
				1882	wret = walk_up_log_tree(trans, log, path, &level, wc);
				1883	if (wret > 0)
				1884	break;
				1885	if (wret < 0)
				1886	ret = wret;
				1887	}
				1888
				1889	/* was the root node processed? if not, catch it here */
				1890	if (path->nodes[orig_level]) {
				1891	wc->process_func(log, path->nodes[orig_level], wc,
				1892	btrfs_header_generation(path->nodes[orig_level]));
				1893	if (wc->free) {
				1894	struct extent_buffer *next;
				1895
				1896	next = path->nodes[orig_level];
				1897
				1898	btrfs_tree_lock(next);
				1899	clean_tree_block(trans, log, next);
				1900	btrfs_wait_tree_block_writeback(next);
				1901	btrfs_tree_unlock(next);
				1902
				1903	if (orig_level == 0) {
				1904	ret = btrfs_drop_leaf_ref(trans, log,
				1905	next);
				1906	BUG_ON(ret);
				1907	}
				1908	WARN_ON(log->root_key.objectid !=
				1909	BTRFS_TREE_LOG_OBJECTID);
Chris Mason	d00aff0	2008-09-11 15:54:42 -0400	[diff] [blame]	1910	ret = btrfs_free_reserved_extent(log, next->start,
				1911	next->len);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1912	BUG_ON(ret);
				1913	}
				1914	}
				1915
				1916	for (i = 0; i <= orig_level; i++) {
				1917	if (path->nodes[i]) {
				1918	free_extent_buffer(path->nodes[i]);
				1919	path->nodes[i] = NULL;
				1920	}
				1921	}
				1922	btrfs_free_path(path);
				1923	if (wc->free)
				1924	free_extent_buffer(log->node);
				1925	return ret;
				1926	}
				1927
				1928	int wait_log_commit(struct btrfs_root *log)
				1929	{
				1930	DEFINE_WAIT(wait);
				1931	u64 transid = log->fs_info->tree_log_transid;
				1932
				1933	do {
				1934	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1935	TASK_UNINTERRUPTIBLE);
				1936	mutex_unlock(&log->fs_info->tree_log_mutex);
				1937	if (atomic_read(&log->fs_info->tree_log_commit))
				1938	schedule();
				1939	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1940	mutex_lock(&log->fs_info->tree_log_mutex);
				1941	} while(transid == log->fs_info->tree_log_transid &&
				1942	atomic_read(&log->fs_info->tree_log_commit));
				1943	return 0;
				1944	}
				1945
				1946	/*
				1947	* btrfs_sync_log does sends a given tree log down to the disk and
				1948	* updates the super blocks to record it. When this call is done,
				1949	* you know that any inodes previously logged are safely on disk
				1950	*/
				1951	int btrfs_sync_log(struct btrfs_trans_handle *trans,
				1952	struct btrfs_root *root)
				1953	{
				1954	int ret;
				1955	unsigned long batch;
				1956	struct btrfs_root *log = root->log_root;
				1957	struct walk_control wc = {
				1958	.write = 1,
				1959	.process_func = process_one_buffer
				1960	};
				1961
				1962	mutex_lock(&log->fs_info->tree_log_mutex);
				1963	if (atomic_read(&log->fs_info->tree_log_commit)) {
				1964	wait_log_commit(log);
				1965	goto out;
				1966	}
				1967	atomic_set(&log->fs_info->tree_log_commit, 1);
				1968
				1969	while(1) {
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	1970	batch = log->fs_info->tree_log_batch;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1971	mutex_unlock(&log->fs_info->tree_log_mutex);
				1972	schedule_timeout_uninterruptible(1);
				1973	mutex_lock(&log->fs_info->tree_log_mutex);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1974
				1975	while(atomic_read(&log->fs_info->tree_log_writers)) {
				1976	DEFINE_WAIT(wait);
				1977	prepare_to_wait(&log->fs_info->tree_log_wait, &wait,
				1978	TASK_UNINTERRUPTIBLE);
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	1979	mutex_unlock(&log->fs_info->tree_log_mutex);
				1980	if (atomic_read(&log->fs_info->tree_log_writers))
				1981	schedule();
				1982	mutex_lock(&log->fs_info->tree_log_mutex);
				1983	finish_wait(&log->fs_info->tree_log_wait, &wait);
				1984	}
				1985	if (batch == log->fs_info->tree_log_batch)
				1986	break;
				1987	}
				1988	ret = walk_log_tree(trans, log, &wc);
				1989	BUG_ON(ret);
				1990
				1991	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
				1992	BUG_ON(ret);
				1993
				1994	wc.wait = 1;
				1995
				1996	ret = walk_log_tree(trans, log, &wc);
				1997	BUG_ON(ret);
				1998
				1999	ret = walk_log_tree(trans, log->fs_info->log_root_tree, &wc);
				2000	BUG_ON(ret);
				2001
				2002	btrfs_set_super_log_root(&root->fs_info->super_for_commit,
				2003	log->fs_info->log_root_tree->node->start);
				2004	btrfs_set_super_log_root_level(&root->fs_info->super_for_commit,
				2005	btrfs_header_level(log->fs_info->log_root_tree->node));
				2006
				2007	write_ctree_super(trans, log->fs_info->tree_root);
				2008	log->fs_info->tree_log_transid++;
				2009	log->fs_info->tree_log_batch = 0;
				2010	atomic_set(&log->fs_info->tree_log_commit, 0);
				2011	smp_mb();
				2012	if (waitqueue_active(&log->fs_info->tree_log_wait))
				2013	wake_up(&log->fs_info->tree_log_wait);
				2014	out:
				2015	mutex_unlock(&log->fs_info->tree_log_mutex);
				2016	return 0;
				2017
				2018	}
				2019
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2020	/* * free all the extents used by the tree log. This should be called
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2021	* at commit time of the full transaction
				2022	*/
				2023	int btrfs_free_log(struct btrfs_trans_handle trans, struct btrfs_root root)
				2024	{
				2025	int ret;
				2026	struct btrfs_root *log;
				2027	struct key;
				2028	struct walk_control wc = {
				2029	.free = 1,
				2030	.process_func = process_one_buffer
				2031	};
				2032
				2033	if (!root->log_root)
				2034	return 0;
				2035
				2036	log = root->log_root;
				2037	ret = walk_log_tree(trans, log, &wc);
				2038	BUG_ON(ret);
				2039
				2040	log = root->log_root;
				2041	ret = btrfs_del_root(trans, root->fs_info->log_root_tree,
				2042	&log->root_key);
				2043	BUG_ON(ret);
				2044	root->log_root = NULL;
				2045	kfree(root->log_root);
				2046	return 0;
				2047	}
				2048
				2049	/*
				2050	* helper function to update the item for a given subvolumes log root
				2051	* in the tree of log roots
				2052	*/
				2053	static int update_log_root(struct btrfs_trans_handle *trans,
				2054	struct btrfs_root *log)
				2055	{
				2056	u64 bytenr = btrfs_root_bytenr(&log->root_item);
				2057	int ret;
				2058
				2059	if (log->node->start == bytenr)
				2060	return 0;
				2061
				2062	btrfs_set_root_bytenr(&log->root_item, log->node->start);
				2063	btrfs_set_root_level(&log->root_item, btrfs_header_level(log->node));
				2064	ret = btrfs_update_root(trans, log->fs_info->log_root_tree,
				2065	&log->root_key, &log->root_item);
				2066	BUG_ON(ret);
				2067	return ret;
				2068	}
				2069
				2070	/*
				2071	* If both a file and directory are logged, and unlinks or renames are
				2072	* mixed in, we have a few interesting corners:
				2073	*
				2074	* create file X in dir Y
				2075	* link file X to X.link in dir Y
				2076	* fsync file X
				2077	* unlink file X but leave X.link
				2078	* fsync dir Y
				2079	*
				2080	* After a crash we would expect only X.link to exist. But file X
				2081	* didn't get fsync'd again so the log has back refs for X and X.link.
				2082	*
				2083	* We solve this by removing directory entries and inode backrefs from the
				2084	* log when a file that was logged in the current transaction is
				2085	* unlinked. Any later fsync will include the updated log entries, and
				2086	* we'll be able to reconstruct the proper directory items from backrefs.
				2087	*
				2088	* This optimizations allows us to avoid relogging the entire inode
				2089	* or the entire directory.
				2090	*/
				2091	int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans,
				2092	struct btrfs_root *root,
				2093	const char *name, int name_len,
				2094	struct inode *dir, u64 index)
				2095	{
				2096	struct btrfs_root *log;
				2097	struct btrfs_dir_item *di;
				2098	struct btrfs_path *path;
				2099	int ret;
				2100	int bytes_del = 0;
				2101
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2102	if (BTRFS_I(dir)->logged_trans < trans->transid)
				2103	return 0;
				2104
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2105	ret = join_running_log_trans(root);
				2106	if (ret)
				2107	return 0;
				2108
				2109	mutex_lock(&BTRFS_I(dir)->log_mutex);
				2110
				2111	log = root->log_root;
				2112	path = btrfs_alloc_path();
				2113	di = btrfs_lookup_dir_item(trans, log, path, dir->i_ino,
				2114	name, name_len, -1);
				2115	if (di && !IS_ERR(di)) {
				2116	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2117	bytes_del += name_len;
				2118	BUG_ON(ret);
				2119	}
				2120	btrfs_release_path(log, path);
				2121	di = btrfs_lookup_dir_index_item(trans, log, path, dir->i_ino,
				2122	index, name, name_len, -1);
				2123	if (di && !IS_ERR(di)) {
				2124	ret = btrfs_delete_one_dir_name(trans, log, path, di);
				2125	bytes_del += name_len;
				2126	BUG_ON(ret);
				2127	}
				2128
				2129	/* update the directory size in the log to reflect the names
				2130	* we have removed
				2131	*/
				2132	if (bytes_del) {
				2133	struct btrfs_key key;
				2134
				2135	key.objectid = dir->i_ino;
				2136	key.offset = 0;
				2137	key.type = BTRFS_INODE_ITEM_KEY;
				2138	btrfs_release_path(log, path);
				2139
				2140	ret = btrfs_search_slot(trans, log, &key, path, 0, 1);
				2141	if (ret == 0) {
				2142	struct btrfs_inode_item *item;
				2143	u64 i_size;
				2144
				2145	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2146	struct btrfs_inode_item);
				2147	i_size = btrfs_inode_size(path->nodes[0], item);
				2148	if (i_size > bytes_del)
				2149	i_size -= bytes_del;
				2150	else
				2151	i_size = 0;
				2152	btrfs_set_inode_size(path->nodes[0], item, i_size);
				2153	btrfs_mark_buffer_dirty(path->nodes[0]);
				2154	} else
				2155	ret = 0;
				2156	btrfs_release_path(log, path);
				2157	}
				2158
				2159	btrfs_free_path(path);
				2160	mutex_unlock(&BTRFS_I(dir)->log_mutex);
				2161	end_log_trans(root);
				2162
				2163	return 0;
				2164	}
				2165
				2166	/* see comments for btrfs_del_dir_entries_in_log */
				2167	int btrfs_del_inode_ref_in_log(struct btrfs_trans_handle *trans,
				2168	struct btrfs_root *root,
				2169	const char *name, int name_len,
				2170	struct inode *inode, u64 dirid)
				2171	{
				2172	struct btrfs_root *log;
				2173	u64 index;
				2174	int ret;
				2175
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2176	if (BTRFS_I(inode)->logged_trans < trans->transid)
				2177	return 0;
				2178
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2179	ret = join_running_log_trans(root);
				2180	if (ret)
				2181	return 0;
				2182	log = root->log_root;
				2183	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2184
				2185	ret = btrfs_del_inode_ref(trans, log, name, name_len, inode->i_ino,
				2186	dirid, &index);
				2187	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2188	end_log_trans(root);
				2189
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2190	return ret;
				2191	}
				2192
				2193	/*
				2194	* creates a range item in the log for 'dirid'. first_offset and
				2195	* last_offset tell us which parts of the key space the log should
				2196	* be considered authoritative for.
				2197	*/
				2198	static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans,
				2199	struct btrfs_root *log,
				2200	struct btrfs_path *path,
				2201	int key_type, u64 dirid,
				2202	u64 first_offset, u64 last_offset)
				2203	{
				2204	int ret;
				2205	struct btrfs_key key;
				2206	struct btrfs_dir_log_item *item;
				2207
				2208	key.objectid = dirid;
				2209	key.offset = first_offset;
				2210	if (key_type == BTRFS_DIR_ITEM_KEY)
				2211	key.type = BTRFS_DIR_LOG_ITEM_KEY;
				2212	else
				2213	key.type = BTRFS_DIR_LOG_INDEX_KEY;
				2214	ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item));
				2215	BUG_ON(ret);
				2216
				2217	item = btrfs_item_ptr(path->nodes[0], path->slots[0],
				2218	struct btrfs_dir_log_item);
				2219	btrfs_set_dir_log_end(path->nodes[0], item, last_offset);
				2220	btrfs_mark_buffer_dirty(path->nodes[0]);
				2221	btrfs_release_path(log, path);
				2222	return 0;
				2223	}
				2224
				2225	/*
				2226	* log all the items included in the current transaction for a given
				2227	* directory. This also creates the range items in the log tree required
				2228	* to replay anything deleted before the fsync
				2229	*/
				2230	static noinline int log_dir_items(struct btrfs_trans_handle *trans,
				2231	struct btrfs_root root, struct inode inode,
				2232	struct btrfs_path *path,
				2233	struct btrfs_path *dst_path, int key_type,
				2234	u64 min_offset, u64 *last_offset_ret)
				2235	{
				2236	struct btrfs_key min_key;
				2237	struct btrfs_key max_key;
				2238	struct btrfs_root *log = root->log_root;
				2239	struct extent_buffer *src;
				2240	int ret;
				2241	int i;
				2242	int nritems;
				2243	u64 first_offset = min_offset;
				2244	u64 last_offset = (u64)-1;
				2245
				2246	log = root->log_root;
				2247	max_key.objectid = inode->i_ino;
				2248	max_key.offset = (u64)-1;
				2249	max_key.type = key_type;
				2250
				2251	min_key.objectid = inode->i_ino;
				2252	min_key.type = key_type;
				2253	min_key.offset = min_offset;
				2254
				2255	path->keep_locks = 1;
				2256
				2257	ret = btrfs_search_forward(root, &min_key, &max_key,
				2258	path, 0, trans->transid);
				2259
				2260	/*
				2261	* we didn't find anything from this transaction, see if there
				2262	* is anything at all
				2263	*/
				2264	if (ret != 0 \|\| min_key.objectid != inode->i_ino \|\|
				2265	min_key.type != key_type) {
				2266	min_key.objectid = inode->i_ino;
				2267	min_key.type = key_type;
				2268	min_key.offset = (u64)-1;
				2269	btrfs_release_path(root, path);
				2270	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2271	if (ret < 0) {
				2272	btrfs_release_path(root, path);
				2273	return ret;
				2274	}
				2275	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2276
				2277	/* if ret == 0 there are items for this type,
				2278	* create a range to tell us the last key of this type.
				2279	* otherwise, there are no items in this directory after
				2280	* *min_offset, and we create a range to indicate that.
				2281	*/
				2282	if (ret == 0) {
				2283	struct btrfs_key tmp;
				2284	btrfs_item_key_to_cpu(path->nodes[0], &tmp,
				2285	path->slots[0]);
				2286	if (key_type == tmp.type) {
				2287	first_offset = max(min_offset, tmp.offset) + 1;
				2288	}
				2289	}
				2290	goto done;
				2291	}
				2292
				2293	/* go backward to find any previous key */
				2294	ret = btrfs_previous_item(root, path, inode->i_ino, key_type);
				2295	if (ret == 0) {
				2296	struct btrfs_key tmp;
				2297	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2298	if (key_type == tmp.type) {
				2299	first_offset = tmp.offset;
				2300	ret = overwrite_item(trans, log, dst_path,
				2301	path->nodes[0], path->slots[0],
				2302	&tmp);
				2303	}
				2304	}
				2305	btrfs_release_path(root, path);
				2306
				2307	/* find the first key from this transaction again */
				2308	ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
				2309	if (ret != 0) {
				2310	WARN_ON(1);
				2311	goto done;
				2312	}
				2313
				2314	/*
				2315	* we have a block from this transaction, log every item in it
				2316	* from our directory
				2317	*/
				2318	while(1) {
				2319	struct btrfs_key tmp;
				2320	src = path->nodes[0];
				2321	nritems = btrfs_header_nritems(src);
				2322	for (i = path->slots[0]; i < nritems; i++) {
				2323	btrfs_item_key_to_cpu(src, &min_key, i);
				2324
				2325	if (min_key.objectid != inode->i_ino \|\|
				2326	min_key.type != key_type)
				2327	goto done;
				2328	ret = overwrite_item(trans, log, dst_path, src, i,
				2329	&min_key);
				2330	BUG_ON(ret);
				2331	}
				2332	path->slots[0] = nritems;
				2333
				2334	/*
				2335	* look ahead to the next item and see if it is also
				2336	* from this directory and from this transaction
				2337	*/
				2338	ret = btrfs_next_leaf(root, path);
				2339	if (ret == 1) {
				2340	last_offset = (u64)-1;
				2341	goto done;
				2342	}
				2343	btrfs_item_key_to_cpu(path->nodes[0], &tmp, path->slots[0]);
				2344	if (tmp.objectid != inode->i_ino \|\| tmp.type != key_type) {
				2345	last_offset = (u64)-1;
				2346	goto done;
				2347	}
				2348	if (btrfs_header_generation(path->nodes[0]) != trans->transid) {
				2349	ret = overwrite_item(trans, log, dst_path,
				2350	path->nodes[0], path->slots[0],
				2351	&tmp);
				2352
				2353	BUG_ON(ret);
				2354	last_offset = tmp.offset;
				2355	goto done;
				2356	}
				2357	}
				2358	done:
				2359	*last_offset_ret = last_offset;
				2360	btrfs_release_path(root, path);
				2361	btrfs_release_path(log, dst_path);
				2362
				2363	/* insert the log range keys to indicate where the log is valid */
				2364	ret = insert_dir_log_key(trans, log, path, key_type, inode->i_ino,
				2365	first_offset, last_offset);
				2366	BUG_ON(ret);
				2367	return 0;
				2368	}
				2369
				2370	/*
				2371	* logging directories is very similar to logging inodes, We find all the items
				2372	* from the current transaction and write them to the log.
				2373	*
				2374	* The recovery code scans the directory in the subvolume, and if it finds a
				2375	* key in the range logged that is not present in the log tree, then it means
				2376	* that dir entry was unlinked during the transaction.
				2377	*
				2378	* In order for that scan to work, we must include one key smaller than
				2379	* the smallest logged by this transaction and one key larger than the largest
				2380	* key logged by this transaction.
				2381	*/
				2382	static noinline int log_directory_changes(struct btrfs_trans_handle *trans,
				2383	struct btrfs_root root, struct inode inode,
				2384	struct btrfs_path *path,
				2385	struct btrfs_path *dst_path)
				2386	{
				2387	u64 min_key;
				2388	u64 max_key;
				2389	int ret;
				2390	int key_type = BTRFS_DIR_ITEM_KEY;
				2391
				2392	again:
				2393	min_key = 0;
				2394	max_key = 0;
				2395	while(1) {
				2396	ret = log_dir_items(trans, root, inode, path,
				2397	dst_path, key_type, min_key,
				2398	&max_key);
				2399	BUG_ON(ret);
				2400	if (max_key == (u64)-1)
				2401	break;
				2402	min_key = max_key + 1;
				2403	}
				2404
				2405	if (key_type == BTRFS_DIR_ITEM_KEY) {
				2406	key_type = BTRFS_DIR_INDEX_KEY;
				2407	goto again;
				2408	}
				2409	return 0;
				2410	}
				2411
				2412	/*
				2413	* a helper function to drop items from the log before we relog an
				2414	* inode. max_key_type indicates the highest item type to remove.
				2415	* This cannot be run for file data extents because it does not
				2416	* free the extents they point to.
				2417	*/
				2418	static int drop_objectid_items(struct btrfs_trans_handle *trans,
				2419	struct btrfs_root *log,
				2420	struct btrfs_path *path,
				2421	u64 objectid, int max_key_type)
				2422	{
				2423	int ret;
				2424	struct btrfs_key key;
				2425	struct btrfs_key found_key;
				2426
				2427	key.objectid = objectid;
				2428	key.type = max_key_type;
				2429	key.offset = (u64)-1;
				2430
				2431	while(1) {
				2432	ret = btrfs_search_slot(trans, log, &key, path, -1, 1);
				2433
				2434	if (ret != 1)
				2435	break;
				2436
				2437	if (path->slots[0] == 0)
				2438	break;
				2439
				2440	path->slots[0]--;
				2441	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2442	path->slots[0]);
				2443
				2444	if (found_key.objectid != objectid)
				2445	break;
				2446
				2447	ret = btrfs_del_item(trans, log, path);
				2448	BUG_ON(ret);
				2449	btrfs_release_path(log, path);
				2450	}
				2451	btrfs_release_path(log, path);
				2452	return 0;
				2453	}
				2454
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2455	static noinline int copy_items(struct btrfs_trans_handle *trans,
				2456	struct btrfs_root *log,
				2457	struct btrfs_path *dst_path,
				2458	struct extent_buffer *src,
				2459	int start_slot, int nr, int inode_only)
				2460	{
				2461	unsigned long src_offset;
				2462	unsigned long dst_offset;
				2463	struct btrfs_file_extent_item *extent;
				2464	struct btrfs_inode_item *inode_item;
				2465	int ret;
				2466	struct btrfs_key *ins_keys;
				2467	u32 *ins_sizes;
				2468	char *ins_data;
				2469	int i;
				2470
				2471	ins_data = kmalloc(nr * sizeof(struct btrfs_key) +
				2472	nr * sizeof(u32), GFP_NOFS);
				2473	ins_sizes = (u32 *)ins_data;
				2474	ins_keys = (struct btrfs_key )(ins_data + nr sizeof(u32));
				2475
				2476	for (i = 0; i < nr; i++) {
				2477	ins_sizes[i] = btrfs_item_size_nr(src, i + start_slot);
				2478	btrfs_item_key_to_cpu(src, ins_keys + i, i + start_slot);
				2479	}
				2480	ret = btrfs_insert_empty_items(trans, log, dst_path,
				2481	ins_keys, ins_sizes, nr);
				2482	BUG_ON(ret);
				2483
				2484	for (i = 0; i < nr; i++) {
				2485	dst_offset = btrfs_item_ptr_offset(dst_path->nodes[0],
				2486	dst_path->slots[0]);
				2487
				2488	src_offset = btrfs_item_ptr_offset(src, start_slot + i);
				2489
				2490	copy_extent_buffer(dst_path->nodes[0], src, dst_offset,
				2491	src_offset, ins_sizes[i]);
				2492
				2493	if (inode_only == LOG_INODE_EXISTS &&
				2494	ins_keys[i].type == BTRFS_INODE_ITEM_KEY) {
				2495	inode_item = btrfs_item_ptr(dst_path->nodes[0],
				2496	dst_path->slots[0],
				2497	struct btrfs_inode_item);
				2498	btrfs_set_inode_size(dst_path->nodes[0], inode_item, 0);
				2499
				2500	/* set the generation to zero so the recover code
				2501	* can tell the difference between an logging
				2502	* just to say 'this inode exists' and a logging
				2503	* to say 'update this inode with these values'
				2504	*/
				2505	btrfs_set_inode_generation(dst_path->nodes[0],
				2506	inode_item, 0);
				2507	}
				2508	/* take a reference on file data extents so that truncates
				2509	* or deletes of this inode don't have to relog the inode
				2510	* again
				2511	*/
				2512	if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY) {
				2513	int found_type;
				2514	extent = btrfs_item_ptr(src, start_slot + i,
				2515	struct btrfs_file_extent_item);
				2516
				2517	found_type = btrfs_file_extent_type(src, extent);
				2518	if (found_type == BTRFS_FILE_EXTENT_REG) {
				2519	u64 ds = btrfs_file_extent_disk_bytenr(src,
				2520	extent);
				2521	u64 dl = btrfs_file_extent_disk_num_bytes(src,
				2522	extent);
				2523	/* ds == 0 is a hole */
				2524	if (ds != 0) {
				2525	ret = btrfs_inc_extent_ref(trans, log,
				2526	ds, dl,
				2527	BTRFS_TREE_LOG_OBJECTID,
				2528	0, ins_keys[i].objectid,
				2529	ins_keys[i].offset);
				2530	BUG_ON(ret);
				2531	}
				2532	}
				2533	}
				2534	dst_path->slots[0]++;
				2535	}
				2536
				2537	btrfs_mark_buffer_dirty(dst_path->nodes[0]);
				2538	btrfs_release_path(log, dst_path);
				2539	kfree(ins_data);
				2540	return 0;
				2541	}
				2542
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2543	/* log a single inode in the tree log.
				2544	* At least one parent directory for this inode must exist in the tree
				2545	* or be logged already.
				2546	*
				2547	* Any items from this inode changed by the current transaction are copied
				2548	* to the log tree. An extra reference is taken on any extents in this
				2549	* file, allowing us to avoid a whole pile of corner cases around logging
				2550	* blocks that have been removed from the tree.
				2551	*
				2552	* See LOG_INODE_ALL and related defines for a description of what inode_only
				2553	* does.
				2554	*
				2555	* This handles both files and directories.
				2556	*/
				2557	static int __btrfs_log_inode(struct btrfs_trans_handle *trans,
				2558	struct btrfs_root root, struct inode inode,
				2559	int inode_only)
				2560	{
				2561	struct btrfs_path *path;
				2562	struct btrfs_path *dst_path;
				2563	struct btrfs_key min_key;
				2564	struct btrfs_key max_key;
				2565	struct btrfs_root *log = root->log_root;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2566	struct extent_buffer *src = NULL;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2567	u32 size;
				2568	int ret;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2569	int nritems;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2570	int ins_start_slot = 0;
				2571	int ins_nr;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2572
				2573	log = root->log_root;
				2574
				2575	path = btrfs_alloc_path();
				2576	dst_path = btrfs_alloc_path();
				2577
				2578	min_key.objectid = inode->i_ino;
				2579	min_key.type = BTRFS_INODE_ITEM_KEY;
				2580	min_key.offset = 0;
				2581
				2582	max_key.objectid = inode->i_ino;
				2583	if (inode_only == LOG_INODE_EXISTS \|\| S_ISDIR(inode->i_mode))
				2584	max_key.type = BTRFS_XATTR_ITEM_KEY;
				2585	else
				2586	max_key.type = (u8)-1;
				2587	max_key.offset = (u64)-1;
				2588
				2589	/*
				2590	* if this inode has already been logged and we're in inode_only
				2591	* mode, we don't want to delete the things that have already
				2592	* been written to the log.
				2593	*
				2594	* But, if the inode has been through an inode_only log,
				2595	* the logged_trans field is not set. This allows us to catch
				2596	* any new names for this inode in the backrefs by logging it
				2597	* again
				2598	*/
				2599	if (inode_only == LOG_INODE_EXISTS &&
				2600	BTRFS_I(inode)->logged_trans == trans->transid) {
				2601	btrfs_free_path(path);
				2602	btrfs_free_path(dst_path);
				2603	goto out;
				2604	}
				2605	mutex_lock(&BTRFS_I(inode)->log_mutex);
				2606
				2607	/*
				2608	* a brute force approach to making sure we get the most uptodate
				2609	* copies of everything.
				2610	*/
				2611	if (S_ISDIR(inode->i_mode)) {
				2612	int max_key_type = BTRFS_DIR_LOG_INDEX_KEY;
				2613
				2614	if (inode_only == LOG_INODE_EXISTS)
				2615	max_key_type = BTRFS_XATTR_ITEM_KEY;
				2616	ret = drop_objectid_items(trans, log, path,
				2617	inode->i_ino, max_key_type);
				2618	} else {
				2619	ret = btrfs_truncate_inode_items(trans, log, inode, 0, 0);
				2620	}
				2621	BUG_ON(ret);
				2622	path->keep_locks = 1;
				2623
				2624	while(1) {
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2625	ins_nr = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2626	ret = btrfs_search_forward(root, &min_key, &max_key,
				2627	path, 0, trans->transid);
				2628	if (ret != 0)
				2629	break;
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2630	again:
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2631	/* note, ins_nr might be > 0 here, cleanup outside the loop */
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2632	if (min_key.objectid != inode->i_ino)
				2633	break;
				2634	if (min_key.type > max_key.type)
				2635	break;
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2636
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2637	src = path->nodes[0];
				2638	size = btrfs_item_size_nr(src, path->slots[0]);
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2639	if (ins_nr && ins_start_slot + ins_nr == path->slots[0]) {
				2640	ins_nr++;
				2641	goto next_slot;
				2642	} else if (!ins_nr) {
				2643	ins_start_slot = path->slots[0];
				2644	ins_nr = 1;
				2645	goto next_slot;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2646	}
				2647
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2648	ret = copy_items(trans, log, dst_path, src, ins_start_slot,
				2649	ins_nr, inode_only);
				2650	BUG_ON(ret);
				2651	ins_nr = 1;
				2652	ins_start_slot = path->slots[0];
				2653	next_slot:
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2654
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2655	nritems = btrfs_header_nritems(path->nodes[0]);
				2656	path->slots[0]++;
				2657	if (path->slots[0] < nritems) {
				2658	btrfs_item_key_to_cpu(path->nodes[0], &min_key,
				2659	path->slots[0]);
				2660	goto again;
				2661	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2662	if (ins_nr) {
				2663	ret = copy_items(trans, log, dst_path, src,
				2664	ins_start_slot,
				2665	ins_nr, inode_only);
				2666	BUG_ON(ret);
				2667	ins_nr = 0;
				2668	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2669	btrfs_release_path(root, path);
				2670
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2671	if (min_key.offset < (u64)-1)
				2672	min_key.offset++;
				2673	else if (min_key.type < (u8)-1)
				2674	min_key.type++;
				2675	else if (min_key.objectid < (u64)-1)
				2676	min_key.objectid++;
				2677	else
				2678	break;
				2679	}
Chris Mason	31ff1cd	2008-09-11 16:17:57 -0400	[diff] [blame^]	2680	if (ins_nr) {
				2681	ret = copy_items(trans, log, dst_path, src,
				2682	ins_start_slot,
				2683	ins_nr, inode_only);
				2684	BUG_ON(ret);
				2685	ins_nr = 0;
				2686	}
				2687	WARN_ON(ins_nr);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2688	if (inode_only == LOG_INODE_ALL && S_ISDIR(inode->i_mode) &&
				2689	BTRFS_I(inode)->log_dirty_trans >= trans->transid) {
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2690	btrfs_release_path(root, path);
				2691	btrfs_release_path(log, dst_path);
Chris Mason	49eb7e4	2008-09-11 15:53:12 -0400	[diff] [blame]	2692	BTRFS_I(inode)->log_dirty_trans = 0;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2693	ret = log_directory_changes(trans, root, inode, path, dst_path);
				2694	BUG_ON(ret);
				2695	}
Chris Mason	3a5f1d4	2008-09-11 15:53:37 -0400	[diff] [blame]	2696	BTRFS_I(inode)->logged_trans = trans->transid;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2697	mutex_unlock(&BTRFS_I(inode)->log_mutex);
				2698
				2699	btrfs_free_path(path);
				2700	btrfs_free_path(dst_path);
				2701
				2702	mutex_lock(&root->fs_info->tree_log_mutex);
				2703	ret = update_log_root(trans, log);
				2704	BUG_ON(ret);
				2705	mutex_unlock(&root->fs_info->tree_log_mutex);
				2706	out:
				2707	return 0;
				2708	}
				2709
				2710	int btrfs_log_inode(struct btrfs_trans_handle *trans,
				2711	struct btrfs_root root, struct inode inode,
				2712	int inode_only)
				2713	{
				2714	int ret;
				2715
				2716	start_log_trans(trans, root);
				2717	ret = __btrfs_log_inode(trans, root, inode, inode_only);
				2718	end_log_trans(root);
				2719	return ret;
				2720	}
				2721
				2722	/*
				2723	* helper function around btrfs_log_inode to make sure newly created
				2724	* parent directories also end up in the log. A minimal inode and backref
				2725	* only logging is done of any parent directories that are older than
				2726	* the last committed transaction
				2727	*/
				2728	int btrfs_log_dentry(struct btrfs_trans_handle *trans,
				2729	struct btrfs_root root, struct dentry dentry)
				2730	{
				2731	int inode_only = LOG_INODE_ALL;
				2732	struct super_block *sb;
				2733	int ret;
				2734
				2735	start_log_trans(trans, root);
				2736	sb = dentry->d_inode->i_sb;
				2737	while(1) {
				2738	ret = __btrfs_log_inode(trans, root, dentry->d_inode,
				2739	inode_only);
				2740	BUG_ON(ret);
				2741	inode_only = LOG_INODE_EXISTS;
				2742
				2743	dentry = dentry->d_parent;
				2744	if (!dentry \|\| !dentry->d_inode \|\| sb != dentry->d_inode->i_sb)
				2745	break;
				2746
				2747	if (BTRFS_I(dentry->d_inode)->generation <=
				2748	root->fs_info->last_trans_committed)
				2749	break;
				2750	}
				2751	end_log_trans(root);
				2752	return 0;
				2753	}
				2754
				2755	/*
				2756	* it is not safe to log dentry if the chunk root has added new
				2757	* chunks. This returns 0 if the dentry was logged, and 1 otherwise.
				2758	* If this returns 1, you must commit the transaction to safely get your
				2759	* data on disk.
				2760	*/
				2761	int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans,
				2762	struct btrfs_root root, struct dentry dentry)
				2763	{
				2764	u64 gen;
				2765	gen = root->fs_info->last_trans_new_blockgroup;
				2766	if (gen > root->fs_info->last_trans_committed)
				2767	return 1;
				2768	else
				2769	return btrfs_log_dentry(trans, root, dentry);
				2770	}
				2771
				2772	/*
				2773	* should be called during mount to recover any replay any log trees
				2774	* from the FS
				2775	*/
				2776	int btrfs_recover_log_trees(struct btrfs_root *log_root_tree)
				2777	{
				2778	int ret;
				2779	struct btrfs_path *path;
				2780	struct btrfs_trans_handle *trans;
				2781	struct btrfs_key key;
				2782	struct btrfs_key found_key;
				2783	struct btrfs_key tmp_key;
				2784	struct btrfs_root *log;
				2785	struct btrfs_fs_info *fs_info = log_root_tree->fs_info;
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2786	u64 highest_inode;
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2787	struct walk_control wc = {
				2788	.process_func = process_one_buffer,
				2789	.stage = 0,
				2790	};
				2791
				2792	fs_info->log_root_recovering = 1;
				2793	path = btrfs_alloc_path();
				2794	BUG_ON(!path);
				2795
				2796	trans = btrfs_start_transaction(fs_info->tree_root, 1);
				2797
				2798	wc.trans = trans;
				2799	wc.pin = 1;
				2800
				2801	walk_log_tree(trans, log_root_tree, &wc);
				2802
				2803	again:
				2804	key.objectid = BTRFS_TREE_LOG_OBJECTID;
				2805	key.offset = (u64)-1;
				2806	btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY);
				2807
				2808	while(1) {
				2809	ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0);
				2810	if (ret < 0)
				2811	break;
				2812	if (ret > 0) {
				2813	if (path->slots[0] == 0)
				2814	break;
				2815	path->slots[0]--;
				2816	}
				2817	btrfs_item_key_to_cpu(path->nodes[0], &found_key,
				2818	path->slots[0]);
				2819	btrfs_release_path(log_root_tree, path);
				2820	if (found_key.objectid != BTRFS_TREE_LOG_OBJECTID)
				2821	break;
				2822
				2823	log = btrfs_read_fs_root_no_radix(log_root_tree,
				2824	&found_key);
				2825	BUG_ON(!log);
				2826
				2827
				2828	tmp_key.objectid = found_key.offset;
				2829	tmp_key.type = BTRFS_ROOT_ITEM_KEY;
				2830	tmp_key.offset = (u64)-1;
				2831
				2832	wc.replay_dest = btrfs_read_fs_root_no_name(fs_info, &tmp_key);
				2833
				2834	BUG_ON(!wc.replay_dest);
				2835
				2836	btrfs_record_root_in_trans(wc.replay_dest);
				2837	ret = walk_log_tree(trans, log, &wc);
				2838	BUG_ON(ret);
				2839
				2840	if (wc.stage == LOG_WALK_REPLAY_ALL) {
				2841	ret = fixup_inode_link_counts(trans, wc.replay_dest,
				2842	path);
				2843	BUG_ON(ret);
				2844	}
Chris Mason	8d5bf1c	2008-09-11 15:51:21 -0400	[diff] [blame]	2845	ret = btrfs_find_highest_inode(wc.replay_dest, &highest_inode);
				2846	if (ret == 0) {
				2847	wc.replay_dest->highest_inode = highest_inode;
				2848	wc.replay_dest->last_inode_alloc = highest_inode;
				2849	}
Chris Mason	e02119d	2008-09-05 16:13:11 -0400	[diff] [blame]	2850
				2851	key.offset = found_key.offset - 1;
				2852	free_extent_buffer(log->node);
				2853	kfree(log);
				2854
				2855	if (found_key.offset == 0)
				2856	break;
				2857	}
				2858	btrfs_release_path(log_root_tree, path);
				2859
				2860	/* step one is to pin it all, step two is to replay just inodes */
				2861	if (wc.pin) {
				2862	wc.pin = 0;
				2863	wc.process_func = replay_one_buffer;
				2864	wc.stage = LOG_WALK_REPLAY_INODES;
				2865	goto again;
				2866	}
				2867	/* step three is to replay everything */
				2868	if (wc.stage < LOG_WALK_REPLAY_ALL) {
				2869	wc.stage++;
				2870	goto again;
				2871	}
				2872
				2873	btrfs_free_path(path);
				2874
				2875	free_extent_buffer(log_root_tree->node);
				2876	log_root_tree->log_root = NULL;
				2877	fs_info->log_root_recovering = 0;
				2878
				2879	/* step 4: commit the transaction, which also unpins the blocks */
				2880	btrfs_commit_transaction(trans, fs_info->tree_root);
				2881
				2882	kfree(log_root_tree);
				2883	return 0;
				2884	}