Blame - fs/ext3/inode.c - kernel/msm-4.9

blob: 8a824f4ce5c608d231a6862b5a1c86de167d0882 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/fs/ext3/inode.c
				3	*
				4	* Copyright (C) 1992, 1993, 1994, 1995
				5	* Remy Card (card@masi.ibp.fr)
				6	* Laboratoire MASI - Institut Blaise Pascal
				7	* Universite Pierre et Marie Curie (Paris VI)
				8	*
				9	* from
				10	*
				11	* linux/fs/minix/inode.c
				12	*
				13	* Copyright (C) 1991, 1992 Linus Torvalds
				14	*
				15	* Goal-directed block allocation by Stephen Tweedie
Dave Kleikamp	e9ad562	2006-09-27 01:49:35 -0700	[diff] [blame]	16	* (sct@redhat.com), 1993, 1998
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	17	* Big-endian to little-endian byte-swapping/bitmaps by
				18	* David S. Miller (davem@caip.rutgers.edu), 1995
				19	* 64-bit file support on 64-bit platforms by Jakub Jelinek
Dave Kleikamp	e9ad562	2006-09-27 01:49:35 -0700	[diff] [blame]	20	* (jj@sunsite.ms.mff.cuni.cz)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	21	*
				22	* Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
				23	*/
				24
				25	#include <linux/module.h>
				26	#include <linux/fs.h>
				27	#include <linux/time.h>
				28	#include <linux/ext3_jbd.h>
				29	#include <linux/jbd.h>
				30	#include <linux/smp_lock.h>
				31	#include <linux/highuid.h>
				32	#include <linux/pagemap.h>
				33	#include <linux/quotaops.h>
				34	#include <linux/string.h>
				35	#include <linux/buffer_head.h>
				36	#include <linux/writeback.h>
				37	#include <linux/mpage.h>
				38	#include <linux/uio.h>
Jens Axboe	caa38fb	2006-07-23 01:41:26 +0200	[diff] [blame]	39	#include <linux/bio.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	#include "xattr.h"
				41	#include "acl.h"
				42
				43	static int ext3_writepage_trans_blocks(struct inode *inode);
				44
				45	/*
				46	* Test whether an inode is a fast symlink.
				47	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	48	static int ext3_inode_is_fast_symlink(struct inode *inode)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	49	{
				50	int ea_blocks = EXT3_I(inode)->i_file_acl ?
				51	(inode->i_sb->s_blocksize >> 9) : 0;
				52
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	53	return (S_ISLNK(inode->i_mode) && inode->i_blocks - ea_blocks == 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	54	}
				55
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	56	/*
				57	* The ext3 forget function must perform a revoke if we are freeing data
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	58	* which has been journaled. Metadata (eg. indirect blocks) must be
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	59	* revoked in all cases.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	60	*
				61	* "bh" may be NULL: a metadata block may have been freed from memory
				62	* but there may still be a record of it in the journal, and that record
				63	* still needs to be revoked.
				64	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	65	int ext3_forget(handle_t handle, int is_metadata, struct inode inode,
Mingming Cao	1c2bf37	2006-06-25 05:48:06 -0700	[diff] [blame]	66	struct buffer_head *bh, ext3_fsblk_t blocknr)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	67	{
				68	int err;
				69
				70	might_sleep();
				71
				72	BUFFER_TRACE(bh, "enter");
				73
				74	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
				75	"data mode %lx\n",
				76	bh, is_metadata, inode->i_mode,
				77	test_opt(inode->i_sb, DATA_FLAGS));
				78
				79	/* Never use the revoke function if we are doing full data
				80	* journaling: there is no need to, and a V1 superblock won't
				81	* support it. Otherwise, only skip the revoke on un-journaled
				82	* data blocks. */
				83
				84	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA \|\|
				85	(!is_metadata && !ext3_should_journal_data(inode))) {
				86	if (bh) {
				87	BUFFER_TRACE(bh, "call journal_forget");
				88	return ext3_journal_forget(handle, bh);
				89	}
				90	return 0;
				91	}
				92
				93	/*
				94	* data!=journal && (is_metadata \|\| should_journal_data(inode))
				95	*/
				96	BUFFER_TRACE(bh, "call ext3_journal_revoke");
				97	err = ext3_journal_revoke(handle, blocknr, bh);
				98	if (err)
				99	ext3_abort(inode->i_sb, __FUNCTION__,
				100	"error %d when attempting revoke", err);
				101	BUFFER_TRACE(bh, "exit");
				102	return err;
				103	}
				104
				105	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	106	* Work out how many blocks we need to proceed with the next chunk of a
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	107	* truncate transaction.
				108	*/
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	109	static unsigned long blocks_for_truncate(struct inode *inode)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	110	{
				111	unsigned long needed;
				112
				113	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);
				114
				115	/* Give ourselves just enough room to cope with inodes in which
				116	* i_blocks is corrupt: we've seen disk corruptions in the past
				117	* which resulted in random data in an inode which looked enough
				118	* like a regular file for ext3 to try to delete it. Things
				119	* will go a bit crazy if that happens, but at least we should
				120	* try not to panic the whole kernel. */
				121	if (needed < 2)
				122	needed = 2;
				123
				124	/* But we need to bound the transaction so we don't overflow the
				125	* journal. */
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	126	if (needed > EXT3_MAX_TRANS_DATA)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	127	needed = EXT3_MAX_TRANS_DATA;
				128
Jan Kara	1f54587	2005-06-23 22:01:04 -0700	[diff] [blame]	129	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	130	}
				131
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	132	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	133	* Truncate transactions can be complex and absolutely huge. So we need to
				134	* be able to restart the transaction at a convenient checkpoint to make
				135	* sure we don't overflow the journal.
				136	*
				137	* start_transaction gets us a new handle for a truncate transaction,
				138	* and extend_transaction tries to extend the existing one a bit. If
				139	* extend fails, we need to propagate the failure up and restart the
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	140	* transaction in the top-level truncate loop. --sct
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	141	*/
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	142	static handle_t start_transaction(struct inode inode)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	143	{
				144	handle_t *result;
				145
				146	result = ext3_journal_start(inode, blocks_for_truncate(inode));
				147	if (!IS_ERR(result))
				148	return result;
				149
				150	ext3_std_error(inode->i_sb, PTR_ERR(result));
				151	return result;
				152	}
				153
				154	/*
				155	* Try to extend this transaction for the purposes of truncation.
				156	*
				157	* Returns 0 if we managed to create more room. If we can't create more
				158	* room, and the transaction must be restarted we return 1.
				159	*/
				160	static int try_to_extend_transaction(handle_t handle, struct inode inode)
				161	{
				162	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
				163	return 0;
				164	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
				165	return 0;
				166	return 1;
				167	}
				168
				169	/*
				170	* Restart the transaction associated with *handle. This does a commit,
				171	* so before we call here everything must be consistently dirtied against
				172	* this transaction.
				173	*/
				174	static int ext3_journal_test_restart(handle_t handle, struct inode inode)
				175	{
				176	jbd_debug(2, "restarting handle %p\n", handle);
				177	return ext3_journal_restart(handle, blocks_for_truncate(inode));
				178	}
				179
				180	/*
				181	* Called at the last iput() if i_nlink is zero.
				182	*/
				183	void ext3_delete_inode (struct inode * inode)
				184	{
				185	handle_t *handle;
				186
Mark Fasheh	fef2665	2005-09-09 13:01:31 -0700	[diff] [blame]	187	truncate_inode_pages(&inode->i_data, 0);
				188
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	189	if (is_bad_inode(inode))
				190	goto no_delete;
				191
				192	handle = start_transaction(inode);
				193	if (IS_ERR(handle)) {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	194	/*
				195	* If we're going to skip the normal cleanup, we still need to
				196	* make sure that the in-core orphan linked list is properly
				197	* cleaned up.
				198	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199	ext3_orphan_del(NULL, inode);
				200	goto no_delete;
				201	}
				202
				203	if (IS_SYNC(inode))
				204	handle->h_sync = 1;
				205	inode->i_size = 0;
				206	if (inode->i_blocks)
				207	ext3_truncate(inode);
				208	/*
				209	* Kill off the orphan record which ext3_truncate created.
				210	* AKPM: I think this can be inside the above `if'.
				211	* Note that ext3_orphan_del() has to be able to cope with the
				212	* deletion of a non-existent orphan - this is because we don't
				213	* know if ext3_truncate() actually created an orphan record.
				214	* (Well, we could do this if we need to, but heck - it works)
				215	*/
				216	ext3_orphan_del(handle, inode);
				217	EXT3_I(inode)->i_dtime = get_seconds();
				218
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	219	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	220	* One subtle ordering requirement: if anything has gone wrong
				221	* (transaction abort, IO errors, whatever), then we can still
				222	* do these next steps (the fs will already have been marked as
				223	* having errors), but we can't free the inode if the mark_dirty
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	224	* fails.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	225	*/
				226	if (ext3_mark_inode_dirty(handle, inode))
				227	/* If that failed, just do the required in-core inode clear. */
				228	clear_inode(inode);
				229	else
				230	ext3_free_inode(handle, inode);
				231	ext3_journal_stop(handle);
				232	return;
				233	no_delete:
				234	clear_inode(inode); /* We must guarantee clearing of inode... */
				235	}
				236
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	237	typedef struct {
				238	__le32 *p;
				239	__le32 key;
				240	struct buffer_head *bh;
				241	} Indirect;
				242
				243	static inline void add_chain(Indirect p, struct buffer_head bh, __le32 *v)
				244	{
				245	p->key = *(p->p = v);
				246	p->bh = bh;
				247	}
				248
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	249	static int verify_chain(Indirect from, Indirect to)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	250	{
				251	while (from <= to && from->key == *from->p)
				252	from++;
				253	return (from > to);
				254	}
				255
				256	/**
				257	* ext3_block_to_path - parse the block number into array of offsets
				258	* @inode: inode in question (we are only interested in its superblock)
				259	* @i_block: block number to be parsed
				260	* @offsets: array to store the offsets in
				261	* @boundary: set this non-zero if the referred-to block is likely to be
				262	* followed (on disk) by an indirect block.
				263	*
				264	* To store the locations of file's data ext3 uses a data structure common
				265	* for UNIX filesystems - tree of pointers anchored in the inode, with
				266	* data blocks at leaves and indirect blocks in intermediate nodes.
				267	* This function translates the block number into path in that tree -
				268	* return value is the path length and @offsets[n] is the offset of
				269	* pointer to (n+1)th node in the nth one. If @i_block is out of range
				270	* (negative or too large) warning is printed and zero returned.
				271	*
				272	* Note: function doesn't find node addresses, so no IO is needed. All
				273	* we need to know is the capacity of indirect blocks (taken from the
				274	* inode->i_sb).
				275	*/
				276
				277	/*
				278	* Portability note: the last comparison (check that we fit into triple
				279	* indirect block) is spelled differently, because otherwise on an
				280	* architecture with 32-bit longs and 8Kb pages we might get into trouble
				281	* if our filesystem had 8Kb blocks. We might use long long, but that would
				282	* kill us on x86. Oh, well, at least the sign propagation does not matter -
				283	* i_block would have to be negative in the very beginning, so we would not
				284	* get there at all.
				285	*/
				286
				287	static int ext3_block_to_path(struct inode *inode,
				288	long i_block, int offsets[4], int *boundary)
				289	{
				290	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				291	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
				292	const long direct_blocks = EXT3_NDIR_BLOCKS,
				293	indirect_blocks = ptrs,
				294	double_blocks = (1 << (ptrs_bits * 2));
				295	int n = 0;
				296	int final = 0;
				297
				298	if (i_block < 0) {
				299	ext3_warning (inode->i_sb, "ext3_block_to_path", "block < 0");
				300	} else if (i_block < direct_blocks) {
				301	offsets[n++] = i_block;
				302	final = direct_blocks;
				303	} else if ( (i_block -= direct_blocks) < indirect_blocks) {
				304	offsets[n++] = EXT3_IND_BLOCK;
				305	offsets[n++] = i_block;
				306	final = ptrs;
				307	} else if ((i_block -= indirect_blocks) < double_blocks) {
				308	offsets[n++] = EXT3_DIND_BLOCK;
				309	offsets[n++] = i_block >> ptrs_bits;
				310	offsets[n++] = i_block & (ptrs - 1);
				311	final = ptrs;
				312	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
				313	offsets[n++] = EXT3_TIND_BLOCK;
				314	offsets[n++] = i_block >> (ptrs_bits * 2);
				315	offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
				316	offsets[n++] = i_block & (ptrs - 1);
				317	final = ptrs;
				318	} else {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	319	ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	320	}
				321	if (boundary)
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	322	*boundary = final - 1 - (i_block & (ptrs - 1));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	323	return n;
				324	}
				325
				326	/**
				327	* ext3_get_branch - read the chain of indirect blocks leading to data
				328	* @inode: inode in question
				329	* @depth: depth of the chain (1 - direct pointer, etc.)
				330	* @offsets: offsets of pointers in inode/indirect blocks
				331	* @chain: place to store the result
				332	* @err: here we store the error value
				333	*
				334	* Function fills the array of triples <key, p, bh> and returns %NULL
				335	* if everything went OK or the pointer to the last filled triple
				336	* (incomplete one) otherwise. Upon the return chain[i].key contains
				337	* the number of (i+1)-th block in the chain (as it is stored in memory,
				338	* i.e. little-endian 32-bit), chain[i].p contains the address of that
				339	* number (it points into struct inode for i==0 and into the bh->b_data
				340	* for i>0) and chain[i].bh points to the buffer_head of i-th indirect
				341	* block for i>0 and NULL for i==0. In other words, it holds the block
				342	* numbers of the chain, addresses they were taken from (and where we can
				343	* verify that chain did not change) and buffer_heads hosting these
				344	* numbers.
				345	*
				346	* Function stops when it stumbles upon zero pointer (absent block)
				347	* (pointer to last triple returned, *@err == 0)
				348	* or when it gets an IO error reading an indirect block
				349	* (ditto, *@err == -EIO)
				350	* or when it notices that chain had been changed while it was reading
				351	* (ditto, *@err == -EAGAIN)
				352	* or when it reads all @depth-1 indirect blocks successfully and finds
				353	* the whole chain, all way to the data (returns %NULL, *err == 0).
				354	*/
				355	static Indirect ext3_get_branch(struct inode inode, int depth, int *offsets,
				356	Indirect chain[4], int *err)
				357	{
				358	struct super_block *sb = inode->i_sb;
				359	Indirect *p = chain;
				360	struct buffer_head *bh;
				361
				362	*err = 0;
				363	/* i_data is not going away, no lock needed */
				364	add_chain (chain, NULL, EXT3_I(inode)->i_data + *offsets);
				365	if (!p->key)
				366	goto no_block;
				367	while (--depth) {
				368	bh = sb_bread(sb, le32_to_cpu(p->key));
				369	if (!bh)
				370	goto failure;
				371	/* Reader: pointers */
				372	if (!verify_chain(chain, p))
				373	goto changed;
				374	add_chain(++p, bh, (__le32)bh->b_data + ++offsets);
				375	/* Reader: end */
				376	if (!p->key)
				377	goto no_block;
				378	}
				379	return NULL;
				380
				381	changed:
				382	brelse(bh);
				383	*err = -EAGAIN;
				384	goto no_block;
				385	failure:
				386	*err = -EIO;
				387	no_block:
				388	return p;
				389	}
				390
				391	/**
				392	* ext3_find_near - find a place for allocation with sufficient locality
				393	* @inode: owner
				394	* @ind: descriptor of indirect block.
				395	*
				396	* This function returns the preferred place for block allocation.
				397	* It is used when heuristic for sequential allocation fails.
				398	* Rules are:
				399	* + if there is a block to the left of our position - allocate near it.
				400	* + if pointer will live in indirect block - allocate near that block.
				401	* + if pointer will live in inode - allocate in the same
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	402	* cylinder group.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	403	*
				404	* In the latter case we colour the starting block by the callers PID to
				405	* prevent it from clashing with concurrent allocations for a different inode
				406	* in the same block group. The PID is used here so that functionally related
				407	* files will be close-by on-disk.
				408	*
				409	* Caller must make sure that @ind is valid and will stay that way.
				410	*/
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	411	static ext3_fsblk_t ext3_find_near(struct inode inode, Indirect ind)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	412	{
				413	struct ext3_inode_info *ei = EXT3_I(inode);
				414	__le32 start = ind->bh ? (__le32) ind->bh->b_data : ei->i_data;
				415	__le32 *p;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	416	ext3_fsblk_t bg_start;
				417	ext3_grpblk_t colour;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	418
				419	/* Try to find previous block */
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	420	for (p = ind->p - 1; p >= start; p--) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	421	if (*p)
				422	return le32_to_cpu(*p);
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	423	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	424
				425	/* No such thing, so let's try location of indirect block */
				426	if (ind->bh)
				427	return ind->bh->b_blocknr;
				428
				429	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	430	* It is going to be referred to from the inode itself? OK, just put it
				431	* into the same cylinder group then.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432	*/
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	433	bg_start = ext3_group_first_block_no(inode->i_sb, ei->i_block_group);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	434	colour = (current->pid % 16) *
				435	(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
				436	return bg_start + colour;
				437	}
				438
				439	/**
				440	* ext3_find_goal - find a preferred place for allocation.
				441	* @inode: owner
				442	* @block: block we want
				443	* @chain: chain of indirect blocks
				444	* @partial: pointer to the last triple within a chain
				445	* @goal: place to store the result.
				446	*
				447	* Normally this function finds the preferred place for block allocation
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	448	* and returns it.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	449	*/
				450
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	451	static ext3_fsblk_t ext3_find_goal(struct inode *inode, long block,
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	452	Indirect chain[4], Indirect *partial)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	453	{
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	454	struct ext3_block_alloc_info *block_i;
				455
				456	block_i = EXT3_I(inode)->i_block_alloc_info;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	457
				458	/*
				459	* try the heuristic for sequential allocation,
				460	* failing that at least try to get decent locality.
				461	*/
				462	if (block_i && (block == block_i->last_alloc_logical_block + 1)
				463	&& (block_i->last_alloc_physical_block != 0)) {
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	464	return block_i->last_alloc_physical_block + 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	465	}
				466
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	467	return ext3_find_near(inode, partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	468	}
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	469
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	470	/**
				471	* ext3_blks_to_allocate: Look up the block map and count the number
				472	* of direct blocks that need to be allocated for the given branch.
				473	*
Dave Kleikamp	e9ad562	2006-09-27 01:49:35 -0700	[diff] [blame]	474	* @branch: chain of indirect blocks
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	475	* @k: number of blocks need for indirect blocks
				476	* @blks: number of data blocks to be mapped.
				477	* @blocks_to_boundary: the offset in the indirect block
				478	*
				479	* return the total number of blocks to be allocated, including the
				480	* direct and indirect blocks.
				481	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	482	static int ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	483	int blocks_to_boundary)
				484	{
				485	unsigned long count = 0;
				486
				487	/*
				488	* Simple case, [t,d]Indirect block(s) has not allocated yet
				489	* then it's clear blocks on that path have not allocated
				490	*/
				491	if (k > 0) {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	492	/* right now we don't handle cross boundary allocation */
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	493	if (blks < blocks_to_boundary + 1)
				494	count += blks;
				495	else
				496	count += blocks_to_boundary + 1;
				497	return count;
				498	}
				499
				500	count++;
				501	while (count < blks && count <= blocks_to_boundary &&
				502	le32_to_cpu(*(branch[0].p + count)) == 0) {
				503	count++;
				504	}
				505	return count;
				506	}
				507
				508	/**
				509	* ext3_alloc_blocks: multiple allocate blocks needed for a branch
				510	* @indirect_blks: the number of blocks need to allocate for indirect
				511	* blocks
				512	*
				513	* @new_blocks: on return it will store the new block numbers for
				514	* the indirect blocks(if needed) and the first direct block,
				515	* @blks: on return it will store the total number of allocated
				516	* direct blocks
				517	*/
				518	static int ext3_alloc_blocks(handle_t handle, struct inode inode,
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	519	ext3_fsblk_t goal, int indirect_blks, int blks,
				520	ext3_fsblk_t new_blocks[4], int *err)
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	521	{
				522	int target, i;
				523	unsigned long count = 0;
				524	int index = 0;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	525	ext3_fsblk_t current_block = 0;
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	526	int ret = 0;
				527
				528	/*
				529	* Here we try to allocate the requested multiple blocks at once,
				530	* on a best-effort basis.
				531	* To build a branch, we should allocate blocks for
				532	* the indirect blocks(if not allocated yet), and at least
				533	* the first direct block of this branch. That's the
				534	* minimum number of blocks need to allocate(required)
				535	*/
				536	target = blks + indirect_blks;
				537
				538	while (1) {
				539	count = target;
				540	/* allocating blocks for indirect blocks and direct blocks */
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	541	current_block = ext3_new_blocks(handle,inode,goal,&count,err);
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	542	if (*err)
				543	goto failed_out;
				544
				545	target -= count;
				546	/* allocate blocks for indirect blocks */
				547	while (index < indirect_blks && count) {
				548	new_blocks[index++] = current_block++;
				549	count--;
				550	}
				551
				552	if (count > 0)
				553	break;
				554	}
				555
				556	/* save the new block number for the first direct block */
				557	new_blocks[index] = current_block;
				558
				559	/* total number of blocks allocated for direct blocks */
				560	ret = count;
				561	*err = 0;
				562	return ret;
				563	failed_out:
				564	for (i = 0; i <index; i++)
				565	ext3_free_blocks(handle, inode, new_blocks[i], 1);
				566	return ret;
				567	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	568
				569	/**
				570	* ext3_alloc_branch - allocate and set up a chain of blocks.
				571	* @inode: owner
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	572	* @indirect_blks: number of allocated indirect blocks
				573	* @blks: number of allocated direct blocks
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	574	* @offsets: offsets (in the blocks) to store the pointers to next.
				575	* @branch: place to store the chain in.
				576	*
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	577	* This function allocates blocks, zeroes out all but the last one,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	578	* links them into chain and (if we are synchronous) writes them to disk.
				579	* In other words, it prepares a branch that can be spliced onto the
				580	* inode. It stores the information about that chain in the branch[], in
				581	* the same format as ext3_get_branch() would do. We are calling it after
				582	* we had read the existing part of chain and partial points to the last
				583	* triple of that (one with zero ->key). Upon the exit we have the same
Glauber de Oliveira Costa	5b11687	2005-10-30 15:02:48 -0800	[diff] [blame]	584	* picture as after the successful ext3_get_block(), except that in one
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	585	* place chain is disconnected - *branch->p is still zero (we did not
				586	* set the last link), but branch->key contains the number that should
				587	* be placed into *branch->p to fill that gap.
				588	*
				589	* If allocation fails we free all blocks we've allocated (and forget
				590	* their buffer_heads) and return the error value from the failed
				591	* ext3_alloc_block() (normally -ENOSPC). Otherwise we set the chain
				592	* as described above and return 0.
				593	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	594	static int ext3_alloc_branch(handle_t handle, struct inode inode,
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	595	int indirect_blks, int *blks, ext3_fsblk_t goal,
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	596	int offsets, Indirect branch)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	597	{
				598	int blocksize = inode->i_sb->s_blocksize;
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	599	int i, n = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	600	int err = 0;
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	601	struct buffer_head *bh;
				602	int num;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	603	ext3_fsblk_t new_blocks[4];
				604	ext3_fsblk_t current_block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	605
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	606	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				607	*blks, new_blocks, &err);
				608	if (err)
				609	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	610
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	611	branch[0].key = cpu_to_le32(new_blocks[0]);
				612	/*
				613	* metadata blocks and data blocks are allocated.
				614	*/
				615	for (n = 1; n <= indirect_blks; n++) {
				616	/*
				617	* Get buffer_head for parent block, zero it out
				618	* and set the pointer to new one, then send
				619	* parent to disk.
				620	*/
				621	bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
				622	branch[n].bh = bh;
				623	lock_buffer(bh);
				624	BUFFER_TRACE(bh, "call get_create_access");
				625	err = ext3_journal_get_create_access(handle, bh);
				626	if (err) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	627	unlock_buffer(bh);
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	628	brelse(bh);
				629	goto failed;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	630	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	631
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	632	memset(bh->b_data, 0, blocksize);
				633	branch[n].p = (__le32 *) bh->b_data + offsets[n];
				634	branch[n].key = cpu_to_le32(new_blocks[n]);
				635	*branch[n].p = branch[n].key;
				636	if ( n == indirect_blks) {
				637	current_block = new_blocks[n];
				638	/*
				639	* End of chain, update the last new metablock of
				640	* the chain to point to the new allocated
				641	* data blocks numbers
				642	*/
				643	for (i=1; i < num; i++)
				644	*(branch[n].p + i) = cpu_to_le32(++current_block);
				645	}
				646	BUFFER_TRACE(bh, "marking uptodate");
				647	set_buffer_uptodate(bh);
				648	unlock_buffer(bh);
				649
				650	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				651	err = ext3_journal_dirty_metadata(handle, bh);
				652	if (err)
				653	goto failed;
				654	}
				655	*blks = num;
				656	return err;
				657	failed:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	658	/* Allocation failed, free what we already allocated */
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	659	for (i = 1; i <= n ; i++) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	660	BUFFER_TRACE(branch[i].bh, "call journal_forget");
				661	ext3_journal_forget(handle, branch[i].bh);
				662	}
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	663	for (i = 0; i <indirect_blks; i++)
				664	ext3_free_blocks(handle, inode, new_blocks[i], 1);
				665
				666	ext3_free_blocks(handle, inode, new_blocks[i], num);
				667
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	668	return err;
				669	}
				670
				671	/**
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	672	* ext3_splice_branch - splice the allocated branch onto inode.
				673	* @inode: owner
				674	* @block: (logical) number of block we are adding
				675	* @chain: chain of indirect blocks (with a missing link - see
				676	* ext3_alloc_branch)
				677	* @where: location of missing link
				678	* @num: number of indirect blocks we are adding
				679	* @blks: number of direct blocks we are adding
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	680	*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	681	* This function fills the missing link and does all housekeeping needed in
				682	* inode (->i_blocks, etc.). In case of success we end up with the full
				683	* chain to new block and return 0.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	684	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	685	static int ext3_splice_branch(handle_t handle, struct inode inode,
				686	long block, Indirect *where, int num, int blks)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	687	{
				688	int i;
				689	int err = 0;
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	690	struct ext3_block_alloc_info *block_i;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	691	ext3_fsblk_t current_block;
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	692
				693	block_i = EXT3_I(inode)->i_block_alloc_info;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	694	/*
				695	* If we're splicing into a [td]indirect block (as opposed to the
				696	* inode) then we need to get write access to the [td]indirect block
				697	* before the splice.
				698	*/
				699	if (where->bh) {
				700	BUFFER_TRACE(where->bh, "get_write_access");
				701	err = ext3_journal_get_write_access(handle, where->bh);
				702	if (err)
				703	goto err_out;
				704	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	705	/* That's it */
				706
				707	*where->p = where->key;
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	708
				709	/*
				710	* Update the host buffer_head or inode to point to more just allocated
				711	* direct blocks blocks
				712	*/
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	713	if (num == 0 && blks > 1) {
Mingming Cao	5dea517	2006-05-03 19:55:12 -0700	[diff] [blame]	714	current_block = le32_to_cpu(where->key) + 1;
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	715	for (i = 1; i < blks; i++)
				716	*(where->p + i ) = cpu_to_le32(current_block++);
				717	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	718
				719	/*
				720	* update the most recently allocated logical & physical block
				721	* in i_block_alloc_info, to assist find the proper goal block for next
				722	* allocation
				723	*/
				724	if (block_i) {
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	725	block_i->last_alloc_logical_block = block + blks - 1;
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	726	block_i->last_alloc_physical_block =
Mingming Cao	5dea517	2006-05-03 19:55:12 -0700	[diff] [blame]	727	le32_to_cpu(where[num].key) + blks - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	728	}
				729
				730	/* We are done with atomic stuff, now do the rest of housekeeping */
				731
				732	inode->i_ctime = CURRENT_TIME_SEC;
				733	ext3_mark_inode_dirty(handle, inode);
				734
				735	/* had we spliced it onto indirect block? */
				736	if (where->bh) {
				737	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	738	* If we spliced it onto an indirect block, we haven't
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	739	* altered the inode. Note however that if it is being spliced
				740	* onto an indirect block at the very end of the file (the
				741	* file is growing) then we will alter the inode to reflect
				742	* the new i_size. But that is not done here - it is done in
				743	* generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
				744	*/
				745	jbd_debug(5, "splicing indirect only\n");
				746	BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
				747	err = ext3_journal_dirty_metadata(handle, where->bh);
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	748	if (err)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	749	goto err_out;
				750	} else {
				751	/*
				752	* OK, we spliced it into the inode itself on a direct block.
				753	* Inode was dirtied above.
				754	*/
				755	jbd_debug(5, "splicing direct\n");
				756	}
				757	return err;
				758
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	759	err_out:
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	760	for (i = 1; i <= num; i++) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	761	BUFFER_TRACE(where[i].bh, "call journal_forget");
				762	ext3_journal_forget(handle, where[i].bh);
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	763	ext3_free_blocks(handle,inode,le32_to_cpu(where[i-1].key),1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	764	}
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	765	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);
				766
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	767	return err;
				768	}
				769
				770	/*
				771	* Allocation strategy is simple: if we have to allocate something, we will
				772	* have to go the whole way to leaf. So let's do it before attaching anything
				773	* to tree, set linkage between the newborn blocks, write them if sync is
				774	* required, recheck the path, free and repeat if check fails, otherwise
				775	* set the last missing link (that will protect us from any truncate-generated
				776	* removals - all blocks on the path are immune now) and possibly force the
				777	* write on the parent block.
				778	* That has a nice additional property: no special recovery from the failed
				779	* allocations is needed - we simply release blocks and do not touch anything
				780	* reachable from inode.
				781	*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	782	* `handle' can be NULL if create == 0.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	783	*
				784	* The BKL may not be held on entry here. Be sure to take it early.
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	785	* return > 0, # of blocks mapped or allocated.
				786	* return = 0, if plain lookup failed.
				787	* return < 0, error case.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	788	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	789	int ext3_get_blocks_handle(handle_t handle, struct inode inode,
				790	sector_t iblock, unsigned long maxblocks,
				791	struct buffer_head *bh_result,
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	792	int create, int extend_disksize)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	793	{
				794	int err = -EIO;
				795	int offsets[4];
				796	Indirect chain[4];
				797	Indirect *partial;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	798	ext3_fsblk_t goal;
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	799	int indirect_blks;
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	800	int blocks_to_boundary = 0;
				801	int depth;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	802	struct ext3_inode_info *ei = EXT3_I(inode);
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	803	int count = 0;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	804	ext3_fsblk_t first_block = 0;
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	805
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	806
				807	J_ASSERT(handle != NULL \|\| create == 0);
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	808	depth = ext3_block_to_path(inode,iblock,offsets,&blocks_to_boundary);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	809
				810	if (depth == 0)
				811	goto out;
				812
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	813	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
				814
				815	/* Simplest case - block found, no allocation needed */
				816	if (!partial) {
Mingming Cao	5dea517	2006-05-03 19:55:12 -0700	[diff] [blame]	817	first_block = le32_to_cpu(chain[depth - 1].key);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	818	clear_buffer_new(bh_result);
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	819	count++;
				820	/map more blocks/
				821	while (count < maxblocks && count <= blocks_to_boundary) {
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	822	ext3_fsblk_t blk;
Mingming Cao	5dea517	2006-05-03 19:55:12 -0700	[diff] [blame]	823
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	824	if (!verify_chain(chain, partial)) {
				825	/*
				826	* Indirect block might be removed by
				827	* truncate while we were reading it.
				828	* Handling of that case: forget what we've
				829	* got now. Flag the err as EAGAIN, so it
				830	* will reread.
				831	*/
				832	err = -EAGAIN;
				833	count = 0;
				834	break;
				835	}
Mingming Cao	5dea517	2006-05-03 19:55:12 -0700	[diff] [blame]	836	blk = le32_to_cpu(*(chain[depth-1].p + count));
				837
				838	if (blk == first_block + count)
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	839	count++;
				840	else
				841	break;
				842	}
				843	if (err != -EAGAIN)
				844	goto got_it;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	845	}
				846
				847	/* Next simple case - plain lookup or failed read of indirect block */
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	848	if (!create \|\| err == -EIO)
				849	goto cleanup;
				850
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	851	mutex_lock(&ei->truncate_mutex);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	852
				853	/*
				854	* If the indirect block is missing while we are reading
				855	* the chain(ext3_get_branch() returns -EAGAIN err), or
				856	* if the chain has been changed after we grab the semaphore,
				857	* (either because another process truncated this branch, or
				858	* another get_block allocated this branch) re-grab the chain to see if
				859	* the request block has been allocated or not.
				860	*
				861	* Since we already block the truncate/other get_block
				862	* at this point, we will have the current copy of the chain when we
				863	* splice the branch into the tree.
				864	*/
				865	if (err == -EAGAIN \|\| !verify_chain(chain, partial)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	866	while (partial > chain) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	867	brelse(partial->bh);
				868	partial--;
				869	}
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	870	partial = ext3_get_branch(inode, depth, offsets, chain, &err);
				871	if (!partial) {
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	872	count++;
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	873	mutex_unlock(&ei->truncate_mutex);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	874	if (err)
				875	goto cleanup;
				876	clear_buffer_new(bh_result);
				877	goto got_it;
				878	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	879	}
				880
				881	/*
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	882	* Okay, we need to do block allocation. Lazily initialize the block
				883	* allocation info here if necessary
				884	*/
				885	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	886	ext3_init_block_alloc_info(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	887
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	888	goal = ext3_find_goal(inode, iblock, chain, partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	889
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	890	/* the number of blocks need to allocate for [d,t]indirect blocks */
				891	indirect_blks = (chain + depth) - partial - 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	892
				893	/*
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	894	* Next look up the indirect map to count the totoal number of
				895	* direct blocks to allocate for this branch.
				896	*/
				897	count = ext3_blks_to_allocate(partial, indirect_blks,
				898	maxblocks, blocks_to_boundary);
				899	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	900	* Block out ext3_truncate while we alter the tree
				901	*/
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	902	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	903	offsets + (partial - chain), partial);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	904
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	905	/*
				906	* The ext3_splice_branch call will free and forget any buffers
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	907	* on the new chain if there is a failure, but that risks using
				908	* up transaction credits, especially for bitmaps where the
				909	* credits cannot be returned. Can we handle this somehow? We
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	910	* may need to return -EAGAIN upwards in the worst case. --sct
				911	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	912	if (!err)
Mingming Cao	b47b247	2006-03-26 01:37:56 -0800	[diff] [blame]	913	err = ext3_splice_branch(handle, inode, iblock,
				914	partial, indirect_blks, count);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	915	/*
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	916	* i_disksize growing is protected by truncate_mutex. Don't forget to
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	917	* protect it if you're about to implement concurrent
				918	* ext3_get_block() -bzzz
				919	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	920	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
				921	ei->i_disksize = inode->i_size;
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	922	mutex_unlock(&ei->truncate_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	923	if (err)
				924	goto cleanup;
				925
				926	set_buffer_new(bh_result);
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	927	got_it:
				928	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
Suparna Bhattacharya	20acaa1	2006-09-16 12:15:58 -0700	[diff] [blame]	929	if (count > blocks_to_boundary)
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	930	set_buffer_boundary(bh_result);
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	931	err = count;
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	932	/* Clean up and exit */
				933	partial = chain + depth - 1; /* the whole chain */
				934	cleanup:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	935	while (partial > chain) {
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	936	BUFFER_TRACE(partial->bh, "call brelse");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	937	brelse(partial->bh);
				938	partial--;
				939	}
Mingming Cao	fe55c45	2005-05-01 08:59:20 -0700	[diff] [blame]	940	BUFFER_TRACE(bh_result, "returned");
				941	out:
				942	return err;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	943	}
				944
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	945	#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)
				946
Badari Pulavarty	f91a2ad	2006-03-26 01:38:04 -0800	[diff] [blame]	947	static int ext3_get_block(struct inode *inode, sector_t iblock,
				948	struct buffer_head *bh_result, int create)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	949	{
Dmitriy Monakhov	3e4fdaf	2007-02-10 01:46:35 -0800	[diff] [blame^]	950	handle_t *handle = ext3_journal_current_handle();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	951	int ret = 0;
Badari Pulavarty	1d8fa7a	2006-03-26 01:38:02 -0800	[diff] [blame]	952	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	953
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	954	if (!create)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	955	goto get_block; /* A read */
				956
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	957	if (max_blocks == 1)
				958	goto get_block; /* A single block get */
				959
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	960	if (handle->h_transaction->t_state == T_LOCKED) {
				961	/*
				962	* Huge direct-io writes can hold off commits for long
				963	* periods of time. Let this commit run.
				964	*/
				965	ext3_journal_stop(handle);
				966	handle = ext3_journal_start(inode, DIO_CREDITS);
				967	if (IS_ERR(handle))
				968	ret = PTR_ERR(handle);
				969	goto get_block;
				970	}
				971
				972	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
				973	/*
				974	* Getting low on buffer credits...
				975	*/
				976	ret = ext3_journal_extend(handle, DIO_CREDITS);
				977	if (ret > 0) {
				978	/*
				979	* Couldn't extend the transaction. Start a new one.
				980	*/
				981	ret = ext3_journal_restart(handle, DIO_CREDITS);
				982	}
				983	}
				984
				985	get_block:
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	986	if (ret == 0) {
				987	ret = ext3_get_blocks_handle(handle, inode, iblock,
				988	max_blocks, bh_result, create, 0);
				989	if (ret > 0) {
				990	bh_result->b_size = (ret << inode->i_blkbits);
				991	ret = 0;
				992	}
				993	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	994	return ret;
				995	}
				996
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	997	/*
				998	* `handle' can be NULL if create is zero
				999	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1000	struct buffer_head ext3_getblk(handle_t handle, struct inode *inode,
				1001	long block, int create, int *errp)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1002	{
				1003	struct buffer_head dummy;
				1004	int fatal = 0, err;
				1005
				1006	J_ASSERT(handle != NULL \|\| create == 0);
				1007
				1008	dummy.b_state = 0;
				1009	dummy.b_blocknr = -1000;
				1010	buffer_trace_init(&dummy.b_history);
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	1011	err = ext3_get_blocks_handle(handle, inode, block, 1,
				1012	&dummy, create, 1);
Badari Pulavarty	3665d0e	2006-09-08 09:48:21 -0700	[diff] [blame]	1013	/*
				1014	* ext3_get_blocks_handle() returns number of blocks
				1015	* mapped. 0 in case of a HOLE.
				1016	*/
				1017	if (err > 0) {
				1018	if (err > 1)
				1019	WARN_ON(1);
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	1020	err = 0;
Mingming Cao	89747d3	2006-03-26 01:37:55 -0800	[diff] [blame]	1021	}
				1022	*errp = err;
				1023	if (!err && buffer_mapped(&dummy)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1024	struct buffer_head *bh;
				1025	bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
Glauber de Oliveira Costa	2973dfd	2005-10-30 15:03:05 -0800	[diff] [blame]	1026	if (!bh) {
				1027	*errp = -EIO;
				1028	goto err;
				1029	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1030	if (buffer_new(&dummy)) {
				1031	J_ASSERT(create != 0);
				1032	J_ASSERT(handle != 0);
				1033
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1034	/*
				1035	* Now that we do not always journal data, we should
				1036	* keep in mind whether this should always journal the
				1037	* new buffer as metadata. For now, regular file
				1038	* writes use ext3_get_block instead, so it's not a
				1039	* problem.
				1040	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1041	lock_buffer(bh);
				1042	BUFFER_TRACE(bh, "call get_create_access");
				1043	fatal = ext3_journal_get_create_access(handle, bh);
				1044	if (!fatal && !buffer_uptodate(bh)) {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1045	memset(bh->b_data,0,inode->i_sb->s_blocksize);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1046	set_buffer_uptodate(bh);
				1047	}
				1048	unlock_buffer(bh);
				1049	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				1050	err = ext3_journal_dirty_metadata(handle, bh);
				1051	if (!fatal)
				1052	fatal = err;
				1053	} else {
				1054	BUFFER_TRACE(bh, "not a new buffer");
				1055	}
				1056	if (fatal) {
				1057	*errp = fatal;
				1058	brelse(bh);
				1059	bh = NULL;
				1060	}
				1061	return bh;
				1062	}
Glauber de Oliveira Costa	2973dfd	2005-10-30 15:03:05 -0800	[diff] [blame]	1063	err:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1064	return NULL;
				1065	}
				1066
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1067	struct buffer_head ext3_bread(handle_t handle, struct inode *inode,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1068	int block, int create, int *err)
				1069	{
				1070	struct buffer_head * bh;
				1071
				1072	bh = ext3_getblk(handle, inode, block, create, err);
				1073	if (!bh)
				1074	return bh;
				1075	if (buffer_uptodate(bh))
				1076	return bh;
Jens Axboe	caa38fb	2006-07-23 01:41:26 +0200	[diff] [blame]	1077	ll_rw_block(READ_META, 1, &bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1078	wait_on_buffer(bh);
				1079	if (buffer_uptodate(bh))
				1080	return bh;
				1081	put_bh(bh);
				1082	*err = -EIO;
				1083	return NULL;
				1084	}
				1085
				1086	static int walk_page_buffers( handle_t *handle,
				1087	struct buffer_head *head,
				1088	unsigned from,
				1089	unsigned to,
				1090	int *partial,
				1091	int (fn)( handle_t handle,
				1092	struct buffer_head *bh))
				1093	{
				1094	struct buffer_head *bh;
				1095	unsigned block_start, block_end;
				1096	unsigned blocksize = head->b_size;
				1097	int err, ret = 0;
				1098	struct buffer_head *next;
				1099
				1100	for ( bh = head, block_start = 0;
				1101	ret == 0 && (bh != head \|\| !block_start);
Dave Kleikamp	e9ad562	2006-09-27 01:49:35 -0700	[diff] [blame]	1102	block_start = block_end, bh = next)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1103	{
				1104	next = bh->b_this_page;
				1105	block_end = block_start + blocksize;
				1106	if (block_end <= from \|\| block_start >= to) {
				1107	if (partial && !buffer_uptodate(bh))
				1108	*partial = 1;
				1109	continue;
				1110	}
				1111	err = (*fn)(handle, bh);
				1112	if (!ret)
				1113	ret = err;
				1114	}
				1115	return ret;
				1116	}
				1117
				1118	/*
				1119	* To preserve ordering, it is essential that the hole instantiation and
				1120	* the data write be encapsulated in a single transaction. We cannot
				1121	* close off a transaction and start a new one between the ext3_get_block()
				1122	* and the commit_write(). So doing the journal_start at the start of
				1123	* prepare_write() is the right place.
				1124	*
				1125	* Also, this function can nest inside ext3_writepage() ->
				1126	* block_write_full_page(). In that case, we know that ext3_writepage()
				1127	* has generated enough buffer credits to do the whole page. So we won't
				1128	* block on the journal in that case, which is good, because the caller may
				1129	* be PF_MEMALLOC.
				1130	*
				1131	* By accident, ext3 can be reentered when a transaction is open via
				1132	* quota file writes. If we were to commit the transaction while thus
				1133	* reentered, there can be a deadlock - we would be holding a quota
				1134	* lock, and the commit would never complete if another thread had a
				1135	* transaction open and was blocking on the quota lock - a ranking
				1136	* violation.
				1137	*
				1138	* So what we do is to rely on the fact that journal_stop/journal_start
				1139	* will _not_ run commit under these circumstances because handle->h_ref
				1140	* is elevated. We'll still have enough credits for the tiny quotafile
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1141	* write.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1142	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1143	static int do_journal_get_write_access(handle_t *handle,
				1144	struct buffer_head *bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1145	{
				1146	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				1147	return 0;
				1148	return ext3_journal_get_write_access(handle, bh);
				1149	}
				1150
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1151	/*
				1152	* The idea of this helper function is following:
				1153	* if prepare_write has allocated some blocks, but not all of them, the
				1154	* transaction must include the content of the newly allocated blocks.
				1155	* This content is expected to be set to zeroes by block_prepare_write().
				1156	* 2006/10/14 SAW
				1157	*/
				1158	static int ext3_prepare_failure(struct file file, struct page page,
				1159	unsigned from, unsigned to)
				1160	{
				1161	struct address_space *mapping;
				1162	struct buffer_head bh, head, *next;
				1163	unsigned block_start, block_end;
				1164	unsigned blocksize;
				1165	int ret;
				1166	handle_t *handle = ext3_journal_current_handle();
				1167
				1168	mapping = page->mapping;
				1169	if (ext3_should_writeback_data(mapping->host)) {
				1170	/* optimization: no constraints about data */
				1171	skip:
				1172	return ext3_journal_stop(handle);
				1173	}
				1174
				1175	head = page_buffers(page);
				1176	blocksize = head->b_size;
				1177	for ( bh = head, block_start = 0;
				1178	bh != head \|\| !block_start;
				1179	block_start = block_end, bh = next)
				1180	{
				1181	next = bh->b_this_page;
				1182	block_end = block_start + blocksize;
				1183	if (block_end <= from)
				1184	continue;
				1185	if (block_start >= to) {
				1186	block_start = to;
				1187	break;
				1188	}
				1189	if (!buffer_mapped(bh))
				1190	/* prepare_write failed on this bh */
				1191	break;
				1192	if (ext3_should_journal_data(mapping->host)) {
				1193	ret = do_journal_get_write_access(handle, bh);
				1194	if (ret) {
				1195	ext3_journal_stop(handle);
				1196	return ret;
				1197	}
				1198	}
				1199	/*
				1200	* block_start here becomes the first block where the current iteration
				1201	* of prepare_write failed.
				1202	*/
				1203	}
				1204	if (block_start <= from)
				1205	goto skip;
				1206
				1207	/* commit allocated and zeroed buffers */
				1208	return mapping->a_ops->commit_write(file, page, from, block_start);
				1209	}
				1210
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1211	static int ext3_prepare_write(struct file file, struct page page,
				1212	unsigned from, unsigned to)
				1213	{
				1214	struct inode *inode = page->mapping->host;
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1215	int ret, ret2;
				1216	int needed_blocks = ext3_writepage_trans_blocks(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1217	handle_t *handle;
				1218	int retries = 0;
				1219
				1220	retry:
				1221	handle = ext3_journal_start(inode, needed_blocks);
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1222	if (IS_ERR(handle))
				1223	return PTR_ERR(handle);
Badari Pulavarty	0e31f51	2006-07-30 03:04:14 -0700	[diff] [blame]	1224	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1225	ret = nobh_prepare_write(page, from, to, ext3_get_block);
				1226	else
				1227	ret = block_prepare_write(page, from, to, ext3_get_block);
				1228	if (ret)
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1229	goto failure;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1230
				1231	if (ext3_should_journal_data(inode)) {
				1232	ret = walk_page_buffers(handle, page_buffers(page),
				1233	from, to, NULL, do_journal_get_write_access);
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1234	if (ret)
				1235	/* fatal error, just put the handle and return */
				1236	journal_stop(handle);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1237	}
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1238	return ret;
				1239
				1240	failure:
				1241	ret2 = ext3_prepare_failure(file, page, from, to);
				1242	if (ret2 < 0)
				1243	return ret2;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1244	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
				1245	goto retry;
Andrey Savochkin	e92a4d5	2006-12-06 20:37:34 -0800	[diff] [blame]	1246	/* retry number exceeded, or other error like -EDQUOT */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1247	return ret;
				1248	}
				1249
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1250	int ext3_journal_dirty_data(handle_t handle, struct buffer_head bh)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1251	{
				1252	int err = journal_dirty_data(handle, bh);
				1253	if (err)
				1254	ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
				1255	bh, handle,err);
				1256	return err;
				1257	}
				1258
				1259	/* For commit_write() in data=journal mode */
				1260	static int commit_write_fn(handle_t handle, struct buffer_head bh)
				1261	{
				1262	if (!buffer_mapped(bh) \|\| buffer_freed(bh))
				1263	return 0;
				1264	set_buffer_uptodate(bh);
				1265	return ext3_journal_dirty_metadata(handle, bh);
				1266	}
				1267
				1268	/*
				1269	* We need to pick up the new inode size which generic_commit_write gave us
				1270	* `file' can be NULL - eg, when called from page_symlink().
				1271	*
				1272	* ext3 never places buffers on inode->i_mapping->private_list. metadata
				1273	* buffers are managed internally.
				1274	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1275	static int ext3_ordered_commit_write(struct file file, struct page page,
				1276	unsigned from, unsigned to)
				1277	{
				1278	handle_t *handle = ext3_journal_current_handle();
				1279	struct inode *inode = page->mapping->host;
				1280	int ret = 0, ret2;
				1281
				1282	ret = walk_page_buffers(handle, page_buffers(page),
				1283	from, to, NULL, ext3_journal_dirty_data);
				1284
				1285	if (ret == 0) {
				1286	/*
				1287	* generic_commit_write() will run mark_inode_dirty() if i_size
				1288	* changes. So let's piggyback the i_disksize mark_inode_dirty
				1289	* into that.
				1290	*/
				1291	loff_t new_i_size;
				1292
				1293	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1294	if (new_i_size > EXT3_I(inode)->i_disksize)
				1295	EXT3_I(inode)->i_disksize = new_i_size;
				1296	ret = generic_commit_write(file, page, from, to);
				1297	}
				1298	ret2 = ext3_journal_stop(handle);
				1299	if (!ret)
				1300	ret = ret2;
				1301	return ret;
				1302	}
				1303
				1304	static int ext3_writeback_commit_write(struct file file, struct page page,
				1305	unsigned from, unsigned to)
				1306	{
				1307	handle_t *handle = ext3_journal_current_handle();
				1308	struct inode *inode = page->mapping->host;
				1309	int ret = 0, ret2;
				1310	loff_t new_i_size;
				1311
				1312	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1313	if (new_i_size > EXT3_I(inode)->i_disksize)
				1314	EXT3_I(inode)->i_disksize = new_i_size;
				1315
Badari Pulavarty	0e31f51	2006-07-30 03:04:14 -0700	[diff] [blame]	1316	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1317	ret = nobh_commit_write(file, page, from, to);
				1318	else
				1319	ret = generic_commit_write(file, page, from, to);
				1320
				1321	ret2 = ext3_journal_stop(handle);
				1322	if (!ret)
				1323	ret = ret2;
				1324	return ret;
				1325	}
				1326
				1327	static int ext3_journalled_commit_write(struct file *file,
				1328	struct page *page, unsigned from, unsigned to)
				1329	{
				1330	handle_t *handle = ext3_journal_current_handle();
				1331	struct inode *inode = page->mapping->host;
				1332	int ret = 0, ret2;
				1333	int partial = 0;
				1334	loff_t pos;
				1335
				1336	/*
				1337	* Here we duplicate the generic_commit_write() functionality
				1338	*/
				1339	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
				1340
				1341	ret = walk_page_buffers(handle, page_buffers(page), from,
				1342	to, &partial, commit_write_fn);
				1343	if (!partial)
				1344	SetPageUptodate(page);
				1345	if (pos > inode->i_size)
				1346	i_size_write(inode, pos);
				1347	EXT3_I(inode)->i_state \|= EXT3_STATE_JDATA;
				1348	if (inode->i_size > EXT3_I(inode)->i_disksize) {
				1349	EXT3_I(inode)->i_disksize = inode->i_size;
				1350	ret2 = ext3_mark_inode_dirty(handle, inode);
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1351	if (!ret)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1352	ret = ret2;
				1353	}
				1354	ret2 = ext3_journal_stop(handle);
				1355	if (!ret)
				1356	ret = ret2;
				1357	return ret;
				1358	}
				1359
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1360	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1361	* bmap() is special. It gets used by applications such as lilo and by
				1362	* the swapper to find the on-disk block of a specific piece of data.
				1363	*
				1364	* Naturally, this is dangerous if the block concerned is still in the
				1365	* journal. If somebody makes a swapfile on an ext3 data-journaling
				1366	* filesystem and enables swap, then they may get a nasty shock when the
				1367	* data getting swapped to that swapfile suddenly gets overwritten by
				1368	* the original zero's written out previously to the journal and
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1369	* awaiting writeback in the kernel's buffer cache.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1370	*
				1371	* So, if we see any bmap calls here on a modified, data-journaled file,
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1372	* take extra steps to flush any blocks which might be in the cache.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1373	*/
				1374	static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
				1375	{
				1376	struct inode *inode = mapping->host;
				1377	journal_t *journal;
				1378	int err;
				1379
				1380	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1381	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1382	* This is a REALLY heavyweight approach, but the use of
				1383	* bmap on dirty files is expected to be extremely rare:
				1384	* only if we run lilo or swapon on a freshly made file
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1385	* do we expect this to happen.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1386	*
				1387	* (bmap requires CAP_SYS_RAWIO so this does not
				1388	* represent an unprivileged user DOS attack --- we'd be
				1389	* in trouble if mortal users could trigger this path at
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1390	* will.)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1391	*
				1392	* NB. EXT3_STATE_JDATA is not set on files other than
				1393	* regular files. If somebody wants to bmap a directory
				1394	* or symlink and gets confused because the buffer
				1395	* hasn't yet been flushed to disk, they deserve
				1396	* everything they get.
				1397	*/
				1398
				1399	EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
				1400	journal = EXT3_JOURNAL(inode);
				1401	journal_lock_updates(journal);
				1402	err = journal_flush(journal);
				1403	journal_unlock_updates(journal);
				1404
				1405	if (err)
				1406	return 0;
				1407	}
				1408
				1409	return generic_block_bmap(mapping,block,ext3_get_block);
				1410	}
				1411
				1412	static int bget_one(handle_t handle, struct buffer_head bh)
				1413	{
				1414	get_bh(bh);
				1415	return 0;
				1416	}
				1417
				1418	static int bput_one(handle_t handle, struct buffer_head bh)
				1419	{
				1420	put_bh(bh);
				1421	return 0;
				1422	}
				1423
				1424	static int journal_dirty_data_fn(handle_t handle, struct buffer_head bh)
				1425	{
				1426	if (buffer_mapped(bh))
				1427	return ext3_journal_dirty_data(handle, bh);
				1428	return 0;
				1429	}
				1430
				1431	/*
				1432	* Note that we always start a transaction even if we're not journalling
				1433	* data. This is to preserve ordering: any hole instantiation within
				1434	* __block_write_full_page -> ext3_get_block() should be journalled
				1435	* along with the data so we don't crash and then get metadata which
				1436	* refers to old data.
				1437	*
				1438	* In all journalling modes block_write_full_page() will start the I/O.
				1439	*
				1440	* Problem:
				1441	*
				1442	* ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
				1443	* ext3_writepage()
				1444	*
				1445	* Similar for:
				1446	*
				1447	* ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
				1448	*
				1449	* Same applies to ext3_get_block(). We will deadlock on various things like
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	1450	* lock_journal and i_truncate_mutex.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1451	*
				1452	* Setting PF_MEMALLOC here doesn't work - too many internal memory
				1453	* allocations fail.
				1454	*
				1455	* 16May01: If we're reentered then journal_current_handle() will be
				1456	* non-zero. We simply return.
				1457	*
				1458	* 1 July 2001: @@@ FIXME:
				1459	* In journalled data mode, a data buffer may be metadata against the
				1460	* current transaction. But the same file is part of a shared mapping
				1461	* and someone does a writepage() on it.
				1462	*
				1463	* We will move the buffer onto the async_data list, but after it has
				1464	* been dirtied. So there's a small window where we have dirty data on
				1465	* BJ_Metadata.
				1466	*
				1467	* Note that this only applies to the last partial page in the file. The
				1468	* bit which block_write_full_page() uses prepare/commit for. (That's
				1469	* broken code anyway: it's wrong for msync()).
				1470	*
				1471	* It's a rare case: affects the final partial page, for journalled data
				1472	* where the file is subject to both write() and writepage() in the same
				1473	* transaction. To fix it we'll need a custom block_write_full_page().
				1474	* We'll probably need that anyway for journalling writepage() output.
				1475	*
				1476	* We don't honour synchronous mounts for writepage(). That would be
				1477	* disastrous. Any write() or metadata operation will sync the fs for
				1478	* us.
				1479	*
				1480	* AKPM2: if all the page's buffers are mapped to disk and !data=journal,
				1481	* we don't need to open a transaction here.
				1482	*/
				1483	static int ext3_ordered_writepage(struct page *page,
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1484	struct writeback_control *wbc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1485	{
				1486	struct inode *inode = page->mapping->host;
				1487	struct buffer_head *page_bufs;
				1488	handle_t *handle = NULL;
				1489	int ret = 0;
				1490	int err;
				1491
				1492	J_ASSERT(PageLocked(page));
				1493
				1494	/*
				1495	* We give up here if we're reentered, because it might be for a
				1496	* different filesystem.
				1497	*/
				1498	if (ext3_journal_current_handle())
				1499	goto out_fail;
				1500
				1501	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1502
				1503	if (IS_ERR(handle)) {
				1504	ret = PTR_ERR(handle);
				1505	goto out_fail;
				1506	}
				1507
				1508	if (!page_has_buffers(page)) {
				1509	create_empty_buffers(page, inode->i_sb->s_blocksize,
				1510	(1 << BH_Dirty)\|(1 << BH_Uptodate));
				1511	}
				1512	page_bufs = page_buffers(page);
				1513	walk_page_buffers(handle, page_bufs, 0,
				1514	PAGE_CACHE_SIZE, NULL, bget_one);
				1515
				1516	ret = block_write_full_page(page, ext3_get_block, wbc);
				1517
				1518	/*
				1519	* The page can become unlocked at any point now, and
				1520	* truncate can then come in and change things. So we
				1521	* can't touch page from now on. But page_bufs is
				1522	* safe due to elevated refcount.
				1523	*/
				1524
				1525	/*
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1526	* And attach them to the current transaction. But only if
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1527	* block_write_full_page() succeeded. Otherwise they are unmapped,
				1528	* and generally junk.
				1529	*/
				1530	if (ret == 0) {
				1531	err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
				1532	NULL, journal_dirty_data_fn);
				1533	if (!ret)
				1534	ret = err;
				1535	}
				1536	walk_page_buffers(handle, page_bufs, 0,
				1537	PAGE_CACHE_SIZE, NULL, bput_one);
				1538	err = ext3_journal_stop(handle);
				1539	if (!ret)
				1540	ret = err;
				1541	return ret;
				1542
				1543	out_fail:
				1544	redirty_page_for_writepage(wbc, page);
				1545	unlock_page(page);
				1546	return ret;
				1547	}
				1548
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1549	static int ext3_writeback_writepage(struct page *page,
				1550	struct writeback_control *wbc)
				1551	{
				1552	struct inode *inode = page->mapping->host;
				1553	handle_t *handle = NULL;
				1554	int ret = 0;
				1555	int err;
				1556
				1557	if (ext3_journal_current_handle())
				1558	goto out_fail;
				1559
				1560	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1561	if (IS_ERR(handle)) {
				1562	ret = PTR_ERR(handle);
				1563	goto out_fail;
				1564	}
				1565
Badari Pulavarty	0e31f51	2006-07-30 03:04:14 -0700	[diff] [blame]	1566	if (test_opt(inode->i_sb, NOBH) && ext3_should_writeback_data(inode))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1567	ret = nobh_writepage(page, ext3_get_block, wbc);
				1568	else
				1569	ret = block_write_full_page(page, ext3_get_block, wbc);
				1570
				1571	err = ext3_journal_stop(handle);
				1572	if (!ret)
				1573	ret = err;
				1574	return ret;
				1575
				1576	out_fail:
				1577	redirty_page_for_writepage(wbc, page);
				1578	unlock_page(page);
				1579	return ret;
				1580	}
				1581
				1582	static int ext3_journalled_writepage(struct page *page,
				1583	struct writeback_control *wbc)
				1584	{
				1585	struct inode *inode = page->mapping->host;
				1586	handle_t *handle = NULL;
				1587	int ret = 0;
				1588	int err;
				1589
				1590	if (ext3_journal_current_handle())
				1591	goto no_write;
				1592
				1593	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
				1594	if (IS_ERR(handle)) {
				1595	ret = PTR_ERR(handle);
				1596	goto no_write;
				1597	}
				1598
				1599	if (!page_has_buffers(page) \|\| PageChecked(page)) {
				1600	/*
				1601	* It's mmapped pagecache. Add buffers and journal it. There
				1602	* doesn't seem much point in redirtying the page here.
				1603	*/
				1604	ClearPageChecked(page);
				1605	ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
				1606	ext3_get_block);
Denis Lunev	ab4eb43	2005-11-13 16:07:17 -0800	[diff] [blame]	1607	if (ret != 0) {
				1608	ext3_journal_stop(handle);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1609	goto out_unlock;
Denis Lunev	ab4eb43	2005-11-13 16:07:17 -0800	[diff] [blame]	1610	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1611	ret = walk_page_buffers(handle, page_buffers(page), 0,
				1612	PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);
				1613
				1614	err = walk_page_buffers(handle, page_buffers(page), 0,
				1615	PAGE_CACHE_SIZE, NULL, commit_write_fn);
				1616	if (ret == 0)
				1617	ret = err;
				1618	EXT3_I(inode)->i_state \|= EXT3_STATE_JDATA;
				1619	unlock_page(page);
				1620	} else {
				1621	/*
				1622	* It may be a page full of checkpoint-mode buffers. We don't
				1623	* really know unless we go poke around in the buffer_heads.
				1624	* But block_write_full_page will do the right thing.
				1625	*/
				1626	ret = block_write_full_page(page, ext3_get_block, wbc);
				1627	}
				1628	err = ext3_journal_stop(handle);
				1629	if (!ret)
				1630	ret = err;
				1631	out:
				1632	return ret;
				1633
				1634	no_write:
				1635	redirty_page_for_writepage(wbc, page);
				1636	out_unlock:
				1637	unlock_page(page);
				1638	goto out;
				1639	}
				1640
				1641	static int ext3_readpage(struct file file, struct page page)
				1642	{
				1643	return mpage_readpage(page, ext3_get_block);
				1644	}
				1645
				1646	static int
				1647	ext3_readpages(struct file file, struct address_space mapping,
				1648	struct list_head *pages, unsigned nr_pages)
				1649	{
				1650	return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
				1651	}
				1652
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1653	static void ext3_invalidatepage(struct page *page, unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1654	{
				1655	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
				1656
				1657	/*
				1658	* If it's a full truncate we just forget about the pending dirtying
				1659	*/
				1660	if (offset == 0)
				1661	ClearPageChecked(page);
				1662
NeilBrown	2ff28e2	2006-03-26 01:37:18 -0800	[diff] [blame]	1663	journal_invalidatepage(journal, page, offset);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1664	}
				1665
Al Viro	27496a8	2005-10-21 03:20:48 -0400	[diff] [blame]	1666	static int ext3_releasepage(struct page *page, gfp_t wait)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1667	{
				1668	journal_t *journal = EXT3_JOURNAL(page->mapping->host);
				1669
				1670	WARN_ON(PageChecked(page));
				1671	if (!page_has_buffers(page))
				1672	return 0;
				1673	return journal_try_to_free_buffers(journal, page, wait);
				1674	}
				1675
				1676	/*
				1677	* If the O_DIRECT write will extend the file then add this inode to the
				1678	* orphan list. So recovery will truncate it back to the original size
				1679	* if the machine crashes during the write.
				1680	*
				1681	* If the O_DIRECT write is intantiating holes inside i_size and the machine
				1682	* crashes then stale disk data _may_ be exposed inside the file.
				1683	*/
				1684	static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
				1685	const struct iovec *iov, loff_t offset,
				1686	unsigned long nr_segs)
				1687	{
				1688	struct file *file = iocb->ki_filp;
				1689	struct inode *inode = file->f_mapping->host;
				1690	struct ext3_inode_info *ei = EXT3_I(inode);
				1691	handle_t *handle = NULL;
				1692	ssize_t ret;
				1693	int orphan = 0;
				1694	size_t count = iov_length(iov, nr_segs);
				1695
				1696	if (rw == WRITE) {
				1697	loff_t final_size = offset + count;
				1698
				1699	handle = ext3_journal_start(inode, DIO_CREDITS);
				1700	if (IS_ERR(handle)) {
				1701	ret = PTR_ERR(handle);
				1702	goto out;
				1703	}
				1704	if (final_size > inode->i_size) {
				1705	ret = ext3_orphan_add(handle, inode);
				1706	if (ret)
				1707	goto out_stop;
				1708	orphan = 1;
				1709	ei->i_disksize = inode->i_size;
				1710	}
				1711	}
				1712
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	1713	ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1714	offset, nr_segs,
Badari Pulavarty	f91a2ad	2006-03-26 01:38:04 -0800	[diff] [blame]	1715	ext3_get_block, NULL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1716
				1717	/*
Badari Pulavarty	f91a2ad	2006-03-26 01:38:04 -0800	[diff] [blame]	1718	* Reacquire the handle: ext3_get_block() can restart the transaction
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1719	*/
Dmitriy Monakhov	3e4fdaf	2007-02-10 01:46:35 -0800	[diff] [blame^]	1720	handle = ext3_journal_current_handle();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1721
				1722	out_stop:
				1723	if (handle) {
				1724	int err;
				1725
				1726	if (orphan && inode->i_nlink)
				1727	ext3_orphan_del(handle, inode);
				1728	if (orphan && ret > 0) {
				1729	loff_t end = offset + ret;
				1730	if (end > inode->i_size) {
				1731	ei->i_disksize = end;
				1732	i_size_write(inode, end);
				1733	/*
				1734	* We're going to return a positive `ret'
				1735	* here due to non-zero-length I/O, so there's
				1736	* no way of reporting error returns from
				1737	* ext3_mark_inode_dirty() to userspace. So
				1738	* ignore it.
				1739	*/
				1740	ext3_mark_inode_dirty(handle, inode);
				1741	}
				1742	}
				1743	err = ext3_journal_stop(handle);
				1744	if (ret == 0)
				1745	ret = err;
				1746	}
				1747	out:
				1748	return ret;
				1749	}
				1750
				1751	/*
				1752	* Pages can be marked dirty completely asynchronously from ext3's journalling
				1753	* activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
				1754	* much here because ->set_page_dirty is called under VFS locks. The page is
				1755	* not necessarily locked.
				1756	*
				1757	* We cannot just dirty the page and leave attached buffers clean, because the
				1758	* buffers' dirty state is "definitive". We cannot just set the buffers dirty
				1759	* or jbddirty because all the journalling code will explode.
				1760	*
				1761	* So what we do is to mark the page "pending dirty" and next time writepage
				1762	* is called, propagate that into the buffers appropriately.
				1763	*/
				1764	static int ext3_journalled_set_page_dirty(struct page *page)
				1765	{
				1766	SetPageChecked(page);
				1767	return __set_page_dirty_nobuffers(page);
				1768	}
				1769
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	1770	static const struct address_space_operations ext3_ordered_aops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1771	.readpage = ext3_readpage,
				1772	.readpages = ext3_readpages,
				1773	.writepage = ext3_ordered_writepage,
				1774	.sync_page = block_sync_page,
				1775	.prepare_write = ext3_prepare_write,
				1776	.commit_write = ext3_ordered_commit_write,
				1777	.bmap = ext3_bmap,
				1778	.invalidatepage = ext3_invalidatepage,
				1779	.releasepage = ext3_releasepage,
				1780	.direct_IO = ext3_direct_IO,
Christoph Lameter	e965f96	2006-02-01 03:05:41 -0800	[diff] [blame]	1781	.migratepage = buffer_migrate_page,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1782	};
				1783
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	1784	static const struct address_space_operations ext3_writeback_aops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1785	.readpage = ext3_readpage,
				1786	.readpages = ext3_readpages,
				1787	.writepage = ext3_writeback_writepage,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1788	.sync_page = block_sync_page,
				1789	.prepare_write = ext3_prepare_write,
				1790	.commit_write = ext3_writeback_commit_write,
				1791	.bmap = ext3_bmap,
				1792	.invalidatepage = ext3_invalidatepage,
				1793	.releasepage = ext3_releasepage,
				1794	.direct_IO = ext3_direct_IO,
Christoph Lameter	e965f96	2006-02-01 03:05:41 -0800	[diff] [blame]	1795	.migratepage = buffer_migrate_page,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1796	};
				1797
Christoph Hellwig	f5e54d6	2006-06-28 04:26:44 -0700	[diff] [blame]	1798	static const struct address_space_operations ext3_journalled_aops = {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1799	.readpage = ext3_readpage,
				1800	.readpages = ext3_readpages,
				1801	.writepage = ext3_journalled_writepage,
				1802	.sync_page = block_sync_page,
				1803	.prepare_write = ext3_prepare_write,
				1804	.commit_write = ext3_journalled_commit_write,
				1805	.set_page_dirty = ext3_journalled_set_page_dirty,
				1806	.bmap = ext3_bmap,
				1807	.invalidatepage = ext3_invalidatepage,
				1808	.releasepage = ext3_releasepage,
				1809	};
				1810
				1811	void ext3_set_aops(struct inode *inode)
				1812	{
				1813	if (ext3_should_order_data(inode))
				1814	inode->i_mapping->a_ops = &ext3_ordered_aops;
				1815	else if (ext3_should_writeback_data(inode))
				1816	inode->i_mapping->a_ops = &ext3_writeback_aops;
				1817	else
				1818	inode->i_mapping->a_ops = &ext3_journalled_aops;
				1819	}
				1820
				1821	/*
				1822	* ext3_block_truncate_page() zeroes out a mapping from file offset `from'
				1823	* up to the end of the block which corresponds to `from'.
				1824	* This required during truncate. We need to physically zero the tail end
				1825	* of that block so it doesn't yield old data if the file is later grown.
				1826	*/
				1827	static int ext3_block_truncate_page(handle_t handle, struct page page,
				1828	struct address_space *mapping, loff_t from)
				1829	{
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	1830	ext3_fsblk_t index = from >> PAGE_CACHE_SHIFT;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1831	unsigned offset = from & (PAGE_CACHE_SIZE-1);
				1832	unsigned blocksize, iblock, length, pos;
				1833	struct inode *inode = mapping->host;
				1834	struct buffer_head *bh;
				1835	int err = 0;
				1836	void *kaddr;
				1837
				1838	blocksize = inode->i_sb->s_blocksize;
				1839	length = blocksize - (offset & (blocksize - 1));
				1840	iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
				1841
				1842	/*
				1843	* For "nobh" option, we can only work if we don't need to
				1844	* read-in the page - otherwise we create buffers to do the IO.
				1845	*/
Badari Pulavarty	cd6ef84	2006-03-11 03:27:14 -0800	[diff] [blame]	1846	if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
				1847	ext3_should_writeback_data(inode) && PageUptodate(page)) {
				1848	kaddr = kmap_atomic(page, KM_USER0);
				1849	memset(kaddr + offset, 0, length);
				1850	flush_dcache_page(page);
				1851	kunmap_atomic(kaddr, KM_USER0);
				1852	set_page_dirty(page);
				1853	goto unlock;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1854	}
				1855
				1856	if (!page_has_buffers(page))
				1857	create_empty_buffers(page, blocksize, 0);
				1858
				1859	/* Find the buffer that contains "offset" */
				1860	bh = page_buffers(page);
				1861	pos = blocksize;
				1862	while (offset >= pos) {
				1863	bh = bh->b_this_page;
				1864	iblock++;
				1865	pos += blocksize;
				1866	}
				1867
				1868	err = 0;
				1869	if (buffer_freed(bh)) {
				1870	BUFFER_TRACE(bh, "freed: skip");
				1871	goto unlock;
				1872	}
				1873
				1874	if (!buffer_mapped(bh)) {
				1875	BUFFER_TRACE(bh, "unmapped");
				1876	ext3_get_block(inode, iblock, bh, 0);
				1877	/* unmapped? It's a hole - nothing to do */
				1878	if (!buffer_mapped(bh)) {
				1879	BUFFER_TRACE(bh, "still unmapped");
				1880	goto unlock;
				1881	}
				1882	}
				1883
				1884	/* Ok, it's mapped. Make sure it's up-to-date */
				1885	if (PageUptodate(page))
				1886	set_buffer_uptodate(bh);
				1887
				1888	if (!buffer_uptodate(bh)) {
				1889	err = -EIO;
				1890	ll_rw_block(READ, 1, &bh);
				1891	wait_on_buffer(bh);
				1892	/* Uhhuh. Read error. Complain and punt. */
				1893	if (!buffer_uptodate(bh))
				1894	goto unlock;
				1895	}
				1896
				1897	if (ext3_should_journal_data(inode)) {
				1898	BUFFER_TRACE(bh, "get write access");
				1899	err = ext3_journal_get_write_access(handle, bh);
				1900	if (err)
				1901	goto unlock;
				1902	}
				1903
				1904	kaddr = kmap_atomic(page, KM_USER0);
				1905	memset(kaddr + offset, 0, length);
				1906	flush_dcache_page(page);
				1907	kunmap_atomic(kaddr, KM_USER0);
				1908
				1909	BUFFER_TRACE(bh, "zeroed end of block");
				1910
				1911	err = 0;
				1912	if (ext3_should_journal_data(inode)) {
				1913	err = ext3_journal_dirty_metadata(handle, bh);
				1914	} else {
				1915	if (ext3_should_order_data(inode))
				1916	err = ext3_journal_dirty_data(handle, bh);
				1917	mark_buffer_dirty(bh);
				1918	}
				1919
				1920	unlock:
				1921	unlock_page(page);
				1922	page_cache_release(page);
				1923	return err;
				1924	}
				1925
				1926	/*
				1927	* Probably it should be a library function... search for first non-zero word
				1928	* or memcmp with zero_page, whatever is better for particular architecture.
				1929	* Linus?
				1930	*/
				1931	static inline int all_zeroes(__le32 p, __le32 q)
				1932	{
				1933	while (p < q)
				1934	if (*p++)
				1935	return 0;
				1936	return 1;
				1937	}
				1938
				1939	/**
				1940	* ext3_find_shared - find the indirect blocks for partial truncation.
				1941	* @inode: inode in question
				1942	* @depth: depth of the affected branch
				1943	* @offsets: offsets of pointers in that branch (see ext3_block_to_path)
				1944	* @chain: place to store the pointers to partial indirect blocks
				1945	* @top: place to the (detached) top of branch
				1946	*
				1947	* This is a helper function used by ext3_truncate().
				1948	*
				1949	* When we do truncate() we may have to clean the ends of several
				1950	* indirect blocks but leave the blocks themselves alive. Block is
				1951	* partially truncated if some data below the new i_size is refered
				1952	* from it (and it is on the path to the first completely truncated
				1953	* data block, indeed). We have to free the top of that path along
				1954	* with everything to the right of the path. Since no allocation
				1955	* past the truncation point is possible until ext3_truncate()
				1956	* finishes, we may safely do the latter, but top of branch may
				1957	* require special attention - pageout below the truncation point
				1958	* might try to populate it.
				1959	*
				1960	* We atomically detach the top of branch from the tree, store the
				1961	* block number of its root in *@top, pointers to buffer_heads of
				1962	* partially truncated blocks - in @chain[].bh and pointers to
				1963	* their last elements that should not be removed - in
				1964	* @chain[].p. Return value is the pointer to last filled element
				1965	* of @chain.
				1966	*
				1967	* The work left to caller to do the actual freeing of subtrees:
				1968	* a) free the subtree starting from *@top
				1969	* b) free the subtrees whose roots are stored in
				1970	* (@chain[i].p+1 .. end of @chain[i].bh->b_data)
				1971	* c) free the subtrees growing from the inode past the @chain[0].
				1972	* (no partially truncated stuff there). */
				1973
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	1974	static Indirect ext3_find_shared(struct inode inode, int depth,
				1975	int offsets[4], Indirect chain[4], __le32 *top)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1976	{
				1977	Indirect partial, p;
				1978	int k, err;
				1979
				1980	*top = 0;
				1981	/* Make k index the deepest non-null offest + 1 */
				1982	for (k = depth; k > 1 && !offsets[k-1]; k--)
				1983	;
				1984	partial = ext3_get_branch(inode, k, offsets, chain, &err);
				1985	/* Writer: pointers */
				1986	if (!partial)
				1987	partial = chain + k-1;
				1988	/*
				1989	* If the branch acquired continuation since we've looked at it -
				1990	* fine, it should all survive and (new) top doesn't belong to us.
				1991	*/
				1992	if (!partial->key && *partial->p)
				1993	/* Writer: end */
				1994	goto no_top;
				1995	for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
				1996	;
				1997	/*
				1998	* OK, we've found the last block that must survive. The rest of our
				1999	* branch should be detached before unlocking. However, if that rest
				2000	* of branch is all ours and does not grow immediately from the inode
				2001	* it's easier to cheat and just decrement partial->p.
				2002	*/
				2003	if (p == chain + k - 1 && p > chain) {
				2004	p->p--;
				2005	} else {
				2006	top = p->p;
				2007	/* Nope, don't do this in ext3. Must leave the tree intact */
				2008	#if 0
				2009	*p->p = 0;
				2010	#endif
				2011	}
				2012	/* Writer: end */
				2013
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2014	while(partial > p) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2015	brelse(partial->bh);
				2016	partial--;
				2017	}
				2018	no_top:
				2019	return partial;
				2020	}
				2021
				2022	/*
				2023	* Zero a number of block pointers in either an inode or an indirect block.
				2024	* If we restart the transaction we must again get write access to the
				2025	* indirect block for further modification.
				2026	*
				2027	* We release `count' blocks on disk, but (last - first) may be greater
				2028	* than `count' because there can be holes in there.
				2029	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2030	static void ext3_clear_blocks(handle_t handle, struct inode inode,
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2031	struct buffer_head *bh, ext3_fsblk_t block_to_free,
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2032	unsigned long count, __le32 first, __le32 last)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2033	{
				2034	__le32 *p;
				2035	if (try_to_extend_transaction(handle, inode)) {
				2036	if (bh) {
				2037	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				2038	ext3_journal_dirty_metadata(handle, bh);
				2039	}
				2040	ext3_mark_inode_dirty(handle, inode);
				2041	ext3_journal_test_restart(handle, inode);
				2042	if (bh) {
				2043	BUFFER_TRACE(bh, "retaking write access");
				2044	ext3_journal_get_write_access(handle, bh);
				2045	}
				2046	}
				2047
				2048	/*
				2049	* Any buffers which are on the journal will be in memory. We find
				2050	* them on the hash table so journal_revoke() will run journal_forget()
				2051	* on them. We've already detached each block from the file, so
				2052	* bforget() in journal_forget() should be safe.
				2053	*
				2054	* AKPM: turn on bforget in journal_forget()!!!
				2055	*/
				2056	for (p = first; p < last; p++) {
				2057	u32 nr = le32_to_cpu(*p);
				2058	if (nr) {
				2059	struct buffer_head *bh;
				2060
				2061	*p = 0;
				2062	bh = sb_find_get_block(inode->i_sb, nr);
				2063	ext3_forget(handle, 0, inode, bh, nr);
				2064	}
				2065	}
				2066
				2067	ext3_free_blocks(handle, inode, block_to_free, count);
				2068	}
				2069
				2070	/**
				2071	* ext3_free_data - free a list of data blocks
				2072	* @handle: handle for this transaction
				2073	* @inode: inode we are dealing with
				2074	* @this_bh: indirect buffer_head which contains @first and @last
				2075	* @first: array of block numbers
				2076	* @last: points immediately past the end of array
				2077	*
				2078	* We are freeing all blocks referred from that array (numbers are stored as
				2079	* little-endian 32-bit) and updating @inode->i_blocks appropriately.
				2080	*
				2081	* We accumulate contiguous runs of blocks to free. Conveniently, if these
				2082	* blocks are contiguous then releasing them at one time will only affect one
				2083	* or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
				2084	* actually use a lot of journal space.
				2085	*
				2086	* @this_bh will be %NULL if @first and @last point into the inode's direct
				2087	* block pointers.
				2088	*/
				2089	static void ext3_free_data(handle_t handle, struct inode inode,
				2090	struct buffer_head *this_bh,
				2091	__le32 first, __le32 last)
				2092	{
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2093	ext3_fsblk_t block_to_free = 0; /* Starting block # of a run */
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2094	unsigned long count = 0; /* Number of blocks in the run */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2095	__le32 block_to_free_p = NULL; / Pointer into inode/ind
				2096	corresponding to
				2097	block_to_free */
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2098	ext3_fsblk_t nr; /* Current block # */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2099	__le32 p; / Pointer into inode/ind
				2100	for current block */
				2101	int err;
				2102
				2103	if (this_bh) { /* For indirect block */
				2104	BUFFER_TRACE(this_bh, "get_write_access");
				2105	err = ext3_journal_get_write_access(handle, this_bh);
				2106	/* Important: if we can't update the indirect pointers
				2107	* to the blocks, we can't free them. */
				2108	if (err)
				2109	return;
				2110	}
				2111
				2112	for (p = first; p < last; p++) {
				2113	nr = le32_to_cpu(*p);
				2114	if (nr) {
				2115	/* accumulate blocks to free if they're contiguous */
				2116	if (count == 0) {
				2117	block_to_free = nr;
				2118	block_to_free_p = p;
				2119	count = 1;
				2120	} else if (nr == block_to_free + count) {
				2121	count++;
				2122	} else {
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2123	ext3_clear_blocks(handle, inode, this_bh,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2124	block_to_free,
				2125	count, block_to_free_p, p);
				2126	block_to_free = nr;
				2127	block_to_free_p = p;
				2128	count = 1;
				2129	}
				2130	}
				2131	}
				2132
				2133	if (count > 0)
				2134	ext3_clear_blocks(handle, inode, this_bh, block_to_free,
				2135	count, block_to_free_p, p);
				2136
				2137	if (this_bh) {
				2138	BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
				2139	ext3_journal_dirty_metadata(handle, this_bh);
				2140	}
				2141	}
				2142
				2143	/**
				2144	* ext3_free_branches - free an array of branches
				2145	* @handle: JBD handle for this transaction
				2146	* @inode: inode we are dealing with
				2147	* @parent_bh: the buffer_head which contains @first and @last
				2148	* @first: array of block numbers
				2149	* @last: pointer immediately past the end of array
				2150	* @depth: depth of the branches to free
				2151	*
				2152	* We are freeing all blocks refered from these branches (numbers are
				2153	* stored as little-endian 32-bit) and updating @inode->i_blocks
				2154	* appropriately.
				2155	*/
				2156	static void ext3_free_branches(handle_t handle, struct inode inode,
				2157	struct buffer_head *parent_bh,
				2158	__le32 first, __le32 last, int depth)
				2159	{
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2160	ext3_fsblk_t nr;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2161	__le32 *p;
				2162
				2163	if (is_handle_aborted(handle))
				2164	return;
				2165
				2166	if (depth--) {
				2167	struct buffer_head *bh;
				2168	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				2169	p = last;
				2170	while (--p >= first) {
				2171	nr = le32_to_cpu(*p);
				2172	if (!nr)
				2173	continue; /* A hole */
				2174
				2175	/* Go read the buffer for the next level down */
				2176	bh = sb_bread(inode->i_sb, nr);
				2177
				2178	/*
				2179	* A read failure? Report error and clear slot
				2180	* (should be rare).
				2181	*/
				2182	if (!bh) {
				2183	ext3_error(inode->i_sb, "ext3_free_branches",
Eric Sandeen	eee194e	2006-09-27 01:49:30 -0700	[diff] [blame]	2184	"Read failure, inode=%lu, block="E3FSBLK,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2185	inode->i_ino, nr);
				2186	continue;
				2187	}
				2188
				2189	/* This zaps the entire block. Bottom up. */
				2190	BUFFER_TRACE(bh, "free child branches");
				2191	ext3_free_branches(handle, inode, bh,
				2192	(__le32*)bh->b_data,
				2193	(__le32*)bh->b_data + addr_per_block,
				2194	depth);
				2195
				2196	/*
				2197	* We've probably journalled the indirect block several
				2198	* times during the truncate. But it's no longer
				2199	* needed and we now drop it from the transaction via
				2200	* journal_revoke().
				2201	*
				2202	* That's easy if it's exclusively part of this
				2203	* transaction. But if it's part of the committing
				2204	* transaction then journal_forget() will simply
				2205	* brelse() it. That means that if the underlying
				2206	* block is reallocated in ext3_get_block(),
				2207	* unmap_underlying_metadata() will find this block
				2208	* and will try to get rid of it. damn, damn.
				2209	*
				2210	* If this block has already been committed to the
				2211	* journal, a revoke record will be written. And
				2212	* revoke records must be emitted before clearing
				2213	* this block's bit in the bitmaps.
				2214	*/
				2215	ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
				2216
				2217	/*
				2218	* Everything below this this pointer has been
				2219	* released. Now let this top-of-subtree go.
				2220	*
				2221	* We want the freeing of this indirect block to be
				2222	* atomic in the journal with the updating of the
				2223	* bitmap block which owns it. So make some room in
				2224	* the journal.
				2225	*
				2226	* We zero the parent pointer after freeing its
				2227	* pointee in the bitmaps, so if extend_transaction()
				2228	* for some reason fails to put the bitmap changes and
				2229	* the release into the same transaction, recovery
				2230	* will merely complain about releasing a free block,
				2231	* rather than leaking blocks.
				2232	*/
				2233	if (is_handle_aborted(handle))
				2234	return;
				2235	if (try_to_extend_transaction(handle, inode)) {
				2236	ext3_mark_inode_dirty(handle, inode);
				2237	ext3_journal_test_restart(handle, inode);
				2238	}
				2239
				2240	ext3_free_blocks(handle, inode, nr, 1);
				2241
				2242	if (parent_bh) {
				2243	/*
				2244	* The block which we have just freed is
				2245	* pointed to by an indirect block: journal it
				2246	*/
				2247	BUFFER_TRACE(parent_bh, "get_write_access");
				2248	if (!ext3_journal_get_write_access(handle,
				2249	parent_bh)){
				2250	*p = 0;
				2251	BUFFER_TRACE(parent_bh,
				2252	"call ext3_journal_dirty_metadata");
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2253	ext3_journal_dirty_metadata(handle,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2254	parent_bh);
				2255	}
				2256	}
				2257	}
				2258	} else {
				2259	/* We have reached the bottom of the tree. */
				2260	BUFFER_TRACE(parent_bh, "free data blocks");
				2261	ext3_free_data(handle, inode, parent_bh, first, last);
				2262	}
				2263	}
				2264
				2265	/*
				2266	* ext3_truncate()
				2267	*
				2268	* We block out ext3_get_block() block instantiations across the entire
				2269	* transaction, and VFS/VM ensures that ext3_truncate() cannot run
				2270	* simultaneously on behalf of the same inode.
				2271	*
				2272	* As we work through the truncate and commmit bits of it to the journal there
				2273	* is one core, guiding principle: the file's tree must always be consistent on
				2274	* disk. We must be able to restart the truncate after a crash.
				2275	*
				2276	* The file's tree may be transiently inconsistent in memory (although it
				2277	* probably isn't), but whenever we close off and commit a journal transaction,
				2278	* the contents of (the filesystem + the journal) must be consistent and
				2279	* restartable. It's pretty simple, really: bottom up, right to left (although
				2280	* left-to-right works OK too).
				2281	*
				2282	* Note that at recovery time, journal replay occurs before the restart of
				2283	* truncate against the orphan inode list.
				2284	*
				2285	* The committed inode has the new, desired i_size (which is the same as
				2286	* i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
				2287	* that this inode's truncate did not complete and it will again call
				2288	* ext3_truncate() to have another go. So there will be instantiated blocks
				2289	* to the right of the truncation point in a crashed ext3 filesystem. But
				2290	* that's fine - as long as they are linked from the inode, the post-crash
				2291	* ext3_truncate() run will find them and release them.
				2292	*/
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2293	void ext3_truncate(struct inode *inode)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2294	{
				2295	handle_t *handle;
				2296	struct ext3_inode_info *ei = EXT3_I(inode);
				2297	__le32 *i_data = ei->i_data;
				2298	int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
				2299	struct address_space *mapping = inode->i_mapping;
				2300	int offsets[4];
				2301	Indirect chain[4];
				2302	Indirect *partial;
				2303	__le32 nr = 0;
				2304	int n;
				2305	long last_block;
				2306	unsigned blocksize = inode->i_sb->s_blocksize;
				2307	struct page *page;
				2308
				2309	if (!(S_ISREG(inode->i_mode) \|\| S_ISDIR(inode->i_mode) \|\|
				2310	S_ISLNK(inode->i_mode)))
				2311	return;
				2312	if (ext3_inode_is_fast_symlink(inode))
				2313	return;
				2314	if (IS_APPEND(inode) \|\| IS_IMMUTABLE(inode))
				2315	return;
				2316
				2317	/*
				2318	* We have to lock the EOF page here, because lock_page() nests
				2319	* outside journal_start().
				2320	*/
				2321	if ((inode->i_size & (blocksize - 1)) == 0) {
				2322	/* Block boundary? Nothing to do */
				2323	page = NULL;
				2324	} else {
				2325	page = grab_cache_page(mapping,
				2326	inode->i_size >> PAGE_CACHE_SHIFT);
				2327	if (!page)
				2328	return;
				2329	}
				2330
				2331	handle = start_transaction(inode);
				2332	if (IS_ERR(handle)) {
				2333	if (page) {
				2334	clear_highpage(page);
				2335	flush_dcache_page(page);
				2336	unlock_page(page);
				2337	page_cache_release(page);
				2338	}
				2339	return; /* AKPM: return what? */
				2340	}
				2341
				2342	last_block = (inode->i_size + blocksize-1)
				2343	>> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
				2344
				2345	if (page)
				2346	ext3_block_truncate_page(handle, page, mapping, inode->i_size);
				2347
				2348	n = ext3_block_to_path(inode, last_block, offsets, NULL);
				2349	if (n == 0)
				2350	goto out_stop; /* error */
				2351
				2352	/*
				2353	* OK. This truncate is going to happen. We add the inode to the
				2354	* orphan list, so that if this truncate spans multiple transactions,
				2355	* and we crash, we will resume the truncate when the filesystem
				2356	* recovers. It also marks the inode dirty, to catch the new size.
				2357	*
				2358	* Implication: the file must always be in a sane, consistent
				2359	* truncatable state while each transaction commits.
				2360	*/
				2361	if (ext3_orphan_add(handle, inode))
				2362	goto out_stop;
				2363
				2364	/*
				2365	* The orphan list entry will now protect us from any crash which
				2366	* occurs before the truncate completes, so it is now safe to propagate
				2367	* the new, shorter inode size (held for now in i_size) into the
				2368	* on-disk inode. We do this via i_disksize, which is the value which
				2369	* ext3 really writes onto the disk inode.
				2370	*/
				2371	ei->i_disksize = inode->i_size;
				2372
				2373	/*
				2374	* From here we block out all ext3_get_block() callers who want to
				2375	* modify the block allocation tree.
				2376	*/
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	2377	mutex_lock(&ei->truncate_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2378
				2379	if (n == 1) { /* direct blocks */
				2380	ext3_free_data(handle, inode, NULL, i_data+offsets[0],
				2381	i_data + EXT3_NDIR_BLOCKS);
				2382	goto do_indirects;
				2383	}
				2384
				2385	partial = ext3_find_shared(inode, n, offsets, chain, &nr);
				2386	/* Kill the top of shared branch (not detached) */
				2387	if (nr) {
				2388	if (partial == chain) {
				2389	/* Shared branch grows from the inode */
				2390	ext3_free_branches(handle, inode, NULL,
				2391	&nr, &nr+1, (chain+n-1) - partial);
				2392	*partial->p = 0;
				2393	/*
				2394	* We mark the inode dirty prior to restart,
				2395	* and prior to stop. No need for it here.
				2396	*/
				2397	} else {
				2398	/* Shared branch grows from an indirect block */
				2399	BUFFER_TRACE(partial->bh, "get_write_access");
				2400	ext3_free_branches(handle, inode, partial->bh,
				2401	partial->p,
				2402	partial->p+1, (chain+n-1) - partial);
				2403	}
				2404	}
				2405	/* Clear the ends of indirect blocks on the shared branch */
				2406	while (partial > chain) {
				2407	ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
				2408	(__le32*)partial->bh->b_data+addr_per_block,
				2409	(chain+n-1) - partial);
				2410	BUFFER_TRACE(partial->bh, "call brelse");
				2411	brelse (partial->bh);
				2412	partial--;
				2413	}
				2414	do_indirects:
				2415	/* Kill the remaining (whole) subtrees */
				2416	switch (offsets[0]) {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2417	default:
				2418	nr = i_data[EXT3_IND_BLOCK];
				2419	if (nr) {
				2420	ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 1);
				2421	i_data[EXT3_IND_BLOCK] = 0;
				2422	}
				2423	case EXT3_IND_BLOCK:
				2424	nr = i_data[EXT3_DIND_BLOCK];
				2425	if (nr) {
				2426	ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 2);
				2427	i_data[EXT3_DIND_BLOCK] = 0;
				2428	}
				2429	case EXT3_DIND_BLOCK:
				2430	nr = i_data[EXT3_TIND_BLOCK];
				2431	if (nr) {
				2432	ext3_free_branches(handle, inode, NULL, &nr, &nr+1, 3);
				2433	i_data[EXT3_TIND_BLOCK] = 0;
				2434	}
				2435	case EXT3_TIND_BLOCK:
				2436	;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2437	}
				2438
				2439	ext3_discard_reservation(inode);
				2440
Arjan van de Ven	9746151	2006-03-23 03:00:42 -0800	[diff] [blame]	2441	mutex_unlock(&ei->truncate_mutex);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2442	inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
				2443	ext3_mark_inode_dirty(handle, inode);
				2444
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2445	/*
				2446	* In a multi-transaction truncate, we only make the final transaction
				2447	* synchronous
				2448	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2449	if (IS_SYNC(inode))
				2450	handle->h_sync = 1;
				2451	out_stop:
				2452	/*
				2453	* If this was a simple ftruncate(), and the file will remain alive
				2454	* then we need to clear up the orphan record which we created above.
				2455	* However, if this was a real unlink then we were called by
				2456	* ext3_delete_inode(), and we allow that function to clean up the
				2457	* orphan info for us.
				2458	*/
				2459	if (inode->i_nlink)
				2460	ext3_orphan_del(handle, inode);
				2461
				2462	ext3_journal_stop(handle);
				2463	}
				2464
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2465	static ext3_fsblk_t ext3_get_inode_block(struct super_block *sb,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2466	unsigned long ino, struct ext3_iloc *iloc)
				2467	{
				2468	unsigned long desc, group_desc, block_group;
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2469	unsigned long offset;
				2470	ext3_fsblk_t block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2471	struct buffer_head *bh;
				2472	struct ext3_group_desc * gdp;
				2473
Neil Brown	2ccb48e	2006-07-30 03:03:01 -0700	[diff] [blame]	2474	if (!ext3_valid_inum(sb, ino)) {
				2475	/*
				2476	* This error is already checked for in namei.c unless we are
				2477	* looking at an NFS filehandle, in which case no error
				2478	* report is needed
				2479	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2480	return 0;
				2481	}
Neil Brown	2ccb48e	2006-07-30 03:03:01 -0700	[diff] [blame]	2482
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2483	block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
				2484	if (block_group >= EXT3_SB(sb)->s_groups_count) {
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2485	ext3_error(sb,"ext3_get_inode_block","group >= groups count");
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2486	return 0;
				2487	}
				2488	smp_rmb();
				2489	group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
				2490	desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
				2491	bh = EXT3_SB(sb)->s_group_desc[group_desc];
				2492	if (!bh) {
				2493	ext3_error (sb, "ext3_get_inode_block",
				2494	"Descriptor not loaded");
				2495	return 0;
				2496	}
				2497
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	2498	gdp = (struct ext3_group_desc *)bh->b_data;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2499	/*
				2500	* Figure out the offset within the block group inode table
				2501	*/
				2502	offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
				2503	EXT3_INODE_SIZE(sb);
				2504	block = le32_to_cpu(gdp[desc].bg_inode_table) +
				2505	(offset >> EXT3_BLOCK_SIZE_BITS(sb));
				2506
				2507	iloc->block_group = block_group;
				2508	iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
				2509	return block;
				2510	}
				2511
				2512	/*
				2513	* ext3_get_inode_loc returns with an extra refcount against the inode's
				2514	* underlying buffer_head on success. If 'in_mem' is true, we have all
				2515	* data in memory that is needed to recreate the on-disk version of this
				2516	* inode.
				2517	*/
				2518	static int __ext3_get_inode_loc(struct inode *inode,
				2519	struct ext3_iloc *iloc, int in_mem)
				2520	{
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2521	ext3_fsblk_t block;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2522	struct buffer_head *bh;
				2523
				2524	block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
				2525	if (!block)
				2526	return -EIO;
				2527
				2528	bh = sb_getblk(inode->i_sb, block);
				2529	if (!bh) {
				2530	ext3_error (inode->i_sb, "ext3_get_inode_loc",
				2531	"unable to read inode block - "
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2532	"inode=%lu, block="E3FSBLK,
				2533	inode->i_ino, block);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2534	return -EIO;
				2535	}
				2536	if (!buffer_uptodate(bh)) {
				2537	lock_buffer(bh);
				2538	if (buffer_uptodate(bh)) {
				2539	/* someone brought it uptodate while we waited */
				2540	unlock_buffer(bh);
				2541	goto has_buffer;
				2542	}
				2543
				2544	/*
				2545	* If we have all information of the inode in memory and this
				2546	* is the only valid inode in the block, we need not read the
				2547	* block.
				2548	*/
				2549	if (in_mem) {
				2550	struct buffer_head *bitmap_bh;
				2551	struct ext3_group_desc *desc;
				2552	int inodes_per_buffer;
				2553	int inode_offset, i;
				2554	int block_group;
				2555	int start;
				2556
				2557	block_group = (inode->i_ino - 1) /
				2558	EXT3_INODES_PER_GROUP(inode->i_sb);
				2559	inodes_per_buffer = bh->b_size /
				2560	EXT3_INODE_SIZE(inode->i_sb);
				2561	inode_offset = ((inode->i_ino - 1) %
				2562	EXT3_INODES_PER_GROUP(inode->i_sb));
				2563	start = inode_offset & ~(inodes_per_buffer - 1);
				2564
				2565	/* Is the inode bitmap in cache? */
				2566	desc = ext3_get_group_desc(inode->i_sb,
				2567	block_group, NULL);
				2568	if (!desc)
				2569	goto make_io;
				2570
				2571	bitmap_bh = sb_getblk(inode->i_sb,
				2572	le32_to_cpu(desc->bg_inode_bitmap));
				2573	if (!bitmap_bh)
				2574	goto make_io;
				2575
				2576	/*
				2577	* If the inode bitmap isn't in cache then the
				2578	* optimisation may end up performing two reads instead
				2579	* of one, so skip it.
				2580	*/
				2581	if (!buffer_uptodate(bitmap_bh)) {
				2582	brelse(bitmap_bh);
				2583	goto make_io;
				2584	}
				2585	for (i = start; i < start + inodes_per_buffer; i++) {
				2586	if (i == inode_offset)
				2587	continue;
				2588	if (ext3_test_bit(i, bitmap_bh->b_data))
				2589	break;
				2590	}
				2591	brelse(bitmap_bh);
				2592	if (i == start + inodes_per_buffer) {
				2593	/* all other inodes are free, so skip I/O */
				2594	memset(bh->b_data, 0, bh->b_size);
				2595	set_buffer_uptodate(bh);
				2596	unlock_buffer(bh);
				2597	goto has_buffer;
				2598	}
				2599	}
				2600
				2601	make_io:
				2602	/*
				2603	* There are other valid inodes in the buffer, this inode
				2604	* has in-inode xattrs, or we don't have this inode in memory.
				2605	* Read the block from disk.
				2606	*/
				2607	get_bh(bh);
				2608	bh->b_end_io = end_buffer_read_sync;
Jens Axboe	caa38fb	2006-07-23 01:41:26 +0200	[diff] [blame]	2609	submit_bh(READ_META, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2610	wait_on_buffer(bh);
				2611	if (!buffer_uptodate(bh)) {
				2612	ext3_error(inode->i_sb, "ext3_get_inode_loc",
				2613	"unable to read inode block - "
Mingming Cao	43d23f9	2006-06-25 05:48:07 -0700	[diff] [blame]	2614	"inode=%lu, block="E3FSBLK,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2615	inode->i_ino, block);
				2616	brelse(bh);
				2617	return -EIO;
				2618	}
				2619	}
				2620	has_buffer:
				2621	iloc->bh = bh;
				2622	return 0;
				2623	}
				2624
				2625	int ext3_get_inode_loc(struct inode inode, struct ext3_iloc iloc)
				2626	{
				2627	/* We have all inode data except xattrs in memory here. */
				2628	return __ext3_get_inode_loc(inode, iloc,
				2629	!(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
				2630	}
				2631
				2632	void ext3_set_inode_flags(struct inode *inode)
				2633	{
				2634	unsigned int flags = EXT3_I(inode)->i_flags;
				2635
				2636	inode->i_flags &= ~(S_SYNC\|S_APPEND\|S_IMMUTABLE\|S_NOATIME\|S_DIRSYNC);
				2637	if (flags & EXT3_SYNC_FL)
				2638	inode->i_flags \|= S_SYNC;
				2639	if (flags & EXT3_APPEND_FL)
				2640	inode->i_flags \|= S_APPEND;
				2641	if (flags & EXT3_IMMUTABLE_FL)
				2642	inode->i_flags \|= S_IMMUTABLE;
				2643	if (flags & EXT3_NOATIME_FL)
				2644	inode->i_flags \|= S_NOATIME;
				2645	if (flags & EXT3_DIRSYNC_FL)
				2646	inode->i_flags \|= S_DIRSYNC;
				2647	}
				2648
				2649	void ext3_read_inode(struct inode * inode)
				2650	{
				2651	struct ext3_iloc iloc;
				2652	struct ext3_inode *raw_inode;
				2653	struct ext3_inode_info *ei = EXT3_I(inode);
				2654	struct buffer_head *bh;
				2655	int block;
				2656
				2657	#ifdef CONFIG_EXT3_FS_POSIX_ACL
				2658	ei->i_acl = EXT3_ACL_NOT_CACHED;
				2659	ei->i_default_acl = EXT3_ACL_NOT_CACHED;
				2660	#endif
				2661	ei->i_block_alloc_info = NULL;
				2662
				2663	if (__ext3_get_inode_loc(inode, &iloc, 0))
				2664	goto bad_inode;
				2665	bh = iloc.bh;
				2666	raw_inode = ext3_raw_inode(&iloc);
				2667	inode->i_mode = le16_to_cpu(raw_inode->i_mode);
				2668	inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
				2669	inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
				2670	if(!(test_opt (inode->i_sb, NO_UID32))) {
				2671	inode->i_uid \|= le16_to_cpu(raw_inode->i_uid_high) << 16;
				2672	inode->i_gid \|= le16_to_cpu(raw_inode->i_gid_high) << 16;
				2673	}
				2674	inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
				2675	inode->i_size = le32_to_cpu(raw_inode->i_size);
				2676	inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
				2677	inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
				2678	inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
				2679	inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
				2680
				2681	ei->i_state = 0;
				2682	ei->i_dir_start_lookup = 0;
				2683	ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
				2684	/* We now have enough fields to check if the inode was active or not.
				2685	* This is needed because nfsd might try to access dead inodes
				2686	* the test is that same one that e2fsck uses
				2687	* NeilBrown 1999oct15
				2688	*/
				2689	if (inode->i_nlink == 0) {
				2690	if (inode->i_mode == 0 \|\|
				2691	!(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
				2692	/* this inode is deleted */
				2693	brelse (bh);
				2694	goto bad_inode;
				2695	}
				2696	/* The only unlinked inodes we let through here have
				2697	* valid i_mode and are being read by the orphan
				2698	* recovery code: that's fine, we're about to complete
				2699	* the process of deleting those. */
				2700	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2701	inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
				2702	ei->i_flags = le32_to_cpu(raw_inode->i_flags);
				2703	#ifdef EXT3_FRAGMENTS
				2704	ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
				2705	ei->i_frag_no = raw_inode->i_frag;
				2706	ei->i_frag_size = raw_inode->i_fsize;
				2707	#endif
				2708	ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
				2709	if (!S_ISREG(inode->i_mode)) {
				2710	ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
				2711	} else {
				2712	inode->i_size \|=
				2713	((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
				2714	}
				2715	ei->i_disksize = inode->i_size;
				2716	inode->i_generation = le32_to_cpu(raw_inode->i_generation);
				2717	ei->i_block_group = iloc.block_group;
				2718	/*
				2719	* NOTE! The in-memory inode i_data array is in little-endian order
				2720	* even on big-endian machines: we do NOT byteswap the block numbers!
				2721	*/
				2722	for (block = 0; block < EXT3_N_BLOCKS; block++)
				2723	ei->i_data[block] = raw_inode->i_block[block];
				2724	INIT_LIST_HEAD(&ei->i_orphan);
				2725
				2726	if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
				2727	EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
				2728	/*
				2729	* When mke2fs creates big inodes it does not zero out
				2730	* the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
				2731	* so ignore those first few inodes.
				2732	*/
				2733	ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
				2734	if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
				2735	EXT3_INODE_SIZE(inode->i_sb))
				2736	goto bad_inode;
				2737	if (ei->i_extra_isize == 0) {
				2738	/* The extra space is currently unused. Use it. */
				2739	ei->i_extra_isize = sizeof(struct ext3_inode) -
				2740	EXT3_GOOD_OLD_INODE_SIZE;
				2741	} else {
				2742	__le32 magic = (void )raw_inode +
				2743	EXT3_GOOD_OLD_INODE_SIZE +
				2744	ei->i_extra_isize;
				2745	if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
				2746	ei->i_state \|= EXT3_STATE_XATTR;
				2747	}
				2748	} else
				2749	ei->i_extra_isize = 0;
				2750
				2751	if (S_ISREG(inode->i_mode)) {
				2752	inode->i_op = &ext3_file_inode_operations;
				2753	inode->i_fop = &ext3_file_operations;
				2754	ext3_set_aops(inode);
				2755	} else if (S_ISDIR(inode->i_mode)) {
				2756	inode->i_op = &ext3_dir_inode_operations;
				2757	inode->i_fop = &ext3_dir_operations;
				2758	} else if (S_ISLNK(inode->i_mode)) {
				2759	if (ext3_inode_is_fast_symlink(inode))
				2760	inode->i_op = &ext3_fast_symlink_inode_operations;
				2761	else {
				2762	inode->i_op = &ext3_symlink_inode_operations;
				2763	ext3_set_aops(inode);
				2764	}
				2765	} else {
				2766	inode->i_op = &ext3_special_inode_operations;
				2767	if (raw_inode->i_block[0])
				2768	init_special_inode(inode, inode->i_mode,
				2769	old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2770	else
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2771	init_special_inode(inode, inode->i_mode,
				2772	new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
				2773	}
				2774	brelse (iloc.bh);
				2775	ext3_set_inode_flags(inode);
				2776	return;
				2777
				2778	bad_inode:
				2779	make_bad_inode(inode);
				2780	return;
				2781	}
				2782
				2783	/*
				2784	* Post the struct inode info into an on-disk inode location in the
				2785	* buffer-cache. This gobbles the caller's reference to the
				2786	* buffer_head in the inode location struct.
				2787	*
				2788	* The caller must have write access to iloc->bh.
				2789	*/
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2790	static int ext3_do_update_inode(handle_t *handle,
				2791	struct inode *inode,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2792	struct ext3_iloc *iloc)
				2793	{
				2794	struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
				2795	struct ext3_inode_info *ei = EXT3_I(inode);
				2796	struct buffer_head *bh = iloc->bh;
				2797	int err = 0, rc, block;
				2798
				2799	/* For fields not not tracking in the in-memory inode,
				2800	* initialise them to zero for new inodes. */
				2801	if (ei->i_state & EXT3_STATE_NEW)
				2802	memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
				2803
				2804	raw_inode->i_mode = cpu_to_le16(inode->i_mode);
				2805	if(!(test_opt(inode->i_sb, NO_UID32))) {
				2806	raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
				2807	raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
				2808	/*
				2809	* Fix up interoperability with old kernels. Otherwise, old inodes get
				2810	* re-used with the upper 16 bits of the uid/gid intact
				2811	*/
				2812	if(!ei->i_dtime) {
				2813	raw_inode->i_uid_high =
				2814	cpu_to_le16(high_16_bits(inode->i_uid));
				2815	raw_inode->i_gid_high =
				2816	cpu_to_le16(high_16_bits(inode->i_gid));
				2817	} else {
				2818	raw_inode->i_uid_high = 0;
				2819	raw_inode->i_gid_high = 0;
				2820	}
				2821	} else {
				2822	raw_inode->i_uid_low =
				2823	cpu_to_le16(fs_high2lowuid(inode->i_uid));
				2824	raw_inode->i_gid_low =
				2825	cpu_to_le16(fs_high2lowgid(inode->i_gid));
				2826	raw_inode->i_uid_high = 0;
				2827	raw_inode->i_gid_high = 0;
				2828	}
				2829	raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
				2830	raw_inode->i_size = cpu_to_le32(ei->i_disksize);
				2831	raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
				2832	raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
				2833	raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
				2834	raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
				2835	raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
				2836	raw_inode->i_flags = cpu_to_le32(ei->i_flags);
				2837	#ifdef EXT3_FRAGMENTS
				2838	raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
				2839	raw_inode->i_frag = ei->i_frag_no;
				2840	raw_inode->i_fsize = ei->i_frag_size;
				2841	#endif
				2842	raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
				2843	if (!S_ISREG(inode->i_mode)) {
				2844	raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
				2845	} else {
				2846	raw_inode->i_size_high =
				2847	cpu_to_le32(ei->i_disksize >> 32);
				2848	if (ei->i_disksize > 0x7fffffffULL) {
				2849	struct super_block *sb = inode->i_sb;
				2850	if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
				2851	EXT3_FEATURE_RO_COMPAT_LARGE_FILE) \|\|
				2852	EXT3_SB(sb)->s_es->s_rev_level ==
				2853	cpu_to_le32(EXT3_GOOD_OLD_REV)) {
				2854	/* If this is the first large file
				2855	* created, add a flag to the superblock.
				2856	*/
				2857	err = ext3_journal_get_write_access(handle,
				2858	EXT3_SB(sb)->s_sbh);
				2859	if (err)
				2860	goto out_brelse;
				2861	ext3_update_dynamic_rev(sb);
				2862	EXT3_SET_RO_COMPAT_FEATURE(sb,
				2863	EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
				2864	sb->s_dirt = 1;
				2865	handle->h_sync = 1;
				2866	err = ext3_journal_dirty_metadata(handle,
				2867	EXT3_SB(sb)->s_sbh);
				2868	}
				2869	}
				2870	}
				2871	raw_inode->i_generation = cpu_to_le32(inode->i_generation);
				2872	if (S_ISCHR(inode->i_mode) \|\| S_ISBLK(inode->i_mode)) {
				2873	if (old_valid_dev(inode->i_rdev)) {
				2874	raw_inode->i_block[0] =
				2875	cpu_to_le32(old_encode_dev(inode->i_rdev));
				2876	raw_inode->i_block[1] = 0;
				2877	} else {
				2878	raw_inode->i_block[0] = 0;
				2879	raw_inode->i_block[1] =
				2880	cpu_to_le32(new_encode_dev(inode->i_rdev));
				2881	raw_inode->i_block[2] = 0;
				2882	}
				2883	} else for (block = 0; block < EXT3_N_BLOCKS; block++)
				2884	raw_inode->i_block[block] = ei->i_data[block];
				2885
Andreas Gruenbacher	ff87b37	2005-07-07 17:57:00 -0700	[diff] [blame]	2886	if (ei->i_extra_isize)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2887	raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
				2888
				2889	BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
				2890	rc = ext3_journal_dirty_metadata(handle, bh);
				2891	if (!err)
				2892	err = rc;
				2893	ei->i_state &= ~EXT3_STATE_NEW;
				2894
				2895	out_brelse:
				2896	brelse (bh);
				2897	ext3_std_error(inode->i_sb, err);
				2898	return err;
				2899	}
				2900
				2901	/*
				2902	* ext3_write_inode()
				2903	*
				2904	* We are called from a few places:
				2905	*
				2906	* - Within generic_file_write() for O_SYNC files.
				2907	* Here, there will be no transaction running. We wait for any running
				2908	* trasnaction to commit.
				2909	*
				2910	* - Within sys_sync(), kupdate and such.
				2911	* We wait on commit, if tol to.
				2912	*
				2913	* - Within prune_icache() (PF_MEMALLOC == true)
				2914	* Here we simply return. We can't afford to block kswapd on the
				2915	* journal commit.
				2916	*
				2917	* In all cases it is actually safe for us to return without doing anything,
				2918	* because the inode has been copied into a raw inode buffer in
				2919	* ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
				2920	* knfsd.
				2921	*
				2922	* Note that we are absolutely dependent upon all inode dirtiers doing the
				2923	* right thing: they must call mark_inode_dirty() after dirtying info in
				2924	* which we are interested.
				2925	*
				2926	* It would be a bug for them to not do this. The code:
				2927	*
				2928	* mark_inode_dirty(inode)
				2929	* stuff();
				2930	* inode->i_size = expr;
				2931	*
				2932	* is in error because a kswapd-driven write_inode() could occur while
				2933	* `stuff()' is running, and the new i_size will be lost. Plus the inode
				2934	* will no longer be on the superblock's dirty inode list.
				2935	*/
				2936	int ext3_write_inode(struct inode *inode, int wait)
				2937	{
				2938	if (current->flags & PF_MEMALLOC)
				2939	return 0;
				2940
				2941	if (ext3_journal_current_handle()) {
				2942	jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
				2943	dump_stack();
				2944	return -EIO;
				2945	}
				2946
				2947	if (!wait)
				2948	return 0;
				2949
				2950	return ext3_force_commit(inode->i_sb);
				2951	}
				2952
				2953	/*
				2954	* ext3_setattr()
				2955	*
				2956	* Called from notify_change.
				2957	*
				2958	* We want to trap VFS attempts to truncate the file as soon as
				2959	* possible. In particular, we want to make sure that when the VFS
				2960	* shrinks i_size, we put the inode on the orphan list and modify
				2961	* i_disksize immediately, so that during the subsequent flushing of
				2962	* dirty pages and freeing of disk blocks, we can guarantee that any
				2963	* commit will leave the blocks being flushed in an unused state on
				2964	* disk. (On recovery, the inode will get truncated and the blocks will
				2965	* be freed, so we have a strong guarantee that no future commit will
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	2966	* leave these blocks visible to the user.)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2967	*
				2968	* Called with inode->sem down.
				2969	*/
				2970	int ext3_setattr(struct dentry dentry, struct iattr attr)
				2971	{
				2972	struct inode *inode = dentry->d_inode;
				2973	int error, rc = 0;
				2974	const unsigned int ia_valid = attr->ia_valid;
				2975
				2976	error = inode_change_ok(inode, attr);
				2977	if (error)
				2978	return error;
				2979
				2980	if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) \|\|
				2981	(ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
				2982	handle_t *handle;
				2983
				2984	/* (user+group)*(old+new) structure, inode write (sb,
				2985	* inode block, ? - but truncate inode update has it) */
Jan Kara	1f54587	2005-06-23 22:01:04 -0700	[diff] [blame]	2986	handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
				2987	EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2988	if (IS_ERR(handle)) {
				2989	error = PTR_ERR(handle);
				2990	goto err_out;
				2991	}
				2992	error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
				2993	if (error) {
				2994	ext3_journal_stop(handle);
				2995	return error;
				2996	}
				2997	/* Update corresponding info in inode so that everything is in
				2998	* one transaction */
				2999	if (attr->ia_valid & ATTR_UID)
				3000	inode->i_uid = attr->ia_uid;
				3001	if (attr->ia_valid & ATTR_GID)
				3002	inode->i_gid = attr->ia_gid;
				3003	error = ext3_mark_inode_dirty(handle, inode);
				3004	ext3_journal_stop(handle);
				3005	}
				3006
				3007	if (S_ISREG(inode->i_mode) &&
				3008	attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
				3009	handle_t *handle;
				3010
				3011	handle = ext3_journal_start(inode, 3);
				3012	if (IS_ERR(handle)) {
				3013	error = PTR_ERR(handle);
				3014	goto err_out;
				3015	}
				3016
				3017	error = ext3_orphan_add(handle, inode);
				3018	EXT3_I(inode)->i_disksize = attr->ia_size;
				3019	rc = ext3_mark_inode_dirty(handle, inode);
				3020	if (!error)
				3021	error = rc;
				3022	ext3_journal_stop(handle);
				3023	}
				3024
				3025	rc = inode_setattr(inode, attr);
				3026
				3027	/* If inode_setattr's call to ext3_truncate failed to get a
				3028	* transaction handle at all, we need to clean up the in-core
				3029	* orphan list manually. */
				3030	if (inode->i_nlink)
				3031	ext3_orphan_del(NULL, inode);
				3032
				3033	if (!rc && (ia_valid & ATTR_MODE))
				3034	rc = ext3_acl_chmod(inode);
				3035
				3036	err_out:
				3037	ext3_std_error(inode->i_sb, error);
				3038	if (!error)
				3039	error = rc;
				3040	return error;
				3041	}
				3042
				3043
				3044	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	3045	* How many blocks doth make a writepage()?
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3046	*
				3047	* With N blocks per page, it may be:
				3048	* N data blocks
				3049	* 2 indirect block
				3050	* 2 dindirect
				3051	* 1 tindirect
				3052	* N+5 bitmap blocks (from the above)
				3053	* N+5 group descriptor summary blocks
				3054	* 1 inode block
				3055	* 1 superblock.
				3056	* 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quote files
				3057	*
				3058	* 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
				3059	*
				3060	* With ordered or writeback data it's the same, less the N data blocks.
				3061	*
				3062	* If the inode's direct blocks can hold an integral number of pages then a
				3063	* page cannot straddle two indirect blocks, and we can only touch one indirect
				3064	* and dindirect block, and the "5" above becomes "3".
				3065	*
				3066	* This still overestimates under most circumstances. If we were to pass the
				3067	* start and end offsets in here as well we could do block_to_path() on each
				3068	* block and work out the exact number of indirects which are touched. Pah.
				3069	*/
				3070
				3071	static int ext3_writepage_trans_blocks(struct inode *inode)
				3072	{
				3073	int bpp = ext3_journal_blocks_per_page(inode);
				3074	int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
				3075	int ret;
				3076
				3077	if (ext3_should_journal_data(inode))
				3078	ret = 3 * (bpp + indirects) + 2;
				3079	else
				3080	ret = 2 * (bpp + indirects) + 2;
				3081
				3082	#ifdef CONFIG_QUOTA
				3083	/* We know that structure was already allocated during DQUOT_INIT so
				3084	* we will be updating only the data blocks + inodes */
Jan Kara	1f54587	2005-06-23 22:01:04 -0700	[diff] [blame]	3085	ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3086	#endif
				3087
				3088	return ret;
				3089	}
				3090
				3091	/*
				3092	* The caller must have previously called ext3_reserve_inode_write().
				3093	* Give this, we know that the caller already has write access to iloc->bh.
				3094	*/
				3095	int ext3_mark_iloc_dirty(handle_t *handle,
				3096	struct inode inode, struct ext3_iloc iloc)
				3097	{
				3098	int err = 0;
				3099
				3100	/* the do_update_inode consumes one bh->b_count */
				3101	get_bh(iloc->bh);
				3102
				3103	/* ext3_do_update_inode() does journal_dirty_metadata */
				3104	err = ext3_do_update_inode(handle, inode, iloc);
				3105	put_bh(iloc->bh);
				3106	return err;
				3107	}
				3108
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	3109	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3110	* On success, We end up with an outstanding reference count against
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	3111	* iloc->bh. This _must_ be cleaned up later.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3112	*/
				3113
				3114	int
Mingming Cao	ae6ddcc	2006-09-27 01:49:27 -0700	[diff] [blame]	3115	ext3_reserve_inode_write(handle_t handle, struct inode inode,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3116	struct ext3_iloc *iloc)
				3117	{
				3118	int err = 0;
				3119	if (handle) {
				3120	err = ext3_get_inode_loc(inode, iloc);
				3121	if (!err) {
				3122	BUFFER_TRACE(iloc->bh, "get_write_access");
				3123	err = ext3_journal_get_write_access(handle, iloc->bh);
				3124	if (err) {
				3125	brelse(iloc->bh);
				3126	iloc->bh = NULL;
				3127	}
				3128	}
				3129	}
				3130	ext3_std_error(inode->i_sb, err);
				3131	return err;
				3132	}
				3133
				3134	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	3135	* What we do here is to mark the in-core inode as clean with respect to inode
				3136	* dirtiness (it may still be data-dirty).
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3137	* This means that the in-core inode may be reaped by prune_icache
				3138	* without having to perform any I/O. This is a very good thing,
				3139	* because any task may call prune_icache - even ones which
				3140	* have a transaction open against a different journal.
				3141	*
				3142	* Is this cheating? Not really. Sure, we haven't written the
				3143	* inode out, but prune_icache isn't a user-visible syncing function.
				3144	* Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
				3145	* we start and wait on commits.
				3146	*
				3147	* Is this efficient/effective? Well, we're being nice to the system
				3148	* by cleaning up our inodes proactively so they can be reaped
				3149	* without I/O. But we are potentially leaving up to five seconds'
				3150	* worth of inodes floating about which prune_icache wants us to
				3151	* write out. One way to fix that would be to get prune_icache()
				3152	* to do a write_super() to free up some memory. It has the desired
				3153	* effect.
				3154	*/
				3155	int ext3_mark_inode_dirty(handle_t handle, struct inode inode)
				3156	{
				3157	struct ext3_iloc iloc;
				3158	int err;
				3159
				3160	might_sleep();
				3161	err = ext3_reserve_inode_write(handle, inode, &iloc);
				3162	if (!err)
				3163	err = ext3_mark_iloc_dirty(handle, inode, &iloc);
				3164	return err;
				3165	}
				3166
				3167	/*
Andrew Morton	d6859bf	2006-03-26 01:38:03 -0800	[diff] [blame]	3168	* ext3_dirty_inode() is called from __mark_inode_dirty()
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3169	*
				3170	* We're really interested in the case where a file is being extended.
				3171	* i_size has been changed by generic_commit_write() and we thus need
				3172	* to include the updated inode in the current transaction.
				3173	*
				3174	* Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
				3175	* are allocated to the file.
				3176	*
				3177	* If the inode is marked synchronous, we don't honour that here - doing
				3178	* so would cause a commit on atime updates, which we don't bother doing.
				3179	* We handle synchronous inodes at the highest possible level.
				3180	*/
				3181	void ext3_dirty_inode(struct inode *inode)
				3182	{
				3183	handle_t *current_handle = ext3_journal_current_handle();
				3184	handle_t *handle;
				3185
				3186	handle = ext3_journal_start(inode, 2);
				3187	if (IS_ERR(handle))
				3188	goto out;
				3189	if (current_handle &&
				3190	current_handle->h_transaction != handle->h_transaction) {
				3191	/* This task has a transaction open against a different fs */
				3192	printk(KERN_EMERG "%s: transactions do not match!\n",
				3193	__FUNCTION__);
				3194	} else {
				3195	jbd_debug(5, "marking dirty. outer handle=%p\n",
				3196	current_handle);
				3197	ext3_mark_inode_dirty(handle, inode);
				3198	}
				3199	ext3_journal_stop(handle);
				3200	out:
				3201	return;
				3202	}
				3203
#if 0
/*
 * Bind an inode's backing buffer_head into this transaction, to prevent
 * it from being flushed to disk early.  Unlike
 * ext3_reserve_inode_write, this leaves behind no bh reference and
 * returns no iloc structure, so the caller needs to repeat the iloc
 * lookup to mark the inode dirty later.
 *
 * Currently compiled out.  With a NULL @handle this does nothing and
 * returns 0; otherwise it returns the first error encountered.
 */
static int ext3_pin_inode(handle_t *handle, struct inode *inode)
{
	struct ext3_iloc iloc;

	int err = 0;
	if (handle) {
		err = ext3_get_inode_loc(inode, &iloc);
		if (!err) {
			BUFFER_TRACE(iloc.bh, "get_write_access");
			err = journal_get_write_access(handle, iloc.bh);
			if (!err)
				err = ext3_journal_dirty_metadata(handle,
								  iloc.bh);
			/* Drop the lookup reference whether or not dirtying worked. */
			brelse(iloc.bh);
		}
	}
	ext3_std_error(inode->i_sb, err);
	return err;
}
#endif
				3232
				3233	int ext3_change_inode_journal_flag(struct inode *inode, int val)
				3234	{
				3235	journal_t *journal;
				3236	handle_t *handle;
				3237	int err;
				3238
				3239	/*
				3240	* We have to be very careful here: changing a data block's
				3241	* journaling status dynamically is dangerous. If we write a
				3242	* data block to the journal, change the status and then delete
				3243	* that block, we risk forgetting to revoke the old log record
				3244	* from the journal and so a subsequent replay can corrupt data.
				3245	* So, first we make sure that the journal is empty and that
				3246	* nobody is changing anything.
				3247	*/
				3248
				3249	journal = EXT3_JOURNAL(inode);
				3250	if (is_journal_aborted(journal) \|\| IS_RDONLY(inode))
				3251	return -EROFS;
				3252
				3253	journal_lock_updates(journal);
				3254	journal_flush(journal);
				3255
				3256	/*
				3257	* OK, there are no updates running now, and all cached data is
				3258	* synced to disk. We are now in a completely consistent state
				3259	* which doesn't have anything in the journal, and we know that
				3260	* no filesystem updates are running, so it is safe to modify
				3261	* the inode's in-core data-journaling state flag now.
				3262	*/
				3263
				3264	if (val)
				3265	EXT3_I(inode)->i_flags \|= EXT3_JOURNAL_DATA_FL;
				3266	else
				3267	EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
				3268	ext3_set_aops(inode);
				3269
				3270	journal_unlock_updates(journal);
				3271
				3272	/* Finally we can mark the inode as dirty. */
				3273
				3274	handle = ext3_journal_start(inode, 1);
				3275	if (IS_ERR(handle))
				3276	return PTR_ERR(handle);
				3277
				3278	err = ext3_mark_inode_dirty(handle, inode);
				3279	handle->h_sync = 1;
				3280	ext3_journal_stop(handle);
				3281	ext3_std_error(inode->i_sb, err);
				3282
				3283	return err;
				3284	}