/*
 *  linux/fs/ext3/inode.c
 *
 * Copyright (C) 1992, 1993, 1994, 1995
 * Remy Card (card@masi.ibp.fr)
 * Laboratoire MASI - Institut Blaise Pascal
 * Universite Pierre et Marie Curie (Paris VI)
 *
 *  from
 *
 *  linux/fs/minix/inode.c
 *
 *  Copyright (C) 1991, 1992  Linus Torvalds
 *
 *  Goal-directed block allocation by Stephen Tweedie
 *	(sct@redhat.com), 1993, 1998
 *  Big-endian to little-endian byte-swapping/bitmaps by
 *	David S. Miller (davem@caip.rutgers.edu), 1995
 *  64-bit file support on 64-bit platforms by Jakub Jelinek
 *	(jj@sunsite.ms.mff.cuni.cz)
 *
 *  Assorted race fixes, rewrite of ext3_get_block() by Al Viro, 2000
 */
24
25#include <linux/module.h>
26#include <linux/fs.h>
27#include <linux/time.h>
28#include <linux/ext3_jbd.h>
29#include <linux/jbd.h>
30#include <linux/smp_lock.h>
31#include <linux/highuid.h>
32#include <linux/pagemap.h>
33#include <linux/quotaops.h>
34#include <linux/string.h>
35#include <linux/buffer_head.h>
36#include <linux/writeback.h>
37#include <linux/mpage.h>
38#include <linux/uio.h>
39#include "xattr.h"
40#include "acl.h"
41
static int ext3_writepage_trans_blocks(struct inode *inode);

/*
 * Test whether an inode is a fast symlink.
 */
static inline int ext3_inode_is_fast_symlink(struct inode *inode)
{
	int ea_blocks = EXT3_I(inode)->i_file_acl ?
		(inode->i_sb->s_blocksize >> 9) : 0;

	return (S_ISLNK(inode->i_mode) &&
		inode->i_blocks - ea_blocks == 0);
}
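
/*
 * For illustration: a fast symlink keeps its target string in the
 * inode's i_data area and owns no data blocks.  i_blocks counts
 * 512-byte units, so an attached xattr block on a 1K-block filesystem
 * contributes 1024 >> 9 == 2 units; such a symlink still passes the
 * test above because ea_blocks is subtracted first.
 */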

/* The ext3 forget function must perform a revoke if we are freeing data
 * which has been journaled.  Metadata (eg. indirect blocks) must be
 * revoked in all cases.
 *
 * "bh" may be NULL: a metadata block may have been freed from memory
 * but there may still be a record of it in the journal, and that record
 * still needs to be revoked.
 */

int ext3_forget(handle_t *handle, int is_metadata,
		struct inode *inode, struct buffer_head *bh,
		int blocknr)
{
	int err;

	might_sleep();

	BUFFER_TRACE(bh, "enter");

	jbd_debug(4, "forgetting bh %p: is_metadata = %d, mode %o, "
		  "data mode %lx\n",
		  bh, is_metadata, inode->i_mode,
		  test_opt(inode->i_sb, DATA_FLAGS));

	/* Never use the revoke function if we are doing full data
	 * journaling: there is no need to, and a V1 superblock won't
	 * support it.  Otherwise, only skip the revoke on un-journaled
	 * data blocks. */

	if (test_opt(inode->i_sb, DATA_FLAGS) == EXT3_MOUNT_JOURNAL_DATA ||
	    (!is_metadata && !ext3_should_journal_data(inode))) {
		if (bh) {
			BUFFER_TRACE(bh, "call journal_forget");
			return ext3_journal_forget(handle, bh);
		}
		return 0;
	}

	/*
	 * data!=journal && (is_metadata || should_journal_data(inode))
	 */
	BUFFER_TRACE(bh, "call ext3_journal_revoke");
	err = ext3_journal_revoke(handle, blocknr, bh);
	if (err)
		ext3_abort(inode->i_sb, __FUNCTION__,
			   "error %d when attempting revoke", err);
	BUFFER_TRACE(bh, "exit");
	return err;
}
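
/*
 * For illustration: the revoke matters when a journaled block is freed
 * and then reallocated as un-journaled data before the transaction
 * commits.  Without the revoke record, journal replay after a crash
 * could copy the stale journaled contents over the new data.
 */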

/*
 * Work out how many blocks we need to proceed with the next chunk of a
 * truncate transaction.
 */

static unsigned long blocks_for_truncate(struct inode *inode)
{
	unsigned long needed;

	needed = inode->i_blocks >> (inode->i_sb->s_blocksize_bits - 9);

	/* Give ourselves just enough room to cope with inodes in which
	 * i_blocks is corrupt: we've seen disk corruptions in the past
	 * which resulted in random data in an inode which looked enough
	 * like a regular file for ext3 to try to delete it.  Things
	 * will go a bit crazy if that happens, but at least we should
	 * try not to panic the whole kernel. */
	if (needed < 2)
		needed = 2;

	/* But we need to bound the transaction so we don't overflow the
	 * journal. */
	if (needed > EXT3_MAX_TRANS_DATA)
		needed = EXT3_MAX_TRANS_DATA;

	return EXT3_DATA_TRANS_BLOCKS(inode->i_sb) + needed;
}
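
/*
 * For illustration: i_blocks is in 512-byte units, so the shift above
 * converts it to filesystem blocks (>> 3 on a 4K-block filesystem).
 * The result is clamped to [2, EXT3_MAX_TRANS_DATA] and the fixed
 * per-transaction overhead of EXT3_DATA_TRANS_BLOCKS is added on top.
 */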

/*
 * Truncate transactions can be complex and absolutely huge.  So we need to
 * be able to restart the transaction at a convenient checkpoint to make
 * sure we don't overflow the journal.
 *
 * start_transaction gets us a new handle for a truncate transaction,
 * and extend_transaction tries to extend the existing one a bit.  If
 * extend fails, we need to propagate the failure up and restart the
 * transaction in the top-level truncate loop. --sct
 */

static handle_t *start_transaction(struct inode *inode)
{
	handle_t *result;

	result = ext3_journal_start(inode, blocks_for_truncate(inode));
	if (!IS_ERR(result))
		return result;

	ext3_std_error(inode->i_sb, PTR_ERR(result));
	return result;
}

/*
 * Try to extend this transaction for the purposes of truncation.
 *
 * Returns 0 if we managed to create more room.  If we can't create more
 * room, and the transaction must be restarted, we return 1.
 */
static int try_to_extend_transaction(handle_t *handle, struct inode *inode)
{
	if (handle->h_buffer_credits > EXT3_RESERVE_TRANS_BLOCKS)
		return 0;
	if (!ext3_journal_extend(handle, blocks_for_truncate(inode)))
		return 0;
	return 1;
}

/*
 * Restart the transaction associated with *handle.  This does a commit,
 * so before we call here everything must be consistently dirtied against
 * this transaction.
 */
static int ext3_journal_test_restart(handle_t *handle, struct inode *inode)
{
	jbd_debug(2, "restarting handle %p\n", handle);
	return ext3_journal_restart(handle, blocks_for_truncate(inode));
}

/*
 * Called at the last iput() if i_nlink is zero.
 */
void ext3_delete_inode(struct inode *inode)
{
	handle_t *handle;

	truncate_inode_pages(&inode->i_data, 0);

	if (is_bad_inode(inode))
		goto no_delete;

	handle = start_transaction(inode);
	if (IS_ERR(handle)) {
		/* If we're going to skip the normal cleanup, we still
		 * need to make sure that the in-core orphan linked list
		 * is properly cleaned up. */
		ext3_orphan_del(NULL, inode);
		goto no_delete;
	}

	if (IS_SYNC(inode))
		handle->h_sync = 1;
	inode->i_size = 0;
	if (inode->i_blocks)
		ext3_truncate(inode);
	/*
	 * Kill off the orphan record which ext3_truncate created.
	 * AKPM: I think this can be inside the above `if'.
	 * Note that ext3_orphan_del() has to be able to cope with the
	 * deletion of a non-existent orphan - this is because we don't
	 * know if ext3_truncate() actually created an orphan record.
	 * (Well, we could do this if we need to, but heck - it works)
	 */
	ext3_orphan_del(handle, inode);
	EXT3_I(inode)->i_dtime = get_seconds();

	/*
	 * One subtle ordering requirement: if anything has gone wrong
	 * (transaction abort, IO errors, whatever), then we can still
	 * do these next steps (the fs will already have been marked as
	 * having errors), but we can't free the inode if the mark_dirty
	 * fails.
	 */
	if (ext3_mark_inode_dirty(handle, inode))
		/* If that failed, just do the required in-core inode clear. */
		clear_inode(inode);
	else
		ext3_free_inode(handle, inode);
	ext3_journal_stop(handle);
	return;
no_delete:
	clear_inode(inode);	/* We must guarantee clearing of inode... */
}

typedef struct {
	__le32	*p;
	__le32	key;
	struct buffer_head *bh;
} Indirect;

static inline void add_chain(Indirect *p, struct buffer_head *bh, __le32 *v)
{
	p->key = *(p->p = v);
	p->bh = bh;
}

static inline int verify_chain(Indirect *from, Indirect *to)
{
	while (from <= to && from->key == *from->p)
		from++;
	return (from > to);
}
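
/*
 * For illustration: add_chain() caches both the pointer location and the
 * value read from it, so verify_chain() can later re-read each *p and
 * compare it against the cached key; any mismatch means a concurrent
 * truncate (or allocation) modified the path after we read it.
 */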

/**
 *	ext3_block_to_path - parse the block number into array of offsets
 *	@inode: inode in question (we are only interested in its superblock)
 *	@i_block: block number to be parsed
 *	@offsets: array to store the offsets in
 *	@boundary: set this non-zero if the referred-to block is likely to be
 *		followed (on disk) by an indirect block.
 *
 *	To store the locations of a file's data, ext3 uses a data structure
 *	common for UNIX filesystems - a tree of pointers anchored in the
 *	inode, with data blocks at leaves and indirect blocks in intermediate
 *	nodes.  This function translates the block number into a path in that
 *	tree - the return value is the path length and @offsets[n] is the
 *	offset of the pointer to the (n+1)th node in the nth one.  If
 *	@i_block is out of range (negative or too large), a warning is
 *	printed and zero is returned.
 *
 *	Note: function doesn't find node addresses, so no IO is needed.  All
 *	we need to know is the capacity of indirect blocks (taken from the
 *	inode->i_sb).
 */

/*
 * Portability note: the last comparison (check that we fit into triple
 * indirect block) is spelled differently, because otherwise on an
 * architecture with 32-bit longs and 8Kb pages we might get into trouble
 * if our filesystem had 8Kb blocks.  We might use long long, but that would
 * kill us on x86.  Oh, well, at least the sign propagation does not matter -
 * i_block would have to be negative in the very beginning, so we would not
 * get there at all.
 */

static int ext3_block_to_path(struct inode *inode,
			long i_block, int offsets[4], int *boundary)
{
	int ptrs = EXT3_ADDR_PER_BLOCK(inode->i_sb);
	int ptrs_bits = EXT3_ADDR_PER_BLOCK_BITS(inode->i_sb);
	const long direct_blocks = EXT3_NDIR_BLOCKS,
		indirect_blocks = ptrs,
		double_blocks = (1 << (ptrs_bits * 2));
	int n = 0;
	int final = 0;

	if (i_block < 0) {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block < 0");
	} else if (i_block < direct_blocks) {
		offsets[n++] = i_block;
		final = direct_blocks;
	} else if ((i_block -= direct_blocks) < indirect_blocks) {
		offsets[n++] = EXT3_IND_BLOCK;
		offsets[n++] = i_block;
		final = ptrs;
	} else if ((i_block -= indirect_blocks) < double_blocks) {
		offsets[n++] = EXT3_DIND_BLOCK;
		offsets[n++] = i_block >> ptrs_bits;
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else if (((i_block -= double_blocks) >> (ptrs_bits * 2)) < ptrs) {
		offsets[n++] = EXT3_TIND_BLOCK;
		offsets[n++] = i_block >> (ptrs_bits * 2);
		offsets[n++] = (i_block >> ptrs_bits) & (ptrs - 1);
		offsets[n++] = i_block & (ptrs - 1);
		final = ptrs;
	} else {
		ext3_warning(inode->i_sb, "ext3_block_to_path", "block > big");
	}
	if (boundary)
		*boundary = final - 1 - (i_block & (ptrs - 1));
	return n;
}
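
/*
 * Worked example (assuming 4K blocks, so ptrs == 1024): logical block 5
 * maps to the path {5} (direct); block 13 maps to {EXT3_IND_BLOCK, 1}
 * via the single indirect block; block 1036 is the first doubly-indirect
 * block and maps to {EXT3_DIND_BLOCK, 0, 0}.
 */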

/**
 *	ext3_get_branch - read the chain of indirect blocks leading to data
 *	@inode: inode in question
 *	@depth: depth of the chain (1 - direct pointer, etc.)
 *	@offsets: offsets of pointers in inode/indirect blocks
 *	@chain: place to store the result
 *	@err: here we store the error value
 *
 *	Function fills the array of triples <key, p, bh> and returns %NULL
 *	if everything went OK or the pointer to the last filled triple
 *	(incomplete one) otherwise.  Upon the return chain[i].key contains
 *	the number of (i+1)-th block in the chain (as it is stored in memory,
 *	i.e. little-endian 32-bit), chain[i].p contains the address of that
 *	number (it points into struct inode for i==0 and into the bh->b_data
 *	for i>0) and chain[i].bh points to the buffer_head of i-th indirect
 *	block for i>0 and NULL for i==0.  In other words, it holds the block
 *	numbers of the chain, the addresses they were taken from (and where
 *	we can verify that the chain did not change) and the buffer_heads
 *	hosting these numbers.
 *
 *	Function stops when it stumbles upon a zero pointer (absent block)
 *		(pointer to last triple returned, *@err == 0)
 *	or when it gets an IO error reading an indirect block
 *		(ditto, *@err == -EIO)
 *	or when it notices that the chain had been changed while it was reading
 *		(ditto, *@err == -EAGAIN)
 *	or when it reads all @depth-1 indirect blocks successfully and finds
 *	the whole chain, all the way to the data (returns %NULL, *@err == 0).
 */
static Indirect *ext3_get_branch(struct inode *inode, int depth, int *offsets,
				 Indirect chain[4], int *err)
{
	struct super_block *sb = inode->i_sb;
	Indirect *p = chain;
	struct buffer_head *bh;

	*err = 0;
	/* i_data is not going away, no lock needed */
	add_chain(chain, NULL, EXT3_I(inode)->i_data + *offsets);
	if (!p->key)
		goto no_block;
	while (--depth) {
		bh = sb_bread(sb, le32_to_cpu(p->key));
		if (!bh)
			goto failure;
		/* Reader: pointers */
		if (!verify_chain(chain, p))
			goto changed;
		add_chain(++p, bh, (__le32 *)bh->b_data + *++offsets);
		/* Reader: end */
		if (!p->key)
			goto no_block;
	}
	return NULL;

changed:
	brelse(bh);
	*err = -EAGAIN;
	goto no_block;
failure:
	*err = -EIO;
no_block:
	return p;
}

/**
 *	ext3_find_near - find a place for allocation with sufficient locality
 *	@inode: owner
 *	@ind: descriptor of indirect block.
 *
 *	This function returns the preferred place for block allocation.
 *	It is used when the heuristic for sequential allocation fails.
 *	Rules are:
 *	  + if there is a block to the left of our position - allocate near it.
 *	  + if pointer will live in indirect block - allocate near that block.
 *	  + if pointer will live in inode - allocate in the same
 *	    cylinder group.
 *
 *	In the latter case we colour the starting block by the caller's PID to
 *	prevent it from clashing with concurrent allocations for a different
 *	inode in the same block group.  The PID is used here so that
 *	functionally related files will be close-by on-disk.
 *
 *	Caller must make sure that @ind is valid and will stay that way.
 */

static unsigned long ext3_find_near(struct inode *inode, Indirect *ind)
{
	struct ext3_inode_info *ei = EXT3_I(inode);
	__le32 *start = ind->bh ? (__le32 *) ind->bh->b_data : ei->i_data;
	__le32 *p;
	unsigned long bg_start;
	unsigned long colour;

	/* Try to find previous block */
	for (p = ind->p - 1; p >= start; p--)
		if (*p)
			return le32_to_cpu(*p);

	/* No such thing, so let's try location of indirect block */
	if (ind->bh)
		return ind->bh->b_blocknr;

	/*
	 * Is it going to be referred to from the inode itself?  OK, just
	 * put it into the same cylinder group then.
	 */
	bg_start = (ei->i_block_group * EXT3_BLOCKS_PER_GROUP(inode->i_sb)) +
		le32_to_cpu(EXT3_SB(inode->i_sb)->s_es->s_first_data_block);
	colour = (current->pid % 16) *
			(EXT3_BLOCKS_PER_GROUP(inode->i_sb) / 16);
	return bg_start + colour;
}
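
/*
 * For illustration: with the common 32768 blocks per group, the colour
 * term places each of the 16 PID classes 2048 blocks apart, e.g.
 * current->pid == 4321 gives (4321 % 16) * 2048 == 2048 blocks past the
 * start of the inode's block group.
 */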

/**
 *	ext3_find_goal - find a preferred place for allocation.
 *	@inode: owner
 *	@block: block we want
 *	@chain: chain of indirect blocks
 *	@partial: pointer to the last triple within a chain
 *
 *	Normally this function finds the preferred place for block
 *	allocation and returns it.
 */

static unsigned long ext3_find_goal(struct inode *inode, long block,
		Indirect chain[4], Indirect *partial)
{
	struct ext3_block_alloc_info *block_i =
				EXT3_I(inode)->i_block_alloc_info;

	/*
	 * try the heuristic for sequential allocation,
	 * failing that at least try to get decent locality.
	 */
	if (block_i && (block == block_i->last_alloc_logical_block + 1)
		&& (block_i->last_alloc_physical_block != 0)) {
		return block_i->last_alloc_physical_block + 1;
	}

	return ext3_find_near(inode, partial);
}

/**
 *	ext3_blks_to_allocate - look up the block map and count the number
 *	of direct blocks that need to be allocated for the given branch.
 *
 *	@branch: chain of indirect blocks
 *	@k: number of blocks needed for indirect blocks
 *	@blks: number of data blocks to be mapped.
 *	@blocks_to_boundary: the offset in the indirect block
 *
 *	return the total number of blocks to be allocated, including the
 *	direct and indirect blocks.
 */
static int
ext3_blks_to_allocate(Indirect *branch, int k, unsigned long blks,
		int blocks_to_boundary)
{
	unsigned long count = 0;

	/*
	 * Simple case: the [t,d]indirect block(s) along the path have not
	 * been allocated yet, so clearly no blocks on that path have been
	 * allocated either.
	 */
	if (k > 0) {
		/* right now we don't handle cross-boundary allocation */
		if (blks < blocks_to_boundary + 1)
			count += blks;
		else
			count += blocks_to_boundary + 1;
		return count;
	}

	count++;
	while (count < blks && count <= blocks_to_boundary &&
		le32_to_cpu(*(branch[0].p + count)) == 0) {
		count++;
	}
	return count;
}
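
/*
 * For illustration: if an indirect block on the path is missing (k > 0),
 * no direct blocks behind it can exist yet, so we may allocate up to the
 * indirect-block boundary in one go.  If the path is complete (k == 0),
 * we instead scan the existing indirect block for a run of empty (zero)
 * slots following the target entry.
 */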

/**
 *	ext3_alloc_blocks - allocate the blocks needed for a branch
 *	@indirect_blks: the number of blocks that need to be allocated for
 *	the indirect blocks
 *
 *	@new_blocks: on return it will store the new block numbers for
 *	the indirect blocks (if needed) and the first direct block,
 *	@blks: on return it will store the total number of allocated
 *	direct blocks
 */
static int ext3_alloc_blocks(handle_t *handle, struct inode *inode,
			unsigned long goal, int indirect_blks, int blks,
			unsigned long long new_blocks[4], int *err)
{
	int target, i;
	unsigned long count = 0;
	int index = 0;
	unsigned long current_block = 0;
	int ret = 0;

	/*
	 * Here we try to allocate the requested multiple blocks at once,
	 * on a best-effort basis.
	 * To build a branch, we should allocate blocks for
	 * the indirect blocks (if not allocated yet), and at least
	 * the first direct block of this branch.  That's the
	 * minimum number of blocks we need to allocate (required).
	 */
	target = blks + indirect_blks;

	while (1) {
		count = target;
		/* allocating blocks for indirect blocks and direct blocks */
		current_block = ext3_new_blocks(handle, inode, goal,
						&count, err);
		if (*err)
			goto failed_out;

		target -= count;
		/* allocate blocks for indirect blocks */
		while (index < indirect_blks && count) {
			new_blocks[index++] = current_block++;
			count--;
		}

		if (count > 0)
			break;
	}

	/* save the new block number for the first direct block */
	new_blocks[index] = current_block;

	/* total number of blocks allocated for direct blocks */
	ret = count;
	*err = 0;
	return ret;
failed_out:
	for (i = 0; i < index; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);
	return ret;
}
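
/*
 * For illustration: the loop keeps calling ext3_new_blocks() until every
 * missing indirect block has a new block number; whatever is allocated
 * beyond that in the final call (count) becomes the contiguous run of
 * direct blocks, and its length is returned to the caller.
 */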

/**
 *	ext3_alloc_branch - allocate and set up a chain of blocks.
 *	@inode: owner
 *	@indirect_blks: number of allocated indirect blocks
 *	@blks: number of allocated direct blocks
 *	@offsets: offsets (in the blocks) to store the pointers to next.
 *	@branch: place to store the chain in.
 *
 *	This function allocates blocks, zeroes out all but the last one,
 *	links them into a chain and (if we are synchronous) writes them to
 *	disk.  In other words, it prepares a branch that can be spliced onto
 *	the inode.  It stores the information about that chain in branch[],
 *	in the same format as ext3_get_branch() would do.  We are calling it
 *	after we had read the existing part of the chain and partial points
 *	to the last triple of that (one with zero ->key).  Upon exit we have
 *	the same picture as after the successful ext3_get_block(), except
 *	that in one place the chain is disconnected - *branch->p is still
 *	zero (we did not set the last link), but branch->key contains the
 *	number that should be placed into *branch->p to fill that gap.
 *
 *	If allocation fails we free all blocks we've allocated (and forget
 *	their buffer_heads) and return the error value from the failed
 *	ext3_alloc_block() (normally -ENOSPC).  Otherwise we set the chain
 *	as described above and return 0.
 */

static int ext3_alloc_branch(handle_t *handle, struct inode *inode,
			int indirect_blks, int *blks, unsigned long goal,
			int *offsets, Indirect *branch)
{
	int blocksize = inode->i_sb->s_blocksize;
	int i, n = 0;
	int err = 0;
	struct buffer_head *bh;
	int num;
	unsigned long long new_blocks[4];
	unsigned long long current_block;

	num = ext3_alloc_blocks(handle, inode, goal, indirect_blks,
				*blks, new_blocks, &err);
	if (err)
		return err;

	branch[0].key = cpu_to_le32(new_blocks[0]);
	/*
	 * metadata blocks and data blocks are allocated.
	 */
	for (n = 1; n <= indirect_blks; n++) {
		/*
		 * Get buffer_head for parent block, zero it out
		 * and set the pointer to new one, then send
		 * parent to disk.
		 */
		bh = sb_getblk(inode->i_sb, new_blocks[n-1]);
		branch[n].bh = bh;
		lock_buffer(bh);
		BUFFER_TRACE(bh, "call get_create_access");
		err = ext3_journal_get_create_access(handle, bh);
		if (err) {
			unlock_buffer(bh);
			brelse(bh);
			goto failed;
		}

		memset(bh->b_data, 0, blocksize);
		branch[n].p = (__le32 *) bh->b_data + offsets[n];
		branch[n].key = cpu_to_le32(new_blocks[n]);
		*branch[n].p = branch[n].key;
		if (n == indirect_blks) {
			current_block = new_blocks[n];
			/*
			 * End of chain, update the last new metablock of
			 * the chain to point to the newly allocated
			 * data block numbers
			 */
			for (i = 1; i < num; i++)
				*(branch[n].p + i) =
					cpu_to_le32(++current_block);
		}
		BUFFER_TRACE(bh, "marking uptodate");
		set_buffer_uptodate(bh);
		unlock_buffer(bh);

		BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, bh);
		if (err)
			goto failed;
	}
	*blks = num;
	return err;
failed:
	/* Allocation failed, free what we already allocated */
	for (i = 1; i <= n; i++) {
		BUFFER_TRACE(branch[i].bh, "call journal_forget");
		ext3_journal_forget(handle, branch[i].bh);
	}
	for (i = 0; i < indirect_blks; i++)
		ext3_free_blocks(handle, inode, new_blocks[i], 1);

	ext3_free_blocks(handle, inode, new_blocks[i], num);

	return err;
}

/**
 *	ext3_splice_branch - splice the allocated branch onto inode.
 *	@inode: owner
 *	@block: (logical) number of block we are adding
 *	@chain: chain of indirect blocks (with a missing link - see
 *		ext3_alloc_branch)
 *	@where: location of missing link
 *	@num: number of indirect blocks we are adding
 *	@blks: number of direct blocks we are adding
 *
 *	This function fills the missing link and does all housekeeping needed
 *	in inode (->i_blocks, etc.).  In case of success we end up with the
 *	full chain to the new block and return 0.
 */

static int ext3_splice_branch(handle_t *handle, struct inode *inode,
			long block, Indirect *where, int num, int blks)
{
	int i;
	int err = 0;
	struct ext3_block_alloc_info *block_i =
				EXT3_I(inode)->i_block_alloc_info;
	unsigned long current_block;

	/*
	 * If we're splicing into a [td]indirect block (as opposed to the
	 * inode) then we need to get write access to the [td]indirect block
	 * before the splice.
	 */
	if (where->bh) {
		BUFFER_TRACE(where->bh, "get_write_access");
		err = ext3_journal_get_write_access(handle, where->bh);
		if (err)
			goto err_out;
	}
	/* That's it */

	*where->p = where->key;

	/*
	 * Update the host buffer_head or inode to point to the
	 * just-allocated direct blocks.
	 */
	if (num == 0 && blks > 1) {
		current_block = le32_to_cpu(where->key) + 1;
		for (i = 1; i < blks; i++)
			*(where->p + i) = cpu_to_le32(current_block++);
	}

	/*
	 * update the most recently allocated logical & physical block
	 * in i_block_alloc_info, to assist in finding the proper goal block
	 * for the next allocation
	 */
	if (block_i) {
		block_i->last_alloc_logical_block = block + blks - 1;
		block_i->last_alloc_physical_block =
				le32_to_cpu(where[num].key) + blks - 1;
	}

	/* We are done with atomic stuff, now do the rest of housekeeping */

	inode->i_ctime = CURRENT_TIME_SEC;
	ext3_mark_inode_dirty(handle, inode);

	/* had we spliced it onto indirect block? */
	if (where->bh) {
		/*
		 * akpm: If we spliced it onto an indirect block, we haven't
		 * altered the inode.  Note however that if it is being spliced
		 * onto an indirect block at the very end of the file (the
		 * file is growing) then we *will* alter the inode to reflect
		 * the new i_size.  But that is not done here - it is done in
		 * generic_commit_write->__mark_inode_dirty->ext3_dirty_inode.
		 */
		jbd_debug(5, "splicing indirect only\n");
		BUFFER_TRACE(where->bh, "call ext3_journal_dirty_metadata");
		err = ext3_journal_dirty_metadata(handle, where->bh);
		if (err)
			goto err_out;
	} else {
		/*
		 * OK, we spliced it into the inode itself on a direct block.
		 * Inode was dirtied above.
		 */
		jbd_debug(5, "splicing direct\n");
	}
	return err;

err_out:
	for (i = 1; i <= num; i++) {
		BUFFER_TRACE(where[i].bh, "call journal_forget");
		ext3_journal_forget(handle, where[i].bh);
		ext3_free_blocks(handle, inode, le32_to_cpu(where[i-1].key), 1);
	}
	ext3_free_blocks(handle, inode, le32_to_cpu(where[num].key), blks);

	return err;
}

/*
 * Allocation strategy is simple: if we have to allocate something, we will
 * have to go the whole way to leaf.  So let's do it before attaching anything
 * to the tree, set linkage between the newborn blocks, write them if sync is
 * required, recheck the path, free and repeat if check fails, otherwise
 * set the last missing link (that will protect us from any truncate-generated
 * removals - all blocks on the path are immune now) and possibly force the
 * write on the parent block.
 * That has a nice additional property: no special recovery from the failed
 * allocations is needed - we simply release blocks and do not touch anything
 * reachable from inode.
 *
 * akpm: `handle' can be NULL if create == 0.
 *
 * The BKL may not be held on entry here.  Be sure to take it early.
 * return > 0, # of blocks mapped or allocated.
 * return = 0, if plain lookup failed.
 * return < 0, error case.
 */
int
ext3_get_blocks_handle(handle_t *handle, struct inode *inode, sector_t iblock,
		unsigned long maxblocks, struct buffer_head *bh_result,
		int create, int extend_disksize)
{
	int err = -EIO;
	int offsets[4];
	Indirect chain[4];
	Indirect *partial;
	unsigned long goal;
	int indirect_blks;
	int blocks_to_boundary = 0;
	int depth;
	struct ext3_inode_info *ei = EXT3_I(inode);
	int count = 0;
	unsigned long first_block = 0;

	J_ASSERT(handle != NULL || create == 0);
	depth = ext3_block_to_path(inode, iblock, offsets,
					&blocks_to_boundary);

	if (depth == 0)
		goto out;

	partial = ext3_get_branch(inode, depth, offsets, chain, &err);

	/* Simplest case - block found, no allocation needed */
	if (!partial) {
		first_block = le32_to_cpu(chain[depth - 1].key);
		clear_buffer_new(bh_result);
		count++;
		/* map more blocks */
		while (count < maxblocks && count <= blocks_to_boundary) {
			if (!verify_chain(chain, chain + depth - 1)) {
				/*
				 * The indirect block might have been removed
				 * by a truncate while we were reading it.
				 * Handling of that case: forget what we've
				 * got now.  Flag the error as EAGAIN, so it
				 * will reread.
				 */
				err = -EAGAIN;
				count = 0;
				break;
			}
			if (le32_to_cpu(*(chain[depth-1].p + count)) ==
					(first_block + count))
				count++;
			else
				break;
		}
		if (err != -EAGAIN)
			goto got_it;
	}

	/* Next simple case - plain lookup or failed read of indirect block */
	if (!create || err == -EIO)
		goto cleanup;

	mutex_lock(&ei->truncate_mutex);

	/*
	 * If the indirect block is missing while we are reading
	 * the chain (ext3_get_branch() returns -EAGAIN err), or
	 * if the chain has been changed after we grab the semaphore
	 * (either because another process truncated this branch, or
	 * another get_block allocated this branch), re-grab the chain to see
	 * if the requested block has been allocated or not.
	 *
	 * Since we already block the truncate/other get_block
	 * at this point, we will have the current copy of the chain when we
	 * splice the branch into the tree.
	 */
	if (err == -EAGAIN || !verify_chain(chain, partial)) {
		while (partial > chain) {
			brelse(partial->bh);
			partial--;
		}
		partial = ext3_get_branch(inode, depth, offsets, chain, &err);
		if (!partial) {
			count++;
			mutex_unlock(&ei->truncate_mutex);
			if (err)
				goto cleanup;
			clear_buffer_new(bh_result);
			goto got_it;
		}
	}

	/*
	 * Okay, we need to do block allocation.  Lazily initialize the block
	 * allocation info here if necessary
	 */
	if (S_ISREG(inode->i_mode) && (!ei->i_block_alloc_info))
		ext3_init_block_alloc_info(inode);

	goal = ext3_find_goal(inode, iblock, chain, partial);

	/* the number of blocks we need to allocate for [d,t]indirect blocks */
	indirect_blks = (chain + depth) - partial - 1;

	/*
	 * Next look up the indirect map to count the total number of
	 * direct blocks to allocate for this branch.
	 */
	count = ext3_blks_to_allocate(partial, indirect_blks,
					maxblocks, blocks_to_boundary);
	/*
	 * Block out ext3_truncate while we alter the tree
	 */
	err = ext3_alloc_branch(handle, inode, indirect_blks, &count, goal,
				offsets + (partial - chain), partial);

	/*
	 * The ext3_splice_branch call will free and forget any buffers
	 * on the new chain if there is a failure, but that risks using
	 * up transaction credits, especially for bitmaps where the
	 * credits cannot be returned.  Can we handle this somehow?  We
	 * may need to return -EAGAIN upwards in the worst case. --sct
	 */
	if (!err)
		err = ext3_splice_branch(handle, inode, iblock,
					partial, indirect_blks, count);
	/*
	 * i_disksize growing is protected by truncate_mutex.  Don't forget to
	 * protect it if you're about to implement concurrent
	 * ext3_get_block() -bzzz
	 */
	if (!err && extend_disksize && inode->i_size > ei->i_disksize)
		ei->i_disksize = inode->i_size;
	mutex_unlock(&ei->truncate_mutex);
	if (err)
		goto cleanup;

	set_buffer_new(bh_result);
got_it:
	map_bh(bh_result, inode->i_sb, le32_to_cpu(chain[depth-1].key));
	if (blocks_to_boundary == 0)
		set_buffer_boundary(bh_result);
	err = count;
	/* Clean up and exit */
	partial = chain + depth - 1;	/* the whole chain */
cleanup:
	while (partial > chain) {
		BUFFER_TRACE(partial->bh, "call brelse");
		brelse(partial->bh);
		partial--;
	}
	BUFFER_TRACE(bh_result, "returned");
out:
	return err;
}
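
/*
 * For illustration: ext3_getblk() below is a typical caller - it asks
 * for a single block (maxblocks == 1), so a return value of 1 means one
 * block was mapped into bh_result, 0 means a plain lookup miss, and a
 * negative value is an error.
 */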

#define DIO_CREDITS (EXT3_RESERVE_TRANS_BLOCKS + 32)

static int
ext3_direct_io_get_blocks(struct inode *inode, sector_t iblock,
		struct buffer_head *bh_result, int create)
{
	handle_t *handle = journal_current_handle();
	int ret = 0;
	unsigned max_blocks = bh_result->b_size >> inode->i_blkbits;

	if (!create)
		goto get_block;		/* A read */

	if (max_blocks == 1)
		goto get_block;		/* A single block get */

	if (handle->h_transaction->t_state == T_LOCKED) {
		/*
		 * Huge direct-io writes can hold off commits for long
		 * periods of time.  Let this commit run.
		 */
		ext3_journal_stop(handle);
		handle = ext3_journal_start(inode, DIO_CREDITS);
		if (IS_ERR(handle))
			ret = PTR_ERR(handle);
		goto get_block;
	}

	if (handle->h_buffer_credits <= EXT3_RESERVE_TRANS_BLOCKS) {
		/*
		 * Getting low on buffer credits...
		 */
		ret = ext3_journal_extend(handle, DIO_CREDITS);
		if (ret > 0) {
			/*
			 * Couldn't extend the transaction.  Start a new one.
			 */
			ret = ext3_journal_restart(handle, DIO_CREDITS);
		}
	}

get_block:
	if (ret == 0) {
		ret = ext3_get_blocks_handle(handle, inode, iblock,
					max_blocks, bh_result, create, 0);
		if (ret > 0) {
			bh_result->b_size = (ret << inode->i_blkbits);
			ret = 0;
		}
	}
	return ret;
}

static int ext3_get_block(struct inode *inode, sector_t iblock,
			struct buffer_head *bh_result, int create)
{
	return ext3_direct_io_get_blocks(inode, iblock, bh_result, create);
}

/*
 * `handle' can be NULL if create is zero
 */
struct buffer_head *ext3_getblk(handle_t *handle, struct inode *inode,
				long block, int create, int *errp)
{
	struct buffer_head dummy;
	int fatal = 0, err;

	J_ASSERT(handle != NULL || create == 0);

	dummy.b_state = 0;
	dummy.b_blocknr = -1000;
	buffer_trace_init(&dummy.b_history);
	err = ext3_get_blocks_handle(handle, inode, block, 1,
					&dummy, create, 1);
	if (err == 1) {
		err = 0;
	} else if (err >= 0) {
		WARN_ON(1);
		err = -EIO;
	}
	*errp = err;
	if (!err && buffer_mapped(&dummy)) {
		struct buffer_head *bh;
		bh = sb_getblk(inode->i_sb, dummy.b_blocknr);
		if (!bh) {
			*errp = -EIO;
			goto err;
		}
		if (buffer_new(&dummy)) {
			J_ASSERT(create != 0);
			J_ASSERT(handle != NULL);

			/* Now that we do not always journal data, we
			   should keep in mind whether this should
			   always journal the new buffer as metadata.
			   For now, regular file writes use
			   ext3_get_block instead, so it's not a
			   problem. */
			lock_buffer(bh);
			BUFFER_TRACE(bh, "call get_create_access");
			fatal = ext3_journal_get_create_access(handle, bh);
			if (!fatal && !buffer_uptodate(bh)) {
				memset(bh->b_data, 0,
					inode->i_sb->s_blocksize);
				set_buffer_uptodate(bh);
			}
			unlock_buffer(bh);
			BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
			err = ext3_journal_dirty_metadata(handle, bh);
			if (!fatal)
				fatal = err;
		} else {
			BUFFER_TRACE(bh, "not a new buffer");
		}
		if (fatal) {
			*errp = fatal;
			brelse(bh);
			bh = NULL;
		}
		return bh;
	}
err:
	return NULL;
}

struct buffer_head *ext3_bread(handle_t *handle, struct inode *inode,
			       int block, int create, int *err)
{
	struct buffer_head *bh;

	bh = ext3_getblk(handle, inode, block, create, err);
	if (!bh)
		return bh;
	if (buffer_uptodate(bh))
		return bh;
	ll_rw_block(READ, 1, &bh);
	wait_on_buffer(bh);
	if (buffer_uptodate(bh))
		return bh;
	put_bh(bh);
	*err = -EIO;
	return NULL;
}

static int walk_page_buffers(handle_t *handle,
			     struct buffer_head *head,
			     unsigned from,
			     unsigned to,
			     int *partial,
			     int (*fn)(handle_t *handle,
				       struct buffer_head *bh))
{
	struct buffer_head *bh;
	unsigned block_start, block_end;
	unsigned blocksize = head->b_size;
	int err, ret = 0;
	struct buffer_head *next;

	for (bh = head, block_start = 0;
	     ret == 0 && (bh != head || !block_start);
	     block_start = block_end, bh = next)
	{
		next = bh->b_this_page;
		block_end = block_start + blocksize;
		if (block_end <= from || block_start >= to) {
			if (partial && !buffer_uptodate(bh))
				*partial = 1;
			continue;
		}
		err = (*fn)(handle, bh);
		if (!ret)
			ret = err;
	}
	return ret;
}
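
/*
 * For illustration: walk_page_buffers() applies fn to every buffer of
 * the page that overlaps [from, to), remembers the first error it sees,
 * and sets *partial when some buffer outside the range is not uptodate
 * (so the caller knows the page as a whole cannot be marked uptodate).
 */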

/*
 * To preserve ordering, it is essential that the hole instantiation and
 * the data write be encapsulated in a single transaction.  We cannot
 * close off a transaction and start a new one between the ext3_get_block()
 * and the commit_write().  So doing the journal_start at the start of
 * prepare_write() is the right place.
 *
 * Also, this function can nest inside ext3_writepage() ->
 * block_write_full_page().  In that case, we *know* that ext3_writepage()
 * has generated enough buffer credits to do the whole page.  So we won't
 * block on the journal in that case, which is good, because the caller may
 * be PF_MEMALLOC.
 *
 * By accident, ext3 can be reentered when a transaction is open via
 * quota file writes.  If we were to commit the transaction while thus
 * reentered, there can be a deadlock - we would be holding a quota
 * lock, and the commit would never complete if another thread had a
 * transaction open and was blocking on the quota lock - a ranking
 * violation.
 *
 * So what we do is to rely on the fact that journal_stop/journal_start
 * will _not_ run commit under these circumstances because handle->h_ref
 * is elevated.  We'll still have enough credits for the tiny quotafile
 * write.
 */

static int do_journal_get_write_access(handle_t *handle,
				       struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	return ext3_journal_get_write_access(handle, bh);
}

static int ext3_prepare_write(struct file *file, struct page *page,
			      unsigned from, unsigned to)
{
	struct inode *inode = page->mapping->host;
	int ret, needed_blocks = ext3_writepage_trans_blocks(inode);
	handle_t *handle;
	int retries = 0;

retry:
	handle = ext3_journal_start(inode, needed_blocks);
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out;
	}
	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_prepare_write(page, from, to, ext3_get_block);
	else
		ret = block_prepare_write(page, from, to, ext3_get_block);
	if (ret)
		goto prepare_write_failed;

	if (ext3_should_journal_data(inode)) {
		ret = walk_page_buffers(handle, page_buffers(page),
				from, to, NULL, do_journal_get_write_access);
	}
prepare_write_failed:
	if (ret)
		ext3_journal_stop(handle);
	if (ret == -ENOSPC && ext3_should_retry_alloc(inode->i_sb, &retries))
		goto retry;
out:
	return ret;
}

int
ext3_journal_dirty_data(handle_t *handle, struct buffer_head *bh)
{
	int err = journal_dirty_data(handle, bh);
	if (err)
		ext3_journal_abort_handle(__FUNCTION__, __FUNCTION__,
						bh, handle, err);
	return err;
}

/* For commit_write() in data=journal mode */
static int commit_write_fn(handle_t *handle, struct buffer_head *bh)
{
	if (!buffer_mapped(bh) || buffer_freed(bh))
		return 0;
	set_buffer_uptodate(bh);
	return ext3_journal_dirty_metadata(handle, bh);
}

/*
 * We need to pick up the new inode size which generic_commit_write gave us.
 * `file' can be NULL - eg, when called from page_symlink().
 *
 * ext3 never places buffers on inode->i_mapping->private_list.  metadata
 * buffers are managed internally.
 */

static int ext3_ordered_commit_write(struct file *file, struct page *page,
				     unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;

	ret = walk_page_buffers(handle, page_buffers(page),
		from, to, NULL, ext3_journal_dirty_data);

	if (ret == 0) {
		/*
		 * generic_commit_write() will run mark_inode_dirty() if i_size
		 * changes.  So let's piggyback the i_disksize mark_inode_dirty
		 * into that.
		 */
		loff_t new_i_size;

		new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
		if (new_i_size > EXT3_I(inode)->i_disksize)
			EXT3_I(inode)->i_disksize = new_i_size;
		ret = generic_commit_write(file, page, from, to);
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_writeback_commit_write(struct file *file, struct page *page,
				       unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	loff_t new_i_size;

	new_i_size = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;
	if (new_i_size > EXT3_I(inode)->i_disksize)
		EXT3_I(inode)->i_disksize = new_i_size;

	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_commit_write(file, page, from, to);
	else
		ret = generic_commit_write(file, page, from, to);

	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

static int ext3_journalled_commit_write(struct file *file,
			struct page *page, unsigned from, unsigned to)
{
	handle_t *handle = ext3_journal_current_handle();
	struct inode *inode = page->mapping->host;
	int ret = 0, ret2;
	int partial = 0;
	loff_t pos;

	/*
	 * Here we duplicate the generic_commit_write() functionality
	 */
	pos = ((loff_t)page->index << PAGE_CACHE_SHIFT) + to;

	ret = walk_page_buffers(handle, page_buffers(page), from,
				to, &partial, commit_write_fn);
	if (!partial)
		SetPageUptodate(page);
	if (pos > inode->i_size)
		i_size_write(inode, pos);
	EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
	if (inode->i_size > EXT3_I(inode)->i_disksize) {
		EXT3_I(inode)->i_disksize = inode->i_size;
		ret2 = ext3_mark_inode_dirty(handle, inode);
		if (!ret)
			ret = ret2;
	}
	ret2 = ext3_journal_stop(handle);
	if (!ret)
		ret = ret2;
	return ret;
}

/*
 * bmap() is special.  It gets used by applications such as lilo and by
 * the swapper to find the on-disk block of a specific piece of data.
 *
 * Naturally, this is dangerous if the block concerned is still in the
 * journal.  If somebody makes a swapfile on an ext3 data-journaling
 * filesystem and enables swap, then they may get a nasty shock when the
 * data getting swapped to that swapfile suddenly gets overwritten by
 * the original zeros written out previously to the journal and
 * awaiting writeback in the kernel's buffer cache.
 *
 * So, if we see any bmap calls here on a modified, data-journaled file,
 * take extra steps to flush any blocks which might be in the cache.
 */
static sector_t ext3_bmap(struct address_space *mapping, sector_t block)
{
	struct inode *inode = mapping->host;
	journal_t *journal;
	int err;

	if (EXT3_I(inode)->i_state & EXT3_STATE_JDATA) {
		/*
		 * This is a REALLY heavyweight approach, but the use of
		 * bmap on dirty files is expected to be extremely rare:
		 * only if we run lilo or swapon on a freshly made file
		 * do we expect this to happen.
		 *
		 * (bmap requires CAP_SYS_RAWIO so this does not
		 * represent an unprivileged user DOS attack --- we'd be
		 * in trouble if mortal users could trigger this path at
		 * will.)
		 *
		 * NB. EXT3_STATE_JDATA is not set on files other than
		 * regular files.  If somebody wants to bmap a directory
		 * or symlink and gets confused because the buffer
		 * hasn't yet been flushed to disk, they deserve
		 * everything they get.
		 */

		EXT3_I(inode)->i_state &= ~EXT3_STATE_JDATA;
		journal = EXT3_JOURNAL(inode);
		journal_lock_updates(journal);
		err = journal_flush(journal);
		journal_unlock_updates(journal);

		if (err)
			return 0;
	}

	return generic_block_bmap(mapping, block, ext3_get_block);
}

static int bget_one(handle_t *handle, struct buffer_head *bh)
{
	get_bh(bh);
	return 0;
}

static int bput_one(handle_t *handle, struct buffer_head *bh)
{
	put_bh(bh);
	return 0;
}

static int journal_dirty_data_fn(handle_t *handle, struct buffer_head *bh)
{
	if (buffer_mapped(bh))
		return ext3_journal_dirty_data(handle, bh);
	return 0;
}

/*
 * Note that we always start a transaction even if we're not journalling
 * data.  This is to preserve ordering: any hole instantiation within
 * __block_write_full_page -> ext3_get_block() should be journalled
 * along with the data so we don't crash and then get metadata which
 * refers to old data.
 *
 * In all journalling modes block_write_full_page() will start the I/O.
 *
 * Problem:
 *
 * ext3_writepage() -> kmalloc() -> __alloc_pages() -> page_launder() ->
 *		ext3_writepage()
 *
 * Similar for:
 *
 * ext3_file_write() -> generic_file_write() -> __alloc_pages() -> ...
 *
 * Same applies to ext3_get_block().  We will deadlock on various things like
 * lock_journal and i_truncate_mutex.
 *
 * Setting PF_MEMALLOC here doesn't work - too many internal memory
 * allocations fail.
 *
 * 16May01: If we're reentered then journal_current_handle() will be
 *	    non-zero.  We simply *return*.
 *
 * 1 July 2001: @@@ FIXME:
 *   In journalled data mode, a data buffer may be metadata against the
 *   current transaction.  But the same file is part of a shared mapping
 *   and someone does a writepage() on it.
 *
 *   We will move the buffer onto the async_data list, but *after* it has
 *   been dirtied.  So there's a small window where we have dirty data on
 *   BJ_Metadata.
 *
 *   Note that this only applies to the last partial page in the file.  The
 *   bit which block_write_full_page() uses prepare/commit for.  (That's
 *   broken code anyway: it's wrong for msync()).
 *
 *   It's a rare case: it affects the final partial page, for journalled
 *   data, where the file is subject to both write() and writepage() in the
 *   same transaction.  To fix it we'll need a custom block_write_full_page().
 *   We'll probably need that anyway for journalling writepage() output.
 *
 * We don't honour synchronous mounts for writepage().  That would be
 * disastrous.  Any write() or metadata operation will sync the fs for
 * us.
 *
 * AKPM2: if all the page's buffers are mapped to disk and !data=journal,
 * we don't need to open a transaction here.
 */
static int ext3_ordered_writepage(struct page *page,
				  struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	struct buffer_head *page_bufs;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	J_ASSERT(PageLocked(page));

	/*
	 * We give up here if we're reentered, because it might be for a
	 * different filesystem.
	 */
	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));

	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	if (!page_has_buffers(page)) {
		create_empty_buffers(page, inode->i_sb->s_blocksize,
				(1 << BH_Dirty)|(1 << BH_Uptodate));
	}
	page_bufs = page_buffers(page);
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bget_one);

	ret = block_write_full_page(page, ext3_get_block, wbc);

	/*
	 * The page can become unlocked at any point now, and
	 * truncate can then come in and change things.  So we
	 * can't touch *page from now on.  But *page_bufs is
	 * safe due to elevated refcount.
	 */

	/*
	 * And attach them to the current transaction.  But only if
	 * block_write_full_page() succeeded.  Otherwise they are unmapped,
	 * and generally junk.
	 */
	if (ret == 0) {
		err = walk_page_buffers(handle, page_bufs, 0, PAGE_CACHE_SIZE,
					NULL, journal_dirty_data_fn);
		if (!ret)
			ret = err;
	}
	walk_page_buffers(handle, page_bufs, 0,
			PAGE_CACHE_SIZE, NULL, bput_one);
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}

static int ext3_writeback_writepage(struct page *page,
				    struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	if (ext3_journal_current_handle())
		goto out_fail;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto out_fail;
	}

	if (test_opt(inode->i_sb, NOBH))
		ret = nobh_writepage(page, ext3_get_block, wbc);
	else
		ret = block_write_full_page(page, ext3_get_block, wbc);

	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
	return ret;

out_fail:
	redirty_page_for_writepage(wbc, page);
	unlock_page(page);
	return ret;
}

static int ext3_journalled_writepage(struct page *page,
				     struct writeback_control *wbc)
{
	struct inode *inode = page->mapping->host;
	handle_t *handle = NULL;
	int ret = 0;
	int err;

	if (ext3_journal_current_handle())
		goto no_write;

	handle = ext3_journal_start(inode, ext3_writepage_trans_blocks(inode));
	if (IS_ERR(handle)) {
		ret = PTR_ERR(handle);
		goto no_write;
	}

	if (!page_has_buffers(page) || PageChecked(page)) {
		/*
		 * It's mmapped pagecache.  Add buffers and journal it.  There
		 * doesn't seem much point in redirtying the page here.
		 */
		ClearPageChecked(page);
		ret = block_prepare_write(page, 0, PAGE_CACHE_SIZE,
					ext3_get_block);
		if (ret != 0) {
			ext3_journal_stop(handle);
			goto out_unlock;
		}
		ret = walk_page_buffers(handle, page_buffers(page), 0,
			PAGE_CACHE_SIZE, NULL, do_journal_get_write_access);

		err = walk_page_buffers(handle, page_buffers(page), 0,
				PAGE_CACHE_SIZE, NULL, commit_write_fn);
		if (ret == 0)
			ret = err;
		EXT3_I(inode)->i_state |= EXT3_STATE_JDATA;
		unlock_page(page);
	} else {
		/*
		 * It may be a page full of checkpoint-mode buffers.  We don't
		 * really know unless we go poke around in the buffer_heads.
		 * But block_write_full_page will do the right thing.
		 */
		ret = block_write_full_page(page, ext3_get_block, wbc);
	}
	err = ext3_journal_stop(handle);
	if (!ret)
		ret = err;
out:
	return ret;

no_write:
	redirty_page_for_writepage(wbc, page);
out_unlock:
	unlock_page(page);
	goto out;
}
1575
1576static int ext3_readpage(struct file *file, struct page *page)
1577{
1578 return mpage_readpage(page, ext3_get_block);
1579}
1580
1581static int
1582ext3_readpages(struct file *file, struct address_space *mapping,
1583 struct list_head *pages, unsigned nr_pages)
1584{
1585 return mpage_readpages(mapping, pages, nr_pages, ext3_get_block);
1586}
1587
1588static void ext3_invalidatepage(struct page *page, unsigned long offset)
1589{
1590 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1591
1592 /*
1593 * If it's a full truncate we just forget about the pending dirtying
1594 */
1595 if (offset == 0)
1596 ClearPageChecked(page);
1597
1598 journal_invalidatepage(journal, page, offset);
1599}
1600
1601static int ext3_releasepage(struct page *page, gfp_t wait)
1602{
1603 journal_t *journal = EXT3_JOURNAL(page->mapping->host);
1604
1605 WARN_ON(PageChecked(page));
1606 if (!page_has_buffers(page))
1607 return 0;
1608 return journal_try_to_free_buffers(journal, page, wait);
1609}
1610
1611/*
1612 * If the O_DIRECT write will extend the file then add this inode to the
1613 * orphan list. So recovery will truncate it back to the original size
1614 * if the machine crashes during the write.
1615 *
1616 * If the O_DIRECT write is instantiating holes inside i_size and the machine
1617 * crashes then stale disk data _may_ be exposed inside the file.
1618 */
1619static ssize_t ext3_direct_IO(int rw, struct kiocb *iocb,
1620 const struct iovec *iov, loff_t offset,
1621 unsigned long nr_segs)
1622{
1623 struct file *file = iocb->ki_filp;
1624 struct inode *inode = file->f_mapping->host;
1625 struct ext3_inode_info *ei = EXT3_I(inode);
1626 handle_t *handle = NULL;
1627 ssize_t ret;
1628 int orphan = 0;
1629 size_t count = iov_length(iov, nr_segs);
1630
1631 if (rw == WRITE) {
1632 loff_t final_size = offset + count;
1633
1634 handle = ext3_journal_start(inode, DIO_CREDITS);
1635 if (IS_ERR(handle)) {
1636 ret = PTR_ERR(handle);
1637 goto out;
1638 }
1639 if (final_size > inode->i_size) {
1640 ret = ext3_orphan_add(handle, inode);
1641 if (ret)
1642 goto out_stop;
1643 orphan = 1;
1644 ei->i_disksize = inode->i_size;
1645 }
1646 }
1647
1648 ret = blockdev_direct_IO(rw, iocb, inode, inode->i_sb->s_bdev, iov,
1649 offset, nr_segs,
1650 ext3_direct_io_get_blocks, NULL);
1651
1652 /*
1653 * Reacquire the handle: ext3_direct_io_get_blocks() can restart the
1654 * transaction
1655 */
1656 handle = journal_current_handle();
1657
1658out_stop:
1659 if (handle) {
1660 int err;
1661
1662 if (orphan && inode->i_nlink)
1663 ext3_orphan_del(handle, inode);
1664 if (orphan && ret > 0) {
1665 loff_t end = offset + ret;
1666 if (end > inode->i_size) {
1667 ei->i_disksize = end;
1668 i_size_write(inode, end);
1669 /*
1670 * We're going to return a positive `ret'
1671 * here due to non-zero-length I/O, so there's
1672 * no way of reporting error returns from
1673 * ext3_mark_inode_dirty() to userspace. So
1674 * ignore it.
1675 */
1676 ext3_mark_inode_dirty(handle, inode);
1677 }
1678 }
1679 err = ext3_journal_stop(handle);
1680 if (ret == 0)
1681 ret = err;
1682 }
1683out:
1684 return ret;
1685}
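
/*
 * Illustrative timeline for the orphan-list logic above (a sketch,
 * assuming an O_DIRECT write that extends a 1MB file to 3MB): the
 * inode goes on the orphan list and i_disksize stays at the old 1MB
 * while blockdev_direct_IO() instantiates blocks.  If the machine
 * crashes mid-write, recovery finds the orphan record with the old
 * size and truncates the partially instantiated blocks away; on
 * success the new size is committed and the orphan record removed.
 */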
1686
1687/*
1688 * Pages can be marked dirty completely asynchronously from ext3's journalling
1689 * activity. By filemap_sync_pte(), try_to_unmap_one(), etc. We cannot do
1690 * much here because ->set_page_dirty is called under VFS locks. The page is
1691 * not necessarily locked.
1692 *
1693 * We cannot just dirty the page and leave attached buffers clean, because the
1694 * buffers' dirty state is "definitive". We cannot just set the buffers dirty
1695 * or jbddirty because all the journalling code will explode.
1696 *
1697 * So what we do is to mark the page "pending dirty" and next time writepage
1698 * is called, propagate that into the buffers appropriately.
1699 */
1700static int ext3_journalled_set_page_dirty(struct page *page)
1701{
1702 SetPageChecked(page);
1703 return __set_page_dirty_nobuffers(page);
1704}
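
/*
 * A sketch of the resulting handshake, assuming data=journal: a write
 * fault marks the page dirty via ext3_journalled_set_page_dirty(),
 * which sets PageChecked(); when writeback later calls
 * ext3_journalled_writepage(), the PageChecked() test fires, the page
 * gets buffers, and each buffer is journalled through
 * do_journal_get_write_access() and commit_write_fn().
 */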
1705
1706static struct address_space_operations ext3_ordered_aops = {
1707 .readpage = ext3_readpage,
1708 .readpages = ext3_readpages,
1709 .writepage = ext3_ordered_writepage,
1710 .sync_page = block_sync_page,
1711 .prepare_write = ext3_prepare_write,
1712 .commit_write = ext3_ordered_commit_write,
1713 .bmap = ext3_bmap,
1714 .invalidatepage = ext3_invalidatepage,
1715 .releasepage = ext3_releasepage,
1716 .direct_IO = ext3_direct_IO,
1717 .migratepage = buffer_migrate_page,
1718};
1719
1720static struct address_space_operations ext3_writeback_aops = {
1721 .readpage = ext3_readpage,
1722 .readpages = ext3_readpages,
1723 .writepage = ext3_writeback_writepage,
1724 .sync_page = block_sync_page,
1725 .prepare_write = ext3_prepare_write,
1726 .commit_write = ext3_writeback_commit_write,
1727 .bmap = ext3_bmap,
1728 .invalidatepage = ext3_invalidatepage,
1729 .releasepage = ext3_releasepage,
1730 .direct_IO = ext3_direct_IO,
1731 .migratepage = buffer_migrate_page,
1732};
1733
1734static struct address_space_operations ext3_journalled_aops = {
1735 .readpage = ext3_readpage,
1736 .readpages = ext3_readpages,
1737 .writepage = ext3_journalled_writepage,
1738 .sync_page = block_sync_page,
1739 .prepare_write = ext3_prepare_write,
1740 .commit_write = ext3_journalled_commit_write,
1741 .set_page_dirty = ext3_journalled_set_page_dirty,
1742 .bmap = ext3_bmap,
1743 .invalidatepage = ext3_invalidatepage,
1744 .releasepage = ext3_releasepage,
1745};
1746
1747void ext3_set_aops(struct inode *inode)
1748{
1749 if (ext3_should_order_data(inode))
1750 inode->i_mapping->a_ops = &ext3_ordered_aops;
1751 else if (ext3_should_writeback_data(inode))
1752 inode->i_mapping->a_ops = &ext3_writeback_aops;
1753 else
1754 inode->i_mapping->a_ops = &ext3_journalled_aops;
1755}
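
/*
 * For reference, this corresponds to the data= mount options: ordered
 * (the default) selects ext3_ordered_aops, writeback selects
 * ext3_writeback_aops, and journal selects ext3_journalled_aops.  A
 * per-inode EXT3_JOURNAL_DATA_FL also forces the journalled aops; see
 * ext3_should_journal_data().
 */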
1756
1757/*
1758 * ext3_block_truncate_page() zeroes out a mapping from file offset `from'
1759 * up to the end of the block which corresponds to `from'.
1760 * This is required during truncate. We need to physically zero the tail end
1761 * of that block so it doesn't yield old data if the file is later grown.
1762 */
1763static int ext3_block_truncate_page(handle_t *handle, struct page *page,
1764 struct address_space *mapping, loff_t from)
1765{
1766 unsigned long index = from >> PAGE_CACHE_SHIFT;
1767 unsigned offset = from & (PAGE_CACHE_SIZE-1);
1768 unsigned blocksize, iblock, length, pos;
1769 struct inode *inode = mapping->host;
1770 struct buffer_head *bh;
1771 int err = 0;
1772 void *kaddr;
1773
1774 blocksize = inode->i_sb->s_blocksize;
1775 length = blocksize - (offset & (blocksize - 1));
1776 iblock = index << (PAGE_CACHE_SHIFT - inode->i_sb->s_blocksize_bits);
1777
1778 /*
1779 * For "nobh" option, we can only work if we don't need to
1780 * read-in the page - otherwise we create buffers to do the IO.
1781 */
1782 if (!page_has_buffers(page) && test_opt(inode->i_sb, NOBH) &&
1783 ext3_should_writeback_data(inode) && PageUptodate(page)) {
1784 kaddr = kmap_atomic(page, KM_USER0);
1785 memset(kaddr + offset, 0, length);
1786 flush_dcache_page(page);
1787 kunmap_atomic(kaddr, KM_USER0);
1788 set_page_dirty(page);
1789 goto unlock;
1790 }
1791
1792 if (!page_has_buffers(page))
1793 create_empty_buffers(page, blocksize, 0);
1794
1795 /* Find the buffer that contains "offset" */
1796 bh = page_buffers(page);
1797 pos = blocksize;
1798 while (offset >= pos) {
1799 bh = bh->b_this_page;
1800 iblock++;
1801 pos += blocksize;
1802 }
1803
1804 err = 0;
1805 if (buffer_freed(bh)) {
1806 BUFFER_TRACE(bh, "freed: skip");
1807 goto unlock;
1808 }
1809
1810 if (!buffer_mapped(bh)) {
1811 BUFFER_TRACE(bh, "unmapped");
1812 ext3_get_block(inode, iblock, bh, 0);
1813 /* unmapped? It's a hole - nothing to do */
1814 if (!buffer_mapped(bh)) {
1815 BUFFER_TRACE(bh, "still unmapped");
1816 goto unlock;
1817 }
1818 }
1819
1820 /* Ok, it's mapped. Make sure it's up-to-date */
1821 if (PageUptodate(page))
1822 set_buffer_uptodate(bh);
1823
1824 if (!buffer_uptodate(bh)) {
1825 err = -EIO;
1826 ll_rw_block(READ, 1, &bh);
1827 wait_on_buffer(bh);
1828 /* Uhhuh. Read error. Complain and punt. */
1829 if (!buffer_uptodate(bh))
1830 goto unlock;
1831 }
1832
1833 if (ext3_should_journal_data(inode)) {
1834 BUFFER_TRACE(bh, "get write access");
1835 err = ext3_journal_get_write_access(handle, bh);
1836 if (err)
1837 goto unlock;
1838 }
1839
1840 kaddr = kmap_atomic(page, KM_USER0);
1841 memset(kaddr + offset, 0, length);
1842 flush_dcache_page(page);
1843 kunmap_atomic(kaddr, KM_USER0);
1844
1845 BUFFER_TRACE(bh, "zeroed end of block");
1846
1847 err = 0;
1848 if (ext3_should_journal_data(inode)) {
1849 err = ext3_journal_dirty_metadata(handle, bh);
1850 } else {
1851 if (ext3_should_order_data(inode))
1852 err = ext3_journal_dirty_data(handle, bh);
1853 mark_buffer_dirty(bh);
1854 }
1855
1856unlock:
1857 unlock_page(page);
1858 page_cache_release(page);
1859 return err;
1860}
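
/*
 * Worked example (a sketch, assuming 4K pages and a 4K blocksize):
 * truncating to i_size 5000 gives from = 5000, hence index = 1,
 * offset = 904 and length = 4096 - 904 = 3192.  Bytes 904..4095 of
 * that page (the tail of file block 1) are zeroed so that growing the
 * file again later cannot expose the stale data beyond the old EOF.
 */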
1861
1862/*
1863 * Probably it should be a library function... search for first non-zero word
1864 * or memcmp with zero_page, whatever is better for particular architecture.
1865 * Linus?
1866 */
1867static inline int all_zeroes(__le32 *p, __le32 *q)
1868{
1869 while (p < q)
1870 if (*p++)
1871 return 0;
1872 return 1;
1873}
1874
1875/**
1876 * ext3_find_shared - find the indirect blocks for partial truncation.
1877 * @inode: inode in question
1878 * @depth: depth of the affected branch
1879 * @offsets: offsets of pointers in that branch (see ext3_block_to_path)
1880 * @chain: place to store the pointers to partial indirect blocks
1881 * @top: place to the (detached) top of branch
1882 *
1883 * This is a helper function used by ext3_truncate().
1884 *
1885 * When we do truncate() we may have to clean the ends of several
1886 * indirect blocks but leave the blocks themselves alive. Block is
1887 * partially truncated if some data below the new i_size is referred
1888 * to from it (and it is on the path to the first completely truncated
1889 * data block, indeed). We have to free the top of that path along
1890 * with everything to the right of the path. Since no allocation
1891 * past the truncation point is possible until ext3_truncate()
1892 * finishes, we may safely do the latter, but top of branch may
1893 * require special attention - pageout below the truncation point
1894 * might try to populate it.
1895 *
1896 * We atomically detach the top of branch from the tree, store the
1897 * block number of its root in *@top, pointers to buffer_heads of
1898 * partially truncated blocks - in @chain[].bh and pointers to
1899 * their last elements that should not be removed - in
1900 * @chain[].p. Return value is the pointer to last filled element
1901 * of @chain.
1902 *
1903 * The work left to caller to do the actual freeing of subtrees:
1904 * a) free the subtree starting from *@top
1905 * b) free the subtrees whose roots are stored in
1906 * (@chain[i].p+1 .. end of @chain[i].bh->b_data)
1907 * c) free the subtrees growing from the inode past the @chain[0].
1908 * (no partially truncated stuff there). */
1909
1910static Indirect *ext3_find_shared(struct inode *inode,
1911 int depth,
1912 int offsets[4],
1913 Indirect chain[4],
1914 __le32 *top)
1915{
1916 Indirect *partial, *p;
1917 int k, err;
1918
1919 *top = 0;
1920 /* Make k index the deepest non-null offset + 1 */
1921 for (k = depth; k > 1 && !offsets[k-1]; k--)
1922 ;
1923 partial = ext3_get_branch(inode, k, offsets, chain, &err);
1924 /* Writer: pointers */
1925 if (!partial)
1926 partial = chain + k-1;
1927 /*
1928 * If the branch acquired continuation since we've looked at it -
1929 * fine, it should all survive and (new) top doesn't belong to us.
1930 */
1931 if (!partial->key && *partial->p)
1932 /* Writer: end */
1933 goto no_top;
1934 for (p=partial; p>chain && all_zeroes((__le32*)p->bh->b_data,p->p); p--)
1935 ;
1936 /*
1937 * OK, we've found the last block that must survive. The rest of our
1938 * branch should be detached before unlocking. However, if that rest
1939 * of branch is all ours and does not grow immediately from the inode
1940 * it's easier to cheat and just decrement partial->p.
1941 */
1942 if (p == chain + k - 1 && p > chain) {
1943 p->p--;
1944 } else {
1945 *top = *p->p;
1946 /* Nope, don't do this in ext3. Must leave the tree intact */
1947#if 0
1948 *p->p = 0;
1949#endif
1950 }
1951 /* Writer: end */
1952
1953 while (partial > p)
1954 {
1955 brelse(partial->bh);
1956 partial--;
1957 }
1958no_top:
1959 return partial;
1960}
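
/*
 * Worked example (a sketch, assuming a 1K blocksize, i.e. 256 pointers
 * per indirect block): truncating so that last_block == 100 gives
 * offsets[] = { EXT3_IND_BLOCK, 88 }.  The indirect block still maps
 * file blocks 12..99, so it must survive: no top is detached
 * (*top == 0) and chain[1].p is backed up to slot 87, so the caller
 * frees slots 88..255 of that indirect block and then the whole
 * DIND/TIND subtrees via the do_indirects switch in ext3_truncate().
 */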
1961
1962/*
1963 * Zero a number of block pointers in either an inode or an indirect block.
1964 * If we restart the transaction we must again get write access to the
1965 * indirect block for further modification.
1966 *
1967 * We release `count' blocks on disk, but (last - first) may be greater
1968 * than `count' because there can be holes in there.
1969 */
1970static void
1971ext3_clear_blocks(handle_t *handle, struct inode *inode, struct buffer_head *bh,
1972 unsigned long block_to_free, unsigned long count,
1973 __le32 *first, __le32 *last)
1974{
1975 __le32 *p;
1976 if (try_to_extend_transaction(handle, inode)) {
1977 if (bh) {
1978 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
1979 ext3_journal_dirty_metadata(handle, bh);
1980 }
1981 ext3_mark_inode_dirty(handle, inode);
1982 ext3_journal_test_restart(handle, inode);
1983 if (bh) {
1984 BUFFER_TRACE(bh, "retaking write access");
1985 ext3_journal_get_write_access(handle, bh);
1986 }
1987 }
1988
1989 /*
1990 * Any buffers which are on the journal will be in memory. We find
1991 * them on the hash table so journal_revoke() will run journal_forget()
1992 * on them. We've already detached each block from the file, so
1993 * bforget() in journal_forget() should be safe.
1994 *
1995 * AKPM: turn on bforget in journal_forget()!!!
1996 */
1997 for (p = first; p < last; p++) {
1998 u32 nr = le32_to_cpu(*p);
1999 if (nr) {
2000 struct buffer_head *bh;
2001
2002 *p = 0;
2003 bh = sb_find_get_block(inode->i_sb, nr);
2004 ext3_forget(handle, 0, inode, bh, nr);
2005 }
2006 }
2007
2008 ext3_free_blocks(handle, inode, block_to_free, count);
2009}
2010
2011/**
2012 * ext3_free_data - free a list of data blocks
2013 * @handle: handle for this transaction
2014 * @inode: inode we are dealing with
2015 * @this_bh: indirect buffer_head which contains *@first and *@last
2016 * @first: array of block numbers
2017 * @last: points immediately past the end of array
2018 *
2019 * We are freeing all blocks referred to from that array (numbers are stored as
2020 * little-endian 32-bit) and updating @inode->i_blocks appropriately.
2021 *
2022 * We accumulate contiguous runs of blocks to free. Conveniently, if these
2023 * blocks are contiguous then releasing them at one time will only affect one
2024 * or two bitmap blocks (+ group descriptor(s) and superblock) and we won't
2025 * actually use a lot of journal space.
2026 *
2027 * @this_bh will be %NULL if @first and @last point into the inode's direct
2028 * block pointers.
2029 */
2030static void ext3_free_data(handle_t *handle, struct inode *inode,
2031 struct buffer_head *this_bh,
2032 __le32 *first, __le32 *last)
2033{
2034 unsigned long block_to_free = 0; /* Starting block # of a run */
2035 unsigned long count = 0; /* Number of blocks in the run */
2036 __le32 *block_to_free_p = NULL; /* Pointer into inode/ind
2037 corresponding to
2038 block_to_free */
2039 unsigned long nr; /* Current block # */
2040 __le32 *p; /* Pointer into inode/ind
2041 for current block */
2042 int err;
2043
2044 if (this_bh) { /* For indirect block */
2045 BUFFER_TRACE(this_bh, "get_write_access");
2046 err = ext3_journal_get_write_access(handle, this_bh);
2047 /* Important: if we can't update the indirect pointers
2048 * to the blocks, we can't free them. */
2049 if (err)
2050 return;
2051 }
2052
2053 for (p = first; p < last; p++) {
2054 nr = le32_to_cpu(*p);
2055 if (nr) {
2056 /* accumulate blocks to free if they're contiguous */
2057 if (count == 0) {
2058 block_to_free = nr;
2059 block_to_free_p = p;
2060 count = 1;
2061 } else if (nr == block_to_free + count) {
2062 count++;
2063 } else {
2064 ext3_clear_blocks(handle, inode, this_bh,
2065 block_to_free,
2066 count, block_to_free_p, p);
2067 block_to_free = nr;
2068 block_to_free_p = p;
2069 count = 1;
2070 }
2071 }
2072 }
2073
2074 if (count > 0)
2075 ext3_clear_blocks(handle, inode, this_bh, block_to_free,
2076 count, block_to_free_p, p);
2077
2078 if (this_bh) {
2079 BUFFER_TRACE(this_bh, "call ext3_journal_dirty_metadata");
2080 ext3_journal_dirty_metadata(handle, this_bh);
2081 }
2082}
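
/*
 * Worked example: if *first..*last holds the block numbers
 * { 100, 101, 102, 0, 200 }, the loop above flushes one run via
 * ext3_clear_blocks() when it reaches 200 (start 100, count == 3 -
 * the hole is simply skipped) and a second run for the lone block 200
 * after the loop.
 */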
2083
2084/**
2085 * ext3_free_branches - free an array of branches
2086 * @handle: JBD handle for this transaction
2087 * @inode: inode we are dealing with
2088 * @parent_bh: the buffer_head which contains *@first and *@last
2089 * @first: array of block numbers
2090 * @last: pointer immediately past the end of array
2091 * @depth: depth of the branches to free
2092 *
2093 * We are freeing all blocks referred to from these branches (numbers are
2094 * stored as little-endian 32-bit) and updating @inode->i_blocks
2095 * appropriately.
2096 */
2097static void ext3_free_branches(handle_t *handle, struct inode *inode,
2098 struct buffer_head *parent_bh,
2099 __le32 *first, __le32 *last, int depth)
2100{
2101 unsigned long nr;
2102 __le32 *p;
2103
2104 if (is_handle_aborted(handle))
2105 return;
2106
2107 if (depth--) {
2108 struct buffer_head *bh;
2109 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2110 p = last;
2111 while (--p >= first) {
2112 nr = le32_to_cpu(*p);
2113 if (!nr)
2114 continue; /* A hole */
2115
2116 /* Go read the buffer for the next level down */
2117 bh = sb_bread(inode->i_sb, nr);
2118
2119 /*
2120 * A read failure? Report error and clear slot
2121 * (should be rare).
2122 */
2123 if (!bh) {
2124 ext3_error(inode->i_sb, "ext3_free_branches",
2125 "Read failure, inode=%ld, block=%ld",
2126 inode->i_ino, nr);
2127 continue;
2128 }
2129
2130 /* This zaps the entire block. Bottom up. */
2131 BUFFER_TRACE(bh, "free child branches");
2132 ext3_free_branches(handle, inode, bh,
2133 (__le32*)bh->b_data,
2134 (__le32*)bh->b_data + addr_per_block,
2135 depth);
2136
2137 /*
2138 * We've probably journalled the indirect block several
2139 * times during the truncate. But it's no longer
2140 * needed and we now drop it from the transaction via
2141 * journal_revoke().
2142 *
2143 * That's easy if it's exclusively part of this
2144 * transaction. But if it's part of the committing
2145 * transaction then journal_forget() will simply
2146 * brelse() it. That means that if the underlying
2147 * block is reallocated in ext3_get_block(),
2148 * unmap_underlying_metadata() will find this block
2149 * and will try to get rid of it. damn, damn.
2150 *
2151 * If this block has already been committed to the
2152 * journal, a revoke record will be written. And
2153 * revoke records must be emitted *before* clearing
2154 * this block's bit in the bitmaps.
2155 */
2156 ext3_forget(handle, 1, inode, bh, bh->b_blocknr);
2157
2158 /*
2159 * Everything below this pointer has been
2160 * released. Now let this top-of-subtree go.
2161 *
2162 * We want the freeing of this indirect block to be
2163 * atomic in the journal with the updating of the
2164 * bitmap block which owns it. So make some room in
2165 * the journal.
2166 *
2167 * We zero the parent pointer *after* freeing its
2168 * pointee in the bitmaps, so if extend_transaction()
2169 * for some reason fails to put the bitmap changes and
2170 * the release into the same transaction, recovery
2171 * will merely complain about releasing a free block,
2172 * rather than leaking blocks.
2173 */
2174 if (is_handle_aborted(handle))
2175 return;
2176 if (try_to_extend_transaction(handle, inode)) {
2177 ext3_mark_inode_dirty(handle, inode);
2178 ext3_journal_test_restart(handle, inode);
2179 }
2180
2181 ext3_free_blocks(handle, inode, nr, 1);
2182
2183 if (parent_bh) {
2184 /*
2185 * The block which we have just freed is
2186 * pointed to by an indirect block: journal it
2187 */
2188 BUFFER_TRACE(parent_bh, "get_write_access");
2189 if (!ext3_journal_get_write_access(handle,
2190 parent_bh)){
2191 *p = 0;
2192 BUFFER_TRACE(parent_bh,
2193 "call ext3_journal_dirty_metadata");
2194 ext3_journal_dirty_metadata(handle,
2195 parent_bh);
2196 }
2197 }
2198 }
2199 } else {
2200 /* We have reached the bottom of the tree. */
2201 BUFFER_TRACE(parent_bh, "free data blocks");
2202 ext3_free_data(handle, inode, parent_bh, first, last);
2203 }
2204}
2205
2206/*
2207 * ext3_truncate()
2208 *
2209 * We block out ext3_get_block() block instantiations across the entire
2210 * transaction, and VFS/VM ensures that ext3_truncate() cannot run
2211 * simultaneously on behalf of the same inode.
2212 *
2213 * As we work through the truncate and commit bits of it to the journal there
2214 * is one core guiding principle: the file's tree must always be consistent on
2215 * disk. We must be able to restart the truncate after a crash.
2216 *
2217 * The file's tree may be transiently inconsistent in memory (although it
2218 * probably isn't), but whenever we close off and commit a journal transaction,
2219 * the contents of (the filesystem + the journal) must be consistent and
2220 * restartable. It's pretty simple, really: bottom up, right to left (although
2221 * left-to-right works OK too).
2222 *
2223 * Note that at recovery time, journal replay occurs *before* the restart of
2224 * truncate against the orphan inode list.
2225 *
2226 * The committed inode has the new, desired i_size (which is the same as
2227 * i_disksize in this case). After a crash, ext3_orphan_cleanup() will see
2228 * that this inode's truncate did not complete and it will again call
2229 * ext3_truncate() to have another go. So there will be instantiated blocks
2230 * to the right of the truncation point in a crashed ext3 filesystem. But
2231 * that's fine - as long as they are linked from the inode, the post-crash
2232 * ext3_truncate() run will find them and release them.
2233 */
2234
2235void ext3_truncate(struct inode * inode)
2236{
2237 handle_t *handle;
2238 struct ext3_inode_info *ei = EXT3_I(inode);
2239 __le32 *i_data = ei->i_data;
2240 int addr_per_block = EXT3_ADDR_PER_BLOCK(inode->i_sb);
2241 struct address_space *mapping = inode->i_mapping;
2242 int offsets[4];
2243 Indirect chain[4];
2244 Indirect *partial;
2245 __le32 nr = 0;
2246 int n;
2247 long last_block;
2248 unsigned blocksize = inode->i_sb->s_blocksize;
2249 struct page *page;
2250
2251 if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
2252 S_ISLNK(inode->i_mode)))
2253 return;
2254 if (ext3_inode_is_fast_symlink(inode))
2255 return;
2256 if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
2257 return;
2258
2259 /*
2260 * We have to lock the EOF page here, because lock_page() nests
2261 * outside journal_start().
2262 */
2263 if ((inode->i_size & (blocksize - 1)) == 0) {
2264 /* Block boundary? Nothing to do */
2265 page = NULL;
2266 } else {
2267 page = grab_cache_page(mapping,
2268 inode->i_size >> PAGE_CACHE_SHIFT);
2269 if (!page)
2270 return;
2271 }
2272
2273 handle = start_transaction(inode);
2274 if (IS_ERR(handle)) {
2275 if (page) {
2276 clear_highpage(page);
2277 flush_dcache_page(page);
2278 unlock_page(page);
2279 page_cache_release(page);
2280 }
2281 return; /* AKPM: return what? */
2282 }
2283
2284 last_block = (inode->i_size + blocksize-1)
2285 >> EXT3_BLOCK_SIZE_BITS(inode->i_sb);
2286
2287 if (page)
2288 ext3_block_truncate_page(handle, page, mapping, inode->i_size);
2289
2290 n = ext3_block_to_path(inode, last_block, offsets, NULL);
2291 if (n == 0)
2292 goto out_stop; /* error */
2293
2294 /*
2295 * OK. This truncate is going to happen. We add the inode to the
2296 * orphan list, so that if this truncate spans multiple transactions,
2297 * and we crash, we will resume the truncate when the filesystem
2298 * recovers. It also marks the inode dirty, to catch the new size.
2299 *
2300 * Implication: the file must always be in a sane, consistent
2301 * truncatable state while each transaction commits.
2302 */
2303 if (ext3_orphan_add(handle, inode))
2304 goto out_stop;
2305
2306 /*
2307 * The orphan list entry will now protect us from any crash which
2308 * occurs before the truncate completes, so it is now safe to propagate
2309 * the new, shorter inode size (held for now in i_size) into the
2310 * on-disk inode. We do this via i_disksize, which is the value which
2311 * ext3 *really* writes onto the disk inode.
2312 */
2313 ei->i_disksize = inode->i_size;
2314
2315 /*
2316 * From here we block out all ext3_get_block() callers who want to
2317 * modify the block allocation tree.
2318 */
2319 mutex_lock(&ei->truncate_mutex);
2320
2321 if (n == 1) { /* direct blocks */
2322 ext3_free_data(handle, inode, NULL, i_data+offsets[0],
2323 i_data + EXT3_NDIR_BLOCKS);
2324 goto do_indirects;
2325 }
2326
2327 partial = ext3_find_shared(inode, n, offsets, chain, &nr);
2328 /* Kill the top of shared branch (not detached) */
2329 if (nr) {
2330 if (partial == chain) {
2331 /* Shared branch grows from the inode */
2332 ext3_free_branches(handle, inode, NULL,
2333 &nr, &nr+1, (chain+n-1) - partial);
2334 *partial->p = 0;
2335 /*
2336 * We mark the inode dirty prior to restart,
2337 * and prior to stop. No need for it here.
2338 */
2339 } else {
2340 /* Shared branch grows from an indirect block */
2341 BUFFER_TRACE(partial->bh, "get_write_access");
2342 ext3_free_branches(handle, inode, partial->bh,
2343 partial->p,
2344 partial->p+1, (chain+n-1) - partial);
2345 }
2346 }
2347 /* Clear the ends of indirect blocks on the shared branch */
2348 while (partial > chain) {
2349 ext3_free_branches(handle, inode, partial->bh, partial->p + 1,
2350 (__le32*)partial->bh->b_data+addr_per_block,
2351 (chain+n-1) - partial);
2352 BUFFER_TRACE(partial->bh, "call brelse");
2353 brelse (partial->bh);
2354 partial--;
2355 }
2356do_indirects:
2357 /* Kill the remaining (whole) subtrees */
2358 switch (offsets[0]) {
2359 default:
2360 nr = i_data[EXT3_IND_BLOCK];
2361 if (nr) {
2362 ext3_free_branches(handle, inode, NULL,
2363 &nr, &nr+1, 1);
2364 i_data[EXT3_IND_BLOCK] = 0;
2365 }
2366 case EXT3_IND_BLOCK:
2367 nr = i_data[EXT3_DIND_BLOCK];
2368 if (nr) {
2369 ext3_free_branches(handle, inode, NULL,
2370 &nr, &nr+1, 2);
2371 i_data[EXT3_DIND_BLOCK] = 0;
2372 }
2373 case EXT3_DIND_BLOCK:
2374 nr = i_data[EXT3_TIND_BLOCK];
2375 if (nr) {
2376 ext3_free_branches(handle, inode, NULL,
2377 &nr, &nr+1, 3);
2378 i_data[EXT3_TIND_BLOCK] = 0;
2379 }
2380 case EXT3_TIND_BLOCK:
2381 ;
2382 }
2383
2384 ext3_discard_reservation(inode);
2385
2386 mutex_unlock(&ei->truncate_mutex);
2387 inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC;
2388 ext3_mark_inode_dirty(handle, inode);
2389
2390 /* In a multi-transaction truncate, we only make the final
2391 * transaction synchronous */
2392 if (IS_SYNC(inode))
2393 handle->h_sync = 1;
2394out_stop:
2395 /*
2396 * If this was a simple ftruncate(), and the file will remain alive
2397 * then we need to clear up the orphan record which we created above.
2398 * However, if this was a real unlink then we were called by
2399 * ext3_delete_inode(), and we allow that function to clean up the
2400 * orphan info for us.
2401 */
2402 if (inode->i_nlink)
2403 ext3_orphan_del(handle, inode);
2404
2405 ext3_journal_stop(handle);
2406}
2407
2408static unsigned long ext3_get_inode_block(struct super_block *sb,
2409 unsigned long ino, struct ext3_iloc *iloc)
2410{
2411 unsigned long desc, group_desc, block_group;
2412 unsigned long offset, block;
2413 struct buffer_head *bh;
2414 struct ext3_group_desc * gdp;
2415
2416
2417 if ((ino != EXT3_ROOT_INO &&
2418 ino != EXT3_JOURNAL_INO &&
2419 ino != EXT3_RESIZE_INO &&
2420 ino < EXT3_FIRST_INO(sb)) ||
2421 ino > le32_to_cpu(
2422 EXT3_SB(sb)->s_es->s_inodes_count)) {
2423 ext3_error (sb, "ext3_get_inode_block",
2424 "bad inode number: %lu", ino);
2425 return 0;
2426 }
2427 block_group = (ino - 1) / EXT3_INODES_PER_GROUP(sb);
2428 if (block_group >= EXT3_SB(sb)->s_groups_count) {
2429 ext3_error (sb, "ext3_get_inode_block",
2430 "group >= groups count");
2431 return 0;
2432 }
2433 smp_rmb();
2434 group_desc = block_group >> EXT3_DESC_PER_BLOCK_BITS(sb);
2435 desc = block_group & (EXT3_DESC_PER_BLOCK(sb) - 1);
2436 bh = EXT3_SB(sb)->s_group_desc[group_desc];
2437 if (!bh) {
2438 ext3_error (sb, "ext3_get_inode_block",
2439 "Descriptor not loaded");
2440 return 0;
2441 }
2442
2443 gdp = (struct ext3_group_desc *) bh->b_data;
2444 /*
2445 * Figure out the offset within the block group inode table
2446 */
2447 offset = ((ino - 1) % EXT3_INODES_PER_GROUP(sb)) *
2448 EXT3_INODE_SIZE(sb);
2449 block = le32_to_cpu(gdp[desc].bg_inode_table) +
2450 (offset >> EXT3_BLOCK_SIZE_BITS(sb));
2451
2452 iloc->block_group = block_group;
2453 iloc->offset = offset & (EXT3_BLOCK_SIZE(sb) - 1);
2454 return block;
2455}
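
/*
 * Worked example (a sketch, assuming 4K blocks, 128-byte inodes and
 * 8192 inodes per group): for ino 10000, block_group = 9999 / 8192 = 1
 * and the byte offset into that group's inode table is
 * (9999 % 8192) * 128 = 231296, so the inode lives 231296 >> 12 = 56
 * blocks past bg_inode_table, at offset 231296 & 4095 = 1920 within
 * the block.
 */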
2456
2457/*
2458 * ext3_get_inode_loc returns with an extra refcount against the inode's
2459 * underlying buffer_head on success. If 'in_mem' is true, we have all
2460 * data in memory that is needed to recreate the on-disk version of this
2461 * inode.
2462 */
2463static int __ext3_get_inode_loc(struct inode *inode,
2464 struct ext3_iloc *iloc, int in_mem)
2465{
2466 unsigned long block;
2467 struct buffer_head *bh;
2468
2469 block = ext3_get_inode_block(inode->i_sb, inode->i_ino, iloc);
2470 if (!block)
2471 return -EIO;
2472
2473 bh = sb_getblk(inode->i_sb, block);
2474 if (!bh) {
2475 ext3_error (inode->i_sb, "ext3_get_inode_loc",
2476 "unable to read inode block - "
2477 "inode=%lu, block=%lu", inode->i_ino, block);
2478 return -EIO;
2479 }
2480 if (!buffer_uptodate(bh)) {
2481 lock_buffer(bh);
2482 if (buffer_uptodate(bh)) {
2483 /* someone brought it uptodate while we waited */
2484 unlock_buffer(bh);
2485 goto has_buffer;
2486 }
2487
2488 /*
2489 * If we have all information of the inode in memory and this
2490 * is the only valid inode in the block, we need not read the
2491 * block.
2492 */
2493 if (in_mem) {
2494 struct buffer_head *bitmap_bh;
2495 struct ext3_group_desc *desc;
2496 int inodes_per_buffer;
2497 int inode_offset, i;
2498 int block_group;
2499 int start;
2500
2501 block_group = (inode->i_ino - 1) /
2502 EXT3_INODES_PER_GROUP(inode->i_sb);
2503 inodes_per_buffer = bh->b_size /
2504 EXT3_INODE_SIZE(inode->i_sb);
2505 inode_offset = ((inode->i_ino - 1) %
2506 EXT3_INODES_PER_GROUP(inode->i_sb));
2507 start = inode_offset & ~(inodes_per_buffer - 1);
2508
2509 /* Is the inode bitmap in cache? */
2510 desc = ext3_get_group_desc(inode->i_sb,
2511 block_group, NULL);
2512 if (!desc)
2513 goto make_io;
2514
2515 bitmap_bh = sb_getblk(inode->i_sb,
2516 le32_to_cpu(desc->bg_inode_bitmap));
2517 if (!bitmap_bh)
2518 goto make_io;
2519
2520 /*
2521 * If the inode bitmap isn't in cache then the
2522 * optimisation may end up performing two reads instead
2523 * of one, so skip it.
2524 */
2525 if (!buffer_uptodate(bitmap_bh)) {
2526 brelse(bitmap_bh);
2527 goto make_io;
2528 }
2529 for (i = start; i < start + inodes_per_buffer; i++) {
2530 if (i == inode_offset)
2531 continue;
2532 if (ext3_test_bit(i, bitmap_bh->b_data))
2533 break;
2534 }
2535 brelse(bitmap_bh);
2536 if (i == start + inodes_per_buffer) {
2537 /* all other inodes are free, so skip I/O */
2538 memset(bh->b_data, 0, bh->b_size);
2539 set_buffer_uptodate(bh);
2540 unlock_buffer(bh);
2541 goto has_buffer;
2542 }
2543 }
2544
2545make_io:
2546 /*
2547 * There are other valid inodes in the buffer, this inode
2548 * has in-inode xattrs, or we don't have this inode in memory.
2549 * Read the block from disk.
2550 */
2551 get_bh(bh);
2552 bh->b_end_io = end_buffer_read_sync;
2553 submit_bh(READ, bh);
2554 wait_on_buffer(bh);
2555 if (!buffer_uptodate(bh)) {
2556 ext3_error(inode->i_sb, "ext3_get_inode_loc",
2557 "unable to read inode block - "
2558 "inode=%lu, block=%lu",
2559 inode->i_ino, block);
2560 brelse(bh);
2561 return -EIO;
2562 }
2563 }
2564has_buffer:
2565 iloc->bh = bh;
2566 return 0;
2567}
2568
2569int ext3_get_inode_loc(struct inode *inode, struct ext3_iloc *iloc)
2570{
2571 /* We have all inode data except xattrs in memory here. */
2572 return __ext3_get_inode_loc(inode, iloc,
2573 !(EXT3_I(inode)->i_state & EXT3_STATE_XATTR));
2574}
2575
2576void ext3_set_inode_flags(struct inode *inode)
2577{
2578 unsigned int flags = EXT3_I(inode)->i_flags;
2579
2580 inode->i_flags &= ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC);
2581 if (flags & EXT3_SYNC_FL)
2582 inode->i_flags |= S_SYNC;
2583 if (flags & EXT3_APPEND_FL)
2584 inode->i_flags |= S_APPEND;
2585 if (flags & EXT3_IMMUTABLE_FL)
2586 inode->i_flags |= S_IMMUTABLE;
2587 if (flags & EXT3_NOATIME_FL)
2588 inode->i_flags |= S_NOATIME;
2589 if (flags & EXT3_DIRSYNC_FL)
2590 inode->i_flags |= S_DIRSYNC;
2591}
2592
2593void ext3_read_inode(struct inode * inode)
2594{
2595 struct ext3_iloc iloc;
2596 struct ext3_inode *raw_inode;
2597 struct ext3_inode_info *ei = EXT3_I(inode);
2598 struct buffer_head *bh;
2599 int block;
2600
2601#ifdef CONFIG_EXT3_FS_POSIX_ACL
2602 ei->i_acl = EXT3_ACL_NOT_CACHED;
2603 ei->i_default_acl = EXT3_ACL_NOT_CACHED;
2604#endif
2605 ei->i_block_alloc_info = NULL;
2606
2607 if (__ext3_get_inode_loc(inode, &iloc, 0))
2608 goto bad_inode;
2609 bh = iloc.bh;
2610 raw_inode = ext3_raw_inode(&iloc);
2611 inode->i_mode = le16_to_cpu(raw_inode->i_mode);
2612 inode->i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low);
2613 inode->i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low);
2614 if (!(test_opt(inode->i_sb, NO_UID32))) {
2615 inode->i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16;
2616 inode->i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16;
2617 }
2618 inode->i_nlink = le16_to_cpu(raw_inode->i_links_count);
2619 inode->i_size = le32_to_cpu(raw_inode->i_size);
2620 inode->i_atime.tv_sec = le32_to_cpu(raw_inode->i_atime);
2621 inode->i_ctime.tv_sec = le32_to_cpu(raw_inode->i_ctime);
2622 inode->i_mtime.tv_sec = le32_to_cpu(raw_inode->i_mtime);
2623 inode->i_atime.tv_nsec = inode->i_ctime.tv_nsec = inode->i_mtime.tv_nsec = 0;
2624
2625 ei->i_state = 0;
2626 ei->i_dir_start_lookup = 0;
2627 ei->i_dtime = le32_to_cpu(raw_inode->i_dtime);
2628 /* We now have enough fields to check if the inode was active or not.
2629 * This is needed because nfsd might try to access dead inodes;
2630 * the test is the same one that e2fsck uses.
2631 * NeilBrown 1999oct15
2632 */
2633 if (inode->i_nlink == 0) {
2634 if (inode->i_mode == 0 ||
2635 !(EXT3_SB(inode->i_sb)->s_mount_state & EXT3_ORPHAN_FS)) {
2636 /* this inode is deleted */
2637 brelse (bh);
2638 goto bad_inode;
2639 }
2640 /* The only unlinked inodes we let through here have
2641 * valid i_mode and are being read by the orphan
2642 * recovery code: that's fine, we're about to complete
2643 * the process of deleting those. */
2644 }
2645 inode->i_blksize = PAGE_SIZE; /* This is the optimal IO size
2646 * (for stat), not the fs block
2647 * size */
2648 inode->i_blocks = le32_to_cpu(raw_inode->i_blocks);
2649 ei->i_flags = le32_to_cpu(raw_inode->i_flags);
2650#ifdef EXT3_FRAGMENTS
2651 ei->i_faddr = le32_to_cpu(raw_inode->i_faddr);
2652 ei->i_frag_no = raw_inode->i_frag;
2653 ei->i_frag_size = raw_inode->i_fsize;
2654#endif
2655 ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
2656 if (!S_ISREG(inode->i_mode)) {
2657 ei->i_dir_acl = le32_to_cpu(raw_inode->i_dir_acl);
2658 } else {
2659 inode->i_size |=
2660 ((__u64)le32_to_cpu(raw_inode->i_size_high)) << 32;
2661 }
2662 ei->i_disksize = inode->i_size;
2663 inode->i_generation = le32_to_cpu(raw_inode->i_generation);
2664 ei->i_block_group = iloc.block_group;
2665 /*
2666 * NOTE! The in-memory inode i_data array is in little-endian order
2667 * even on big-endian machines: we do NOT byteswap the block numbers!
2668 */
2669 for (block = 0; block < EXT3_N_BLOCKS; block++)
2670 ei->i_data[block] = raw_inode->i_block[block];
2671 INIT_LIST_HEAD(&ei->i_orphan);
2672
2673 if (inode->i_ino >= EXT3_FIRST_INO(inode->i_sb) + 1 &&
2674 EXT3_INODE_SIZE(inode->i_sb) > EXT3_GOOD_OLD_INODE_SIZE) {
2675 /*
2676 * When mke2fs creates big inodes it does not zero out
2677 * the unused bytes above EXT3_GOOD_OLD_INODE_SIZE,
2678 * so ignore those first few inodes.
2679 */
2680 ei->i_extra_isize = le16_to_cpu(raw_inode->i_extra_isize);
2681 if (EXT3_GOOD_OLD_INODE_SIZE + ei->i_extra_isize >
2682 EXT3_INODE_SIZE(inode->i_sb))
2683 goto bad_inode;
2684 if (ei->i_extra_isize == 0) {
2685 /* The extra space is currently unused. Use it. */
2686 ei->i_extra_isize = sizeof(struct ext3_inode) -
2687 EXT3_GOOD_OLD_INODE_SIZE;
2688 } else {
2689 __le32 *magic = (void *)raw_inode +
2690 EXT3_GOOD_OLD_INODE_SIZE +
2691 ei->i_extra_isize;
2692 if (*magic == cpu_to_le32(EXT3_XATTR_MAGIC))
2693 ei->i_state |= EXT3_STATE_XATTR;
2694 }
2695 } else
2696 ei->i_extra_isize = 0;
2697
2698 if (S_ISREG(inode->i_mode)) {
2699 inode->i_op = &ext3_file_inode_operations;
2700 inode->i_fop = &ext3_file_operations;
2701 ext3_set_aops(inode);
2702 } else if (S_ISDIR(inode->i_mode)) {
2703 inode->i_op = &ext3_dir_inode_operations;
2704 inode->i_fop = &ext3_dir_operations;
2705 } else if (S_ISLNK(inode->i_mode)) {
2706 if (ext3_inode_is_fast_symlink(inode))
2707 inode->i_op = &ext3_fast_symlink_inode_operations;
2708 else {
2709 inode->i_op = &ext3_symlink_inode_operations;
2710 ext3_set_aops(inode);
2711 }
2712 } else {
2713 inode->i_op = &ext3_special_inode_operations;
2714 if (raw_inode->i_block[0])
2715 init_special_inode(inode, inode->i_mode,
2716 old_decode_dev(le32_to_cpu(raw_inode->i_block[0])));
2717 else
2718 init_special_inode(inode, inode->i_mode,
2719 new_decode_dev(le32_to_cpu(raw_inode->i_block[1])));
2720 }
2721 brelse (iloc.bh);
2722 ext3_set_inode_flags(inode);
2723 return;
2724
2725bad_inode:
2726 make_bad_inode(inode);
2727 return;
2728}
2729
2730/*
2731 * Post the struct inode info into an on-disk inode location in the
2732 * buffer-cache. This gobbles the caller's reference to the
2733 * buffer_head in the inode location struct.
2734 *
2735 * The caller must have write access to iloc->bh.
2736 */
2737static int ext3_do_update_inode(handle_t *handle,
2738 struct inode *inode,
2739 struct ext3_iloc *iloc)
2740{
2741 struct ext3_inode *raw_inode = ext3_raw_inode(iloc);
2742 struct ext3_inode_info *ei = EXT3_I(inode);
2743 struct buffer_head *bh = iloc->bh;
2744 int err = 0, rc, block;
2745
2746 /* For fields not tracked in the in-memory inode,
2747 * initialise them to zero for new inodes. */
2748 if (ei->i_state & EXT3_STATE_NEW)
2749 memset(raw_inode, 0, EXT3_SB(inode->i_sb)->s_inode_size);
2750
2751 raw_inode->i_mode = cpu_to_le16(inode->i_mode);
2752 if (!(test_opt(inode->i_sb, NO_UID32))) {
2753 raw_inode->i_uid_low = cpu_to_le16(low_16_bits(inode->i_uid));
2754 raw_inode->i_gid_low = cpu_to_le16(low_16_bits(inode->i_gid));
2755/*
2756 * Fix up interoperability with old kernels. Otherwise, old inodes get
2757 * re-used with the upper 16 bits of the uid/gid intact
2758 */
2759 if (!ei->i_dtime) {
2760 raw_inode->i_uid_high =
2761 cpu_to_le16(high_16_bits(inode->i_uid));
2762 raw_inode->i_gid_high =
2763 cpu_to_le16(high_16_bits(inode->i_gid));
2764 } else {
2765 raw_inode->i_uid_high = 0;
2766 raw_inode->i_gid_high = 0;
2767 }
2768 } else {
2769 raw_inode->i_uid_low =
2770 cpu_to_le16(fs_high2lowuid(inode->i_uid));
2771 raw_inode->i_gid_low =
2772 cpu_to_le16(fs_high2lowgid(inode->i_gid));
2773 raw_inode->i_uid_high = 0;
2774 raw_inode->i_gid_high = 0;
2775 }
2776 raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
2777 raw_inode->i_size = cpu_to_le32(ei->i_disksize);
2778 raw_inode->i_atime = cpu_to_le32(inode->i_atime.tv_sec);
2779 raw_inode->i_ctime = cpu_to_le32(inode->i_ctime.tv_sec);
2780 raw_inode->i_mtime = cpu_to_le32(inode->i_mtime.tv_sec);
2781 raw_inode->i_blocks = cpu_to_le32(inode->i_blocks);
2782 raw_inode->i_dtime = cpu_to_le32(ei->i_dtime);
2783 raw_inode->i_flags = cpu_to_le32(ei->i_flags);
2784#ifdef EXT3_FRAGMENTS
2785 raw_inode->i_faddr = cpu_to_le32(ei->i_faddr);
2786 raw_inode->i_frag = ei->i_frag_no;
2787 raw_inode->i_fsize = ei->i_frag_size;
2788#endif
2789 raw_inode->i_file_acl = cpu_to_le32(ei->i_file_acl);
2790 if (!S_ISREG(inode->i_mode)) {
2791 raw_inode->i_dir_acl = cpu_to_le32(ei->i_dir_acl);
2792 } else {
2793 raw_inode->i_size_high =
2794 cpu_to_le32(ei->i_disksize >> 32);
2795 if (ei->i_disksize > 0x7fffffffULL) {
2796 struct super_block *sb = inode->i_sb;
2797 if (!EXT3_HAS_RO_COMPAT_FEATURE(sb,
2798 EXT3_FEATURE_RO_COMPAT_LARGE_FILE) ||
2799 EXT3_SB(sb)->s_es->s_rev_level ==
2800 cpu_to_le32(EXT3_GOOD_OLD_REV)) {
2801 /* If this is the first large file
2802 * created, add a flag to the superblock.
2803 */
2804 err = ext3_journal_get_write_access(handle,
2805 EXT3_SB(sb)->s_sbh);
2806 if (err)
2807 goto out_brelse;
2808 ext3_update_dynamic_rev(sb);
2809 EXT3_SET_RO_COMPAT_FEATURE(sb,
2810 EXT3_FEATURE_RO_COMPAT_LARGE_FILE);
2811 sb->s_dirt = 1;
2812 handle->h_sync = 1;
2813 err = ext3_journal_dirty_metadata(handle,
2814 EXT3_SB(sb)->s_sbh);
2815 }
2816 }
2817 }
2818 raw_inode->i_generation = cpu_to_le32(inode->i_generation);
2819 if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) {
2820 if (old_valid_dev(inode->i_rdev)) {
2821 raw_inode->i_block[0] =
2822 cpu_to_le32(old_encode_dev(inode->i_rdev));
2823 raw_inode->i_block[1] = 0;
2824 } else {
2825 raw_inode->i_block[0] = 0;
2826 raw_inode->i_block[1] =
2827 cpu_to_le32(new_encode_dev(inode->i_rdev));
2828 raw_inode->i_block[2] = 0;
2829 }
2830 } else for (block = 0; block < EXT3_N_BLOCKS; block++)
2831 raw_inode->i_block[block] = ei->i_data[block];
2832
2833 if (ei->i_extra_isize)
2834 raw_inode->i_extra_isize = cpu_to_le16(ei->i_extra_isize);
2835
2836 BUFFER_TRACE(bh, "call ext3_journal_dirty_metadata");
2837 rc = ext3_journal_dirty_metadata(handle, bh);
2838 if (!err)
2839 err = rc;
2840 ei->i_state &= ~EXT3_STATE_NEW;
2841
2842out_brelse:
2843 brelse (bh);
2844 ext3_std_error(inode->i_sb, err);
2845 return err;
2846}
2847
2848/*
2849 * ext3_write_inode()
2850 *
2851 * We are called from a few places:
2852 *
2853 * - Within generic_file_write() for O_SYNC files.
2854 * Here, there will be no transaction running. We wait for any running
2855 * transaction to commit.
2856 *
2857 * - Within sys_sync(), kupdate and such.
2858 * We wait on commit, if told to.
2859 *
2860 * - Within prune_icache() (PF_MEMALLOC == true)
2861 * Here we simply return. We can't afford to block kswapd on the
2862 * journal commit.
2863 *
2864 * In all cases it is actually safe for us to return without doing anything,
2865 * because the inode has been copied into a raw inode buffer in
2866 * ext3_mark_inode_dirty(). This is a correctness thing for O_SYNC and for
2867 * knfsd.
2868 *
2869 * Note that we are absolutely dependent upon all inode dirtiers doing the
2870 * right thing: they *must* call mark_inode_dirty() after dirtying info in
2871 * which we are interested.
2872 *
2873 * It would be a bug for them to not do this. The code:
2874 *
2875 * mark_inode_dirty(inode)
2876 * stuff();
2877 * inode->i_size = expr;
2878 *
2879 * is in error because a kswapd-driven write_inode() could occur while
2880 * `stuff()' is running, and the new i_size will be lost. Plus the inode
2881 * will no longer be on the superblock's dirty inode list.
2882 */
2883int ext3_write_inode(struct inode *inode, int wait)
2884{
2885 if (current->flags & PF_MEMALLOC)
2886 return 0;
2887
2888 if (ext3_journal_current_handle()) {
2889 jbd_debug(0, "called recursively, non-PF_MEMALLOC!\n");
2890 dump_stack();
2891 return -EIO;
2892 }
2893
2894 if (!wait)
2895 return 0;
2896
2897 return ext3_force_commit(inode->i_sb);
2898}
2899
2900/*
2901 * ext3_setattr()
2902 *
2903 * Called from notify_change.
2904 *
2905 * We want to trap VFS attempts to truncate the file as soon as
2906 * possible. In particular, we want to make sure that when the VFS
2907 * shrinks i_size, we put the inode on the orphan list and modify
2908 * i_disksize immediately, so that during the subsequent flushing of
2909 * dirty pages and freeing of disk blocks, we can guarantee that any
2910 * commit will leave the blocks being flushed in an unused state on
2911 * disk. (On recovery, the inode will get truncated and the blocks will
2912 * be freed, so we have a strong guarantee that no future commit will
2913 * leave these blocks visible to the user.)
2914 *
2915 * Called with inode->sem down.
2916 */
2917int ext3_setattr(struct dentry *dentry, struct iattr *attr)
2918{
2919 struct inode *inode = dentry->d_inode;
2920 int error, rc = 0;
2921 const unsigned int ia_valid = attr->ia_valid;
2922
2923 error = inode_change_ok(inode, attr);
2924 if (error)
2925 return error;
2926
2927 if ((ia_valid & ATTR_UID && attr->ia_uid != inode->i_uid) ||
2928 (ia_valid & ATTR_GID && attr->ia_gid != inode->i_gid)) {
2929 handle_t *handle;
2930
2931 /* (user+group)*(old+new) structure, inode write (sb,
2932 * inode block, ? - but truncate inode update has it) */
2933 handle = ext3_journal_start(inode, 2*(EXT3_QUOTA_INIT_BLOCKS(inode->i_sb)+
2934 EXT3_QUOTA_DEL_BLOCKS(inode->i_sb))+3);
2935 if (IS_ERR(handle)) {
2936 error = PTR_ERR(handle);
2937 goto err_out;
2938 }
2939 error = DQUOT_TRANSFER(inode, attr) ? -EDQUOT : 0;
2940 if (error) {
2941 ext3_journal_stop(handle);
2942 return error;
2943 }
2944 /* Update corresponding info in inode so that everything is in
2945 * one transaction */
2946 if (attr->ia_valid & ATTR_UID)
2947 inode->i_uid = attr->ia_uid;
2948 if (attr->ia_valid & ATTR_GID)
2949 inode->i_gid = attr->ia_gid;
2950 error = ext3_mark_inode_dirty(handle, inode);
2951 ext3_journal_stop(handle);
2952 }
2953
2954 if (S_ISREG(inode->i_mode) &&
2955 attr->ia_valid & ATTR_SIZE && attr->ia_size < inode->i_size) {
2956 handle_t *handle;
2957
2958 handle = ext3_journal_start(inode, 3);
2959 if (IS_ERR(handle)) {
2960 error = PTR_ERR(handle);
2961 goto err_out;
2962 }
2963
2964 error = ext3_orphan_add(handle, inode);
2965 EXT3_I(inode)->i_disksize = attr->ia_size;
2966 rc = ext3_mark_inode_dirty(handle, inode);
2967 if (!error)
2968 error = rc;
2969 ext3_journal_stop(handle);
2970 }
2971
2972 rc = inode_setattr(inode, attr);
2973
2974 /* If inode_setattr's call to ext3_truncate failed to get a
2975 * transaction handle at all, we need to clean up the in-core
2976 * orphan list manually. */
2977 if (inode->i_nlink)
2978 ext3_orphan_del(NULL, inode);
2979
2980 if (!rc && (ia_valid & ATTR_MODE))
2981 rc = ext3_acl_chmod(inode);
2982
2983err_out:
2984 ext3_std_error(inode->i_sb, error);
2985 if (!error)
2986 error = rc;
2987 return error;
2988}
2989
2990
2991/*
2992 * akpm: how many blocks doth make a writepage()?
2993 *
2994 * With N blocks per page, it may be:
2995 * N data blocks
2996 * 2 indirect blocks
2997 * 2 dindirect blocks
2998 * 1 tindirect block
2999 * N+5 bitmap blocks (from the above)
3000 * N+5 group descriptor summary blocks
3001 * 1 inode block
3002 * 1 superblock.
3003 * 2 * EXT3_SINGLEDATA_TRANS_BLOCKS for the quota files
3004 *
3005 * 3 * (N + 5) + 2 + 2 * EXT3_SINGLEDATA_TRANS_BLOCKS
3006 *
3007 * With ordered or writeback data it's the same, less the N data blocks.
3008 *
3009 * If the inode's direct blocks can hold an integral number of pages then a
3010 * page cannot straddle two indirect blocks, and we can only touch one indirect
3011 * and dindirect block, and the "5" above becomes "3".
3012 *
3013 * This still overestimates under most circumstances. If we were to pass the
3014 * start and end offsets in here as well we could do block_to_path() on each
3015 * block and work out the exact number of indirects which are touched. Pah.
3016 */
3017
3018static int ext3_writepage_trans_blocks(struct inode *inode)
3019{
3020 int bpp = ext3_journal_blocks_per_page(inode);
3021 int indirects = (EXT3_NDIR_BLOCKS % bpp) ? 5 : 3;
3022 int ret;
3023
3024 if (ext3_should_journal_data(inode))
3025 ret = 3 * (bpp + indirects) + 2;
3026 else
3027 ret = 2 * (bpp + indirects) + 2;
3028
3029#ifdef CONFIG_QUOTA
3030 /* We know that structure was already allocated during DQUOT_INIT so
3031 * we will be updating only the data blocks + inodes */
3032 ret += 2*EXT3_QUOTA_TRANS_BLOCKS(inode->i_sb);
3033#endif
3034
3035 return ret;
3036}
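
/*
 * Worked example (assuming 4K pages and a 1K blocksize): bpp == 4 and
 * EXT3_NDIR_BLOCKS (12) % 4 == 0, so indirects == 3.  A data=journal
 * writepage therefore reserves 3 * (4 + 3) + 2 == 23 credits, while
 * ordered/writeback mode reserves 2 * (4 + 3) + 2 == 16, plus the
 * quota blocks when CONFIG_QUOTA is enabled.
 */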
3037
3038/*
3039 * The caller must have previously called ext3_reserve_inode_write().
3040 * Given this, we know that the caller already has write access to iloc->bh.
3041 */
3042int ext3_mark_iloc_dirty(handle_t *handle,
3043 struct inode *inode, struct ext3_iloc *iloc)
3044{
3045 int err = 0;
3046
3047 /* the do_update_inode consumes one bh->b_count */
3048 get_bh(iloc->bh);
3049
3050 /* ext3_do_update_inode() does journal_dirty_metadata */
3051 err = ext3_do_update_inode(handle, inode, iloc);
3052 put_bh(iloc->bh);
3053 return err;
3054}
3055
3056/*
3057 * On success, we end up with an outstanding reference count against
3058 * iloc->bh. This _must_ be cleaned up later.
3059 */
3060
3061int
3062ext3_reserve_inode_write(handle_t *handle, struct inode *inode,
3063 struct ext3_iloc *iloc)
3064{
3065 int err = 0;
3066 if (handle) {
3067 err = ext3_get_inode_loc(inode, iloc);
3068 if (!err) {
3069 BUFFER_TRACE(iloc->bh, "get_write_access");
3070 err = ext3_journal_get_write_access(handle, iloc->bh);
3071 if (err) {
3072 brelse(iloc->bh);
3073 iloc->bh = NULL;
3074 }
3075 }
3076 }
3077 ext3_std_error(inode->i_sb, err);
3078 return err;
3079}
3080
3081/*
3082 * akpm: What we do here is to mark the in-core inode as clean
3083 * with respect to inode dirtiness (it may still be data-dirty).
3084 * This means that the in-core inode may be reaped by prune_icache
3085 * without having to perform any I/O. This is a very good thing,
3086 * because *any* task may call prune_icache - even ones which
3087 * have a transaction open against a different journal.
3088 *
3089 * Is this cheating? Not really. Sure, we haven't written the
3090 * inode out, but prune_icache isn't a user-visible syncing function.
3091 * Whenever the user wants stuff synced (sys_sync, sys_msync, sys_fsync)
3092 * we start and wait on commits.
3093 *
3094 * Is this efficient/effective? Well, we're being nice to the system
3095 * by cleaning up our inodes proactively so they can be reaped
3096 * without I/O. But we are potentially leaving up to five seconds'
3097 * worth of inodes floating about which prune_icache wants us to
3098 * write out. One way to fix that would be to get prune_icache()
3099 * to do a write_super() to free up some memory. It has the desired
3100 * effect.
3101 */
3102int ext3_mark_inode_dirty(handle_t *handle, struct inode *inode)
3103{
3104 struct ext3_iloc iloc;
3105 int err;
3106
3107 might_sleep();
3108 err = ext3_reserve_inode_write(handle, inode, &iloc);
3109 if (!err)
3110 err = ext3_mark_iloc_dirty(handle, inode, &iloc);
3111 return err;
3112}
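
/*
 * The typical calling pattern, as used throughout this file:
 *
 *	handle = ext3_journal_start(inode, credits);
 *	...modify the in-core inode...
 *	ext3_mark_inode_dirty(handle, inode);
 *	ext3_journal_stop(handle);
 *
 * (ext3_dirty_inode() below is exactly this shape.)
 */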
3113
3114/*
3115 * akpm: ext3_dirty_inode() is called from __mark_inode_dirty()
3116 *
3117 * We're really interested in the case where a file is being extended.
3118 * i_size has been changed by generic_commit_write() and we thus need
3119 * to include the updated inode in the current transaction.
3120 *
3121 * Also, DQUOT_ALLOC_SPACE() will always dirty the inode when blocks
3122 * are allocated to the file.
3123 *
3124 * If the inode is marked synchronous, we don't honour that here - doing
3125 * so would cause a commit on atime updates, which we don't bother doing.
3126 * We handle synchronous inodes at the highest possible level.
3127 */
3128void ext3_dirty_inode(struct inode *inode)
3129{
3130 handle_t *current_handle = ext3_journal_current_handle();
3131 handle_t *handle;
3132
3133 handle = ext3_journal_start(inode, 2);
3134 if (IS_ERR(handle))
3135 goto out;
3136 if (current_handle &&
3137 current_handle->h_transaction != handle->h_transaction) {
3138 /* This task has a transaction open against a different fs */
3139 printk(KERN_EMERG "%s: transactions do not match!\n",
3140 __FUNCTION__);
3141 } else {
3142 jbd_debug(5, "marking dirty. outer handle=%p\n",
3143 current_handle);
3144 ext3_mark_inode_dirty(handle, inode);
3145 }
3146 ext3_journal_stop(handle);
3147out:
3148 return;
3149}
3150
3151#ifdef AKPM
3152/*
3153 * Bind an inode's backing buffer_head into this transaction, to prevent
3154 * it from being flushed to disk early. Unlike
3155 * ext3_reserve_inode_write, this leaves behind no bh reference and
3156 * returns no iloc structure, so the caller needs to repeat the iloc
3157 * lookup to mark the inode dirty later.
3158 */
3159static inline int
3160ext3_pin_inode(handle_t *handle, struct inode *inode)
3161{
3162 struct ext3_iloc iloc;
3163
3164 int err = 0;
3165 if (handle) {
3166 err = ext3_get_inode_loc(inode, &iloc);
3167 if (!err) {
3168 BUFFER_TRACE(iloc.bh, "get_write_access");
3169 err = journal_get_write_access(handle, iloc.bh);
3170 if (!err)
3171 err = ext3_journal_dirty_metadata(handle,
3172 iloc.bh);
3173 brelse(iloc.bh);
3174 }
3175 }
3176 ext3_std_error(inode->i_sb, err);
3177 return err;
3178}
3179#endif
3180
3181int ext3_change_inode_journal_flag(struct inode *inode, int val)
3182{
3183 journal_t *journal;
3184 handle_t *handle;
3185 int err;
3186
3187 /*
3188 * We have to be very careful here: changing a data block's
3189 * journaling status dynamically is dangerous. If we write a
3190 * data block to the journal, change the status and then delete
3191 * that block, we risk forgetting to revoke the old log record
3192 * from the journal and so a subsequent replay can corrupt data.
3193 * So, first we make sure that the journal is empty and that
3194 * nobody is changing anything.
3195 */
3196
3197 journal = EXT3_JOURNAL(inode);
3198 if (is_journal_aborted(journal) || IS_RDONLY(inode))
3199 return -EROFS;
3200
3201 journal_lock_updates(journal);
3202 journal_flush(journal);
3203
3204 /*
3205 * OK, there are no updates running now, and all cached data is
3206 * synced to disk. We are now in a completely consistent state
3207 * which doesn't have anything in the journal, and we know that
3208 * no filesystem updates are running, so it is safe to modify
3209 * the inode's in-core data-journaling state flag now.
3210 */
3211
3212 if (val)
3213 EXT3_I(inode)->i_flags |= EXT3_JOURNAL_DATA_FL;
3214 else
3215 EXT3_I(inode)->i_flags &= ~EXT3_JOURNAL_DATA_FL;
3216 ext3_set_aops(inode);
3217
3218 journal_unlock_updates(journal);
3219
3220 /* Finally we can mark the inode as dirty. */
3221
3222 handle = ext3_journal_start(inode, 1);
3223 if (IS_ERR(handle))
3224 return PTR_ERR(handle);
3225
3226 err = ext3_mark_inode_dirty(handle, inode);
3227 handle->h_sync = 1;
3228 ext3_journal_stop(handle);
3229 ext3_std_error(inode->i_sb, err);
3230
3231 return err;
3232}