blob: 5e815034aabdc9aee42316e148afc5275578b376 [file] [log] [blame]
/*
 * This file is part of UBIFS.
 *
 * Copyright (C) 2006-2008 Nokia Corporation.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License version 2 as published by
 * the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but WITHOUT
 * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
 * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
 * more details.
 *
 * You should have received a copy of the GNU General Public License along with
 * this program; if not, write to the Free Software Foundation, Inc., 51
 * Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 *
 * Authors: Adrian Hunter
 *          Artem Bityutskiy (Битюцкий Артём)
 */
22
/*
 * This file contains journal replay code. It runs when the file-system is being
 * mounted and requires no locking.
 *
 * The larger the journal, the longer it takes to scan it, and hence the longer
 * it takes to mount UBIFS. This is why the journal has a limited size, which
 * may be changed depending on the system requirements. On the other hand, a
 * larger journal gives faster I/O because the index is written less
 * frequently, so this is a trade-off. Also, the journal is indexed by the
 * in-memory index (TNC), so the larger the journal, the more memory its index
 * may consume.
 */
34
35#include "ubifs.h"
36
/*
 * Flags kept in the 'flags' field of a replay tree entry.
 *
 * REPLAY_DELETION: node was deleted
 * REPLAY_REF: node is a reference node
 */
enum {
	REPLAY_DELETION = 1 << 0,
	REPLAY_REF      = 1 << 1,
};
47
48/**
49 * struct replay_entry - replay tree entry.
50 * @lnum: logical eraseblock number of the node
51 * @offs: node offset
52 * @len: node length
53 * @sqnum: node sequence number
54 * @flags: replay flags
55 * @rb: links the replay tree
56 * @key: node key
57 * @nm: directory entry name
58 * @old_size: truncation old size
59 * @new_size: truncation new size
60 * @free: amount of free space in a bud
61 * @dirty: amount of dirty space in a bud from padding and deletion nodes
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +030062 * @jhead: journal head number of the bud
Artem Bityutskiy1e517642008-07-14 19:08:37 +030063 *
64 * UBIFS journal replay must compare node sequence numbers, which means it must
65 * build a tree of node information to insert into the TNC.
66 */
67struct replay_entry {
68 int lnum;
69 int offs;
70 int len;
71 unsigned long long sqnum;
72 int flags;
73 struct rb_node rb;
74 union ubifs_key key;
75 union {
76 struct qstr nm;
77 struct {
78 loff_t old_size;
79 loff_t new_size;
80 };
81 struct {
82 int free;
83 int dirty;
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +030084 int jhead;
Artem Bityutskiy1e517642008-07-14 19:08:37 +030085 };
86 };
87};
88
89/**
90 * struct bud_entry - entry in the list of buds to replay.
91 * @list: next bud in the list
92 * @bud: bud description object
93 * @free: free bytes in the bud
94 * @sqnum: reference node sequence number
95 */
96struct bud_entry {
97 struct list_head list;
98 struct ubifs_bud *bud;
99 int free;
100 unsigned long long sqnum;
101};
102
103/**
104 * set_bud_lprops - set free and dirty space used by a bud.
105 * @c: UBIFS file-system description object
106 * @r: replay entry of bud
107 */
108static int set_bud_lprops(struct ubifs_info *c, struct replay_entry *r)
109{
110 const struct ubifs_lprops *lp;
111 int err = 0, dirty;
112
113 ubifs_get_lprops(c);
114
115 lp = ubifs_lpt_lookup_dirty(c, r->lnum);
116 if (IS_ERR(lp)) {
117 err = PTR_ERR(lp);
118 goto out;
119 }
120
121 dirty = lp->dirty;
122 if (r->offs == 0 && (lp->free != c->leb_size || lp->dirty != 0)) {
123 /*
124 * The LEB was added to the journal with a starting offset of
125 * zero which means the LEB must have been empty. The LEB
126 * property values should be lp->free == c->leb_size and
127 * lp->dirty == 0, but that is not the case. The reason is that
Artem Bityutskiy7a9c3e32011-05-13 13:02:00 +0300128 * the LEB had been garbage collected before it became the bud,
129 * and there was not commit inbetween. The garbage collector
130 * resets the free and dirty space without recording it
131 * anywhere except lprops, so if there was no commit then
132 * lprops does not have that information.
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300133 *
134 * We do not need to adjust free space because the scan has told
135 * us the exact value which is recorded in the replay entry as
136 * r->free.
137 *
138 * However we do need to subtract from the dirty space the
139 * amount of space that the garbage collector reclaimed, which
140 * is the whole LEB minus the amount of space that was free.
141 */
142 dbg_mnt("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
143 lp->free, lp->dirty);
144 dbg_gc("bud LEB %d was GC'd (%d free, %d dirty)", r->lnum,
145 lp->free, lp->dirty);
146 dirty -= c->leb_size - lp->free;
147 /*
148 * If the replay order was perfect the dirty space would now be
Artem Bityutskiy7d4e9cc2009-03-20 19:11:12 +0200149 * zero. The order is not perfect because the journal heads
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +0200150 * race with each other. This is not a problem but is does mean
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300151 * that the dirty space may temporarily exceed c->leb_size
152 * during the replay.
153 */
154 if (dirty != 0)
155 dbg_msg("LEB %d lp: %d free %d dirty "
156 "replay: %d free %d dirty", r->lnum, lp->free,
157 lp->dirty, r->free, r->dirty);
158 }
159 lp = ubifs_change_lp(c, lp, r->free, dirty + r->dirty,
160 lp->flags | LPROPS_TAKEN, 0);
161 if (IS_ERR(lp)) {
162 err = PTR_ERR(lp);
163 goto out;
164 }
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +0300165
166 /* Make sure the journal head points to the latest bud */
167 err = ubifs_wbuf_seek_nolock(&c->jheads[r->jhead].wbuf, r->lnum,
168 c->leb_size - r->free, UBI_SHORTTERM);
169
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300170out:
171 ubifs_release_lprops(c);
172 return err;
173}
174
175/**
176 * trun_remove_range - apply a replay entry for a truncation to the TNC.
177 * @c: UBIFS file-system description object
178 * @r: replay entry of truncation
179 */
180static int trun_remove_range(struct ubifs_info *c, struct replay_entry *r)
181{
182 unsigned min_blk, max_blk;
183 union ubifs_key min_key, max_key;
184 ino_t ino;
185
186 min_blk = r->new_size / UBIFS_BLOCK_SIZE;
187 if (r->new_size & (UBIFS_BLOCK_SIZE - 1))
188 min_blk += 1;
189
190 max_blk = r->old_size / UBIFS_BLOCK_SIZE;
191 if ((r->old_size & (UBIFS_BLOCK_SIZE - 1)) == 0)
192 max_blk -= 1;
193
194 ino = key_inum(c, &r->key);
195
196 data_key_init(c, &min_key, ino, min_blk);
197 data_key_init(c, &max_key, ino, max_blk);
198
199 return ubifs_tnc_remove_range(c, &min_key, &max_key);
200}
201
202/**
203 * apply_replay_entry - apply a replay entry to the TNC.
204 * @c: UBIFS file-system description object
205 * @r: replay entry to apply
206 *
207 * Apply a replay entry to the TNC.
208 */
209static int apply_replay_entry(struct ubifs_info *c, struct replay_entry *r)
210{
211 int err, deletion = ((r->flags & REPLAY_DELETION) != 0);
212
213 dbg_mnt("LEB %d:%d len %d flgs %d sqnum %llu %s", r->lnum,
214 r->offs, r->len, r->flags, r->sqnum, DBGKEY(&r->key));
215
216 /* Set c->replay_sqnum to help deal with dangling branches. */
217 c->replay_sqnum = r->sqnum;
218
219 if (r->flags & REPLAY_REF)
220 err = set_bud_lprops(c, r);
221 else if (is_hash_key(c, &r->key)) {
222 if (deletion)
223 err = ubifs_tnc_remove_nm(c, &r->key, &r->nm);
224 else
225 err = ubifs_tnc_add_nm(c, &r->key, r->lnum, r->offs,
226 r->len, &r->nm);
227 } else {
228 if (deletion)
229 switch (key_type(c, &r->key)) {
230 case UBIFS_INO_KEY:
231 {
232 ino_t inum = key_inum(c, &r->key);
233
234 err = ubifs_tnc_remove_ino(c, inum);
235 break;
236 }
237 case UBIFS_TRUN_KEY:
238 err = trun_remove_range(c, r);
239 break;
240 default:
241 err = ubifs_tnc_remove(c, &r->key);
242 break;
243 }
244 else
245 err = ubifs_tnc_add(c, &r->key, r->lnum, r->offs,
246 r->len);
247 if (err)
248 return err;
249
250 if (c->need_recovery)
251 err = ubifs_recover_size_accum(c, &r->key, deletion,
252 r->new_size);
253 }
254
255 return err;
256}
257
258/**
259 * destroy_replay_tree - destroy the replay.
260 * @c: UBIFS file-system description object
261 *
262 * Destroy the replay tree.
263 */
264static void destroy_replay_tree(struct ubifs_info *c)
265{
266 struct rb_node *this = c->replay_tree.rb_node;
267 struct replay_entry *r;
268
269 while (this) {
270 if (this->rb_left) {
271 this = this->rb_left;
272 continue;
273 } else if (this->rb_right) {
274 this = this->rb_right;
275 continue;
276 }
277 r = rb_entry(this, struct replay_entry, rb);
278 this = rb_parent(this);
279 if (this) {
280 if (this->rb_left == &r->rb)
281 this->rb_left = NULL;
282 else
283 this->rb_right = NULL;
284 }
285 if (is_hash_key(c, &r->key))
286 kfree(r->nm.name);
287 kfree(r);
288 }
289 c->replay_tree = RB_ROOT;
290}
291
292/**
293 * apply_replay_tree - apply the replay tree to the TNC.
294 * @c: UBIFS file-system description object
295 *
296 * Apply the replay tree.
297 * Returns zero in case of success and a negative error code in case of
298 * failure.
299 */
300static int apply_replay_tree(struct ubifs_info *c)
301{
302 struct rb_node *this = rb_first(&c->replay_tree);
303
304 while (this) {
305 struct replay_entry *r;
306 int err;
307
308 cond_resched();
309
310 r = rb_entry(this, struct replay_entry, rb);
311 err = apply_replay_entry(c, r);
312 if (err)
313 return err;
314 this = rb_next(this);
315 }
316 return 0;
317}
318
319/**
320 * insert_node - insert a node to the replay tree.
321 * @c: UBIFS file-system description object
322 * @lnum: node logical eraseblock number
323 * @offs: node offset
324 * @len: node length
325 * @key: node key
326 * @sqnum: sequence number
327 * @deletion: non-zero if this is a deletion
328 * @used: number of bytes in use in a LEB
329 * @old_size: truncation old size
330 * @new_size: truncation new size
331 *
332 * This function inserts a scanned non-direntry node to the replay tree. The
333 * replay tree is an RB-tree containing @struct replay_entry elements which are
334 * indexed by the sequence number. The replay tree is applied at the very end
335 * of the replay process. Since the tree is sorted in sequence number order,
336 * the older modifications are applied first. This function returns zero in
337 * case of success and a negative error code in case of failure.
338 */
339static int insert_node(struct ubifs_info *c, int lnum, int offs, int len,
340 union ubifs_key *key, unsigned long long sqnum,
341 int deletion, int *used, loff_t old_size,
342 loff_t new_size)
343{
344 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
345 struct replay_entry *r;
346
347 if (key_inum(c, key) >= c->highest_inum)
348 c->highest_inum = key_inum(c, key);
349
350 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
351 while (*p) {
352 parent = *p;
353 r = rb_entry(parent, struct replay_entry, rb);
354 if (sqnum < r->sqnum) {
355 p = &(*p)->rb_left;
356 continue;
357 } else if (sqnum > r->sqnum) {
358 p = &(*p)->rb_right;
359 continue;
360 }
361 ubifs_err("duplicate sqnum in replay");
362 return -EINVAL;
363 }
364
365 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
366 if (!r)
367 return -ENOMEM;
368
369 if (!deletion)
370 *used += ALIGN(len, 8);
371 r->lnum = lnum;
372 r->offs = offs;
373 r->len = len;
374 r->sqnum = sqnum;
375 r->flags = (deletion ? REPLAY_DELETION : 0);
376 r->old_size = old_size;
377 r->new_size = new_size;
378 key_copy(c, key, &r->key);
379
380 rb_link_node(&r->rb, parent, p);
381 rb_insert_color(&r->rb, &c->replay_tree);
382 return 0;
383}
384
385/**
386 * insert_dent - insert a directory entry node into the replay tree.
387 * @c: UBIFS file-system description object
388 * @lnum: node logical eraseblock number
389 * @offs: node offset
390 * @len: node length
391 * @key: node key
392 * @name: directory entry name
393 * @nlen: directory entry name length
394 * @sqnum: sequence number
395 * @deletion: non-zero if this is a deletion
396 * @used: number of bytes in use in a LEB
397 *
398 * This function inserts a scanned directory entry node to the replay tree.
399 * Returns zero in case of success and a negative error code in case of
400 * failure.
401 *
402 * This function is also used for extended attribute entries because they are
403 * implemented as directory entry nodes.
404 */
405static int insert_dent(struct ubifs_info *c, int lnum, int offs, int len,
406 union ubifs_key *key, const char *name, int nlen,
407 unsigned long long sqnum, int deletion, int *used)
408{
409 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
410 struct replay_entry *r;
411 char *nbuf;
412
413 if (key_inum(c, key) >= c->highest_inum)
414 c->highest_inum = key_inum(c, key);
415
416 dbg_mnt("add LEB %d:%d, key %s", lnum, offs, DBGKEY(key));
417 while (*p) {
418 parent = *p;
419 r = rb_entry(parent, struct replay_entry, rb);
420 if (sqnum < r->sqnum) {
421 p = &(*p)->rb_left;
422 continue;
423 }
424 if (sqnum > r->sqnum) {
425 p = &(*p)->rb_right;
426 continue;
427 }
428 ubifs_err("duplicate sqnum in replay");
429 return -EINVAL;
430 }
431
432 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
433 if (!r)
434 return -ENOMEM;
435 nbuf = kmalloc(nlen + 1, GFP_KERNEL);
436 if (!nbuf) {
437 kfree(r);
438 return -ENOMEM;
439 }
440
441 if (!deletion)
442 *used += ALIGN(len, 8);
443 r->lnum = lnum;
444 r->offs = offs;
445 r->len = len;
446 r->sqnum = sqnum;
447 r->nm.len = nlen;
448 memcpy(nbuf, name, nlen);
449 nbuf[nlen] = '\0';
450 r->nm.name = nbuf;
451 r->flags = (deletion ? REPLAY_DELETION : 0);
452 key_copy(c, key, &r->key);
453
454 ubifs_assert(!*p);
455 rb_link_node(&r->rb, parent, p);
456 rb_insert_color(&r->rb, &c->replay_tree);
457 return 0;
458}
459
460/**
461 * ubifs_validate_entry - validate directory or extended attribute entry node.
462 * @c: UBIFS file-system description object
463 * @dent: the node to validate
464 *
465 * This function validates directory or extended attribute entry node @dent.
466 * Returns zero if the node is all right and a %-EINVAL if not.
467 */
468int ubifs_validate_entry(struct ubifs_info *c,
469 const struct ubifs_dent_node *dent)
470{
471 int key_type = key_type_flash(c, dent->key);
472 int nlen = le16_to_cpu(dent->nlen);
473
474 if (le32_to_cpu(dent->ch.len) != nlen + UBIFS_DENT_NODE_SZ + 1 ||
475 dent->type >= UBIFS_ITYPES_CNT ||
476 nlen > UBIFS_MAX_NLEN || dent->name[nlen] != 0 ||
477 strnlen(dent->name, nlen) != nlen ||
478 le64_to_cpu(dent->inum) > MAX_INUM) {
479 ubifs_err("bad %s node", key_type == UBIFS_DENT_KEY ?
480 "directory entry" : "extended attribute entry");
481 return -EINVAL;
482 }
483
484 if (key_type != UBIFS_DENT_KEY && key_type != UBIFS_XENT_KEY) {
485 ubifs_err("bad key type %d", key_type);
486 return -EINVAL;
487 }
488
489 return 0;
490}
491
492/**
493 * replay_bud - replay a bud logical eraseblock.
494 * @c: UBIFS file-system description object
495 * @lnum: bud logical eraseblock number to replay
496 * @offs: bud start offset
497 * @jhead: journal head to which this bud belongs
498 * @free: amount of free space in the bud is returned here
499 * @dirty: amount of dirty space from padding and deletion nodes is returned
500 * here
501 *
502 * This function returns zero in case of success and a negative error code in
503 * case of failure.
504 */
505static int replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
506 int *free, int *dirty)
507{
508 int err = 0, used = 0;
509 struct ubifs_scan_leb *sleb;
510 struct ubifs_scan_node *snod;
511 struct ubifs_bud *bud;
512
Artem Bityutskiyc839e292011-05-13 12:26:54 +0300513 dbg_mnt("replay bud LEB %d, head %d, offs %d", lnum, jhead, offs);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300514 if (c->need_recovery)
515 sleb = ubifs_recover_leb(c, lnum, offs, c->sbuf, jhead != GCHD);
516 else
Artem Bityutskiy348709b2009-08-25 15:00:55 +0300517 sleb = ubifs_scan(c, lnum, offs, c->sbuf, 0);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300518 if (IS_ERR(sleb))
519 return PTR_ERR(sleb);
520
521 /*
522 * The bud does not have to start from offset zero - the beginning of
523 * the 'lnum' LEB may contain previously committed data. One of the
524 * things we have to do in replay is to correctly update lprops with
525 * newer information about this LEB.
526 *
527 * At this point lprops thinks that this LEB has 'c->leb_size - offs'
528 * bytes of free space because it only contain information about
529 * committed data.
530 *
531 * But we know that real amount of free space is 'c->leb_size -
532 * sleb->endpt', and the space in the 'lnum' LEB between 'offs' and
533 * 'sleb->endpt' is used by bud data. We have to correctly calculate
534 * how much of these data are dirty and update lprops with this
535 * information.
536 *
537 * The dirt in that LEB region is comprised of padding nodes, deletion
538 * nodes, truncation nodes and nodes which are obsoleted by subsequent
539 * nodes in this LEB. So instead of calculating clean space, we
540 * calculate used space ('used' variable).
541 */
542
543 list_for_each_entry(snod, &sleb->nodes, list) {
544 int deletion = 0;
545
546 cond_resched();
547
548 if (snod->sqnum >= SQNUM_WATERMARK) {
549 ubifs_err("file system's life ended");
550 goto out_dump;
551 }
552
553 if (snod->sqnum > c->max_sqnum)
554 c->max_sqnum = snod->sqnum;
555
556 switch (snod->type) {
557 case UBIFS_INO_NODE:
558 {
559 struct ubifs_ino_node *ino = snod->node;
560 loff_t new_size = le64_to_cpu(ino->size);
561
562 if (le32_to_cpu(ino->nlink) == 0)
563 deletion = 1;
564 err = insert_node(c, lnum, snod->offs, snod->len,
565 &snod->key, snod->sqnum, deletion,
566 &used, 0, new_size);
567 break;
568 }
569 case UBIFS_DATA_NODE:
570 {
571 struct ubifs_data_node *dn = snod->node;
572 loff_t new_size = le32_to_cpu(dn->size) +
573 key_block(c, &snod->key) *
574 UBIFS_BLOCK_SIZE;
575
576 err = insert_node(c, lnum, snod->offs, snod->len,
577 &snod->key, snod->sqnum, deletion,
578 &used, 0, new_size);
579 break;
580 }
581 case UBIFS_DENT_NODE:
582 case UBIFS_XENT_NODE:
583 {
584 struct ubifs_dent_node *dent = snod->node;
585
586 err = ubifs_validate_entry(c, dent);
587 if (err)
588 goto out_dump;
589
590 err = insert_dent(c, lnum, snod->offs, snod->len,
591 &snod->key, dent->name,
592 le16_to_cpu(dent->nlen), snod->sqnum,
593 !le64_to_cpu(dent->inum), &used);
594 break;
595 }
596 case UBIFS_TRUN_NODE:
597 {
598 struct ubifs_trun_node *trun = snod->node;
599 loff_t old_size = le64_to_cpu(trun->old_size);
600 loff_t new_size = le64_to_cpu(trun->new_size);
601 union ubifs_key key;
602
603 /* Validate truncation node */
604 if (old_size < 0 || old_size > c->max_inode_sz ||
605 new_size < 0 || new_size > c->max_inode_sz ||
606 old_size <= new_size) {
607 ubifs_err("bad truncation node");
608 goto out_dump;
609 }
610
611 /*
612 * Create a fake truncation key just to use the same
613 * functions which expect nodes to have keys.
614 */
615 trun_key_init(c, &key, le32_to_cpu(trun->inum));
616 err = insert_node(c, lnum, snod->offs, snod->len,
617 &key, snod->sqnum, 1, &used,
618 old_size, new_size);
619 break;
620 }
621 default:
622 ubifs_err("unexpected node type %d in bud LEB %d:%d",
623 snod->type, lnum, snod->offs);
624 err = -EINVAL;
625 goto out_dump;
626 }
627 if (err)
628 goto out;
629 }
630
631 bud = ubifs_search_bud(c, lnum);
632 if (!bud)
633 BUG();
634
635 ubifs_assert(sleb->endpt - offs >= used);
636 ubifs_assert(sleb->endpt % c->min_io_size == 0);
637
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300638 *dirty = sleb->endpt - offs - used;
639 *free = c->leb_size - sleb->endpt;
Artem Bityutskiyc839e292011-05-13 12:26:54 +0300640 dbg_mnt("bud LEB %d replied: dirty %d, free %d", lnum, *dirty, *free);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300641
642out:
643 ubifs_scan_destroy(sleb);
644 return err;
645
646out_dump:
647 ubifs_err("bad node is at LEB %d:%d", lnum, snod->offs);
648 dbg_dump_node(c, snod->node);
649 ubifs_scan_destroy(sleb);
650 return -EINVAL;
651}
652
653/**
654 * insert_ref_node - insert a reference node to the replay tree.
655 * @c: UBIFS file-system description object
656 * @lnum: node logical eraseblock number
657 * @offs: node offset
658 * @sqnum: sequence number
659 * @free: amount of free space in bud
660 * @dirty: amount of dirty space from padding and deletion nodes
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +0300661 * @jhead: journal head number for the bud
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300662 *
663 * This function inserts a reference node to the replay tree and returns zero
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +0200664 * in case of success or a negative error code in case of failure.
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300665 */
666static int insert_ref_node(struct ubifs_info *c, int lnum, int offs,
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +0300667 unsigned long long sqnum, int free, int dirty,
668 int jhead)
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300669{
670 struct rb_node **p = &c->replay_tree.rb_node, *parent = NULL;
671 struct replay_entry *r;
672
673 dbg_mnt("add ref LEB %d:%d", lnum, offs);
674 while (*p) {
675 parent = *p;
676 r = rb_entry(parent, struct replay_entry, rb);
677 if (sqnum < r->sqnum) {
678 p = &(*p)->rb_left;
679 continue;
680 } else if (sqnum > r->sqnum) {
681 p = &(*p)->rb_right;
682 continue;
683 }
684 ubifs_err("duplicate sqnum in replay tree");
685 return -EINVAL;
686 }
687
688 r = kzalloc(sizeof(struct replay_entry), GFP_KERNEL);
689 if (!r)
690 return -ENOMEM;
691
692 r->lnum = lnum;
693 r->offs = offs;
694 r->sqnum = sqnum;
695 r->flags = REPLAY_REF;
696 r->free = free;
697 r->dirty = dirty;
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +0300698 r->jhead = jhead;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300699
700 rb_link_node(&r->rb, parent, p);
701 rb_insert_color(&r->rb, &c->replay_tree);
702 return 0;
703}
704
705/**
706 * replay_buds - replay all buds.
707 * @c: UBIFS file-system description object
708 *
709 * This function returns zero in case of success and a negative error code in
710 * case of failure.
711 */
712static int replay_buds(struct ubifs_info *c)
713{
714 struct bud_entry *b;
715 int err, uninitialized_var(free), uninitialized_var(dirty);
Artem Bityutskiy7703f092011-05-13 16:02:19 +0300716 unsigned long long prev_sqnum = 0;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300717
718 list_for_each_entry(b, &c->replay_buds, list) {
719 err = replay_bud(c, b->bud->lnum, b->bud->start, b->bud->jhead,
720 &free, &dirty);
721 if (err)
722 return err;
723 err = insert_ref_node(c, b->bud->lnum, b->bud->start, b->sqnum,
Artem Bityutskiy52c6e6f2011-04-25 18:46:31 +0300724 free, dirty, b->bud->jhead);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300725 if (err)
726 return err;
Artem Bityutskiy7703f092011-05-13 16:02:19 +0300727
728 ubifs_assert(b->sqnum > prev_sqnum);
729 prev_sqnum = b->sqnum;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300730 }
731
732 return 0;
733}
734
735/**
736 * destroy_bud_list - destroy the list of buds to replay.
737 * @c: UBIFS file-system description object
738 */
739static void destroy_bud_list(struct ubifs_info *c)
740{
741 struct bud_entry *b;
742
743 while (!list_empty(&c->replay_buds)) {
744 b = list_entry(c->replay_buds.next, struct bud_entry, list);
745 list_del(&b->list);
746 kfree(b);
747 }
748}
749
750/**
751 * add_replay_bud - add a bud to the list of buds to replay.
752 * @c: UBIFS file-system description object
753 * @lnum: bud logical eraseblock number to replay
754 * @offs: bud start offset
755 * @jhead: journal head to which this bud belongs
756 * @sqnum: reference node sequence number
757 *
758 * This function returns zero in case of success and a negative error code in
759 * case of failure.
760 */
761static int add_replay_bud(struct ubifs_info *c, int lnum, int offs, int jhead,
762 unsigned long long sqnum)
763{
764 struct ubifs_bud *bud;
765 struct bud_entry *b;
766
767 dbg_mnt("add replay bud LEB %d:%d, head %d", lnum, offs, jhead);
768
769 bud = kmalloc(sizeof(struct ubifs_bud), GFP_KERNEL);
770 if (!bud)
771 return -ENOMEM;
772
773 b = kmalloc(sizeof(struct bud_entry), GFP_KERNEL);
774 if (!b) {
775 kfree(bud);
776 return -ENOMEM;
777 }
778
779 bud->lnum = lnum;
780 bud->start = offs;
781 bud->jhead = jhead;
782 ubifs_add_bud(c, bud);
783
784 b->bud = bud;
785 b->sqnum = sqnum;
786 list_add_tail(&b->list, &c->replay_buds);
787
788 return 0;
789}
790
791/**
792 * validate_ref - validate a reference node.
793 * @c: UBIFS file-system description object
794 * @ref: the reference node to validate
795 * @ref_lnum: LEB number of the reference node
796 * @ref_offs: reference node offset
797 *
798 * This function returns %1 if a bud reference already exists for the LEB. %0 is
799 * returned if the reference node is new, otherwise %-EINVAL is returned if
800 * validation failed.
801 */
802static int validate_ref(struct ubifs_info *c, const struct ubifs_ref_node *ref)
803{
804 struct ubifs_bud *bud;
805 int lnum = le32_to_cpu(ref->lnum);
806 unsigned int offs = le32_to_cpu(ref->offs);
807 unsigned int jhead = le32_to_cpu(ref->jhead);
808
809 /*
810 * ref->offs may point to the end of LEB when the journal head points
811 * to the end of LEB and we write reference node for it during commit.
812 * So this is why we require 'offs > c->leb_size'.
813 */
814 if (jhead >= c->jhead_cnt || lnum >= c->leb_cnt ||
815 lnum < c->main_first || offs > c->leb_size ||
816 offs & (c->min_io_size - 1))
817 return -EINVAL;
818
819 /* Make sure we have not already looked at this bud */
820 bud = ubifs_search_bud(c, lnum);
821 if (bud) {
822 if (bud->jhead == jhead && bud->start <= offs)
823 return 1;
824 ubifs_err("bud at LEB %d:%d was already referred", lnum, offs);
825 return -EINVAL;
826 }
827
828 return 0;
829}
830
831/**
832 * replay_log_leb - replay a log logical eraseblock.
833 * @c: UBIFS file-system description object
834 * @lnum: log logical eraseblock to replay
835 * @offs: offset to start replaying from
836 * @sbuf: scan buffer
837 *
838 * This function replays a log LEB and returns zero in case of success, %1 if
839 * this is the last LEB in the log, and a negative error code in case of
840 * failure.
841 */
842static int replay_log_leb(struct ubifs_info *c, int lnum, int offs, void *sbuf)
843{
844 int err;
845 struct ubifs_scan_leb *sleb;
846 struct ubifs_scan_node *snod;
847 const struct ubifs_cs_node *node;
848
849 dbg_mnt("replay log LEB %d:%d", lnum, offs);
Artem Bityutskiy348709b2009-08-25 15:00:55 +0300850 sleb = ubifs_scan(c, lnum, offs, sbuf, c->need_recovery);
851 if (IS_ERR(sleb)) {
Artem Bityutskiyed43f2f2009-06-29 17:59:23 +0300852 if (PTR_ERR(sleb) != -EUCLEAN || !c->need_recovery)
853 return PTR_ERR(sleb);
Artem Bityutskiy7d08ae32010-10-17 15:50:19 +0300854 /*
855 * Note, the below function will recover this log LEB only if
856 * it is the last, because unclean reboots can possibly corrupt
857 * only the tail of the log.
858 */
Artem Bityutskiyed43f2f2009-06-29 17:59:23 +0300859 sleb = ubifs_recover_log_leb(c, lnum, offs, sbuf);
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300860 if (IS_ERR(sleb))
861 return PTR_ERR(sleb);
862 }
863
864 if (sleb->nodes_cnt == 0) {
865 err = 1;
866 goto out;
867 }
868
869 node = sleb->buf;
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300870 snod = list_entry(sleb->nodes.next, struct ubifs_scan_node, list);
871 if (c->cs_sqnum == 0) {
872 /*
873 * This is the first log LEB we are looking at, make sure that
874 * the first node is a commit start node. Also record its
875 * sequence number so that UBIFS can determine where the log
876 * ends, because all nodes which were have higher sequence
877 * numbers.
878 */
879 if (snod->type != UBIFS_CS_NODE) {
880 dbg_err("first log node at LEB %d:%d is not CS node",
881 lnum, offs);
882 goto out_dump;
883 }
884 if (le64_to_cpu(node->cmt_no) != c->cmt_no) {
885 dbg_err("first CS node at LEB %d:%d has wrong "
886 "commit number %llu expected %llu",
887 lnum, offs,
888 (unsigned long long)le64_to_cpu(node->cmt_no),
889 c->cmt_no);
890 goto out_dump;
891 }
892
893 c->cs_sqnum = le64_to_cpu(node->ch.sqnum);
894 dbg_mnt("commit start sqnum %llu", c->cs_sqnum);
895 }
896
897 if (snod->sqnum < c->cs_sqnum) {
898 /*
899 * This means that we reached end of log and now
900 * look to the older log data, which was already
901 * committed but the eraseblock was not erased (UBIFS
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +0200902 * only un-maps it). So this basically means we have to
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300903 * exit with "end of log" code.
904 */
905 err = 1;
906 goto out;
907 }
908
909 /* Make sure the first node sits at offset zero of the LEB */
910 if (snod->offs != 0) {
911 dbg_err("first node is not at zero offset");
912 goto out_dump;
913 }
914
915 list_for_each_entry(snod, &sleb->nodes, list) {
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300916 cond_resched();
917
918 if (snod->sqnum >= SQNUM_WATERMARK) {
919 ubifs_err("file system's life ended");
920 goto out_dump;
921 }
922
923 if (snod->sqnum < c->cs_sqnum) {
924 dbg_err("bad sqnum %llu, commit sqnum %llu",
925 snod->sqnum, c->cs_sqnum);
926 goto out_dump;
927 }
928
929 if (snod->sqnum > c->max_sqnum)
930 c->max_sqnum = snod->sqnum;
931
932 switch (snod->type) {
933 case UBIFS_REF_NODE: {
934 const struct ubifs_ref_node *ref = snod->node;
935
936 err = validate_ref(c, ref);
937 if (err == 1)
938 break; /* Already have this bud */
939 if (err)
940 goto out_dump;
941
942 err = add_replay_bud(c, le32_to_cpu(ref->lnum),
943 le32_to_cpu(ref->offs),
944 le32_to_cpu(ref->jhead),
945 snod->sqnum);
946 if (err)
947 goto out;
948
949 break;
950 }
951 case UBIFS_CS_NODE:
952 /* Make sure it sits at the beginning of LEB */
953 if (snod->offs != 0) {
954 ubifs_err("unexpected node in log");
955 goto out_dump;
956 }
957 break;
958 default:
959 ubifs_err("unexpected node in log");
960 goto out_dump;
961 }
962 }
963
964 if (sleb->endpt || c->lhead_offs >= c->leb_size) {
965 c->lhead_lnum = lnum;
966 c->lhead_offs = sleb->endpt;
967 }
968
969 err = !sleb->endpt;
970out:
971 ubifs_scan_destroy(sleb);
972 return err;
973
974out_dump:
Adrian Hunter681947d2009-06-24 09:59:38 +0300975 ubifs_err("log error detected while replaying the log at LEB %d:%d",
Artem Bityutskiy1e517642008-07-14 19:08:37 +0300976 lnum, offs + snod->offs);
977 dbg_dump_node(c, snod->node);
978 ubifs_scan_destroy(sleb);
979 return -EINVAL;
980}
981
982/**
983 * take_ihead - update the status of the index head in lprops to 'taken'.
984 * @c: UBIFS file-system description object
985 *
986 * This function returns the amount of free space in the index head LEB or a
987 * negative error code.
988 */
989static int take_ihead(struct ubifs_info *c)
990{
991 const struct ubifs_lprops *lp;
992 int err, free;
993
994 ubifs_get_lprops(c);
995
996 lp = ubifs_lpt_lookup_dirty(c, c->ihead_lnum);
997 if (IS_ERR(lp)) {
998 err = PTR_ERR(lp);
999 goto out;
1000 }
1001
1002 free = lp->free;
1003
1004 lp = ubifs_change_lp(c, lp, LPROPS_NC, LPROPS_NC,
1005 lp->flags | LPROPS_TAKEN, 0);
1006 if (IS_ERR(lp)) {
1007 err = PTR_ERR(lp);
1008 goto out;
1009 }
1010
1011 err = free;
1012out:
1013 ubifs_release_lprops(c);
1014 return err;
1015}
1016
1017/**
1018 * ubifs_replay_journal - replay journal.
1019 * @c: UBIFS file-system description object
1020 *
1021 * This function scans the journal, replays and cleans it up. It makes sure all
1022 * memory data structures related to uncommitted journal are built (dirty TNC
1023 * tree, tree of buds, modified lprops, etc).
1024 */
1025int ubifs_replay_journal(struct ubifs_info *c)
1026{
1027 int err, i, lnum, offs, free;
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001028
1029 BUILD_BUG_ON(UBIFS_TRUN_KEY > 5);
1030
1031 /* Update the status of the index head in lprops to 'taken' */
1032 free = take_ihead(c);
1033 if (free < 0)
1034 return free; /* Error code */
1035
1036 if (c->ihead_offs != c->leb_size - free) {
1037 ubifs_err("bad index head LEB %d:%d", c->ihead_lnum,
1038 c->ihead_offs);
1039 return -EINVAL;
1040 }
1041
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001042 dbg_mnt("start replaying the journal");
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001043 c->replaying = 1;
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001044 lnum = c->ltail_lnum = c->lhead_lnum;
1045 offs = c->lhead_offs;
1046
1047 for (i = 0; i < c->log_lebs; i++, lnum++) {
1048 if (lnum >= UBIFS_LOG_LNUM + c->log_lebs) {
1049 /*
1050 * The log is logically circular, we reached the last
1051 * LEB, switch to the first one.
1052 */
1053 lnum = UBIFS_LOG_LNUM;
1054 offs = 0;
1055 }
Artem Bityutskiy6599fcb2010-10-18 10:00:40 +03001056 err = replay_log_leb(c, lnum, offs, c->sbuf);
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001057 if (err == 1)
1058 /* We hit the end of the log */
1059 break;
1060 if (err)
1061 goto out;
1062 offs = 0;
1063 }
1064
1065 err = replay_buds(c);
1066 if (err)
1067 goto out;
1068
1069 err = apply_replay_tree(c);
1070 if (err)
1071 goto out;
1072
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +02001073 /*
Artem Bityutskiyb1375452011-03-29 18:04:05 +03001074 * UBIFS budgeting calculations use @c->bi.uncommitted_idx variable
1075 * to roughly estimate index growth. Things like @c->bi.min_idx_lebs
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +02001076 * depend on it. This means we have to initialize it to make sure
1077 * budgeting works properly.
1078 */
Artem Bityutskiyb1375452011-03-29 18:04:05 +03001079 c->bi.uncommitted_idx = atomic_long_read(&c->dirty_zn_cnt);
1080 c->bi.uncommitted_idx *= c->max_idx_node_sz;
Artem Bityutskiy6edbfaf2008-12-30 20:06:49 +02001081
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001082 ubifs_assert(c->bud_bytes <= c->max_bud_bytes || c->need_recovery);
1083 dbg_mnt("finished, log head LEB %d:%d, max_sqnum %llu, "
1084 "highest_inum %lu", c->lhead_lnum, c->lhead_offs, c->max_sqnum,
Artem Bityutskiye84461a2008-10-29 12:08:43 +02001085 (unsigned long)c->highest_inum);
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001086out:
1087 destroy_replay_tree(c);
1088 destroy_bud_list(c);
Artem Bityutskiy1e517642008-07-14 19:08:37 +03001089 c->replaying = 0;
1090 return err;
1091}