blob: f1220ec1896fab49c416b0e67613a0eb1aa71652 [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Tim Shimmin87c199c2006-06-09 14:56:16 +10002 * Copyright (c) 2000-2006 Silicon Graphics, Inc.
Nathan Scott7b718762005-11-02 14:58:39 +11003 * All Rights Reserved.
Linus Torvalds1da177e2005-04-16 15:20:36 -07004 *
Nathan Scott7b718762005-11-02 14:58:39 +11005 * This program is free software; you can redistribute it and/or
6 * modify it under the terms of the GNU General Public License as
Linus Torvalds1da177e2005-04-16 15:20:36 -07007 * published by the Free Software Foundation.
8 *
Nathan Scott7b718762005-11-02 14:58:39 +11009 * This program is distributed in the hope that it would be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 * GNU General Public License for more details.
Linus Torvalds1da177e2005-04-16 15:20:36 -070013 *
Nathan Scott7b718762005-11-02 14:58:39 +110014 * You should have received a copy of the GNU General Public License
15 * along with this program; if not, write the Free Software Foundation,
16 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
Linus Torvalds1da177e2005-04-16 15:20:36 -070017 */
Linus Torvalds1da177e2005-04-16 15:20:36 -070018#include "xfs.h"
Nathan Scotta844f452005-11-02 14:38:42 +110019#include "xfs_fs.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070020#include "xfs_types.h"
Nathan Scotta844f452005-11-02 14:38:42 +110021#include "xfs_bit.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070022#include "xfs_log.h"
Nathan Scotta844f452005-11-02 14:38:42 +110023#include "xfs_inum.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070024#include "xfs_trans.h"
Nathan Scotta844f452005-11-02 14:38:42 +110025#include "xfs_sb.h"
26#include "xfs_ag.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070027#include "xfs_dir2.h"
28#include "xfs_dmapi.h"
29#include "xfs_mount.h"
30#include "xfs_error.h"
31#include "xfs_bmap_btree.h"
Nathan Scotta844f452005-11-02 14:38:42 +110032#include "xfs_alloc_btree.h"
33#include "xfs_ialloc_btree.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070034#include "xfs_dir2_sf.h"
Nathan Scotta844f452005-11-02 14:38:42 +110035#include "xfs_attr_sf.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070036#include "xfs_dinode.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070037#include "xfs_inode.h"
Nathan Scotta844f452005-11-02 14:38:42 +110038#include "xfs_inode_item.h"
Nathan Scotta844f452005-11-02 14:38:42 +110039#include "xfs_alloc.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070040#include "xfs_ialloc.h"
41#include "xfs_log_priv.h"
42#include "xfs_buf_item.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070043#include "xfs_log_recover.h"
44#include "xfs_extfree_item.h"
45#include "xfs_trans_priv.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070046#include "xfs_quota.h"
47#include "xfs_rw.h"
Christoph Hellwig43355092008-03-27 18:01:08 +110048#include "xfs_utils.h"
Christoph Hellwig0b1b2132009-12-14 23:14:59 +000049#include "xfs_trace.h"
Linus Torvalds1da177e2005-04-16 15:20:36 -070050
51STATIC int xlog_find_zeroed(xlog_t *, xfs_daddr_t *);
52STATIC int xlog_clear_stale_blocks(xlog_t *, xfs_lsn_t);
Linus Torvalds1da177e2005-04-16 15:20:36 -070053#if defined(DEBUG)
54STATIC void xlog_recover_check_summary(xlog_t *);
Linus Torvalds1da177e2005-04-16 15:20:36 -070055#else
56#define xlog_recover_check_summary(log)
Linus Torvalds1da177e2005-04-16 15:20:36 -070057#endif
58
Linus Torvalds1da177e2005-04-16 15:20:36 -070059/*
60 * Sector aligned buffer routines for buffer create/read/write/access
61 */
62
Alex Elderff30a622010-04-13 15:22:58 +100063/*
64 * Verify the given count of basic blocks is valid number of blocks
65 * to specify for an operation involving the given XFS log buffer.
66 * Returns nonzero if the count is valid, 0 otherwise.
67 */
68
69static inline int
70xlog_buf_bbcount_valid(
71 xlog_t *log,
72 int bbcount)
73{
74 return bbcount > 0 && bbcount <= log->l_logBBsize;
75}
76
Alex Elder36adecf2010-04-13 15:21:13 +100077/*
78 * Allocate a buffer to hold log data. The buffer needs to be able
79 * to map to a range of nbblks basic blocks at any valid (basic
80 * block) offset within the log.
81 */
Eric Sandeen5d77c0d2009-11-19 15:52:00 +000082STATIC xfs_buf_t *
Linus Torvalds1da177e2005-04-16 15:20:36 -070083xlog_get_bp(
84 xlog_t *log,
Dave Chinner32281492009-01-22 15:37:47 +110085 int nbblks)
Linus Torvalds1da177e2005-04-16 15:20:36 -070086{
Alex Elderff30a622010-04-13 15:22:58 +100087 if (!xlog_buf_bbcount_valid(log, nbblks)) {
88 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
89 nbblks);
90 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
Dave Chinner32281492009-01-22 15:37:47 +110091 return NULL;
92 }
Linus Torvalds1da177e2005-04-16 15:20:36 -070093
Alex Elder36adecf2010-04-13 15:21:13 +100094 /*
95 * We do log I/O in units of log sectors (a power-of-2
96 * multiple of the basic block size), so we round up the
97 * requested size to acommodate the basic blocks required
98 * for complete log sectors.
99 *
100 * In addition, the buffer may be used for a non-sector-
101 * aligned block offset, in which case an I/O of the
102 * requested size could extend beyond the end of the
103 * buffer. If the requested size is only 1 basic block it
104 * will never straddle a sector boundary, so this won't be
105 * an issue. Nor will this be a problem if the log I/O is
106 * done in basic blocks (sector size 1). But otherwise we
107 * extend the buffer by one extra log sector to ensure
108 * there's space to accomodate this possiblility.
109 */
Alex Elder69ce58f2010-04-20 17:09:59 +1000110 if (nbblks > 1 && log->l_sectBBsize > 1)
111 nbblks += log->l_sectBBsize;
112 nbblks = round_up(nbblks, log->l_sectBBsize);
Alex Elder36adecf2010-04-13 15:21:13 +1000113
Dave Chinner32281492009-01-22 15:37:47 +1100114 return xfs_buf_get_noaddr(BBTOB(nbblks), log->l_mp->m_logdev_targp);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700115}
116
Eric Sandeen5d77c0d2009-11-19 15:52:00 +0000117STATIC void
Linus Torvalds1da177e2005-04-16 15:20:36 -0700118xlog_put_bp(
119 xfs_buf_t *bp)
120{
121 xfs_buf_free(bp);
122}
123
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100124STATIC xfs_caddr_t
125xlog_align(
126 xlog_t *log,
127 xfs_daddr_t blk_no,
128 int nbblks,
129 xfs_buf_t *bp)
130{
131 xfs_caddr_t ptr;
132
Alex Elder69ce58f2010-04-20 17:09:59 +1000133 if (log->l_sectBBsize == 1)
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100134 return XFS_BUF_PTR(bp);
135
136 ptr = XFS_BUF_PTR(bp) + BBTOB((int)blk_no & log->l_sectbb_mask);
137 ASSERT(XFS_BUF_SIZE(bp) >=
138 BBTOB(nbblks + (blk_no & log->l_sectbb_mask)));
139 return ptr;
140}
141
Linus Torvalds1da177e2005-04-16 15:20:36 -0700142
143/*
144 * nbblks should be uint, but oh well. Just want to catch that 32-bit length.
145 */
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100146STATIC int
147xlog_bread_noalign(
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 xlog_t *log,
149 xfs_daddr_t blk_no,
150 int nbblks,
151 xfs_buf_t *bp)
152{
153 int error;
154
Alex Elderff30a622010-04-13 15:22:58 +1000155 if (!xlog_buf_bbcount_valid(log, nbblks)) {
156 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
157 nbblks);
158 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
Dave Chinner32281492009-01-22 15:37:47 +1100159 return EFSCORRUPTED;
160 }
161
Alex Elder69ce58f2010-04-20 17:09:59 +1000162 blk_no = round_down(blk_no, log->l_sectBBsize);
163 nbblks = round_up(nbblks, log->l_sectBBsize);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700164
165 ASSERT(nbblks > 0);
166 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700167
168 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
169 XFS_BUF_READ(bp);
170 XFS_BUF_BUSY(bp);
171 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
172 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
173
174 xfsbdstrat(log->l_mp, bp);
David Chinnerd64e31a2008-04-10 12:22:17 +1000175 error = xfs_iowait(bp);
176 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700177 xfs_ioerror_alert("xlog_bread", log->l_mp,
178 bp, XFS_BUF_ADDR(bp));
179 return error;
180}
181
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100182STATIC int
183xlog_bread(
184 xlog_t *log,
185 xfs_daddr_t blk_no,
186 int nbblks,
187 xfs_buf_t *bp,
188 xfs_caddr_t *offset)
189{
190 int error;
191
192 error = xlog_bread_noalign(log, blk_no, nbblks, bp);
193 if (error)
194 return error;
195
196 *offset = xlog_align(log, blk_no, nbblks, bp);
197 return 0;
198}
199
Linus Torvalds1da177e2005-04-16 15:20:36 -0700200/*
201 * Write out the buffer at the given block for the given number of blocks.
202 * The buffer is kept locked across the write and is returned locked.
203 * This can only be used for synchronous log writes.
204 */
Christoph Hellwigba0f32d2005-06-21 15:36:52 +1000205STATIC int
Linus Torvalds1da177e2005-04-16 15:20:36 -0700206xlog_bwrite(
207 xlog_t *log,
208 xfs_daddr_t blk_no,
209 int nbblks,
210 xfs_buf_t *bp)
211{
212 int error;
213
Alex Elderff30a622010-04-13 15:22:58 +1000214 if (!xlog_buf_bbcount_valid(log, nbblks)) {
215 xlog_warn("XFS: Invalid block length (0x%x) given for buffer",
216 nbblks);
217 XFS_ERROR_REPORT(__func__, XFS_ERRLEVEL_HIGH, log->l_mp);
Dave Chinner32281492009-01-22 15:37:47 +1100218 return EFSCORRUPTED;
219 }
220
Alex Elder69ce58f2010-04-20 17:09:59 +1000221 blk_no = round_down(blk_no, log->l_sectBBsize);
222 nbblks = round_up(nbblks, log->l_sectBBsize);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700223
224 ASSERT(nbblks > 0);
225 ASSERT(BBTOB(nbblks) <= XFS_BUF_SIZE(bp));
226
227 XFS_BUF_SET_ADDR(bp, log->l_logBBstart + blk_no);
228 XFS_BUF_ZEROFLAGS(bp);
229 XFS_BUF_BUSY(bp);
230 XFS_BUF_HOLD(bp);
231 XFS_BUF_PSEMA(bp, PRIBIO);
232 XFS_BUF_SET_COUNT(bp, BBTOB(nbblks));
233 XFS_BUF_SET_TARGET(bp, log->l_mp->m_logdev_targp);
234
235 if ((error = xfs_bwrite(log->l_mp, bp)))
236 xfs_ioerror_alert("xlog_bwrite", log->l_mp,
237 bp, XFS_BUF_ADDR(bp));
238 return error;
239}
240
#ifdef DEBUG
/*
 * dump debug superblock and log record information
 */
STATIC void
xlog_header_check_dump(
	xfs_mount_t		*mp,
	xlog_rec_header_t	*head)
{
	cmn_err(CE_DEBUG, "%s:  SB : uuid = %pU, fmt = %d\n",
		__func__, &mp->m_sb.sb_uuid, XLOG_FMT);
	cmn_err(CE_DEBUG, "    log : uuid = %pU, fmt = %d\n",
		&head->h_fs_uuid, be32_to_cpu(head->h_fmt));
}
#else
#define xlog_header_check_dump(mp, head)
#endif
258
259/*
260 * check log record header for recovery
261 */
262STATIC int
263xlog_header_check_recover(
264 xfs_mount_t *mp,
265 xlog_rec_header_t *head)
266{
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000267 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700268
269 /*
270 * IRIX doesn't write the h_fmt field and leaves it zeroed
271 * (XLOG_FMT_UNKNOWN). This stops us from trying to recover
272 * a dirty log created in IRIX.
273 */
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000274 if (unlikely(be32_to_cpu(head->h_fmt) != XLOG_FMT)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700275 xlog_warn(
276 "XFS: dirty log written in incompatible format - can't recover");
277 xlog_header_check_dump(mp, head);
278 XFS_ERROR_REPORT("xlog_header_check_recover(1)",
279 XFS_ERRLEVEL_HIGH, mp);
280 return XFS_ERROR(EFSCORRUPTED);
281 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
282 xlog_warn(
283 "XFS: dirty log entry has mismatched uuid - can't recover");
284 xlog_header_check_dump(mp, head);
285 XFS_ERROR_REPORT("xlog_header_check_recover(2)",
286 XFS_ERRLEVEL_HIGH, mp);
287 return XFS_ERROR(EFSCORRUPTED);
288 }
289 return 0;
290}
291
292/*
293 * read the head block of the log and check the header
294 */
295STATIC int
296xlog_header_check_mount(
297 xfs_mount_t *mp,
298 xlog_rec_header_t *head)
299{
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000300 ASSERT(be32_to_cpu(head->h_magicno) == XLOG_HEADER_MAGIC_NUM);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700301
302 if (uuid_is_nil(&head->h_fs_uuid)) {
303 /*
304 * IRIX doesn't write the h_fs_uuid or h_fmt fields. If
305 * h_fs_uuid is nil, we assume this log was last mounted
306 * by IRIX and continue.
307 */
308 xlog_warn("XFS: nil uuid in log - IRIX style log");
309 } else if (unlikely(!uuid_equal(&mp->m_sb.sb_uuid, &head->h_fs_uuid))) {
310 xlog_warn("XFS: log has mismatched uuid - can't recover");
311 xlog_header_check_dump(mp, head);
312 XFS_ERROR_REPORT("xlog_header_check_mount",
313 XFS_ERRLEVEL_HIGH, mp);
314 return XFS_ERROR(EFSCORRUPTED);
315 }
316 return 0;
317}
318
319STATIC void
320xlog_recover_iodone(
321 struct xfs_buf *bp)
322{
Linus Torvalds1da177e2005-04-16 15:20:36 -0700323 if (XFS_BUF_GETERROR(bp)) {
324 /*
325 * We're not going to bother about retrying
326 * this during recovery. One strike!
327 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700328 xfs_ioerror_alert("xlog_recover_iodone",
Christoph Hellwig15ac08a2008-12-09 04:47:30 -0500329 bp->b_mount, bp, XFS_BUF_ADDR(bp));
330 xfs_force_shutdown(bp->b_mount, SHUTDOWN_META_IO_ERROR);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700331 }
Christoph Hellwig15ac08a2008-12-09 04:47:30 -0500332 bp->b_mount = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700333 XFS_BUF_CLR_IODONE_FUNC(bp);
334 xfs_biodone(bp);
335}
336
337/*
338 * This routine finds (to an approximation) the first block in the physical
339 * log which contains the given cycle. It uses a binary search algorithm.
340 * Note that the algorithm can not be perfect because the disk will not
341 * necessarily be perfect.
342 */
David Chinnera8272ce2007-11-23 16:28:09 +1100343STATIC int
Linus Torvalds1da177e2005-04-16 15:20:36 -0700344xlog_find_cycle_start(
345 xlog_t *log,
346 xfs_buf_t *bp,
347 xfs_daddr_t first_blk,
348 xfs_daddr_t *last_blk,
349 uint cycle)
350{
351 xfs_caddr_t offset;
352 xfs_daddr_t mid_blk;
Alex Eldere3bb2e32010-04-15 18:17:30 +0000353 xfs_daddr_t end_blk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700354 uint mid_cycle;
355 int error;
356
Alex Eldere3bb2e32010-04-15 18:17:30 +0000357 end_blk = *last_blk;
358 mid_blk = BLK_AVG(first_blk, end_blk);
359 while (mid_blk != first_blk && mid_blk != end_blk) {
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100360 error = xlog_bread(log, mid_blk, 1, bp, &offset);
361 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700362 return error;
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000363 mid_cycle = xlog_get_cycle(offset);
Alex Eldere3bb2e32010-04-15 18:17:30 +0000364 if (mid_cycle == cycle)
365 end_blk = mid_blk; /* last_half_cycle == mid_cycle */
366 else
367 first_blk = mid_blk; /* first_half_cycle == mid_cycle */
368 mid_blk = BLK_AVG(first_blk, end_blk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700369 }
Alex Eldere3bb2e32010-04-15 18:17:30 +0000370 ASSERT((mid_blk == first_blk && mid_blk+1 == end_blk) ||
371 (mid_blk == end_blk && mid_blk-1 == first_blk));
372
373 *last_blk = end_blk;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700374
375 return 0;
376}
377
378/*
Alex Elder3f943d82010-04-15 18:17:34 +0000379 * Check that a range of blocks does not contain stop_on_cycle_no.
380 * Fill in *new_blk with the block offset where such a block is
381 * found, or with -1 (an invalid block number) if there is no such
382 * block in the range. The scan needs to occur from front to back
383 * and the pointer into the region must be updated since a later
384 * routine will need to perform another test.
Linus Torvalds1da177e2005-04-16 15:20:36 -0700385 */
386STATIC int
387xlog_find_verify_cycle(
388 xlog_t *log,
389 xfs_daddr_t start_blk,
390 int nbblks,
391 uint stop_on_cycle_no,
392 xfs_daddr_t *new_blk)
393{
394 xfs_daddr_t i, j;
395 uint cycle;
396 xfs_buf_t *bp;
397 xfs_daddr_t bufblks;
398 xfs_caddr_t buf = NULL;
399 int error = 0;
400
Alex Elder6881a222010-04-13 15:22:29 +1000401 /*
402 * Greedily allocate a buffer big enough to handle the full
403 * range of basic blocks we'll be examining. If that fails,
404 * try a smaller size. We need to be able to read at least
405 * a log sector, or we're out of luck.
406 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700407 bufblks = 1 << ffs(nbblks);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700408 while (!(bp = xlog_get_bp(log, bufblks))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700409 bufblks >>= 1;
Alex Elder69ce58f2010-04-20 17:09:59 +1000410 if (bufblks < log->l_sectBBsize)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700411 return ENOMEM;
412 }
413
414 for (i = start_blk; i < start_blk + nbblks; i += bufblks) {
415 int bcount;
416
417 bcount = min(bufblks, (start_blk + nbblks - i));
418
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100419 error = xlog_bread(log, i, bcount, bp, &buf);
420 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700421 goto out;
422
Linus Torvalds1da177e2005-04-16 15:20:36 -0700423 for (j = 0; j < bcount; j++) {
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000424 cycle = xlog_get_cycle(buf);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700425 if (cycle == stop_on_cycle_no) {
426 *new_blk = i+j;
427 goto out;
428 }
429
430 buf += BBSIZE;
431 }
432 }
433
434 *new_blk = -1;
435
436out:
437 xlog_put_bp(bp);
438 return error;
439}
440
441/*
442 * Potentially backup over partial log record write.
443 *
444 * In the typical case, last_blk is the number of the block directly after
445 * a good log record. Therefore, we subtract one to get the block number
446 * of the last block in the given buffer. extra_bblks contains the number
447 * of blocks we would have read on a previous read. This happens when the
448 * last log record is split over the end of the physical log.
449 *
450 * extra_bblks is the number of blocks potentially verified on a previous
451 * call to this routine.
452 */
453STATIC int
454xlog_find_verify_log_record(
455 xlog_t *log,
456 xfs_daddr_t start_blk,
457 xfs_daddr_t *last_blk,
458 int extra_bblks)
459{
460 xfs_daddr_t i;
461 xfs_buf_t *bp;
462 xfs_caddr_t offset = NULL;
463 xlog_rec_header_t *head = NULL;
464 int error = 0;
465 int smallmem = 0;
466 int num_blks = *last_blk - start_blk;
467 int xhdrs;
468
469 ASSERT(start_blk != 0 || *last_blk != start_blk);
470
471 if (!(bp = xlog_get_bp(log, num_blks))) {
472 if (!(bp = xlog_get_bp(log, 1)))
473 return ENOMEM;
474 smallmem = 1;
475 } else {
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100476 error = xlog_bread(log, start_blk, num_blks, bp, &offset);
477 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700478 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700479 offset += ((num_blks - 1) << BBSHIFT);
480 }
481
482 for (i = (*last_blk) - 1; i >= 0; i--) {
483 if (i < start_blk) {
484 /* valid log record not found */
485 xlog_warn(
486 "XFS: Log inconsistent (didn't find previous header)");
487 ASSERT(0);
488 error = XFS_ERROR(EIO);
489 goto out;
490 }
491
492 if (smallmem) {
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100493 error = xlog_bread(log, i, 1, bp, &offset);
494 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700495 goto out;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700496 }
497
498 head = (xlog_rec_header_t *)offset;
499
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000500 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(head->h_magicno))
Linus Torvalds1da177e2005-04-16 15:20:36 -0700501 break;
502
503 if (!smallmem)
504 offset -= BBSIZE;
505 }
506
507 /*
508 * We hit the beginning of the physical log & still no header. Return
509 * to caller. If caller can handle a return of -1, then this routine
510 * will be called again for the end of the physical log.
511 */
512 if (i == -1) {
513 error = -1;
514 goto out;
515 }
516
517 /*
518 * We have the final block of the good log (the first block
519 * of the log record _before_ the head. So we check the uuid.
520 */
521 if ((error = xlog_header_check_mount(log->l_mp, head)))
522 goto out;
523
524 /*
525 * We may have found a log record header before we expected one.
526 * last_blk will be the 1st block # with a given cycle #. We may end
527 * up reading an entire log record. In this case, we don't want to
528 * reset last_blk. Only when last_blk points in the middle of a log
529 * record do we update last_blk.
530 */
Eric Sandeen62118702008-03-06 13:44:28 +1100531 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000532 uint h_size = be32_to_cpu(head->h_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700533
534 xhdrs = h_size / XLOG_HEADER_CYCLE_SIZE;
535 if (h_size % XLOG_HEADER_CYCLE_SIZE)
536 xhdrs++;
537 } else {
538 xhdrs = 1;
539 }
540
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000541 if (*last_blk - i + extra_bblks !=
542 BTOBB(be32_to_cpu(head->h_len)) + xhdrs)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700543 *last_blk = i;
544
545out:
546 xlog_put_bp(bp);
547 return error;
548}
549
550/*
551 * Head is defined to be the point of the log where the next log write
552 * write could go. This means that incomplete LR writes at the end are
553 * eliminated when calculating the head. We aren't guaranteed that previous
554 * LR have complete transactions. We only know that a cycle number of
555 * current cycle number -1 won't be present in the log if we start writing
556 * from our current block number.
557 *
558 * last_blk contains the block number of the first block with a given
559 * cycle number.
560 *
561 * Return: zero if normal, non-zero if error.
562 */
Christoph Hellwigba0f32d2005-06-21 15:36:52 +1000563STATIC int
Linus Torvalds1da177e2005-04-16 15:20:36 -0700564xlog_find_head(
565 xlog_t *log,
566 xfs_daddr_t *return_head_blk)
567{
568 xfs_buf_t *bp;
569 xfs_caddr_t offset;
570 xfs_daddr_t new_blk, first_blk, start_blk, last_blk, head_blk;
571 int num_scan_bblks;
572 uint first_half_cycle, last_half_cycle;
573 uint stop_on_cycle;
574 int error, log_bbnum = log->l_logBBsize;
575
576 /* Is the end of the log device zeroed? */
577 if ((error = xlog_find_zeroed(log, &first_blk)) == -1) {
578 *return_head_blk = first_blk;
579
580 /* Is the whole lot zeroed? */
581 if (!first_blk) {
582 /* Linux XFS shouldn't generate totally zeroed logs -
583 * mkfs etc write a dummy unmount record to a fresh
584 * log so we can store the uuid in there
585 */
586 xlog_warn("XFS: totally zeroed log");
587 }
588
589 return 0;
590 } else if (error) {
591 xlog_warn("XFS: empty log check failed");
592 return error;
593 }
594
595 first_blk = 0; /* get cycle # of 1st block */
596 bp = xlog_get_bp(log, 1);
597 if (!bp)
598 return ENOMEM;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100599
600 error = xlog_bread(log, 0, 1, bp, &offset);
601 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700602 goto bp_err;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100603
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000604 first_half_cycle = xlog_get_cycle(offset);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700605
606 last_blk = head_blk = log_bbnum - 1; /* get cycle # of last block */
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100607 error = xlog_bread(log, last_blk, 1, bp, &offset);
608 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700609 goto bp_err;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100610
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000611 last_half_cycle = xlog_get_cycle(offset);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700612 ASSERT(last_half_cycle != 0);
613
614 /*
615 * If the 1st half cycle number is equal to the last half cycle number,
616 * then the entire log is stamped with the same cycle number. In this
617 * case, head_blk can't be set to zero (which makes sense). The below
618 * math doesn't work out properly with head_blk equal to zero. Instead,
619 * we set it to log_bbnum which is an invalid block number, but this
620 * value makes the math correct. If head_blk doesn't changed through
621 * all the tests below, *head_blk is set to zero at the very end rather
622 * than log_bbnum. In a sense, log_bbnum and zero are the same block
623 * in a circular file.
624 */
625 if (first_half_cycle == last_half_cycle) {
626 /*
627 * In this case we believe that the entire log should have
628 * cycle number last_half_cycle. We need to scan backwards
629 * from the end verifying that there are no holes still
630 * containing last_half_cycle - 1. If we find such a hole,
631 * then the start of that hole will be the new head. The
632 * simple case looks like
633 * x | x ... | x - 1 | x
634 * Another case that fits this picture would be
635 * x | x + 1 | x ... | x
Nathan Scottc41564b2006-03-29 08:55:14 +1000636 * In this case the head really is somewhere at the end of the
Linus Torvalds1da177e2005-04-16 15:20:36 -0700637 * log, as one of the latest writes at the beginning was
638 * incomplete.
639 * One more case is
640 * x | x + 1 | x ... | x - 1 | x
641 * This is really the combination of the above two cases, and
642 * the head has to end up at the start of the x-1 hole at the
643 * end of the log.
644 *
645 * In the 256k log case, we will read from the beginning to the
646 * end of the log and search for cycle numbers equal to x-1.
647 * We don't worry about the x+1 blocks that we encounter,
648 * because we know that they cannot be the head since the log
649 * started with x.
650 */
651 head_blk = log_bbnum;
652 stop_on_cycle = last_half_cycle - 1;
653 } else {
654 /*
655 * In this case we want to find the first block with cycle
656 * number matching last_half_cycle. We expect the log to be
657 * some variation on
Alex Elder3f943d82010-04-15 18:17:34 +0000658 * x + 1 ... | x ... | x
Linus Torvalds1da177e2005-04-16 15:20:36 -0700659 * The first block with cycle number x (last_half_cycle) will
660 * be where the new head belongs. First we do a binary search
661 * for the first occurrence of last_half_cycle. The binary
662 * search may not be totally accurate, so then we scan back
663 * from there looking for occurrences of last_half_cycle before
664 * us. If that backwards scan wraps around the beginning of
665 * the log, then we look for occurrences of last_half_cycle - 1
666 * at the end of the log. The cases we're looking for look
667 * like
Alex Elder3f943d82010-04-15 18:17:34 +0000668 * v binary search stopped here
669 * x + 1 ... | x | x + 1 | x ... | x
670 * ^ but we want to locate this spot
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671 * or
Linus Torvalds1da177e2005-04-16 15:20:36 -0700672 * <---------> less than scan distance
Alex Elder3f943d82010-04-15 18:17:34 +0000673 * x + 1 ... | x ... | x - 1 | x
674 * ^ we want to locate this spot
Linus Torvalds1da177e2005-04-16 15:20:36 -0700675 */
676 stop_on_cycle = last_half_cycle;
677 if ((error = xlog_find_cycle_start(log, bp, first_blk,
678 &head_blk, last_half_cycle)))
679 goto bp_err;
680 }
681
682 /*
683 * Now validate the answer. Scan back some number of maximum possible
684 * blocks and make sure each one has the expected cycle number. The
685 * maximum is determined by the total possible amount of buffering
686 * in the in-core log. The following number can be made tighter if
687 * we actually look at the block size of the filesystem.
688 */
689 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
690 if (head_blk >= num_scan_bblks) {
691 /*
692 * We are guaranteed that the entire check can be performed
693 * in one buffer.
694 */
695 start_blk = head_blk - num_scan_bblks;
696 if ((error = xlog_find_verify_cycle(log,
697 start_blk, num_scan_bblks,
698 stop_on_cycle, &new_blk)))
699 goto bp_err;
700 if (new_blk != -1)
701 head_blk = new_blk;
702 } else { /* need to read 2 parts of log */
703 /*
704 * We are going to scan backwards in the log in two parts.
705 * First we scan the physical end of the log. In this part
706 * of the log, we are looking for blocks with cycle number
707 * last_half_cycle - 1.
708 * If we find one, then we know that the log starts there, as
709 * we've found a hole that didn't get written in going around
710 * the end of the physical log. The simple case for this is
711 * x + 1 ... | x ... | x - 1 | x
712 * <---------> less than scan distance
713 * If all of the blocks at the end of the log have cycle number
714 * last_half_cycle, then we check the blocks at the start of
715 * the log looking for occurrences of last_half_cycle. If we
716 * find one, then our current estimate for the location of the
717 * first occurrence of last_half_cycle is wrong and we move
718 * back to the hole we've found. This case looks like
719 * x + 1 ... | x | x + 1 | x ...
720 * ^ binary search stopped here
721 * Another case we need to handle that only occurs in 256k
722 * logs is
723 * x + 1 ... | x ... | x+1 | x ...
724 * ^ binary search stops here
725 * In a 256k log, the scan at the end of the log will see the
726 * x + 1 blocks. We need to skip past those since that is
727 * certainly not the head of the log. By searching for
728 * last_half_cycle-1 we accomplish that.
729 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700730 ASSERT(head_blk <= INT_MAX &&
Alex Elder3f943d82010-04-15 18:17:34 +0000731 (xfs_daddr_t) num_scan_bblks >= head_blk);
732 start_blk = log_bbnum - (num_scan_bblks - head_blk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700733 if ((error = xlog_find_verify_cycle(log, start_blk,
734 num_scan_bblks - (int)head_blk,
735 (stop_on_cycle - 1), &new_blk)))
736 goto bp_err;
737 if (new_blk != -1) {
738 head_blk = new_blk;
Alex Elder9db127e2010-04-15 18:17:26 +0000739 goto validate_head;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700740 }
741
742 /*
743 * Scan beginning of log now. The last part of the physical
744 * log is good. This scan needs to verify that it doesn't find
745 * the last_half_cycle.
746 */
747 start_blk = 0;
748 ASSERT(head_blk <= INT_MAX);
749 if ((error = xlog_find_verify_cycle(log,
750 start_blk, (int)head_blk,
751 stop_on_cycle, &new_blk)))
752 goto bp_err;
753 if (new_blk != -1)
754 head_blk = new_blk;
755 }
756
Alex Elder9db127e2010-04-15 18:17:26 +0000757validate_head:
Linus Torvalds1da177e2005-04-16 15:20:36 -0700758 /*
759 * Now we need to make sure head_blk is not pointing to a block in
760 * the middle of a log record.
761 */
762 num_scan_bblks = XLOG_REC_SHIFT(log);
763 if (head_blk >= num_scan_bblks) {
764 start_blk = head_blk - num_scan_bblks; /* don't read head_blk */
765
766 /* start ptr at last block ptr before head_blk */
767 if ((error = xlog_find_verify_log_record(log, start_blk,
768 &head_blk, 0)) == -1) {
769 error = XFS_ERROR(EIO);
770 goto bp_err;
771 } else if (error)
772 goto bp_err;
773 } else {
774 start_blk = 0;
775 ASSERT(head_blk <= INT_MAX);
776 if ((error = xlog_find_verify_log_record(log, start_blk,
777 &head_blk, 0)) == -1) {
778 /* We hit the beginning of the log during our search */
Alex Elder3f943d82010-04-15 18:17:34 +0000779 start_blk = log_bbnum - (num_scan_bblks - head_blk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700780 new_blk = log_bbnum;
781 ASSERT(start_blk <= INT_MAX &&
782 (xfs_daddr_t) log_bbnum-start_blk >= 0);
783 ASSERT(head_blk <= INT_MAX);
784 if ((error = xlog_find_verify_log_record(log,
785 start_blk, &new_blk,
786 (int)head_blk)) == -1) {
787 error = XFS_ERROR(EIO);
788 goto bp_err;
789 } else if (error)
790 goto bp_err;
791 if (new_blk != log_bbnum)
792 head_blk = new_blk;
793 } else if (error)
794 goto bp_err;
795 }
796
797 xlog_put_bp(bp);
798 if (head_blk == log_bbnum)
799 *return_head_blk = 0;
800 else
801 *return_head_blk = head_blk;
802 /*
803 * When returning here, we have a good block number. Bad block
804 * means that during a previous crash, we didn't have a clean break
805 * from cycle number N to cycle number N-1. In this case, we need
806 * to find the first block with cycle number N-1.
807 */
808 return 0;
809
810 bp_err:
811 xlog_put_bp(bp);
812
813 if (error)
814 xlog_warn("XFS: failed to find log head");
815 return error;
816}
817
818/*
819 * Find the sync block number or the tail of the log.
820 *
821 * This will be the block number of the last record to have its
822 * associated buffers synced to disk. Every log record header has
823 * a sync lsn embedded in it. LSNs hold block numbers, so it is easy
824 * to get a sync block number. The only concern is to figure out which
825 * log record header to believe.
826 *
827 * The following algorithm uses the log record header with the largest
828 * lsn. The entire log record does not need to be valid. We only care
829 * that the header is valid.
830 *
831 * We could speed up search by using current head_blk buffer, but it is not
832 * available.
833 */
Eric Sandeen5d77c0d2009-11-19 15:52:00 +0000834STATIC int
Linus Torvalds1da177e2005-04-16 15:20:36 -0700835xlog_find_tail(
836 xlog_t *log,
837 xfs_daddr_t *head_blk,
Eric Sandeen65be6052006-01-11 15:34:19 +1100838 xfs_daddr_t *tail_blk)
Linus Torvalds1da177e2005-04-16 15:20:36 -0700839{
840 xlog_rec_header_t *rhead;
841 xlog_op_header_t *op_head;
842 xfs_caddr_t offset = NULL;
843 xfs_buf_t *bp;
844 int error, i, found;
845 xfs_daddr_t umount_data_blk;
846 xfs_daddr_t after_umount_blk;
847 xfs_lsn_t tail_lsn;
848 int hblks;
849
850 found = 0;
851
852 /*
853 * Find previous log record
854 */
855 if ((error = xlog_find_head(log, head_blk)))
856 return error;
857
858 bp = xlog_get_bp(log, 1);
859 if (!bp)
860 return ENOMEM;
861 if (*head_blk == 0) { /* special case */
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100862 error = xlog_bread(log, 0, 1, bp, &offset);
863 if (error)
Alex Elder9db127e2010-04-15 18:17:26 +0000864 goto done;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100865
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000866 if (xlog_get_cycle(offset) == 0) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700867 *tail_blk = 0;
868 /* leave all other log inited values alone */
Alex Elder9db127e2010-04-15 18:17:26 +0000869 goto done;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700870 }
871 }
872
873 /*
874 * Search backwards looking for log record header block
875 */
876 ASSERT(*head_blk < INT_MAX);
877 for (i = (int)(*head_blk) - 1; i >= 0; i--) {
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100878 error = xlog_bread(log, i, 1, bp, &offset);
879 if (error)
Alex Elder9db127e2010-04-15 18:17:26 +0000880 goto done;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100881
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000882 if (XLOG_HEADER_MAGIC_NUM == be32_to_cpu(*(__be32 *)offset)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700883 found = 1;
884 break;
885 }
886 }
887 /*
888 * If we haven't found the log record header block, start looking
889 * again from the end of the physical log. XXXmiken: There should be
890 * a check here to make sure we didn't search more than N blocks in
891 * the previous code.
892 */
893 if (!found) {
894 for (i = log->l_logBBsize - 1; i >= (int)(*head_blk); i--) {
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100895 error = xlog_bread(log, i, 1, bp, &offset);
896 if (error)
Alex Elder9db127e2010-04-15 18:17:26 +0000897 goto done;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100898
Linus Torvalds1da177e2005-04-16 15:20:36 -0700899 if (XLOG_HEADER_MAGIC_NUM ==
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000900 be32_to_cpu(*(__be32 *)offset)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700901 found = 2;
902 break;
903 }
904 }
905 }
906 if (!found) {
907 xlog_warn("XFS: xlog_find_tail: couldn't find sync record");
908 ASSERT(0);
909 return XFS_ERROR(EIO);
910 }
911
912 /* find blk_no of tail of log */
913 rhead = (xlog_rec_header_t *)offset;
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000914 *tail_blk = BLOCK_LSN(be64_to_cpu(rhead->h_tail_lsn));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700915
916 /*
917 * Reset log values according to the state of the log when we
918 * crashed. In the case where head_blk == 0, we bump curr_cycle
919 * one because the next write starts a new cycle rather than
920 * continuing the cycle of the last good log record. At this
921 * point we have guaranteed that all partial log records have been
922 * accounted for. Therefore, we know that the last good log record
923 * written was complete and ended exactly on the end boundary
924 * of the physical log.
925 */
926 log->l_prev_block = i;
927 log->l_curr_block = (int)*head_blk;
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000928 log->l_curr_cycle = be32_to_cpu(rhead->h_cycle);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700929 if (found == 2)
930 log->l_curr_cycle++;
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000931 log->l_tail_lsn = be64_to_cpu(rhead->h_tail_lsn);
932 log->l_last_sync_lsn = be64_to_cpu(rhead->h_lsn);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700933 log->l_grant_reserve_cycle = log->l_curr_cycle;
934 log->l_grant_reserve_bytes = BBTOB(log->l_curr_block);
935 log->l_grant_write_cycle = log->l_curr_cycle;
936 log->l_grant_write_bytes = BBTOB(log->l_curr_block);
937
938 /*
939 * Look for unmount record. If we find it, then we know there
940 * was a clean unmount. Since 'i' could be the last block in
941 * the physical log, we convert to a log block before comparing
942 * to the head_blk.
943 *
944 * Save the current tail lsn to use to pass to
945 * xlog_clear_stale_blocks() below. We won't want to clear the
946 * unmount record if there is one, so we pass the lsn of the
947 * unmount record rather than the block after it.
948 */
Eric Sandeen62118702008-03-06 13:44:28 +1100949 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000950 int h_size = be32_to_cpu(rhead->h_size);
951 int h_version = be32_to_cpu(rhead->h_version);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700952
953 if ((h_version & XLOG_VERSION_2) &&
954 (h_size > XLOG_HEADER_CYCLE_SIZE)) {
955 hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
956 if (h_size % XLOG_HEADER_CYCLE_SIZE)
957 hblks++;
958 } else {
959 hblks = 1;
960 }
961 } else {
962 hblks = 1;
963 }
964 after_umount_blk = (i + hblks + (int)
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000965 BTOBB(be32_to_cpu(rhead->h_len))) % log->l_logBBsize;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700966 tail_lsn = log->l_tail_lsn;
967 if (*head_blk == after_umount_blk &&
Christoph Hellwigb53e6752007-10-12 10:59:34 +1000968 be32_to_cpu(rhead->h_num_logops) == 1) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700969 umount_data_blk = (i + hblks) % log->l_logBBsize;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100970 error = xlog_bread(log, umount_data_blk, 1, bp, &offset);
971 if (error)
Alex Elder9db127e2010-04-15 18:17:26 +0000972 goto done;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +0100973
Linus Torvalds1da177e2005-04-16 15:20:36 -0700974 op_head = (xlog_op_header_t *)offset;
975 if (op_head->oh_flags & XLOG_UNMOUNT_TRANS) {
976 /*
977 * Set tail and last sync so that newly written
978 * log records will point recovery to after the
979 * current unmount record.
980 */
Christoph Hellwig03bea6f2007-10-12 10:58:05 +1000981 log->l_tail_lsn =
982 xlog_assign_lsn(log->l_curr_cycle,
983 after_umount_blk);
984 log->l_last_sync_lsn =
985 xlog_assign_lsn(log->l_curr_cycle,
986 after_umount_blk);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700987 *tail_blk = after_umount_blk;
David Chinner92821e22007-05-24 15:26:31 +1000988
989 /*
990 * Note that the unmount was clean. If the unmount
991 * was not clean, we need to know this to rebuild the
992 * superblock counters from the perag headers if we
993 * have a filesystem using non-persistent counters.
994 */
995 log->l_mp->m_flags |= XFS_MOUNT_WAS_CLEAN;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700996 }
997 }
998
999 /*
1000 * Make sure that there are no blocks in front of the head
1001 * with the same cycle number as the head. This can happen
1002 * because we allow multiple outstanding log writes concurrently,
1003 * and the later writes might make it out before earlier ones.
1004 *
1005 * We use the lsn from before modifying it so that we'll never
1006 * overwrite the unmount record after a clean unmount.
1007 *
1008 * Do this only if we are going to recover the filesystem
1009 *
1010 * NOTE: This used to say "if (!readonly)"
1011 * However on Linux, we can & do recover a read-only filesystem.
1012 * We only skip recovery if NORECOVERY is specified on mount,
1013 * in which case we would not be here.
1014 *
1015 * But... if the -device- itself is readonly, just skip this.
1016 * We can't recover this device anyway, so it won't matter.
1017 */
Alex Elder9db127e2010-04-15 18:17:26 +00001018 if (!xfs_readonly_buftarg(log->l_mp->m_logdev_targp))
Linus Torvalds1da177e2005-04-16 15:20:36 -07001019 error = xlog_clear_stale_blocks(log, tail_lsn);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001020
Alex Elder9db127e2010-04-15 18:17:26 +00001021done:
Linus Torvalds1da177e2005-04-16 15:20:36 -07001022 xlog_put_bp(bp);
1023
1024 if (error)
1025 xlog_warn("XFS: failed to locate log tail");
1026 return error;
1027}
1028
1029/*
1030 * Is the log zeroed at all?
1031 *
1032 * The last binary search should be changed to perform an X block read
1033 * once X becomes small enough. You can then search linearly through
1034 * the X blocks. This will cut down on the number of reads we need to do.
1035 *
1036 * If the log is partially zeroed, this routine will pass back the blkno
1037 * of the first block with cycle number 0. It won't have a complete LR
1038 * preceding it.
1039 *
1040 * Return:
1041 * 0 => the log is completely written to
1042 * -1 => use *blk_no as the first block of the log
1043 * >0 => error has occurred
1044 */
David Chinnera8272ce2007-11-23 16:28:09 +11001045STATIC int
Linus Torvalds1da177e2005-04-16 15:20:36 -07001046xlog_find_zeroed(
1047 xlog_t *log,
1048 xfs_daddr_t *blk_no)
1049{
1050 xfs_buf_t *bp;
1051 xfs_caddr_t offset;
1052 uint first_cycle, last_cycle;
1053 xfs_daddr_t new_blk, last_blk, start_blk;
1054 xfs_daddr_t num_scan_bblks;
1055 int error, log_bbnum = log->l_logBBsize;
1056
Nathan Scott6fdf8cc2006-06-28 10:13:52 +10001057 *blk_no = 0;
1058
Linus Torvalds1da177e2005-04-16 15:20:36 -07001059 /* check totally zeroed log */
1060 bp = xlog_get_bp(log, 1);
1061 if (!bp)
1062 return ENOMEM;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +01001063 error = xlog_bread(log, 0, 1, bp, &offset);
1064 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001065 goto bp_err;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +01001066
Christoph Hellwig03bea6f2007-10-12 10:58:05 +10001067 first_cycle = xlog_get_cycle(offset);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001068 if (first_cycle == 0) { /* completely zeroed log */
1069 *blk_no = 0;
1070 xlog_put_bp(bp);
1071 return -1;
1072 }
1073
1074 /* check partially zeroed log */
Christoph Hellwig076e6ac2009-03-16 08:24:13 +01001075 error = xlog_bread(log, log_bbnum-1, 1, bp, &offset);
1076 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001077 goto bp_err;
Christoph Hellwig076e6ac2009-03-16 08:24:13 +01001078
Christoph Hellwig03bea6f2007-10-12 10:58:05 +10001079 last_cycle = xlog_get_cycle(offset);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001080 if (last_cycle != 0) { /* log completely written to */
1081 xlog_put_bp(bp);
1082 return 0;
1083 } else if (first_cycle != 1) {
1084 /*
1085 * If the cycle of the last block is zero, the cycle of
1086 * the first block must be 1. If it's not, maybe we're
1087 * not looking at a log... Bail out.
1088 */
1089 xlog_warn("XFS: Log inconsistent or not a log (last==0, first!=1)");
1090 return XFS_ERROR(EINVAL);
1091 }
1092
1093 /* we have a partially zeroed log */
1094 last_blk = log_bbnum-1;
1095 if ((error = xlog_find_cycle_start(log, bp, 0, &last_blk, 0)))
1096 goto bp_err;
1097
1098 /*
1099 * Validate the answer. Because there is no way to guarantee that
1100 * the entire log is made up of log records which are the same size,
1101 * we scan over the defined maximum blocks. At this point, the maximum
1102 * is not chosen to mean anything special. XXXmiken
1103 */
1104 num_scan_bblks = XLOG_TOTAL_REC_SHIFT(log);
1105 ASSERT(num_scan_bblks <= INT_MAX);
1106
1107 if (last_blk < num_scan_bblks)
1108 num_scan_bblks = last_blk;
1109 start_blk = last_blk - num_scan_bblks;
1110
1111 /*
1112 * We search for any instances of cycle number 0 that occur before
1113 * our current estimate of the head. What we're trying to detect is
1114 * 1 ... | 0 | 1 | 0...
1115 * ^ binary search ends here
1116 */
1117 if ((error = xlog_find_verify_cycle(log, start_blk,
1118 (int)num_scan_bblks, 0, &new_blk)))
1119 goto bp_err;
1120 if (new_blk != -1)
1121 last_blk = new_blk;
1122
1123 /*
1124 * Potentially backup over partial log record write. We don't need
1125 * to search the end of the log because we know it is zero.
1126 */
1127 if ((error = xlog_find_verify_log_record(log, start_blk,
1128 &last_blk, 0)) == -1) {
1129 error = XFS_ERROR(EIO);
1130 goto bp_err;
1131 } else if (error)
1132 goto bp_err;
1133
1134 *blk_no = last_blk;
1135bp_err:
1136 xlog_put_bp(bp);
1137 if (error)
1138 return error;
1139 return -1;
1140}
1141
1142/*
1143 * These are simple subroutines used by xlog_clear_stale_blocks() below
1144 * to initialize a buffer full of empty log record headers and write
1145 * them into the log.
1146 */
1147STATIC void
1148xlog_add_record(
1149 xlog_t *log,
1150 xfs_caddr_t buf,
1151 int cycle,
1152 int block,
1153 int tail_cycle,
1154 int tail_block)
1155{
1156 xlog_rec_header_t *recp = (xlog_rec_header_t *)buf;
1157
1158 memset(buf, 0, BBSIZE);
Christoph Hellwigb53e6752007-10-12 10:59:34 +10001159 recp->h_magicno = cpu_to_be32(XLOG_HEADER_MAGIC_NUM);
1160 recp->h_cycle = cpu_to_be32(cycle);
1161 recp->h_version = cpu_to_be32(
Eric Sandeen62118702008-03-06 13:44:28 +11001162 xfs_sb_version_haslogv2(&log->l_mp->m_sb) ? 2 : 1);
Christoph Hellwigb53e6752007-10-12 10:59:34 +10001163 recp->h_lsn = cpu_to_be64(xlog_assign_lsn(cycle, block));
1164 recp->h_tail_lsn = cpu_to_be64(xlog_assign_lsn(tail_cycle, tail_block));
1165 recp->h_fmt = cpu_to_be32(XLOG_FMT);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001166 memcpy(&recp->h_fs_uuid, &log->l_mp->m_sb.sb_uuid, sizeof(uuid_t));
1167}
1168
/*
 * Write 'blocks' stale log record headers starting at 'start_block', each
 * stamped with 'cycle' and pointing back at the tail (tail_cycle,
 * tail_block).  Writes are done in power-of-2 sized chunks, preserving any
 * partial log sector at either end of the range via read-modify-write.
 * Returns 0 or an errno-style error from the buffer I/O routines.
 */
STATIC int
xlog_write_log_records(
	xlog_t		*log,
	int		cycle,
	int		start_block,
	int		blocks,
	int		tail_cycle,
	int		tail_block)
{
	xfs_caddr_t	offset;
	xfs_buf_t	*bp;
	int		balign, ealign;
	int		sectbb = log->l_sectBBsize;
	int		end_block = start_block + blocks;
	int		bufblks;
	int		error = 0;
	int		i, j = 0;

	/*
	 * Greedily allocate a buffer big enough to handle the full
	 * range of basic blocks to be written.  If that fails, try
	 * a smaller size.  We need to be able to write at least a
	 * log sector, or we're out of luck.
	 *
	 * NOTE(review): 1 << ffs(blocks) overshoots -- e.g. blocks == 8
	 * gives bufblks == 16, not 8.  Harmless (just a larger buffer
	 * request), but confirm the intent was not 1 << (ffs(blocks) - 1)
	 * or a highest-bit round-up.
	 */
	bufblks = 1 << ffs(blocks);
	while (!(bp = xlog_get_bp(log, bufblks))) {
		bufblks >>= 1;
		if (bufblks < sectbb)
			return ENOMEM;
	}

	/* We may need to do a read at the start to fill in part of
	 * the buffer in the starting sector not covered by the first
	 * write below.  'j' then indexes the first block within the
	 * buffer that we actually stamp.
	 */
	balign = round_down(start_block, sectbb);
	if (balign != start_block) {
		error = xlog_bread_noalign(log, start_block, 1, bp);
		if (error)
			goto out_put_bp;

		j = start_block - balign;
	}

	for (i = start_block; i < end_block; i += bufblks) {
		int		bcount, endcount;

		/* blocks remaining in this chunk, minus the head offset j */
		bcount = min(bufblks, end_block - start_block);
		endcount = bcount - j;

		/* We may need to do a read at the end to fill in part of
		 * the buffer in the final sector not covered by the write.
		 * If this is the same sector as the above read, skip it.
		 */
		ealign = round_down(end_block, sectbb);
		if (j == 0 && (start_block + endcount > ealign)) {
			offset = XFS_BUF_PTR(bp);
			balign = BBTOB(ealign - start_block);
			/* temporarily aim the buffer at its tail sector */
			error = XFS_BUF_SET_PTR(bp, offset + balign,
						BBTOB(sectbb));
			if (error)
				break;

			error = xlog_bread_noalign(log, ealign, sectbb, bp);
			if (error)
				break;

			/*
			 * NOTE(review): the restore passes 'bufblks' (basic
			 * blocks) as the length where the call above passed
			 * BBTOB(sectbb) (bytes) -- confirm the expected units
			 * of XFS_BUF_SET_PTR's length argument.
			 */
			error = XFS_BUF_SET_PTR(bp, offset, bufblks);
			if (error)
				break;
		}

		/* stamp a stale record header into every block of the chunk */
		offset = xlog_align(log, start_block, endcount, bp);
		for (; j < endcount; j++) {
			xlog_add_record(log, offset, cycle, i+j,
					tail_cycle, tail_block);
			offset += BBSIZE;
		}
		error = xlog_bwrite(log, start_block, endcount, bp);
		if (error)
			break;
		start_block += endcount;
		j = 0;
	}

 out_put_bp:
	xlog_put_bp(bp);
	return error;
}
1258
1259/*
1260 * This routine is called to blow away any incomplete log writes out
1261 * in front of the log head. We do this so that we won't become confused
1262 * if we come up, write only a little bit more, and then crash again.
1263 * If we leave the partial log records out there, this situation could
1264 * cause us to think those partial writes are valid blocks since they
1265 * have the current cycle number. We get rid of them by overwriting them
1266 * with empty log records with the old cycle number rather than the
1267 * current one.
1268 *
1269 * The tail lsn is passed in rather than taken from
1270 * the log so that we will not write over the unmount record after a
1271 * clean unmount in a 512 block log. Doing so would leave the log without
1272 * any valid log records in it until a new one was written. If we crashed
1273 * during that time we would not be able to recover.
1274 */
STATIC int
xlog_clear_stale_blocks(
	xlog_t		*log,
	xfs_lsn_t	tail_lsn)
{
	int		tail_cycle, head_cycle;
	int		tail_block, head_block;
	int		tail_distance, max_distance;
	int		distance;
	int		error;

	/* split the tail lsn into its cycle/block parts; the head position
	 * comes from the in-core log state set up by xlog_find_tail(). */
	tail_cycle = CYCLE_LSN(tail_lsn);
	tail_block = BLOCK_LSN(tail_lsn);
	head_cycle = log->l_curr_cycle;
	head_block = log->l_curr_block;

	/*
	 * Figure out the distance between the new head of the log
	 * and the tail.  We want to write over any blocks beyond the
	 * head that we may have written just before the crash, but
	 * we don't want to overwrite the tail of the log.
	 */
	if (head_cycle == tail_cycle) {
		/*
		 * The tail is behind the head in the physical log,
		 * so the distance from the head to the tail is the
		 * distance from the head to the end of the log plus
		 * the distance from the beginning of the log to the
		 * tail.
		 */
		if (unlikely(head_block < tail_block || head_block >= log->l_logBBsize)) {
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(1)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block + (log->l_logBBsize - head_block);
	} else {
		/*
		 * The head is behind the tail in the physical log,
		 * so the distance from the head to the tail is just
		 * the tail block minus the head block.
		 */
		if (unlikely(head_block >= tail_block || head_cycle != (tail_cycle + 1))){
			XFS_ERROR_REPORT("xlog_clear_stale_blocks(2)",
					 XFS_ERRLEVEL_LOW, log->l_mp);
			return XFS_ERROR(EFSCORRUPTED);
		}
		tail_distance = tail_block - head_block;
	}

	/*
	 * If the head is right up against the tail, we can't clear
	 * anything.
	 */
	if (tail_distance <= 0) {
		ASSERT(tail_distance == 0);
		return 0;
	}

	/* upper bound on outstanding log I/O that could need clearing */
	max_distance = XLOG_TOTAL_REC_SHIFT(log);
	/*
	 * Take the smaller of the maximum amount of outstanding I/O
	 * we could have and the distance to the tail to clear out.
	 * We take the smaller so that we don't overwrite the tail and
	 * we don't waste all day writing from the head to the tail
	 * for no reason.
	 */
	max_distance = MIN(max_distance, tail_distance);

	if ((head_block + max_distance) <= log->l_logBBsize) {
		/*
		 * We can stomp all the blocks we need to without
		 * wrapping around the end of the log.  Just do it
		 * in a single write.  Use the cycle number of the
		 * current cycle minus one so that the log will look like:
		 *     n ... | n - 1 ...
		 */
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, max_distance, tail_cycle,
				tail_block);
		if (error)
			return error;
	} else {
		/*
		 * We need to wrap around the end of the physical log in
		 * order to clear all the blocks.  Do it in two separate
		 * I/Os.  The first write should be from the head to the
		 * end of the physical log, and it should use the current
		 * cycle number minus one just like above.
		 */
		distance = log->l_logBBsize - head_block;
		error = xlog_write_log_records(log, (head_cycle - 1),
				head_block, distance, tail_cycle,
				tail_block);

		if (error)
			return error;

		/*
		 * Now write the blocks at the start of the physical log.
		 * This writes the remainder of the blocks we want to clear.
		 * It uses the current cycle number since we're now on the
		 * same cycle as the head so that we get:
		 *    n ... n ... | n - 1 ...
		 *    ^^^^^ blocks we're writing
		 */
		distance = max_distance - (log->l_logBBsize - head_block);
		error = xlog_write_log_records(log, head_cycle, 0, distance,
				tail_cycle, tail_block);
		if (error)
			return error;
	}

	return 0;
}
1390
1391/******************************************************************************
1392 *
1393 * Log recover routines
1394 *
1395 ******************************************************************************
1396 */
1397
1398STATIC xlog_recover_t *
1399xlog_recover_find_tid(
Dave Chinnerf0a76952010-01-11 11:49:57 +00001400 struct hlist_head *head,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001401 xlog_tid_t tid)
1402{
Dave Chinnerf0a76952010-01-11 11:49:57 +00001403 xlog_recover_t *trans;
1404 struct hlist_node *n;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001405
Dave Chinnerf0a76952010-01-11 11:49:57 +00001406 hlist_for_each_entry(trans, n, head, r_list) {
1407 if (trans->r_log_tid == tid)
1408 return trans;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001409 }
Dave Chinnerf0a76952010-01-11 11:49:57 +00001410 return NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001411}
1412
1413STATIC void
Dave Chinnerf0a76952010-01-11 11:49:57 +00001414xlog_recover_new_tid(
1415 struct hlist_head *head,
1416 xlog_tid_t tid,
1417 xfs_lsn_t lsn)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001418{
Dave Chinnerf0a76952010-01-11 11:49:57 +00001419 xlog_recover_t *trans;
1420
1421 trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP);
1422 trans->r_log_tid = tid;
1423 trans->r_lsn = lsn;
1424 INIT_LIST_HEAD(&trans->r_itemq);
1425
1426 INIT_HLIST_NODE(&trans->r_list);
1427 hlist_add_head(&trans->r_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001428}
1429
1430STATIC void
1431xlog_recover_add_item(
Dave Chinnerf0a76952010-01-11 11:49:57 +00001432 struct list_head *head)
Linus Torvalds1da177e2005-04-16 15:20:36 -07001433{
1434 xlog_recover_item_t *item;
1435
1436 item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP);
Dave Chinnerf0a76952010-01-11 11:49:57 +00001437 INIT_LIST_HEAD(&item->ri_list);
1438 list_add_tail(&item->ri_list, head);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001439}
1440
1441STATIC int
1442xlog_recover_add_to_cont_trans(
Dave Chinner9abbc532010-04-13 15:06:46 +10001443 struct log *log,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001444 xlog_recover_t *trans,
1445 xfs_caddr_t dp,
1446 int len)
1447{
1448 xlog_recover_item_t *item;
1449 xfs_caddr_t ptr, old_ptr;
1450 int old_len;
1451
Dave Chinnerf0a76952010-01-11 11:49:57 +00001452 if (list_empty(&trans->r_itemq)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07001453 /* finish copying rest of trans header */
1454 xlog_recover_add_item(&trans->r_itemq);
1455 ptr = (xfs_caddr_t) &trans->r_theader +
1456 sizeof(xfs_trans_header_t) - len;
1457 memcpy(ptr, dp, len); /* d, s, l */
1458 return 0;
1459 }
Dave Chinnerf0a76952010-01-11 11:49:57 +00001460 /* take the tail entry */
1461 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001462
1463 old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
1464 old_len = item->ri_buf[item->ri_cnt-1].i_len;
1465
Christoph Hellwig760dea62005-09-02 16:56:02 +10001466 ptr = kmem_realloc(old_ptr, len+old_len, old_len, 0u);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001467 memcpy(&ptr[old_len], dp, len); /* d, s, l */
1468 item->ri_buf[item->ri_cnt-1].i_len += len;
1469 item->ri_buf[item->ri_cnt-1].i_addr = ptr;
Dave Chinner9abbc532010-04-13 15:06:46 +10001470 trace_xfs_log_recover_item_add_cont(log, trans, item, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001471 return 0;
1472}
1473
1474/*
1475 * The next region to add is the start of a new region. It could be
1476 * a whole region or it could be the first part of a new region. Because
1477 * of this, the assumption here is that the type and size fields of all
1478 * format structures fit into the first 32 bits of the structure.
1479 *
1480 * This works because all regions must be 32 bit aligned. Therefore, we
1481 * either have both fields or we have neither field. In the case we have
1482 * neither field, the data part of the region is zero length. We only have
1483 * a log_op_header and can throw away the header since a new one will appear
1484 * later. If we have at least 4 bytes, then we can determine how many regions
1485 * will appear in the current log item.
1486 */
1487STATIC int
1488xlog_recover_add_to_trans(
Dave Chinner9abbc532010-04-13 15:06:46 +10001489 struct log *log,
Linus Torvalds1da177e2005-04-16 15:20:36 -07001490 xlog_recover_t *trans,
1491 xfs_caddr_t dp,
1492 int len)
1493{
1494 xfs_inode_log_format_t *in_f; /* any will do */
1495 xlog_recover_item_t *item;
1496 xfs_caddr_t ptr;
1497
1498 if (!len)
1499 return 0;
Dave Chinnerf0a76952010-01-11 11:49:57 +00001500 if (list_empty(&trans->r_itemq)) {
David Chinner5a792c42008-10-30 17:40:09 +11001501 /* we need to catch log corruptions here */
1502 if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) {
1503 xlog_warn("XFS: xlog_recover_add_to_trans: "
1504 "bad header magic number");
1505 ASSERT(0);
1506 return XFS_ERROR(EIO);
1507 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001508 if (len == sizeof(xfs_trans_header_t))
1509 xlog_recover_add_item(&trans->r_itemq);
1510 memcpy(&trans->r_theader, dp, len); /* d, s, l */
1511 return 0;
1512 }
1513
1514 ptr = kmem_alloc(len, KM_SLEEP);
1515 memcpy(ptr, dp, len);
1516 in_f = (xfs_inode_log_format_t *)ptr;
1517
Dave Chinnerf0a76952010-01-11 11:49:57 +00001518 /* take the tail entry */
1519 item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list);
1520 if (item->ri_total != 0 &&
1521 item->ri_total == item->ri_cnt) {
1522 /* tail item is in use, get a new one */
Linus Torvalds1da177e2005-04-16 15:20:36 -07001523 xlog_recover_add_item(&trans->r_itemq);
Dave Chinnerf0a76952010-01-11 11:49:57 +00001524 item = list_entry(trans->r_itemq.prev,
1525 xlog_recover_item_t, ri_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001526 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001527
1528 if (item->ri_total == 0) { /* first region to be added */
Christoph Hellwige8fa6b42009-03-03 14:48:36 -05001529 if (in_f->ilf_size == 0 ||
1530 in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) {
1531 xlog_warn(
1532 "XFS: bad number of regions (%d) in inode log format",
1533 in_f->ilf_size);
1534 ASSERT(0);
1535 return XFS_ERROR(EIO);
1536 }
1537
1538 item->ri_total = in_f->ilf_size;
1539 item->ri_buf =
1540 kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t),
1541 KM_SLEEP);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001542 }
1543 ASSERT(item->ri_total > item->ri_cnt);
1544 /* Description region is ri_buf[0] */
1545 item->ri_buf[item->ri_cnt].i_addr = ptr;
1546 item->ri_buf[item->ri_cnt].i_len = len;
1547 item->ri_cnt++;
Dave Chinner9abbc532010-04-13 15:06:46 +10001548 trace_xfs_log_recover_item_add(log, trans, item, 0);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001549 return 0;
1550}
1551
/*
 * Sort the log items in the transaction.
 *
 * As written, buffer items that are NOT cancelled are moved to the head
 * of the queue (so they are processed before the inode/dquot/EFI items
 * that might modify them), while everything else -- including cancelled
 * buffers, via the deliberate switch fallthrough below -- is moved to
 * the tail in its original relative order.
 *
 * NOTE(review): an earlier version of this comment claimed cancelled
 * buffers are put first, which is the opposite of what the code does;
 * confirm which ordering the replay logic actually requires.
 */
STATIC int
xlog_recover_reorder_trans(
	struct log		*log,
	xlog_recover_t		*trans,
	int			pass)
{
	xlog_recover_item_t	*item, *n;
	LIST_HEAD(sort_list);

	/* drain r_itemq onto a local list, then re-insert in sorted order */
	list_splice_init(&trans->r_itemq, &sort_list);
	list_for_each_entry_safe(item, n, &sort_list, ri_list) {
		xfs_buf_log_format_t	*buf_f;

		buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;

		switch (ITEM_TYPE(item)) {
		case XFS_LI_BUF:
			if (!(buf_f->blf_flags & XFS_BLI_CANCEL)) {
				trace_xfs_log_recover_item_reorder_head(log,
							trans, item, pass);
				list_move(&item->ri_list, &trans->r_itemq);
				break;
			}
			/* fall through: cancelled buffers go to the tail */
		case XFS_LI_INODE:
		case XFS_LI_DQUOT:
		case XFS_LI_QUOTAOFF:
		case XFS_LI_EFD:
		case XFS_LI_EFI:
			trace_xfs_log_recover_item_reorder_tail(log,
							trans, item, pass);
			list_move_tail(&item->ri_list, &trans->r_itemq);
			break;
		default:
			/*
			 * NOTE(review): returning here strands any items
			 * still on the local sort_list (they are no longer
			 * reachable from trans->r_itemq) -- confirm the
			 * caller's cleanup expectations on this error path.
			 */
			xlog_warn(
	"XFS: xlog_recover_reorder_trans: unrecognized type of log operation");
			ASSERT(0);
			return XFS_ERROR(EIO);
		}
	}
	ASSERT(list_empty(&sort_list));
	return 0;
}
1600
1601/*
1602 * Build up the table of buf cancel records so that we don't replay
1603 * cancelled data in the second pass. For buffer records that are
1604 * not cancel records, there is nothing to do here so we just return.
1605 *
1606 * If we get a cancel record which is already in the table, this indicates
1607 * that the buffer was cancelled multiple times. In order to ensure
1608 * that during pass 2 we keep the record in the table until we reach its
1609 * last occurrence in the log, we keep a reference count in the cancel
1610 * record in the table to tell us how many times we expect to see this
1611 * record during the second pass.
1612 */
1613STATIC void
1614xlog_recover_do_buffer_pass1(
1615 xlog_t *log,
1616 xfs_buf_log_format_t *buf_f)
1617{
1618 xfs_buf_cancel_t *bcp;
1619 xfs_buf_cancel_t *nextp;
1620 xfs_buf_cancel_t *prevp;
1621 xfs_buf_cancel_t **bucket;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001622 xfs_daddr_t blkno = 0;
1623 uint len = 0;
1624 ushort flags = 0;
1625
1626 switch (buf_f->blf_type) {
1627 case XFS_LI_BUF:
1628 blkno = buf_f->blf_blkno;
1629 len = buf_f->blf_len;
1630 flags = buf_f->blf_flags;
1631 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001632 }
1633
1634 /*
1635 * If this isn't a cancel buffer item, then just return.
1636 */
Dave Chinner9abbc532010-04-13 15:06:46 +10001637 if (!(flags & XFS_BLI_CANCEL)) {
1638 trace_xfs_log_recover_buf_not_cancel(log, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001639 return;
Dave Chinner9abbc532010-04-13 15:06:46 +10001640 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07001641
1642 /*
1643 * Insert an xfs_buf_cancel record into the hash table of
1644 * them. If there is already an identical record, bump
1645 * its reference count.
1646 */
1647 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1648 XLOG_BC_TABLE_SIZE];
1649 /*
1650 * If the hash bucket is empty then just insert a new record into
1651 * the bucket.
1652 */
1653 if (*bucket == NULL) {
1654 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1655 KM_SLEEP);
1656 bcp->bc_blkno = blkno;
1657 bcp->bc_len = len;
1658 bcp->bc_refcount = 1;
1659 bcp->bc_next = NULL;
1660 *bucket = bcp;
1661 return;
1662 }
1663
1664 /*
1665 * The hash bucket is not empty, so search for duplicates of our
1666 * record. If we find one them just bump its refcount. If not
1667 * then add us at the end of the list.
1668 */
1669 prevp = NULL;
1670 nextp = *bucket;
1671 while (nextp != NULL) {
1672 if (nextp->bc_blkno == blkno && nextp->bc_len == len) {
1673 nextp->bc_refcount++;
Dave Chinner9abbc532010-04-13 15:06:46 +10001674 trace_xfs_log_recover_buf_cancel_ref_inc(log, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001675 return;
1676 }
1677 prevp = nextp;
1678 nextp = nextp->bc_next;
1679 }
1680 ASSERT(prevp != NULL);
1681 bcp = (xfs_buf_cancel_t *)kmem_alloc(sizeof(xfs_buf_cancel_t),
1682 KM_SLEEP);
1683 bcp->bc_blkno = blkno;
1684 bcp->bc_len = len;
1685 bcp->bc_refcount = 1;
1686 bcp->bc_next = NULL;
1687 prevp->bc_next = bcp;
Dave Chinner9abbc532010-04-13 15:06:46 +10001688 trace_xfs_log_recover_buf_cancel_add(log, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001689}
1690
1691/*
1692 * Check to see whether the buffer being recovered has a corresponding
1693 * entry in the buffer cancel record table. If it does then return 1
1694 * so that it will be cancelled, otherwise return 0. If the buffer is
1695 * actually a buffer cancel item (XFS_BLI_CANCEL is set), then decrement
1696 * the refcount on the entry in the table and remove it from the table
1697 * if this is the last reference.
1698 *
1699 * We remove the cancel record from the table when we encounter its
1700 * last occurrence in the log so that if the same buffer is re-used
1701 * again after its last cancellation we actually replay the changes
1702 * made at that point.
1703 */
1704STATIC int
1705xlog_check_buffer_cancelled(
1706 xlog_t *log,
1707 xfs_daddr_t blkno,
1708 uint len,
1709 ushort flags)
1710{
1711 xfs_buf_cancel_t *bcp;
1712 xfs_buf_cancel_t *prevp;
1713 xfs_buf_cancel_t **bucket;
1714
1715 if (log->l_buf_cancel_table == NULL) {
1716 /*
1717 * There is nothing in the table built in pass one,
1718 * so this buffer must not be cancelled.
1719 */
1720 ASSERT(!(flags & XFS_BLI_CANCEL));
1721 return 0;
1722 }
1723
1724 bucket = &log->l_buf_cancel_table[(__uint64_t)blkno %
1725 XLOG_BC_TABLE_SIZE];
1726 bcp = *bucket;
1727 if (bcp == NULL) {
1728 /*
1729 * There is no corresponding entry in the table built
1730 * in pass one, so this buffer has not been cancelled.
1731 */
1732 ASSERT(!(flags & XFS_BLI_CANCEL));
1733 return 0;
1734 }
1735
1736 /*
1737 * Search for an entry in the buffer cancel table that
1738 * matches our buffer.
1739 */
1740 prevp = NULL;
1741 while (bcp != NULL) {
1742 if (bcp->bc_blkno == blkno && bcp->bc_len == len) {
1743 /*
1744 * We've go a match, so return 1 so that the
1745 * recovery of this buffer is cancelled.
1746 * If this buffer is actually a buffer cancel
1747 * log item, then decrement the refcount on the
1748 * one in the table and remove it if this is the
1749 * last reference.
1750 */
1751 if (flags & XFS_BLI_CANCEL) {
1752 bcp->bc_refcount--;
1753 if (bcp->bc_refcount == 0) {
1754 if (prevp == NULL) {
1755 *bucket = bcp->bc_next;
1756 } else {
1757 prevp->bc_next = bcp->bc_next;
1758 }
Denys Vlasenkof0e2d932008-05-19 16:31:57 +10001759 kmem_free(bcp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07001760 }
1761 }
1762 return 1;
1763 }
1764 prevp = bcp;
1765 bcp = bcp->bc_next;
1766 }
1767 /*
1768 * We didn't find a corresponding entry in the table, so
1769 * return 0 so that the buffer is NOT cancelled.
1770 */
1771 ASSERT(!(flags & XFS_BLI_CANCEL));
1772 return 0;
1773}
1774
1775STATIC int
1776xlog_recover_do_buffer_pass2(
1777 xlog_t *log,
1778 xfs_buf_log_format_t *buf_f)
1779{
Linus Torvalds1da177e2005-04-16 15:20:36 -07001780 xfs_daddr_t blkno = 0;
1781 ushort flags = 0;
1782 uint len = 0;
1783
1784 switch (buf_f->blf_type) {
1785 case XFS_LI_BUF:
1786 blkno = buf_f->blf_blkno;
1787 flags = buf_f->blf_flags;
1788 len = buf_f->blf_len;
1789 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07001790 }
1791
1792 return xlog_check_buffer_cancelled(log, blkno, len, flags);
1793}
1794
1795/*
1796 * Perform recovery for a buffer full of inodes. In these buffers,
1797 * the only data which should be recovered is that which corresponds
1798 * to the di_next_unlinked pointers in the on disk inode structures.
1799 * The rest of the data for the inodes is always logged through the
1800 * inodes themselves rather than the inode buffer and is recovered
1801 * in xlog_recover_do_inode_trans().
1802 *
1803 * The only time when buffers full of inodes are fully recovered is
1804 * when the buffer is full of newly allocated inodes. In this case
1805 * the buffer will not be marked as an inode buffer and so will be
1806 * sent to xlog_recover_do_reg_buffer() below during recovery.
1807 */
STATIC int
xlog_recover_do_inode_buffer(
	xfs_mount_t		*mp,
	xlog_recover_item_t	*item,	/* log item holding the logged regions */
	xfs_buf_t		*bp,	/* in-core buffer being recovered */
	xfs_buf_log_format_t	*buf_f)
{
	int			i;
	int			item_index;	/* index into item->ri_buf of current region */
	int			bit;		/* first set bit of current logged region */
	int			nbits;		/* length of current region in bitmap bits */
	int			reg_buf_offset;	/* byte offset of current region in buffer */
	int			reg_buf_bytes;	/* byte length of current region */
	int			next_unlinked_offset;
	int			inodes_per_buf;
	xfs_agino_t		*logged_nextp;	/* di_next_unlinked value in the log copy */
	xfs_agino_t		*buffer_nextp;	/* di_next_unlinked slot in the buffer */
	unsigned int		*data_map = NULL;
	unsigned int		map_size = 0;

	trace_xfs_log_recover_buf_inode_buf(mp->m_log, buf_f);

	/* Locate the bitmap describing which XFS_BLI_CHUNK regions were logged. */
	switch (buf_f->blf_type) {
	case XFS_LI_BUF:
		data_map = buf_f->blf_data_map;
		map_size = buf_f->blf_map_size;
		break;
	}
	/*
	 * Set the variables corresponding to the current region to
	 * 0 so that we'll initialize them on the first pass through
	 * the loop.
	 */
	reg_buf_offset = 0;
	reg_buf_bytes = 0;
	bit = 0;
	nbits = 0;
	item_index = 0;
	inodes_per_buf = XFS_BUF_COUNT(bp) >> mp->m_sb.sb_inodelog;
	/*
	 * Walk every inode slot in the buffer, keeping the "current logged
	 * region" cursor in sync, and copy over only each inode's
	 * di_next_unlinked field where it was actually logged.
	 */
	for (i = 0; i < inodes_per_buf; i++) {
		next_unlinked_offset = (i * mp->m_sb.sb_inodesize) +
			offsetof(xfs_dinode_t, di_next_unlinked);

		while (next_unlinked_offset >=
		       (reg_buf_offset + reg_buf_bytes)) {
			/*
			 * The next di_next_unlinked field is beyond
			 * the current logged region. Find the next
			 * logged region that contains or is beyond
			 * the current di_next_unlinked field.
			 */
			bit += nbits;
			bit = xfs_next_bit(data_map, map_size, bit);

			/*
			 * If there are no more logged regions in the
			 * buffer, then we're done.
			 */
			if (bit == -1) {
				return 0;
			}

			nbits = xfs_contig_bits(data_map, map_size,
							 bit);
			ASSERT(nbits > 0);
			/* Convert bitmap bits to byte offset/length in the buffer. */
			reg_buf_offset = bit << XFS_BLI_SHIFT;
			reg_buf_bytes = nbits << XFS_BLI_SHIFT;
			item_index++;
		}

		/*
		 * If the current logged region starts after the current
		 * di_next_unlinked field, then move on to the next
		 * di_next_unlinked field.
		 */
		if (next_unlinked_offset < reg_buf_offset) {
			continue;
		}

		ASSERT(item->ri_buf[item_index].i_addr != NULL);
		ASSERT((item->ri_buf[item_index].i_len % XFS_BLI_CHUNK) == 0);
		ASSERT((reg_buf_offset + reg_buf_bytes) <= XFS_BUF_COUNT(bp));

		/*
		 * The current logged region contains a copy of the
		 * current di_next_unlinked field.  Extract its value
		 * and copy it to the buffer copy.
		 */
		logged_nextp = (xfs_agino_t *)
			       ((char *)(item->ri_buf[item_index].i_addr) +
				(next_unlinked_offset - reg_buf_offset));
		/* A zero next_unlinked pointer in the log is corruption: fail hard. */
		if (unlikely(*logged_nextp == 0)) {
			xfs_fs_cmn_err(CE_ALERT, mp,
				"bad inode buffer log record (ptr = 0x%p, bp = 0x%p).  XFS trying to replay bad (0) inode di_next_unlinked field",
				item, bp);
			XFS_ERROR_REPORT("xlog_recover_do_inode_buf",
					 XFS_ERRLEVEL_LOW, mp);
			return XFS_ERROR(EFSCORRUPTED);
		}

		buffer_nextp = (xfs_agino_t *)xfs_buf_offset(bp,
					      next_unlinked_offset);
		*buffer_nextp = *logged_nextp;
	}

	return 0;
}
1915
1916/*
1917 * Perform a 'normal' buffer recovery. Each logged region of the
1918 * buffer should be copied over the corresponding region in the
1919 * given buffer. The bitmap in the buf log format structure indicates
1920 * where to place the logged data.
1921 */
1922/*ARGSUSED*/
STATIC void
xlog_recover_do_reg_buffer(
	struct xfs_mount	*mp,
	xlog_recover_item_t	*item,	/* log item carrying the logged regions */
	xfs_buf_t		*bp,	/* destination buffer being recovered */
	xfs_buf_log_format_t	*buf_f)
{
	int			i;	/* index into item->ri_buf; region i */
	int			bit;	/* first bit of current logged region */
	int			nbits;	/* contiguous bits in current region */
	unsigned int		*data_map = NULL;
	unsigned int		map_size = 0;
	int			error;

	trace_xfs_log_recover_buf_reg_buf(mp->m_log, buf_f);

	/* Locate the bitmap describing which XFS_BLI_CHUNK regions were logged. */
	switch (buf_f->blf_type) {
	case XFS_LI_BUF:
		data_map = buf_f->blf_data_map;
		map_size = buf_f->blf_map_size;
		break;
	}
	bit = 0;
	i = 1;  /* 0 is the buf format structure */
	/* Copy each contiguous logged region over the matching buffer range. */
	while (1) {
		bit = xfs_next_bit(data_map, map_size, bit);
		if (bit == -1)
			break;
		nbits = xfs_contig_bits(data_map, map_size, bit);
		ASSERT(nbits > 0);
		ASSERT(item->ri_buf[i].i_addr != NULL);
		ASSERT(item->ri_buf[i].i_len % XFS_BLI_CHUNK == 0);
		ASSERT(XFS_BUF_COUNT(bp) >=
		       ((uint)bit << XFS_BLI_SHIFT)+(nbits<<XFS_BLI_SHIFT));

		/*
		 * Do a sanity check if this is a dquot buffer. Just checking
		 * the first dquot in the buffer should do. XXXThis is
		 * probably a good thing to do for other buf types also.
		 */
		error = 0;
		if (buf_f->blf_flags &
		   (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
			/* A bad dquot region is skipped (goto next), not fatal. */
			if (item->ri_buf[i].i_addr == NULL) {
				cmn_err(CE_ALERT,
					"XFS: NULL dquot in %s.", __func__);
				goto next;
			}
			if (item->ri_buf[i].i_len < sizeof(xfs_disk_dquot_t)) {
				cmn_err(CE_ALERT,
					"XFS: dquot too small (%d) in %s.",
					item->ri_buf[i].i_len, __func__);
				goto next;
			}
			error = xfs_qm_dqcheck((xfs_disk_dquot_t *)
					       item->ri_buf[i].i_addr,
					       -1, 0, XFS_QMOPT_DOWARN,
					       "dquot_buf_recover");
			if (error)
				goto next;
		}

		memcpy(xfs_buf_offset(bp,
			(uint)bit << XFS_BLI_SHIFT),	/* dest */
			item->ri_buf[i].i_addr,		/* source */
			nbits<<XFS_BLI_SHIFT);		/* length */
 next:
		i++;
		bit += nbits;
	}

	/* Shouldn't be any more regions */
	ASSERT(i == item->ri_total);
}
1997
1998/*
1999 * Do some primitive error checking on ondisk dquot data structures.
2000 */
int
xfs_qm_dqcheck(
	xfs_disk_dquot_t *ddq,	/* on-disk (big-endian) dquot to validate */
	xfs_dqid_t	 id,	/* expected dquot ID, or -1 to skip the check */
	uint		 type,	  /* used only when IO_dorepair is true */
	uint		 flags,	/* XFS_QMOPT_DOWARN and/or XFS_QMOPT_DQREPAIR */
	char		 *str)	/* caller name used as prefix in warnings */
{
	xfs_dqblk_t	 *d = (xfs_dqblk_t *)ddq;
	int		errs = 0;	/* number of problems found; return value */

	/*
	 * We can encounter an uninitialized dquot buffer for 2 reasons:
	 * 1. If we crash while deleting the quotainode(s), and those blks got
	 *    used for user data. This is because we take the path of regular
	 *    file deletion; however, the size field of quotainodes is never
	 *    updated, so all the tricks that we play in itruncate_finish
	 *    don't quite matter.
	 *
	 * 2. We don't play the quota buffers when there's a quotaoff logitem.
	 *    But the allocation will be replayed so we'll end up with an
	 *    uninitialized quota block.
	 *
	 * This is all fine; things are still consistent, and we haven't lost
	 * any quota information. Just don't complain about bad dquot blks.
	 */
	if (be16_to_cpu(ddq->d_magic) != XFS_DQUOT_MAGIC) {
		if (flags & XFS_QMOPT_DOWARN)
			cmn_err(CE_ALERT,
			"%s : XFS dquot ID 0x%x, magic 0x%x != 0x%x",
			str, id, be16_to_cpu(ddq->d_magic), XFS_DQUOT_MAGIC);
		errs++;
	}
	if (ddq->d_version != XFS_DQUOT_VERSION) {
		if (flags & XFS_QMOPT_DOWARN)
			cmn_err(CE_ALERT,
			"%s : XFS dquot ID 0x%x, version 0x%x != 0x%x",
			str, id, ddq->d_version, XFS_DQUOT_VERSION);
		errs++;
	}

	/* d_flags must identify exactly one quota type. */
	if (ddq->d_flags != XFS_DQ_USER &&
	    ddq->d_flags != XFS_DQ_PROJ &&
	    ddq->d_flags != XFS_DQ_GROUP) {
		if (flags & XFS_QMOPT_DOWARN)
			cmn_err(CE_ALERT,
			"%s : XFS dquot ID 0x%x, unknown flags 0x%x",
			str, id, ddq->d_flags);
		errs++;
	}

	if (id != -1 && id != be32_to_cpu(ddq->d_id)) {
		if (flags & XFS_QMOPT_DOWARN)
			cmn_err(CE_ALERT,
			"%s : ondisk-dquot 0x%p, ID mismatch: "
			"0x%x expected, found id 0x%x",
			str, ddq, id, be32_to_cpu(ddq->d_id));
		errs++;
	}

	/*
	 * For structurally-sound non-root dquots, verify that the grace
	 * timers are running whenever usage meets or exceeds a soft limit.
	 * (The zero tests on the limit fields are endian-safe as written.)
	 */
	if (!errs && ddq->d_id) {
		if (ddq->d_blk_softlimit &&
		    be64_to_cpu(ddq->d_bcount) >=
				be64_to_cpu(ddq->d_blk_softlimit)) {
			if (!ddq->d_btimer) {
				if (flags & XFS_QMOPT_DOWARN)
					cmn_err(CE_ALERT,
					"%s : Dquot ID 0x%x (0x%p) "
					"BLK TIMER NOT STARTED",
					str, (int)be32_to_cpu(ddq->d_id), ddq);
				errs++;
			}
		}
		if (ddq->d_ino_softlimit &&
		    be64_to_cpu(ddq->d_icount) >=
				be64_to_cpu(ddq->d_ino_softlimit)) {
			if (!ddq->d_itimer) {
				if (flags & XFS_QMOPT_DOWARN)
					cmn_err(CE_ALERT,
					"%s : Dquot ID 0x%x (0x%p) "
					"INODE TIMER NOT STARTED",
					str, (int)be32_to_cpu(ddq->d_id), ddq);
				errs++;
			}
		}
		if (ddq->d_rtb_softlimit &&
		    be64_to_cpu(ddq->d_rtbcount) >=
				be64_to_cpu(ddq->d_rtb_softlimit)) {
			if (!ddq->d_rtbtimer) {
				if (flags & XFS_QMOPT_DOWARN)
					cmn_err(CE_ALERT,
					"%s : Dquot ID 0x%x (0x%p) "
					"RTBLK TIMER NOT STARTED",
					str, (int)be32_to_cpu(ddq->d_id), ddq);
				errs++;
			}
		}
	}

	/* Without DQREPAIR we only report; the dquot is left untouched. */
	if (!errs || !(flags & XFS_QMOPT_DQREPAIR))
		return errs;

	if (flags & XFS_QMOPT_DOWARN)
		cmn_err(CE_NOTE, "Re-initializing dquot ID 0x%x", id);

	/*
	 * Typically, a repair is only requested by quotacheck.
	 */
	ASSERT(id != -1);
	ASSERT(flags & XFS_QMOPT_DQREPAIR);
	/* Repair: wipe the whole dquot block and rebuild a minimal header. */
	memset(d, 0, sizeof(xfs_dqblk_t));

	d->dd_diskdq.d_magic = cpu_to_be16(XFS_DQUOT_MAGIC);
	d->dd_diskdq.d_version = XFS_DQUOT_VERSION;
	d->dd_diskdq.d_flags = type;
	d->dd_diskdq.d_id = cpu_to_be32(id);

	return errs;
}
2120
2121/*
2122 * Perform a dquot buffer recovery.
2123 * Simple algorithm: if we have found a QUOTAOFF logitem of the same type
2124 * (ie. USR or GRP), then just toss this buffer away; don't recover it.
2125 * Else, treat it as a regular buffer and do recovery.
2126 */
2127STATIC void
2128xlog_recover_do_dquot_buffer(
2129 xfs_mount_t *mp,
2130 xlog_t *log,
2131 xlog_recover_item_t *item,
2132 xfs_buf_t *bp,
2133 xfs_buf_log_format_t *buf_f)
2134{
2135 uint type;
2136
Dave Chinner9abbc532010-04-13 15:06:46 +10002137 trace_xfs_log_recover_buf_dquot_buf(log, buf_f);
2138
Linus Torvalds1da177e2005-04-16 15:20:36 -07002139 /*
2140 * Filesystems are required to send in quota flags at mount time.
2141 */
2142 if (mp->m_qflags == 0) {
2143 return;
2144 }
2145
2146 type = 0;
2147 if (buf_f->blf_flags & XFS_BLI_UDQUOT_BUF)
2148 type |= XFS_DQ_USER;
Nathan Scottc8ad20f2005-06-21 15:38:48 +10002149 if (buf_f->blf_flags & XFS_BLI_PDQUOT_BUF)
2150 type |= XFS_DQ_PROJ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002151 if (buf_f->blf_flags & XFS_BLI_GDQUOT_BUF)
2152 type |= XFS_DQ_GROUP;
2153 /*
2154 * This type of quotas was turned off, so ignore this buffer
2155 */
2156 if (log->l_quotaoffs_flag & type)
2157 return;
2158
Dave Chinner9abbc532010-04-13 15:06:46 +10002159 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002160}
2161
2162/*
2163 * This routine replays a modification made to a buffer at runtime.
2164 * There are actually two types of buffer, regular and inode, which
2165 * are handled differently. Inode buffers are handled differently
2166 * in that we only recover a specific set of data from them, namely
2167 * the inode di_next_unlinked fields. This is because all other inode
2168 * data is actually logged via inode records and any data we replay
2169 * here which overlaps that may be stale.
2170 *
2171 * When meta-data buffers are freed at run time we log a buffer item
2172 * with the XFS_BLI_CANCEL bit set to indicate that previous copies
2173 * of the buffer in the log should not be replayed at recovery time.
2174 * This is so that if the blocks covered by the buffer are reused for
2175 * file data before we crash we don't end up replaying old, freed
2176 * meta-data into a user's file.
2177 *
2178 * To handle the cancellation of buffer log items, we make two passes
2179 * over the log during recovery. During the first we build a table of
2180 * those buffers which have been cancelled, and during the second we
2181 * only replay those buffers which do not have corresponding cancel
2182 * records in the table. See xlog_recover_do_buffer_pass[1,2] above
2183 * for more details on the implementation of the table of cancel records.
2184 */
2185STATIC int
2186xlog_recover_do_buffer_trans(
2187 xlog_t *log,
2188 xlog_recover_item_t *item,
2189 int pass)
2190{
2191 xfs_buf_log_format_t *buf_f;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002192 xfs_mount_t *mp;
2193 xfs_buf_t *bp;
2194 int error;
2195 int cancel;
2196 xfs_daddr_t blkno;
2197 int len;
2198 ushort flags;
Christoph Hellwig6ad112b2009-11-24 18:02:23 +00002199 uint buf_flags;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002200
2201 buf_f = (xfs_buf_log_format_t *)item->ri_buf[0].i_addr;
2202
2203 if (pass == XLOG_RECOVER_PASS1) {
2204 /*
2205 * In this pass we're only looking for buf items
2206 * with the XFS_BLI_CANCEL bit set.
2207 */
2208 xlog_recover_do_buffer_pass1(log, buf_f);
2209 return 0;
2210 } else {
2211 /*
2212 * In this pass we want to recover all the buffers
2213 * which have not been cancelled and are not
2214 * cancellation buffers themselves. The routine
2215 * we call here will tell us whether or not to
2216 * continue with the replay of this buffer.
2217 */
2218 cancel = xlog_recover_do_buffer_pass2(log, buf_f);
2219 if (cancel) {
Dave Chinner9abbc532010-04-13 15:06:46 +10002220 trace_xfs_log_recover_buf_cancel(log, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002221 return 0;
2222 }
2223 }
Dave Chinner9abbc532010-04-13 15:06:46 +10002224 trace_xfs_log_recover_buf_recover(log, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002225 switch (buf_f->blf_type) {
2226 case XFS_LI_BUF:
2227 blkno = buf_f->blf_blkno;
2228 len = buf_f->blf_len;
2229 flags = buf_f->blf_flags;
2230 break;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002231 default:
2232 xfs_fs_cmn_err(CE_ALERT, log->l_mp,
Nathan Scottfc1f8c12005-11-02 11:44:33 +11002233 "xfs_log_recover: unknown buffer type 0x%x, logdev %s",
2234 buf_f->blf_type, log->l_mp->m_logname ?
2235 log->l_mp->m_logname : "internal");
Linus Torvalds1da177e2005-04-16 15:20:36 -07002236 XFS_ERROR_REPORT("xlog_recover_do_buffer_trans",
2237 XFS_ERRLEVEL_LOW, log->l_mp);
2238 return XFS_ERROR(EFSCORRUPTED);
2239 }
2240
2241 mp = log->l_mp;
Christoph Hellwig0cadda12010-01-19 09:56:44 +00002242 buf_flags = XBF_LOCK;
Christoph Hellwig6ad112b2009-11-24 18:02:23 +00002243 if (!(flags & XFS_BLI_INODE_BUF))
Christoph Hellwig0cadda12010-01-19 09:56:44 +00002244 buf_flags |= XBF_MAPPED;
Christoph Hellwig6ad112b2009-11-24 18:02:23 +00002245
2246 bp = xfs_buf_read(mp->m_ddev_targp, blkno, len, buf_flags);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002247 if (XFS_BUF_ISERROR(bp)) {
2248 xfs_ioerror_alert("xlog_recover_do..(read#1)", log->l_mp,
2249 bp, blkno);
2250 error = XFS_BUF_GETERROR(bp);
2251 xfs_buf_relse(bp);
2252 return error;
2253 }
2254
2255 error = 0;
2256 if (flags & XFS_BLI_INODE_BUF) {
2257 error = xlog_recover_do_inode_buffer(mp, item, bp, buf_f);
Nathan Scottc8ad20f2005-06-21 15:38:48 +10002258 } else if (flags &
2259 (XFS_BLI_UDQUOT_BUF|XFS_BLI_PDQUOT_BUF|XFS_BLI_GDQUOT_BUF)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002260 xlog_recover_do_dquot_buffer(mp, log, item, bp, buf_f);
2261 } else {
Dave Chinner9abbc532010-04-13 15:06:46 +10002262 xlog_recover_do_reg_buffer(mp, item, bp, buf_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002263 }
2264 if (error)
2265 return XFS_ERROR(error);
2266
2267 /*
2268 * Perform delayed write on the buffer. Asynchronous writes will be
2269 * slower when taking into account all the buffers to be flushed.
2270 *
2271 * Also make sure that only inode buffers with good sizes stay in
2272 * the buffer cache. The kernel moves inodes in buffers of 1 block
2273 * or XFS_INODE_CLUSTER_SIZE bytes, whichever is bigger. The inode
2274 * buffers in the log can be a different size if the log was generated
2275 * by an older kernel using unclustered inode buffers or a newer kernel
2276 * running with a different inode cluster size. Regardless, if the
2277 * the inode buffer size isn't MAX(blocksize, XFS_INODE_CLUSTER_SIZE)
2278 * for *our* value of XFS_INODE_CLUSTER_SIZE, then we need to keep
2279 * the buffer out of the buffer cache so that the buffer won't
2280 * overlap with future reads of those inodes.
2281 */
2282 if (XFS_DINODE_MAGIC ==
Christoph Hellwigb53e6752007-10-12 10:59:34 +10002283 be16_to_cpu(*((__be16 *)xfs_buf_offset(bp, 0))) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07002284 (XFS_BUF_COUNT(bp) != MAX(log->l_mp->m_sb.sb_blocksize,
2285 (__uint32_t)XFS_INODE_CLUSTER_SIZE(log->l_mp)))) {
2286 XFS_BUF_STALE(bp);
2287 error = xfs_bwrite(mp, bp);
2288 } else {
Christoph Hellwig15ac08a2008-12-09 04:47:30 -05002289 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2290 bp->b_mount = mp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002291 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2292 xfs_bdwrite(mp, bp);
2293 }
2294
2295 return (error);
2296}
2297
2298STATIC int
2299xlog_recover_do_inode_trans(
2300 xlog_t *log,
2301 xlog_recover_item_t *item,
2302 int pass)
2303{
2304 xfs_inode_log_format_t *in_f;
2305 xfs_mount_t *mp;
2306 xfs_buf_t *bp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002307 xfs_dinode_t *dip;
2308 xfs_ino_t ino;
2309 int len;
2310 xfs_caddr_t src;
2311 xfs_caddr_t dest;
2312 int error;
2313 int attr_index;
2314 uint fields;
Christoph Hellwig347d1c02007-08-28 13:57:51 +10002315 xfs_icdinode_t *dicp;
Tim Shimmin6d192a92006-06-09 14:55:38 +10002316 int need_free = 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002317
2318 if (pass == XLOG_RECOVER_PASS1) {
2319 return 0;
2320 }
2321
Tim Shimmin6d192a92006-06-09 14:55:38 +10002322 if (item->ri_buf[0].i_len == sizeof(xfs_inode_log_format_t)) {
2323 in_f = (xfs_inode_log_format_t *)item->ri_buf[0].i_addr;
2324 } else {
2325 in_f = (xfs_inode_log_format_t *)kmem_alloc(
2326 sizeof(xfs_inode_log_format_t), KM_SLEEP);
2327 need_free = 1;
2328 error = xfs_inode_item_format_convert(&item->ri_buf[0], in_f);
2329 if (error)
2330 goto error;
2331 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002332 ino = in_f->ilf_ino;
2333 mp = log->l_mp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002334
2335 /*
2336 * Inode buffers can be freed, look out for it,
2337 * and do not replay the inode.
2338 */
Christoph Hellwiga1941892008-11-28 14:23:40 +11002339 if (xlog_check_buffer_cancelled(log, in_f->ilf_blkno,
2340 in_f->ilf_len, 0)) {
Tim Shimmin6d192a92006-06-09 14:55:38 +10002341 error = 0;
Dave Chinner9abbc532010-04-13 15:06:46 +10002342 trace_xfs_log_recover_inode_cancel(log, in_f);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002343 goto error;
2344 }
Dave Chinner9abbc532010-04-13 15:06:46 +10002345 trace_xfs_log_recover_inode_recover(log, in_f);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002346
Christoph Hellwig6ad112b2009-11-24 18:02:23 +00002347 bp = xfs_buf_read(mp->m_ddev_targp, in_f->ilf_blkno, in_f->ilf_len,
Christoph Hellwig0cadda12010-01-19 09:56:44 +00002348 XBF_LOCK);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002349 if (XFS_BUF_ISERROR(bp)) {
2350 xfs_ioerror_alert("xlog_recover_do..(read#2)", mp,
Christoph Hellwiga1941892008-11-28 14:23:40 +11002351 bp, in_f->ilf_blkno);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002352 error = XFS_BUF_GETERROR(bp);
2353 xfs_buf_relse(bp);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002354 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002355 }
2356 error = 0;
2357 ASSERT(in_f->ilf_fields & XFS_ILOG_CORE);
Christoph Hellwiga1941892008-11-28 14:23:40 +11002358 dip = (xfs_dinode_t *)xfs_buf_offset(bp, in_f->ilf_boffset);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002359
2360 /*
2361 * Make sure the place we're flushing out to really looks
2362 * like an inode!
2363 */
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002364 if (unlikely(be16_to_cpu(dip->di_magic) != XFS_DINODE_MAGIC)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002365 xfs_buf_relse(bp);
2366 xfs_fs_cmn_err(CE_ALERT, mp,
2367 "xfs_inode_recover: Bad inode magic number, dino ptr = 0x%p, dino bp = 0x%p, ino = %Ld",
2368 dip, bp, ino);
2369 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(1)",
2370 XFS_ERRLEVEL_LOW, mp);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002371 error = EFSCORRUPTED;
2372 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002373 }
Christoph Hellwig347d1c02007-08-28 13:57:51 +10002374 dicp = (xfs_icdinode_t *)(item->ri_buf[1].i_addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002375 if (unlikely(dicp->di_magic != XFS_DINODE_MAGIC)) {
2376 xfs_buf_relse(bp);
2377 xfs_fs_cmn_err(CE_ALERT, mp,
2378 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, ino %Ld",
2379 item, ino);
2380 XFS_ERROR_REPORT("xlog_recover_do_inode_trans(2)",
2381 XFS_ERRLEVEL_LOW, mp);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002382 error = EFSCORRUPTED;
2383 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002384 }
2385
2386 /* Skip replay when the on disk inode is newer than the log one */
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002387 if (dicp->di_flushiter < be16_to_cpu(dip->di_flushiter)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002388 /*
2389 * Deal with the wrap case, DI_MAX_FLUSH is less
2390 * than smaller numbers
2391 */
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002392 if (be16_to_cpu(dip->di_flushiter) == DI_MAX_FLUSH &&
Christoph Hellwig347d1c02007-08-28 13:57:51 +10002393 dicp->di_flushiter < (DI_MAX_FLUSH >> 1)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002394 /* do nothing */
2395 } else {
2396 xfs_buf_relse(bp);
Dave Chinner9abbc532010-04-13 15:06:46 +10002397 trace_xfs_log_recover_inode_skip(log, in_f);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002398 error = 0;
2399 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002400 }
2401 }
2402 /* Take the opportunity to reset the flush iteration count */
2403 dicp->di_flushiter = 0;
2404
2405 if (unlikely((dicp->di_mode & S_IFMT) == S_IFREG)) {
2406 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2407 (dicp->di_format != XFS_DINODE_FMT_BTREE)) {
2408 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(3)",
2409 XFS_ERRLEVEL_LOW, mp, dicp);
2410 xfs_buf_relse(bp);
2411 xfs_fs_cmn_err(CE_ALERT, mp,
2412 "xfs_inode_recover: Bad regular inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2413 item, dip, bp, ino);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002414 error = EFSCORRUPTED;
2415 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002416 }
2417 } else if (unlikely((dicp->di_mode & S_IFMT) == S_IFDIR)) {
2418 if ((dicp->di_format != XFS_DINODE_FMT_EXTENTS) &&
2419 (dicp->di_format != XFS_DINODE_FMT_BTREE) &&
2420 (dicp->di_format != XFS_DINODE_FMT_LOCAL)) {
2421 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(4)",
2422 XFS_ERRLEVEL_LOW, mp, dicp);
2423 xfs_buf_relse(bp);
2424 xfs_fs_cmn_err(CE_ALERT, mp,
2425 "xfs_inode_recover: Bad dir inode log record, rec ptr 0x%p, ino ptr = 0x%p, ino bp = 0x%p, ino %Ld",
2426 item, dip, bp, ino);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002427 error = EFSCORRUPTED;
2428 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002429 }
2430 }
2431 if (unlikely(dicp->di_nextents + dicp->di_anextents > dicp->di_nblocks)){
2432 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(5)",
2433 XFS_ERRLEVEL_LOW, mp, dicp);
2434 xfs_buf_relse(bp);
2435 xfs_fs_cmn_err(CE_ALERT, mp,
2436 "xfs_inode_recover: Bad inode log record, rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, total extents = %d, nblocks = %Ld",
2437 item, dip, bp, ino,
2438 dicp->di_nextents + dicp->di_anextents,
2439 dicp->di_nblocks);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002440 error = EFSCORRUPTED;
2441 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002442 }
2443 if (unlikely(dicp->di_forkoff > mp->m_sb.sb_inodesize)) {
2444 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(6)",
2445 XFS_ERRLEVEL_LOW, mp, dicp);
2446 xfs_buf_relse(bp);
2447 xfs_fs_cmn_err(CE_ALERT, mp,
2448 "xfs_inode_recover: Bad inode log rec ptr 0x%p, dino ptr 0x%p, dino bp 0x%p, ino %Ld, forkoff 0x%x",
2449 item, dip, bp, ino, dicp->di_forkoff);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002450 error = EFSCORRUPTED;
2451 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002452 }
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002453 if (unlikely(item->ri_buf[1].i_len > sizeof(struct xfs_icdinode))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07002454 XFS_CORRUPTION_ERROR("xlog_recover_do_inode_trans(7)",
2455 XFS_ERRLEVEL_LOW, mp, dicp);
2456 xfs_buf_relse(bp);
2457 xfs_fs_cmn_err(CE_ALERT, mp,
2458 "xfs_inode_recover: Bad inode log record length %d, rec ptr 0x%p",
2459 item->ri_buf[1].i_len, item);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002460 error = EFSCORRUPTED;
2461 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002462 }
2463
2464 /* The core is in in-core format */
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002465 xfs_dinode_to_disk(dip, (xfs_icdinode_t *)item->ri_buf[1].i_addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002466
2467 /* the rest is in on-disk format */
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002468 if (item->ri_buf[1].i_len > sizeof(struct xfs_icdinode)) {
2469 memcpy((xfs_caddr_t) dip + sizeof(struct xfs_icdinode),
2470 item->ri_buf[1].i_addr + sizeof(struct xfs_icdinode),
2471 item->ri_buf[1].i_len - sizeof(struct xfs_icdinode));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002472 }
2473
2474 fields = in_f->ilf_fields;
2475 switch (fields & (XFS_ILOG_DEV | XFS_ILOG_UUID)) {
2476 case XFS_ILOG_DEV:
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002477 xfs_dinode_put_rdev(dip, in_f->ilf_u.ilfu_rdev);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002478 break;
2479 case XFS_ILOG_UUID:
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002480 memcpy(XFS_DFORK_DPTR(dip),
2481 &in_f->ilf_u.ilfu_uuid,
2482 sizeof(uuid_t));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002483 break;
2484 }
2485
2486 if (in_f->ilf_size == 2)
2487 goto write_inode_buffer;
2488 len = item->ri_buf[2].i_len;
2489 src = item->ri_buf[2].i_addr;
2490 ASSERT(in_f->ilf_size <= 4);
2491 ASSERT((in_f->ilf_size == 3) || (fields & XFS_ILOG_AFORK));
2492 ASSERT(!(fields & XFS_ILOG_DFORK) ||
2493 (len == in_f->ilf_dsize));
2494
2495 switch (fields & XFS_ILOG_DFORK) {
2496 case XFS_ILOG_DDATA:
2497 case XFS_ILOG_DEXT:
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002498 memcpy(XFS_DFORK_DPTR(dip), src, len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002499 break;
2500
2501 case XFS_ILOG_DBROOT:
Christoph Hellwig7cc95a82008-10-30 17:14:34 +11002502 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src, len,
Christoph Hellwig81591fe2008-11-28 14:23:39 +11002503 (xfs_bmdr_block_t *)XFS_DFORK_DPTR(dip),
Linus Torvalds1da177e2005-04-16 15:20:36 -07002504 XFS_DFORK_DSIZE(dip, mp));
2505 break;
2506
2507 default:
2508 /*
2509 * There are no data fork flags set.
2510 */
2511 ASSERT((fields & XFS_ILOG_DFORK) == 0);
2512 break;
2513 }
2514
2515 /*
2516 * If we logged any attribute data, recover it. There may or
2517 * may not have been any other non-core data logged in this
2518 * transaction.
2519 */
2520 if (in_f->ilf_fields & XFS_ILOG_AFORK) {
2521 if (in_f->ilf_fields & XFS_ILOG_DFORK) {
2522 attr_index = 3;
2523 } else {
2524 attr_index = 2;
2525 }
2526 len = item->ri_buf[attr_index].i_len;
2527 src = item->ri_buf[attr_index].i_addr;
2528 ASSERT(len == in_f->ilf_asize);
2529
2530 switch (in_f->ilf_fields & XFS_ILOG_AFORK) {
2531 case XFS_ILOG_ADATA:
2532 case XFS_ILOG_AEXT:
2533 dest = XFS_DFORK_APTR(dip);
2534 ASSERT(len <= XFS_DFORK_ASIZE(dip, mp));
2535 memcpy(dest, src, len);
2536 break;
2537
2538 case XFS_ILOG_ABROOT:
2539 dest = XFS_DFORK_APTR(dip);
Christoph Hellwig7cc95a82008-10-30 17:14:34 +11002540 xfs_bmbt_to_bmdr(mp, (struct xfs_btree_block *)src,
2541 len, (xfs_bmdr_block_t*)dest,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002542 XFS_DFORK_ASIZE(dip, mp));
2543 break;
2544
2545 default:
2546 xlog_warn("XFS: xlog_recover_do_inode_trans: Invalid flag");
2547 ASSERT(0);
2548 xfs_buf_relse(bp);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002549 error = EIO;
2550 goto error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002551 }
2552 }
2553
2554write_inode_buffer:
Christoph Hellwigdd0bbad2009-03-16 08:19:59 +01002555 ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
2556 bp->b_mount = mp;
2557 XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
2558 xfs_bdwrite(mp, bp);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002559error:
2560 if (need_free)
Denys Vlasenkof0e2d932008-05-19 16:31:57 +10002561 kmem_free(in_f);
Tim Shimmin6d192a92006-06-09 14:55:38 +10002562 return XFS_ERROR(error);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002563}
2564
2565/*
2566 * Recover QUOTAOFF records. We simply make a note of it in the xlog_t
2567 * structure, so that we know not to do any dquot item or dquot buffer recovery,
2568 * of that type.
2569 */
2570STATIC int
2571xlog_recover_do_quotaoff_trans(
2572 xlog_t *log,
2573 xlog_recover_item_t *item,
2574 int pass)
2575{
2576 xfs_qoff_logformat_t *qoff_f;
2577
2578 if (pass == XLOG_RECOVER_PASS2) {
2579 return (0);
2580 }
2581
2582 qoff_f = (xfs_qoff_logformat_t *)item->ri_buf[0].i_addr;
2583 ASSERT(qoff_f);
2584
2585 /*
2586 * The logitem format's flag tells us if this was user quotaoff,
Nathan Scott77a7cce2006-01-11 15:35:57 +11002587 * group/project quotaoff or both.
Linus Torvalds1da177e2005-04-16 15:20:36 -07002588 */
2589 if (qoff_f->qf_flags & XFS_UQUOTA_ACCT)
2590 log->l_quotaoffs_flag |= XFS_DQ_USER;
Nathan Scott77a7cce2006-01-11 15:35:57 +11002591 if (qoff_f->qf_flags & XFS_PQUOTA_ACCT)
2592 log->l_quotaoffs_flag |= XFS_DQ_PROJ;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002593 if (qoff_f->qf_flags & XFS_GQUOTA_ACCT)
2594 log->l_quotaoffs_flag |= XFS_DQ_GROUP;
2595
2596 return (0);
2597}
2598
2599/*
2600 * Recover a dquot record
2601 */
STATIC int
xlog_recover_do_dquot_trans(
	xlog_t			*log,
	xlog_recover_item_t	*item,
	int			pass)
{
	xfs_mount_t		*mp;
	xfs_buf_t		*bp;
	struct xfs_disk_dquot	*ddq, *recddq;
	int			error;
	xfs_dq_logformat_t	*dq_f;
	uint			type;

	/* Dquots are only replayed in pass 2. */
	if (pass == XLOG_RECOVER_PASS1) {
		return 0;
	}
	mp = log->l_mp;

	/*
	 * Filesystems are required to send in quota flags at mount time.
	 */
	if (mp->m_qflags == 0)
		return (0);

	recddq = (xfs_disk_dquot_t *)item->ri_buf[1].i_addr;

	/*
	 * Sanity-check the logged dquot region before dereferencing it;
	 * a corrupt log could hand us a NULL or truncated buffer.
	 */
	if (item->ri_buf[1].i_addr == NULL) {
		cmn_err(CE_ALERT,
			"XFS: NULL dquot in %s.", __func__);
		return XFS_ERROR(EIO);
	}
	if (item->ri_buf[1].i_len < sizeof(xfs_disk_dquot_t)) {
		cmn_err(CE_ALERT,
			"XFS: dquot too small (%d) in %s.",
			item->ri_buf[1].i_len, __func__);
		return XFS_ERROR(EIO);
	}

	/*
	 * This type of quotas was turned off, so ignore this record.
	 */
	type = recddq->d_flags & (XFS_DQ_USER | XFS_DQ_PROJ | XFS_DQ_GROUP);
	ASSERT(type);
	if (log->l_quotaoffs_flag & type)
		return (0);

	/*
	 * At this point we know that quota was _not_ turned off.
	 * Since the mount flags are not indicating to us otherwise, this
	 * must mean that quota is on, and the dquot needs to be replayed.
	 * Remember that we may not have fully recovered the superblock yet,
	 * so we can't do the usual trick of looking at the SB quota bits.
	 *
	 * The other possibility, of course, is that the quota subsystem was
	 * removed since the last mount - ENOSYS.
	 */
	dq_f = (xfs_dq_logformat_t *)item->ri_buf[0].i_addr;
	ASSERT(dq_f);
	/* Validate the logged copy of the dquot before replaying it. */
	if ((error = xfs_qm_dqcheck(recddq,
			   dq_f->qlf_id,
			   0, XFS_QMOPT_DOWARN,
			   "xlog_recover_do_dquot_trans (log copy)"))) {
		return XFS_ERROR(EIO);
	}
	ASSERT(dq_f->qlf_len == 1);

	/* Read in the on-disk buffer that holds the dquot to overwrite. */
	error = xfs_read_buf(mp, mp->m_ddev_targp,
			     dq_f->qlf_blkno,
			     XFS_FSB_TO_BB(mp, dq_f->qlf_len),
			     0, &bp);
	if (error) {
		xfs_ioerror_alert("xlog_recover_do..(read#3)", mp,
				  bp, dq_f->qlf_blkno);
		return error;
	}
	ASSERT(bp);
	ddq = (xfs_disk_dquot_t *)xfs_buf_offset(bp, dq_f->qlf_boffset);

	/*
	 * At least the magic num portion should be on disk because this
	 * was among a chunk of dquots created earlier, and we did some
	 * minimal initialization then.
	 */
	if (xfs_qm_dqcheck(ddq, dq_f->qlf_id, 0, XFS_QMOPT_DOWARN,
			   "xlog_recover_do_dquot_trans")) {
		xfs_buf_relse(bp);
		return XFS_ERROR(EIO);
	}

	/* Overwrite the on-disk dquot with the logged image. */
	memcpy(ddq, recddq, item->ri_buf[1].i_len);

	ASSERT(dq_f->qlf_size == 2);
	ASSERT(bp->b_mount == NULL || bp->b_mount == mp);
	bp->b_mount = mp;
	XFS_BUF_SET_IODONE_FUNC(bp, xlog_recover_iodone);
	/* Delayed write: the buffer is flushed to disk later. */
	xfs_bdwrite(mp, bp);

	return (0);
}
2701
2702/*
2703 * This routine is called to create an in-core extent free intent
2704 * item from the efi format structure which was logged on disk.
2705 * It allocates an in-core efi, copies the extents from the format
2706 * structure into it, and adds the efi to the AIL with the given
2707 * LSN.
2708 */
STATIC int
xlog_recover_do_efi_trans(
	xlog_t			*log,
	xlog_recover_item_t	*item,
	xfs_lsn_t		lsn,
	int			pass)
{
	int			error;
	xfs_mount_t		*mp;
	xfs_efi_log_item_t	*efip;
	xfs_efi_log_format_t	*efi_formatp;

	/* EFIs are only queued to the AIL in pass 2. */
	if (pass == XLOG_RECOVER_PASS1) {
		return 0;
	}

	efi_formatp = (xfs_efi_log_format_t *)item->ri_buf[0].i_addr;

	mp = log->l_mp;
	efip = xfs_efi_init(mp, efi_formatp->efi_nextents);
	/* Copy the logged format into the in-core item; free it on failure. */
	if ((error = xfs_efi_copy_format(&(item->ri_buf[0]),
					 &(efip->efi_format)))) {
		xfs_efi_item_free(efip);
		return error;
	}
	efip->efi_next_extent = efi_formatp->efi_nextents;
	efip->efi_flags |= XFS_EFI_COMMITTED;

	spin_lock(&log->l_ailp->xa_lock);
	/*
	 * xfs_trans_ail_update() drops the AIL lock.
	 */
	xfs_trans_ail_update(log->l_ailp, (xfs_log_item_t *)efip, lsn);
	return 0;
}
2744
2745
2746/*
2747 * This routine is called when an efd format structure is found in
2748 * a committed transaction in the log. It's purpose is to cancel
2749 * the corresponding efi if it was still in the log. To do this
2750 * it searches the AIL for the efi with an id equal to that in the
2751 * efd format structure. If we find it, we remove the efi from the
2752 * AIL and free it.
2753 */
STATIC void
xlog_recover_do_efd_trans(
	xlog_t			*log,
	xlog_recover_item_t	*item,
	int			pass)
{
	xfs_efd_log_format_t	*efd_formatp;
	xfs_efi_log_item_t	*efip = NULL;
	xfs_log_item_t		*lip;
	__uint64_t		efi_id;
	struct xfs_ail_cursor	cur;
	struct xfs_ail		*ailp = log->l_ailp;

	/* EFDs are only processed in pass 2. */
	if (pass == XLOG_RECOVER_PASS1) {
		return;
	}

	efd_formatp = (xfs_efd_log_format_t *)item->ri_buf[0].i_addr;
	/* The logged region must be a 32-bit or a 64-bit EFD format. */
	ASSERT((item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_32_t) +
		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_32_t)))) ||
	       (item->ri_buf[0].i_len == (sizeof(xfs_efd_log_format_64_t) +
		((efd_formatp->efd_nextents - 1) * sizeof(xfs_extent_64_t)))));
	efi_id = efd_formatp->efd_efi_id;

	/*
	 * Search for the efi with the id in the efd format structure
	 * in the AIL.
	 */
	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		if (lip->li_type == XFS_LI_EFI) {
			efip = (xfs_efi_log_item_t *)lip;
			if (efip->efi_format.efi_id == efi_id) {
				/*
				 * xfs_trans_ail_delete() drops the
				 * AIL lock.
				 */
				xfs_trans_ail_delete(ailp, lip);
				xfs_efi_item_free(efip);
				/* Re-take the lock to finish the walk. */
				spin_lock(&ailp->xa_lock);
				break;
			}
		}
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
	xfs_trans_ail_cursor_done(ailp, &cur);
	spin_unlock(&ailp->xa_lock);
}
2803
2804/*
2805 * Perform the transaction
2806 *
2807 * If the transaction modifies a buffer or inode, do it now. Otherwise,
2808 * EFIs and EFDs get queued up by adding entries into the AIL for them.
2809 */
2810STATIC int
2811xlog_recover_do_trans(
2812 xlog_t *log,
2813 xlog_recover_t *trans,
2814 int pass)
2815{
2816 int error = 0;
Dave Chinnerf0a76952010-01-11 11:49:57 +00002817 xlog_recover_item_t *item;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002818
Dave Chinner9abbc532010-04-13 15:06:46 +10002819 error = xlog_recover_reorder_trans(log, trans, pass);
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002820 if (error)
Linus Torvalds1da177e2005-04-16 15:20:36 -07002821 return error;
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002822
Dave Chinnerf0a76952010-01-11 11:49:57 +00002823 list_for_each_entry(item, &trans->r_itemq, ri_list) {
Dave Chinner9abbc532010-04-13 15:06:46 +10002824 trace_xfs_log_recover_item_recover(log, trans, item, pass);
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002825 switch (ITEM_TYPE(item)) {
2826 case XFS_LI_BUF:
2827 error = xlog_recover_do_buffer_trans(log, item, pass);
2828 break;
2829 case XFS_LI_INODE:
2830 error = xlog_recover_do_inode_trans(log, item, pass);
2831 break;
2832 case XFS_LI_EFI:
2833 error = xlog_recover_do_efi_trans(log, item,
2834 trans->r_lsn, pass);
2835 break;
2836 case XFS_LI_EFD:
Linus Torvalds1da177e2005-04-16 15:20:36 -07002837 xlog_recover_do_efd_trans(log, item, pass);
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002838 error = 0;
2839 break;
2840 case XFS_LI_DQUOT:
2841 error = xlog_recover_do_dquot_trans(log, item, pass);
2842 break;
2843 case XFS_LI_QUOTAOFF:
2844 error = xlog_recover_do_quotaoff_trans(log, item,
2845 pass);
2846 break;
2847 default:
2848 xlog_warn(
2849 "XFS: invalid item type (%d) xlog_recover_do_trans", ITEM_TYPE(item));
Linus Torvalds1da177e2005-04-16 15:20:36 -07002850 ASSERT(0);
2851 error = XFS_ERROR(EIO);
2852 break;
2853 }
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002854
2855 if (error)
2856 return error;
Dave Chinnerf0a76952010-01-11 11:49:57 +00002857 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002858
Christoph Hellwigff0205e2009-03-16 08:20:52 +01002859 return 0;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002860}
2861
2862/*
2863 * Free up any resources allocated by the transaction
2864 *
2865 * Remember that EFIs, EFDs, and IUNLINKs are handled later.
2866 */
2867STATIC void
2868xlog_recover_free_trans(
2869 xlog_recover_t *trans)
2870{
Dave Chinnerf0a76952010-01-11 11:49:57 +00002871 xlog_recover_item_t *item, *n;
Linus Torvalds1da177e2005-04-16 15:20:36 -07002872 int i;
2873
Dave Chinnerf0a76952010-01-11 11:49:57 +00002874 list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) {
2875 /* Free the regions in the item. */
2876 list_del(&item->ri_list);
2877 for (i = 0; i < item->ri_cnt; i++)
2878 kmem_free(item->ri_buf[i].i_addr);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002879 /* Free the item itself */
Dave Chinnerf0a76952010-01-11 11:49:57 +00002880 kmem_free(item->ri_buf);
2881 kmem_free(item);
2882 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07002883 /* Free the transaction recover structure */
Denys Vlasenkof0e2d932008-05-19 16:31:57 +10002884 kmem_free(trans);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002885}
2886
2887STATIC int
2888xlog_recover_commit_trans(
2889 xlog_t *log,
Linus Torvalds1da177e2005-04-16 15:20:36 -07002890 xlog_recover_t *trans,
2891 int pass)
2892{
2893 int error;
2894
Dave Chinnerf0a76952010-01-11 11:49:57 +00002895 hlist_del(&trans->r_list);
Linus Torvalds1da177e2005-04-16 15:20:36 -07002896 if ((error = xlog_recover_do_trans(log, trans, pass)))
2897 return error;
2898 xlog_recover_free_trans(trans); /* no error */
2899 return 0;
2900}
2901
STATIC int
xlog_recover_unmount_trans(
	xlog_recover_t		*trans)
{
	/* Nothing to replay for an unmount record; just note that we saw it. */
	xlog_warn("XFS: xlog_recover_unmount_trans: Unmount LR");
	return 0;
}
2910
2911/*
2912 * There are two valid states of the r_state field. 0 indicates that the
2913 * transaction structure is in a normal state. We have either seen the
2914 * start of the transaction or the last operation we added was not a partial
2915 * operation. If the last operation we added to the transaction was a
2916 * partial operation, we need to mark r_state with XLOG_WAS_CONT_TRANS.
2917 *
2918 * NOTE: skip LRs with 0 data length.
2919 */
STATIC int
xlog_recover_process_data(
	xlog_t			*log,
	struct hlist_head	rhash[],
	xlog_rec_header_t	*rhead,
	xfs_caddr_t		dp,
	int			pass)
{
	xfs_caddr_t		lp;
	int			num_logops;
	xlog_op_header_t	*ohead;
	xlog_recover_t		*trans;
	xlog_tid_t		tid;
	int			error;
	unsigned long		hash;
	uint			flags;

	/* lp marks the end of the valid data in this record. */
	lp = dp + be32_to_cpu(rhead->h_len);
	num_logops = be32_to_cpu(rhead->h_num_logops);

	/* check the log format matches our own - else we can't recover */
	if (xlog_header_check_recover(log->l_mp, rhead))
		return (XFS_ERROR(EIO));

	/* Walk the op headers packed back to back in the record. */
	while ((dp < lp) && num_logops) {
		ASSERT(dp + sizeof(xlog_op_header_t) <= lp);
		ohead = (xlog_op_header_t *)dp;
		dp += sizeof(xlog_op_header_t);
		if (ohead->oh_clientid != XFS_TRANSACTION &&
		    ohead->oh_clientid != XFS_LOG) {
			xlog_warn(
		"XFS: xlog_recover_process_data: bad clientid");
			ASSERT(0);
			return (XFS_ERROR(EIO));
		}
		/* Look up the in-progress transaction for this op's tid. */
		tid = be32_to_cpu(ohead->oh_tid);
		hash = XLOG_RHASH(tid);
		trans = xlog_recover_find_tid(&rhash[hash], tid);
		if (trans == NULL) {		   /* not found; add new tid */
			if (ohead->oh_flags & XLOG_START_TRANS)
				xlog_recover_new_tid(&rhash[hash], tid,
					be64_to_cpu(rhead->h_lsn));
		} else {
			/* The op payload must not run past the record end. */
			if (dp + be32_to_cpu(ohead->oh_len) > lp) {
				xlog_warn(
			"XFS: xlog_recover_process_data: bad length");
				WARN_ON(1);
				return (XFS_ERROR(EIO));
			}
			/*
			 * Normalize the flags: END is irrelevant here, and a
			 * WAS_CONT op supersedes CONTINUE for dispatch below.
			 */
			flags = ohead->oh_flags & ~XLOG_END_TRANS;
			if (flags & XLOG_WAS_CONT_TRANS)
				flags &= ~XLOG_CONTINUE_TRANS;
			switch (flags) {
			case XLOG_COMMIT_TRANS:
				error = xlog_recover_commit_trans(log,
								trans, pass);
				break;
			case XLOG_UNMOUNT_TRANS:
				error = xlog_recover_unmount_trans(trans);
				break;
			case XLOG_WAS_CONT_TRANS:
				error = xlog_recover_add_to_cont_trans(log,
						trans, dp,
						be32_to_cpu(ohead->oh_len));
				break;
			case XLOG_START_TRANS:
				/* A START for a tid we already track is bad. */
				xlog_warn(
			"XFS: xlog_recover_process_data: bad transaction");
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			case 0:
			case XLOG_CONTINUE_TRANS:
				error = xlog_recover_add_to_trans(log, trans,
						dp, be32_to_cpu(ohead->oh_len));
				break;
			default:
				xlog_warn(
			"XFS: xlog_recover_process_data: bad flag");
				ASSERT(0);
				error = XFS_ERROR(EIO);
				break;
			}
			if (error)
				return error;
		}
		/* Advance past this op's payload to the next op header. */
		dp += be32_to_cpu(ohead->oh_len);
		num_logops--;
	}
	return 0;
}
3011
3012/*
3013 * Process an extent free intent item that was recovered from
3014 * the log. We need to free the extents that it describes.
3015 */
STATIC int
xlog_recover_process_efi(
	xfs_mount_t		*mp,
	xfs_efi_log_item_t	*efip)
{
	xfs_efd_log_item_t	*efdp;
	xfs_trans_t		*tp;
	int			i;
	int			error = 0;
	xfs_extent_t		*extp;
	xfs_fsblock_t		startblock_fsb;

	ASSERT(!(efip->efi_flags & XFS_EFI_RECOVERED));

	/*
	 * First check the validity of the extents described by the
	 * EFI. If any are bad, then assume that all are bad and
	 * just toss the EFI.
	 */
	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
		extp = &(efip->efi_format.efi_extents[i]);
		startblock_fsb = XFS_BB_TO_FSB(mp,
				   XFS_FSB_TO_DADDR(mp, extp->ext_start));
		if ((startblock_fsb == 0) ||
		    (extp->ext_len == 0) ||
		    (startblock_fsb >= mp->m_sb.sb_dblocks) ||
		    (extp->ext_len >= mp->m_sb.sb_agblocks)) {
			/*
			 * This will pull the EFI from the AIL and
			 * free the memory associated with it.
			 */
			xfs_efi_release(efip, efip->efi_format.efi_nextents);
			return XFS_ERROR(EIO);
		}
	}

	/* Free the extents in one transaction, logging a matching EFD. */
	tp = xfs_trans_alloc(mp, 0);
	error = xfs_trans_reserve(tp, 0, XFS_ITRUNCATE_LOG_RES(mp), 0, 0, 0);
	if (error)
		goto abort_error;
	efdp = xfs_trans_get_efd(tp, efip, efip->efi_format.efi_nextents);

	for (i = 0; i < efip->efi_format.efi_nextents; i++) {
		extp = &(efip->efi_format.efi_extents[i]);
		error = xfs_free_extent(tp, extp->ext_start, extp->ext_len);
		if (error)
			goto abort_error;
		xfs_trans_log_efd_extent(tp, efdp, extp->ext_start,
					 extp->ext_len);
	}

	/* Mark the EFI done so the AIL walk will not retry it. */
	efip->efi_flags |= XFS_EFI_RECOVERED;
	error = xfs_trans_commit(tp, 0);
	return error;

abort_error:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
	return error;
}
3075
3076/*
Linus Torvalds1da177e2005-04-16 15:20:36 -07003077 * When this is called, all of the EFIs which did not have
3078 * corresponding EFDs should be in the AIL. What we do now
3079 * is free the extents associated with each one.
3080 *
3081 * Since we process the EFIs in normal transactions, they
3082 * will be removed at some point after the commit. This prevents
3083 * us from just walking down the list processing each one.
3084 * We'll use a flag in the EFI to skip those that we've already
3085 * processed and use the AIL iteration mechanism's generation
3086 * count to try to speed this up at least a bit.
3087 *
3088 * When we start, we know that the EFIs are the only things in
3089 * the AIL. As we process them, however, other items are added
3090 * to the AIL. Since everything added to the AIL must come after
3091 * everything already in the AIL, we stop processing as soon as
3092 * we see something other than an EFI in the AIL.
3093 */
STATIC int
xlog_recover_process_efis(
	xlog_t			*log)
{
	xfs_log_item_t		*lip;
	xfs_efi_log_item_t	*efip;
	int			error = 0;
	struct xfs_ail_cursor	cur;
	struct xfs_ail		*ailp;

	ailp = log->l_ailp;
	spin_lock(&ailp->xa_lock);
	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
	while (lip != NULL) {
		/*
		 * We're done when we see something other than an EFI.
		 * There should be no EFIs left in the AIL now.
		 */
		if (lip->li_type != XFS_LI_EFI) {
#ifdef DEBUG
			for (; lip; lip = xfs_trans_ail_cursor_next(ailp, &cur))
				ASSERT(lip->li_type != XFS_LI_EFI);
#endif
			break;
		}

		/*
		 * Skip EFIs that we've already processed.
		 */
		efip = (xfs_efi_log_item_t *)lip;
		if (efip->efi_flags & XFS_EFI_RECOVERED) {
			lip = xfs_trans_ail_cursor_next(ailp, &cur);
			continue;
		}

		/*
		 * Drop the AIL lock across the transaction performed by
		 * xlog_recover_process_efi(), then re-take it to continue
		 * the cursor walk.
		 */
		spin_unlock(&ailp->xa_lock);
		error = xlog_recover_process_efi(log->l_mp, efip);
		spin_lock(&ailp->xa_lock);
		if (error)
			goto out;
		lip = xfs_trans_ail_cursor_next(ailp, &cur);
	}
out:
	xfs_trans_ail_cursor_done(ailp, &cur);
	spin_unlock(&ailp->xa_lock);
	return error;
}
3141
3142/*
3143 * This routine performs a transaction to null out a bad inode pointer
3144 * in an agi unlinked inode hash bucket.
3145 */
STATIC void
xlog_recover_clear_agi_bucket(
	xfs_mount_t	*mp,
	xfs_agnumber_t	agno,
	int		bucket)
{
	xfs_trans_t	*tp;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	int		offset;
	int		error;

	tp = xfs_trans_alloc(mp, XFS_TRANS_CLEAR_AGI_BUCKET);
	error = xfs_trans_reserve(tp, 0, XFS_CLEAR_AGI_BUCKET_LOG_RES(mp),
				  0, 0, 0);
	if (error)
		goto out_abort;

	error = xfs_read_agi(mp, tp, agno, &agibp);
	if (error)
		goto out_abort;

	/* Terminate the bucket's unlinked chain by storing NULLAGINO. */
	agi = XFS_BUF_TO_AGI(agibp);
	agi->agi_unlinked[bucket] = cpu_to_be32(NULLAGINO);
	/* Log only the one modified bucket slot in the AGI. */
	offset = offsetof(xfs_agi_t, agi_unlinked) +
		 (sizeof(xfs_agino_t) * bucket);
	xfs_trans_log_buf(tp, agibp, offset,
			  (offset + sizeof(xfs_agino_t) - 1));

	error = xfs_trans_commit(tp, 0);
	if (error)
		goto out_error;
	return;

out_abort:
	xfs_trans_cancel(tp, XFS_TRANS_ABORT);
out_error:
	/* Best effort only: warn and keep recovering the other buckets. */
	xfs_fs_cmn_err(CE_WARN, mp, "xlog_recover_clear_agi_bucket: "
			"failed to clear agi %d. Continuing.", agno);
	return;
}
3187
STATIC xfs_agino_t
xlog_recover_process_one_iunlink(
	struct xfs_mount	*mp,
	xfs_agnumber_t		agno,
	xfs_agino_t		agino,
	int			bucket)
{
	struct xfs_buf		*ibp;
	struct xfs_dinode	*dip;
	struct xfs_inode	*ip;
	xfs_ino_t		ino;
	int			error;

	ino = XFS_AGINO_TO_INO(mp, agno, agino);
	error = xfs_iget(mp, NULL, ino, 0, 0, &ip, 0);
	if (error)
		goto fail;

	/*
	 * Get the on disk inode to find the next inode in the bucket.
	 */
	error = xfs_itobp(mp, NULL, ip, &dip, &ibp, XBF_LOCK);
	if (error)
		goto fail_iput;

	/* An unlinked-list inode must be unlinked but not yet freed. */
	ASSERT(ip->i_d.di_nlink == 0);
	ASSERT(ip->i_d.di_mode != 0);

	/* setup for the next pass */
	agino = be32_to_cpu(dip->di_next_unlinked);
	xfs_buf_relse(ibp);

	/*
	 * Prevent any DMAPI event from being sent when the reference on
	 * the inode is dropped.
	 */
	ip->i_d.di_dmevmask = 0;

	/* Dropping the last reference frees the unlinked inode. */
	IRELE(ip);
	return agino;

 fail_iput:
	IRELE(ip);
 fail:
	/*
	 * We can't read in the inode this bucket points to, or this inode
	 * is messed up. Just ditch this bucket of inodes. We will lose
	 * some inodes and space, but at least we won't hang.
	 *
	 * Call xlog_recover_clear_agi_bucket() to perform a transaction to
	 * clear the inode pointer in the bucket.
	 */
	xlog_recover_clear_agi_bucket(mp, agno, bucket);
	return NULLAGINO;
}
3243
Linus Torvalds1da177e2005-04-16 15:20:36 -07003244/*
3245 * xlog_iunlink_recover
3246 *
3247 * This is called during recovery to process any inodes which
3248 * we unlinked but not freed when the system crashed. These
3249 * inodes will be on the lists in the AGI blocks. What we do
3250 * here is scan all the AGIs and fully truncate and free any
3251 * inodes found on the lists. Each inode is removed from the
3252 * lists when it has been fully truncated and is freed. The
3253 * freeing of the inode and its removal from the list must be
3254 * atomic.
3255 */
STATIC void
xlog_recover_process_iunlinks(
	xlog_t		*log)
{
	xfs_mount_t	*mp;
	xfs_agnumber_t	agno;
	xfs_agi_t	*agi;
	xfs_buf_t	*agibp;
	xfs_agino_t	agino;
	int		bucket;
	int		error;
	uint		mp_dmevmask;

	mp = log->l_mp;

	/*
	 * Prevent any DMAPI event from being sent while in this function.
	 */
	mp_dmevmask = mp->m_dmevmask;
	mp->m_dmevmask = 0;

	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
		/*
		 * Find the agi for this ag.
		 */
		error = xfs_read_agi(mp, NULL, agno, &agibp);
		if (error) {
			/*
			 * AGI is b0rked. Don't process it.
			 *
			 * We should probably mark the filesystem as corrupt
			 * after we've recovered all the ag's we can....
			 */
			continue;
		}
		agi = XFS_BUF_TO_AGI(agibp);

		/* walk each unlinked-inode hash bucket in this AGI */
		for (bucket = 0; bucket < XFS_AGI_UNLINKED_BUCKETS; bucket++) {
			agino = be32_to_cpu(agi->agi_unlinked[bucket]);
			while (agino != NULLAGINO) {
				/*
				 * Release the agi buffer so that it can
				 * be acquired in the normal course of the
				 * transaction to truncate and free the inode.
				 */
				xfs_buf_relse(agibp);

				/*
				 * Returns the next inode on this bucket's
				 * chain, or NULLAGINO when the chain is
				 * exhausted (or was ditched on error).
				 */
				agino = xlog_recover_process_one_iunlink(mp,
							agno, agino, bucket);

				/*
				 * Reacquire the agibuffer and continue around
				 * the loop. This should never fail as we know
				 * the buffer was good earlier on.
				 */
				error = xfs_read_agi(mp, NULL, agno, &agibp);
				ASSERT(error == 0);
				agi = XFS_BUF_TO_AGI(agibp);
			}
		}

		/*
		 * Release the buffer for the current agi so we can
		 * go on to the next one.
		 */
		xfs_buf_relse(agibp);
	}

	/* restore the DMAPI event mask saved on entry */
	mp->m_dmevmask = mp_dmevmask;
}
3326
3327
3328#ifdef DEBUG
3329STATIC void
3330xlog_pack_data_checksum(
3331 xlog_t *log,
3332 xlog_in_core_t *iclog,
3333 int size)
3334{
3335 int i;
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003336 __be32 *up;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003337 uint chksum = 0;
3338
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003339 up = (__be32 *)iclog->ic_datap;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003340 /* divide length by 4 to get # words */
3341 for (i = 0; i < (size >> 2); i++) {
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003342 chksum ^= be32_to_cpu(*up);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003343 up++;
3344 }
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003345 iclog->ic_header.h_chksum = cpu_to_be32(chksum);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003346}
3347#else
3348#define xlog_pack_data_checksum(log, iclog, size)
3349#endif
3350
/*
 * Stamp cycle number in every block
 *
 * The first word of each basic block of iclog data is overwritten with
 * the cycle number taken from h_lsn.  The displaced words are saved in
 * h_cycle_data[] in the record header and, for v2 logs with more blocks
 * than one header can describe, in the extended headers, so they can be
 * put back by xlog_unpack_data() during recovery.
 */
void
xlog_pack_data(
	xlog_t			*log,
	xlog_in_core_t		*iclog,
	int			roundoff)
{
	int			i, j, k;
	int			size = iclog->ic_offset + roundoff;
	__be32			cycle_lsn;
	xfs_caddr_t		dp;

	/* checksum the data before the cycle stamping modifies it */
	xlog_pack_data_checksum(log, iclog, size);

	cycle_lsn = CYCLE_LSN_DISK(iclog->ic_header.h_lsn);

	dp = iclog->ic_datap;
	for (i = 0; i < BTOBB(size) &&
		i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
		/* save original word in the header, then stamp the cycle */
		iclog->ic_header.h_cycle_data[i] = *(__be32 *)dp;
		*(__be32 *)dp = cycle_lsn;
		dp += BBSIZE;
	}

	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		xlog_in_core_2_t *xhdr = iclog->ic_data;

		/* remaining blocks spill into the extended headers */
		for ( ; i < BTOBB(size); i++) {
			j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
			xhdr[j].hic_xheader.xh_cycle_data[k] = *(__be32 *)dp;
			*(__be32 *)dp = cycle_lsn;
			dp += BBSIZE;
		}

		/* stamp the cycle into every extended header as well */
		for (i = 1; i < log->l_iclog_heads; i++) {
			xhdr[i].hic_xheader.xh_cycle = cycle_lsn;
		}
	}
}
3393
Linus Torvalds1da177e2005-04-16 15:20:36 -07003394STATIC void
3395xlog_unpack_data(
3396 xlog_rec_header_t *rhead,
3397 xfs_caddr_t dp,
3398 xlog_t *log)
3399{
3400 int i, j, k;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003401
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003402 for (i = 0; i < BTOBB(be32_to_cpu(rhead->h_len)) &&
Linus Torvalds1da177e2005-04-16 15:20:36 -07003403 i < (XLOG_HEADER_CYCLE_SIZE / BBSIZE); i++) {
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003404 *(__be32 *)dp = *(__be32 *)&rhead->h_cycle_data[i];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003405 dp += BBSIZE;
3406 }
3407
Eric Sandeen62118702008-03-06 13:44:28 +11003408 if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
Christoph Hellwigb28708d2008-11-28 14:23:38 +11003409 xlog_in_core_2_t *xhdr = (xlog_in_core_2_t *)rhead;
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003410 for ( ; i < BTOBB(be32_to_cpu(rhead->h_len)); i++) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003411 j = i / (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
3412 k = i % (XLOG_HEADER_CYCLE_SIZE / BBSIZE);
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003413 *(__be32 *)dp = xhdr[j].hic_xheader.xh_cycle_data[k];
Linus Torvalds1da177e2005-04-16 15:20:36 -07003414 dp += BBSIZE;
3415 }
3416 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003417}
3418
3419STATIC int
3420xlog_valid_rec_header(
3421 xlog_t *log,
3422 xlog_rec_header_t *rhead,
3423 xfs_daddr_t blkno)
3424{
3425 int hlen;
3426
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003427 if (unlikely(be32_to_cpu(rhead->h_magicno) != XLOG_HEADER_MAGIC_NUM)) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003428 XFS_ERROR_REPORT("xlog_valid_rec_header(1)",
3429 XFS_ERRLEVEL_LOW, log->l_mp);
3430 return XFS_ERROR(EFSCORRUPTED);
3431 }
3432 if (unlikely(
3433 (!rhead->h_version ||
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003434 (be32_to_cpu(rhead->h_version) & (~XLOG_VERSION_OKBITS))))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003435 xlog_warn("XFS: %s: unrecognised log version (%d).",
Harvey Harrison34a622b2008-04-10 12:19:21 +10003436 __func__, be32_to_cpu(rhead->h_version));
Linus Torvalds1da177e2005-04-16 15:20:36 -07003437 return XFS_ERROR(EIO);
3438 }
3439
3440 /* LR body must have data or it wouldn't have been written */
Christoph Hellwigb53e6752007-10-12 10:59:34 +10003441 hlen = be32_to_cpu(rhead->h_len);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003442 if (unlikely( hlen <= 0 || hlen > INT_MAX )) {
3443 XFS_ERROR_REPORT("xlog_valid_rec_header(2)",
3444 XFS_ERRLEVEL_LOW, log->l_mp);
3445 return XFS_ERROR(EFSCORRUPTED);
3446 }
3447 if (unlikely( blkno > log->l_logBBsize || blkno > INT_MAX )) {
3448 XFS_ERROR_REPORT("xlog_valid_rec_header(3)",
3449 XFS_ERRLEVEL_LOW, log->l_mp);
3450 return XFS_ERROR(EFSCORRUPTED);
3451 }
3452 return 0;
3453}
3454
3455/*
3456 * Read the log from tail to head and process the log records found.
3457 * Handle the two cases where the tail and head are in the same cycle
3458 * and where the active portion of the log wraps around the end of
3459 * the physical log separately. The pass parameter is passed through
3460 * to the routines called to process the data and is not looked at
3461 * here.
3462 */
STATIC int
xlog_do_recovery_pass(
	xlog_t			*log,
	xfs_daddr_t		head_blk,
	xfs_daddr_t		tail_blk,
	int			pass)
{
	xlog_rec_header_t	*rhead;
	xfs_daddr_t		blk_no;
	xfs_caddr_t		offset;
	xfs_buf_t		*hbp, *dbp;	/* header / data read buffers */
	int			error = 0, h_size;
	int			bblks, split_bblks;
	int			hblks, split_hblks, wrapped_hblks;
	struct hlist_head	rhash[XLOG_RHASH_SIZE];	/* in-flight trans hash */

	ASSERT(head_blk != tail_blk);

	/*
	 * Read the header of the tail block and get the iclog buffer size from
	 * h_size.  Use this to tell how many sectors make up the log header.
	 */
	if (xfs_sb_version_haslogv2(&log->l_mp->m_sb)) {
		/*
		 * When using variable length iclogs, read first sector of
		 * iclog header and extract the header size from it.  Get a
		 * new hbp that is the correct size.
		 */
		hbp = xlog_get_bp(log, 1);
		if (!hbp)
			return ENOMEM;

		error = xlog_bread(log, tail_blk, 1, hbp, &offset);
		if (error)
			goto bread_err1;

		rhead = (xlog_rec_header_t *)offset;
		error = xlog_valid_rec_header(log, rhead, tail_blk);
		if (error)
			goto bread_err1;
		h_size = be32_to_cpu(rhead->h_size);
		if ((be32_to_cpu(rhead->h_version) & XLOG_VERSION_2) &&
		    (h_size > XLOG_HEADER_CYCLE_SIZE)) {
			/* round up to whole header-sized blocks */
			hblks = h_size / XLOG_HEADER_CYCLE_SIZE;
			if (h_size % XLOG_HEADER_CYCLE_SIZE)
				hblks++;
			xlog_put_bp(hbp);
			hbp = xlog_get_bp(log, hblks);
		} else {
			hblks = 1;
		}
	} else {
		/* v1 logs use a single-sector header and fixed record size */
		ASSERT(log->l_sectBBsize == 1);
		hblks = 1;
		hbp = xlog_get_bp(log, 1);
		h_size = XLOG_BIG_RECORD_BSIZE;
	}

	if (!hbp)
		return ENOMEM;
	dbp = xlog_get_bp(log, BTOBB(h_size));
	if (!dbp) {
		xlog_put_bp(hbp);
		return ENOMEM;
	}

	memset(rhash, 0, sizeof(rhash));
	if (tail_blk <= head_blk) {
		/*
		 * Simple case: the active log region does not wrap around
		 * the physical end of the log, so walk it sequentially,
		 * one record (header + data) at a time.
		 */
		for (blk_no = tail_blk; blk_no < head_blk; ) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			/* blocks in data section */
			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no + hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log,
						rhash, rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	} else {
		/*
		 * Perform recovery around the end of the physical log.
		 * When the head is not on the same cycle number as the tail,
		 * we can't do a sequential recovery as above.
		 */
		blk_no = tail_blk;
		while (blk_no < log->l_logBBsize) {
			/*
			 * Check for header wrapping around physical end-of-log
			 */
			offset = XFS_BUF_PTR(hbp);
			split_hblks = 0;
			wrapped_hblks = 0;
			if (blk_no + hblks <= log->l_logBBsize) {
				/* Read header in one read */
				error = xlog_bread(log, blk_no, hblks, hbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This LR is split across physical log end */
				if (blk_no != log->l_logBBsize) {
					/* some data before physical log end */
					ASSERT(blk_no <= INT_MAX);
					split_hblks = log->l_logBBsize - (int)blk_no;
					ASSERT(split_hblks > 0);
					error = xlog_bread(log, blk_no,
							   split_hblks, hbp,
							   &offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				wrapped_hblks = hblks - split_hblks;
				error = XFS_BUF_SET_PTR(hbp,
						offset + BBTOB(split_hblks),
						BBTOB(hblks - split_hblks));
				if (error)
					goto bread_err2;

				/* read the wrapped part from log start */
				error = xlog_bread_noalign(log, 0,
							   wrapped_hblks, hbp);
				if (error)
					goto bread_err2;

				/* restore the buffer's original pointer */
				error = XFS_BUF_SET_PTR(hbp, offset,
							BBTOB(hblks));
				if (error)
					goto bread_err2;
			}
			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead,
						split_hblks ? blk_no : 0);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			blk_no += hblks;

			/* Read in data for log record */
			if (blk_no + bblks <= log->l_logBBsize) {
				error = xlog_bread(log, blk_no, bblks, dbp,
						   &offset);
				if (error)
					goto bread_err2;
			} else {
				/* This log record is split across the
				 * physical end of log */
				offset = XFS_BUF_PTR(dbp);
				split_bblks = 0;
				if (blk_no != log->l_logBBsize) {
					/* some data is before the physical
					 * end of log */
					ASSERT(!wrapped_hblks);
					ASSERT(blk_no <= INT_MAX);
					split_bblks =
						log->l_logBBsize - (int)blk_no;
					ASSERT(split_bblks > 0);
					error = xlog_bread(log, blk_no,
							split_bblks, dbp,
							&offset);
					if (error)
						goto bread_err2;
				}

				/*
				 * Note: this black magic still works with
				 * large sector sizes (non-512) only because:
				 * - we increased the buffer size originally
				 *   by 1 sector giving us enough extra space
				 *   for the second read;
				 * - the log start is guaranteed to be sector
				 *   aligned;
				 * - we read the log end (LR header start)
				 *   _first_, then the log start (LR header end)
				 *   - order is important.
				 */
				error = XFS_BUF_SET_PTR(dbp,
						offset + BBTOB(split_bblks),
						BBTOB(bblks - split_bblks));
				if (error)
					goto bread_err2;

				error = xlog_bread_noalign(log, wrapped_hblks,
							   bblks - split_bblks,
							   dbp);
				if (error)
					goto bread_err2;

				error = XFS_BUF_SET_PTR(dbp, offset, h_size);
				if (error)
					goto bread_err2;
			}
			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks;
		}

		/* wrap blk_no around to the start of the physical log */
		ASSERT(blk_no >= log->l_logBBsize);
		blk_no -= log->l_logBBsize;

		/* read first part of physical log */
		while (blk_no < head_blk) {
			error = xlog_bread(log, blk_no, hblks, hbp, &offset);
			if (error)
				goto bread_err2;

			rhead = (xlog_rec_header_t *)offset;
			error = xlog_valid_rec_header(log, rhead, blk_no);
			if (error)
				goto bread_err2;

			bblks = (int)BTOBB(be32_to_cpu(rhead->h_len));
			error = xlog_bread(log, blk_no+hblks, bblks, dbp,
					   &offset);
			if (error)
				goto bread_err2;

			xlog_unpack_data(rhead, offset, log);
			if ((error = xlog_recover_process_data(log, rhash,
							rhead, offset, pass)))
				goto bread_err2;
			blk_no += bblks + hblks;
		}
	}

 bread_err2:
	xlog_put_bp(dbp);
 bread_err1:
	xlog_put_bp(hbp);
	return error;
}
3721
3722/*
3723 * Do the recovery of the log. We actually do this in two phases.
3724 * The two passes are necessary in order to implement the function
3725 * of cancelling a record written into the log. The first pass
3726 * determines those things which have been cancelled, and the
3727 * second pass replays log items normally except for those which
3728 * have been cancelled. The handling of the replay and cancellations
3729 * takes place in the log item type specific routines.
3730 *
3731 * The table of items which have cancel records in the log is allocated
3732 * and freed at this level, since only here do we know when all of
3733 * the log recovery has been completed.
3734 */
3735STATIC int
3736xlog_do_log_recovery(
3737 xlog_t *log,
3738 xfs_daddr_t head_blk,
3739 xfs_daddr_t tail_blk)
3740{
3741 int error;
3742
3743 ASSERT(head_blk != tail_blk);
3744
3745 /*
3746 * First do a pass to find all of the cancelled buf log items.
3747 * Store them in the buf_cancel_table for use in the second pass.
3748 */
3749 log->l_buf_cancel_table =
3750 (xfs_buf_cancel_t **)kmem_zalloc(XLOG_BC_TABLE_SIZE *
3751 sizeof(xfs_buf_cancel_t*),
3752 KM_SLEEP);
3753 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3754 XLOG_RECOVER_PASS1);
3755 if (error != 0) {
Denys Vlasenkof0e2d932008-05-19 16:31:57 +10003756 kmem_free(log->l_buf_cancel_table);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003757 log->l_buf_cancel_table = NULL;
3758 return error;
3759 }
3760 /*
3761 * Then do a second pass to actually recover the items in the log.
3762 * When it is complete free the table of buf cancel items.
3763 */
3764 error = xlog_do_recovery_pass(log, head_blk, tail_blk,
3765 XLOG_RECOVER_PASS2);
3766#ifdef DEBUG
Tim Shimmin6d192a92006-06-09 14:55:38 +10003767 if (!error) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003768 int i;
3769
3770 for (i = 0; i < XLOG_BC_TABLE_SIZE; i++)
3771 ASSERT(log->l_buf_cancel_table[i] == NULL);
3772 }
3773#endif /* DEBUG */
3774
Denys Vlasenkof0e2d932008-05-19 16:31:57 +10003775 kmem_free(log->l_buf_cancel_table);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003776 log->l_buf_cancel_table = NULL;
3777
3778 return error;
3779}
3780
3781/*
3782 * Do the actual recovery
3783 */
STATIC int
xlog_do_recover(
	xlog_t		*log,
	xfs_daddr_t	head_blk,
	xfs_daddr_t	tail_blk)
{
	int		error;
	xfs_buf_t	*bp;
	xfs_sb_t	*sbp;

	/*
	 * First replay the images in the log.
	 */
	error = xlog_do_log_recovery(log, head_blk, tail_blk);
	if (error) {
		return error;
	}

	/* push out the buffers written by replay before checking for errors */
	XFS_bflush(log->l_mp->m_ddev_targp);

	/*
	 * If IO errors happened during recovery, bail out.
	 */
	if (XFS_FORCED_SHUTDOWN(log->l_mp)) {
		return (EIO);
	}

	/*
	 * We now update the tail_lsn since much of the recovery has completed
	 * and there may be space available to use. If there were no extent
	 * or iunlinks, we can free up the entire log and set the tail_lsn to
	 * be the last_sync_lsn. This was set in xlog_find_tail to be the
	 * lsn of the last known good LR on disk. If there are extent frees
	 * or iunlinks they will have some entries in the AIL; so we look at
	 * the AIL to determine how to set the tail_lsn.
	 */
	xlog_assign_tail_lsn(log->l_mp);

	/*
	 * Now that we've finished replaying all buffer and inode
	 * updates, re-read in the superblock.
	 */
	bp = xfs_getsb(log->l_mp, 0);
	XFS_BUF_UNDONE(bp);		/* clear stale completion state */
	ASSERT(!(XFS_BUF_ISWRITE(bp)));
	ASSERT(!(XFS_BUF_ISDELAYWRITE(bp)));
	XFS_BUF_READ(bp);		/* set up for a synchronous read */
	XFS_BUF_UNASYNC(bp);
	xfsbdstrat(log->l_mp, bp);	/* issue the I/O */
	error = xfs_iowait(bp);
	if (error) {
		xfs_ioerror_alert("xlog_do_recover",
				  log->l_mp, bp, XFS_BUF_ADDR(bp));
		ASSERT(0);
		xfs_buf_relse(bp);
		return error;
	}

	/* Convert superblock from on-disk format */
	sbp = &log->l_mp->m_sb;
	xfs_sb_from_disk(sbp, XFS_BUF_TO_SBP(bp));
	ASSERT(sbp->sb_magicnum == XFS_SB_MAGIC);
	ASSERT(xfs_sb_good_version(sbp));
	xfs_buf_relse(bp);

	/* We've re-read the superblock so re-initialize per-cpu counters */
	xfs_icsb_reinit_counters(log->l_mp);

	/* consistency check of per-AG counters (active in DEBUG builds) */
	xlog_recover_check_summary(log);

	/* Normal transactions can now occur */
	log->l_flags &= ~XLOG_ACTIVE_RECOVERY;
	return 0;
}
3858
3859/*
3860 * Perform recovery and re-initialize some log variables in xlog_find_tail.
3861 *
3862 * Return error or zero.
3863 */
3864int
3865xlog_recover(
Eric Sandeen65be6052006-01-11 15:34:19 +11003866 xlog_t *log)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003867{
3868 xfs_daddr_t head_blk, tail_blk;
3869 int error;
3870
3871 /* find the tail of the log */
Eric Sandeen65be6052006-01-11 15:34:19 +11003872 if ((error = xlog_find_tail(log, &head_blk, &tail_blk)))
Linus Torvalds1da177e2005-04-16 15:20:36 -07003873 return error;
3874
3875 if (tail_blk != head_blk) {
3876 /* There used to be a comment here:
3877 *
3878 * disallow recovery on read-only mounts. note -- mount
3879 * checks for ENOSPC and turns it into an intelligent
3880 * error message.
3881 * ...but this is no longer true. Now, unless you specify
3882 * NORECOVERY (in which case this function would never be
3883 * called), we just go ahead and recover. We do this all
3884 * under the vfs layer, so we can get away with it unless
3885 * the device itself is read-only, in which case we fail.
3886 */
Utako Kusaka3a02ee12007-05-08 13:50:06 +10003887 if ((error = xfs_dev_is_read_only(log->l_mp, "recovery"))) {
Linus Torvalds1da177e2005-04-16 15:20:36 -07003888 return error;
3889 }
3890
3891 cmn_err(CE_NOTE,
Nathan Scottfc1f8c12005-11-02 11:44:33 +11003892 "Starting XFS recovery on filesystem: %s (logdev: %s)",
3893 log->l_mp->m_fsname, log->l_mp->m_logname ?
3894 log->l_mp->m_logname : "internal");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003895
3896 error = xlog_do_recover(log, head_blk, tail_blk);
3897 log->l_flags |= XLOG_RECOVERY_NEEDED;
3898 }
3899 return error;
3900}
3901
3902/*
3903 * In the first part of recovery we replay inodes and buffers and build
3904 * up the list of extent free items which need to be processed. Here
3905 * we process the extent free items and clean up the on disk unlinked
3906 * inode lists. This is separated from the first part of recovery so
3907 * that the root and real-time bitmap inodes can be read in from disk in
3908 * between the two stages. This is necessary so that we can free space
3909 * in the real-time portion of the file system.
3910 */
3911int
3912xlog_recover_finish(
Christoph Hellwig42490232008-08-13 16:49:32 +10003913 xlog_t *log)
Linus Torvalds1da177e2005-04-16 15:20:36 -07003914{
3915 /*
3916 * Now we're ready to do the transactions needed for the
3917 * rest of recovery. Start with completing all the extent
3918 * free intent records and then process the unlinked inode
3919 * lists. At this point, we essentially run in normal mode
3920 * except that we're still performing recovery actions
3921 * rather than accepting new requests.
3922 */
3923 if (log->l_flags & XLOG_RECOVERY_NEEDED) {
David Chinner3c1e2bb2008-04-10 12:21:11 +10003924 int error;
3925 error = xlog_recover_process_efis(log);
3926 if (error) {
3927 cmn_err(CE_ALERT,
3928 "Failed to recover EFIs on filesystem: %s",
3929 log->l_mp->m_fsname);
3930 return error;
3931 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003932 /*
3933 * Sync the log to get all the EFIs out of the AIL.
3934 * This isn't absolutely necessary, but it helps in
3935 * case the unlink transactions would have problems
3936 * pushing the EFIs out of the way.
3937 */
Christoph Hellwiga14a3482010-01-19 09:56:46 +00003938 xfs_log_force(log->l_mp, XFS_LOG_SYNC);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003939
Christoph Hellwig42490232008-08-13 16:49:32 +10003940 xlog_recover_process_iunlinks(log);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003941
3942 xlog_recover_check_summary(log);
3943
3944 cmn_err(CE_NOTE,
Nathan Scottfc1f8c12005-11-02 11:44:33 +11003945 "Ending XFS recovery on filesystem: %s (logdev: %s)",
3946 log->l_mp->m_fsname, log->l_mp->m_logname ?
3947 log->l_mp->m_logname : "internal");
Linus Torvalds1da177e2005-04-16 15:20:36 -07003948 log->l_flags &= ~XLOG_RECOVERY_NEEDED;
3949 } else {
3950 cmn_err(CE_DEBUG,
Nathan Scottb6574522006-06-09 15:29:40 +10003951 "!Ending clean XFS mount for filesystem: %s\n",
Linus Torvalds1da177e2005-04-16 15:20:36 -07003952 log->l_mp->m_fsname);
3953 }
3954 return 0;
3955}
3956
3957
3958#if defined(DEBUG)
3959/*
3960 * Read all of the agf and agi counters and check that they
3961 * are consistent with the superblock counters.
3962 */
3963void
3964xlog_recover_check_summary(
3965 xlog_t *log)
3966{
3967 xfs_mount_t *mp;
3968 xfs_agf_t *agfp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003969 xfs_buf_t *agfbp;
3970 xfs_buf_t *agibp;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003971 xfs_agnumber_t agno;
3972 __uint64_t freeblks;
3973 __uint64_t itotal;
3974 __uint64_t ifree;
Christoph Hellwig5e1be0f2008-11-28 14:23:37 +11003975 int error;
Linus Torvalds1da177e2005-04-16 15:20:36 -07003976
3977 mp = log->l_mp;
3978
3979 freeblks = 0LL;
3980 itotal = 0LL;
3981 ifree = 0LL;
3982 for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
From: Christoph Hellwig48056212008-11-28 14:23:38 +11003983 error = xfs_read_agf(mp, NULL, agno, 0, &agfbp);
3984 if (error) {
3985 xfs_fs_cmn_err(CE_ALERT, mp,
3986 "xlog_recover_check_summary(agf)"
3987 "agf read failed agno %d error %d",
3988 agno, error);
3989 } else {
3990 agfp = XFS_BUF_TO_AGF(agfbp);
3991 freeblks += be32_to_cpu(agfp->agf_freeblks) +
3992 be32_to_cpu(agfp->agf_flcount);
3993 xfs_buf_relse(agfbp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003994 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07003995
Christoph Hellwig5e1be0f2008-11-28 14:23:37 +11003996 error = xfs_read_agi(mp, NULL, agno, &agibp);
3997 if (!error) {
3998 struct xfs_agi *agi = XFS_BUF_TO_AGI(agibp);
Linus Torvalds1da177e2005-04-16 15:20:36 -07003999
Christoph Hellwig5e1be0f2008-11-28 14:23:37 +11004000 itotal += be32_to_cpu(agi->agi_count);
4001 ifree += be32_to_cpu(agi->agi_freecount);
4002 xfs_buf_relse(agibp);
4003 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004004 }
Linus Torvalds1da177e2005-04-16 15:20:36 -07004005}
4006#endif /* DEBUG */