blob: 8e08efcaede26c06d01271b5dbebeafd55a0183d [file] [log] [blame]
Linus Torvalds1da177e2005-04-16 15:20:36 -07001/*
Uwe Zeisbergerf30c2262006-10-03 23:01:26 +02002 * linux/fs/jbd/commit.c
Linus Torvalds1da177e2005-04-16 15:20:36 -07003 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070023
/*
 * I/O completion callback for the temporary journal buffer_heads.
 *
 * journal_commit_transaction() installs this as ->b_end_io on every
 * buffer it submits to the log.  It records the outcome of the write in
 * the buffer's uptodate bit and then unlocks the buffer; the commit code
 * later checks buffer_uptodate() after wait_on_buffer() to detect
 * failed writes.
 *
 * @bh:       buffer whose I/O has just finished
 * @uptodate: non-zero if the I/O succeeded, zero if it failed
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	/* Reflect the I/O result in the uptodate bit before unlocking. */
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
36
37/*
38 * When an ext3-ordered file is truncated, it is possible that many pages are
39 * not sucessfully freed, because they are attached to a committing transaction.
40 * After the transaction commits, these pages are left on the LRU, with no
41 * ->mapping, and with attached buffers. These pages are trivially reclaimable
42 * by the VM, but their apparent absence upsets the VM accounting, and it makes
43 * the numbers in /proc/meminfo look odd.
44 *
45 * So here, we have a buffer which has just come off the forget list. Look to
46 * see if we can strip all buffers from the backing page.
47 *
48 * Called under lock_journal(), and possibly under journal_datalist_lock. The
49 * caller provided us with a ref against the buffer, and we drop that here.
50 */
51static void release_buffer_page(struct buffer_head *bh)
52{
53 struct page *page;
54
55 if (buffer_dirty(bh))
56 goto nope;
57 if (atomic_read(&bh->b_count) != 1)
58 goto nope;
59 page = bh->b_page;
60 if (!page)
61 goto nope;
62 if (page->mapping)
63 goto nope;
64
65 /* OK, it's a truncated page */
66 if (TestSetPageLocked(page))
67 goto nope;
68
69 page_cache_get(page);
70 __brelse(bh);
71 try_to_free_buffers(page);
72 unlock_page(page);
73 page_cache_release(page);
74 return;
75
76nope:
77 __brelse(bh);
78}
79
80/*
81 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
82 * held. For ranking reasons we must trylock. If we lose, schedule away and
83 * return 0. j_list_lock is dropped in this case.
84 */
85static int inverted_lock(journal_t *journal, struct buffer_head *bh)
86{
87 if (!jbd_trylock_bh_state(bh)) {
88 spin_unlock(&journal->j_list_lock);
89 schedule();
90 return 0;
91 }
92 return 1;
93}
94
95/* Done it all: now write the commit record. We should have
96 * cleaned up our previous buffers by now, so if we are in abort
97 * mode we can now just skip the rest of the journal write
98 * entirely.
99 *
100 * Returns 1 if the journal needs to be aborted or 0 on success
101 */
102static int journal_write_commit_record(journal_t *journal,
103 transaction_t *commit_transaction)
104{
105 struct journal_head *descriptor;
106 struct buffer_head *bh;
107 int i, ret;
108 int barrier_done = 0;
109
110 if (is_journal_aborted(journal))
111 return 0;
112
113 descriptor = journal_get_descriptor_buffer(journal);
114 if (!descriptor)
115 return 1;
116
117 bh = jh2bh(descriptor);
118
119 /* AKPM: buglet - add `i' to tmp! */
120 for (i = 0; i < bh->b_size; i += 512) {
121 journal_header_t *tmp = (journal_header_t*)bh->b_data;
122 tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
123 tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
124 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
125 }
126
127 JBUFFER_TRACE(descriptor, "write commit block");
128 set_buffer_dirty(bh);
129 if (journal->j_flags & JFS_BARRIER) {
130 set_buffer_ordered(bh);
131 barrier_done = 1;
132 }
133 ret = sync_dirty_buffer(bh);
Neil Brown28ae0942008-02-08 04:22:13 -0800134 if (barrier_done)
135 clear_buffer_ordered(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700136 /* is it possible for another commit to fail at roughly
137 * the same time as this one? If so, we don't want to
138 * trust the barrier flag in the super, but instead want
139 * to remember if we sent a barrier request
140 */
141 if (ret == -EOPNOTSUPP && barrier_done) {
142 char b[BDEVNAME_SIZE];
143
144 printk(KERN_WARNING
145 "JBD: barrier-based sync failed on %s - "
146 "disabling barriers\n",
147 bdevname(journal->j_dev, b));
148 spin_lock(&journal->j_state_lock);
149 journal->j_flags &= ~JFS_BARRIER;
150 spin_unlock(&journal->j_state_lock);
151
152 /* And try again, without the barrier */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
158 journal_put_journal_head(descriptor);
159
160 return (ret == -EIO);
161}
162
Jan Kara3998b932006-09-25 23:30:53 -0700163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
242 __journal_file_buffer(jh, commit_transaction,
243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
Hisashi Hifumi6f5a9da2006-12-22 01:11:50 -0800251 } else if (!locked && buffer_locked(bh)) {
252 __journal_file_buffer(jh, commit_transaction,
253 BJ_Locked);
254 jbd_unlock_bh_state(bh);
255 put_bh(bh);
256 } else {
Jan Kara3998b932006-09-25 23:30:53 -0700257 BUFFER_TRACE(bh, "writeout complete: unfile");
258 __journal_unfile_buffer(jh);
259 jbd_unlock_bh_state(bh);
260 if (locked)
261 unlock_buffer(bh);
262 journal_remove_journal_head(bh);
263 /* Once for our safety reference, once for
264 * journal_remove_journal_head() */
265 put_bh(bh);
266 put_bh(bh);
267 }
268
Nick Piggin95c354f2008-01-30 13:31:20 +0100269 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
Jan Kara3998b932006-09-25 23:30:53 -0700270 spin_unlock(&journal->j_list_lock);
271 goto write_out_data;
272 }
273 }
274 spin_unlock(&journal->j_list_lock);
275 journal_do_submit_data(wbuf, bufs);
276}
277
Linus Torvalds1da177e2005-04-16 15:20:36 -0700278/*
279 * journal_commit_transaction
280 *
281 * The primary function for committing a transaction to the log. This
282 * function is called by the journal thread to begin a complete commit.
283 */
284void journal_commit_transaction(journal_t *journal)
285{
286 transaction_t *commit_transaction;
287 struct journal_head *jh, *new_jh, *descriptor;
288 struct buffer_head **wbuf = journal->j_wbuf;
289 int bufs;
290 int flags;
291 int err;
292 unsigned long blocknr;
293 char *tagp = NULL;
294 journal_header_t *header;
295 journal_block_tag_t *tag = NULL;
296 int space_left = 0;
297 int first_tag = 0;
298 int tag_flag;
299 int i;
300
301 /*
302 * First job: lock down the current transaction and wait for
303 * all outstanding updates to complete.
304 */
305
306#ifdef COMMIT_STATS
307 spin_lock(&journal->j_list_lock);
308 summarise_journal_usage(journal);
309 spin_unlock(&journal->j_list_lock);
310#endif
311
312 /* Do we need to erase the effects of a prior journal_flush? */
313 if (journal->j_flags & JFS_FLUSHED) {
314 jbd_debug(3, "super block updated\n");
315 journal_update_superblock(journal, 1);
316 } else {
317 jbd_debug(3, "superblock not updated\n");
318 }
319
320 J_ASSERT(journal->j_running_transaction != NULL);
321 J_ASSERT(journal->j_committing_transaction == NULL);
322
323 commit_transaction = journal->j_running_transaction;
324 J_ASSERT(commit_transaction->t_state == T_RUNNING);
325
326 jbd_debug(1, "JBD: starting commit of transaction %d\n",
327 commit_transaction->t_tid);
328
329 spin_lock(&journal->j_state_lock);
330 commit_transaction->t_state = T_LOCKED;
331
332 spin_lock(&commit_transaction->t_handle_lock);
333 while (commit_transaction->t_updates) {
334 DEFINE_WAIT(wait);
335
336 prepare_to_wait(&journal->j_wait_updates, &wait,
337 TASK_UNINTERRUPTIBLE);
338 if (commit_transaction->t_updates) {
339 spin_unlock(&commit_transaction->t_handle_lock);
340 spin_unlock(&journal->j_state_lock);
341 schedule();
342 spin_lock(&journal->j_state_lock);
343 spin_lock(&commit_transaction->t_handle_lock);
344 }
345 finish_wait(&journal->j_wait_updates, &wait);
346 }
347 spin_unlock(&commit_transaction->t_handle_lock);
348
349 J_ASSERT (commit_transaction->t_outstanding_credits <=
350 journal->j_max_transaction_buffers);
351
352 /*
353 * First thing we are allowed to do is to discard any remaining
354 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
355 * that there are no such buffers: if a large filesystem
356 * operation like a truncate needs to split itself over multiple
357 * transactions, then it may try to do a journal_restart() while
358 * there are still BJ_Reserved buffers outstanding. These must
359 * be released cleanly from the current transaction.
360 *
361 * In this case, the filesystem must still reserve write access
362 * again before modifying the buffer in the new transaction, but
363 * we do not require it to remember exactly which old buffers it
364 * has reserved. This is consistent with the existing behaviour
365 * that multiple journal_get_write_access() calls to the same
366 * buffer are perfectly permissable.
367 */
368 while (commit_transaction->t_reserved_list) {
369 jh = commit_transaction->t_reserved_list;
370 JBUFFER_TRACE(jh, "reserved, unused: refile");
371 /*
372 * A journal_get_undo_access()+journal_release_buffer() may
373 * leave undo-committed data.
374 */
375 if (jh->b_committed_data) {
376 struct buffer_head *bh = jh2bh(jh);
377
378 jbd_lock_bh_state(bh);
Mingming Caoc089d492007-10-16 18:38:25 -0400379 jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhlf99d49a2005-11-07 01:01:34 -0800380 jh->b_committed_data = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700381 jbd_unlock_bh_state(bh);
382 }
383 journal_refile_buffer(journal, jh);
384 }
385
386 /*
387 * Now try to drop any written-back buffers from the journal's
388 * checkpoint lists. We do this *before* commit because it potentially
389 * frees some memory
390 */
391 spin_lock(&journal->j_list_lock);
392 __journal_clean_checkpoint_list(journal);
393 spin_unlock(&journal->j_list_lock);
394
395 jbd_debug (3, "JBD: commit phase 1\n");
396
397 /*
398 * Switch to a new revoke table.
399 */
400 journal_switch_revoke_table(journal);
401
402 commit_transaction->t_state = T_FLUSH;
403 journal->j_committing_transaction = commit_transaction;
404 journal->j_running_transaction = NULL;
405 commit_transaction->t_log_start = journal->j_head;
406 wake_up(&journal->j_wait_transaction_locked);
407 spin_unlock(&journal->j_state_lock);
408
409 jbd_debug (3, "JBD: commit phase 2\n");
410
411 /*
412 * First, drop modified flag: all accesses to the buffers
413 * will be tracked for a new trasaction only -bzzz
414 */
415 spin_lock(&journal->j_list_lock);
416 if (commit_transaction->t_buffers) {
417 new_jh = jh = commit_transaction->t_buffers->b_tnext;
418 do {
419 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
420 new_jh->b_modified == 0);
421 new_jh->b_modified = 0;
422 new_jh = new_jh->b_tnext;
423 } while (new_jh != jh);
424 }
425 spin_unlock(&journal->j_list_lock);
426
427 /*
428 * Now start flushing things to disk, in the order they appear
429 * on the transaction lists. Data blocks go first.
430 */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700431 err = 0;
Jan Kara3998b932006-09-25 23:30:53 -0700432 journal_submit_data_buffers(journal, commit_transaction);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700433
434 /*
435 * Wait for all previously submitted IO to complete.
436 */
Jan Kara3998b932006-09-25 23:30:53 -0700437 spin_lock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700438 while (commit_transaction->t_locked_list) {
439 struct buffer_head *bh;
440
441 jh = commit_transaction->t_locked_list->b_tprev;
442 bh = jh2bh(jh);
443 get_bh(bh);
444 if (buffer_locked(bh)) {
445 spin_unlock(&journal->j_list_lock);
446 wait_on_buffer(bh);
447 if (unlikely(!buffer_uptodate(bh)))
448 err = -EIO;
449 spin_lock(&journal->j_list_lock);
450 }
451 if (!inverted_lock(journal, bh)) {
452 put_bh(bh);
453 spin_lock(&journal->j_list_lock);
454 continue;
455 }
456 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
457 __journal_unfile_buffer(jh);
458 jbd_unlock_bh_state(bh);
459 journal_remove_journal_head(bh);
460 put_bh(bh);
461 } else {
462 jbd_unlock_bh_state(bh);
463 }
464 put_bh(bh);
465 cond_resched_lock(&journal->j_list_lock);
466 }
467 spin_unlock(&journal->j_list_lock);
468
469 if (err)
Jan Kara7a266e72007-10-18 23:39:22 -0700470 journal_abort(journal, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471
472 journal_write_revoke_records(journal, commit_transaction);
473
474 jbd_debug(3, "JBD: commit phase 2\n");
475
476 /*
477 * If we found any dirty or locked buffers, then we should have
478 * looped back up to the write_out_data label. If there weren't
479 * any then journal_clean_data_list should have wiped the list
480 * clean by now, so check that it is in fact empty.
481 */
482 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
483
484 jbd_debug (3, "JBD: commit phase 3\n");
485
486 /*
487 * Way to go: we have now written out all of the data for a
488 * transaction! Now comes the tricky part: we need to write out
489 * metadata. Loop over the transaction's entire buffer list:
490 */
491 commit_transaction->t_state = T_COMMIT;
492
493 descriptor = NULL;
494 bufs = 0;
495 while (commit_transaction->t_buffers) {
496
497 /* Find the next buffer to be journaled... */
498
499 jh = commit_transaction->t_buffers;
500
501 /* If we're in abort mode, we just un-journal the buffer and
502 release it for background writing. */
503
504 if (is_journal_aborted(journal)) {
505 JBUFFER_TRACE(jh, "journal is aborting: refile");
506 journal_refile_buffer(journal, jh);
507 /* If that was the last one, we need to clean up
508 * any descriptor buffers which may have been
509 * already allocated, even if we are now
510 * aborting. */
511 if (!commit_transaction->t_buffers)
512 goto start_journal_io;
513 continue;
514 }
515
516 /* Make sure we have a descriptor block in which to
517 record the metadata buffer. */
518
519 if (!descriptor) {
520 struct buffer_head *bh;
521
522 J_ASSERT (bufs == 0);
523
524 jbd_debug(4, "JBD: get descriptor\n");
525
526 descriptor = journal_get_descriptor_buffer(journal);
527 if (!descriptor) {
Jan Kara7a266e72007-10-18 23:39:22 -0700528 journal_abort(journal, -EIO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700529 continue;
530 }
531
532 bh = jh2bh(descriptor);
533 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
534 (unsigned long long)bh->b_blocknr, bh->b_data);
535 header = (journal_header_t *)&bh->b_data[0];
536 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
537 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
538 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
539
540 tagp = &bh->b_data[sizeof(journal_header_t)];
541 space_left = bh->b_size - sizeof(journal_header_t);
542 first_tag = 1;
543 set_buffer_jwrite(bh);
544 set_buffer_dirty(bh);
545 wbuf[bufs++] = bh;
546
547 /* Record it so that we can wait for IO
548 completion later */
549 BUFFER_TRACE(bh, "ph3: file as descriptor");
550 journal_file_buffer(descriptor, commit_transaction,
551 BJ_LogCtl);
552 }
553
554 /* Where is the buffer to be written? */
555
556 err = journal_next_log_block(journal, &blocknr);
557 /* If the block mapping failed, just abandon the buffer
558 and repeat this loop: we'll fall into the
559 refile-on-abort condition above. */
560 if (err) {
Jan Kara7a266e72007-10-18 23:39:22 -0700561 journal_abort(journal, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700562 continue;
563 }
564
565 /*
566 * start_this_handle() uses t_outstanding_credits to determine
567 * the free space in the log, but this counter is changed
568 * by journal_next_log_block() also.
569 */
570 commit_transaction->t_outstanding_credits--;
571
572 /* Bump b_count to prevent truncate from stumbling over
573 the shadowed buffer! @@@ This can go if we ever get
574 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
575 atomic_inc(&jh2bh(jh)->b_count);
576
577 /* Make a temporary IO buffer with which to write it out
578 (this will requeue both the metadata buffer and the
579 temporary IO buffer). new_bh goes on BJ_IO*/
580
581 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
582 /*
583 * akpm: journal_write_metadata_buffer() sets
584 * new_bh->b_transaction to commit_transaction.
585 * We need to clean this up before we release new_bh
586 * (which is of type BJ_IO)
587 */
588 JBUFFER_TRACE(jh, "ph3: write metadata");
589 flags = journal_write_metadata_buffer(commit_transaction,
590 jh, &new_jh, blocknr);
591 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
592 wbuf[bufs++] = jh2bh(new_jh);
593
594 /* Record the new block's tag in the current descriptor
595 buffer */
596
597 tag_flag = 0;
598 if (flags & 1)
599 tag_flag |= JFS_FLAG_ESCAPE;
600 if (!first_tag)
601 tag_flag |= JFS_FLAG_SAME_UUID;
602
603 tag = (journal_block_tag_t *) tagp;
604 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
605 tag->t_flags = cpu_to_be32(tag_flag);
606 tagp += sizeof(journal_block_tag_t);
607 space_left -= sizeof(journal_block_tag_t);
608
609 if (first_tag) {
610 memcpy (tagp, journal->j_uuid, 16);
611 tagp += 16;
612 space_left -= 16;
613 first_tag = 0;
614 }
615
616 /* If there's no more to do, or if the descriptor is full,
617 let the IO rip! */
618
619 if (bufs == journal->j_wbufsize ||
620 commit_transaction->t_buffers == NULL ||
621 space_left < sizeof(journal_block_tag_t) + 16) {
622
623 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
624
625 /* Write an end-of-descriptor marker before
626 submitting the IOs. "tag" still points to
627 the last tag we set up. */
628
629 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
630
631start_journal_io:
632 for (i = 0; i < bufs; i++) {
633 struct buffer_head *bh = wbuf[i];
634 lock_buffer(bh);
635 clear_buffer_dirty(bh);
636 set_buffer_uptodate(bh);
637 bh->b_end_io = journal_end_buffer_io_sync;
638 submit_bh(WRITE, bh);
639 }
640 cond_resched();
641
642 /* Force a new descriptor to be generated next
643 time round the loop. */
644 descriptor = NULL;
645 bufs = 0;
646 }
647 }
648
649 /* Lo and behold: we have just managed to send a transaction to
650 the log. Before we can commit it, wait for the IO so far to
651 complete. Control buffers being written are on the
652 transaction's t_log_list queue, and metadata buffers are on
653 the t_iobuf_list queue.
654
655 Wait for the buffers in reverse order. That way we are
656 less likely to be woken up until all IOs have completed, and
657 so we incur less scheduling load.
658 */
659
660 jbd_debug(3, "JBD: commit phase 4\n");
661
662 /*
663 * akpm: these are BJ_IO, and j_list_lock is not needed.
664 * See __journal_try_to_free_buffer.
665 */
666wait_for_iobuf:
667 while (commit_transaction->t_iobuf_list != NULL) {
668 struct buffer_head *bh;
669
670 jh = commit_transaction->t_iobuf_list->b_tprev;
671 bh = jh2bh(jh);
672 if (buffer_locked(bh)) {
673 wait_on_buffer(bh);
674 goto wait_for_iobuf;
675 }
676 if (cond_resched())
677 goto wait_for_iobuf;
678
679 if (unlikely(!buffer_uptodate(bh)))
680 err = -EIO;
681
682 clear_buffer_jwrite(bh);
683
684 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
685 journal_unfile_buffer(journal, jh);
686
687 /*
688 * ->t_iobuf_list should contain only dummy buffer_heads
689 * which were created by journal_write_metadata_buffer().
690 */
691 BUFFER_TRACE(bh, "dumping temporary bh");
692 journal_put_journal_head(jh);
693 __brelse(bh);
694 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
695 free_buffer_head(bh);
696
697 /* We also have to unlock and free the corresponding
698 shadowed buffer */
699 jh = commit_transaction->t_shadow_list->b_tprev;
700 bh = jh2bh(jh);
701 clear_bit(BH_JWrite, &bh->b_state);
702 J_ASSERT_BH(bh, buffer_jbddirty(bh));
703
704 /* The metadata is now released for reuse, but we need
705 to remember it against this transaction so that when
706 we finally commit, we can do any checkpointing
707 required. */
708 JBUFFER_TRACE(jh, "file as BJ_Forget");
709 journal_file_buffer(jh, commit_transaction, BJ_Forget);
710 /* Wake up any transactions which were waiting for this
711 IO to complete */
712 wake_up_bit(&bh->b_state, BH_Unshadow);
713 JBUFFER_TRACE(jh, "brelse shadowed buffer");
714 __brelse(bh);
715 }
716
717 J_ASSERT (commit_transaction->t_shadow_list == NULL);
718
719 jbd_debug(3, "JBD: commit phase 5\n");
720
721 /* Here we wait for the revoke record and descriptor record buffers */
722 wait_for_ctlbuf:
723 while (commit_transaction->t_log_list != NULL) {
724 struct buffer_head *bh;
725
726 jh = commit_transaction->t_log_list->b_tprev;
727 bh = jh2bh(jh);
728 if (buffer_locked(bh)) {
729 wait_on_buffer(bh);
730 goto wait_for_ctlbuf;
731 }
732 if (cond_resched())
733 goto wait_for_ctlbuf;
734
735 if (unlikely(!buffer_uptodate(bh)))
736 err = -EIO;
737
738 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
739 clear_buffer_jwrite(bh);
740 journal_unfile_buffer(journal, jh);
741 journal_put_journal_head(jh);
742 __brelse(bh); /* One for getblk */
743 /* AKPM: bforget here */
744 }
745
746 jbd_debug(3, "JBD: commit phase 6\n");
747
748 if (journal_write_commit_record(journal, commit_transaction))
749 err = -EIO;
750
751 if (err)
Jan Kara7a266e72007-10-18 23:39:22 -0700752 journal_abort(journal, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700753
754 /* End of a transaction! Finally, we can do checkpoint
755 processing: any buffers committed as a result of this
756 transaction can be removed from any checkpoint list it was on
757 before. */
758
759 jbd_debug(3, "JBD: commit phase 7\n");
760
761 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
762 J_ASSERT(commit_transaction->t_buffers == NULL);
763 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
764 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
765 J_ASSERT(commit_transaction->t_shadow_list == NULL);
766 J_ASSERT(commit_transaction->t_log_list == NULL);
767
768restart_loop:
Jan Karae6c9f5c2005-09-06 15:19:09 -0700769 /*
770 * As there are other places (journal_unmap_buffer()) adding buffers
771 * to this list we have to be careful and hold the j_list_lock.
772 */
773 spin_lock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700774 while (commit_transaction->t_forget) {
775 transaction_t *cp_transaction;
776 struct buffer_head *bh;
777
778 jh = commit_transaction->t_forget;
Jan Karae6c9f5c2005-09-06 15:19:09 -0700779 spin_unlock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700780 bh = jh2bh(jh);
781 jbd_lock_bh_state(bh);
782 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
783 jh->b_transaction == journal->j_running_transaction);
784
785 /*
786 * If there is undo-protected committed data against
787 * this buffer, then we can remove it now. If it is a
788 * buffer needing such protection, the old frozen_data
789 * field now points to a committed version of the
790 * buffer, so rotate that field to the new committed
791 * data.
792 *
793 * Otherwise, we can just throw away the frozen data now.
794 */
795 if (jh->b_committed_data) {
Mingming Caoc089d492007-10-16 18:38:25 -0400796 jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700797 jh->b_committed_data = NULL;
798 if (jh->b_frozen_data) {
799 jh->b_committed_data = jh->b_frozen_data;
800 jh->b_frozen_data = NULL;
801 }
802 } else if (jh->b_frozen_data) {
Mingming Caoc089d492007-10-16 18:38:25 -0400803 jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700804 jh->b_frozen_data = NULL;
805 }
806
807 spin_lock(&journal->j_list_lock);
808 cp_transaction = jh->b_cp_transaction;
809 if (cp_transaction) {
810 JBUFFER_TRACE(jh, "remove from old cp transaction");
811 __journal_remove_checkpoint(jh);
812 }
813
814 /* Only re-checkpoint the buffer_head if it is marked
815 * dirty. If the buffer was added to the BJ_Forget list
816 * by journal_forget, it may no longer be dirty and
817 * there's no point in keeping a checkpoint record for
818 * it. */
819
820 /* A buffer which has been freed while still being
821 * journaled by a previous transaction may end up still
822 * being dirty here, but we want to avoid writing back
823 * that buffer in the future now that the last use has
824 * been committed. That's not only a performance gain,
825 * it also stops aliasing problems if the buffer is left
826 * behind for writeback and gets reallocated for another
827 * use in a different page. */
828 if (buffer_freed(bh)) {
829 clear_buffer_freed(bh);
830 clear_buffer_jbddirty(bh);
831 }
832
833 if (buffer_jbddirty(bh)) {
834 JBUFFER_TRACE(jh, "add to new checkpointing trans");
835 __journal_insert_checkpoint(jh, commit_transaction);
836 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
837 __journal_refile_buffer(jh);
838 jbd_unlock_bh_state(bh);
839 } else {
840 J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara9ada7342006-06-23 02:05:25 -0700841 /* The buffer on BJ_Forget list and not jbddirty means
842 * it has been freed by this transaction and hence it
843 * could not have been reallocated until this
844 * transaction has committed. *BUT* it could be
845 * reallocated once we have written all the data to
846 * disk and before we process the buffer on BJ_Forget
847 * list. */
848 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
849 __journal_refile_buffer(jh);
850 if (!jh->b_transaction) {
851 jbd_unlock_bh_state(bh);
852 /* needs a brelse */
853 journal_remove_journal_head(bh);
854 release_buffer_page(bh);
855 } else
856 jbd_unlock_bh_state(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700857 }
Jan Karae6c9f5c2005-09-06 15:19:09 -0700858 cond_resched_lock(&journal->j_list_lock);
859 }
860 spin_unlock(&journal->j_list_lock);
861 /*
Jan Karad4beaf42007-12-04 23:45:27 -0800862 * This is a bit sleazy. We use j_list_lock to protect transition
863 * of a transaction into T_FINISHED state and calling
864 * __journal_drop_transaction(). Otherwise we could race with
865 * other checkpointing code processing the transaction...
Jan Karae6c9f5c2005-09-06 15:19:09 -0700866 */
867 spin_lock(&journal->j_state_lock);
868 spin_lock(&journal->j_list_lock);
869 /*
870 * Now recheck if some buffers did not get attached to the transaction
871 * while the lock was dropped...
872 */
873 if (commit_transaction->t_forget) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700874 spin_unlock(&journal->j_list_lock);
Jan Karae6c9f5c2005-09-06 15:19:09 -0700875 spin_unlock(&journal->j_state_lock);
876 goto restart_loop;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700877 }
878
879 /* Done with this transaction! */
880
881 jbd_debug(3, "JBD: commit phase 8\n");
882
883 J_ASSERT(commit_transaction->t_state == T_COMMIT);
884
Linus Torvalds1da177e2005-04-16 15:20:36 -0700885 commit_transaction->t_state = T_FINISHED;
886 J_ASSERT(commit_transaction == journal->j_committing_transaction);
887 journal->j_commit_sequence = commit_transaction->t_tid;
888 journal->j_committing_transaction = NULL;
889 spin_unlock(&journal->j_state_lock);
890
Jan Karafe28e422007-07-15 23:37:18 -0700891 if (commit_transaction->t_checkpoint_list == NULL &&
892 commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700893 __journal_drop_transaction(journal, commit_transaction);
894 } else {
895 if (journal->j_checkpoint_transactions == NULL) {
896 journal->j_checkpoint_transactions = commit_transaction;
897 commit_transaction->t_cpnext = commit_transaction;
898 commit_transaction->t_cpprev = commit_transaction;
899 } else {
900 commit_transaction->t_cpnext =
901 journal->j_checkpoint_transactions;
902 commit_transaction->t_cpprev =
903 commit_transaction->t_cpnext->t_cpprev;
904 commit_transaction->t_cpnext->t_cpprev =
905 commit_transaction;
906 commit_transaction->t_cpprev->t_cpnext =
907 commit_transaction;
908 }
909 }
910 spin_unlock(&journal->j_list_lock);
911
912 jbd_debug(1, "JBD: commit %d complete, head %d\n",
913 journal->j_commit_sequence, journal->j_tail_sequence);
914
915 wake_up(&journal->j_wait_done_commit);
916}