/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/slab.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/smp_lock.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

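/*
 * (Note: unlike end_buffer_write_sync(), this handler does not drop a
 * buffer_head reference; the commit code keeps its own references and
 * releases them itself once the IO has been waited on.)
 */
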
/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on the
 * LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the VM
 * accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under lock_journal(), and possibly under journal_datalist_lock.  The
 * caller provided us with a ref against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (TestSetPageLocked(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}

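/*
 * A typical caller retries with the locks taken in the blocking order,
 * e.g. (this is the pattern used in journal_submit_data_buffers() below):
 *
 *	if (!inverted_lock(journal, bh)) {
 *		jbd_lock_bh_state(bh);
 *		spin_lock(&journal->j_list_lock);
 *	}
 */
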
/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	int i, ret;
	int barrier_done = 0;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	/* Stamp a copy of the commit header into each 512-byte sector
	 * of the block. */
	for (i = 0; i < bh->b_size; i += 512) {
		journal_header_t *tmp = (journal_header_t *)(bh->b_data + i);
		tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
		tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
		tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
	}

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);
	if (journal->j_flags & JFS_BARRIER) {
		set_buffer_ordered(bh);
		barrier_done = 1;
	}
	ret = sync_dirty_buffer(bh);
	/* is it possible for another commit to fail at roughly
	 * the same time as this one?  If so, we don't want to
	 * trust the barrier flag in the super, but instead want
	 * to remember if we sent a barrier request
	 */
	if (ret == -EOPNOTSUPP && barrier_done) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: barrier-based sync failed on %s - "
			"disabling barriers\n",
			bdevname(journal->j_dev, b));
		spin_lock(&journal->j_state_lock);
		journal->j_flags &= ~JFS_BARRIER;
		spin_unlock(&journal->j_state_lock);

		/* And try again, without the barrier */
		clear_buffer_ordered(bh);
		set_buffer_uptodate(bh);
		set_buffer_dirty(bh);
		ret = sync_dirty_buffer(bh);
	}
	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}

static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use up our safety reference in submit_bh() */
		submit_bh(WRITE, wbuf[i]);
	}
}

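/*
 * (Callers drop j_list_lock before calling this: submit_bh() may block
 * while allocating a bio, so no spinlocks may be held here.)
 */
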
/*
 * Submit all the data buffers to disk
 */
static void journal_submit_data_buffers(journal_t *journal,
				transaction_t *commit_transaction)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock.  We try to lock the buffer without
		 * blocking.  If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (test_set_buffer_locked(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock.  Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh)
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			put_bh(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				journal_do_submit_data(wbuf, bufs);
				bufs = 0;
				goto write_out_data;
			}
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			journal_remove_journal_head(bh);
			/* Once for our safety reference, once for
			 * journal_remove_journal_head() */
			put_bh(bh);
			put_bh(bh);
		}

		if (lock_need_resched(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	journal_do_submit_data(wbuf, bufs);
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 * It runs through the numbered phases traced by the jbd_debug() calls
 * below: lock down the transaction, flush the data buffers, write the
 * metadata and descriptor blocks, wait for that IO, write the commit
 * record, and finally move surviving buffers to the checkpoint lists.
 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned long blocknr;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

#ifdef COMMIT_STATS
	spin_lock(&journal->j_list_lock);
	summarise_journal_usage(journal);
	spin_unlock(&journal->j_list_lock);
#endif

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		journal_update_superblock(journal, 1);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	jbd_debug(1, "JBD: starting commit of transaction %d\n",
			commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

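	/*
	 * Wait until there are no handles still running against this
	 * transaction.  The prepare_to_wait()/recheck/schedule() sequence
	 * avoids a lost wakeup: journal_stop() wakes j_wait_updates when
	 * it drops t_updates to zero.
	 */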
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
					TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT (commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug (3, "JBD: commit phase 1\n");

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug (3, "JBD: commit phase 2\n");

	/*
	 * First, drop modified flag: all accesses to the buffers
	 * will be tracked for a new transaction only -bzzz
	 */
	spin_lock(&journal->j_list_lock);
	if (commit_transaction->t_buffers) {
		new_jh = jh = commit_transaction->t_buffers->b_tnext;
		do {
			J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
					new_jh->b_modified == 0);
			new_jh->b_modified = 0;
			new_jh = new_jh->b_tnext;
		} while (new_jh != jh);
	}
	spin_unlock(&journal->j_list_lock);

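	/*
	 * (b_modified is the "credits already charged" flag consulted by
	 * journal_dirty_metadata(); clearing it here means the next
	 * transaction will account for each of these buffers afresh.)
	 */
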
	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	err = 0;
	journal_submit_data_buffers(journal, commit_transaction);

	/*
	 * Wait for all previously submitted IO to complete.  The buffers
	 * we are waiting on were filed onto BJ_Locked (t_locked_list) by
	 * journal_submit_data_buffers() above.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			spin_lock(&journal->j_list_lock);
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			journal_remove_journal_head(bh);
			put_bh(bh);
		} else {
			jbd_unlock_bh_state(bh);
		}
		put_bh(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err)
		__journal_abort_hard(journal);

	journal_write_revoke_records(journal, commit_transaction);

	jbd_debug(3, "JBD: commit phase 2\n");

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT (commit_transaction->t_sync_datalist == NULL);

	jbd_debug (3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	commit_transaction->t_state = T_COMMIT;

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it for background writing. */

		if (is_journal_aborted(journal)) {
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT (bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				__journal_abort_hard(journal);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			__journal_abort_hard(journal);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		atomic_inc(&jh2bh(jh)->b_count);

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer).  new_bh goes on BJ_IO */

		set_bit(BH_JWrite, &jh2bh(jh)->b_state);
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy (tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

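		/*
		 * On-disk layout of the descriptor block built up above:
		 * a journal_header_t, then the first tag followed by the
		 * 16-byte journal UUID, then further tags (each carrying
		 * JFS_FLAG_SAME_UUID, and the last one JFS_FLAG_LAST_TAG).
		 */
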
		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(WRITE, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_bit(BH_JWrite, &bh->b_state);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/* Wake up any transactions which were waiting for this
		   IO to complete */
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT (commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
 wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	jbd_debug(3, "JBD: commit phase 6\n");

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		__journal_abort_hard(journal);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_slab_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_slab_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future now that the last use has
		 * been committed.  That's not only a performance gain,
		 * it also stops aliasing problems if the buffer is left
		 * behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh)) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			JBUFFER_TRACE(jh, "refile for checkpoint writeback");
			__journal_refile_buffer(jh);
			jbd_unlock_bh_state(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/* A buffer on the BJ_Forget list that is not
			 * jbddirty means it has been freed by this
			 * transaction and hence it could not have been
			 * reallocated until this transaction has
			 * committed.  *BUT* it could be reallocated once
			 * we have written all the data to disk and
			 * before we process the buffer on the BJ_Forget
			 * list. */
			JBUFFER_TRACE(jh, "refile or unfile freed buffer");
			__journal_refile_buffer(jh);
			if (!jh->b_transaction) {
				jbd_unlock_bh_state(bh);
				/* needs a brelse */
				journal_remove_journal_head(bh);
				release_buffer_page(bh);
			} else
				jbd_unlock_bh_state(bh);
		}
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We borrow j_list_lock to protect
	 * journal->j_committing_transaction in __journal_remove_checkpoint.
	 * Really, __journal_remove_checkpoint should be using j_state_lock,
	 * but it's a bit of a hassle to hold that across
	 * __journal_remove_checkpoint.
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	spin_unlock(&journal->j_state_lock);

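	/*
	 * If the transaction still has buffers to checkpoint, link it
	 * into the journal's circular list of checkpoint transactions;
	 * otherwise it can be dropped right away.
	 */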
	if (commit_transaction->t_checkpoint_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}