blob: f8077b9c898160513b1e958a532401653ac96123 [file] [log] [blame]
/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
18#include <linux/jbd.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
Theodore Ts'o512a0042009-03-27 22:14:27 -040023#include <linux/bio.h>
Linus Torvalds1da177e2005-04-16 15:20:36 -070024
/*
 * I/O completion callback used for the temporary BJ_IO buffer_heads.
 *
 * @bh:       buffer whose I/O has finished
 * @uptodate: non-zero if the I/O succeeded
 *
 * The outcome of the I/O is recorded in the buffer's uptodate bit and the
 * buffer is then unlocked.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);
	unlock_buffer(bh);
}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -070040 * not successfully freed, because they are attached to a committing transaction.
Linus Torvalds1da177e2005-04-16 15:20:36 -070041 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -070049 * Called under journal->j_list_lock. The caller provided us with a ref
50 * against the buffer, and we drop that here.
Linus Torvalds1da177e2005-04-16 15:20:36 -070051 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
Nick Piggin529ae9a2008-08-02 12:01:03 +020067 if (!trylock_page(page))
Linus Torvalds1da177e2005-04-16 15:20:36 -070068 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
/*
 * Drop one reference on a data buffer.  When the buffer carries the
 * 'BH_Freed' mark, the mark is cleared and release_buffer_page() is used so
 * that the buffer and, if possible, the page it belongs to are released.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (!buffer_freed(bh)) {
		put_bh(bh);
		return;
	}

	clear_buffer_freed(bh);
	release_buffer_page(bh);
}
93
94/*
Linus Torvalds1da177e2005-04-16 15:20:36 -070095 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
96 * held. For ranking reasons we must trylock. If we lose, schedule away and
97 * return 0. j_list_lock is dropped in this case.
98 */
99static int inverted_lock(journal_t *journal, struct buffer_head *bh)
100{
101 if (!jbd_trylock_bh_state(bh)) {
102 spin_unlock(&journal->j_list_lock);
103 schedule();
104 return 0;
105 }
106 return 1;
107}
108
109/* Done it all: now write the commit record. We should have
110 * cleaned up our previous buffers by now, so if we are in abort
111 * mode we can now just skip the rest of the journal write
112 * entirely.
113 *
114 * Returns 1 if the journal needs to be aborted or 0 on success
115 */
116static int journal_write_commit_record(journal_t *journal,
117 transaction_t *commit_transaction)
118{
119 struct journal_head *descriptor;
120 struct buffer_head *bh;
Jan Kara53152172008-02-01 08:26:46 -0500121 journal_header_t *header;
122 int ret;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700123 int barrier_done = 0;
124
125 if (is_journal_aborted(journal))
126 return 0;
127
128 descriptor = journal_get_descriptor_buffer(journal);
129 if (!descriptor)
130 return 1;
131
132 bh = jh2bh(descriptor);
133
Jan Kara53152172008-02-01 08:26:46 -0500134 header = (journal_header_t *)(bh->b_data);
135 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
136 header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
137 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700138
139 JBUFFER_TRACE(descriptor, "write commit block");
140 set_buffer_dirty(bh);
141 if (journal->j_flags & JFS_BARRIER) {
142 set_buffer_ordered(bh);
143 barrier_done = 1;
144 }
145 ret = sync_dirty_buffer(bh);
Neil Brown28ae0942008-02-08 04:22:13 -0800146 if (barrier_done)
147 clear_buffer_ordered(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700148 /* is it possible for another commit to fail at roughly
149 * the same time as this one? If so, we don't want to
150 * trust the barrier flag in the super, but instead want
151 * to remember if we sent a barrier request
152 */
153 if (ret == -EOPNOTSUPP && barrier_done) {
154 char b[BDEVNAME_SIZE];
155
156 printk(KERN_WARNING
157 "JBD: barrier-based sync failed on %s - "
158 "disabling barriers\n",
159 bdevname(journal->j_dev, b));
160 spin_lock(&journal->j_state_lock);
161 journal->j_flags &= ~JFS_BARRIER;
162 spin_unlock(&journal->j_state_lock);
163
164 /* And try again, without the barrier */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700165 set_buffer_uptodate(bh);
166 set_buffer_dirty(bh);
167 ret = sync_dirty_buffer(bh);
168 }
169 put_bh(bh); /* One for getblk() */
170 journal_put_journal_head(descriptor);
171
172 return (ret == -EIO);
173}
174
Theodore Ts'o512a0042009-03-27 22:14:27 -0400175static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
176 int write_op)
Jan Kara3998b932006-09-25 23:30:53 -0700177{
178 int i;
179
180 for (i = 0; i < bufs; i++) {
181 wbuf[i]->b_end_io = end_buffer_write_sync;
182 /* We use-up our safety reference in submit_bh() */
Theodore Ts'o512a0042009-03-27 22:14:27 -0400183 submit_bh(write_op, wbuf[i]);
Jan Kara3998b932006-09-25 23:30:53 -0700184 }
185}
186
187/*
188 * Submit all the data buffers to disk
189 */
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700190static int journal_submit_data_buffers(journal_t *journal,
Theodore Ts'o512a0042009-03-27 22:14:27 -0400191 transaction_t *commit_transaction,
192 int write_op)
Jan Kara3998b932006-09-25 23:30:53 -0700193{
194 struct journal_head *jh;
195 struct buffer_head *bh;
196 int locked;
197 int bufs = 0;
198 struct buffer_head **wbuf = journal->j_wbuf;
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700199 int err = 0;
Jan Kara3998b932006-09-25 23:30:53 -0700200
201 /*
202 * Whenever we unlock the journal and sleep, things can get added
203 * onto ->t_sync_datalist, so we have to keep looping back to
204 * write_out_data until we *know* that the list is empty.
205 *
206 * Cleanup any flushed data buffers from the data list. Even in
207 * abort mode, we want to flush this out as soon as possible.
208 */
209write_out_data:
210 cond_resched();
211 spin_lock(&journal->j_list_lock);
212
213 while (commit_transaction->t_sync_datalist) {
214 jh = commit_transaction->t_sync_datalist;
215 bh = jh2bh(jh);
216 locked = 0;
217
218 /* Get reference just to make sure buffer does not disappear
219 * when we are forced to drop various locks */
220 get_bh(bh);
221 /* If the buffer is dirty, we need to submit IO and hence
222 * we need the buffer lock. We try to lock the buffer without
223 * blocking. If we fail, we need to drop j_list_lock and do
224 * blocking lock_buffer().
225 */
226 if (buffer_dirty(bh)) {
Nick Pigginca5de402008-08-02 12:02:13 +0200227 if (!trylock_buffer(bh)) {
Jan Kara3998b932006-09-25 23:30:53 -0700228 BUFFER_TRACE(bh, "needs blocking lock");
229 spin_unlock(&journal->j_list_lock);
230 /* Write out all data to prevent deadlocks */
Theodore Ts'o512a0042009-03-27 22:14:27 -0400231 journal_do_submit_data(wbuf, bufs, write_op);
Jan Kara3998b932006-09-25 23:30:53 -0700232 bufs = 0;
233 lock_buffer(bh);
234 spin_lock(&journal->j_list_lock);
235 }
236 locked = 1;
237 }
238 /* We have to get bh_state lock. Again out of order, sigh. */
239 if (!inverted_lock(journal, bh)) {
240 jbd_lock_bh_state(bh);
241 spin_lock(&journal->j_list_lock);
242 }
243 /* Someone already cleaned up the buffer? */
244 if (!buffer_jbd(bh)
245 || jh->b_transaction != commit_transaction
246 || jh->b_jlist != BJ_SyncData) {
247 jbd_unlock_bh_state(bh);
248 if (locked)
249 unlock_buffer(bh);
250 BUFFER_TRACE(bh, "already cleaned up");
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -0700251 release_data_buffer(bh);
Jan Kara3998b932006-09-25 23:30:53 -0700252 continue;
253 }
254 if (locked && test_clear_buffer_dirty(bh)) {
255 BUFFER_TRACE(bh, "needs writeout, adding to array");
256 wbuf[bufs++] = bh;
257 __journal_file_buffer(jh, commit_transaction,
258 BJ_Locked);
259 jbd_unlock_bh_state(bh);
260 if (bufs == journal->j_wbufsize) {
261 spin_unlock(&journal->j_list_lock);
Theodore Ts'o512a0042009-03-27 22:14:27 -0400262 journal_do_submit_data(wbuf, bufs, write_op);
Jan Kara3998b932006-09-25 23:30:53 -0700263 bufs = 0;
264 goto write_out_data;
265 }
Hisashi Hifumi6f5a9da2006-12-22 01:11:50 -0800266 } else if (!locked && buffer_locked(bh)) {
267 __journal_file_buffer(jh, commit_transaction,
268 BJ_Locked);
269 jbd_unlock_bh_state(bh);
270 put_bh(bh);
271 } else {
Jan Kara3998b932006-09-25 23:30:53 -0700272 BUFFER_TRACE(bh, "writeout complete: unfile");
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700273 if (unlikely(!buffer_uptodate(bh)))
274 err = -EIO;
Jan Kara3998b932006-09-25 23:30:53 -0700275 __journal_unfile_buffer(jh);
276 jbd_unlock_bh_state(bh);
277 if (locked)
278 unlock_buffer(bh);
279 journal_remove_journal_head(bh);
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -0700280 /* One for our safety reference, other for
Jan Kara3998b932006-09-25 23:30:53 -0700281 * journal_remove_journal_head() */
282 put_bh(bh);
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -0700283 release_data_buffer(bh);
Jan Kara3998b932006-09-25 23:30:53 -0700284 }
285
Nick Piggin95c354f2008-01-30 13:31:20 +0100286 if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
Jan Kara3998b932006-09-25 23:30:53 -0700287 spin_unlock(&journal->j_list_lock);
288 goto write_out_data;
289 }
290 }
291 spin_unlock(&journal->j_list_lock);
Theodore Ts'o512a0042009-03-27 22:14:27 -0400292 journal_do_submit_data(wbuf, bufs, write_op);
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700293
294 return err;
Jan Kara3998b932006-09-25 23:30:53 -0700295}
296
Linus Torvalds1da177e2005-04-16 15:20:36 -0700297/*
298 * journal_commit_transaction
299 *
300 * The primary function for committing a transaction to the log. This
301 * function is called by the journal thread to begin a complete commit.
302 */
303void journal_commit_transaction(journal_t *journal)
304{
305 transaction_t *commit_transaction;
306 struct journal_head *jh, *new_jh, *descriptor;
307 struct buffer_head **wbuf = journal->j_wbuf;
308 int bufs;
309 int flags;
310 int err;
311 unsigned long blocknr;
Josef Bacikf420d4d2009-01-07 18:07:24 -0800312 ktime_t start_time;
313 u64 commit_time;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700314 char *tagp = NULL;
315 journal_header_t *header;
316 journal_block_tag_t *tag = NULL;
317 int space_left = 0;
318 int first_tag = 0;
319 int tag_flag;
320 int i;
Theodore Ts'o512a0042009-03-27 22:14:27 -0400321 int write_op = WRITE;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700322
323 /*
324 * First job: lock down the current transaction and wait for
325 * all outstanding updates to complete.
326 */
327
328#ifdef COMMIT_STATS
329 spin_lock(&journal->j_list_lock);
330 summarise_journal_usage(journal);
331 spin_unlock(&journal->j_list_lock);
332#endif
333
334 /* Do we need to erase the effects of a prior journal_flush? */
335 if (journal->j_flags & JFS_FLUSHED) {
336 jbd_debug(3, "super block updated\n");
337 journal_update_superblock(journal, 1);
338 } else {
339 jbd_debug(3, "superblock not updated\n");
340 }
341
342 J_ASSERT(journal->j_running_transaction != NULL);
343 J_ASSERT(journal->j_committing_transaction == NULL);
344
345 commit_transaction = journal->j_running_transaction;
346 J_ASSERT(commit_transaction->t_state == T_RUNNING);
347
348 jbd_debug(1, "JBD: starting commit of transaction %d\n",
349 commit_transaction->t_tid);
350
351 spin_lock(&journal->j_state_lock);
352 commit_transaction->t_state = T_LOCKED;
353
Theodore Ts'o512a0042009-03-27 22:14:27 -0400354 if (commit_transaction->t_synchronous_commit)
355 write_op = WRITE_SYNC;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700356 spin_lock(&commit_transaction->t_handle_lock);
357 while (commit_transaction->t_updates) {
358 DEFINE_WAIT(wait);
359
360 prepare_to_wait(&journal->j_wait_updates, &wait,
361 TASK_UNINTERRUPTIBLE);
362 if (commit_transaction->t_updates) {
363 spin_unlock(&commit_transaction->t_handle_lock);
364 spin_unlock(&journal->j_state_lock);
365 schedule();
366 spin_lock(&journal->j_state_lock);
367 spin_lock(&commit_transaction->t_handle_lock);
368 }
369 finish_wait(&journal->j_wait_updates, &wait);
370 }
371 spin_unlock(&commit_transaction->t_handle_lock);
372
373 J_ASSERT (commit_transaction->t_outstanding_credits <=
374 journal->j_max_transaction_buffers);
375
376 /*
377 * First thing we are allowed to do is to discard any remaining
378 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
379 * that there are no such buffers: if a large filesystem
380 * operation like a truncate needs to split itself over multiple
381 * transactions, then it may try to do a journal_restart() while
382 * there are still BJ_Reserved buffers outstanding. These must
383 * be released cleanly from the current transaction.
384 *
385 * In this case, the filesystem must still reserve write access
386 * again before modifying the buffer in the new transaction, but
387 * we do not require it to remember exactly which old buffers it
388 * has reserved. This is consistent with the existing behaviour
389 * that multiple journal_get_write_access() calls to the same
390 * buffer are perfectly permissable.
391 */
392 while (commit_transaction->t_reserved_list) {
393 jh = commit_transaction->t_reserved_list;
394 JBUFFER_TRACE(jh, "reserved, unused: refile");
395 /*
396 * A journal_get_undo_access()+journal_release_buffer() may
397 * leave undo-committed data.
398 */
399 if (jh->b_committed_data) {
400 struct buffer_head *bh = jh2bh(jh);
401
402 jbd_lock_bh_state(bh);
Mingming Caoc089d492007-10-16 18:38:25 -0400403 jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhlf99d49a2005-11-07 01:01:34 -0800404 jh->b_committed_data = NULL;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700405 jbd_unlock_bh_state(bh);
406 }
407 journal_refile_buffer(journal, jh);
408 }
409
410 /*
411 * Now try to drop any written-back buffers from the journal's
412 * checkpoint lists. We do this *before* commit because it potentially
413 * frees some memory
414 */
415 spin_lock(&journal->j_list_lock);
416 __journal_clean_checkpoint_list(journal);
417 spin_unlock(&journal->j_list_lock);
418
419 jbd_debug (3, "JBD: commit phase 1\n");
420
421 /*
422 * Switch to a new revoke table.
423 */
424 journal_switch_revoke_table(journal);
425
426 commit_transaction->t_state = T_FLUSH;
427 journal->j_committing_transaction = commit_transaction;
428 journal->j_running_transaction = NULL;
Josef Bacikf420d4d2009-01-07 18:07:24 -0800429 start_time = ktime_get();
Linus Torvalds1da177e2005-04-16 15:20:36 -0700430 commit_transaction->t_log_start = journal->j_head;
431 wake_up(&journal->j_wait_transaction_locked);
432 spin_unlock(&journal->j_state_lock);
433
434 jbd_debug (3, "JBD: commit phase 2\n");
435
436 /*
Linus Torvalds1da177e2005-04-16 15:20:36 -0700437 * Now start flushing things to disk, in the order they appear
438 * on the transaction lists. Data blocks go first.
439 */
Theodore Ts'o512a0042009-03-27 22:14:27 -0400440 err = journal_submit_data_buffers(journal, commit_transaction,
441 write_op);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700442
443 /*
444 * Wait for all previously submitted IO to complete.
445 */
Jan Kara3998b932006-09-25 23:30:53 -0700446 spin_lock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700447 while (commit_transaction->t_locked_list) {
448 struct buffer_head *bh;
449
450 jh = commit_transaction->t_locked_list->b_tprev;
451 bh = jh2bh(jh);
452 get_bh(bh);
453 if (buffer_locked(bh)) {
454 spin_unlock(&journal->j_list_lock);
455 wait_on_buffer(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700456 spin_lock(&journal->j_list_lock);
457 }
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700458 if (unlikely(!buffer_uptodate(bh))) {
Nick Piggin529ae9a2008-08-02 12:01:03 +0200459 if (!trylock_page(bh->b_page)) {
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700460 spin_unlock(&journal->j_list_lock);
461 lock_page(bh->b_page);
462 spin_lock(&journal->j_list_lock);
463 }
464 if (bh->b_page->mapping)
465 set_bit(AS_EIO, &bh->b_page->mapping->flags);
466
467 unlock_page(bh->b_page);
468 SetPageError(bh->b_page);
469 err = -EIO;
470 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700471 if (!inverted_lock(journal, bh)) {
472 put_bh(bh);
473 spin_lock(&journal->j_list_lock);
474 continue;
475 }
476 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
477 __journal_unfile_buffer(jh);
478 jbd_unlock_bh_state(bh);
479 journal_remove_journal_head(bh);
480 put_bh(bh);
481 } else {
482 jbd_unlock_bh_state(bh);
483 }
Toshiyuki Okajimafc80c442008-07-25 01:46:29 -0700484 release_data_buffer(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700485 cond_resched_lock(&journal->j_list_lock);
486 }
487 spin_unlock(&journal->j_list_lock);
488
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700489 if (err) {
490 char b[BDEVNAME_SIZE];
491
492 printk(KERN_WARNING
493 "JBD: Detected IO errors while flushing file data "
494 "on %s\n", bdevname(journal->j_fs_dev, b));
Hidehiro Kawai0e4fb5e2008-10-18 20:27:57 -0700495 if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
496 journal_abort(journal, err);
Hidehiro Kawaicbe5f462008-07-25 01:46:30 -0700497 err = 0;
498 }
Linus Torvalds1da177e2005-04-16 15:20:36 -0700499
500 journal_write_revoke_records(journal, commit_transaction);
501
Linus Torvalds1da177e2005-04-16 15:20:36 -0700502 /*
503 * If we found any dirty or locked buffers, then we should have
504 * looped back up to the write_out_data label. If there weren't
505 * any then journal_clean_data_list should have wiped the list
506 * clean by now, so check that it is in fact empty.
507 */
508 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
509
510 jbd_debug (3, "JBD: commit phase 3\n");
511
512 /*
513 * Way to go: we have now written out all of the data for a
514 * transaction! Now comes the tricky part: we need to write out
515 * metadata. Loop over the transaction's entire buffer list:
516 */
Mingming Cao772279c2008-05-14 16:05:41 -0700517 spin_lock(&journal->j_state_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700518 commit_transaction->t_state = T_COMMIT;
Mingming Cao772279c2008-05-14 16:05:41 -0700519 spin_unlock(&journal->j_state_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700520
Josef Bacik5b9a4992008-04-28 02:16:12 -0700521 J_ASSERT(commit_transaction->t_nr_buffers <=
522 commit_transaction->t_outstanding_credits);
523
Linus Torvalds1da177e2005-04-16 15:20:36 -0700524 descriptor = NULL;
525 bufs = 0;
526 while (commit_transaction->t_buffers) {
527
528 /* Find the next buffer to be journaled... */
529
530 jh = commit_transaction->t_buffers;
531
532 /* If we're in abort mode, we just un-journal the buffer and
Hidehiro Kawai885e3532008-10-18 20:27:54 -0700533 release it. */
Linus Torvalds1da177e2005-04-16 15:20:36 -0700534
535 if (is_journal_aborted(journal)) {
Hidehiro Kawai885e3532008-10-18 20:27:54 -0700536 clear_buffer_jbddirty(jh2bh(jh));
Linus Torvalds1da177e2005-04-16 15:20:36 -0700537 JBUFFER_TRACE(jh, "journal is aborting: refile");
538 journal_refile_buffer(journal, jh);
539 /* If that was the last one, we need to clean up
540 * any descriptor buffers which may have been
541 * already allocated, even if we are now
542 * aborting. */
543 if (!commit_transaction->t_buffers)
544 goto start_journal_io;
545 continue;
546 }
547
548 /* Make sure we have a descriptor block in which to
549 record the metadata buffer. */
550
551 if (!descriptor) {
552 struct buffer_head *bh;
553
554 J_ASSERT (bufs == 0);
555
556 jbd_debug(4, "JBD: get descriptor\n");
557
558 descriptor = journal_get_descriptor_buffer(journal);
559 if (!descriptor) {
Jan Kara7a266e72007-10-18 23:39:22 -0700560 journal_abort(journal, -EIO);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700561 continue;
562 }
563
564 bh = jh2bh(descriptor);
565 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
566 (unsigned long long)bh->b_blocknr, bh->b_data);
567 header = (journal_header_t *)&bh->b_data[0];
568 header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
569 header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
570 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
571
572 tagp = &bh->b_data[sizeof(journal_header_t)];
573 space_left = bh->b_size - sizeof(journal_header_t);
574 first_tag = 1;
575 set_buffer_jwrite(bh);
576 set_buffer_dirty(bh);
577 wbuf[bufs++] = bh;
578
579 /* Record it so that we can wait for IO
580 completion later */
581 BUFFER_TRACE(bh, "ph3: file as descriptor");
582 journal_file_buffer(descriptor, commit_transaction,
583 BJ_LogCtl);
584 }
585
586 /* Where is the buffer to be written? */
587
588 err = journal_next_log_block(journal, &blocknr);
589 /* If the block mapping failed, just abandon the buffer
590 and repeat this loop: we'll fall into the
591 refile-on-abort condition above. */
592 if (err) {
Jan Kara7a266e72007-10-18 23:39:22 -0700593 journal_abort(journal, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700594 continue;
595 }
596
597 /*
598 * start_this_handle() uses t_outstanding_credits to determine
599 * the free space in the log, but this counter is changed
600 * by journal_next_log_block() also.
601 */
602 commit_transaction->t_outstanding_credits--;
603
604 /* Bump b_count to prevent truncate from stumbling over
605 the shadowed buffer! @@@ This can go if we ever get
606 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
607 atomic_inc(&jh2bh(jh)->b_count);
608
609 /* Make a temporary IO buffer with which to write it out
610 (this will requeue both the metadata buffer and the
611 temporary IO buffer). new_bh goes on BJ_IO*/
612
613 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
614 /*
615 * akpm: journal_write_metadata_buffer() sets
616 * new_bh->b_transaction to commit_transaction.
617 * We need to clean this up before we release new_bh
618 * (which is of type BJ_IO)
619 */
620 JBUFFER_TRACE(jh, "ph3: write metadata");
621 flags = journal_write_metadata_buffer(commit_transaction,
622 jh, &new_jh, blocknr);
623 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
624 wbuf[bufs++] = jh2bh(new_jh);
625
626 /* Record the new block's tag in the current descriptor
627 buffer */
628
629 tag_flag = 0;
630 if (flags & 1)
631 tag_flag |= JFS_FLAG_ESCAPE;
632 if (!first_tag)
633 tag_flag |= JFS_FLAG_SAME_UUID;
634
635 tag = (journal_block_tag_t *) tagp;
636 tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
637 tag->t_flags = cpu_to_be32(tag_flag);
638 tagp += sizeof(journal_block_tag_t);
639 space_left -= sizeof(journal_block_tag_t);
640
641 if (first_tag) {
642 memcpy (tagp, journal->j_uuid, 16);
643 tagp += 16;
644 space_left -= 16;
645 first_tag = 0;
646 }
647
648 /* If there's no more to do, or if the descriptor is full,
649 let the IO rip! */
650
651 if (bufs == journal->j_wbufsize ||
652 commit_transaction->t_buffers == NULL ||
653 space_left < sizeof(journal_block_tag_t) + 16) {
654
655 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
656
657 /* Write an end-of-descriptor marker before
658 submitting the IOs. "tag" still points to
659 the last tag we set up. */
660
661 tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);
662
663start_journal_io:
664 for (i = 0; i < bufs; i++) {
665 struct buffer_head *bh = wbuf[i];
666 lock_buffer(bh);
667 clear_buffer_dirty(bh);
668 set_buffer_uptodate(bh);
669 bh->b_end_io = journal_end_buffer_io_sync;
Theodore Ts'o512a0042009-03-27 22:14:27 -0400670 submit_bh(write_op, bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700671 }
672 cond_resched();
673
674 /* Force a new descriptor to be generated next
675 time round the loop. */
676 descriptor = NULL;
677 bufs = 0;
678 }
679 }
680
681 /* Lo and behold: we have just managed to send a transaction to
682 the log. Before we can commit it, wait for the IO so far to
683 complete. Control buffers being written are on the
684 transaction's t_log_list queue, and metadata buffers are on
685 the t_iobuf_list queue.
686
687 Wait for the buffers in reverse order. That way we are
688 less likely to be woken up until all IOs have completed, and
689 so we incur less scheduling load.
690 */
691
692 jbd_debug(3, "JBD: commit phase 4\n");
693
694 /*
695 * akpm: these are BJ_IO, and j_list_lock is not needed.
696 * See __journal_try_to_free_buffer.
697 */
698wait_for_iobuf:
699 while (commit_transaction->t_iobuf_list != NULL) {
700 struct buffer_head *bh;
701
702 jh = commit_transaction->t_iobuf_list->b_tprev;
703 bh = jh2bh(jh);
704 if (buffer_locked(bh)) {
705 wait_on_buffer(bh);
706 goto wait_for_iobuf;
707 }
708 if (cond_resched())
709 goto wait_for_iobuf;
710
711 if (unlikely(!buffer_uptodate(bh)))
712 err = -EIO;
713
714 clear_buffer_jwrite(bh);
715
716 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
717 journal_unfile_buffer(journal, jh);
718
719 /*
720 * ->t_iobuf_list should contain only dummy buffer_heads
721 * which were created by journal_write_metadata_buffer().
722 */
723 BUFFER_TRACE(bh, "dumping temporary bh");
724 journal_put_journal_head(jh);
725 __brelse(bh);
726 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
727 free_buffer_head(bh);
728
729 /* We also have to unlock and free the corresponding
730 shadowed buffer */
731 jh = commit_transaction->t_shadow_list->b_tprev;
732 bh = jh2bh(jh);
733 clear_bit(BH_JWrite, &bh->b_state);
734 J_ASSERT_BH(bh, buffer_jbddirty(bh));
735
736 /* The metadata is now released for reuse, but we need
737 to remember it against this transaction so that when
738 we finally commit, we can do any checkpointing
739 required. */
740 JBUFFER_TRACE(jh, "file as BJ_Forget");
741 journal_file_buffer(jh, commit_transaction, BJ_Forget);
742 /* Wake up any transactions which were waiting for this
743 IO to complete */
744 wake_up_bit(&bh->b_state, BH_Unshadow);
745 JBUFFER_TRACE(jh, "brelse shadowed buffer");
746 __brelse(bh);
747 }
748
749 J_ASSERT (commit_transaction->t_shadow_list == NULL);
750
751 jbd_debug(3, "JBD: commit phase 5\n");
752
753 /* Here we wait for the revoke record and descriptor record buffers */
754 wait_for_ctlbuf:
755 while (commit_transaction->t_log_list != NULL) {
756 struct buffer_head *bh;
757
758 jh = commit_transaction->t_log_list->b_tprev;
759 bh = jh2bh(jh);
760 if (buffer_locked(bh)) {
761 wait_on_buffer(bh);
762 goto wait_for_ctlbuf;
763 }
764 if (cond_resched())
765 goto wait_for_ctlbuf;
766
767 if (unlikely(!buffer_uptodate(bh)))
768 err = -EIO;
769
770 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
771 clear_buffer_jwrite(bh);
772 journal_unfile_buffer(journal, jh);
773 journal_put_journal_head(jh);
774 __brelse(bh); /* One for getblk */
775 /* AKPM: bforget here */
776 }
777
Hidehiro Kawaid1645e52008-10-18 20:27:53 -0700778 if (err)
779 journal_abort(journal, err);
780
Linus Torvalds1da177e2005-04-16 15:20:36 -0700781 jbd_debug(3, "JBD: commit phase 6\n");
782
783 if (journal_write_commit_record(journal, commit_transaction))
784 err = -EIO;
785
786 if (err)
Jan Kara7a266e72007-10-18 23:39:22 -0700787 journal_abort(journal, err);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700788
789 /* End of a transaction! Finally, we can do checkpoint
790 processing: any buffers committed as a result of this
791 transaction can be removed from any checkpoint list it was on
792 before. */
793
794 jbd_debug(3, "JBD: commit phase 7\n");
795
796 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
797 J_ASSERT(commit_transaction->t_buffers == NULL);
798 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
799 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
800 J_ASSERT(commit_transaction->t_shadow_list == NULL);
801 J_ASSERT(commit_transaction->t_log_list == NULL);
802
803restart_loop:
Jan Karae6c9f5c2005-09-06 15:19:09 -0700804 /*
805 * As there are other places (journal_unmap_buffer()) adding buffers
806 * to this list we have to be careful and hold the j_list_lock.
807 */
808 spin_lock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700809 while (commit_transaction->t_forget) {
810 transaction_t *cp_transaction;
811 struct buffer_head *bh;
812
813 jh = commit_transaction->t_forget;
Jan Karae6c9f5c2005-09-06 15:19:09 -0700814 spin_unlock(&journal->j_list_lock);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700815 bh = jh2bh(jh);
816 jbd_lock_bh_state(bh);
817 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
818 jh->b_transaction == journal->j_running_transaction);
819
820 /*
821 * If there is undo-protected committed data against
822 * this buffer, then we can remove it now. If it is a
823 * buffer needing such protection, the old frozen_data
824 * field now points to a committed version of the
825 * buffer, so rotate that field to the new committed
826 * data.
827 *
828 * Otherwise, we can just throw away the frozen data now.
829 */
830 if (jh->b_committed_data) {
Mingming Caoc089d492007-10-16 18:38:25 -0400831 jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700832 jh->b_committed_data = NULL;
833 if (jh->b_frozen_data) {
834 jh->b_committed_data = jh->b_frozen_data;
835 jh->b_frozen_data = NULL;
836 }
837 } else if (jh->b_frozen_data) {
Mingming Caoc089d492007-10-16 18:38:25 -0400838 jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700839 jh->b_frozen_data = NULL;
840 }
841
842 spin_lock(&journal->j_list_lock);
843 cp_transaction = jh->b_cp_transaction;
844 if (cp_transaction) {
845 JBUFFER_TRACE(jh, "remove from old cp transaction");
846 __journal_remove_checkpoint(jh);
847 }
848
849 /* Only re-checkpoint the buffer_head if it is marked
850 * dirty. If the buffer was added to the BJ_Forget list
851 * by journal_forget, it may no longer be dirty and
852 * there's no point in keeping a checkpoint record for
853 * it. */
854
855 /* A buffer which has been freed while still being
856 * journaled by a previous transaction may end up still
857 * being dirty here, but we want to avoid writing back
858 * that buffer in the future now that the last use has
859 * been committed. That's not only a performance gain,
860 * it also stops aliasing problems if the buffer is left
861 * behind for writeback and gets reallocated for another
862 * use in a different page. */
863 if (buffer_freed(bh)) {
864 clear_buffer_freed(bh);
865 clear_buffer_jbddirty(bh);
866 }
867
868 if (buffer_jbddirty(bh)) {
869 JBUFFER_TRACE(jh, "add to new checkpointing trans");
870 __journal_insert_checkpoint(jh, commit_transaction);
Hidehiro Kawai885e3532008-10-18 20:27:54 -0700871 if (is_journal_aborted(journal))
872 clear_buffer_jbddirty(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700873 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
874 __journal_refile_buffer(jh);
875 jbd_unlock_bh_state(bh);
876 } else {
877 J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara9ada7342006-06-23 02:05:25 -0700878 /* The buffer on BJ_Forget list and not jbddirty means
879 * it has been freed by this transaction and hence it
880 * could not have been reallocated until this
881 * transaction has committed. *BUT* it could be
882 * reallocated once we have written all the data to
883 * disk and before we process the buffer on BJ_Forget
884 * list. */
885 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
886 __journal_refile_buffer(jh);
887 if (!jh->b_transaction) {
888 jbd_unlock_bh_state(bh);
889 /* needs a brelse */
890 journal_remove_journal_head(bh);
891 release_buffer_page(bh);
892 } else
893 jbd_unlock_bh_state(bh);
Linus Torvalds1da177e2005-04-16 15:20:36 -0700894 }
Jan Karae6c9f5c2005-09-06 15:19:09 -0700895 cond_resched_lock(&journal->j_list_lock);
896 }
897 spin_unlock(&journal->j_list_lock);
898 /*
Jan Karad4beaf42007-12-04 23:45:27 -0800899 * This is a bit sleazy. We use j_list_lock to protect transition
900 * of a transaction into T_FINISHED state and calling
901 * __journal_drop_transaction(). Otherwise we could race with
902 * other checkpointing code processing the transaction...
Jan Karae6c9f5c2005-09-06 15:19:09 -0700903 */
904 spin_lock(&journal->j_state_lock);
905 spin_lock(&journal->j_list_lock);
906 /*
907 * Now recheck if some buffers did not get attached to the transaction
908 * while the lock was dropped...
909 */
910 if (commit_transaction->t_forget) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700911 spin_unlock(&journal->j_list_lock);
Jan Karae6c9f5c2005-09-06 15:19:09 -0700912 spin_unlock(&journal->j_state_lock);
913 goto restart_loop;
Linus Torvalds1da177e2005-04-16 15:20:36 -0700914 }
915
916 /* Done with this transaction! */
917
918 jbd_debug(3, "JBD: commit phase 8\n");
919
920 J_ASSERT(commit_transaction->t_state == T_COMMIT);
921
Linus Torvalds1da177e2005-04-16 15:20:36 -0700922 commit_transaction->t_state = T_FINISHED;
923 J_ASSERT(commit_transaction == journal->j_committing_transaction);
924 journal->j_commit_sequence = commit_transaction->t_tid;
925 journal->j_committing_transaction = NULL;
Josef Bacikf420d4d2009-01-07 18:07:24 -0800926 commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
927
928 /*
929 * weight the commit time higher than the average time so we don't
930 * react too strongly to vast changes in commit time
931 */
932 if (likely(journal->j_average_commit_time))
933 journal->j_average_commit_time = (commit_time*3 +
934 journal->j_average_commit_time) / 4;
935 else
936 journal->j_average_commit_time = commit_time;
937
Linus Torvalds1da177e2005-04-16 15:20:36 -0700938 spin_unlock(&journal->j_state_lock);
939
Jan Karafe28e422007-07-15 23:37:18 -0700940 if (commit_transaction->t_checkpoint_list == NULL &&
941 commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds1da177e2005-04-16 15:20:36 -0700942 __journal_drop_transaction(journal, commit_transaction);
943 } else {
944 if (journal->j_checkpoint_transactions == NULL) {
945 journal->j_checkpoint_transactions = commit_transaction;
946 commit_transaction->t_cpnext = commit_transaction;
947 commit_transaction->t_cpprev = commit_transaction;
948 } else {
949 commit_transaction->t_cpnext =
950 journal->j_checkpoint_transactions;
951 commit_transaction->t_cpprev =
952 commit_transaction->t_cpnext->t_cpprev;
953 commit_transaction->t_cpnext->t_cpprev =
954 commit_transaction;
955 commit_transaction->t_cpprev->t_cpnext =
956 commit_transaction;
957 }
958 }
959 spin_unlock(&journal->j_list_lock);
960
961 jbd_debug(1, "JBD: commit %d complete, head %d\n",
962 journal->j_commit_sequence, journal->j_tail_sequence);
963
964 wake_up(&journal->j_wait_done_commit);
965}