/*
 * linux/fs/jbd/commit.c
 *
 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
 *
 * Copyright 1998 Red Hat corp --- All Rights Reserved
 *
 * This file is part of the Linux kernel and is made available under
 * the terms of the GNU General Public License, version 2, or at your
 * option, any later version, incorporated herein by reference.
 *
 * Journal commit routines for the generic filesystem journaling code;
 * part of the ext2fs journaling system.
 */

#include <linux/time.h>
#include <linux/fs.h>
#include <linux/jbd.h>
#include <linux/errno.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <trace/events/jbd.h>

/*
 * Default IO end handler for temporary BJ_IO buffer_heads.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");
	if (uptodate)
		set_buffer_uptodate(bh);
	else
		clear_buffer_uptodate(bh);
	unlock_buffer(bh);
}

/*
 * When an ext3-ordered file is truncated, it is possible that many pages are
 * not successfully freed, because they are attached to a committing
 * transaction.  After the transaction commits, these pages are left on the
 * LRU, with no ->mapping, and with attached buffers.  These pages are
 * trivially reclaimable by the VM, but their apparent absence upsets the VM
 * accounting, and it makes the numbers in /proc/meminfo look odd.
 *
 * So here, we have a buffer which has just come off the forget list.  Look to
 * see if we can strip all buffers from the backing page.
 *
 * Called under journal->j_list_lock.  The caller provided us with a ref
 * against the buffer, and we drop that here.
 */
static void release_buffer_page(struct buffer_head *bh)
{
	struct page *page;

	if (buffer_dirty(bh))
		goto nope;
	if (atomic_read(&bh->b_count) != 1)
		goto nope;
	page = bh->b_page;
	if (!page)
		goto nope;
	if (page->mapping)
		goto nope;

	/* OK, it's a truncated page */
	if (!trylock_page(page))
		goto nope;

	page_cache_get(page);
	__brelse(bh);
	try_to_free_buffers(page);
	unlock_page(page);
	page_cache_release(page);
	return;

nope:
	__brelse(bh);
}

/*
 * Decrement reference counter for data buffer. If it has been marked
 * 'BH_Freed', release it and the page to which it belongs if possible.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (buffer_freed(bh)) {
		clear_buffer_freed(bh);
		release_buffer_page(bh);
	} else
		put_bh(bh);
}

/*
 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
 * held.  For ranking reasons we must trylock.  If we lose, schedule away and
 * return 0.  j_list_lock is dropped in this case.
 */
static int inverted_lock(journal_t *journal, struct buffer_head *bh)
{
	if (!jbd_trylock_bh_state(bh)) {
		spin_unlock(&journal->j_list_lock);
		schedule();
		return 0;
	}
	return 1;
}

/* Done it all: now write the commit record.  We should have
 * cleaned up our previous buffers by now, so if we are in abort
 * mode we can now just skip the rest of the journal write
 * entirely.
 *
 * Returns 1 if the journal needs to be aborted or 0 on success
 */
static int journal_write_commit_record(journal_t *journal,
					transaction_t *commit_transaction)
{
	struct journal_head *descriptor;
	struct buffer_head *bh;
	journal_header_t *header;
	int ret;

	if (is_journal_aborted(journal))
		return 0;

	descriptor = journal_get_descriptor_buffer(journal);
	if (!descriptor)
		return 1;

	bh = jh2bh(descriptor);

	header = (journal_header_t *)(bh->b_data);
	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

	JBUFFER_TRACE(descriptor, "write commit block");
	set_buffer_dirty(bh);

	if (journal->j_flags & JFS_BARRIER)
		ret = __sync_dirty_buffer(bh, WRITE_SYNC | WRITE_FLUSH_FUA);
	else
		ret = sync_dirty_buffer(bh);

	put_bh(bh);		/* One for getblk() */
	journal_put_journal_head(descriptor);

	return (ret == -EIO);
}

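/*
 * Submit the array of data buffers collected so far using the given
 * write operation.  Each buffer's end_io handler is set to
 * end_buffer_write_sync, and the reference taken when the buffer was
 * queued is consumed by submit_bh().
 */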
static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				   int write_op)
{
	int i;

	for (i = 0; i < bufs; i++) {
		wbuf[i]->b_end_io = end_buffer_write_sync;
		/* We use-up our safety reference in submit_bh() */
		submit_bh(write_op, wbuf[i]);
	}
}

/*
 * Submit all the data buffers to disk
 */
static int journal_submit_data_buffers(journal_t *journal,
				       transaction_t *commit_transaction,
				       int write_op)
{
	struct journal_head *jh;
	struct buffer_head *bh;
	int locked;
	int bufs = 0;
	struct buffer_head **wbuf = journal->j_wbuf;
	int err = 0;

	/*
	 * Whenever we unlock the journal and sleep, things can get added
	 * onto ->t_sync_datalist, so we have to keep looping back to
	 * write_out_data until we *know* that the list is empty.
	 *
	 * Cleanup any flushed data buffers from the data list.  Even in
	 * abort mode, we want to flush this out as soon as possible.
	 */
write_out_data:
	cond_resched();
	spin_lock(&journal->j_list_lock);

	while (commit_transaction->t_sync_datalist) {
		jh = commit_transaction->t_sync_datalist;
		bh = jh2bh(jh);
		locked = 0;

		/* Get reference just to make sure buffer does not disappear
		 * when we are forced to drop various locks */
		get_bh(bh);
		/* If the buffer is dirty, we need to submit IO and hence
		 * we need the buffer lock. We try to lock the buffer without
		 * blocking. If we fail, we need to drop j_list_lock and do
		 * blocking lock_buffer().
		 */
		if (buffer_dirty(bh)) {
			if (!trylock_buffer(bh)) {
				BUFFER_TRACE(bh, "needs blocking lock");
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						commit_transaction);
				/* Write out all data to prevent deadlocks */
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				lock_buffer(bh);
				spin_lock(&journal->j_list_lock);
			}
			locked = 1;
		}
		/* We have to get bh_state lock. Again out of order, sigh. */
		if (!inverted_lock(journal, bh)) {
			jbd_lock_bh_state(bh);
			spin_lock(&journal->j_list_lock);
		}
		/* Someone already cleaned up the buffer? */
		if (!buffer_jbd(bh) || bh2jh(bh) != jh
			|| jh->b_transaction != commit_transaction
			|| jh->b_jlist != BJ_SyncData) {
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			BUFFER_TRACE(bh, "already cleaned up");
			release_data_buffer(bh);
			continue;
		}
		if (locked && test_clear_buffer_dirty(bh)) {
			BUFFER_TRACE(bh, "needs writeout, adding to array");
			wbuf[bufs++] = bh;
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			if (bufs == journal->j_wbufsize) {
				spin_unlock(&journal->j_list_lock);
				trace_jbd_do_submit_data(journal,
						commit_transaction);
				journal_do_submit_data(wbuf, bufs, write_op);
				bufs = 0;
				goto write_out_data;
			}
		} else if (!locked && buffer_locked(bh)) {
			__journal_file_buffer(jh, commit_transaction,
						BJ_Locked);
			jbd_unlock_bh_state(bh);
			put_bh(bh);
		} else {
			BUFFER_TRACE(bh, "writeout complete: unfile");
			if (unlikely(!buffer_uptodate(bh)))
				err = -EIO;
			__journal_unfile_buffer(jh);
			jbd_unlock_bh_state(bh);
			if (locked)
				unlock_buffer(bh);
			release_data_buffer(bh);
		}

		if (need_resched() || spin_needbreak(&journal->j_list_lock)) {
			spin_unlock(&journal->j_list_lock);
			goto write_out_data;
		}
	}
	spin_unlock(&journal->j_list_lock);
	trace_jbd_do_submit_data(journal, commit_transaction);
	journal_do_submit_data(wbuf, bufs, write_op);

	return err;
}

/*
 * journal_commit_transaction
 *
 * The primary function for committing a transaction to the log.  This
 * function is called by the journal thread to begin a complete commit.
 */
void journal_commit_transaction(journal_t *journal)
{
	transaction_t *commit_transaction;
	struct journal_head *jh, *new_jh, *descriptor;
	struct buffer_head **wbuf = journal->j_wbuf;
	int bufs;
	int flags;
	int err;
	unsigned int blocknr;
	ktime_t start_time;
	u64 commit_time;
	char *tagp = NULL;
	journal_header_t *header;
	journal_block_tag_t *tag = NULL;
	int space_left = 0;
	int first_tag = 0;
	int tag_flag;
	int i;
	struct blk_plug plug;
	int write_op = WRITE;

	/*
	 * First job: lock down the current transaction and wait for
	 * all outstanding updates to complete.
	 */

	/* Do we need to erase the effects of a prior journal_flush? */
	if (journal->j_flags & JFS_FLUSHED) {
		jbd_debug(3, "super block updated\n");
		mutex_lock(&journal->j_checkpoint_mutex);
		/*
		 * We hold j_checkpoint_mutex so tail cannot change under us.
		 * We don't need any special data guarantees for writing sb
		 * since journal is empty and it is ok for write to be
		 * flushed only with transaction commit.
		 */
		journal_update_sb_log_tail(journal, journal->j_tail_sequence,
					   journal->j_tail, WRITE_SYNC);
		mutex_unlock(&journal->j_checkpoint_mutex);
	} else {
		jbd_debug(3, "superblock not updated\n");
	}

	J_ASSERT(journal->j_running_transaction != NULL);
	J_ASSERT(journal->j_committing_transaction == NULL);

	commit_transaction = journal->j_running_transaction;
	J_ASSERT(commit_transaction->t_state == T_RUNNING);

	trace_jbd_start_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: starting commit of transaction %d\n",
		  commit_transaction->t_tid);

	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_LOCKED;

	trace_jbd_commit_locking(journal, commit_transaction);
	spin_lock(&commit_transaction->t_handle_lock);
	while (commit_transaction->t_updates) {
		DEFINE_WAIT(wait);

		prepare_to_wait(&journal->j_wait_updates, &wait,
				TASK_UNINTERRUPTIBLE);
		if (commit_transaction->t_updates) {
			spin_unlock(&commit_transaction->t_handle_lock);
			spin_unlock(&journal->j_state_lock);
			schedule();
			spin_lock(&journal->j_state_lock);
			spin_lock(&commit_transaction->t_handle_lock);
		}
		finish_wait(&journal->j_wait_updates, &wait);
	}
	spin_unlock(&commit_transaction->t_handle_lock);

	J_ASSERT(commit_transaction->t_outstanding_credits <=
			journal->j_max_transaction_buffers);

	/*
	 * First thing we are allowed to do is to discard any remaining
	 * BJ_Reserved buffers.  Note, it is _not_ permissible to assume
	 * that there are no such buffers: if a large filesystem
	 * operation like a truncate needs to split itself over multiple
	 * transactions, then it may try to do a journal_restart() while
	 * there are still BJ_Reserved buffers outstanding.  These must
	 * be released cleanly from the current transaction.
	 *
	 * In this case, the filesystem must still reserve write access
	 * again before modifying the buffer in the new transaction, but
	 * we do not require it to remember exactly which old buffers it
	 * has reserved.  This is consistent with the existing behaviour
	 * that multiple journal_get_write_access() calls to the same
	 * buffer are perfectly permissible.
	 */
	while (commit_transaction->t_reserved_list) {
		jh = commit_transaction->t_reserved_list;
		JBUFFER_TRACE(jh, "reserved, unused: refile");
		/*
		 * A journal_get_undo_access()+journal_release_buffer() may
		 * leave undo-committed data.
		 */
		if (jh->b_committed_data) {
			struct buffer_head *bh = jh2bh(jh);

			jbd_lock_bh_state(bh);
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			jbd_unlock_bh_state(bh);
		}
		journal_refile_buffer(journal, jh);
	}

	/*
	 * Now try to drop any written-back buffers from the journal's
	 * checkpoint lists.  We do this *before* commit because it potentially
	 * frees some memory
	 */
	spin_lock(&journal->j_list_lock);
	__journal_clean_checkpoint_list(journal);
	spin_unlock(&journal->j_list_lock);

	jbd_debug(3, "JBD: commit phase 1\n");

	/*
	 * Clear the revoked flag to reflect that there are no revoked buffers
	 * in the next transaction which is going to be started.
	 */
	journal_clear_buffer_revoked_flags(journal);

	/*
	 * Switch to a new revoke table.
	 */
	journal_switch_revoke_table(journal);

	trace_jbd_commit_flushing(journal, commit_transaction);
	commit_transaction->t_state = T_FLUSH;
	journal->j_committing_transaction = commit_transaction;
	journal->j_running_transaction = NULL;
	start_time = ktime_get();
	commit_transaction->t_log_start = journal->j_head;
	wake_up(&journal->j_wait_transaction_locked);
	spin_unlock(&journal->j_state_lock);

	jbd_debug(3, "JBD: commit phase 2\n");

	if (tid_geq(journal->j_commit_waited, commit_transaction->t_tid))
		write_op = WRITE_SYNC;

	/*
	 * Now start flushing things to disk, in the order they appear
	 * on the transaction lists.  Data blocks go first.
	 */
	blk_start_plug(&plug);
	err = journal_submit_data_buffers(journal, commit_transaction,
					  write_op);
	blk_finish_plug(&plug);

	/*
	 * Wait for all previously submitted IO to complete.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_locked_list) {
		struct buffer_head *bh;

		jh = commit_transaction->t_locked_list->b_tprev;
		bh = jh2bh(jh);
		get_bh(bh);
		if (buffer_locked(bh)) {
			spin_unlock(&journal->j_list_lock);
			wait_on_buffer(bh);
			spin_lock(&journal->j_list_lock);
		}
		if (unlikely(!buffer_uptodate(bh))) {
			if (!trylock_page(bh->b_page)) {
				spin_unlock(&journal->j_list_lock);
				lock_page(bh->b_page);
				spin_lock(&journal->j_list_lock);
			}
			if (bh->b_page->mapping)
				set_bit(AS_EIO, &bh->b_page->mapping->flags);

			unlock_page(bh->b_page);
			SetPageError(bh->b_page);
			err = -EIO;
		}
		if (!inverted_lock(journal, bh)) {
			put_bh(bh);
			spin_lock(&journal->j_list_lock);
			continue;
		}
		if (buffer_jbd(bh) && bh2jh(bh) == jh &&
		    jh->b_transaction == commit_transaction &&
		    jh->b_jlist == BJ_Locked)
			__journal_unfile_buffer(jh);
		jbd_unlock_bh_state(bh);
		release_data_buffer(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);

	if (err) {
		char b[BDEVNAME_SIZE];

		printk(KERN_WARNING
			"JBD: Detected IO errors while flushing file data "
			"on %s\n", bdevname(journal->j_fs_dev, b));
		if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
			journal_abort(journal, err);
		err = 0;
	}

	blk_start_plug(&plug);

	journal_write_revoke_records(journal, commit_transaction, write_op);

	/*
	 * If we found any dirty or locked buffers, then we should have
	 * looped back up to the write_out_data label.  If there weren't
	 * any then journal_clean_data_list should have wiped the list
	 * clean by now, so check that it is in fact empty.
	 */
	J_ASSERT(commit_transaction->t_sync_datalist == NULL);

	jbd_debug(3, "JBD: commit phase 3\n");

	/*
	 * Way to go: we have now written out all of the data for a
	 * transaction!  Now comes the tricky part: we need to write out
	 * metadata.  Loop over the transaction's entire buffer list:
	 */
	spin_lock(&journal->j_state_lock);
	commit_transaction->t_state = T_COMMIT;
	spin_unlock(&journal->j_state_lock);

	trace_jbd_commit_logging(journal, commit_transaction);
	J_ASSERT(commit_transaction->t_nr_buffers <=
		 commit_transaction->t_outstanding_credits);

	descriptor = NULL;
	bufs = 0;
	while (commit_transaction->t_buffers) {

		/* Find the next buffer to be journaled... */

		jh = commit_transaction->t_buffers;

		/* If we're in abort mode, we just un-journal the buffer and
		   release it. */

		if (is_journal_aborted(journal)) {
			clear_buffer_jbddirty(jh2bh(jh));
			JBUFFER_TRACE(jh, "journal is aborting: refile");
			journal_refile_buffer(journal, jh);
			/* If that was the last one, we need to clean up
			 * any descriptor buffers which may have been
			 * already allocated, even if we are now
			 * aborting. */
			if (!commit_transaction->t_buffers)
				goto start_journal_io;
			continue;
		}

		/* Make sure we have a descriptor block in which to
		   record the metadata buffer. */

		if (!descriptor) {
			struct buffer_head *bh;

			J_ASSERT(bufs == 0);

			jbd_debug(4, "JBD: get descriptor\n");

			descriptor = journal_get_descriptor_buffer(journal);
			if (!descriptor) {
				journal_abort(journal, -EIO);
				continue;
			}

			bh = jh2bh(descriptor);
			jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				(unsigned long long)bh->b_blocknr, bh->b_data);
			header = (journal_header_t *)&bh->b_data[0];
			header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
			header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
			header->h_sequence = cpu_to_be32(commit_transaction->t_tid);

			tagp = &bh->b_data[sizeof(journal_header_t)];
			space_left = bh->b_size - sizeof(journal_header_t);
			first_tag = 1;
			set_buffer_jwrite(bh);
			set_buffer_dirty(bh);
			wbuf[bufs++] = bh;

			/* Record it so that we can wait for IO
			   completion later */
			BUFFER_TRACE(bh, "ph3: file as descriptor");
			journal_file_buffer(descriptor, commit_transaction,
					BJ_LogCtl);
		}

		/* Where is the buffer to be written? */

		err = journal_next_log_block(journal, &blocknr);
		/* If the block mapping failed, just abandon the buffer
		   and repeat this loop: we'll fall into the
		   refile-on-abort condition above. */
		if (err) {
			journal_abort(journal, err);
			continue;
		}

		/*
		 * start_this_handle() uses t_outstanding_credits to determine
		 * the free space in the log, but this counter is changed
		 * by journal_next_log_block() also.
		 */
		commit_transaction->t_outstanding_credits--;

		/* Bump b_count to prevent truncate from stumbling over
		   the shadowed buffer!  @@@ This can go if we ever get
		   rid of the BJ_IO/BJ_Shadow pairing of buffers. */
		get_bh(jh2bh(jh));

		/* Make a temporary IO buffer with which to write it out
		   (this will requeue both the metadata buffer and the
		   temporary IO buffer). new_bh goes on BJ_IO*/

		set_buffer_jwrite(jh2bh(jh));
		/*
		 * akpm: journal_write_metadata_buffer() sets
		 * new_bh->b_transaction to commit_transaction.
		 * We need to clean this up before we release new_bh
		 * (which is of type BJ_IO)
		 */
		JBUFFER_TRACE(jh, "ph3: write metadata");
		flags = journal_write_metadata_buffer(commit_transaction,
						      jh, &new_jh, blocknr);
		set_buffer_jwrite(jh2bh(new_jh));
		wbuf[bufs++] = jh2bh(new_jh);

		/* Record the new block's tag in the current descriptor
		   buffer */

		tag_flag = 0;
		if (flags & 1)
			tag_flag |= JFS_FLAG_ESCAPE;
		if (!first_tag)
			tag_flag |= JFS_FLAG_SAME_UUID;

		tag = (journal_block_tag_t *) tagp;
		tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
		tag->t_flags = cpu_to_be32(tag_flag);
		tagp += sizeof(journal_block_tag_t);
		space_left -= sizeof(journal_block_tag_t);

		if (first_tag) {
			memcpy(tagp, journal->j_uuid, 16);
			tagp += 16;
			space_left -= 16;
			first_tag = 0;
		}

		/* If there's no more to do, or if the descriptor is full,
		   let the IO rip! */

		if (bufs == journal->j_wbufsize ||
		    commit_transaction->t_buffers == NULL ||
		    space_left < sizeof(journal_block_tag_t) + 16) {

			jbd_debug(4, "JBD: Submit %d IOs\n", bufs);

			/* Write an end-of-descriptor marker before
			   submitting the IOs.  "tag" still points to
			   the last tag we set up. */

			tag->t_flags |= cpu_to_be32(JFS_FLAG_LAST_TAG);

start_journal_io:
			for (i = 0; i < bufs; i++) {
				struct buffer_head *bh = wbuf[i];
				lock_buffer(bh);
				clear_buffer_dirty(bh);
				set_buffer_uptodate(bh);
				bh->b_end_io = journal_end_buffer_io_sync;
				submit_bh(write_op, bh);
			}
			cond_resched();

			/* Force a new descriptor to be generated next
			   time round the loop. */
			descriptor = NULL;
			bufs = 0;
		}
	}

	blk_finish_plug(&plug);

	/* Lo and behold: we have just managed to send a transaction to
	   the log.  Before we can commit it, wait for the IO so far to
	   complete.  Control buffers being written are on the
	   transaction's t_log_list queue, and metadata buffers are on
	   the t_iobuf_list queue.

	   Wait for the buffers in reverse order.  That way we are
	   less likely to be woken up until all IOs have completed, and
	   so we incur less scheduling load.
	 */

	jbd_debug(3, "JBD: commit phase 4\n");

	/*
	 * akpm: these are BJ_IO, and j_list_lock is not needed.
	 * See __journal_try_to_free_buffer.
	 */
wait_for_iobuf:
	while (commit_transaction->t_iobuf_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_iobuf_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_iobuf;
		}
		if (cond_resched())
			goto wait_for_iobuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		clear_buffer_jwrite(bh);

		JBUFFER_TRACE(jh, "ph4: unfile after journal write");
		journal_unfile_buffer(journal, jh);

		/*
		 * ->t_iobuf_list should contain only dummy buffer_heads
		 * which were created by journal_write_metadata_buffer().
		 */
		BUFFER_TRACE(bh, "dumping temporary bh");
		journal_put_journal_head(jh);
		__brelse(bh);
		J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
		free_buffer_head(bh);

		/* We also have to unlock and free the corresponding
		   shadowed buffer */
		jh = commit_transaction->t_shadow_list->b_tprev;
		bh = jh2bh(jh);
		clear_buffer_jwrite(bh);
		J_ASSERT_BH(bh, buffer_jbddirty(bh));

		/* The metadata is now released for reuse, but we need
		   to remember it against this transaction so that when
		   we finally commit, we can do any checkpointing
		   required. */
		JBUFFER_TRACE(jh, "file as BJ_Forget");
		journal_file_buffer(jh, commit_transaction, BJ_Forget);
		/*
		 * Wake up any transactions which were waiting for this
		 * IO to complete. The barrier must be here so that changes
		 * by journal_file_buffer() take effect before wake_up_bit()
		 * does the waitqueue check.
		 */
		smp_mb();
		wake_up_bit(&bh->b_state, BH_Unshadow);
		JBUFFER_TRACE(jh, "brelse shadowed buffer");
		__brelse(bh);
	}

	J_ASSERT(commit_transaction->t_shadow_list == NULL);

	jbd_debug(3, "JBD: commit phase 5\n");

	/* Here we wait for the revoke record and descriptor record buffers */
wait_for_ctlbuf:
	while (commit_transaction->t_log_list != NULL) {
		struct buffer_head *bh;

		jh = commit_transaction->t_log_list->b_tprev;
		bh = jh2bh(jh);
		if (buffer_locked(bh)) {
			wait_on_buffer(bh);
			goto wait_for_ctlbuf;
		}
		if (cond_resched())
			goto wait_for_ctlbuf;

		if (unlikely(!buffer_uptodate(bh)))
			err = -EIO;

		BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
		clear_buffer_jwrite(bh);
		journal_unfile_buffer(journal, jh);
		journal_put_journal_head(jh);
		__brelse(bh);		/* One for getblk */
		/* AKPM: bforget here */
	}

	if (err)
		journal_abort(journal, err);

	jbd_debug(3, "JBD: commit phase 6\n");

	/* All metadata is written, now write commit record and do cleanup */
	spin_lock(&journal->j_state_lock);
	J_ASSERT(commit_transaction->t_state == T_COMMIT);
	commit_transaction->t_state = T_COMMIT_RECORD;
	spin_unlock(&journal->j_state_lock);

	if (journal_write_commit_record(journal, commit_transaction))
		err = -EIO;

	if (err)
		journal_abort(journal, err);

	/* End of a transaction!  Finally, we can do checkpoint
	   processing: any buffers committed as a result of this
	   transaction can be removed from any checkpoint list it was on
	   before. */

	jbd_debug(3, "JBD: commit phase 7\n");

	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
	J_ASSERT(commit_transaction->t_buffers == NULL);
	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
	J_ASSERT(commit_transaction->t_shadow_list == NULL);
	J_ASSERT(commit_transaction->t_log_list == NULL);

restart_loop:
	/*
	 * As there are other places (journal_unmap_buffer()) adding buffers
	 * to this list we have to be careful and hold the j_list_lock.
	 */
	spin_lock(&journal->j_list_lock);
	while (commit_transaction->t_forget) {
		transaction_t *cp_transaction;
		struct buffer_head *bh;
		int try_to_free = 0;

		jh = commit_transaction->t_forget;
		spin_unlock(&journal->j_list_lock);
		bh = jh2bh(jh);
		/*
		 * Get a reference so that bh cannot be freed before we are
		 * done with it.
		 */
		get_bh(bh);
		jbd_lock_bh_state(bh);
		J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
			jh->b_transaction == journal->j_running_transaction);

		/*
		 * If there is undo-protected committed data against
		 * this buffer, then we can remove it now.  If it is a
		 * buffer needing such protection, the old frozen_data
		 * field now points to a committed version of the
		 * buffer, so rotate that field to the new committed
		 * data.
		 *
		 * Otherwise, we can just throw away the frozen data now.
		 */
		if (jh->b_committed_data) {
			jbd_free(jh->b_committed_data, bh->b_size);
			jh->b_committed_data = NULL;
			if (jh->b_frozen_data) {
				jh->b_committed_data = jh->b_frozen_data;
				jh->b_frozen_data = NULL;
			}
		} else if (jh->b_frozen_data) {
			jbd_free(jh->b_frozen_data, bh->b_size);
			jh->b_frozen_data = NULL;
		}

		spin_lock(&journal->j_list_lock);
		cp_transaction = jh->b_cp_transaction;
		if (cp_transaction) {
			JBUFFER_TRACE(jh, "remove from old cp transaction");
			__journal_remove_checkpoint(jh);
		}

		/* Only re-checkpoint the buffer_head if it is marked
		 * dirty.  If the buffer was added to the BJ_Forget list
		 * by journal_forget, it may no longer be dirty and
		 * there's no point in keeping a checkpoint record for
		 * it. */

		/* A buffer which has been freed while still being
		 * journaled by a previous transaction may end up still
		 * being dirty here, but we want to avoid writing back
		 * that buffer in the future after the "add to orphan"
		 * operation has been committed.  That's not only a performance
		 * gain, it also stops aliasing problems if the buffer is
		 * left behind for writeback and gets reallocated for another
		 * use in a different page. */
		if (buffer_freed(bh) && !jh->b_next_transaction) {
			clear_buffer_freed(bh);
			clear_buffer_jbddirty(bh);
		}

		if (buffer_jbddirty(bh)) {
			JBUFFER_TRACE(jh, "add to new checkpointing trans");
			__journal_insert_checkpoint(jh, commit_transaction);
			if (is_journal_aborted(journal))
				clear_buffer_jbddirty(bh);
		} else {
			J_ASSERT_BH(bh, !buffer_dirty(bh));
			/*
			 * The buffer being on the BJ_Forget list and not
			 * jbddirty means it has been freed by this
			 * transaction and hence it could not have been
			 * reallocated until this transaction has committed.
			 * *BUT* it could be reallocated once we have written
			 * all the data to disk and before we process the
			 * buffer on BJ_Forget list.
			 */
			if (!jh->b_next_transaction)
				try_to_free = 1;
		}
		JBUFFER_TRACE(jh, "refile or unfile freed buffer");
		__journal_refile_buffer(jh);
		jbd_unlock_bh_state(bh);
		if (try_to_free)
			release_buffer_page(bh);
		else
			__brelse(bh);
		cond_resched_lock(&journal->j_list_lock);
	}
	spin_unlock(&journal->j_list_lock);
	/*
	 * This is a bit sleazy.  We use j_list_lock to protect transition
	 * of a transaction into T_FINISHED state and calling
	 * __journal_drop_transaction(). Otherwise we could race with
	 * other checkpointing code processing the transaction...
	 */
	spin_lock(&journal->j_state_lock);
	spin_lock(&journal->j_list_lock);
	/*
	 * Now recheck if some buffers did not get attached to the transaction
	 * while the lock was dropped...
	 */
	if (commit_transaction->t_forget) {
		spin_unlock(&journal->j_list_lock);
		spin_unlock(&journal->j_state_lock);
		goto restart_loop;
	}

	/* Done with this transaction! */

	jbd_debug(3, "JBD: commit phase 8\n");

	J_ASSERT(commit_transaction->t_state == T_COMMIT_RECORD);

	commit_transaction->t_state = T_FINISHED;
	J_ASSERT(commit_transaction == journal->j_committing_transaction);
	journal->j_commit_sequence = commit_transaction->t_tid;
	journal->j_committing_transaction = NULL;
	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));

	/*
	 * weight the commit time higher than the average time so we don't
	 * react too strongly to vast changes in commit time
	 */
	if (likely(journal->j_average_commit_time))
		journal->j_average_commit_time = (commit_time*3 +
				journal->j_average_commit_time) / 4;
	else
		journal->j_average_commit_time = commit_time;

	spin_unlock(&journal->j_state_lock);

	if (commit_transaction->t_checkpoint_list == NULL &&
	    commit_transaction->t_checkpoint_io_list == NULL) {
		__journal_drop_transaction(journal, commit_transaction);
	} else {
		if (journal->j_checkpoint_transactions == NULL) {
			journal->j_checkpoint_transactions = commit_transaction;
			commit_transaction->t_cpnext = commit_transaction;
			commit_transaction->t_cpprev = commit_transaction;
		} else {
			commit_transaction->t_cpnext =
				journal->j_checkpoint_transactions;
			commit_transaction->t_cpprev =
				commit_transaction->t_cpnext->t_cpprev;
			commit_transaction->t_cpnext->t_cpprev =
				commit_transaction;
			commit_transaction->t_cpprev->t_cpnext =
				commit_transaction;
		}
	}
	spin_unlock(&journal->j_list_lock);

	trace_jbd_end_commit(journal, commit_transaction);
	jbd_debug(1, "JBD: commit %d complete, head %d\n",
		  journal->j_commit_sequence, journal->j_tail_sequence);

	wake_up(&journal->j_wait_done_commit);
}