blob: 44d68a113c73bd06560a5b004eba5942ec37c908 [file] [log] [blame]
Dave Kleikamp470decc2006-10-11 01:20:57 -07001/*
Mingming Caof7f4bcc2006-10-11 01:20:59 -07002 * linux/fs/jbd2/commit.c
Dave Kleikamp470decc2006-10-11 01:20:57 -07003 *
4 * Written by Stephen C. Tweedie <sct@redhat.com>, 1998
5 *
6 * Copyright 1998 Red Hat corp --- All Rights Reserved
7 *
8 * This file is part of the Linux kernel and is made available under
9 * the terms of the GNU General Public License, version 2, or at your
10 * option, any later version, incorporated herein by reference.
11 *
12 * Journal commit routines for the generic filesystem journaling code;
13 * part of the ext2fs journaling system.
14 */
15
16#include <linux/time.h>
17#include <linux/fs.h>
Mingming Caof7f4bcc2006-10-11 01:20:59 -070018#include <linux/jbd2.h>
Dave Kleikamp470decc2006-10-11 01:20:57 -070019#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/mm.h>
22#include <linux/pagemap.h>
23#include <linux/smp_lock.h>
24
/*
 * Completion callback for the temporary BJ_IO buffer_heads that the commit
 * code submits to the log.  It records the outcome of the write in the
 * buffer's uptodate bit and then unlocks the buffer so that anybody
 * sleeping in wait_on_buffer() can proceed.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	/* A failed write is reported by leaving the buffer !uptodate. */
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
37
38/*
39 * When an ext3-ordered file is truncated, it is possible that many pages are
40 * not sucessfully freed, because they are attached to a committing transaction.
41 * After the transaction commits, these pages are left on the LRU, with no
42 * ->mapping, and with attached buffers. These pages are trivially reclaimable
43 * by the VM, but their apparent absence upsets the VM accounting, and it makes
44 * the numbers in /proc/meminfo look odd.
45 *
46 * So here, we have a buffer which has just come off the forget list. Look to
47 * see if we can strip all buffers from the backing page.
48 *
49 * Called under lock_journal(), and possibly under journal_datalist_lock. The
50 * caller provided us with a ref against the buffer, and we drop that here.
51 */
52static void release_buffer_page(struct buffer_head *bh)
53{
54 struct page *page;
55
56 if (buffer_dirty(bh))
57 goto nope;
58 if (atomic_read(&bh->b_count) != 1)
59 goto nope;
60 page = bh->b_page;
61 if (!page)
62 goto nope;
63 if (page->mapping)
64 goto nope;
65
66 /* OK, it's a truncated page */
67 if (TestSetPageLocked(page))
68 goto nope;
69
70 page_cache_get(page);
71 __brelse(bh);
72 try_to_free_buffers(page);
73 unlock_page(page);
74 page_cache_release(page);
75 return;
76
77nope:
78 __brelse(bh);
79}
80
81/*
82 * Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
83 * held. For ranking reasons we must trylock. If we lose, schedule away and
84 * return 0. j_list_lock is dropped in this case.
85 */
86static int inverted_lock(journal_t *journal, struct buffer_head *bh)
87{
88 if (!jbd_trylock_bh_state(bh)) {
89 spin_unlock(&journal->j_list_lock);
90 schedule();
91 return 0;
92 }
93 return 1;
94}
95
96/* Done it all: now write the commit record. We should have
97 * cleaned up our previous buffers by now, so if we are in abort
98 * mode we can now just skip the rest of the journal write
99 * entirely.
100 *
101 * Returns 1 if the journal needs to be aborted or 0 on success
102 */
103static int journal_write_commit_record(journal_t *journal,
104 transaction_t *commit_transaction)
105{
106 struct journal_head *descriptor;
107 struct buffer_head *bh;
108 int i, ret;
109 int barrier_done = 0;
110
111 if (is_journal_aborted(journal))
112 return 0;
113
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700114 descriptor = jbd2_journal_get_descriptor_buffer(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700115 if (!descriptor)
116 return 1;
117
118 bh = jh2bh(descriptor);
119
120 /* AKPM: buglet - add `i' to tmp! */
121 for (i = 0; i < bh->b_size; i += 512) {
122 journal_header_t *tmp = (journal_header_t*)bh->b_data;
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700123 tmp->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
124 tmp->h_blocktype = cpu_to_be32(JBD2_COMMIT_BLOCK);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700125 tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
126 }
127
128 JBUFFER_TRACE(descriptor, "write commit block");
129 set_buffer_dirty(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700130 if (journal->j_flags & JBD2_BARRIER) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700131 set_buffer_ordered(bh);
132 barrier_done = 1;
133 }
134 ret = sync_dirty_buffer(bh);
135 /* is it possible for another commit to fail at roughly
136 * the same time as this one? If so, we don't want to
137 * trust the barrier flag in the super, but instead want
138 * to remember if we sent a barrier request
139 */
140 if (ret == -EOPNOTSUPP && barrier_done) {
141 char b[BDEVNAME_SIZE];
142
143 printk(KERN_WARNING
144 "JBD: barrier-based sync failed on %s - "
145 "disabling barriers\n",
146 bdevname(journal->j_dev, b));
147 spin_lock(&journal->j_state_lock);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700148 journal->j_flags &= ~JBD2_BARRIER;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700149 spin_unlock(&journal->j_state_lock);
150
151 /* And try again, without the barrier */
152 clear_buffer_ordered(bh);
153 set_buffer_uptodate(bh);
154 set_buffer_dirty(bh);
155 ret = sync_dirty_buffer(bh);
156 }
157 put_bh(bh); /* One for getblk() */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700158 jbd2_journal_put_journal_head(descriptor);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700159
160 return (ret == -EIO);
161}
162
163static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
164{
165 int i;
166
167 for (i = 0; i < bufs; i++) {
168 wbuf[i]->b_end_io = end_buffer_write_sync;
169 /* We use-up our safety reference in submit_bh() */
170 submit_bh(WRITE, wbuf[i]);
171 }
172}
173
174/*
175 * Submit all the data buffers to disk
176 */
177static void journal_submit_data_buffers(journal_t *journal,
178 transaction_t *commit_transaction)
179{
180 struct journal_head *jh;
181 struct buffer_head *bh;
182 int locked;
183 int bufs = 0;
184 struct buffer_head **wbuf = journal->j_wbuf;
185
186 /*
187 * Whenever we unlock the journal and sleep, things can get added
188 * onto ->t_sync_datalist, so we have to keep looping back to
189 * write_out_data until we *know* that the list is empty.
190 *
191 * Cleanup any flushed data buffers from the data list. Even in
192 * abort mode, we want to flush this out as soon as possible.
193 */
194write_out_data:
195 cond_resched();
196 spin_lock(&journal->j_list_lock);
197
198 while (commit_transaction->t_sync_datalist) {
199 jh = commit_transaction->t_sync_datalist;
200 bh = jh2bh(jh);
201 locked = 0;
202
203 /* Get reference just to make sure buffer does not disappear
204 * when we are forced to drop various locks */
205 get_bh(bh);
206 /* If the buffer is dirty, we need to submit IO and hence
207 * we need the buffer lock. We try to lock the buffer without
208 * blocking. If we fail, we need to drop j_list_lock and do
209 * blocking lock_buffer().
210 */
211 if (buffer_dirty(bh)) {
212 if (test_set_buffer_locked(bh)) {
213 BUFFER_TRACE(bh, "needs blocking lock");
214 spin_unlock(&journal->j_list_lock);
215 /* Write out all data to prevent deadlocks */
216 journal_do_submit_data(wbuf, bufs);
217 bufs = 0;
218 lock_buffer(bh);
219 spin_lock(&journal->j_list_lock);
220 }
221 locked = 1;
222 }
223 /* We have to get bh_state lock. Again out of order, sigh. */
224 if (!inverted_lock(journal, bh)) {
225 jbd_lock_bh_state(bh);
226 spin_lock(&journal->j_list_lock);
227 }
228 /* Someone already cleaned up the buffer? */
229 if (!buffer_jbd(bh)
230 || jh->b_transaction != commit_transaction
231 || jh->b_jlist != BJ_SyncData) {
232 jbd_unlock_bh_state(bh);
233 if (locked)
234 unlock_buffer(bh);
235 BUFFER_TRACE(bh, "already cleaned up");
236 put_bh(bh);
237 continue;
238 }
239 if (locked && test_clear_buffer_dirty(bh)) {
240 BUFFER_TRACE(bh, "needs writeout, adding to array");
241 wbuf[bufs++] = bh;
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700242 __jbd2_journal_file_buffer(jh, commit_transaction,
Dave Kleikamp470decc2006-10-11 01:20:57 -0700243 BJ_Locked);
244 jbd_unlock_bh_state(bh);
245 if (bufs == journal->j_wbufsize) {
246 spin_unlock(&journal->j_list_lock);
247 journal_do_submit_data(wbuf, bufs);
248 bufs = 0;
249 goto write_out_data;
250 }
251 }
252 else {
253 BUFFER_TRACE(bh, "writeout complete: unfile");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700254 __jbd2_journal_unfile_buffer(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700255 jbd_unlock_bh_state(bh);
256 if (locked)
257 unlock_buffer(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700258 jbd2_journal_remove_journal_head(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700259 /* Once for our safety reference, once for
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700260 * jbd2_journal_remove_journal_head() */
Dave Kleikamp470decc2006-10-11 01:20:57 -0700261 put_bh(bh);
262 put_bh(bh);
263 }
264
265 if (lock_need_resched(&journal->j_list_lock)) {
266 spin_unlock(&journal->j_list_lock);
267 goto write_out_data;
268 }
269 }
270 spin_unlock(&journal->j_list_lock);
271 journal_do_submit_data(wbuf, bufs);
272}
273
Zach Brownb517bea2006-10-11 01:21:08 -0700274static inline void write_tag_block(int tag_bytes, journal_block_tag_t *tag,
275 sector_t block)
276{
277 tag->t_blocknr = cpu_to_be32(block & (u32)~0);
278 if (tag_bytes > JBD_TAG_SIZE32)
279 tag->t_blocknr_high = cpu_to_be32((block >> 31) >> 1);
280}
281
Dave Kleikamp470decc2006-10-11 01:20:57 -0700282/*
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700283 * jbd2_journal_commit_transaction
Dave Kleikamp470decc2006-10-11 01:20:57 -0700284 *
285 * The primary function for committing a transaction to the log. This
286 * function is called by the journal thread to begin a complete commit.
 *
 * Sequence, as implemented below: mark the running transaction T_LOCKED
 * and wait for t_updates to reach zero; refile leftover reserved buffers;
 * switch the revoke table and install the transaction as
 * j_committing_transaction (T_FLUSH); submit the ordered data buffers and
 * wait for them; write the revoke records; write every metadata buffer to
 * the log behind descriptor blocks (T_COMMIT); wait for that IO; write the
 * commit record; then walk t_forget to set up checkpointing and mark the
 * transaction T_FINISHED. IO errors observed while waiting, and failures
 * to obtain a descriptor or log block, abort the journal through
 * __jbd2_journal_abort_hard().
287 */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700288void jbd2_journal_commit_transaction(journal_t *journal)
Dave Kleikamp470decc2006-10-11 01:20:57 -0700289{
290 transaction_t *commit_transaction;
291 struct journal_head *jh, *new_jh, *descriptor;
292 struct buffer_head **wbuf = journal->j_wbuf;
293 int bufs;
294 int flags;
295 int err;
296 unsigned long blocknr;
297 char *tagp = NULL;
298 journal_header_t *header;
299 journal_block_tag_t *tag = NULL;
300 int space_left = 0;
301 int first_tag = 0;
302 int tag_flag;
303 int i;
Zach Brownb517bea2006-10-11 01:21:08 -0700304 int tag_bytes = journal_tag_bytes(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700305
306 /*
307 * First job: lock down the current transaction and wait for
308 * all outstanding updates to complete.
309 */
310
311#ifdef COMMIT_STATS
312 spin_lock(&journal->j_list_lock);
313 summarise_journal_usage(journal);
314 spin_unlock(&journal->j_list_lock);
315#endif
316
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700317 /* Do we need to erase the effects of a prior jbd2_journal_flush? */
318 if (journal->j_flags & JBD2_FLUSHED) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700319 jbd_debug(3, "super block updated\n");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700320 jbd2_journal_update_superblock(journal, 1);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700321 } else {
322 jbd_debug(3, "superblock not updated\n");
323 }
324
325 J_ASSERT(journal->j_running_transaction != NULL);
326 J_ASSERT(journal->j_committing_transaction == NULL);
327
328 commit_transaction = journal->j_running_transaction;
329 J_ASSERT(commit_transaction->t_state == T_RUNNING);
330
331 jbd_debug(1, "JBD: starting commit of transaction %d\n",
332 commit_transaction->t_tid);
333
	/*
	 * Mark the transaction T_LOCKED, then sleep on j_wait_updates until
	 * t_updates has dropped to zero. Both j_state_lock and
	 * t_handle_lock are released around schedule() and retaken
	 * afterwards. j_state_lock stays held from here until the
	 * transaction has been installed as j_committing_transaction.
	 */
334 spin_lock(&journal->j_state_lock);
335 commit_transaction->t_state = T_LOCKED;
336
337 spin_lock(&commit_transaction->t_handle_lock);
338 while (commit_transaction->t_updates) {
339 DEFINE_WAIT(wait);
340
341 prepare_to_wait(&journal->j_wait_updates, &wait,
342 TASK_UNINTERRUPTIBLE);
343 if (commit_transaction->t_updates) {
344 spin_unlock(&commit_transaction->t_handle_lock);
345 spin_unlock(&journal->j_state_lock);
346 schedule();
347 spin_lock(&journal->j_state_lock);
348 spin_lock(&commit_transaction->t_handle_lock);
349 }
350 finish_wait(&journal->j_wait_updates, &wait);
351 }
352 spin_unlock(&commit_transaction->t_handle_lock);
353
354 J_ASSERT (commit_transaction->t_outstanding_credits <=
355 journal->j_max_transaction_buffers);
356
357 /*
358 * First thing we are allowed to do is to discard any remaining
359 * BJ_Reserved buffers. Note, it is _not_ permissible to assume
360 * that there are no such buffers: if a large filesystem
361 * operation like a truncate needs to split itself over multiple
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700362 * transactions, then it may try to do a jbd2_journal_restart() while
Dave Kleikamp470decc2006-10-11 01:20:57 -0700363 * there are still BJ_Reserved buffers outstanding. These must
364 * be released cleanly from the current transaction.
365 *
366 * In this case, the filesystem must still reserve write access
367 * again before modifying the buffer in the new transaction, but
368 * we do not require it to remember exactly which old buffers it
369 * has reserved. This is consistent with the existing behaviour
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700370 * that multiple jbd2_journal_get_write_access() calls to the same
Dave Kleikamp470decc2006-10-11 01:20:57 -0700371 * buffer are perfectly permissible.
372 */
373 while (commit_transaction->t_reserved_list) {
374 jh = commit_transaction->t_reserved_list;
375 JBUFFER_TRACE(jh, "reserved, unused: refile");
376 /*
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700377 * A jbd2_journal_get_undo_access()+jbd2_journal_release_buffer() may
Dave Kleikamp470decc2006-10-11 01:20:57 -0700378 * leave undo-committed data.
379 */
380 if (jh->b_committed_data) {
381 struct buffer_head *bh = jh2bh(jh);
382
383 jbd_lock_bh_state(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700384 jbd2_slab_free(jh->b_committed_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700385 jh->b_committed_data = NULL;
386 jbd_unlock_bh_state(bh);
387 }
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700388 jbd2_journal_refile_buffer(journal, jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700389 }
390
391 /*
392 * Now try to drop any written-back buffers from the journal's
393 * checkpoint lists. We do this *before* commit because it potentially
394 * frees some memory
395 */
396 spin_lock(&journal->j_list_lock);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700397 __jbd2_journal_clean_checkpoint_list(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700398 spin_unlock(&journal->j_list_lock);
399
400 jbd_debug (3, "JBD: commit phase 1\n");
401
402 /*
403 * Switch to a new revoke table.
404 */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700405 jbd2_journal_switch_revoke_table(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700406
	/*
	 * Hand-over: this transaction becomes j_committing_transaction,
	 * j_running_transaction is cleared, t_log_start records the current
	 * journal head, and sleepers on j_wait_transaction_locked are woken
	 * before j_state_lock is finally released.
	 */
407 commit_transaction->t_state = T_FLUSH;
408 journal->j_committing_transaction = commit_transaction;
409 journal->j_running_transaction = NULL;
410 commit_transaction->t_log_start = journal->j_head;
411 wake_up(&journal->j_wait_transaction_locked);
412 spin_unlock(&journal->j_state_lock);
413
414 jbd_debug (3, "JBD: commit phase 2\n");
415
416 /*
417 * First, drop modified flag: all accesses to the buffers
418 * will be tracked for a new transaction only -bzzz
419 */
420 spin_lock(&journal->j_list_lock);
421 if (commit_transaction->t_buffers) {
422 new_jh = jh = commit_transaction->t_buffers->b_tnext;
423 do {
424 J_ASSERT_JH(new_jh, new_jh->b_modified == 1 ||
425 new_jh->b_modified == 0);
426 new_jh->b_modified = 0;
427 new_jh = new_jh->b_tnext;
428 } while (new_jh != jh);
429 }
430 spin_unlock(&journal->j_list_lock);
431
432 /*
433 * Now start flushing things to disk, in the order they appear
434 * on the transaction lists. Data blocks go first.
435 */
436 err = 0;
437 journal_submit_data_buffers(journal, commit_transaction);
438
439 /*
440 * Wait for all previously submitted IO to complete.
441 */
442 spin_lock(&journal->j_list_lock);
443 while (commit_transaction->t_locked_list) {
444 struct buffer_head *bh;
445
446 jh = commit_transaction->t_locked_list->b_tprev;
447 bh = jh2bh(jh);
448 get_bh(bh);
449 if (buffer_locked(bh)) {
450 spin_unlock(&journal->j_list_lock);
451 wait_on_buffer(bh);
452 if (unlikely(!buffer_uptodate(bh)))
453 err = -EIO;
454 spin_lock(&journal->j_list_lock);
455 }
	/*
	 * inverted_lock() has already dropped j_list_lock when it returns
	 * 0, so drop our reference, retake the lock and start over from
	 * the tail of the list.
	 */
456 if (!inverted_lock(journal, bh)) {
457 put_bh(bh);
458 spin_lock(&journal->j_list_lock);
459 continue;
460 }
461 if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700462 __jbd2_journal_unfile_buffer(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700463 jbd_unlock_bh_state(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700464 jbd2_journal_remove_journal_head(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700465 put_bh(bh);
466 } else {
467 jbd_unlock_bh_state(bh);
468 }
469 put_bh(bh);
470 cond_resched_lock(&journal->j_list_lock);
471 }
472 spin_unlock(&journal->j_list_lock);
473
474 if (err)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700475 __jbd2_journal_abort_hard(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700476
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700477 jbd2_journal_write_revoke_records(journal, commit_transaction);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700478
	/*
	 * NOTE(review): this prints the same "commit phase 2" label as the
	 * debug message issued before the data flush above.
	 */
479 jbd_debug(3, "JBD: commit phase 2\n");
480
481 /*
482 * If we found any dirty or locked buffers, then we should have
483 * looped back up to the write_out_data label. If there weren't
484 * any then journal_clean_data_list should have wiped the list
485 * clean by now, so check that it is in fact empty.
486 */
487 J_ASSERT (commit_transaction->t_sync_datalist == NULL);
488
489 jbd_debug (3, "JBD: commit phase 3\n");
490
491 /*
492 * Way to go: we have now written out all of the data for a
493 * transaction! Now comes the tricky part: we need to write out
494 * metadata. Loop over the transaction's entire buffer list:
495 */
496 commit_transaction->t_state = T_COMMIT;
497
498 descriptor = NULL;
499 bufs = 0;
500 while (commit_transaction->t_buffers) {
501
502 /* Find the next buffer to be journaled... */
503
504 jh = commit_transaction->t_buffers;
505
506 /* If we're in abort mode, we just un-journal the buffer and
507 release it for background writing. */
508
509 if (is_journal_aborted(journal)) {
510 JBUFFER_TRACE(jh, "journal is aborting: refile");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700511 jbd2_journal_refile_buffer(journal, jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700512 /* If that was the last one, we need to clean up
513 * any descriptor buffers which may have been
514 * already allocated, even if we are now
515 * aborting. */
516 if (!commit_transaction->t_buffers)
517 goto start_journal_io;
518 continue;
519 }
520
521 /* Make sure we have a descriptor block in which to
522 record the metadata buffer. */
523
524 if (!descriptor) {
525 struct buffer_head *bh;
526
527 J_ASSERT (bufs == 0);
528
529 jbd_debug(4, "JBD: get descriptor\n");
530
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700531 descriptor = jbd2_journal_get_descriptor_buffer(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700532 if (!descriptor) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700533 __jbd2_journal_abort_hard(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700534 continue;
535 }
536
537 bh = jh2bh(descriptor);
538 jbd_debug(4, "JBD: got buffer %llu (%p)\n",
539 (unsigned long long)bh->b_blocknr, bh->b_data);
540 header = (journal_header_t *)&bh->b_data[0];
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700541 header->h_magic = cpu_to_be32(JBD2_MAGIC_NUMBER);
542 header->h_blocktype = cpu_to_be32(JBD2_DESCRIPTOR_BLOCK);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700543 header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
544
545 tagp = &bh->b_data[sizeof(journal_header_t)];
546 space_left = bh->b_size - sizeof(journal_header_t);
547 first_tag = 1;
548 set_buffer_jwrite(bh);
549 set_buffer_dirty(bh);
550 wbuf[bufs++] = bh;
551
552 /* Record it so that we can wait for IO
553 completion later */
554 BUFFER_TRACE(bh, "ph3: file as descriptor");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700555 jbd2_journal_file_buffer(descriptor, commit_transaction,
Dave Kleikamp470decc2006-10-11 01:20:57 -0700556 BJ_LogCtl);
557 }
558
559 /* Where is the buffer to be written? */
560
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700561 err = jbd2_journal_next_log_block(journal, &blocknr);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700562 /* If the block mapping failed, just abandon the buffer
563 and repeat this loop: we'll fall into the
564 refile-on-abort condition above. */
565 if (err) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700566 __jbd2_journal_abort_hard(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700567 continue;
568 }
569
570 /*
571 * start_this_handle() uses t_outstanding_credits to determine
572 * the free space in the log, but this counter is changed
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700573 * by jbd2_journal_next_log_block() also.
Dave Kleikamp470decc2006-10-11 01:20:57 -0700574 */
575 commit_transaction->t_outstanding_credits--;
576
577 /* Bump b_count to prevent truncate from stumbling over
578 the shadowed buffer! @@@ This can go if we ever get
579 rid of the BJ_IO/BJ_Shadow pairing of buffers. */
580 atomic_inc(&jh2bh(jh)->b_count);
581
582 /* Make a temporary IO buffer with which to write it out
583 (this will requeue both the metadata buffer and the
584 temporary IO buffer). new_bh goes on BJ_IO*/
585
586 set_bit(BH_JWrite, &jh2bh(jh)->b_state);
587 /*
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700588 * akpm: jbd2_journal_write_metadata_buffer() sets
Dave Kleikamp470decc2006-10-11 01:20:57 -0700589 * new_bh->b_transaction to commit_transaction.
590 * We need to clean this up before we release new_bh
591 * (which is of type BJ_IO)
592 */
593 JBUFFER_TRACE(jh, "ph3: write metadata");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700594 flags = jbd2_journal_write_metadata_buffer(commit_transaction,
Dave Kleikamp470decc2006-10-11 01:20:57 -0700595 jh, &new_jh, blocknr);
596 set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
597 wbuf[bufs++] = jh2bh(new_jh);
598
599 /* Record the new block's tag in the current descriptor
600 buffer */
601
602 tag_flag = 0;
603 if (flags & 1)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700604 tag_flag |= JBD2_FLAG_ESCAPE;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700605 if (!first_tag)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700606 tag_flag |= JBD2_FLAG_SAME_UUID;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700607
608 tag = (journal_block_tag_t *) tagp;
Zach Brownb517bea2006-10-11 01:21:08 -0700609 write_tag_block(tag_bytes, tag, jh2bh(jh)->b_blocknr);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700610 tag->t_flags = cpu_to_be32(tag_flag);
Zach Brownb517bea2006-10-11 01:21:08 -0700611 tagp += tag_bytes;
612 space_left -= tag_bytes;
Dave Kleikamp470decc2006-10-11 01:20:57 -0700613
614 if (first_tag) {
615 memcpy (tagp, journal->j_uuid, 16);
616 tagp += 16;
617 space_left -= 16;
618 first_tag = 0;
619 }
620
621 /* If there's no more to do, or if the descriptor is full,
622 let the IO rip! */
623
624 if (bufs == journal->j_wbufsize ||
625 commit_transaction->t_buffers == NULL ||
Zach Brownb517bea2006-10-11 01:21:08 -0700626 space_left < tag_bytes + 16) {
Dave Kleikamp470decc2006-10-11 01:20:57 -0700627
628 jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
629
630 /* Write an end-of-descriptor marker before
631 submitting the IOs. "tag" still points to
632 the last tag we set up. */
633
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700634 tag->t_flags |= cpu_to_be32(JBD2_FLAG_LAST_TAG);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700635
	/*
	 * This label is also reached by goto from the abort path at the
	 * top of the loop, which skips the last-tag marker and only
	 * submits whatever is already queued in wbuf[].
	 */
636start_journal_io:
637 for (i = 0; i < bufs; i++) {
638 struct buffer_head *bh = wbuf[i];
639 lock_buffer(bh);
640 clear_buffer_dirty(bh);
641 set_buffer_uptodate(bh);
642 bh->b_end_io = journal_end_buffer_io_sync;
643 submit_bh(WRITE, bh);
644 }
645 cond_resched();
646
647 /* Force a new descriptor to be generated next
648 time round the loop. */
649 descriptor = NULL;
650 bufs = 0;
651 }
652 }
653
654 /* Lo and behold: we have just managed to send a transaction to
655 the log. Before we can commit it, wait for the IO so far to
656 complete. Control buffers being written are on the
657 transaction's t_log_list queue, and metadata buffers are on
658 the t_iobuf_list queue.
659
660 Wait for the buffers in reverse order. That way we are
661 less likely to be woken up until all IOs have completed, and
662 so we incur less scheduling load.
663 */
664
665 jbd_debug(3, "JBD: commit phase 4\n");
666
667 /*
668 * akpm: these are BJ_IO, and j_list_lock is not needed.
669 * See __journal_try_to_free_buffer.
670 */
671wait_for_iobuf:
672 while (commit_transaction->t_iobuf_list != NULL) {
673 struct buffer_head *bh;
674
675 jh = commit_transaction->t_iobuf_list->b_tprev;
676 bh = jh2bh(jh);
677 if (buffer_locked(bh)) {
678 wait_on_buffer(bh);
679 goto wait_for_iobuf;
680 }
681 if (cond_resched())
682 goto wait_for_iobuf;
683
684 if (unlikely(!buffer_uptodate(bh)))
685 err = -EIO;
686
687 clear_buffer_jwrite(bh);
688
689 JBUFFER_TRACE(jh, "ph4: unfile after journal write");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700690 jbd2_journal_unfile_buffer(journal, jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700691
692 /*
693 * ->t_iobuf_list should contain only dummy buffer_heads
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700694 * which were created by jbd2_journal_write_metadata_buffer().
Dave Kleikamp470decc2006-10-11 01:20:57 -0700695 */
696 BUFFER_TRACE(bh, "dumping temporary bh");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700697 jbd2_journal_put_journal_head(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700698 __brelse(bh);
699 J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
700 free_buffer_head(bh);
701
702 /* We also have to unlock and free the corresponding
703 shadowed buffer */
704 jh = commit_transaction->t_shadow_list->b_tprev;
705 bh = jh2bh(jh);
706 clear_bit(BH_JWrite, &bh->b_state);
707 J_ASSERT_BH(bh, buffer_jbddirty(bh));
708
709 /* The metadata is now released for reuse, but we need
710 to remember it against this transaction so that when
711 we finally commit, we can do any checkpointing
712 required. */
713 JBUFFER_TRACE(jh, "file as BJ_Forget");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700714 jbd2_journal_file_buffer(jh, commit_transaction, BJ_Forget);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700715 /* Wake up any transactions which were waiting for this
716 IO to complete */
717 wake_up_bit(&bh->b_state, BH_Unshadow);
718 JBUFFER_TRACE(jh, "brelse shadowed buffer");
719 __brelse(bh);
720 }
721
722 J_ASSERT (commit_transaction->t_shadow_list == NULL);
723
724 jbd_debug(3, "JBD: commit phase 5\n");
725
726 /* Here we wait for the revoke record and descriptor record buffers */
727 wait_for_ctlbuf:
728 while (commit_transaction->t_log_list != NULL) {
729 struct buffer_head *bh;
730
731 jh = commit_transaction->t_log_list->b_tprev;
732 bh = jh2bh(jh);
733 if (buffer_locked(bh)) {
734 wait_on_buffer(bh);
735 goto wait_for_ctlbuf;
736 }
737 if (cond_resched())
738 goto wait_for_ctlbuf;
739
740 if (unlikely(!buffer_uptodate(bh)))
741 err = -EIO;
742
743 BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
744 clear_buffer_jwrite(bh);
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700745 jbd2_journal_unfile_buffer(journal, jh);
746 jbd2_journal_put_journal_head(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700747 __brelse(bh); /* One for getblk */
748 /* AKPM: bforget here */
749 }
750
751 jbd_debug(3, "JBD: commit phase 6\n");
752
753 if (journal_write_commit_record(journal, commit_transaction))
754 err = -EIO;
755
756 if (err)
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700757 __jbd2_journal_abort_hard(journal);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700758
759 /* End of a transaction! Finally, we can do checkpoint
760 processing: any buffers committed as a result of this
761 transaction can be removed from any checkpoint list it was on
762 before. */
763
764 jbd_debug(3, "JBD: commit phase 7\n");
765
766 J_ASSERT(commit_transaction->t_sync_datalist == NULL);
767 J_ASSERT(commit_transaction->t_buffers == NULL);
768 J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
769 J_ASSERT(commit_transaction->t_iobuf_list == NULL);
770 J_ASSERT(commit_transaction->t_shadow_list == NULL);
771 J_ASSERT(commit_transaction->t_log_list == NULL);
772
773restart_loop:
774 /*
775 * As there are other places (journal_unmap_buffer()) adding buffers
776 * to this list we have to be careful and hold the j_list_lock.
777 */
778 spin_lock(&journal->j_list_lock);
779 while (commit_transaction->t_forget) {
780 transaction_t *cp_transaction;
781 struct buffer_head *bh;
782
783 jh = commit_transaction->t_forget;
784 spin_unlock(&journal->j_list_lock);
785 bh = jh2bh(jh);
786 jbd_lock_bh_state(bh);
787 J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
788 jh->b_transaction == journal->j_running_transaction);
789
790 /*
791 * If there is undo-protected committed data against
792 * this buffer, then we can remove it now. If it is a
793 * buffer needing such protection, the old frozen_data
794 * field now points to a committed version of the
795 * buffer, so rotate that field to the new committed
796 * data.
797 *
798 * Otherwise, we can just throw away the frozen data now.
799 */
800 if (jh->b_committed_data) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700801 jbd2_slab_free(jh->b_committed_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700802 jh->b_committed_data = NULL;
803 if (jh->b_frozen_data) {
804 jh->b_committed_data = jh->b_frozen_data;
805 jh->b_frozen_data = NULL;
806 }
807 } else if (jh->b_frozen_data) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700808 jbd2_slab_free(jh->b_frozen_data, bh->b_size);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700809 jh->b_frozen_data = NULL;
810 }
811
812 spin_lock(&journal->j_list_lock);
813 cp_transaction = jh->b_cp_transaction;
814 if (cp_transaction) {
815 JBUFFER_TRACE(jh, "remove from old cp transaction");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700816 __jbd2_journal_remove_checkpoint(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700817 }
818
819 /* Only re-checkpoint the buffer_head if it is marked
820 * dirty. If the buffer was added to the BJ_Forget list
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700821 * by jbd2_journal_forget, it may no longer be dirty and
Dave Kleikamp470decc2006-10-11 01:20:57 -0700822 * there's no point in keeping a checkpoint record for
823 * it. */
824
825 /* A buffer which has been freed while still being
826 * journaled by a previous transaction may end up still
827 * being dirty here, but we want to avoid writing back
828 * that buffer in the future now that the last use has
829 * been committed. That's not only a performance gain,
830 * it also stops aliasing problems if the buffer is left
831 * behind for writeback and gets reallocated for another
832 * use in a different page. */
833 if (buffer_freed(bh)) {
834 clear_buffer_freed(bh);
835 clear_buffer_jbddirty(bh);
836 }
837
838 if (buffer_jbddirty(bh)) {
839 JBUFFER_TRACE(jh, "add to new checkpointing trans");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700840 __jbd2_journal_insert_checkpoint(jh, commit_transaction);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700841 JBUFFER_TRACE(jh, "refile for checkpoint writeback");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700842 __jbd2_journal_refile_buffer(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700843 jbd_unlock_bh_state(bh);
844 } else {
845 J_ASSERT_BH(bh, !buffer_dirty(bh));
846 /* The buffer on BJ_Forget list and not jbddirty means
847 * it has been freed by this transaction and hence it
848 * could not have been reallocated until this
849 * transaction has committed. *BUT* it could be
850 * reallocated once we have written all the data to
851 * disk and before we process the buffer on BJ_Forget
852 * list. */
853 JBUFFER_TRACE(jh, "refile or unfile freed buffer");
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700854 __jbd2_journal_refile_buffer(jh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700855 if (!jh->b_transaction) {
856 jbd_unlock_bh_state(bh);
857 /* needs a brelse */
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700858 jbd2_journal_remove_journal_head(bh);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700859 release_buffer_page(bh);
860 } else
861 jbd_unlock_bh_state(bh);
862 }
863 cond_resched_lock(&journal->j_list_lock);
864 }
865 spin_unlock(&journal->j_list_lock);
866 /*
867 * This is a bit sleazy. We borrow j_list_lock to protect
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700868 * journal->j_committing_transaction in __jbd2_journal_remove_checkpoint.
869 * Really, __jbd2_journal_remove_checkpoint should be using j_state_lock but
870 * it's a bit hassle to hold that across __jbd2_journal_remove_checkpoint
Dave Kleikamp470decc2006-10-11 01:20:57 -0700871 */
872 spin_lock(&journal->j_state_lock);
873 spin_lock(&journal->j_list_lock);
874 /*
875 * Now recheck if some buffers did not get attached to the transaction
876 * while the lock was dropped...
877 */
878 if (commit_transaction->t_forget) {
879 spin_unlock(&journal->j_list_lock);
880 spin_unlock(&journal->j_state_lock);
881 goto restart_loop;
882 }
883
884 /* Done with this transaction! */
885
886 jbd_debug(3, "JBD: commit phase 8\n");
887
888 J_ASSERT(commit_transaction->t_state == T_COMMIT);
889
890 commit_transaction->t_state = T_FINISHED;
891 J_ASSERT(commit_transaction == journal->j_committing_transaction);
892 journal->j_commit_sequence = commit_transaction->t_tid;
893 journal->j_committing_transaction = NULL;
894 spin_unlock(&journal->j_state_lock);
895
	/*
	 * j_list_lock is still held here. A transaction with nothing left
	 * to checkpoint is dropped right away; otherwise it is linked into
	 * the circular t_cpnext/t_cpprev list headed by
	 * j_checkpoint_transactions, just before the current head.
	 */
896 if (commit_transaction->t_checkpoint_list == NULL) {
Mingming Caof7f4bcc2006-10-11 01:20:59 -0700897 __jbd2_journal_drop_transaction(journal, commit_transaction);
Dave Kleikamp470decc2006-10-11 01:20:57 -0700898 } else {
899 if (journal->j_checkpoint_transactions == NULL) {
900 journal->j_checkpoint_transactions = commit_transaction;
901 commit_transaction->t_cpnext = commit_transaction;
902 commit_transaction->t_cpprev = commit_transaction;
903 } else {
904 commit_transaction->t_cpnext =
905 journal->j_checkpoint_transactions;
906 commit_transaction->t_cpprev =
907 commit_transaction->t_cpnext->t_cpprev;
908 commit_transaction->t_cpnext->t_cpprev =
909 commit_transaction;
910 commit_transaction->t_cpprev->t_cpnext =
911 commit_transaction;
912 }
913 }
914 spin_unlock(&journal->j_list_lock);
915
916 jbd_debug(1, "JBD: commit %d complete, head %d\n",
917 journal->j_commit_sequence, journal->j_tail_sequence);
918
919 wake_up(&journal->j_wait_done_commit);
920}