Blame - fs/jbd/commit.c - kernel/msm-4.9

blob: 5a8ca61498caf36cbb818fddc00eafbca2642f03 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Uwe Zeisberger	f30c226	2006-10-03 23:01:26 +0200	[diff] [blame]	2	* linux/fs/jbd/commit.c
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	23
				24	/*
				25	* Default IO end handler for temporary BJ_IO buffer_heads.
				26	*/
				27	static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				28	{
				29	BUFFER_TRACE(bh, "");
				30	if (uptodate)
				31	set_buffer_uptodate(bh);
				32	else
				33	clear_buffer_uptodate(bh);
				34	unlock_buffer(bh);
				35	}
				36
				37	/*
				38	* When an ext3-ordered file is truncated, it is possible that many pages are
				39	* not sucessfully freed, because they are attached to a committing transaction.
				40	* After the transaction commits, these pages are left on the LRU, with no
				41	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				42	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				43	* the numbers in /proc/meminfo look odd.
				44	*
				45	* So here, we have a buffer which has just come off the forget list. Look to
				46	* see if we can strip all buffers from the backing page.
				47	*
				48	* Called under lock_journal(), and possibly under journal_datalist_lock. The
				49	* caller provided us with a ref against the buffer, and we drop that here.
				50	*/
				51	static void release_buffer_page(struct buffer_head *bh)
				52	{
				53	struct page *page;
				54
				55	if (buffer_dirty(bh))
				56	goto nope;
				57	if (atomic_read(&bh->b_count) != 1)
				58	goto nope;
				59	page = bh->b_page;
				60	if (!page)
				61	goto nope;
				62	if (page->mapping)
				63	goto nope;
				64
				65	/* OK, it's a truncated page */
				66	if (TestSetPageLocked(page))
				67	goto nope;
				68
				69	page_cache_get(page);
				70	__brelse(bh);
				71	try_to_free_buffers(page);
				72	unlock_page(page);
				73	page_cache_release(page);
				74	return;
				75
				76	nope:
				77	__brelse(bh);
				78	}
				79
				80	/*
				81	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
				82	* held. For ranking reasons we must trylock. If we lose, schedule away and
				83	* return 0. j_list_lock is dropped in this case.
				84	*/
				85	static int inverted_lock(journal_t journal, struct buffer_head bh)
				86	{
				87	if (!jbd_trylock_bh_state(bh)) {
				88	spin_unlock(&journal->j_list_lock);
				89	schedule();
				90	return 0;
				91	}
				92	return 1;
				93	}
				94
				95	/* Done it all: now write the commit record. We should have
				96	* cleaned up our previous buffers by now, so if we are in abort
				97	* mode we can now just skip the rest of the journal write
				98	* entirely.
				99	*
				100	* Returns 1 if the journal needs to be aborted or 0 on success
				101	*/
				102	static int journal_write_commit_record(journal_t *journal,
				103	transaction_t *commit_transaction)
				104	{
				105	struct journal_head *descriptor;
				106	struct buffer_head *bh;
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	107	journal_header_t *header;
				108	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	109	int barrier_done = 0;
				110
				111	if (is_journal_aborted(journal))
				112	return 0;
				113
				114	descriptor = journal_get_descriptor_buffer(journal);
				115	if (!descriptor)
				116	return 1;
				117
				118	bh = jh2bh(descriptor);
				119
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	120	header = (journal_header_t *)(bh->b_data);
				121	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				122	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
				123	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	124
				125	JBUFFER_TRACE(descriptor, "write commit block");
				126	set_buffer_dirty(bh);
				127	if (journal->j_flags & JFS_BARRIER) {
				128	set_buffer_ordered(bh);
				129	barrier_done = 1;
				130	}
				131	ret = sync_dirty_buffer(bh);
Neil Brown	28ae094	2008-02-08 04:22:13 -0800	[diff] [blame]	132	if (barrier_done)
				133	clear_buffer_ordered(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	134	/* is it possible for another commit to fail at roughly
				135	* the same time as this one? If so, we don't want to
				136	* trust the barrier flag in the super, but instead want
				137	* to remember if we sent a barrier request
				138	*/
				139	if (ret == -EOPNOTSUPP && barrier_done) {
				140	char b[BDEVNAME_SIZE];
				141
				142	printk(KERN_WARNING
				143	"JBD: barrier-based sync failed on %s - "
				144	"disabling barriers\n",
				145	bdevname(journal->j_dev, b));
				146	spin_lock(&journal->j_state_lock);
				147	journal->j_flags &= ~JFS_BARRIER;
				148	spin_unlock(&journal->j_state_lock);
				149
				150	/* And try again, without the barrier */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	151	set_buffer_uptodate(bh);
				152	set_buffer_dirty(bh);
				153	ret = sync_dirty_buffer(bh);
				154	}
				155	put_bh(bh); /* One for getblk() */
				156	journal_put_journal_head(descriptor);
				157
				158	return (ret == -EIO);
				159	}
				160
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	161	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
				162	{
				163	int i;
				164
				165	for (i = 0; i < bufs; i++) {
				166	wbuf[i]->b_end_io = end_buffer_write_sync;
				167	/* We use-up our safety reference in submit_bh() */
				168	submit_bh(WRITE, wbuf[i]);
				169	}
				170	}
				171
				172	/*
				173	* Submit all the data buffers to disk
				174	*/
				175	static void journal_submit_data_buffers(journal_t *journal,
				176	transaction_t *commit_transaction)
				177	{
				178	struct journal_head *jh;
				179	struct buffer_head *bh;
				180	int locked;
				181	int bufs = 0;
				182	struct buffer_head **wbuf = journal->j_wbuf;
				183
				184	/*
				185	* Whenever we unlock the journal and sleep, things can get added
				186	* onto ->t_sync_datalist, so we have to keep looping back to
				187	* write_out_data until we know that the list is empty.
				188	*
				189	* Cleanup any flushed data buffers from the data list. Even in
				190	* abort mode, we want to flush this out as soon as possible.
				191	*/
				192	write_out_data:
				193	cond_resched();
				194	spin_lock(&journal->j_list_lock);
				195
				196	while (commit_transaction->t_sync_datalist) {
				197	jh = commit_transaction->t_sync_datalist;
				198	bh = jh2bh(jh);
				199	locked = 0;
				200
				201	/* Get reference just to make sure buffer does not disappear
				202	* when we are forced to drop various locks */
				203	get_bh(bh);
				204	/* If the buffer is dirty, we need to submit IO and hence
				205	* we need the buffer lock. We try to lock the buffer without
				206	* blocking. If we fail, we need to drop j_list_lock and do
				207	* blocking lock_buffer().
				208	*/
				209	if (buffer_dirty(bh)) {
				210	if (test_set_buffer_locked(bh)) {
				211	BUFFER_TRACE(bh, "needs blocking lock");
				212	spin_unlock(&journal->j_list_lock);
				213	/* Write out all data to prevent deadlocks */
				214	journal_do_submit_data(wbuf, bufs);
				215	bufs = 0;
				216	lock_buffer(bh);
				217	spin_lock(&journal->j_list_lock);
				218	}
				219	locked = 1;
				220	}
				221	/* We have to get bh_state lock. Again out of order, sigh. */
				222	if (!inverted_lock(journal, bh)) {
				223	jbd_lock_bh_state(bh);
				224	spin_lock(&journal->j_list_lock);
				225	}
				226	/* Someone already cleaned up the buffer? */
				227	if (!buffer_jbd(bh)
				228	\|\| jh->b_transaction != commit_transaction
				229	\|\| jh->b_jlist != BJ_SyncData) {
				230	jbd_unlock_bh_state(bh);
				231	if (locked)
				232	unlock_buffer(bh);
				233	BUFFER_TRACE(bh, "already cleaned up");
				234	put_bh(bh);
				235	continue;
				236	}
				237	if (locked && test_clear_buffer_dirty(bh)) {
				238	BUFFER_TRACE(bh, "needs writeout, adding to array");
				239	wbuf[bufs++] = bh;
				240	__journal_file_buffer(jh, commit_transaction,
				241	BJ_Locked);
				242	jbd_unlock_bh_state(bh);
				243	if (bufs == journal->j_wbufsize) {
				244	spin_unlock(&journal->j_list_lock);
				245	journal_do_submit_data(wbuf, bufs);
				246	bufs = 0;
				247	goto write_out_data;
				248	}
Hisashi Hifumi	6f5a9da	2006-12-22 01:11:50 -0800	[diff] [blame]	249	} else if (!locked && buffer_locked(bh)) {
				250	__journal_file_buffer(jh, commit_transaction,
				251	BJ_Locked);
				252	jbd_unlock_bh_state(bh);
				253	put_bh(bh);
				254	} else {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	255	BUFFER_TRACE(bh, "writeout complete: unfile");
				256	__journal_unfile_buffer(jh);
				257	jbd_unlock_bh_state(bh);
				258	if (locked)
				259	unlock_buffer(bh);
				260	journal_remove_journal_head(bh);
				261	/* Once for our safety reference, once for
				262	* journal_remove_journal_head() */
				263	put_bh(bh);
				264	put_bh(bh);
				265	}
				266
Nick Piggin	95c354f	2008-01-30 13:31:20 +0100	[diff] [blame]	267	if (need_resched() \|\| spin_needbreak(&journal->j_list_lock)) {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	268	spin_unlock(&journal->j_list_lock);
				269	goto write_out_data;
				270	}
				271	}
				272	spin_unlock(&journal->j_list_lock);
				273	journal_do_submit_data(wbuf, bufs);
				274	}
				275
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	276	/*
				277	* journal_commit_transaction
				278	*
				279	* The primary function for committing a transaction to the log. This
				280	* function is called by the journal thread to begin a complete commit.
				281	*/
				282	void journal_commit_transaction(journal_t *journal)
				283	{
				284	transaction_t *commit_transaction;
				285	struct journal_head jh, new_jh, *descriptor;
				286	struct buffer_head **wbuf = journal->j_wbuf;
				287	int bufs;
				288	int flags;
				289	int err;
				290	unsigned long blocknr;
				291	char *tagp = NULL;
				292	journal_header_t *header;
				293	journal_block_tag_t *tag = NULL;
				294	int space_left = 0;
				295	int first_tag = 0;
				296	int tag_flag;
				297	int i;
				298
				299	/*
				300	* First job: lock down the current transaction and wait for
				301	* all outstanding updates to complete.
				302	*/
				303
				304	#ifdef COMMIT_STATS
				305	spin_lock(&journal->j_list_lock);
				306	summarise_journal_usage(journal);
				307	spin_unlock(&journal->j_list_lock);
				308	#endif
				309
				310	/* Do we need to erase the effects of a prior journal_flush? */
				311	if (journal->j_flags & JFS_FLUSHED) {
				312	jbd_debug(3, "super block updated\n");
				313	journal_update_superblock(journal, 1);
				314	} else {
				315	jbd_debug(3, "superblock not updated\n");
				316	}
				317
				318	J_ASSERT(journal->j_running_transaction != NULL);
				319	J_ASSERT(journal->j_committing_transaction == NULL);
				320
				321	commit_transaction = journal->j_running_transaction;
				322	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				323
				324	jbd_debug(1, "JBD: starting commit of transaction %d\n",
				325	commit_transaction->t_tid);
				326
				327	spin_lock(&journal->j_state_lock);
				328	commit_transaction->t_state = T_LOCKED;
				329
				330	spin_lock(&commit_transaction->t_handle_lock);
				331	while (commit_transaction->t_updates) {
				332	DEFINE_WAIT(wait);
				333
				334	prepare_to_wait(&journal->j_wait_updates, &wait,
				335	TASK_UNINTERRUPTIBLE);
				336	if (commit_transaction->t_updates) {
				337	spin_unlock(&commit_transaction->t_handle_lock);
				338	spin_unlock(&journal->j_state_lock);
				339	schedule();
				340	spin_lock(&journal->j_state_lock);
				341	spin_lock(&commit_transaction->t_handle_lock);
				342	}
				343	finish_wait(&journal->j_wait_updates, &wait);
				344	}
				345	spin_unlock(&commit_transaction->t_handle_lock);
				346
				347	J_ASSERT (commit_transaction->t_outstanding_credits <=
				348	journal->j_max_transaction_buffers);
				349
				350	/*
				351	* First thing we are allowed to do is to discard any remaining
				352	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				353	* that there are no such buffers: if a large filesystem
				354	* operation like a truncate needs to split itself over multiple
				355	* transactions, then it may try to do a journal_restart() while
				356	* there are still BJ_Reserved buffers outstanding. These must
				357	* be released cleanly from the current transaction.
				358	*
				359	* In this case, the filesystem must still reserve write access
				360	* again before modifying the buffer in the new transaction, but
				361	* we do not require it to remember exactly which old buffers it
				362	* has reserved. This is consistent with the existing behaviour
				363	* that multiple journal_get_write_access() calls to the same
				364	* buffer are perfectly permissable.
				365	*/
				366	while (commit_transaction->t_reserved_list) {
				367	jh = commit_transaction->t_reserved_list;
				368	JBUFFER_TRACE(jh, "reserved, unused: refile");
				369	/*
				370	* A journal_get_undo_access()+journal_release_buffer() may
				371	* leave undo-committed data.
				372	*/
				373	if (jh->b_committed_data) {
				374	struct buffer_head *bh = jh2bh(jh);
				375
				376	jbd_lock_bh_state(bh);
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	377	jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhl	f99d49a	2005-11-07 01:01:34 -0800	[diff] [blame]	378	jh->b_committed_data = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	379	jbd_unlock_bh_state(bh);
				380	}
				381	journal_refile_buffer(journal, jh);
				382	}
				383
				384	/*
				385	* Now try to drop any written-back buffers from the journal's
				386	* checkpoint lists. We do this before commit because it potentially
				387	* frees some memory
				388	*/
				389	spin_lock(&journal->j_list_lock);
				390	__journal_clean_checkpoint_list(journal);
				391	spin_unlock(&journal->j_list_lock);
				392
				393	jbd_debug (3, "JBD: commit phase 1\n");
				394
				395	/*
				396	* Switch to a new revoke table.
				397	*/
				398	journal_switch_revoke_table(journal);
				399
				400	commit_transaction->t_state = T_FLUSH;
				401	journal->j_committing_transaction = commit_transaction;
				402	journal->j_running_transaction = NULL;
				403	commit_transaction->t_log_start = journal->j_head;
				404	wake_up(&journal->j_wait_transaction_locked);
				405	spin_unlock(&journal->j_state_lock);
				406
				407	jbd_debug (3, "JBD: commit phase 2\n");
				408
				409	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	410	* Now start flushing things to disk, in the order they appear
				411	* on the transaction lists. Data blocks go first.
				412	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	413	err = 0;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	414	journal_submit_data_buffers(journal, commit_transaction);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	415
				416	/*
				417	* Wait for all previously submitted IO to complete.
				418	*/
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	419	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	420	while (commit_transaction->t_locked_list) {
				421	struct buffer_head *bh;
				422
				423	jh = commit_transaction->t_locked_list->b_tprev;
				424	bh = jh2bh(jh);
				425	get_bh(bh);
				426	if (buffer_locked(bh)) {
				427	spin_unlock(&journal->j_list_lock);
				428	wait_on_buffer(bh);
				429	if (unlikely(!buffer_uptodate(bh)))
				430	err = -EIO;
				431	spin_lock(&journal->j_list_lock);
				432	}
				433	if (!inverted_lock(journal, bh)) {
				434	put_bh(bh);
				435	spin_lock(&journal->j_list_lock);
				436	continue;
				437	}
				438	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
				439	__journal_unfile_buffer(jh);
				440	jbd_unlock_bh_state(bh);
				441	journal_remove_journal_head(bh);
				442	put_bh(bh);
				443	} else {
				444	jbd_unlock_bh_state(bh);
				445	}
				446	put_bh(bh);
				447	cond_resched_lock(&journal->j_list_lock);
				448	}
				449	spin_unlock(&journal->j_list_lock);
				450
				451	if (err)
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	452	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	453
				454	journal_write_revoke_records(journal, commit_transaction);
				455
				456	jbd_debug(3, "JBD: commit phase 2\n");
				457
				458	/*
				459	* If we found any dirty or locked buffers, then we should have
				460	* looped back up to the write_out_data label. If there weren't
				461	* any then journal_clean_data_list should have wiped the list
				462	* clean by now, so check that it is in fact empty.
				463	*/
				464	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
				465
				466	jbd_debug (3, "JBD: commit phase 3\n");
				467
				468	/*
				469	* Way to go: we have now written out all of the data for a
				470	* transaction! Now comes the tricky part: we need to write out
				471	* metadata. Loop over the transaction's entire buffer list:
				472	*/
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame^]	473	spin_lock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	474	commit_transaction->t_state = T_COMMIT;
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame^]	475	spin_unlock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476
Josef Bacik	5b9a499	2008-04-28 02:16:12 -0700	[diff] [blame]	477	J_ASSERT(commit_transaction->t_nr_buffers <=
				478	commit_transaction->t_outstanding_credits);
				479
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	480	descriptor = NULL;
				481	bufs = 0;
				482	while (commit_transaction->t_buffers) {
				483
				484	/* Find the next buffer to be journaled... */
				485
				486	jh = commit_transaction->t_buffers;
				487
				488	/* If we're in abort mode, we just un-journal the buffer and
				489	release it for background writing. */
				490
				491	if (is_journal_aborted(journal)) {
				492	JBUFFER_TRACE(jh, "journal is aborting: refile");
				493	journal_refile_buffer(journal, jh);
				494	/* If that was the last one, we need to clean up
				495	* any descriptor buffers which may have been
				496	* already allocated, even if we are now
				497	* aborting. */
				498	if (!commit_transaction->t_buffers)
				499	goto start_journal_io;
				500	continue;
				501	}
				502
				503	/* Make sure we have a descriptor block in which to
				504	record the metadata buffer. */
				505
				506	if (!descriptor) {
				507	struct buffer_head *bh;
				508
				509	J_ASSERT (bufs == 0);
				510
				511	jbd_debug(4, "JBD: get descriptor\n");
				512
				513	descriptor = journal_get_descriptor_buffer(journal);
				514	if (!descriptor) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	515	journal_abort(journal, -EIO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	516	continue;
				517	}
				518
				519	bh = jh2bh(descriptor);
				520	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				521	(unsigned long long)bh->b_blocknr, bh->b_data);
				522	header = (journal_header_t *)&bh->b_data[0];
				523	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				524	header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
				525	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				526
				527	tagp = &bh->b_data[sizeof(journal_header_t)];
				528	space_left = bh->b_size - sizeof(journal_header_t);
				529	first_tag = 1;
				530	set_buffer_jwrite(bh);
				531	set_buffer_dirty(bh);
				532	wbuf[bufs++] = bh;
				533
				534	/* Record it so that we can wait for IO
				535	completion later */
				536	BUFFER_TRACE(bh, "ph3: file as descriptor");
				537	journal_file_buffer(descriptor, commit_transaction,
				538	BJ_LogCtl);
				539	}
				540
				541	/* Where is the buffer to be written? */
				542
				543	err = journal_next_log_block(journal, &blocknr);
				544	/* If the block mapping failed, just abandon the buffer
				545	and repeat this loop: we'll fall into the
				546	refile-on-abort condition above. */
				547	if (err) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	548	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	549	continue;
				550	}
				551
				552	/*
				553	* start_this_handle() uses t_outstanding_credits to determine
				554	* the free space in the log, but this counter is changed
				555	* by journal_next_log_block() also.
				556	*/
				557	commit_transaction->t_outstanding_credits--;
				558
				559	/* Bump b_count to prevent truncate from stumbling over
				560	the shadowed buffer! @@@ This can go if we ever get
				561	rid of the BJ_IO/BJ_Shadow pairing of buffers. */
				562	atomic_inc(&jh2bh(jh)->b_count);
				563
				564	/* Make a temporary IO buffer with which to write it out
				565	(this will requeue both the metadata buffer and the
				566	temporary IO buffer). new_bh goes on BJ_IO*/
				567
				568	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				569	/*
				570	* akpm: journal_write_metadata_buffer() sets
				571	* new_bh->b_transaction to commit_transaction.
				572	* We need to clean this up before we release new_bh
				573	* (which is of type BJ_IO)
				574	*/
				575	JBUFFER_TRACE(jh, "ph3: write metadata");
				576	flags = journal_write_metadata_buffer(commit_transaction,
				577	jh, &new_jh, blocknr);
				578	set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
				579	wbuf[bufs++] = jh2bh(new_jh);
				580
				581	/* Record the new block's tag in the current descriptor
				582	buffer */
				583
				584	tag_flag = 0;
				585	if (flags & 1)
				586	tag_flag \|= JFS_FLAG_ESCAPE;
				587	if (!first_tag)
				588	tag_flag \|= JFS_FLAG_SAME_UUID;
				589
				590	tag = (journal_block_tag_t *) tagp;
				591	tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
				592	tag->t_flags = cpu_to_be32(tag_flag);
				593	tagp += sizeof(journal_block_tag_t);
				594	space_left -= sizeof(journal_block_tag_t);
				595
				596	if (first_tag) {
				597	memcpy (tagp, journal->j_uuid, 16);
				598	tagp += 16;
				599	space_left -= 16;
				600	first_tag = 0;
				601	}
				602
				603	/* If there's no more to do, or if the descriptor is full,
				604	let the IO rip! */
				605
				606	if (bufs == journal->j_wbufsize \|\|
				607	commit_transaction->t_buffers == NULL \|\|
				608	space_left < sizeof(journal_block_tag_t) + 16) {
				609
				610	jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
				611
				612	/* Write an end-of-descriptor marker before
				613	submitting the IOs. "tag" still points to
				614	the last tag we set up. */
				615
				616	tag->t_flags \|= cpu_to_be32(JFS_FLAG_LAST_TAG);
				617
				618	start_journal_io:
				619	for (i = 0; i < bufs; i++) {
				620	struct buffer_head *bh = wbuf[i];
				621	lock_buffer(bh);
				622	clear_buffer_dirty(bh);
				623	set_buffer_uptodate(bh);
				624	bh->b_end_io = journal_end_buffer_io_sync;
				625	submit_bh(WRITE, bh);
				626	}
				627	cond_resched();
				628
				629	/* Force a new descriptor to be generated next
				630	time round the loop. */
				631	descriptor = NULL;
				632	bufs = 0;
				633	}
				634	}
				635
				636	/* Lo and behold: we have just managed to send a transaction to
				637	the log. Before we can commit it, wait for the IO so far to
				638	complete. Control buffers being written are on the
				639	transaction's t_log_list queue, and metadata buffers are on
				640	the t_iobuf_list queue.
				641
				642	Wait for the buffers in reverse order. That way we are
				643	less likely to be woken up until all IOs have completed, and
				644	so we incur less scheduling load.
				645	*/
				646
				647	jbd_debug(3, "JBD: commit phase 4\n");
				648
				649	/*
				650	* akpm: these are BJ_IO, and j_list_lock is not needed.
				651	* See __journal_try_to_free_buffer.
				652	*/
				653	wait_for_iobuf:
				654	while (commit_transaction->t_iobuf_list != NULL) {
				655	struct buffer_head *bh;
				656
				657	jh = commit_transaction->t_iobuf_list->b_tprev;
				658	bh = jh2bh(jh);
				659	if (buffer_locked(bh)) {
				660	wait_on_buffer(bh);
				661	goto wait_for_iobuf;
				662	}
				663	if (cond_resched())
				664	goto wait_for_iobuf;
				665
				666	if (unlikely(!buffer_uptodate(bh)))
				667	err = -EIO;
				668
				669	clear_buffer_jwrite(bh);
				670
				671	JBUFFER_TRACE(jh, "ph4: unfile after journal write");
				672	journal_unfile_buffer(journal, jh);
				673
				674	/*
				675	* ->t_iobuf_list should contain only dummy buffer_heads
				676	* which were created by journal_write_metadata_buffer().
				677	*/
				678	BUFFER_TRACE(bh, "dumping temporary bh");
				679	journal_put_journal_head(jh);
				680	__brelse(bh);
				681	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				682	free_buffer_head(bh);
				683
				684	/* We also have to unlock and free the corresponding
				685	shadowed buffer */
				686	jh = commit_transaction->t_shadow_list->b_tprev;
				687	bh = jh2bh(jh);
				688	clear_bit(BH_JWrite, &bh->b_state);
				689	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				690
				691	/* The metadata is now released for reuse, but we need
				692	to remember it against this transaction so that when
				693	we finally commit, we can do any checkpointing
				694	required. */
				695	JBUFFER_TRACE(jh, "file as BJ_Forget");
				696	journal_file_buffer(jh, commit_transaction, BJ_Forget);
				697	/* Wake up any transactions which were waiting for this
				698	IO to complete */
				699	wake_up_bit(&bh->b_state, BH_Unshadow);
				700	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				701	__brelse(bh);
				702	}
				703
				704	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				705
				706	jbd_debug(3, "JBD: commit phase 5\n");
				707
				708	/* Here we wait for the revoke record and descriptor record buffers */
				709	wait_for_ctlbuf:
				710	while (commit_transaction->t_log_list != NULL) {
				711	struct buffer_head *bh;
				712
				713	jh = commit_transaction->t_log_list->b_tprev;
				714	bh = jh2bh(jh);
				715	if (buffer_locked(bh)) {
				716	wait_on_buffer(bh);
				717	goto wait_for_ctlbuf;
				718	}
				719	if (cond_resched())
				720	goto wait_for_ctlbuf;
				721
				722	if (unlikely(!buffer_uptodate(bh)))
				723	err = -EIO;
				724
				725	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				726	clear_buffer_jwrite(bh);
				727	journal_unfile_buffer(journal, jh);
				728	journal_put_journal_head(jh);
				729	__brelse(bh); /* One for getblk */
				730	/* AKPM: bforget here */
				731	}
				732
				733	jbd_debug(3, "JBD: commit phase 6\n");
				734
				735	if (journal_write_commit_record(journal, commit_transaction))
				736	err = -EIO;
				737
				738	if (err)
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	739	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	740
				741	/* End of a transaction! Finally, we can do checkpoint
				742	processing: any buffers committed as a result of this
				743	transaction can be removed from any checkpoint list it was on
				744	before. */
				745
				746	jbd_debug(3, "JBD: commit phase 7\n");
				747
				748	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
				749	J_ASSERT(commit_transaction->t_buffers == NULL);
				750	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				751	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
				752	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				753	J_ASSERT(commit_transaction->t_log_list == NULL);
				754
				755	restart_loop:
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	756	/*
				757	* As there are other places (journal_unmap_buffer()) adding buffers
				758	* to this list we have to be careful and hold the j_list_lock.
				759	*/
				760	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	761	while (commit_transaction->t_forget) {
				762	transaction_t *cp_transaction;
				763	struct buffer_head *bh;
				764
				765	jh = commit_transaction->t_forget;
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	766	spin_unlock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	767	bh = jh2bh(jh);
				768	jbd_lock_bh_state(bh);
				769	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction \|\|
				770	jh->b_transaction == journal->j_running_transaction);
				771
				772	/*
				773	* If there is undo-protected committed data against
				774	* this buffer, then we can remove it now. If it is a
				775	* buffer needing such protection, the old frozen_data
				776	* field now points to a committed version of the
				777	* buffer, so rotate that field to the new committed
				778	* data.
				779	*
				780	* Otherwise, we can just throw away the frozen data now.
				781	*/
				782	if (jh->b_committed_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	783	jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	784	jh->b_committed_data = NULL;
				785	if (jh->b_frozen_data) {
				786	jh->b_committed_data = jh->b_frozen_data;
				787	jh->b_frozen_data = NULL;
				788	}
				789	} else if (jh->b_frozen_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	790	jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	791	jh->b_frozen_data = NULL;
				792	}
				793
				794	spin_lock(&journal->j_list_lock);
				795	cp_transaction = jh->b_cp_transaction;
				796	if (cp_transaction) {
				797	JBUFFER_TRACE(jh, "remove from old cp transaction");
				798	__journal_remove_checkpoint(jh);
				799	}
				800
				801	/* Only re-checkpoint the buffer_head if it is marked
				802	* dirty. If the buffer was added to the BJ_Forget list
				803	* by journal_forget, it may no longer be dirty and
				804	* there's no point in keeping a checkpoint record for
				805	* it. */
				806
				807	/* A buffer which has been freed while still being
				808	* journaled by a previous transaction may end up still
				809	* being dirty here, but we want to avoid writing back
				810	* that buffer in the future now that the last use has
				811	* been committed. That's not only a performance gain,
				812	* it also stops aliasing problems if the buffer is left
				813	* behind for writeback and gets reallocated for another
				814	* use in a different page. */
				815	if (buffer_freed(bh)) {
				816	clear_buffer_freed(bh);
				817	clear_buffer_jbddirty(bh);
				818	}
				819
				820	if (buffer_jbddirty(bh)) {
				821	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				822	__journal_insert_checkpoint(jh, commit_transaction);
				823	JBUFFER_TRACE(jh, "refile for checkpoint writeback");
				824	__journal_refile_buffer(jh);
				825	jbd_unlock_bh_state(bh);
				826	} else {
				827	J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara	9ada734	2006-06-23 02:05:25 -0700	[diff] [blame]	828	/* The buffer on BJ_Forget list and not jbddirty means
				829	* it has been freed by this transaction and hence it
				830	* could not have been reallocated until this
				831	* transaction has committed. BUT it could be
				832	* reallocated once we have written all the data to
				833	* disk and before we process the buffer on BJ_Forget
				834	* list. */
				835	JBUFFER_TRACE(jh, "refile or unfile freed buffer");
				836	__journal_refile_buffer(jh);
				837	if (!jh->b_transaction) {
				838	jbd_unlock_bh_state(bh);
				839	/* needs a brelse */
				840	journal_remove_journal_head(bh);
				841	release_buffer_page(bh);
				842	} else
				843	jbd_unlock_bh_state(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	844	}
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	845	cond_resched_lock(&journal->j_list_lock);
				846	}
				847	spin_unlock(&journal->j_list_lock);
				848	/*
Jan Kara	d4beaf4	2007-12-04 23:45:27 -0800	[diff] [blame]	849	* This is a bit sleazy. We use j_list_lock to protect transition
				850	* of a transaction into T_FINISHED state and calling
				851	* __journal_drop_transaction(). Otherwise we could race with
				852	* other checkpointing code processing the transaction...
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	853	*/
				854	spin_lock(&journal->j_state_lock);
				855	spin_lock(&journal->j_list_lock);
				856	/*
				857	* Now recheck if some buffers did not get attached to the transaction
				858	* while the lock was dropped...
				859	*/
				860	if (commit_transaction->t_forget) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	861	spin_unlock(&journal->j_list_lock);
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	862	spin_unlock(&journal->j_state_lock);
				863	goto restart_loop;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	864	}
				865
				866	/* Done with this transaction! */
				867
				868	jbd_debug(3, "JBD: commit phase 8\n");
				869
				870	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				871
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	872	commit_transaction->t_state = T_FINISHED;
				873	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				874	journal->j_commit_sequence = commit_transaction->t_tid;
				875	journal->j_committing_transaction = NULL;
				876	spin_unlock(&journal->j_state_lock);
				877
Jan Kara	fe28e42	2007-07-15 23:37:18 -0700	[diff] [blame]	878	if (commit_transaction->t_checkpoint_list == NULL &&
				879	commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	880	__journal_drop_transaction(journal, commit_transaction);
				881	} else {
				882	if (journal->j_checkpoint_transactions == NULL) {
				883	journal->j_checkpoint_transactions = commit_transaction;
				884	commit_transaction->t_cpnext = commit_transaction;
				885	commit_transaction->t_cpprev = commit_transaction;
				886	} else {
				887	commit_transaction->t_cpnext =
				888	journal->j_checkpoint_transactions;
				889	commit_transaction->t_cpprev =
				890	commit_transaction->t_cpnext->t_cpprev;
				891	commit_transaction->t_cpnext->t_cpprev =
				892	commit_transaction;
				893	commit_transaction->t_cpprev->t_cpnext =
				894	commit_transaction;
				895	}
				896	}
				897	spin_unlock(&journal->j_list_lock);
				898
				899	jbd_debug(1, "JBD: commit %d complete, head %d\n",
				900	journal->j_commit_sequence, journal->j_tail_sequence);
				901
				902	wake_up(&journal->j_wait_done_commit);
				903	}