Blame - fs/jbd/commit.c - kernel/msm-4.9

blob: f8077b9c898160513b1e958a532401653ac96123 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Uwe Zeisberger	f30c226	2006-10-03 23:01:26 +0200	[diff] [blame]	2	* linux/fs/jbd/commit.c
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	23	#include <linux/bio.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	24
				25	/*
				26	* Default IO end handler for temporary BJ_IO buffer_heads.
				27	*/
				28	static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				29	{
				30	BUFFER_TRACE(bh, "");
				31	if (uptodate)
				32	set_buffer_uptodate(bh);
				33	else
				34	clear_buffer_uptodate(bh);
				35	unlock_buffer(bh);
				36	}
				37
				38	/*
				39	* When an ext3-ordered file is truncated, it is possible that many pages are
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	40	* not successfully freed, because they are attached to a committing transaction.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	41	* After the transaction commits, these pages are left on the LRU, with no
				42	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				43	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				44	* the numbers in /proc/meminfo look odd.
				45	*
				46	* So here, we have a buffer which has just come off the forget list. Look to
				47	* see if we can strip all buffers from the backing page.
				48	*
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	49	* Called under journal->j_list_lock. The caller provided us with a ref
				50	* against the buffer, and we drop that here.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	51	*/
				52	static void release_buffer_page(struct buffer_head *bh)
				53	{
				54	struct page *page;
				55
				56	if (buffer_dirty(bh))
				57	goto nope;
				58	if (atomic_read(&bh->b_count) != 1)
				59	goto nope;
				60	page = bh->b_page;
				61	if (!page)
				62	goto nope;
				63	if (page->mapping)
				64	goto nope;
				65
				66	/* OK, it's a truncated page */
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	67	if (!trylock_page(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	68	goto nope;
				69
				70	page_cache_get(page);
				71	__brelse(bh);
				72	try_to_free_buffers(page);
				73	unlock_page(page);
				74	page_cache_release(page);
				75	return;
				76
				77	nope:
				78	__brelse(bh);
				79	}
				80
				81	/*
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	82	* Decrement reference counter for data buffer. If it has been marked
				83	* 'BH_Freed', release it and the page to which it belongs if possible.
				84	*/
				85	static void release_data_buffer(struct buffer_head *bh)
				86	{
				87	if (buffer_freed(bh)) {
				88	clear_buffer_freed(bh);
				89	release_buffer_page(bh);
				90	} else
				91	put_bh(bh);
				92	}
				93
				94	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	95	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
				96	* held. For ranking reasons we must trylock. If we lose, schedule away and
				97	* return 0. j_list_lock is dropped in this case.
				98	*/
				99	static int inverted_lock(journal_t journal, struct buffer_head bh)
				100	{
				101	if (!jbd_trylock_bh_state(bh)) {
				102	spin_unlock(&journal->j_list_lock);
				103	schedule();
				104	return 0;
				105	}
				106	return 1;
				107	}
				108
				109	/* Done it all: now write the commit record. We should have
				110	* cleaned up our previous buffers by now, so if we are in abort
				111	* mode we can now just skip the rest of the journal write
				112	* entirely.
				113	*
				114	* Returns 1 if the journal needs to be aborted or 0 on success
				115	*/
				116	static int journal_write_commit_record(journal_t *journal,
				117	transaction_t *commit_transaction)
				118	{
				119	struct journal_head *descriptor;
				120	struct buffer_head *bh;
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	121	journal_header_t *header;
				122	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	123	int barrier_done = 0;
				124
				125	if (is_journal_aborted(journal))
				126	return 0;
				127
				128	descriptor = journal_get_descriptor_buffer(journal);
				129	if (!descriptor)
				130	return 1;
				131
				132	bh = jh2bh(descriptor);
				133
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	134	header = (journal_header_t *)(bh->b_data);
				135	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				136	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
				137	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	138
				139	JBUFFER_TRACE(descriptor, "write commit block");
				140	set_buffer_dirty(bh);
				141	if (journal->j_flags & JFS_BARRIER) {
				142	set_buffer_ordered(bh);
				143	barrier_done = 1;
				144	}
				145	ret = sync_dirty_buffer(bh);
Neil Brown	28ae094	2008-02-08 04:22:13 -0800	[diff] [blame]	146	if (barrier_done)
				147	clear_buffer_ordered(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	148	/* is it possible for another commit to fail at roughly
				149	* the same time as this one? If so, we don't want to
				150	* trust the barrier flag in the super, but instead want
				151	* to remember if we sent a barrier request
				152	*/
				153	if (ret == -EOPNOTSUPP && barrier_done) {
				154	char b[BDEVNAME_SIZE];
				155
				156	printk(KERN_WARNING
				157	"JBD: barrier-based sync failed on %s - "
				158	"disabling barriers\n",
				159	bdevname(journal->j_dev, b));
				160	spin_lock(&journal->j_state_lock);
				161	journal->j_flags &= ~JFS_BARRIER;
				162	spin_unlock(&journal->j_state_lock);
				163
				164	/* And try again, without the barrier */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	165	set_buffer_uptodate(bh);
				166	set_buffer_dirty(bh);
				167	ret = sync_dirty_buffer(bh);
				168	}
				169	put_bh(bh); /* One for getblk() */
				170	journal_put_journal_head(descriptor);
				171
				172	return (ret == -EIO);
				173	}
				174
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	175	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs,
				176	int write_op)
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	177	{
				178	int i;
				179
				180	for (i = 0; i < bufs; i++) {
				181	wbuf[i]->b_end_io = end_buffer_write_sync;
				182	/* We use-up our safety reference in submit_bh() */
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	183	submit_bh(write_op, wbuf[i]);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	184	}
				185	}
				186
				187	/*
				188	* Submit all the data buffers to disk
				189	*/
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	190	static int journal_submit_data_buffers(journal_t *journal,
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	191	transaction_t *commit_transaction,
				192	int write_op)
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	193	{
				194	struct journal_head *jh;
				195	struct buffer_head *bh;
				196	int locked;
				197	int bufs = 0;
				198	struct buffer_head **wbuf = journal->j_wbuf;
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	199	int err = 0;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	200
				201	/*
				202	* Whenever we unlock the journal and sleep, things can get added
				203	* onto ->t_sync_datalist, so we have to keep looping back to
				204	* write_out_data until we know that the list is empty.
				205	*
				206	* Cleanup any flushed data buffers from the data list. Even in
				207	* abort mode, we want to flush this out as soon as possible.
				208	*/
				209	write_out_data:
				210	cond_resched();
				211	spin_lock(&journal->j_list_lock);
				212
				213	while (commit_transaction->t_sync_datalist) {
				214	jh = commit_transaction->t_sync_datalist;
				215	bh = jh2bh(jh);
				216	locked = 0;
				217
				218	/* Get reference just to make sure buffer does not disappear
				219	* when we are forced to drop various locks */
				220	get_bh(bh);
				221	/* If the buffer is dirty, we need to submit IO and hence
				222	* we need the buffer lock. We try to lock the buffer without
				223	* blocking. If we fail, we need to drop j_list_lock and do
				224	* blocking lock_buffer().
				225	*/
				226	if (buffer_dirty(bh)) {
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	227	if (!trylock_buffer(bh)) {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	228	BUFFER_TRACE(bh, "needs blocking lock");
				229	spin_unlock(&journal->j_list_lock);
				230	/* Write out all data to prevent deadlocks */
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	231	journal_do_submit_data(wbuf, bufs, write_op);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	232	bufs = 0;
				233	lock_buffer(bh);
				234	spin_lock(&journal->j_list_lock);
				235	}
				236	locked = 1;
				237	}
				238	/* We have to get bh_state lock. Again out of order, sigh. */
				239	if (!inverted_lock(journal, bh)) {
				240	jbd_lock_bh_state(bh);
				241	spin_lock(&journal->j_list_lock);
				242	}
				243	/* Someone already cleaned up the buffer? */
				244	if (!buffer_jbd(bh)
				245	\|\| jh->b_transaction != commit_transaction
				246	\|\| jh->b_jlist != BJ_SyncData) {
				247	jbd_unlock_bh_state(bh);
				248	if (locked)
				249	unlock_buffer(bh);
				250	BUFFER_TRACE(bh, "already cleaned up");
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	251	release_data_buffer(bh);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	252	continue;
				253	}
				254	if (locked && test_clear_buffer_dirty(bh)) {
				255	BUFFER_TRACE(bh, "needs writeout, adding to array");
				256	wbuf[bufs++] = bh;
				257	__journal_file_buffer(jh, commit_transaction,
				258	BJ_Locked);
				259	jbd_unlock_bh_state(bh);
				260	if (bufs == journal->j_wbufsize) {
				261	spin_unlock(&journal->j_list_lock);
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	262	journal_do_submit_data(wbuf, bufs, write_op);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	263	bufs = 0;
				264	goto write_out_data;
				265	}
Hisashi Hifumi	6f5a9da	2006-12-22 01:11:50 -0800	[diff] [blame]	266	} else if (!locked && buffer_locked(bh)) {
				267	__journal_file_buffer(jh, commit_transaction,
				268	BJ_Locked);
				269	jbd_unlock_bh_state(bh);
				270	put_bh(bh);
				271	} else {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	272	BUFFER_TRACE(bh, "writeout complete: unfile");
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	273	if (unlikely(!buffer_uptodate(bh)))
				274	err = -EIO;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	275	__journal_unfile_buffer(jh);
				276	jbd_unlock_bh_state(bh);
				277	if (locked)
				278	unlock_buffer(bh);
				279	journal_remove_journal_head(bh);
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	280	/* One for our safety reference, other for
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	281	* journal_remove_journal_head() */
				282	put_bh(bh);
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	283	release_data_buffer(bh);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	284	}
				285
Nick Piggin	95c354f	2008-01-30 13:31:20 +0100	[diff] [blame]	286	if (need_resched() \|\| spin_needbreak(&journal->j_list_lock)) {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	287	spin_unlock(&journal->j_list_lock);
				288	goto write_out_data;
				289	}
				290	}
				291	spin_unlock(&journal->j_list_lock);
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	292	journal_do_submit_data(wbuf, bufs, write_op);
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	293
				294	return err;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	295	}
				296
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	297	/*
				298	* journal_commit_transaction
				299	*
				300	* The primary function for committing a transaction to the log. This
				301	* function is called by the journal thread to begin a complete commit.
				302	*/
				303	void journal_commit_transaction(journal_t *journal)
				304	{
				305	transaction_t *commit_transaction;
				306	struct journal_head jh, new_jh, *descriptor;
				307	struct buffer_head **wbuf = journal->j_wbuf;
				308	int bufs;
				309	int flags;
				310	int err;
				311	unsigned long blocknr;
Josef Bacik	f420d4d	2009-01-07 18:07:24 -0800	[diff] [blame]	312	ktime_t start_time;
				313	u64 commit_time;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	314	char *tagp = NULL;
				315	journal_header_t *header;
				316	journal_block_tag_t *tag = NULL;
				317	int space_left = 0;
				318	int first_tag = 0;
				319	int tag_flag;
				320	int i;
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	321	int write_op = WRITE;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	322
				323	/*
				324	* First job: lock down the current transaction and wait for
				325	* all outstanding updates to complete.
				326	*/
				327
				328	#ifdef COMMIT_STATS
				329	spin_lock(&journal->j_list_lock);
				330	summarise_journal_usage(journal);
				331	spin_unlock(&journal->j_list_lock);
				332	#endif
				333
				334	/* Do we need to erase the effects of a prior journal_flush? */
				335	if (journal->j_flags & JFS_FLUSHED) {
				336	jbd_debug(3, "super block updated\n");
				337	journal_update_superblock(journal, 1);
				338	} else {
				339	jbd_debug(3, "superblock not updated\n");
				340	}
				341
				342	J_ASSERT(journal->j_running_transaction != NULL);
				343	J_ASSERT(journal->j_committing_transaction == NULL);
				344
				345	commit_transaction = journal->j_running_transaction;
				346	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				347
				348	jbd_debug(1, "JBD: starting commit of transaction %d\n",
				349	commit_transaction->t_tid);
				350
				351	spin_lock(&journal->j_state_lock);
				352	commit_transaction->t_state = T_LOCKED;
				353
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	354	if (commit_transaction->t_synchronous_commit)
				355	write_op = WRITE_SYNC;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	356	spin_lock(&commit_transaction->t_handle_lock);
				357	while (commit_transaction->t_updates) {
				358	DEFINE_WAIT(wait);
				359
				360	prepare_to_wait(&journal->j_wait_updates, &wait,
				361	TASK_UNINTERRUPTIBLE);
				362	if (commit_transaction->t_updates) {
				363	spin_unlock(&commit_transaction->t_handle_lock);
				364	spin_unlock(&journal->j_state_lock);
				365	schedule();
				366	spin_lock(&journal->j_state_lock);
				367	spin_lock(&commit_transaction->t_handle_lock);
				368	}
				369	finish_wait(&journal->j_wait_updates, &wait);
				370	}
				371	spin_unlock(&commit_transaction->t_handle_lock);
				372
				373	J_ASSERT (commit_transaction->t_outstanding_credits <=
				374	journal->j_max_transaction_buffers);
				375
				376	/*
				377	* First thing we are allowed to do is to discard any remaining
				378	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				379	* that there are no such buffers: if a large filesystem
				380	* operation like a truncate needs to split itself over multiple
				381	* transactions, then it may try to do a journal_restart() while
				382	* there are still BJ_Reserved buffers outstanding. These must
				383	* be released cleanly from the current transaction.
				384	*
				385	* In this case, the filesystem must still reserve write access
				386	* again before modifying the buffer in the new transaction, but
				387	* we do not require it to remember exactly which old buffers it
				388	* has reserved. This is consistent with the existing behaviour
				389	* that multiple journal_get_write_access() calls to the same
				390	* buffer are perfectly permissible.
				391	*/
				392	while (commit_transaction->t_reserved_list) {
				393	jh = commit_transaction->t_reserved_list;
				394	JBUFFER_TRACE(jh, "reserved, unused: refile");
				395	/*
				396	* A journal_get_undo_access()+journal_release_buffer() may
				397	* leave undo-committed data.
				398	*/
				399	if (jh->b_committed_data) {
				400	struct buffer_head *bh = jh2bh(jh);
				401
				402	jbd_lock_bh_state(bh);
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	403	jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhl	f99d49a	2005-11-07 01:01:34 -0800	[diff] [blame]	404	jh->b_committed_data = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	405	jbd_unlock_bh_state(bh);
				406	}
				407	journal_refile_buffer(journal, jh);
				408	}
				409
				410	/*
				411	* Now try to drop any written-back buffers from the journal's
				412	* checkpoint lists. We do this before commit because it potentially
				413	* frees some memory
				414	*/
				415	spin_lock(&journal->j_list_lock);
				416	__journal_clean_checkpoint_list(journal);
				417	spin_unlock(&journal->j_list_lock);
				418
				419	jbd_debug (3, "JBD: commit phase 1\n");
				420
				421	/*
				422	* Switch to a new revoke table.
				423	*/
				424	journal_switch_revoke_table(journal);
				425
				426	commit_transaction->t_state = T_FLUSH;
				427	journal->j_committing_transaction = commit_transaction;
				428	journal->j_running_transaction = NULL;
Josef Bacik	f420d4d	2009-01-07 18:07:24 -0800	[diff] [blame]	429	start_time = ktime_get();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	430	commit_transaction->t_log_start = journal->j_head;
				431	wake_up(&journal->j_wait_transaction_locked);
				432	spin_unlock(&journal->j_state_lock);
				433
				434	jbd_debug (3, "JBD: commit phase 2\n");
				435
				436	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	* Now start flushing things to disk, in the order they appear
				438	* on the transaction lists. Data blocks go first.
				439	*/
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	440	err = journal_submit_data_buffers(journal, commit_transaction,
				441	write_op);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	442
				443	/*
				444	* Wait for all previously submitted IO to complete.
				445	*/
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	446	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	447	while (commit_transaction->t_locked_list) {
				448	struct buffer_head *bh;
				449
				450	jh = commit_transaction->t_locked_list->b_tprev;
				451	bh = jh2bh(jh);
				452	get_bh(bh);
				453	if (buffer_locked(bh)) {
				454	spin_unlock(&journal->j_list_lock);
				455	wait_on_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	456	spin_lock(&journal->j_list_lock);
				457	}
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	458	if (unlikely(!buffer_uptodate(bh))) {
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	459	if (!trylock_page(bh->b_page)) {
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	460	spin_unlock(&journal->j_list_lock);
				461	lock_page(bh->b_page);
				462	spin_lock(&journal->j_list_lock);
				463	}
				464	if (bh->b_page->mapping)
				465	set_bit(AS_EIO, &bh->b_page->mapping->flags);
				466
				467	unlock_page(bh->b_page);
				468	SetPageError(bh->b_page);
				469	err = -EIO;
				470	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	471	if (!inverted_lock(journal, bh)) {
				472	put_bh(bh);
				473	spin_lock(&journal->j_list_lock);
				474	continue;
				475	}
				476	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
				477	__journal_unfile_buffer(jh);
				478	jbd_unlock_bh_state(bh);
				479	journal_remove_journal_head(bh);
				480	put_bh(bh);
				481	} else {
				482	jbd_unlock_bh_state(bh);
				483	}
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	484	release_data_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	485	cond_resched_lock(&journal->j_list_lock);
				486	}
				487	spin_unlock(&journal->j_list_lock);
				488
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	489	if (err) {
				490	char b[BDEVNAME_SIZE];
				491
				492	printk(KERN_WARNING
				493	"JBD: Detected IO errors while flushing file data "
				494	"on %s\n", bdevname(journal->j_fs_dev, b));
Hidehiro Kawai	0e4fb5e	2008-10-18 20:27:57 -0700	[diff] [blame]	495	if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
				496	journal_abort(journal, err);
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	497	err = 0;
				498	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	499
				500	journal_write_revoke_records(journal, commit_transaction);
				501
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	502	/*
				503	* If we found any dirty or locked buffers, then we should have
				504	* looped back up to the write_out_data label. If there weren't
				505	* any then journal_clean_data_list should have wiped the list
				506	* clean by now, so check that it is in fact empty.
				507	*/
				508	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
				509
				510	jbd_debug (3, "JBD: commit phase 3\n");
				511
				512	/*
				513	* Way to go: we have now written out all of the data for a
				514	* transaction! Now comes the tricky part: we need to write out
				515	* metadata. Loop over the transaction's entire buffer list:
				516	*/
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame]	517	spin_lock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	518	commit_transaction->t_state = T_COMMIT;
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame]	519	spin_unlock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	520
Josef Bacik	5b9a499	2008-04-28 02:16:12 -0700	[diff] [blame]	521	J_ASSERT(commit_transaction->t_nr_buffers <=
				522	commit_transaction->t_outstanding_credits);
				523
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	524	descriptor = NULL;
				525	bufs = 0;
				526	while (commit_transaction->t_buffers) {
				527
				528	/* Find the next buffer to be journaled... */
				529
				530	jh = commit_transaction->t_buffers;
				531
				532	/* If we're in abort mode, we just un-journal the buffer and
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	533	release it. */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	534
				535	if (is_journal_aborted(journal)) {
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	536	clear_buffer_jbddirty(jh2bh(jh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	537	JBUFFER_TRACE(jh, "journal is aborting: refile");
				538	journal_refile_buffer(journal, jh);
				539	/* If that was the last one, we need to clean up
				540	* any descriptor buffers which may have been
				541	* already allocated, even if we are now
				542	* aborting. */
				543	if (!commit_transaction->t_buffers)
				544	goto start_journal_io;
				545	continue;
				546	}
				547
				548	/* Make sure we have a descriptor block in which to
				549	record the metadata buffer. */
				550
				551	if (!descriptor) {
				552	struct buffer_head *bh;
				553
				554	J_ASSERT (bufs == 0);
				555
				556	jbd_debug(4, "JBD: get descriptor\n");
				557
				558	descriptor = journal_get_descriptor_buffer(journal);
				559	if (!descriptor) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	560	journal_abort(journal, -EIO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	561	continue;
				562	}
				563
				564	bh = jh2bh(descriptor);
				565	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				566	(unsigned long long)bh->b_blocknr, bh->b_data);
				567	header = (journal_header_t *)&bh->b_data[0];
				568	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				569	header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
				570	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				571
				572	tagp = &bh->b_data[sizeof(journal_header_t)];
				573	space_left = bh->b_size - sizeof(journal_header_t);
				574	first_tag = 1;
				575	set_buffer_jwrite(bh);
				576	set_buffer_dirty(bh);
				577	wbuf[bufs++] = bh;
				578
				579	/* Record it so that we can wait for IO
				580	completion later */
				581	BUFFER_TRACE(bh, "ph3: file as descriptor");
				582	journal_file_buffer(descriptor, commit_transaction,
				583	BJ_LogCtl);
				584	}
				585
				586	/* Where is the buffer to be written? */
				587
				588	err = journal_next_log_block(journal, &blocknr);
				589	/* If the block mapping failed, just abandon the buffer
				590	and repeat this loop: we'll fall into the
				591	refile-on-abort condition above. */
				592	if (err) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	593	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	594	continue;
				595	}
				596
				597	/*
				598	* start_this_handle() uses t_outstanding_credits to determine
				599	* the free space in the log, but this counter is changed
				600	* by journal_next_log_block() also.
				601	*/
				602	commit_transaction->t_outstanding_credits--;
				603
				604	/* Bump b_count to prevent truncate from stumbling over
				605	the shadowed buffer! @@@ This can go if we ever get
				606	rid of the BJ_IO/BJ_Shadow pairing of buffers. */
				607	atomic_inc(&jh2bh(jh)->b_count);
				608
				609	/* Make a temporary IO buffer with which to write it out
				610	(this will requeue both the metadata buffer and the
				611	temporary IO buffer). new_bh goes on BJ_IO*/
				612
				613	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				614	/*
				615	* akpm: journal_write_metadata_buffer() sets
				616	* new_bh->b_transaction to commit_transaction.
				617	* We need to clean this up before we release new_bh
				618	* (which is of type BJ_IO)
				619	*/
				620	JBUFFER_TRACE(jh, "ph3: write metadata");
				621	flags = journal_write_metadata_buffer(commit_transaction,
				622	jh, &new_jh, blocknr);
				623	set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
				624	wbuf[bufs++] = jh2bh(new_jh);
				625
				626	/* Record the new block's tag in the current descriptor
				627	buffer */
				628
				629	tag_flag = 0;
				630	if (flags & 1)
				631	tag_flag \|= JFS_FLAG_ESCAPE;
				632	if (!first_tag)
				633	tag_flag \|= JFS_FLAG_SAME_UUID;
				634
				635	tag = (journal_block_tag_t *) tagp;
				636	tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
				637	tag->t_flags = cpu_to_be32(tag_flag);
				638	tagp += sizeof(journal_block_tag_t);
				639	space_left -= sizeof(journal_block_tag_t);
				640
				641	if (first_tag) {
				642	memcpy (tagp, journal->j_uuid, 16);
				643	tagp += 16;
				644	space_left -= 16;
				645	first_tag = 0;
				646	}
				647
				648	/* If there's no more to do, or if the descriptor is full,
				649	let the IO rip! */
				650
				651	if (bufs == journal->j_wbufsize \|\|
				652	commit_transaction->t_buffers == NULL \|\|
				653	space_left < sizeof(journal_block_tag_t) + 16) {
				654
				655	jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
				656
				657	/* Write an end-of-descriptor marker before
				658	submitting the IOs. "tag" still points to
				659	the last tag we set up. */
				660
				661	tag->t_flags \|= cpu_to_be32(JFS_FLAG_LAST_TAG);
				662
				663	start_journal_io:
				664	for (i = 0; i < bufs; i++) {
				665	struct buffer_head *bh = wbuf[i];
				666	lock_buffer(bh);
				667	clear_buffer_dirty(bh);
				668	set_buffer_uptodate(bh);
				669	bh->b_end_io = journal_end_buffer_io_sync;
Theodore Ts'o	512a004	2009-03-27 22:14:27 -0400	[diff] [blame^]	670	submit_bh(write_op, bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	671	}
				672	cond_resched();
				673
				674	/* Force a new descriptor to be generated next
				675	time round the loop. */
				676	descriptor = NULL;
				677	bufs = 0;
				678	}
				679	}
				680
				681	/* Lo and behold: we have just managed to send a transaction to
				682	the log. Before we can commit it, wait for the IO so far to
				683	complete. Control buffers being written are on the
				684	transaction's t_log_list queue, and metadata buffers are on
				685	the t_iobuf_list queue.
				686
				687	Wait for the buffers in reverse order. That way we are
				688	less likely to be woken up until all IOs have completed, and
				689	so we incur less scheduling load.
				690	*/
				691
				692	jbd_debug(3, "JBD: commit phase 4\n");
				693
				694	/*
				695	* akpm: these are BJ_IO, and j_list_lock is not needed.
				696	* See __journal_try_to_free_buffer.
				697	*/
				698	wait_for_iobuf:
				699	while (commit_transaction->t_iobuf_list != NULL) {
				700	struct buffer_head *bh;
				701
				702	jh = commit_transaction->t_iobuf_list->b_tprev;
				703	bh = jh2bh(jh);
				704	if (buffer_locked(bh)) {
				705	wait_on_buffer(bh);
				706	goto wait_for_iobuf;
				707	}
				708	if (cond_resched())
				709	goto wait_for_iobuf;
				710
				711	if (unlikely(!buffer_uptodate(bh)))
				712	err = -EIO;
				713
				714	clear_buffer_jwrite(bh);
				715
				716	JBUFFER_TRACE(jh, "ph4: unfile after journal write");
				717	journal_unfile_buffer(journal, jh);
				718
				719	/*
				720	* ->t_iobuf_list should contain only dummy buffer_heads
				721	* which were created by journal_write_metadata_buffer().
				722	*/
				723	BUFFER_TRACE(bh, "dumping temporary bh");
				724	journal_put_journal_head(jh);
				725	__brelse(bh);
				726	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				727	free_buffer_head(bh);
				728
				729	/* We also have to unlock and free the corresponding
				730	shadowed buffer */
				731	jh = commit_transaction->t_shadow_list->b_tprev;
				732	bh = jh2bh(jh);
				733	clear_bit(BH_JWrite, &bh->b_state);
				734	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				735
				736	/* The metadata is now released for reuse, but we need
				737	to remember it against this transaction so that when
				738	we finally commit, we can do any checkpointing
				739	required. */
				740	JBUFFER_TRACE(jh, "file as BJ_Forget");
				741	journal_file_buffer(jh, commit_transaction, BJ_Forget);
				742	/* Wake up any transactions which were waiting for this
				743	IO to complete */
				744	wake_up_bit(&bh->b_state, BH_Unshadow);
				745	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				746	__brelse(bh);
				747	}
				748
				749	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				750
				751	jbd_debug(3, "JBD: commit phase 5\n");
				752
				753	/* Here we wait for the revoke record and descriptor record buffers */
				754	wait_for_ctlbuf:
				755	while (commit_transaction->t_log_list != NULL) {
				756	struct buffer_head *bh;
				757
				758	jh = commit_transaction->t_log_list->b_tprev;
				759	bh = jh2bh(jh);
				760	if (buffer_locked(bh)) {
				761	wait_on_buffer(bh);
				762	goto wait_for_ctlbuf;
				763	}
				764	if (cond_resched())
				765	goto wait_for_ctlbuf;
				766
				767	if (unlikely(!buffer_uptodate(bh)))
				768	err = -EIO;
				769
				770	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				771	clear_buffer_jwrite(bh);
				772	journal_unfile_buffer(journal, jh);
				773	journal_put_journal_head(jh);
				774	__brelse(bh); /* One for getblk */
				775	/* AKPM: bforget here */
				776	}
				777
Hidehiro Kawai	d1645e5	2008-10-18 20:27:53 -0700	[diff] [blame]	778	if (err)
				779	journal_abort(journal, err);
				780
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	781	jbd_debug(3, "JBD: commit phase 6\n");
				782
				783	if (journal_write_commit_record(journal, commit_transaction))
				784	err = -EIO;
				785
				786	if (err)
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	787	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	788
				789	/* End of a transaction! Finally, we can do checkpoint
				790	processing: any buffers committed as a result of this
				791	transaction can be removed from any checkpoint list they were on
				792	before. */
				793
				794	jbd_debug(3, "JBD: commit phase 7\n");
				795
				796	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
				797	J_ASSERT(commit_transaction->t_buffers == NULL);
				798	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				799	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
				800	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				801	J_ASSERT(commit_transaction->t_log_list == NULL);
				802
				803	restart_loop:
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	804	/*
				805	* As there are other places (journal_unmap_buffer()) adding buffers
				806	* to this list we have to be careful and hold the j_list_lock.
				807	*/
				808	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	809	while (commit_transaction->t_forget) {
				810	transaction_t *cp_transaction;
				811	struct buffer_head *bh;
				812
				813	jh = commit_transaction->t_forget;
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	814	spin_unlock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	815	bh = jh2bh(jh);
				816	jbd_lock_bh_state(bh);
				817	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction ||
				818	jh->b_transaction == journal->j_running_transaction);
				819
				820	/*
				821	* If there is undo-protected committed data against
				822	* this buffer, then we can remove it now. If it is a
				823	* buffer needing such protection, the old frozen_data
				824	* field now points to a committed version of the
				825	* buffer, so rotate that field to the new committed
				826	* data.
				827	*
				828	* Otherwise, we can just throw away the frozen data now.
				829	*/
				830	if (jh->b_committed_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	831	jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	832	jh->b_committed_data = NULL;
				833	if (jh->b_frozen_data) {
				834	jh->b_committed_data = jh->b_frozen_data;
				835	jh->b_frozen_data = NULL;
				836	}
				837	} else if (jh->b_frozen_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	838	jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	839	jh->b_frozen_data = NULL;
				840	}
				841
				842	spin_lock(&journal->j_list_lock);
				843	cp_transaction = jh->b_cp_transaction;
				844	if (cp_transaction) {
				845	JBUFFER_TRACE(jh, "remove from old cp transaction");
				846	__journal_remove_checkpoint(jh);
				847	}
				848
				849	/* Only re-checkpoint the buffer_head if it is marked
				850	* dirty. If the buffer was added to the BJ_Forget list
				851	* by journal_forget, it may no longer be dirty and
				852	* there's no point in keeping a checkpoint record for
				853	* it. */
				854
				855	/* A buffer which has been freed while still being
				856	* journaled by a previous transaction may end up still
				857	* being dirty here, but we want to avoid writing back
				858	* that buffer in the future now that the last use has
				859	* been committed. That's not only a performance gain,
				860	* it also stops aliasing problems if the buffer is left
				861	* behind for writeback and gets reallocated for another
				862	* use in a different page. */
				863	if (buffer_freed(bh)) {
				864	clear_buffer_freed(bh);
				865	clear_buffer_jbddirty(bh);
				866	}
				867
				868	if (buffer_jbddirty(bh)) {
				869	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				870	__journal_insert_checkpoint(jh, commit_transaction);
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	871	if (is_journal_aborted(journal))
				872	clear_buffer_jbddirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	873	JBUFFER_TRACE(jh, "refile for checkpoint writeback");
				874	__journal_refile_buffer(jh);
				875	jbd_unlock_bh_state(bh);
				876	} else {
				877	J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara	9ada734	2006-06-23 02:05:25 -0700	[diff] [blame]	878	/* The buffer on BJ_Forget list and not jbddirty means
				879	* it has been freed by this transaction and hence it
				880	* could not have been reallocated until this
				881	* transaction has committed. BUT it could be
				882	* reallocated once we have written all the data to
				883	* disk and before we process the buffer on BJ_Forget
				884	* list. */
				885	JBUFFER_TRACE(jh, "refile or unfile freed buffer");
				886	__journal_refile_buffer(jh);
				887	if (!jh->b_transaction) {
				888	jbd_unlock_bh_state(bh);
				889	/* needs a brelse */
				890	journal_remove_journal_head(bh);
				891	release_buffer_page(bh);
				892	} else
				893	jbd_unlock_bh_state(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	894	}
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	895	cond_resched_lock(&journal->j_list_lock);
				896	}
				897	spin_unlock(&journal->j_list_lock);
				898	/*
Jan Kara	d4beaf4	2007-12-04 23:45:27 -0800	[diff] [blame]	899	* This is a bit sleazy. We use j_list_lock to protect transition
				900	* of a transaction into T_FINISHED state and calling
				901	* __journal_drop_transaction(). Otherwise we could race with
				902	* other checkpointing code processing the transaction...
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	903	*/
				904	spin_lock(&journal->j_state_lock);
				905	spin_lock(&journal->j_list_lock);
				906	/*
				907	* Now recheck whether any buffers got attached to the transaction's
				908	* forget list while the lock was dropped...
				909	*/
				910	if (commit_transaction->t_forget) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	911	spin_unlock(&journal->j_list_lock);
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	912	spin_unlock(&journal->j_state_lock);
				913	goto restart_loop;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	914	}
				915
				916	/* Done with this transaction! */
				917
				918	jbd_debug(3, "JBD: commit phase 8\n");
				919
				920	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				921
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	922	commit_transaction->t_state = T_FINISHED;
				923	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				924	journal->j_commit_sequence = commit_transaction->t_tid;
				925	journal->j_committing_transaction = NULL;
Josef Bacik	f420d4d	2009-01-07 18:07:24 -0800	[diff] [blame]	926	commit_time = ktime_to_ns(ktime_sub(ktime_get(), start_time));
				927
				928	/*
				929	* weight the commit time higher than the average time so we don't
				930	* react too strongly to vast changes in commit time
				931	*/
				932	if (likely(journal->j_average_commit_time))
				933	journal->j_average_commit_time = (commit_time*3 +
				934	journal->j_average_commit_time) / 4;
				935	else
				936	journal->j_average_commit_time = commit_time;
				937
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	938	spin_unlock(&journal->j_state_lock);
				939
Jan Kara	fe28e42	2007-07-15 23:37:18 -0700	[diff] [blame]	940	if (commit_transaction->t_checkpoint_list == NULL &&
				941	commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	942	__journal_drop_transaction(journal, commit_transaction);
				943	} else {
				944	if (journal->j_checkpoint_transactions == NULL) {
				945	journal->j_checkpoint_transactions = commit_transaction;
				946	commit_transaction->t_cpnext = commit_transaction;
				947	commit_transaction->t_cpprev = commit_transaction;
				948	} else {
				949	commit_transaction->t_cpnext =
				950	journal->j_checkpoint_transactions;
				951	commit_transaction->t_cpprev =
				952	commit_transaction->t_cpnext->t_cpprev;
				953	commit_transaction->t_cpnext->t_cpprev =
				954	commit_transaction;
				955	commit_transaction->t_cpprev->t_cpnext =
				956	commit_transaction;
				957	}
				958	}
				959	spin_unlock(&journal->j_list_lock);
				960
				961	jbd_debug(1, "JBD: commit %d complete, head %d\n",
				962	journal->j_commit_sequence, journal->j_tail_sequence);
				963
				964	wake_up(&journal->j_wait_done_commit);
				965	}