Blame - fs/jbd/commit.c - kernel/msm-4.9

blob: 25719d902c5116a6ff50248b272e5603c196f0f7 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Uwe Zeisberger	f30c226	2006-10-03 23:01:26 +0200	[diff] [blame]	2	* linux/fs/jbd/commit.c
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
/*
 * I/O completion handler used for the temporary BJ_IO buffer_heads.
 *
 * @bh:       buffer whose write has just finished
 * @uptodate: non-zero if the I/O succeeded, zero if it failed
 *
 * Records the outcome of the I/O in the buffer's uptodate bit and then
 * unlocks the buffer.
 */
static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
{
	BUFFER_TRACE(bh, "");

	/* Mirror the I/O result into the uptodate flag. */
	if (!uptodate)
		clear_buffer_uptodate(bh);
	else
		set_buffer_uptodate(bh);

	unlock_buffer(bh);
}
				36
				37	/*
				38	* When an ext3-ordered file is truncated, it is possible that many pages are
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	39	* not successfully freed, because they are attached to a committing transaction.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40	* After the transaction commits, these pages are left on the LRU, with no
				41	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				42	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				43	* the numbers in /proc/meminfo look odd.
				44	*
				45	* So here, we have a buffer which has just come off the forget list. Look to
				46	* see if we can strip all buffers from the backing page.
				47	*
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	48	* Called under journal->j_list_lock. The caller provided us with a ref
				49	* against the buffer, and we drop that here.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	50	*/
				51	static void release_buffer_page(struct buffer_head *bh)
				52	{
				53	struct page *page;
				54
				55	if (buffer_dirty(bh))
				56	goto nope;
				57	if (atomic_read(&bh->b_count) != 1)
				58	goto nope;
				59	page = bh->b_page;
				60	if (!page)
				61	goto nope;
				62	if (page->mapping)
				63	goto nope;
				64
				65	/* OK, it's a truncated page */
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	66	if (!trylock_page(page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	67	goto nope;
				68
				69	page_cache_get(page);
				70	__brelse(bh);
				71	try_to_free_buffers(page);
				72	unlock_page(page);
				73	page_cache_release(page);
				74	return;
				75
				76	nope:
				77	__brelse(bh);
				78	}
				79
				/*
 * Drop one reference on a data buffer.
 *
 * If the buffer carries the 'BH_Freed' mark, clear the mark and hand the
 * buffer to release_buffer_page(), which releases the reference and, when
 * possible, the page the buffer belongs to.  Otherwise just put the
 * reference.
 */
static void release_data_buffer(struct buffer_head *bh)
{
	if (!buffer_freed(bh)) {
		put_bh(bh);
		return;
	}

	clear_buffer_freed(bh);
	release_buffer_page(bh);
}
				92
				93	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	94	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
				95	* held. For ranking reasons we must trylock. If we lose, schedule away and
				96	* return 0. j_list_lock is dropped in this case.
				97	*/
				98	static int inverted_lock(journal_t journal, struct buffer_head bh)
				99	{
				100	if (!jbd_trylock_bh_state(bh)) {
				101	spin_unlock(&journal->j_list_lock);
				102	schedule();
				103	return 0;
				104	}
				105	return 1;
				106	}
				107
				108	/* Done it all: now write the commit record. We should have
				109	* cleaned up our previous buffers by now, so if we are in abort
				110	* mode we can now just skip the rest of the journal write
				111	* entirely.
				112	*
				113	* Returns 1 if the journal needs to be aborted or 0 on success
				114	*/
				115	static int journal_write_commit_record(journal_t *journal,
				116	transaction_t *commit_transaction)
				117	{
				118	struct journal_head *descriptor;
				119	struct buffer_head *bh;
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	120	journal_header_t *header;
				121	int ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	122	int barrier_done = 0;
				123
				124	if (is_journal_aborted(journal))
				125	return 0;
				126
				127	descriptor = journal_get_descriptor_buffer(journal);
				128	if (!descriptor)
				129	return 1;
				130
				131	bh = jh2bh(descriptor);
				132
Jan Kara	5315217	2008-02-01 08:26:46 -0500	[diff] [blame]	133	header = (journal_header_t *)(bh->b_data);
				134	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				135	header->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
				136	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	137
				138	JBUFFER_TRACE(descriptor, "write commit block");
				139	set_buffer_dirty(bh);
				140	if (journal->j_flags & JFS_BARRIER) {
				141	set_buffer_ordered(bh);
				142	barrier_done = 1;
				143	}
				144	ret = sync_dirty_buffer(bh);
Neil Brown	28ae094	2008-02-08 04:22:13 -0800	[diff] [blame]	145	if (barrier_done)
				146	clear_buffer_ordered(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	147	/* is it possible for another commit to fail at roughly
				148	* the same time as this one? If so, we don't want to
				149	* trust the barrier flag in the super, but instead want
				150	* to remember if we sent a barrier request
				151	*/
				152	if (ret == -EOPNOTSUPP && barrier_done) {
				153	char b[BDEVNAME_SIZE];
				154
				155	printk(KERN_WARNING
				156	"JBD: barrier-based sync failed on %s - "
				157	"disabling barriers\n",
				158	bdevname(journal->j_dev, b));
				159	spin_lock(&journal->j_state_lock);
				160	journal->j_flags &= ~JFS_BARRIER;
				161	spin_unlock(&journal->j_state_lock);
				162
				163	/* And try again, without the barrier */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	164	set_buffer_uptodate(bh);
				165	set_buffer_dirty(bh);
				166	ret = sync_dirty_buffer(bh);
				167	}
				168	put_bh(bh); /* One for getblk() */
				169	journal_put_journal_head(descriptor);
				170
				171	return (ret == -EIO);
				172	}
				173
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	174	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
				175	{
				176	int i;
				177
				178	for (i = 0; i < bufs; i++) {
				179	wbuf[i]->b_end_io = end_buffer_write_sync;
				180	/* We use-up our safety reference in submit_bh() */
				181	submit_bh(WRITE, wbuf[i]);
				182	}
				183	}
				184
				185	/*
				186	* Submit all the data buffers to disk
				187	*/
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	188	static int journal_submit_data_buffers(journal_t *journal,
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	189	transaction_t *commit_transaction)
				190	{
				191	struct journal_head *jh;
				192	struct buffer_head *bh;
				193	int locked;
				194	int bufs = 0;
				195	struct buffer_head **wbuf = journal->j_wbuf;
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	196	int err = 0;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	197
				198	/*
				199	* Whenever we unlock the journal and sleep, things can get added
				200	* onto ->t_sync_datalist, so we have to keep looping back to
				201	* write_out_data until we know that the list is empty.
				202	*
				203	* Cleanup any flushed data buffers from the data list. Even in
				204	* abort mode, we want to flush this out as soon as possible.
				205	*/
				206	write_out_data:
				207	cond_resched();
				208	spin_lock(&journal->j_list_lock);
				209
				210	while (commit_transaction->t_sync_datalist) {
				211	jh = commit_transaction->t_sync_datalist;
				212	bh = jh2bh(jh);
				213	locked = 0;
				214
				215	/* Get reference just to make sure buffer does not disappear
				216	* when we are forced to drop various locks */
				217	get_bh(bh);
				218	/* If the buffer is dirty, we need to submit IO and hence
				219	* we need the buffer lock. We try to lock the buffer without
				220	* blocking. If we fail, we need to drop j_list_lock and do
				221	* blocking lock_buffer().
				222	*/
				223	if (buffer_dirty(bh)) {
Nick Piggin	ca5de40	2008-08-02 12:02:13 +0200	[diff] [blame]	224	if (!trylock_buffer(bh)) {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	225	BUFFER_TRACE(bh, "needs blocking lock");
				226	spin_unlock(&journal->j_list_lock);
				227	/* Write out all data to prevent deadlocks */
				228	journal_do_submit_data(wbuf, bufs);
				229	bufs = 0;
				230	lock_buffer(bh);
				231	spin_lock(&journal->j_list_lock);
				232	}
				233	locked = 1;
				234	}
				235	/* We have to get bh_state lock. Again out of order, sigh. */
				236	if (!inverted_lock(journal, bh)) {
				237	jbd_lock_bh_state(bh);
				238	spin_lock(&journal->j_list_lock);
				239	}
				240	/* Someone already cleaned up the buffer? */
				241	if (!buffer_jbd(bh)
				242	\|\| jh->b_transaction != commit_transaction
				243	\|\| jh->b_jlist != BJ_SyncData) {
				244	jbd_unlock_bh_state(bh);
				245	if (locked)
				246	unlock_buffer(bh);
				247	BUFFER_TRACE(bh, "already cleaned up");
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	248	release_data_buffer(bh);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	249	continue;
				250	}
				251	if (locked && test_clear_buffer_dirty(bh)) {
				252	BUFFER_TRACE(bh, "needs writeout, adding to array");
				253	wbuf[bufs++] = bh;
				254	__journal_file_buffer(jh, commit_transaction,
				255	BJ_Locked);
				256	jbd_unlock_bh_state(bh);
				257	if (bufs == journal->j_wbufsize) {
				258	spin_unlock(&journal->j_list_lock);
				259	journal_do_submit_data(wbuf, bufs);
				260	bufs = 0;
				261	goto write_out_data;
				262	}
Hisashi Hifumi	6f5a9da	2006-12-22 01:11:50 -0800	[diff] [blame]	263	} else if (!locked && buffer_locked(bh)) {
				264	__journal_file_buffer(jh, commit_transaction,
				265	BJ_Locked);
				266	jbd_unlock_bh_state(bh);
				267	put_bh(bh);
				268	} else {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	269	BUFFER_TRACE(bh, "writeout complete: unfile");
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	270	if (unlikely(!buffer_uptodate(bh)))
				271	err = -EIO;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	272	__journal_unfile_buffer(jh);
				273	jbd_unlock_bh_state(bh);
				274	if (locked)
				275	unlock_buffer(bh);
				276	journal_remove_journal_head(bh);
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	277	/* One for our safety reference, other for
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	278	* journal_remove_journal_head() */
				279	put_bh(bh);
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	280	release_data_buffer(bh);
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	281	}
				282
Nick Piggin	95c354f	2008-01-30 13:31:20 +0100	[diff] [blame]	283	if (need_resched() \|\| spin_needbreak(&journal->j_list_lock)) {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	284	spin_unlock(&journal->j_list_lock);
				285	goto write_out_data;
				286	}
				287	}
				288	spin_unlock(&journal->j_list_lock);
				289	journal_do_submit_data(wbuf, bufs);
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	290
				291	return err;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	292	}
				293
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	294	/*
				295	* journal_commit_transaction
				296	*
				297	* The primary function for committing a transaction to the log. This
				298	* function is called by the journal thread to begin a complete commit.
				299	*/
				300	void journal_commit_transaction(journal_t *journal)
				301	{
				302	transaction_t *commit_transaction;
				303	struct journal_head jh, new_jh, *descriptor;
				304	struct buffer_head **wbuf = journal->j_wbuf;
				305	int bufs;
				306	int flags;
				307	int err;
				308	unsigned long blocknr;
				309	char *tagp = NULL;
				310	journal_header_t *header;
				311	journal_block_tag_t *tag = NULL;
				312	int space_left = 0;
				313	int first_tag = 0;
				314	int tag_flag;
				315	int i;
				316
				317	/*
				318	* First job: lock down the current transaction and wait for
				319	* all outstanding updates to complete.
				320	*/
				321
				322	#ifdef COMMIT_STATS
				323	spin_lock(&journal->j_list_lock);
				324	summarise_journal_usage(journal);
				325	spin_unlock(&journal->j_list_lock);
				326	#endif
				327
				328	/* Do we need to erase the effects of a prior journal_flush? */
				329	if (journal->j_flags & JFS_FLUSHED) {
				330	jbd_debug(3, "super block updated\n");
				331	journal_update_superblock(journal, 1);
				332	} else {
				333	jbd_debug(3, "superblock not updated\n");
				334	}
				335
				336	J_ASSERT(journal->j_running_transaction != NULL);
				337	J_ASSERT(journal->j_committing_transaction == NULL);
				338
				339	commit_transaction = journal->j_running_transaction;
				340	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				341
				342	jbd_debug(1, "JBD: starting commit of transaction %d\n",
				343	commit_transaction->t_tid);
				344
				345	spin_lock(&journal->j_state_lock);
				346	commit_transaction->t_state = T_LOCKED;
				347
				348	spin_lock(&commit_transaction->t_handle_lock);
				349	while (commit_transaction->t_updates) {
				350	DEFINE_WAIT(wait);
				351
				352	prepare_to_wait(&journal->j_wait_updates, &wait,
				353	TASK_UNINTERRUPTIBLE);
				354	if (commit_transaction->t_updates) {
				355	spin_unlock(&commit_transaction->t_handle_lock);
				356	spin_unlock(&journal->j_state_lock);
				357	schedule();
				358	spin_lock(&journal->j_state_lock);
				359	spin_lock(&commit_transaction->t_handle_lock);
				360	}
				361	finish_wait(&journal->j_wait_updates, &wait);
				362	}
				363	spin_unlock(&commit_transaction->t_handle_lock);
				364
				365	J_ASSERT (commit_transaction->t_outstanding_credits <=
				366	journal->j_max_transaction_buffers);
				367
				368	/*
				369	* First thing we are allowed to do is to discard any remaining
				370	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				371	* that there are no such buffers: if a large filesystem
				372	* operation like a truncate needs to split itself over multiple
				373	* transactions, then it may try to do a journal_restart() while
				374	* there are still BJ_Reserved buffers outstanding. These must
				375	* be released cleanly from the current transaction.
				376	*
				377	* In this case, the filesystem must still reserve write access
				378	* again before modifying the buffer in the new transaction, but
				379	* we do not require it to remember exactly which old buffers it
				380	* has reserved. This is consistent with the existing behaviour
				381	* that multiple journal_get_write_access() calls to the same
				382	* buffer are perfectly permissable.
				383	*/
				384	while (commit_transaction->t_reserved_list) {
				385	jh = commit_transaction->t_reserved_list;
				386	JBUFFER_TRACE(jh, "reserved, unused: refile");
				387	/*
				388	* A journal_get_undo_access()+journal_release_buffer() may
				389	* leave undo-committed data.
				390	*/
				391	if (jh->b_committed_data) {
				392	struct buffer_head *bh = jh2bh(jh);
				393
				394	jbd_lock_bh_state(bh);
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	395	jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhl	f99d49a	2005-11-07 01:01:34 -0800	[diff] [blame]	396	jh->b_committed_data = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	397	jbd_unlock_bh_state(bh);
				398	}
				399	journal_refile_buffer(journal, jh);
				400	}
				401
				402	/*
				403	* Now try to drop any written-back buffers from the journal's
				404	* checkpoint lists. We do this before commit because it potentially
				405	* frees some memory
				406	*/
				407	spin_lock(&journal->j_list_lock);
				408	__journal_clean_checkpoint_list(journal);
				409	spin_unlock(&journal->j_list_lock);
				410
				411	jbd_debug (3, "JBD: commit phase 1\n");
				412
				413	/*
				414	* Switch to a new revoke table.
				415	*/
				416	journal_switch_revoke_table(journal);
				417
				418	commit_transaction->t_state = T_FLUSH;
				419	journal->j_committing_transaction = commit_transaction;
				420	journal->j_running_transaction = NULL;
				421	commit_transaction->t_log_start = journal->j_head;
				422	wake_up(&journal->j_wait_transaction_locked);
				423	spin_unlock(&journal->j_state_lock);
				424
				425	jbd_debug (3, "JBD: commit phase 2\n");
				426
				427	/*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	428	* Now start flushing things to disk, in the order they appear
				429	* on the transaction lists. Data blocks go first.
				430	*/
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	431	err = journal_submit_data_buffers(journal, commit_transaction);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432
				433	/*
				434	* Wait for all previously submitted IO to complete.
				435	*/
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	436	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	while (commit_transaction->t_locked_list) {
				438	struct buffer_head *bh;
				439
				440	jh = commit_transaction->t_locked_list->b_tprev;
				441	bh = jh2bh(jh);
				442	get_bh(bh);
				443	if (buffer_locked(bh)) {
				444	spin_unlock(&journal->j_list_lock);
				445	wait_on_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	446	spin_lock(&journal->j_list_lock);
				447	}
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	448	if (unlikely(!buffer_uptodate(bh))) {
Nick Piggin	529ae9a	2008-08-02 12:01:03 +0200	[diff] [blame]	449	if (!trylock_page(bh->b_page)) {
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	450	spin_unlock(&journal->j_list_lock);
				451	lock_page(bh->b_page);
				452	spin_lock(&journal->j_list_lock);
				453	}
				454	if (bh->b_page->mapping)
				455	set_bit(AS_EIO, &bh->b_page->mapping->flags);
				456
				457	unlock_page(bh->b_page);
				458	SetPageError(bh->b_page);
				459	err = -EIO;
				460	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	461	if (!inverted_lock(journal, bh)) {
				462	put_bh(bh);
				463	spin_lock(&journal->j_list_lock);
				464	continue;
				465	}
				466	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
				467	__journal_unfile_buffer(jh);
				468	jbd_unlock_bh_state(bh);
				469	journal_remove_journal_head(bh);
				470	put_bh(bh);
				471	} else {
				472	jbd_unlock_bh_state(bh);
				473	}
Toshiyuki Okajima	fc80c44	2008-07-25 01:46:29 -0700	[diff] [blame]	474	release_data_buffer(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	475	cond_resched_lock(&journal->j_list_lock);
				476	}
				477	spin_unlock(&journal->j_list_lock);
				478
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	479	if (err) {
				480	char b[BDEVNAME_SIZE];
				481
				482	printk(KERN_WARNING
				483	"JBD: Detected IO errors while flushing file data "
				484	"on %s\n", bdevname(journal->j_fs_dev, b));
Hidehiro Kawai	0e4fb5e	2008-10-18 20:27:57 -0700	[diff] [blame]	485	if (journal->j_flags & JFS_ABORT_ON_SYNCDATA_ERR)
				486	journal_abort(journal, err);
Hidehiro Kawai	cbe5f46	2008-07-25 01:46:30 -0700	[diff] [blame]	487	err = 0;
				488	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	489
				490	journal_write_revoke_records(journal, commit_transaction);
				491
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	492	/*
				493	* If we found any dirty or locked buffers, then we should have
				494	* looped back up to the write_out_data label. If there weren't
				495	* any then journal_clean_data_list should have wiped the list
				496	* clean by now, so check that it is in fact empty.
				497	*/
				498	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
				499
				500	jbd_debug (3, "JBD: commit phase 3\n");
				501
				502	/*
				503	* Way to go: we have now written out all of the data for a
				504	* transaction! Now comes the tricky part: we need to write out
				505	* metadata. Loop over the transaction's entire buffer list:
				506	*/
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame]	507	spin_lock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	508	commit_transaction->t_state = T_COMMIT;
Mingming Cao	772279c	2008-05-14 16:05:41 -0700	[diff] [blame]	509	spin_unlock(&journal->j_state_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	510
Josef Bacik	5b9a499	2008-04-28 02:16:12 -0700	[diff] [blame]	511	J_ASSERT(commit_transaction->t_nr_buffers <=
				512	commit_transaction->t_outstanding_credits);
				513
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	514	descriptor = NULL;
				515	bufs = 0;
				516	while (commit_transaction->t_buffers) {
				517
				518	/* Find the next buffer to be journaled... */
				519
				520	jh = commit_transaction->t_buffers;
				521
				522	/* If we're in abort mode, we just un-journal the buffer and
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	523	release it. */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	524
				525	if (is_journal_aborted(journal)) {
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	526	clear_buffer_jbddirty(jh2bh(jh));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	527	JBUFFER_TRACE(jh, "journal is aborting: refile");
				528	journal_refile_buffer(journal, jh);
				529	/* If that was the last one, we need to clean up
				530	* any descriptor buffers which may have been
				531	* already allocated, even if we are now
				532	* aborting. */
				533	if (!commit_transaction->t_buffers)
				534	goto start_journal_io;
				535	continue;
				536	}
				537
				538	/* Make sure we have a descriptor block in which to
				539	record the metadata buffer. */
				540
				541	if (!descriptor) {
				542	struct buffer_head *bh;
				543
				544	J_ASSERT (bufs == 0);
				545
				546	jbd_debug(4, "JBD: get descriptor\n");
				547
				548	descriptor = journal_get_descriptor_buffer(journal);
				549	if (!descriptor) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	550	journal_abort(journal, -EIO);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	551	continue;
				552	}
				553
				554	bh = jh2bh(descriptor);
				555	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				556	(unsigned long long)bh->b_blocknr, bh->b_data);
				557	header = (journal_header_t *)&bh->b_data[0];
				558	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				559	header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
				560	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				561
				562	tagp = &bh->b_data[sizeof(journal_header_t)];
				563	space_left = bh->b_size - sizeof(journal_header_t);
				564	first_tag = 1;
				565	set_buffer_jwrite(bh);
				566	set_buffer_dirty(bh);
				567	wbuf[bufs++] = bh;
				568
				569	/* Record it so that we can wait for IO
				570	completion later */
				571	BUFFER_TRACE(bh, "ph3: file as descriptor");
				572	journal_file_buffer(descriptor, commit_transaction,
				573	BJ_LogCtl);
				574	}
				575
				576	/* Where is the buffer to be written? */
				577
				578	err = journal_next_log_block(journal, &blocknr);
				579	/* If the block mapping failed, just abandon the buffer
				580	and repeat this loop: we'll fall into the
				581	refile-on-abort condition above. */
				582	if (err) {
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	583	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	584	continue;
				585	}
				586
				587	/*
				588	* start_this_handle() uses t_outstanding_credits to determine
				589	* the free space in the log, but this counter is changed
				590	* by journal_next_log_block() also.
				591	*/
				592	commit_transaction->t_outstanding_credits--;
				593
				594	/* Bump b_count to prevent truncate from stumbling over
				595	the shadowed buffer! @@@ This can go if we ever get
				596	rid of the BJ_IO/BJ_Shadow pairing of buffers. */
				597	atomic_inc(&jh2bh(jh)->b_count);
				598
				599	/* Make a temporary IO buffer with which to write it out
				600	(this will requeue both the metadata buffer and the
				601	temporary IO buffer). new_bh goes on BJ_IO*/
				602
				603	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				604	/*
				605	* akpm: journal_write_metadata_buffer() sets
				606	* new_bh->b_transaction to commit_transaction.
				607	* We need to clean this up before we release new_bh
				608	* (which is of type BJ_IO)
				609	*/
				610	JBUFFER_TRACE(jh, "ph3: write metadata");
				611	flags = journal_write_metadata_buffer(commit_transaction,
				612	jh, &new_jh, blocknr);
				613	set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
				614	wbuf[bufs++] = jh2bh(new_jh);
				615
				616	/* Record the new block's tag in the current descriptor
				617	buffer */
				618
				619	tag_flag = 0;
				620	if (flags & 1)
				621	tag_flag \|= JFS_FLAG_ESCAPE;
				622	if (!first_tag)
				623	tag_flag \|= JFS_FLAG_SAME_UUID;
				624
				625	tag = (journal_block_tag_t *) tagp;
				626	tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
				627	tag->t_flags = cpu_to_be32(tag_flag);
				628	tagp += sizeof(journal_block_tag_t);
				629	space_left -= sizeof(journal_block_tag_t);
				630
				631	if (first_tag) {
				632	memcpy (tagp, journal->j_uuid, 16);
				633	tagp += 16;
				634	space_left -= 16;
				635	first_tag = 0;
				636	}
				637
				638	/* If there's no more to do, or if the descriptor is full,
				639	let the IO rip! */
				640
				641	if (bufs == journal->j_wbufsize \|\|
				642	commit_transaction->t_buffers == NULL \|\|
				643	space_left < sizeof(journal_block_tag_t) + 16) {
				644
				645	jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
				646
				647	/* Write an end-of-descriptor marker before
				648	submitting the IOs. "tag" still points to
				649	the last tag we set up. */
				650
				651	tag->t_flags \|= cpu_to_be32(JFS_FLAG_LAST_TAG);
				652
				653	start_journal_io:
				654	for (i = 0; i < bufs; i++) {
				655	struct buffer_head *bh = wbuf[i];
				656	lock_buffer(bh);
				657	clear_buffer_dirty(bh);
				658	set_buffer_uptodate(bh);
				659	bh->b_end_io = journal_end_buffer_io_sync;
				660	submit_bh(WRITE, bh);
				661	}
				662	cond_resched();
				663
				664	/* Force a new descriptor to be generated next
				665	time round the loop. */
				666	descriptor = NULL;
				667	bufs = 0;
				668	}
				669	}
				670
				671	/* Lo and behold: we have just managed to send a transaction to
				672	the log. Before we can commit it, wait for the IO so far to
				673	complete. Control buffers being written are on the
				674	transaction's t_log_list queue, and metadata buffers are on
				675	the t_iobuf_list queue.
				676
				677	Wait for the buffers in reverse order. That way we are
				678	less likely to be woken up until all IOs have completed, and
				679	so we incur less scheduling load.
				680	*/
				681
				682	jbd_debug(3, "JBD: commit phase 4\n");
				683
				684	/*
				685	* akpm: these are BJ_IO, and j_list_lock is not needed.
				686	* See __journal_try_to_free_buffer.
				687	*/
				688	wait_for_iobuf:
				689	while (commit_transaction->t_iobuf_list != NULL) {
				690	struct buffer_head *bh;
				691
				692	jh = commit_transaction->t_iobuf_list->b_tprev;
				693	bh = jh2bh(jh);
				694	if (buffer_locked(bh)) {
				695	wait_on_buffer(bh);
				696	goto wait_for_iobuf;
				697	}
				698	if (cond_resched())
				699	goto wait_for_iobuf;
				700
				701	if (unlikely(!buffer_uptodate(bh)))
				702	err = -EIO;
				703
				704	clear_buffer_jwrite(bh);
				705
				706	JBUFFER_TRACE(jh, "ph4: unfile after journal write");
				707	journal_unfile_buffer(journal, jh);
				708
				709	/*
				710	* ->t_iobuf_list should contain only dummy buffer_heads
				711	* which were created by journal_write_metadata_buffer().
				712	*/
				713	BUFFER_TRACE(bh, "dumping temporary bh");
				714	journal_put_journal_head(jh);
				715	__brelse(bh);
				716	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				717	free_buffer_head(bh);
				718
				719	/* We also have to unlock and free the corresponding
				720	shadowed buffer */
				721	jh = commit_transaction->t_shadow_list->b_tprev;
				722	bh = jh2bh(jh);
				723	clear_bit(BH_JWrite, &bh->b_state);
				724	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				725
				726	/* The metadata is now released for reuse, but we need
				727	to remember it against this transaction so that when
				728	we finally commit, we can do any checkpointing
				729	required. */
				730	JBUFFER_TRACE(jh, "file as BJ_Forget");
				731	journal_file_buffer(jh, commit_transaction, BJ_Forget);
				732	/* Wake up any transactions which were waiting for this
				733	IO to complete */
				734	wake_up_bit(&bh->b_state, BH_Unshadow);
				735	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				736	__brelse(bh);
				737	}
				738
				739	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				740
				741	jbd_debug(3, "JBD: commit phase 5\n");
				742
				743	/* Here we wait for the revoke record and descriptor record buffers */
				744	wait_for_ctlbuf:
				745	while (commit_transaction->t_log_list != NULL) {
				746	struct buffer_head *bh;
				747
				748	jh = commit_transaction->t_log_list->b_tprev;
				749	bh = jh2bh(jh);
				750	if (buffer_locked(bh)) {
				751	wait_on_buffer(bh);
				752	goto wait_for_ctlbuf;
				753	}
				754	if (cond_resched())
				755	goto wait_for_ctlbuf;
				756
				757	if (unlikely(!buffer_uptodate(bh)))
				758	err = -EIO;
				759
				760	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				761	clear_buffer_jwrite(bh);
				762	journal_unfile_buffer(journal, jh);
				763	journal_put_journal_head(jh);
				764	__brelse(bh); /* One for getblk */
				765	/* AKPM: bforget here */
				766	}
				767
Hidehiro Kawai	d1645e5	2008-10-18 20:27:53 -0700	[diff] [blame]	768	if (err)
				769	journal_abort(journal, err);
				770
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	771	jbd_debug(3, "JBD: commit phase 6\n");
				772
				773	if (journal_write_commit_record(journal, commit_transaction))
				774	err = -EIO;
				775
				776	if (err)
Jan Kara	7a266e7	2007-10-18 23:39:22 -0700	[diff] [blame]	777	journal_abort(journal, err);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	778
				779	/* End of a transaction! Finally, we can do checkpoint
				780	processing: any buffers committed as a result of this
				781	transaction can be removed from any checkpoint list it was on
				782	before. */
				783
				784	jbd_debug(3, "JBD: commit phase 7\n");
				785
				786	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
				787	J_ASSERT(commit_transaction->t_buffers == NULL);
				788	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				789	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
				790	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				791	J_ASSERT(commit_transaction->t_log_list == NULL);
				792
				793	restart_loop:
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	794	/*
				795	* As there are other places (journal_unmap_buffer()) adding buffers
				796	* to this list we have to be careful and hold the j_list_lock.
				797	*/
				798	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	799	while (commit_transaction->t_forget) {
				800	transaction_t *cp_transaction;
				801	struct buffer_head *bh;
				802
				803	jh = commit_transaction->t_forget;
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	804	spin_unlock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	805	bh = jh2bh(jh);
				806	jbd_lock_bh_state(bh);
				807	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction \|\|
				808	jh->b_transaction == journal->j_running_transaction);
				809
				810	/*
				811	* If there is undo-protected committed data against
				812	* this buffer, then we can remove it now. If it is a
				813	* buffer needing such protection, the old frozen_data
				814	* field now points to a committed version of the
				815	* buffer, so rotate that field to the new committed
				816	* data.
				817	*
				818	* Otherwise, we can just throw away the frozen data now.
				819	*/
				820	if (jh->b_committed_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	821	jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	822	jh->b_committed_data = NULL;
				823	if (jh->b_frozen_data) {
				824	jh->b_committed_data = jh->b_frozen_data;
				825	jh->b_frozen_data = NULL;
				826	}
				827	} else if (jh->b_frozen_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame]	828	jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	829	jh->b_frozen_data = NULL;
				830	}
				831
				832	spin_lock(&journal->j_list_lock);
				833	cp_transaction = jh->b_cp_transaction;
				834	if (cp_transaction) {
				835	JBUFFER_TRACE(jh, "remove from old cp transaction");
				836	__journal_remove_checkpoint(jh);
				837	}
				838
				839	/* Only re-checkpoint the buffer_head if it is marked
				840	* dirty. If the buffer was added to the BJ_Forget list
				841	* by journal_forget, it may no longer be dirty and
				842	* there's no point in keeping a checkpoint record for
				843	* it. */
				844
				845	/* A buffer which has been freed while still being
				846	* journaled by a previous transaction may end up still
				847	* being dirty here, but we want to avoid writing back
				848	* that buffer in the future now that the last use has
				849	* been committed. That's not only a performance gain,
				850	* it also stops aliasing problems if the buffer is left
				851	* behind for writeback and gets reallocated for another
				852	* use in a different page. */
				853	if (buffer_freed(bh)) {
				854	clear_buffer_freed(bh);
				855	clear_buffer_jbddirty(bh);
				856	}
				857
				858	if (buffer_jbddirty(bh)) {
				859	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				860	__journal_insert_checkpoint(jh, commit_transaction);
Hidehiro Kawai	885e353	2008-10-18 20:27:54 -0700	[diff] [blame]	861	if (is_journal_aborted(journal))
				862	clear_buffer_jbddirty(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	863	JBUFFER_TRACE(jh, "refile for checkpoint writeback");
				864	__journal_refile_buffer(jh);
				865	jbd_unlock_bh_state(bh);
				866	} else {
				867	J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara	9ada734	2006-06-23 02:05:25 -0700	[diff] [blame]	868	/* The buffer on BJ_Forget list and not jbddirty means
				869	* it has been freed by this transaction and hence it
				870	* could not have been reallocated until this
				871	* transaction has committed. BUT it could be
				872	* reallocated once we have written all the data to
				873	* disk and before we process the buffer on BJ_Forget
				874	* list. */
				875	JBUFFER_TRACE(jh, "refile or unfile freed buffer");
				876	__journal_refile_buffer(jh);
				877	if (!jh->b_transaction) {
				878	jbd_unlock_bh_state(bh);
				879	/* needs a brelse */
				880	journal_remove_journal_head(bh);
				881	release_buffer_page(bh);
				882	} else
				883	jbd_unlock_bh_state(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	884	}
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	885	cond_resched_lock(&journal->j_list_lock);
				886	}
				887	spin_unlock(&journal->j_list_lock);
				888	/*
Jan Kara	d4beaf4	2007-12-04 23:45:27 -0800	[diff] [blame]	889	* This is a bit sleazy. We use j_list_lock to protect transition
				890	* of a transaction into T_FINISHED state and calling
				891	* __journal_drop_transaction(). Otherwise we could race with
				892	* other checkpointing code processing the transaction...
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	893	*/
				894	spin_lock(&journal->j_state_lock);
				895	spin_lock(&journal->j_list_lock);
				896	/*
				897	* Now recheck if some buffers did not get attached to the transaction
				898	* while the lock was dropped...
				899	*/
				900	if (commit_transaction->t_forget) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	901	spin_unlock(&journal->j_list_lock);
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	902	spin_unlock(&journal->j_state_lock);
				903	goto restart_loop;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	904	}
				905
				906	/* Done with this transaction! */
				907
				908	jbd_debug(3, "JBD: commit phase 8\n");
				909
				910	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				911
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	912	commit_transaction->t_state = T_FINISHED;
				913	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				914	journal->j_commit_sequence = commit_transaction->t_tid;
				915	journal->j_committing_transaction = NULL;
				916	spin_unlock(&journal->j_state_lock);
				917
Jan Kara	fe28e42	2007-07-15 23:37:18 -0700	[diff] [blame]	918	if (commit_transaction->t_checkpoint_list == NULL &&
				919	commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	920	__journal_drop_transaction(journal, commit_transaction);
				921	} else {
				922	if (journal->j_checkpoint_transactions == NULL) {
				923	journal->j_checkpoint_transactions = commit_transaction;
				924	commit_transaction->t_cpnext = commit_transaction;
				925	commit_transaction->t_cpprev = commit_transaction;
				926	} else {
				927	commit_transaction->t_cpnext =
				928	journal->j_checkpoint_transactions;
				929	commit_transaction->t_cpprev =
				930	commit_transaction->t_cpnext->t_cpprev;
				931	commit_transaction->t_cpnext->t_cpprev =
				932	commit_transaction;
				933	commit_transaction->t_cpprev->t_cpnext =
				934	commit_transaction;
				935	}
				936	}
				937	spin_unlock(&journal->j_list_lock);
				938
				939	jbd_debug(1, "JBD: commit %d complete, head %d\n",
				940	journal->j_commit_sequence, journal->j_tail_sequence);
				941
				942	wake_up(&journal->j_wait_done_commit);
				943	}