Blame - fs/jbd/commit.c - kernel/msm-4.9

blob: a263d82761dfdc3fcf63be7900ed47a803b9b3b2 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Uwe Zeisberger	f30c226	2006-10-03 23:01:26 +0200	[diff] [blame]	2	* linux/fs/jbd/commit.c
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	23
				24	/*
				25	* Default IO end handler for temporary BJ_IO buffer_heads.
				26	*/
				27	static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				28	{
				29	BUFFER_TRACE(bh, "");
				30	if (uptodate)
				31	set_buffer_uptodate(bh);
				32	else
				33	clear_buffer_uptodate(bh);
				34	unlock_buffer(bh);
				35	}
				36
				37	/*
				38	* When an ext3-ordered file is truncated, it is possible that many pages are
				39	* not sucessfully freed, because they are attached to a committing transaction.
				40	* After the transaction commits, these pages are left on the LRU, with no
				41	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				42	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				43	* the numbers in /proc/meminfo look odd.
				44	*
				45	* So here, we have a buffer which has just come off the forget list. Look to
				46	* see if we can strip all buffers from the backing page.
				47	*
				48	* Called under lock_journal(), and possibly under journal_datalist_lock. The
				49	* caller provided us with a ref against the buffer, and we drop that here.
				50	*/
				51	static void release_buffer_page(struct buffer_head *bh)
				52	{
				53	struct page *page;
				54
				55	if (buffer_dirty(bh))
				56	goto nope;
				57	if (atomic_read(&bh->b_count) != 1)
				58	goto nope;
				59	page = bh->b_page;
				60	if (!page)
				61	goto nope;
				62	if (page->mapping)
				63	goto nope;
				64
				65	/* OK, it's a truncated page */
				66	if (TestSetPageLocked(page))
				67	goto nope;
				68
				69	page_cache_get(page);
				70	__brelse(bh);
				71	try_to_free_buffers(page);
				72	unlock_page(page);
				73	page_cache_release(page);
				74	return;
				75
				76	nope:
				77	__brelse(bh);
				78	}
				79
				80	/*
				81	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
				82	* held. For ranking reasons we must trylock. If we lose, schedule away and
				83	* return 0. j_list_lock is dropped in this case.
				84	*/
				85	static int inverted_lock(journal_t journal, struct buffer_head bh)
				86	{
				87	if (!jbd_trylock_bh_state(bh)) {
				88	spin_unlock(&journal->j_list_lock);
				89	schedule();
				90	return 0;
				91	}
				92	return 1;
				93	}
				94
				95	/* Done it all: now write the commit record. We should have
				96	* cleaned up our previous buffers by now, so if we are in abort
				97	* mode we can now just skip the rest of the journal write
				98	* entirely.
				99	*
				100	* Returns 1 if the journal needs to be aborted or 0 on success
				101	*/
				102	static int journal_write_commit_record(journal_t *journal,
				103	transaction_t *commit_transaction)
				104	{
				105	struct journal_head *descriptor;
				106	struct buffer_head *bh;
				107	int i, ret;
				108	int barrier_done = 0;
				109
				110	if (is_journal_aborted(journal))
				111	return 0;
				112
				113	descriptor = journal_get_descriptor_buffer(journal);
				114	if (!descriptor)
				115	return 1;
				116
				117	bh = jh2bh(descriptor);
				118
				119	/* AKPM: buglet - add `i' to tmp! */
				120	for (i = 0; i < bh->b_size; i += 512) {
				121	journal_header_t tmp = (journal_header_t)bh->b_data;
				122	tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				123	tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
				124	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				125	}
				126
				127	JBUFFER_TRACE(descriptor, "write commit block");
				128	set_buffer_dirty(bh);
				129	if (journal->j_flags & JFS_BARRIER) {
				130	set_buffer_ordered(bh);
				131	barrier_done = 1;
				132	}
				133	ret = sync_dirty_buffer(bh);
				134	/* is it possible for another commit to fail at roughly
				135	* the same time as this one? If so, we don't want to
				136	* trust the barrier flag in the super, but instead want
				137	* to remember if we sent a barrier request
				138	*/
				139	if (ret == -EOPNOTSUPP && barrier_done) {
				140	char b[BDEVNAME_SIZE];
				141
				142	printk(KERN_WARNING
				143	"JBD: barrier-based sync failed on %s - "
				144	"disabling barriers\n",
				145	bdevname(journal->j_dev, b));
				146	spin_lock(&journal->j_state_lock);
				147	journal->j_flags &= ~JFS_BARRIER;
				148	spin_unlock(&journal->j_state_lock);
				149
				150	/* And try again, without the barrier */
				151	clear_buffer_ordered(bh);
				152	set_buffer_uptodate(bh);
				153	set_buffer_dirty(bh);
				154	ret = sync_dirty_buffer(bh);
				155	}
				156	put_bh(bh); /* One for getblk() */
				157	journal_put_journal_head(descriptor);
				158
				159	return (ret == -EIO);
				160	}
				161
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	162	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
				163	{
				164	int i;
				165
				166	for (i = 0; i < bufs; i++) {
				167	wbuf[i]->b_end_io = end_buffer_write_sync;
				168	/* We use-up our safety reference in submit_bh() */
				169	submit_bh(WRITE, wbuf[i]);
				170	}
				171	}
				172
				173	/*
				174	* Submit all the data buffers to disk
				175	*/
				176	static void journal_submit_data_buffers(journal_t *journal,
				177	transaction_t *commit_transaction)
				178	{
				179	struct journal_head *jh;
				180	struct buffer_head *bh;
				181	int locked;
				182	int bufs = 0;
				183	struct buffer_head **wbuf = journal->j_wbuf;
				184
				185	/*
				186	* Whenever we unlock the journal and sleep, things can get added
				187	* onto ->t_sync_datalist, so we have to keep looping back to
				188	* write_out_data until we know that the list is empty.
				189	*
				190	* Cleanup any flushed data buffers from the data list. Even in
				191	* abort mode, we want to flush this out as soon as possible.
				192	*/
				193	write_out_data:
				194	cond_resched();
				195	spin_lock(&journal->j_list_lock);
				196
				197	while (commit_transaction->t_sync_datalist) {
				198	jh = commit_transaction->t_sync_datalist;
				199	bh = jh2bh(jh);
				200	locked = 0;
				201
				202	/* Get reference just to make sure buffer does not disappear
				203	* when we are forced to drop various locks */
				204	get_bh(bh);
				205	/* If the buffer is dirty, we need to submit IO and hence
				206	* we need the buffer lock. We try to lock the buffer without
				207	* blocking. If we fail, we need to drop j_list_lock and do
				208	* blocking lock_buffer().
				209	*/
				210	if (buffer_dirty(bh)) {
				211	if (test_set_buffer_locked(bh)) {
				212	BUFFER_TRACE(bh, "needs blocking lock");
				213	spin_unlock(&journal->j_list_lock);
				214	/* Write out all data to prevent deadlocks */
				215	journal_do_submit_data(wbuf, bufs);
				216	bufs = 0;
				217	lock_buffer(bh);
				218	spin_lock(&journal->j_list_lock);
				219	}
				220	locked = 1;
				221	}
				222	/* We have to get bh_state lock. Again out of order, sigh. */
				223	if (!inverted_lock(journal, bh)) {
				224	jbd_lock_bh_state(bh);
				225	spin_lock(&journal->j_list_lock);
				226	}
				227	/* Someone already cleaned up the buffer? */
				228	if (!buffer_jbd(bh)
				229	\|\| jh->b_transaction != commit_transaction
				230	\|\| jh->b_jlist != BJ_SyncData) {
				231	jbd_unlock_bh_state(bh);
				232	if (locked)
				233	unlock_buffer(bh);
				234	BUFFER_TRACE(bh, "already cleaned up");
				235	put_bh(bh);
				236	continue;
				237	}
				238	if (locked && test_clear_buffer_dirty(bh)) {
				239	BUFFER_TRACE(bh, "needs writeout, adding to array");
				240	wbuf[bufs++] = bh;
				241	__journal_file_buffer(jh, commit_transaction,
				242	BJ_Locked);
				243	jbd_unlock_bh_state(bh);
				244	if (bufs == journal->j_wbufsize) {
				245	spin_unlock(&journal->j_list_lock);
				246	journal_do_submit_data(wbuf, bufs);
				247	bufs = 0;
				248	goto write_out_data;
				249	}
Hisashi Hifumi	6f5a9da	2006-12-22 01:11:50 -0800	[diff] [blame]	250	} else if (!locked && buffer_locked(bh)) {
				251	__journal_file_buffer(jh, commit_transaction,
				252	BJ_Locked);
				253	jbd_unlock_bh_state(bh);
				254	put_bh(bh);
				255	} else {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	256	BUFFER_TRACE(bh, "writeout complete: unfile");
				257	__journal_unfile_buffer(jh);
				258	jbd_unlock_bh_state(bh);
				259	if (locked)
				260	unlock_buffer(bh);
				261	journal_remove_journal_head(bh);
				262	/* Once for our safety reference, once for
				263	* journal_remove_journal_head() */
				264	put_bh(bh);
				265	put_bh(bh);
				266	}
				267
				268	if (lock_need_resched(&journal->j_list_lock)) {
				269	spin_unlock(&journal->j_list_lock);
				270	goto write_out_data;
				271	}
				272	}
				273	spin_unlock(&journal->j_list_lock);
				274	journal_do_submit_data(wbuf, bufs);
				275	}
				276
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	277	/*
				278	* journal_commit_transaction
				279	*
				280	* The primary function for committing a transaction to the log. This
				281	* function is called by the journal thread to begin a complete commit.
				282	*/
				283	void journal_commit_transaction(journal_t *journal)
				284	{
				285	transaction_t *commit_transaction;
				286	struct journal_head jh, new_jh, *descriptor;
				287	struct buffer_head **wbuf = journal->j_wbuf;
				288	int bufs;
				289	int flags;
				290	int err;
				291	unsigned long blocknr;
				292	char *tagp = NULL;
				293	journal_header_t *header;
				294	journal_block_tag_t *tag = NULL;
				295	int space_left = 0;
				296	int first_tag = 0;
				297	int tag_flag;
				298	int i;
				299
				300	/*
				301	* First job: lock down the current transaction and wait for
				302	* all outstanding updates to complete.
				303	*/
				304
				305	#ifdef COMMIT_STATS
				306	spin_lock(&journal->j_list_lock);
				307	summarise_journal_usage(journal);
				308	spin_unlock(&journal->j_list_lock);
				309	#endif
				310
				311	/* Do we need to erase the effects of a prior journal_flush? */
				312	if (journal->j_flags & JFS_FLUSHED) {
				313	jbd_debug(3, "super block updated\n");
				314	journal_update_superblock(journal, 1);
				315	} else {
				316	jbd_debug(3, "superblock not updated\n");
				317	}
				318
				319	J_ASSERT(journal->j_running_transaction != NULL);
				320	J_ASSERT(journal->j_committing_transaction == NULL);
				321
				322	commit_transaction = journal->j_running_transaction;
				323	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				324
				325	jbd_debug(1, "JBD: starting commit of transaction %d\n",
				326	commit_transaction->t_tid);
				327
				328	spin_lock(&journal->j_state_lock);
				329	commit_transaction->t_state = T_LOCKED;
				330
				331	spin_lock(&commit_transaction->t_handle_lock);
				332	while (commit_transaction->t_updates) {
				333	DEFINE_WAIT(wait);
				334
				335	prepare_to_wait(&journal->j_wait_updates, &wait,
				336	TASK_UNINTERRUPTIBLE);
				337	if (commit_transaction->t_updates) {
				338	spin_unlock(&commit_transaction->t_handle_lock);
				339	spin_unlock(&journal->j_state_lock);
				340	schedule();
				341	spin_lock(&journal->j_state_lock);
				342	spin_lock(&commit_transaction->t_handle_lock);
				343	}
				344	finish_wait(&journal->j_wait_updates, &wait);
				345	}
				346	spin_unlock(&commit_transaction->t_handle_lock);
				347
				348	J_ASSERT (commit_transaction->t_outstanding_credits <=
				349	journal->j_max_transaction_buffers);
				350
				351	/*
				352	* First thing we are allowed to do is to discard any remaining
				353	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				354	* that there are no such buffers: if a large filesystem
				355	* operation like a truncate needs to split itself over multiple
				356	* transactions, then it may try to do a journal_restart() while
				357	* there are still BJ_Reserved buffers outstanding. These must
				358	* be released cleanly from the current transaction.
				359	*
				360	* In this case, the filesystem must still reserve write access
				361	* again before modifying the buffer in the new transaction, but
				362	* we do not require it to remember exactly which old buffers it
				363	* has reserved. This is consistent with the existing behaviour
				364	* that multiple journal_get_write_access() calls to the same
				365	* buffer are perfectly permissable.
				366	*/
				367	while (commit_transaction->t_reserved_list) {
				368	jh = commit_transaction->t_reserved_list;
				369	JBUFFER_TRACE(jh, "reserved, unused: refile");
				370	/*
				371	* A journal_get_undo_access()+journal_release_buffer() may
				372	* leave undo-committed data.
				373	*/
				374	if (jh->b_committed_data) {
				375	struct buffer_head *bh = jh2bh(jh);
				376
				377	jbd_lock_bh_state(bh);
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame^]	378	jbd_free(jh->b_committed_data, bh->b_size);
Jesper Juhl	f99d49a	2005-11-07 01:01:34 -0800	[diff] [blame]	379	jh->b_committed_data = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	380	jbd_unlock_bh_state(bh);
				381	}
				382	journal_refile_buffer(journal, jh);
				383	}
				384
				385	/*
				386	* Now try to drop any written-back buffers from the journal's
				387	* checkpoint lists. We do this before commit because it potentially
				388	* frees some memory
				389	*/
				390	spin_lock(&journal->j_list_lock);
				391	__journal_clean_checkpoint_list(journal);
				392	spin_unlock(&journal->j_list_lock);
				393
				394	jbd_debug (3, "JBD: commit phase 1\n");
				395
				396	/*
				397	* Switch to a new revoke table.
				398	*/
				399	journal_switch_revoke_table(journal);
				400
				401	commit_transaction->t_state = T_FLUSH;
				402	journal->j_committing_transaction = commit_transaction;
				403	journal->j_running_transaction = NULL;
				404	commit_transaction->t_log_start = journal->j_head;
				405	wake_up(&journal->j_wait_transaction_locked);
				406	spin_unlock(&journal->j_state_lock);
				407
				408	jbd_debug (3, "JBD: commit phase 2\n");
				409
				410	/*
				411	* First, drop modified flag: all accesses to the buffers
				412	* will be tracked for a new trasaction only -bzzz
				413	*/
				414	spin_lock(&journal->j_list_lock);
				415	if (commit_transaction->t_buffers) {
				416	new_jh = jh = commit_transaction->t_buffers->b_tnext;
				417	do {
				418	J_ASSERT_JH(new_jh, new_jh->b_modified == 1 \|\|
				419	new_jh->b_modified == 0);
				420	new_jh->b_modified = 0;
				421	new_jh = new_jh->b_tnext;
				422	} while (new_jh != jh);
				423	}
				424	spin_unlock(&journal->j_list_lock);
				425
				426	/*
				427	* Now start flushing things to disk, in the order they appear
				428	* on the transaction lists. Data blocks go first.
				429	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	430	err = 0;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	431	journal_submit_data_buffers(journal, commit_transaction);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	432
				433	/*
				434	* Wait for all previously submitted IO to complete.
				435	*/
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	436	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	437	while (commit_transaction->t_locked_list) {
				438	struct buffer_head *bh;
				439
				440	jh = commit_transaction->t_locked_list->b_tprev;
				441	bh = jh2bh(jh);
				442	get_bh(bh);
				443	if (buffer_locked(bh)) {
				444	spin_unlock(&journal->j_list_lock);
				445	wait_on_buffer(bh);
				446	if (unlikely(!buffer_uptodate(bh)))
				447	err = -EIO;
				448	spin_lock(&journal->j_list_lock);
				449	}
				450	if (!inverted_lock(journal, bh)) {
				451	put_bh(bh);
				452	spin_lock(&journal->j_list_lock);
				453	continue;
				454	}
				455	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
				456	__journal_unfile_buffer(jh);
				457	jbd_unlock_bh_state(bh);
				458	journal_remove_journal_head(bh);
				459	put_bh(bh);
				460	} else {
				461	jbd_unlock_bh_state(bh);
				462	}
				463	put_bh(bh);
				464	cond_resched_lock(&journal->j_list_lock);
				465	}
				466	spin_unlock(&journal->j_list_lock);
				467
				468	if (err)
				469	__journal_abort_hard(journal);
				470
				471	journal_write_revoke_records(journal, commit_transaction);
				472
				473	jbd_debug(3, "JBD: commit phase 2\n");
				474
				475	/*
				476	* If we found any dirty or locked buffers, then we should have
				477	* looped back up to the write_out_data label. If there weren't
				478	* any then journal_clean_data_list should have wiped the list
				479	* clean by now, so check that it is in fact empty.
				480	*/
				481	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
				482
				483	jbd_debug (3, "JBD: commit phase 3\n");
				484
				485	/*
				486	* Way to go: we have now written out all of the data for a
				487	* transaction! Now comes the tricky part: we need to write out
				488	* metadata. Loop over the transaction's entire buffer list:
				489	*/
				490	commit_transaction->t_state = T_COMMIT;
				491
				492	descriptor = NULL;
				493	bufs = 0;
				494	while (commit_transaction->t_buffers) {
				495
				496	/* Find the next buffer to be journaled... */
				497
				498	jh = commit_transaction->t_buffers;
				499
				500	/* If we're in abort mode, we just un-journal the buffer and
				501	release it for background writing. */
				502
				503	if (is_journal_aborted(journal)) {
				504	JBUFFER_TRACE(jh, "journal is aborting: refile");
				505	journal_refile_buffer(journal, jh);
				506	/* If that was the last one, we need to clean up
				507	* any descriptor buffers which may have been
				508	* already allocated, even if we are now
				509	* aborting. */
				510	if (!commit_transaction->t_buffers)
				511	goto start_journal_io;
				512	continue;
				513	}
				514
				515	/* Make sure we have a descriptor block in which to
				516	record the metadata buffer. */
				517
				518	if (!descriptor) {
				519	struct buffer_head *bh;
				520
				521	J_ASSERT (bufs == 0);
				522
				523	jbd_debug(4, "JBD: get descriptor\n");
				524
				525	descriptor = journal_get_descriptor_buffer(journal);
				526	if (!descriptor) {
				527	__journal_abort_hard(journal);
				528	continue;
				529	}
				530
				531	bh = jh2bh(descriptor);
				532	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				533	(unsigned long long)bh->b_blocknr, bh->b_data);
				534	header = (journal_header_t *)&bh->b_data[0];
				535	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				536	header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
				537	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				538
				539	tagp = &bh->b_data[sizeof(journal_header_t)];
				540	space_left = bh->b_size - sizeof(journal_header_t);
				541	first_tag = 1;
				542	set_buffer_jwrite(bh);
				543	set_buffer_dirty(bh);
				544	wbuf[bufs++] = bh;
				545
				546	/* Record it so that we can wait for IO
				547	completion later */
				548	BUFFER_TRACE(bh, "ph3: file as descriptor");
				549	journal_file_buffer(descriptor, commit_transaction,
				550	BJ_LogCtl);
				551	}
				552
				553	/* Where is the buffer to be written? */
				554
				555	err = journal_next_log_block(journal, &blocknr);
				556	/* If the block mapping failed, just abandon the buffer
				557	and repeat this loop: we'll fall into the
				558	refile-on-abort condition above. */
				559	if (err) {
				560	__journal_abort_hard(journal);
				561	continue;
				562	}
				563
				564	/*
				565	* start_this_handle() uses t_outstanding_credits to determine
				566	* the free space in the log, but this counter is changed
				567	* by journal_next_log_block() also.
				568	*/
				569	commit_transaction->t_outstanding_credits--;
				570
				571	/* Bump b_count to prevent truncate from stumbling over
				572	the shadowed buffer! @@@ This can go if we ever get
				573	rid of the BJ_IO/BJ_Shadow pairing of buffers. */
				574	atomic_inc(&jh2bh(jh)->b_count);
				575
				576	/* Make a temporary IO buffer with which to write it out
				577	(this will requeue both the metadata buffer and the
				578	temporary IO buffer). new_bh goes on BJ_IO*/
				579
				580	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				581	/*
				582	* akpm: journal_write_metadata_buffer() sets
				583	* new_bh->b_transaction to commit_transaction.
				584	* We need to clean this up before we release new_bh
				585	* (which is of type BJ_IO)
				586	*/
				587	JBUFFER_TRACE(jh, "ph3: write metadata");
				588	flags = journal_write_metadata_buffer(commit_transaction,
				589	jh, &new_jh, blocknr);
				590	set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
				591	wbuf[bufs++] = jh2bh(new_jh);
				592
				593	/* Record the new block's tag in the current descriptor
				594	buffer */
				595
				596	tag_flag = 0;
				597	if (flags & 1)
				598	tag_flag \|= JFS_FLAG_ESCAPE;
				599	if (!first_tag)
				600	tag_flag \|= JFS_FLAG_SAME_UUID;
				601
				602	tag = (journal_block_tag_t *) tagp;
				603	tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
				604	tag->t_flags = cpu_to_be32(tag_flag);
				605	tagp += sizeof(journal_block_tag_t);
				606	space_left -= sizeof(journal_block_tag_t);
				607
				608	if (first_tag) {
				609	memcpy (tagp, journal->j_uuid, 16);
				610	tagp += 16;
				611	space_left -= 16;
				612	first_tag = 0;
				613	}
				614
				615	/* If there's no more to do, or if the descriptor is full,
				616	let the IO rip! */
				617
				618	if (bufs == journal->j_wbufsize \|\|
				619	commit_transaction->t_buffers == NULL \|\|
				620	space_left < sizeof(journal_block_tag_t) + 16) {
				621
				622	jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
				623
				624	/* Write an end-of-descriptor marker before
				625	submitting the IOs. "tag" still points to
				626	the last tag we set up. */
				627
				628	tag->t_flags \|= cpu_to_be32(JFS_FLAG_LAST_TAG);
				629
				630	start_journal_io:
				631	for (i = 0; i < bufs; i++) {
				632	struct buffer_head *bh = wbuf[i];
				633	lock_buffer(bh);
				634	clear_buffer_dirty(bh);
				635	set_buffer_uptodate(bh);
				636	bh->b_end_io = journal_end_buffer_io_sync;
				637	submit_bh(WRITE, bh);
				638	}
				639	cond_resched();
				640
				641	/* Force a new descriptor to be generated next
				642	time round the loop. */
				643	descriptor = NULL;
				644	bufs = 0;
				645	}
				646	}
				647
				648	/* Lo and behold: we have just managed to send a transaction to
				649	the log. Before we can commit it, wait for the IO so far to
				650	complete. Control buffers being written are on the
				651	transaction's t_log_list queue, and metadata buffers are on
				652	the t_iobuf_list queue.
				653
				654	Wait for the buffers in reverse order. That way we are
				655	less likely to be woken up until all IOs have completed, and
				656	so we incur less scheduling load.
				657	*/
				658
				659	jbd_debug(3, "JBD: commit phase 4\n");
				660
				661	/*
				662	* akpm: these are BJ_IO, and j_list_lock is not needed.
				663	* See __journal_try_to_free_buffer.
				664	*/
				665	wait_for_iobuf:
				666	while (commit_transaction->t_iobuf_list != NULL) {
				667	struct buffer_head *bh;
				668
				669	jh = commit_transaction->t_iobuf_list->b_tprev;
				670	bh = jh2bh(jh);
				671	if (buffer_locked(bh)) {
				672	wait_on_buffer(bh);
				673	goto wait_for_iobuf;
				674	}
				675	if (cond_resched())
				676	goto wait_for_iobuf;
				677
				678	if (unlikely(!buffer_uptodate(bh)))
				679	err = -EIO;
				680
				681	clear_buffer_jwrite(bh);
				682
				683	JBUFFER_TRACE(jh, "ph4: unfile after journal write");
				684	journal_unfile_buffer(journal, jh);
				685
				686	/*
				687	* ->t_iobuf_list should contain only dummy buffer_heads
				688	* which were created by journal_write_metadata_buffer().
				689	*/
				690	BUFFER_TRACE(bh, "dumping temporary bh");
				691	journal_put_journal_head(jh);
				692	__brelse(bh);
				693	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				694	free_buffer_head(bh);
				695
				696	/* We also have to unlock and free the corresponding
				697	shadowed buffer */
				698	jh = commit_transaction->t_shadow_list->b_tprev;
				699	bh = jh2bh(jh);
				700	clear_bit(BH_JWrite, &bh->b_state);
				701	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				702
				703	/* The metadata is now released for reuse, but we need
				704	to remember it against this transaction so that when
				705	we finally commit, we can do any checkpointing
				706	required. */
				707	JBUFFER_TRACE(jh, "file as BJ_Forget");
				708	journal_file_buffer(jh, commit_transaction, BJ_Forget);
				709	/* Wake up any transactions which were waiting for this
				710	IO to complete */
				711	wake_up_bit(&bh->b_state, BH_Unshadow);
				712	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				713	__brelse(bh);
				714	}
				715
				716	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				717
				718	jbd_debug(3, "JBD: commit phase 5\n");
				719
				720	/* Here we wait for the revoke record and descriptor record buffers */
				721	wait_for_ctlbuf:
				722	while (commit_transaction->t_log_list != NULL) {
				723	struct buffer_head *bh;
				724
				725	jh = commit_transaction->t_log_list->b_tprev;
				726	bh = jh2bh(jh);
				727	if (buffer_locked(bh)) {
				728	wait_on_buffer(bh);
				729	goto wait_for_ctlbuf;
				730	}
				731	if (cond_resched())
				732	goto wait_for_ctlbuf;
				733
				734	if (unlikely(!buffer_uptodate(bh)))
				735	err = -EIO;
				736
				737	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				738	clear_buffer_jwrite(bh);
				739	journal_unfile_buffer(journal, jh);
				740	journal_put_journal_head(jh);
				741	__brelse(bh); /* One for getblk */
				742	/* AKPM: bforget here */
				743	}
				744
				745	jbd_debug(3, "JBD: commit phase 6\n");
				746
				747	if (journal_write_commit_record(journal, commit_transaction))
				748	err = -EIO;
				749
				750	if (err)
				751	__journal_abort_hard(journal);
				752
				753	/* End of a transaction! Finally, we can do checkpoint
				754	processing: any buffers committed as a result of this
				755	transaction can be removed from any checkpoint list it was on
				756	before. */
				757
				758	jbd_debug(3, "JBD: commit phase 7\n");
				759
				760	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
				761	J_ASSERT(commit_transaction->t_buffers == NULL);
				762	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				763	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
				764	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				765	J_ASSERT(commit_transaction->t_log_list == NULL);
				766
				767	restart_loop:
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	768	/*
				769	* As there are other places (journal_unmap_buffer()) adding buffers
				770	* to this list we have to be careful and hold the j_list_lock.
				771	*/
				772	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	773	while (commit_transaction->t_forget) {
				774	transaction_t *cp_transaction;
				775	struct buffer_head *bh;
				776
				777	jh = commit_transaction->t_forget;
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	778	spin_unlock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	779	bh = jh2bh(jh);
				780	jbd_lock_bh_state(bh);
				781	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction \|\|
				782	jh->b_transaction == journal->j_running_transaction);
				783
				784	/*
				785	* If there is undo-protected committed data against
				786	* this buffer, then we can remove it now. If it is a
				787	* buffer needing such protection, the old frozen_data
				788	* field now points to a committed version of the
				789	* buffer, so rotate that field to the new committed
				790	* data.
				791	*
				792	* Otherwise, we can just throw away the frozen data now.
				793	*/
				794	if (jh->b_committed_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame^]	795	jbd_free(jh->b_committed_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	796	jh->b_committed_data = NULL;
				797	if (jh->b_frozen_data) {
				798	jh->b_committed_data = jh->b_frozen_data;
				799	jh->b_frozen_data = NULL;
				800	}
				801	} else if (jh->b_frozen_data) {
Mingming Cao	c089d49	2007-10-16 18:38:25 -0400	[diff] [blame^]	802	jbd_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	803	jh->b_frozen_data = NULL;
				804	}
				805
				806	spin_lock(&journal->j_list_lock);
				807	cp_transaction = jh->b_cp_transaction;
				808	if (cp_transaction) {
				809	JBUFFER_TRACE(jh, "remove from old cp transaction");
				810	__journal_remove_checkpoint(jh);
				811	}
				812
				813	/* Only re-checkpoint the buffer_head if it is marked
				814	* dirty. If the buffer was added to the BJ_Forget list
				815	* by journal_forget, it may no longer be dirty and
				816	* there's no point in keeping a checkpoint record for
				817	* it. */
				818
				819	/* A buffer which has been freed while still being
				820	* journaled by a previous transaction may end up still
				821	* being dirty here, but we want to avoid writing back
				822	* that buffer in the future now that the last use has
				823	* been committed. That's not only a performance gain,
				824	* it also stops aliasing problems if the buffer is left
				825	* behind for writeback and gets reallocated for another
				826	* use in a different page. */
				827	if (buffer_freed(bh)) {
				828	clear_buffer_freed(bh);
				829	clear_buffer_jbddirty(bh);
				830	}
				831
				832	if (buffer_jbddirty(bh)) {
				833	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				834	__journal_insert_checkpoint(jh, commit_transaction);
				835	JBUFFER_TRACE(jh, "refile for checkpoint writeback");
				836	__journal_refile_buffer(jh);
				837	jbd_unlock_bh_state(bh);
				838	} else {
				839	J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara	9ada734	2006-06-23 02:05:25 -0700	[diff] [blame]	840	/* The buffer on BJ_Forget list and not jbddirty means
				841	* it has been freed by this transaction and hence it
				842	* could not have been reallocated until this
				843	* transaction has committed. BUT it could be
				844	* reallocated once we have written all the data to
				845	* disk and before we process the buffer on BJ_Forget
				846	* list. */
				847	JBUFFER_TRACE(jh, "refile or unfile freed buffer");
				848	__journal_refile_buffer(jh);
				849	if (!jh->b_transaction) {
				850	jbd_unlock_bh_state(bh);
				851	/* needs a brelse */
				852	journal_remove_journal_head(bh);
				853	release_buffer_page(bh);
				854	} else
				855	jbd_unlock_bh_state(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	856	}
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	857	cond_resched_lock(&journal->j_list_lock);
				858	}
				859	spin_unlock(&journal->j_list_lock);
				860	/*
				861	* This is a bit sleazy. We borrow j_list_lock to protect
				862	* journal->j_committing_transaction in __journal_remove_checkpoint.
				863	* Really, __journal_remove_checkpoint should be using j_state_lock but
				864	* it's a bit hassle to hold that across __journal_remove_checkpoint
				865	*/
				866	spin_lock(&journal->j_state_lock);
				867	spin_lock(&journal->j_list_lock);
				868	/*
				869	* Now recheck if some buffers did not get attached to the transaction
				870	* while the lock was dropped...
				871	*/
				872	if (commit_transaction->t_forget) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	873	spin_unlock(&journal->j_list_lock);
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	874	spin_unlock(&journal->j_state_lock);
				875	goto restart_loop;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	876	}
				877
				878	/* Done with this transaction! */
				879
				880	jbd_debug(3, "JBD: commit phase 8\n");
				881
				882	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				883
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	884	commit_transaction->t_state = T_FINISHED;
				885	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				886	journal->j_commit_sequence = commit_transaction->t_tid;
				887	journal->j_committing_transaction = NULL;
				888	spin_unlock(&journal->j_state_lock);
				889
Jan Kara	fe28e42	2007-07-15 23:37:18 -0700	[diff] [blame]	890	if (commit_transaction->t_checkpoint_list == NULL &&
				891	commit_transaction->t_checkpoint_io_list == NULL) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	892	__journal_drop_transaction(journal, commit_transaction);
				893	} else {
				894	if (journal->j_checkpoint_transactions == NULL) {
				895	journal->j_checkpoint_transactions = commit_transaction;
				896	commit_transaction->t_cpnext = commit_transaction;
				897	commit_transaction->t_cpprev = commit_transaction;
				898	} else {
				899	commit_transaction->t_cpnext =
				900	journal->j_checkpoint_transactions;
				901	commit_transaction->t_cpprev =
				902	commit_transaction->t_cpnext->t_cpprev;
				903	commit_transaction->t_cpnext->t_cpprev =
				904	commit_transaction;
				905	commit_transaction->t_cpprev->t_cpnext =
				906	commit_transaction;
				907	}
				908	}
				909	spin_unlock(&journal->j_list_lock);
				910
				911	jbd_debug(1, "JBD: commit %d complete, head %d\n",
				912	journal->j_commit_sequence, journal->j_tail_sequence);
				913
				914	wake_up(&journal->j_wait_done_commit);
				915	}