Blame - fs/jbd/commit.c - kernel/msm-4.9

blob: be4648bc7a2f8febe584a999040599bac87edfb6 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
Uwe Zeisberger	f30c226	2006-10-03 23:01:26 +0200	[diff] [blame]	2	* linux/fs/jbd/commit.c
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	3	*
				4	* Written by Stephen C. Tweedie <sct@redhat.com>, 1998
				5	*
				6	* Copyright 1998 Red Hat corp --- All Rights Reserved
				7	*
				8	* This file is part of the Linux kernel and is made available under
				9	* the terms of the GNU General Public License, version 2, or at your
				10	* option, any later version, incorporated herein by reference.
				11	*
				12	* Journal commit routines for the generic filesystem journaling code;
				13	* part of the ext2fs journaling system.
				14	*/
				15
				16	#include <linux/time.h>
				17	#include <linux/fs.h>
				18	#include <linux/jbd.h>
				19	#include <linux/errno.h>
				20	#include <linux/slab.h>
				21	#include <linux/mm.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/smp_lock.h>
				24
				25	/*
				26	* Default IO end handler for temporary BJ_IO buffer_heads.
				27	*/
				28	static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate)
				29	{
				30	BUFFER_TRACE(bh, "");
				31	if (uptodate)
				32	set_buffer_uptodate(bh);
				33	else
				34	clear_buffer_uptodate(bh);
				35	unlock_buffer(bh);
				36	}
				37
				38	/*
				39	* When an ext3-ordered file is truncated, it is possible that many pages are
				40	* not sucessfully freed, because they are attached to a committing transaction.
				41	* After the transaction commits, these pages are left on the LRU, with no
				42	* ->mapping, and with attached buffers. These pages are trivially reclaimable
				43	* by the VM, but their apparent absence upsets the VM accounting, and it makes
				44	* the numbers in /proc/meminfo look odd.
				45	*
				46	* So here, we have a buffer which has just come off the forget list. Look to
				47	* see if we can strip all buffers from the backing page.
				48	*
				49	* Called under lock_journal(), and possibly under journal_datalist_lock. The
				50	* caller provided us with a ref against the buffer, and we drop that here.
				51	*/
				52	static void release_buffer_page(struct buffer_head *bh)
				53	{
				54	struct page *page;
				55
				56	if (buffer_dirty(bh))
				57	goto nope;
				58	if (atomic_read(&bh->b_count) != 1)
				59	goto nope;
				60	page = bh->b_page;
				61	if (!page)
				62	goto nope;
				63	if (page->mapping)
				64	goto nope;
				65
				66	/* OK, it's a truncated page */
				67	if (TestSetPageLocked(page))
				68	goto nope;
				69
				70	page_cache_get(page);
				71	__brelse(bh);
				72	try_to_free_buffers(page);
				73	unlock_page(page);
				74	page_cache_release(page);
				75	return;
				76
				77	nope:
				78	__brelse(bh);
				79	}
				80
				81	/*
				82	* Try to acquire jbd_lock_bh_state() against the buffer, when j_list_lock is
				83	* held. For ranking reasons we must trylock. If we lose, schedule away and
				84	* return 0. j_list_lock is dropped in this case.
				85	*/
				86	static int inverted_lock(journal_t journal, struct buffer_head bh)
				87	{
				88	if (!jbd_trylock_bh_state(bh)) {
				89	spin_unlock(&journal->j_list_lock);
				90	schedule();
				91	return 0;
				92	}
				93	return 1;
				94	}
				95
				96	/* Done it all: now write the commit record. We should have
				97	* cleaned up our previous buffers by now, so if we are in abort
				98	* mode we can now just skip the rest of the journal write
				99	* entirely.
				100	*
				101	* Returns 1 if the journal needs to be aborted or 0 on success
				102	*/
				103	static int journal_write_commit_record(journal_t *journal,
				104	transaction_t *commit_transaction)
				105	{
				106	struct journal_head *descriptor;
				107	struct buffer_head *bh;
				108	int i, ret;
				109	int barrier_done = 0;
				110
				111	if (is_journal_aborted(journal))
				112	return 0;
				113
				114	descriptor = journal_get_descriptor_buffer(journal);
				115	if (!descriptor)
				116	return 1;
				117
				118	bh = jh2bh(descriptor);
				119
				120	/* AKPM: buglet - add `i' to tmp! */
				121	for (i = 0; i < bh->b_size; i += 512) {
				122	journal_header_t tmp = (journal_header_t)bh->b_data;
				123	tmp->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				124	tmp->h_blocktype = cpu_to_be32(JFS_COMMIT_BLOCK);
				125	tmp->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				126	}
				127
				128	JBUFFER_TRACE(descriptor, "write commit block");
				129	set_buffer_dirty(bh);
				130	if (journal->j_flags & JFS_BARRIER) {
				131	set_buffer_ordered(bh);
				132	barrier_done = 1;
				133	}
				134	ret = sync_dirty_buffer(bh);
				135	/* is it possible for another commit to fail at roughly
				136	* the same time as this one? If so, we don't want to
				137	* trust the barrier flag in the super, but instead want
				138	* to remember if we sent a barrier request
				139	*/
				140	if (ret == -EOPNOTSUPP && barrier_done) {
				141	char b[BDEVNAME_SIZE];
				142
				143	printk(KERN_WARNING
				144	"JBD: barrier-based sync failed on %s - "
				145	"disabling barriers\n",
				146	bdevname(journal->j_dev, b));
				147	spin_lock(&journal->j_state_lock);
				148	journal->j_flags &= ~JFS_BARRIER;
				149	spin_unlock(&journal->j_state_lock);
				150
				151	/* And try again, without the barrier */
				152	clear_buffer_ordered(bh);
				153	set_buffer_uptodate(bh);
				154	set_buffer_dirty(bh);
				155	ret = sync_dirty_buffer(bh);
				156	}
				157	put_bh(bh); /* One for getblk() */
				158	journal_put_journal_head(descriptor);
				159
				160	return (ret == -EIO);
				161	}
				162
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	163	static void journal_do_submit_data(struct buffer_head **wbuf, int bufs)
				164	{
				165	int i;
				166
				167	for (i = 0; i < bufs; i++) {
				168	wbuf[i]->b_end_io = end_buffer_write_sync;
				169	/* We use-up our safety reference in submit_bh() */
				170	submit_bh(WRITE, wbuf[i]);
				171	}
				172	}
				173
				174	/*
				175	* Submit all the data buffers to disk
				176	*/
				177	static void journal_submit_data_buffers(journal_t *journal,
				178	transaction_t *commit_transaction)
				179	{
				180	struct journal_head *jh;
				181	struct buffer_head *bh;
				182	int locked;
				183	int bufs = 0;
				184	struct buffer_head **wbuf = journal->j_wbuf;
				185
				186	/*
				187	* Whenever we unlock the journal and sleep, things can get added
				188	* onto ->t_sync_datalist, so we have to keep looping back to
				189	* write_out_data until we know that the list is empty.
				190	*
				191	* Cleanup any flushed data buffers from the data list. Even in
				192	* abort mode, we want to flush this out as soon as possible.
				193	*/
				194	write_out_data:
				195	cond_resched();
				196	spin_lock(&journal->j_list_lock);
				197
				198	while (commit_transaction->t_sync_datalist) {
				199	jh = commit_transaction->t_sync_datalist;
				200	bh = jh2bh(jh);
				201	locked = 0;
				202
				203	/* Get reference just to make sure buffer does not disappear
				204	* when we are forced to drop various locks */
				205	get_bh(bh);
				206	/* If the buffer is dirty, we need to submit IO and hence
				207	* we need the buffer lock. We try to lock the buffer without
				208	* blocking. If we fail, we need to drop j_list_lock and do
				209	* blocking lock_buffer().
				210	*/
				211	if (buffer_dirty(bh)) {
				212	if (test_set_buffer_locked(bh)) {
				213	BUFFER_TRACE(bh, "needs blocking lock");
				214	spin_unlock(&journal->j_list_lock);
				215	/* Write out all data to prevent deadlocks */
				216	journal_do_submit_data(wbuf, bufs);
				217	bufs = 0;
				218	lock_buffer(bh);
				219	spin_lock(&journal->j_list_lock);
				220	}
				221	locked = 1;
				222	}
				223	/* We have to get bh_state lock. Again out of order, sigh. */
				224	if (!inverted_lock(journal, bh)) {
				225	jbd_lock_bh_state(bh);
				226	spin_lock(&journal->j_list_lock);
				227	}
				228	/* Someone already cleaned up the buffer? */
				229	if (!buffer_jbd(bh)
				230	\|\| jh->b_transaction != commit_transaction
				231	\|\| jh->b_jlist != BJ_SyncData) {
				232	jbd_unlock_bh_state(bh);
				233	if (locked)
				234	unlock_buffer(bh);
				235	BUFFER_TRACE(bh, "already cleaned up");
				236	put_bh(bh);
				237	continue;
				238	}
				239	if (locked && test_clear_buffer_dirty(bh)) {
				240	BUFFER_TRACE(bh, "needs writeout, adding to array");
				241	wbuf[bufs++] = bh;
				242	__journal_file_buffer(jh, commit_transaction,
				243	BJ_Locked);
				244	jbd_unlock_bh_state(bh);
				245	if (bufs == journal->j_wbufsize) {
				246	spin_unlock(&journal->j_list_lock);
				247	journal_do_submit_data(wbuf, bufs);
				248	bufs = 0;
				249	goto write_out_data;
				250	}
Hisashi Hifumi	6f5a9da	2006-12-22 01:11:50 -0800	[diff] [blame]	251	} else if (!locked && buffer_locked(bh)) {
				252	__journal_file_buffer(jh, commit_transaction,
				253	BJ_Locked);
				254	jbd_unlock_bh_state(bh);
				255	put_bh(bh);
				256	} else {
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	257	BUFFER_TRACE(bh, "writeout complete: unfile");
				258	__journal_unfile_buffer(jh);
				259	jbd_unlock_bh_state(bh);
				260	if (locked)
				261	unlock_buffer(bh);
				262	journal_remove_journal_head(bh);
				263	/* Once for our safety reference, once for
				264	* journal_remove_journal_head() */
				265	put_bh(bh);
				266	put_bh(bh);
				267	}
				268
				269	if (lock_need_resched(&journal->j_list_lock)) {
				270	spin_unlock(&journal->j_list_lock);
				271	goto write_out_data;
				272	}
				273	}
				274	spin_unlock(&journal->j_list_lock);
				275	journal_do_submit_data(wbuf, bufs);
				276	}
				277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	/*
				279	* journal_commit_transaction
				280	*
				281	* The primary function for committing a transaction to the log. This
				282	* function is called by the journal thread to begin a complete commit.
				283	*/
				284	void journal_commit_transaction(journal_t *journal)
				285	{
				286	transaction_t *commit_transaction;
				287	struct journal_head jh, new_jh, *descriptor;
				288	struct buffer_head **wbuf = journal->j_wbuf;
				289	int bufs;
				290	int flags;
				291	int err;
				292	unsigned long blocknr;
				293	char *tagp = NULL;
				294	journal_header_t *header;
				295	journal_block_tag_t *tag = NULL;
				296	int space_left = 0;
				297	int first_tag = 0;
				298	int tag_flag;
				299	int i;
				300
				301	/*
				302	* First job: lock down the current transaction and wait for
				303	* all outstanding updates to complete.
				304	*/
				305
				306	#ifdef COMMIT_STATS
				307	spin_lock(&journal->j_list_lock);
				308	summarise_journal_usage(journal);
				309	spin_unlock(&journal->j_list_lock);
				310	#endif
				311
				312	/* Do we need to erase the effects of a prior journal_flush? */
				313	if (journal->j_flags & JFS_FLUSHED) {
				314	jbd_debug(3, "super block updated\n");
				315	journal_update_superblock(journal, 1);
				316	} else {
				317	jbd_debug(3, "superblock not updated\n");
				318	}
				319
				320	J_ASSERT(journal->j_running_transaction != NULL);
				321	J_ASSERT(journal->j_committing_transaction == NULL);
				322
				323	commit_transaction = journal->j_running_transaction;
				324	J_ASSERT(commit_transaction->t_state == T_RUNNING);
				325
				326	jbd_debug(1, "JBD: starting commit of transaction %d\n",
				327	commit_transaction->t_tid);
				328
				329	spin_lock(&journal->j_state_lock);
				330	commit_transaction->t_state = T_LOCKED;
				331
				332	spin_lock(&commit_transaction->t_handle_lock);
				333	while (commit_transaction->t_updates) {
				334	DEFINE_WAIT(wait);
				335
				336	prepare_to_wait(&journal->j_wait_updates, &wait,
				337	TASK_UNINTERRUPTIBLE);
				338	if (commit_transaction->t_updates) {
				339	spin_unlock(&commit_transaction->t_handle_lock);
				340	spin_unlock(&journal->j_state_lock);
				341	schedule();
				342	spin_lock(&journal->j_state_lock);
				343	spin_lock(&commit_transaction->t_handle_lock);
				344	}
				345	finish_wait(&journal->j_wait_updates, &wait);
				346	}
				347	spin_unlock(&commit_transaction->t_handle_lock);
				348
				349	J_ASSERT (commit_transaction->t_outstanding_credits <=
				350	journal->j_max_transaction_buffers);
				351
				352	/*
				353	* First thing we are allowed to do is to discard any remaining
				354	* BJ_Reserved buffers. Note, it is _not_ permissible to assume
				355	* that there are no such buffers: if a large filesystem
				356	* operation like a truncate needs to split itself over multiple
				357	* transactions, then it may try to do a journal_restart() while
				358	* there are still BJ_Reserved buffers outstanding. These must
				359	* be released cleanly from the current transaction.
				360	*
				361	* In this case, the filesystem must still reserve write access
				362	* again before modifying the buffer in the new transaction, but
				363	* we do not require it to remember exactly which old buffers it
				364	* has reserved. This is consistent with the existing behaviour
				365	* that multiple journal_get_write_access() calls to the same
				366	* buffer are perfectly permissable.
				367	*/
				368	while (commit_transaction->t_reserved_list) {
				369	jh = commit_transaction->t_reserved_list;
				370	JBUFFER_TRACE(jh, "reserved, unused: refile");
				371	/*
				372	* A journal_get_undo_access()+journal_release_buffer() may
				373	* leave undo-committed data.
				374	*/
				375	if (jh->b_committed_data) {
				376	struct buffer_head *bh = jh2bh(jh);
				377
				378	jbd_lock_bh_state(bh);
Badari Pulavarty	ea81739	2006-08-27 01:23:52 -0700	[diff] [blame]	379	jbd_slab_free(jh->b_committed_data, bh->b_size);
Jesper Juhl	f99d49a	2005-11-07 01:01:34 -0800	[diff] [blame]	380	jh->b_committed_data = NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	381	jbd_unlock_bh_state(bh);
				382	}
				383	journal_refile_buffer(journal, jh);
				384	}
				385
				386	/*
				387	* Now try to drop any written-back buffers from the journal's
				388	* checkpoint lists. We do this before commit because it potentially
				389	* frees some memory
				390	*/
				391	spin_lock(&journal->j_list_lock);
				392	__journal_clean_checkpoint_list(journal);
				393	spin_unlock(&journal->j_list_lock);
				394
				395	jbd_debug (3, "JBD: commit phase 1\n");
				396
				397	/*
				398	* Switch to a new revoke table.
				399	*/
				400	journal_switch_revoke_table(journal);
				401
				402	commit_transaction->t_state = T_FLUSH;
				403	journal->j_committing_transaction = commit_transaction;
				404	journal->j_running_transaction = NULL;
				405	commit_transaction->t_log_start = journal->j_head;
				406	wake_up(&journal->j_wait_transaction_locked);
				407	spin_unlock(&journal->j_state_lock);
				408
				409	jbd_debug (3, "JBD: commit phase 2\n");
				410
				411	/*
				412	* First, drop modified flag: all accesses to the buffers
				413	* will be tracked for a new trasaction only -bzzz
				414	*/
				415	spin_lock(&journal->j_list_lock);
				416	if (commit_transaction->t_buffers) {
				417	new_jh = jh = commit_transaction->t_buffers->b_tnext;
				418	do {
				419	J_ASSERT_JH(new_jh, new_jh->b_modified == 1 \|\|
				420	new_jh->b_modified == 0);
				421	new_jh->b_modified = 0;
				422	new_jh = new_jh->b_tnext;
				423	} while (new_jh != jh);
				424	}
				425	spin_unlock(&journal->j_list_lock);
				426
				427	/*
				428	* Now start flushing things to disk, in the order they appear
				429	* on the transaction lists. Data blocks go first.
				430	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	431	err = 0;
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	432	journal_submit_data_buffers(journal, commit_transaction);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	433
				434	/*
				435	* Wait for all previously submitted IO to complete.
				436	*/
Jan Kara	3998b93	2006-09-25 23:30:53 -0700	[diff] [blame]	437	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	438	while (commit_transaction->t_locked_list) {
				439	struct buffer_head *bh;
				440
				441	jh = commit_transaction->t_locked_list->b_tprev;
				442	bh = jh2bh(jh);
				443	get_bh(bh);
				444	if (buffer_locked(bh)) {
				445	spin_unlock(&journal->j_list_lock);
				446	wait_on_buffer(bh);
				447	if (unlikely(!buffer_uptodate(bh)))
				448	err = -EIO;
				449	spin_lock(&journal->j_list_lock);
				450	}
				451	if (!inverted_lock(journal, bh)) {
				452	put_bh(bh);
				453	spin_lock(&journal->j_list_lock);
				454	continue;
				455	}
				456	if (buffer_jbd(bh) && jh->b_jlist == BJ_Locked) {
				457	__journal_unfile_buffer(jh);
				458	jbd_unlock_bh_state(bh);
				459	journal_remove_journal_head(bh);
				460	put_bh(bh);
				461	} else {
				462	jbd_unlock_bh_state(bh);
				463	}
				464	put_bh(bh);
				465	cond_resched_lock(&journal->j_list_lock);
				466	}
				467	spin_unlock(&journal->j_list_lock);
				468
				469	if (err)
				470	__journal_abort_hard(journal);
				471
				472	journal_write_revoke_records(journal, commit_transaction);
				473
				474	jbd_debug(3, "JBD: commit phase 2\n");
				475
				476	/*
				477	* If we found any dirty or locked buffers, then we should have
				478	* looped back up to the write_out_data label. If there weren't
				479	* any then journal_clean_data_list should have wiped the list
				480	* clean by now, so check that it is in fact empty.
				481	*/
				482	J_ASSERT (commit_transaction->t_sync_datalist == NULL);
				483
				484	jbd_debug (3, "JBD: commit phase 3\n");
				485
				486	/*
				487	* Way to go: we have now written out all of the data for a
				488	* transaction! Now comes the tricky part: we need to write out
				489	* metadata. Loop over the transaction's entire buffer list:
				490	*/
				491	commit_transaction->t_state = T_COMMIT;
				492
				493	descriptor = NULL;
				494	bufs = 0;
				495	while (commit_transaction->t_buffers) {
				496
				497	/* Find the next buffer to be journaled... */
				498
				499	jh = commit_transaction->t_buffers;
				500
				501	/* If we're in abort mode, we just un-journal the buffer and
				502	release it for background writing. */
				503
				504	if (is_journal_aborted(journal)) {
				505	JBUFFER_TRACE(jh, "journal is aborting: refile");
				506	journal_refile_buffer(journal, jh);
				507	/* If that was the last one, we need to clean up
				508	* any descriptor buffers which may have been
				509	* already allocated, even if we are now
				510	* aborting. */
				511	if (!commit_transaction->t_buffers)
				512	goto start_journal_io;
				513	continue;
				514	}
				515
				516	/* Make sure we have a descriptor block in which to
				517	record the metadata buffer. */
				518
				519	if (!descriptor) {
				520	struct buffer_head *bh;
				521
				522	J_ASSERT (bufs == 0);
				523
				524	jbd_debug(4, "JBD: get descriptor\n");
				525
				526	descriptor = journal_get_descriptor_buffer(journal);
				527	if (!descriptor) {
				528	__journal_abort_hard(journal);
				529	continue;
				530	}
				531
				532	bh = jh2bh(descriptor);
				533	jbd_debug(4, "JBD: got buffer %llu (%p)\n",
				534	(unsigned long long)bh->b_blocknr, bh->b_data);
				535	header = (journal_header_t *)&bh->b_data[0];
				536	header->h_magic = cpu_to_be32(JFS_MAGIC_NUMBER);
				537	header->h_blocktype = cpu_to_be32(JFS_DESCRIPTOR_BLOCK);
				538	header->h_sequence = cpu_to_be32(commit_transaction->t_tid);
				539
				540	tagp = &bh->b_data[sizeof(journal_header_t)];
				541	space_left = bh->b_size - sizeof(journal_header_t);
				542	first_tag = 1;
				543	set_buffer_jwrite(bh);
				544	set_buffer_dirty(bh);
				545	wbuf[bufs++] = bh;
				546
				547	/* Record it so that we can wait for IO
				548	completion later */
				549	BUFFER_TRACE(bh, "ph3: file as descriptor");
				550	journal_file_buffer(descriptor, commit_transaction,
				551	BJ_LogCtl);
				552	}
				553
				554	/* Where is the buffer to be written? */
				555
				556	err = journal_next_log_block(journal, &blocknr);
				557	/* If the block mapping failed, just abandon the buffer
				558	and repeat this loop: we'll fall into the
				559	refile-on-abort condition above. */
				560	if (err) {
				561	__journal_abort_hard(journal);
				562	continue;
				563	}
				564
				565	/*
				566	* start_this_handle() uses t_outstanding_credits to determine
				567	* the free space in the log, but this counter is changed
				568	* by journal_next_log_block() also.
				569	*/
				570	commit_transaction->t_outstanding_credits--;
				571
				572	/* Bump b_count to prevent truncate from stumbling over
				573	the shadowed buffer! @@@ This can go if we ever get
				574	rid of the BJ_IO/BJ_Shadow pairing of buffers. */
				575	atomic_inc(&jh2bh(jh)->b_count);
				576
				577	/* Make a temporary IO buffer with which to write it out
				578	(this will requeue both the metadata buffer and the
				579	temporary IO buffer). new_bh goes on BJ_IO*/
				580
				581	set_bit(BH_JWrite, &jh2bh(jh)->b_state);
				582	/*
				583	* akpm: journal_write_metadata_buffer() sets
				584	* new_bh->b_transaction to commit_transaction.
				585	* We need to clean this up before we release new_bh
				586	* (which is of type BJ_IO)
				587	*/
				588	JBUFFER_TRACE(jh, "ph3: write metadata");
				589	flags = journal_write_metadata_buffer(commit_transaction,
				590	jh, &new_jh, blocknr);
				591	set_bit(BH_JWrite, &jh2bh(new_jh)->b_state);
				592	wbuf[bufs++] = jh2bh(new_jh);
				593
				594	/* Record the new block's tag in the current descriptor
				595	buffer */
				596
				597	tag_flag = 0;
				598	if (flags & 1)
				599	tag_flag \|= JFS_FLAG_ESCAPE;
				600	if (!first_tag)
				601	tag_flag \|= JFS_FLAG_SAME_UUID;
				602
				603	tag = (journal_block_tag_t *) tagp;
				604	tag->t_blocknr = cpu_to_be32(jh2bh(jh)->b_blocknr);
				605	tag->t_flags = cpu_to_be32(tag_flag);
				606	tagp += sizeof(journal_block_tag_t);
				607	space_left -= sizeof(journal_block_tag_t);
				608
				609	if (first_tag) {
				610	memcpy (tagp, journal->j_uuid, 16);
				611	tagp += 16;
				612	space_left -= 16;
				613	first_tag = 0;
				614	}
				615
				616	/* If there's no more to do, or if the descriptor is full,
				617	let the IO rip! */
				618
				619	if (bufs == journal->j_wbufsize \|\|
				620	commit_transaction->t_buffers == NULL \|\|
				621	space_left < sizeof(journal_block_tag_t) + 16) {
				622
				623	jbd_debug(4, "JBD: Submit %d IOs\n", bufs);
				624
				625	/* Write an end-of-descriptor marker before
				626	submitting the IOs. "tag" still points to
				627	the last tag we set up. */
				628
				629	tag->t_flags \|= cpu_to_be32(JFS_FLAG_LAST_TAG);
				630
				631	start_journal_io:
				632	for (i = 0; i < bufs; i++) {
				633	struct buffer_head *bh = wbuf[i];
				634	lock_buffer(bh);
				635	clear_buffer_dirty(bh);
				636	set_buffer_uptodate(bh);
				637	bh->b_end_io = journal_end_buffer_io_sync;
				638	submit_bh(WRITE, bh);
				639	}
				640	cond_resched();
				641
				642	/* Force a new descriptor to be generated next
				643	time round the loop. */
				644	descriptor = NULL;
				645	bufs = 0;
				646	}
				647	}
				648
				649	/* Lo and behold: we have just managed to send a transaction to
				650	the log. Before we can commit it, wait for the IO so far to
				651	complete. Control buffers being written are on the
				652	transaction's t_log_list queue, and metadata buffers are on
				653	the t_iobuf_list queue.
				654
				655	Wait for the buffers in reverse order. That way we are
				656	less likely to be woken up until all IOs have completed, and
				657	so we incur less scheduling load.
				658	*/
				659
				660	jbd_debug(3, "JBD: commit phase 4\n");
				661
				662	/*
				663	* akpm: these are BJ_IO, and j_list_lock is not needed.
				664	* See __journal_try_to_free_buffer.
				665	*/
				666	wait_for_iobuf:
				667	while (commit_transaction->t_iobuf_list != NULL) {
				668	struct buffer_head *bh;
				669
				670	jh = commit_transaction->t_iobuf_list->b_tprev;
				671	bh = jh2bh(jh);
				672	if (buffer_locked(bh)) {
				673	wait_on_buffer(bh);
				674	goto wait_for_iobuf;
				675	}
				676	if (cond_resched())
				677	goto wait_for_iobuf;
				678
				679	if (unlikely(!buffer_uptodate(bh)))
				680	err = -EIO;
				681
				682	clear_buffer_jwrite(bh);
				683
				684	JBUFFER_TRACE(jh, "ph4: unfile after journal write");
				685	journal_unfile_buffer(journal, jh);
				686
				687	/*
				688	* ->t_iobuf_list should contain only dummy buffer_heads
				689	* which were created by journal_write_metadata_buffer().
				690	*/
				691	BUFFER_TRACE(bh, "dumping temporary bh");
				692	journal_put_journal_head(jh);
				693	__brelse(bh);
				694	J_ASSERT_BH(bh, atomic_read(&bh->b_count) == 0);
				695	free_buffer_head(bh);
				696
				697	/* We also have to unlock and free the corresponding
				698	shadowed buffer */
				699	jh = commit_transaction->t_shadow_list->b_tprev;
				700	bh = jh2bh(jh);
				701	clear_bit(BH_JWrite, &bh->b_state);
				702	J_ASSERT_BH(bh, buffer_jbddirty(bh));
				703
				704	/* The metadata is now released for reuse, but we need
				705	to remember it against this transaction so that when
				706	we finally commit, we can do any checkpointing
				707	required. */
				708	JBUFFER_TRACE(jh, "file as BJ_Forget");
				709	journal_file_buffer(jh, commit_transaction, BJ_Forget);
				710	/* Wake up any transactions which were waiting for this
				711	IO to complete */
				712	wake_up_bit(&bh->b_state, BH_Unshadow);
				713	JBUFFER_TRACE(jh, "brelse shadowed buffer");
				714	__brelse(bh);
				715	}
				716
				717	J_ASSERT (commit_transaction->t_shadow_list == NULL);
				718
				719	jbd_debug(3, "JBD: commit phase 5\n");
				720
				721	/* Here we wait for the revoke record and descriptor record buffers */
				722	wait_for_ctlbuf:
				723	while (commit_transaction->t_log_list != NULL) {
				724	struct buffer_head *bh;
				725
				726	jh = commit_transaction->t_log_list->b_tprev;
				727	bh = jh2bh(jh);
				728	if (buffer_locked(bh)) {
				729	wait_on_buffer(bh);
				730	goto wait_for_ctlbuf;
				731	}
				732	if (cond_resched())
				733	goto wait_for_ctlbuf;
				734
				735	if (unlikely(!buffer_uptodate(bh)))
				736	err = -EIO;
				737
				738	BUFFER_TRACE(bh, "ph5: control buffer writeout done: unfile");
				739	clear_buffer_jwrite(bh);
				740	journal_unfile_buffer(journal, jh);
				741	journal_put_journal_head(jh);
				742	__brelse(bh); /* One for getblk */
				743	/* AKPM: bforget here */
				744	}
				745
				746	jbd_debug(3, "JBD: commit phase 6\n");
				747
				748	if (journal_write_commit_record(journal, commit_transaction))
				749	err = -EIO;
				750
				751	if (err)
				752	__journal_abort_hard(journal);
				753
				754	/* End of a transaction! Finally, we can do checkpoint
				755	processing: any buffers committed as a result of this
				756	transaction can be removed from any checkpoint list it was on
				757	before. */
				758
				759	jbd_debug(3, "JBD: commit phase 7\n");
				760
				761	J_ASSERT(commit_transaction->t_sync_datalist == NULL);
				762	J_ASSERT(commit_transaction->t_buffers == NULL);
				763	J_ASSERT(commit_transaction->t_checkpoint_list == NULL);
				764	J_ASSERT(commit_transaction->t_iobuf_list == NULL);
				765	J_ASSERT(commit_transaction->t_shadow_list == NULL);
				766	J_ASSERT(commit_transaction->t_log_list == NULL);
				767
				768	restart_loop:
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	769	/*
				770	* As there are other places (journal_unmap_buffer()) adding buffers
				771	* to this list we have to be careful and hold the j_list_lock.
				772	*/
				773	spin_lock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	774	while (commit_transaction->t_forget) {
				775	transaction_t *cp_transaction;
				776	struct buffer_head *bh;
				777
				778	jh = commit_transaction->t_forget;
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	779	spin_unlock(&journal->j_list_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	780	bh = jh2bh(jh);
				781	jbd_lock_bh_state(bh);
				782	J_ASSERT_JH(jh, jh->b_transaction == commit_transaction \|\|
				783	jh->b_transaction == journal->j_running_transaction);
				784
				785	/*
				786	* If there is undo-protected committed data against
				787	* this buffer, then we can remove it now. If it is a
				788	* buffer needing such protection, the old frozen_data
				789	* field now points to a committed version of the
				790	* buffer, so rotate that field to the new committed
				791	* data.
				792	*
				793	* Otherwise, we can just throw away the frozen data now.
				794	*/
				795	if (jh->b_committed_data) {
Badari Pulavarty	ea81739	2006-08-27 01:23:52 -0700	[diff] [blame]	796	jbd_slab_free(jh->b_committed_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	797	jh->b_committed_data = NULL;
				798	if (jh->b_frozen_data) {
				799	jh->b_committed_data = jh->b_frozen_data;
				800	jh->b_frozen_data = NULL;
				801	}
				802	} else if (jh->b_frozen_data) {
Badari Pulavarty	ea81739	2006-08-27 01:23:52 -0700	[diff] [blame]	803	jbd_slab_free(jh->b_frozen_data, bh->b_size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	804	jh->b_frozen_data = NULL;
				805	}
				806
				807	spin_lock(&journal->j_list_lock);
				808	cp_transaction = jh->b_cp_transaction;
				809	if (cp_transaction) {
				810	JBUFFER_TRACE(jh, "remove from old cp transaction");
				811	__journal_remove_checkpoint(jh);
				812	}
				813
				814	/* Only re-checkpoint the buffer_head if it is marked
				815	* dirty. If the buffer was added to the BJ_Forget list
				816	* by journal_forget, it may no longer be dirty and
				817	* there's no point in keeping a checkpoint record for
				818	* it. */
				819
				820	/* A buffer which has been freed while still being
				821	* journaled by a previous transaction may end up still
				822	* being dirty here, but we want to avoid writing back
				823	* that buffer in the future now that the last use has
				824	* been committed. That's not only a performance gain,
				825	* it also stops aliasing problems if the buffer is left
				826	* behind for writeback and gets reallocated for another
				827	* use in a different page. */
				828	if (buffer_freed(bh)) {
				829	clear_buffer_freed(bh);
				830	clear_buffer_jbddirty(bh);
				831	}
				832
				833	if (buffer_jbddirty(bh)) {
				834	JBUFFER_TRACE(jh, "add to new checkpointing trans");
				835	__journal_insert_checkpoint(jh, commit_transaction);
				836	JBUFFER_TRACE(jh, "refile for checkpoint writeback");
				837	__journal_refile_buffer(jh);
				838	jbd_unlock_bh_state(bh);
				839	} else {
				840	J_ASSERT_BH(bh, !buffer_dirty(bh));
Jan Kara	9ada734	2006-06-23 02:05:25 -0700	[diff] [blame]	841	/* The buffer on BJ_Forget list and not jbddirty means
				842	* it has been freed by this transaction and hence it
				843	* could not have been reallocated until this
				844	* transaction has committed. BUT it could be
				845	* reallocated once we have written all the data to
				846	* disk and before we process the buffer on BJ_Forget
				847	* list. */
				848	JBUFFER_TRACE(jh, "refile or unfile freed buffer");
				849	__journal_refile_buffer(jh);
				850	if (!jh->b_transaction) {
				851	jbd_unlock_bh_state(bh);
				852	/* needs a brelse */
				853	journal_remove_journal_head(bh);
				854	release_buffer_page(bh);
				855	} else
				856	jbd_unlock_bh_state(bh);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	857	}
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	858	cond_resched_lock(&journal->j_list_lock);
				859	}
				860	spin_unlock(&journal->j_list_lock);
				861	/*
				862	* This is a bit sleazy. We borrow j_list_lock to protect
				863	* journal->j_committing_transaction in __journal_remove_checkpoint.
				864	* Really, __journal_remove_checkpoint should be using j_state_lock but
				865	* it's a bit hassle to hold that across __journal_remove_checkpoint
				866	*/
				867	spin_lock(&journal->j_state_lock);
				868	spin_lock(&journal->j_list_lock);
				869	/*
				870	* Now recheck if some buffers did not get attached to the transaction
				871	* while the lock was dropped...
				872	*/
				873	if (commit_transaction->t_forget) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	874	spin_unlock(&journal->j_list_lock);
Jan Kara	e6c9f5c	2005-09-06 15:19:09 -0700	[diff] [blame]	875	spin_unlock(&journal->j_state_lock);
				876	goto restart_loop;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	877	}
				878
				879	/* Done with this transaction! */
				880
				881	jbd_debug(3, "JBD: commit phase 8\n");
				882
				883	J_ASSERT(commit_transaction->t_state == T_COMMIT);
				884
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	885	commit_transaction->t_state = T_FINISHED;
				886	J_ASSERT(commit_transaction == journal->j_committing_transaction);
				887	journal->j_commit_sequence = commit_transaction->t_tid;
				888	journal->j_committing_transaction = NULL;
				889	spin_unlock(&journal->j_state_lock);
				890
Mark Fasheh	7c8903f	2006-02-14 13:53:03 -0800	[diff] [blame]	891	if (commit_transaction->t_checkpoint_list == NULL) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	892	__journal_drop_transaction(journal, commit_transaction);
				893	} else {
				894	if (journal->j_checkpoint_transactions == NULL) {
				895	journal->j_checkpoint_transactions = commit_transaction;
				896	commit_transaction->t_cpnext = commit_transaction;
				897	commit_transaction->t_cpprev = commit_transaction;
				898	} else {
				899	commit_transaction->t_cpnext =
				900	journal->j_checkpoint_transactions;
				901	commit_transaction->t_cpprev =
				902	commit_transaction->t_cpnext->t_cpprev;
				903	commit_transaction->t_cpnext->t_cpprev =
				904	commit_transaction;
				905	commit_transaction->t_cpprev->t_cpnext =
				906	commit_transaction;
				907	}
				908	}
				909	spin_unlock(&journal->j_list_lock);
				910
				911	jbd_debug(1, "JBD: commit %d complete, head %d\n",
				912	journal->j_commit_sequence, journal->j_tail_sequence);
				913
				914	wake_up(&journal->j_wait_done_commit);
				915	}