/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes. ie: data writeback. Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
	struct list_head list;
	struct list_head wait_list;
	struct rcu_head rcu_head;

	unsigned long seen;
	atomic_t pending;

	struct super_block *sb;
	unsigned long nr_pages;
	enum writeback_sync_modes sync_mode;

	unsigned long state;
};
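
/*
 * A summary of the lifetime rules implemented below: 'seen' is a bitmask
 * with one bit per writeback thread that still has to pick the item up,
 * 'pending' counts the threads that have not yet finished with it, and
 * 'state' carries the WS_USED/WS_ONSTACK bits so that on-stack items can
 * be waited upon and freed safely under RCU.
 */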

enum {
	WS_USED_B = 0,
	WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
	return test_bit(WS_ONSTACK_B, &work->state);
}

static inline void bdi_work_init(struct bdi_work *work,
				 struct writeback_control *wbc)
{
	INIT_RCU_HEAD(&work->rcu_head);
	work->sb = wbc->sb;
	work->nr_pages = wbc->nr_to_write;
	work->sync_mode = wbc->sync_mode;
	work->state = WS_USED;
}

static inline void bdi_work_init_on_stack(struct bdi_work *work,
					  struct writeback_control *wbc)
{
	bdi_work_init(work, wbc);
	work->state |= WS_ONSTACK;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return !list_empty(&bdi->work_list);
}

static void bdi_work_clear(struct bdi_work *work)
{
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&work->state, WS_USED_B);
}

static void bdi_work_free(struct rcu_head *head)
{
	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

	if (!bdi_work_on_stack(work))
		kfree(work);
	else
		bdi_work_clear(work);
}

static void wb_work_complete(struct bdi_work *work)
{
	const enum writeback_sync_modes sync_mode = work->sync_mode;

	/*
	 * For allocated work, we can clear the done/seen bit right here.
	 * For on-stack work, we need to postpone both the clear and free
	 * to after the RCU grace period, since the stack could be invalidated
	 * as soon as bdi_work_clear() has done the wakeup.
	 */
	if (!bdi_work_on_stack(work))
		bdi_work_clear(work);
	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
		call_rcu(&work->rcu_head, bdi_work_free);
}
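
/*
 * Note that a heap-allocated WB_SYNC_ALL item is deliberately not freed
 * above: for that case the waiter in bdi_writeback_all() issues the
 * call_rcu() itself once bdi_wait_on_work_clear() has returned.
 */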

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
	/*
	 * The caller has retrieved the work arguments from this work, so
	 * drop our reference. If this is the last ref, delete and free it.
	 */
	if (atomic_dec_and_test(&work->pending)) {
		struct backing_dev_info *bdi = wb->bdi;

		spin_lock(&bdi->wb_lock);
		list_del_rcu(&work->list);
		spin_unlock(&bdi->wb_lock);

		wb_work_complete(work);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
	if (work) {
		work->seen = bdi->wb_mask;
		BUG_ON(!work->seen);
		atomic_set(&work->pending, bdi->wb_cnt);
		BUG_ON(!bdi->wb_cnt);

		/*
		 * Make sure stores are seen before it appears on the list
		 */
		smp_mb();

		spin_lock(&bdi->wb_lock);
		list_add_tail_rcu(&work->list, &bdi->work_list);
		spin_unlock(&bdi->wb_lock);
	}

	/*
	 * If the default thread isn't there, make sure we add it. When
	 * it gets created and wakes up, we'll run this work.
	 */
	if (unlikely(list_empty_careful(&bdi->wb_list)))
		wake_up_process(default_backing_dev_info.wb.task);
	else {
		struct bdi_writeback *wb = &bdi->wb;

		/*
		 * If we failed to allocate the bdi work item, always wake
		 * up the wb thread; as a safety precaution, it will flush
		 * out everything.
		 */
		if (!wb_has_dirty_io(wb)) {
			if (work)
				wb_clear_pending(wb, work);
		} else if (wb->task)
			wake_up_process(wb->task);
	}
}
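
/*
 * Queueing summary: each item is marked as visible to every registered
 * writeback thread (wb_mask/wb_cnt), published on bdi->work_list under
 * wb_lock, and a thread is then kicked. Readers walk the list under RCU,
 * which is why list_add_tail_rcu() is used above.
 */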

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}

static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
{
	struct bdi_work *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work)
		bdi_work_init(work, wbc);

	return work;
}

void bdi_start_writeback(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct bdi_work work_stack, *work = NULL;

	if (!must_wait)
		work = bdi_alloc_work(wbc);

	if (!work) {
		work = &work_stack;
		bdi_work_init_on_stack(work, wbc);
	}

	bdi_queue_work(wbc->bdi, work);

	/*
	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
	 * complete. If not, we only need to wait for the work to be started,
	 * and only if we allocated it on-stack. We use the same mechanism in
	 * both cases: if the wait bit is set in the bdi_work struct, the
	 * threads will not clear pending until after they are done.
	 *
	 * Note that work == &work_stack if must_wait is true, so we never
	 * need to do call_rcu() here, since the completion path will have
	 * done that for us.
	 */
	if (must_wait || work == &work_stack) {
		bdi_wait_on_work_clear(work);
		if (work != &work_stack)
			call_rcu(&work->rcu_head, bdi_work_free);
	}
}
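
/*
 * Example (a sketch, not taken from a real caller): a WB_SYNC_NONE user
 * would typically do something like
 *
 *	struct writeback_control wbc = {
 *		.bdi		= bdi,
 *		.sync_mode	= WB_SYNC_NONE,
 *		.older_than_this = NULL,
 *		.nr_to_write	= 1024,
 *	};
 *	bdi_start_writeback(&wbc);
 *
 * which queues a heap-allocated work item and returns without waiting.
 */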

/*
 * Redirty an inode: set its when-it-was-dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list. If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	list_move(&inode->i_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole pdflush writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
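
/*
 * Worked example of the 32-bit wraparound handled above: with HZ=1000,
 * jiffies wraps roughly every 49.7 days. An inode whose dirtied_when was
 * stamped just before the wrap compares as "after" any recent timestamp
 * for about half the wrap period, so without the time_before_eq() check
 * it would look dirtied-in-the-future and never be queued for writeout.
 */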

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
				struct list_head *dispatch_queue,
				unsigned long *older_than_this)
{
	while (!list_empty(delaying_queue)) {
		struct inode *inode = list_entry(delaying_queue->prev,
						 struct inode, i_list);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_list, dispatch_queue);
	}
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	list_splice_init(&wb->b_more_io, wb->b_io.prev);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}
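
/*
 * Ordering note for queue_io(): as written, the previously-requeued
 * b_more_io inodes are spliced in at the tail end of b_io before the
 * newly expired b_dirty inodes are added, and writeback_inodes_wb()
 * consumes b_io from the tail, so requeued inodes get serviced ahead of
 * freshly expired ones.
 */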

static int write_inode(struct inode *inode, int sync)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, sync);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	do {
		spin_unlock(&inode_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_lock);
	} while (inode->i_state & I_SYNC);
}
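
/*
 * The drop/retake of inode_lock above is required because inode_lock is
 * a spinlock and __wait_on_bit() sleeps; the loop then re-checks I_SYNC
 * after retaking the lock, since the inode may have been picked up for
 * writeback again in the window where the lock was released.
 */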

/*
 * Write out an inode's dirty pages. Called under inode_lock. Either the
 * caller has a ref on the inode (either via __iget or via syscall against
 * an fd) or the inode has I_WILL_FREE set (via generic_forget_inode).
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile. We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int wait = wbc->sync_mode == WB_SYNC_ALL;
	unsigned dirty;
	int ret;

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * have completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
			return 0;
		}

		/*
		 * It's a data-integrity sync. We must wait.
		 */
		inode_wait_for_writeback(inode);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY */
	dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY;

	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wait);
		if (ret == 0)
			ret = err;
	}

	if (wait) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&inode_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
		if (!(inode->i_state & I_DIRTY) &&
		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages. nfs_writepages()
			 * sometimes bails out without doing anything. Redirty
			 * the inode; move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of b_dirty so it gets first
			 * consideration. Otherwise, move it to the tail, for
			 * the reasons described there. I'm not really sure
			 * how much sense this makes. Presumably I had good
			 * reasons for doing it this way, and I'd rather not
			 * muck with it at present.
			 */
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				if (wbc->nr_to_write <= 0) {
					/*
					 * slice used up: queue for next turn
					 */
					requeue_io(inode);
				} else {
					/*
					 * somehow blocked: retry later
					 */
					redirty_tail(inode);
				}
			} else {
				/*
				 * Otherwise fully redirty the inode so that
				 * other inodes on this superblock will get some
				 * writeout. Otherwise heavy writing to one
				 * file would indefinitely suspend writeout of
				 * all the other files.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				redirty_tail(inode);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Someone redirtied the inode while we were writing
			 * back the pages.
			 */
			redirty_tail(inode);
		} else if (atomic_read(&inode->i_count)) {
			/*
			 * The inode is clean, inuse
			 */
			list_move(&inode->i_list, &inode_in_use);
		} else {
			/*
			 * The inode is clean, unused
			 */
			list_move(&inode->i_list, &inode_unused);
		}
	}
	inode_sync_complete(inode);
	return ret;
}
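
/*
 * Summary of the dispositions above, once I_SYNC has been dropped:
 * pages left over on a kupdate pass with the write budget used up ->
 * requeue_io(); pages left over otherwise, or redirtied while under
 * writeback -> redirty_tail(); clean and in use -> inode_in_use list;
 * clean and unused -> inode_unused list.
 */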

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
				struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Caller must already hold the ref for this
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	spin_lock(&sb_lock);
	sb->s_count++;
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root) {
			spin_unlock(&sb_lock);
			return 0;
		}
		/*
		 * umounted, drop rwsem again and fall through to failure
		 */
		up_read(&sb->s_umount);
	}

	sb->s_count--;
	spin_unlock(&sb_lock);
	return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		return;

	up_read(&sb->s_umount);
	put_super(sb);
}
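
/*
 * pin_sb_for_writeback() and unpin_sb_for_writeback() bracket each
 * writeback_single_inode() call in writeback_inodes_wb() below; for
 * WB_SYNC_ALL both are effectively no-ops, because the caller is
 * required to hold s_umount already.
 */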

static void writeback_inodes_wb(struct bdi_writeback *wb,
				struct writeback_control *wbc)
{
	struct super_block *sb = wbc->sb;
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */

	spin_lock(&inode_lock);

	if (!wbc->for_kupdate || list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = list_entry(wb->b_io.prev,
						 struct inode, i_list);
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this. Skip just this inode
				 */
				continue;
			}
			/*
			 * Dirty memory-backed inode against a filesystem other
			 * than the kernel-internal bdev filesystem. Skip the
			 * entire superblock.
			 */
			break;
		}

		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
			requeue_io(inode);
			continue;
		}

		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		/*
		 * Was this inode dirtied after this writeback pass started?
		 * This keeps sync from doing extra jobs and from livelocking.
		 */
		if (inode_dirtied_after(inode, start))
			break;

		if (pin_sb_for_writeback(wbc, inode)) {
			requeue_io(inode);
			continue;
		}

		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
		__iget(inode);
		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		unpin_sb_for_writeback(wbc, inode);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers. Skip this inode for now.
			 */
			redirty_tail(inode);
		}
		spin_unlock(&inode_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}

	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

void writeback_inodes_wbc(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = wbc->bdi;

	writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation. We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode. Also, the code reevaluates
 * the dirty state each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES 1024
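/* With the common 4KB page size, this caps each writeback chunk at 4MB. */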

static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space. So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval. But if a writeback event
 * takes longer than one dirty_writeback_interval, then leave a one-second
 * gap.
 *
 * older_than_this takes precedence over nr_to_write. So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
			 struct super_block *sb,
			 enum writeback_sync_modes sync_mode, int for_kupdate)
{
	struct writeback_control wbc = {
		.bdi			= wb->bdi,
		.sb			= sb,
		.sync_mode		= sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= for_kupdate,
		.range_cyclic		= 1,
	};
	unsigned long oldest_jif;
	long wrote = 0;

	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
		oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
	}

	for (;;) {
		/*
		 * Don't flush anything for non-integrity writeback where
		 * no nr_pages was given
		 */
		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
			break;

		/*
		 * If no specific pages were given and this is just a
		 * periodic background writeout and we are below the
		 * background dirty threshold, don't do anything
		 */
		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
			break;

		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes_wb(wb, &wbc);
		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/*
		 * If we ran out of stuff to write, bail unless more_io got set
		 */
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			if (wbc.more_io && !wbc.for_kupdate)
				continue;
			break;
		}
	}

	return wrote;
}
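
/*
 * The loop above terminates when: a non-kupdate WB_SYNC_NONE request has
 * no pages left to write; a kupdate pass finds the system below the
 * background dirty threshold; or a pass wrote less than a full chunk
 * (or skipped pages) and more_io gives no reason to go around again.
 */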

/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
					   struct bdi_writeback *wb)
{
	struct bdi_work *work, *ret = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(work, &bdi->work_list, list) {
		if (!test_and_clear_bit(wb->nr, &work->seen))
			continue;

		ret = work;
		break;
	}

	rcu_read_unlock();
	return ret;
}
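
/*
 * Each work item starts out with work->seen equal to bdi->wb_mask, one
 * bit per writeback thread (see bdi_queue_work()); the
 * test_and_clear_bit() above guarantees that a given thread processes a
 * given item at most once.
 */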

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages)
		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);

	return 0;
}
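
/*
 * dirty_writeback_interval is kept in centiseconds (it is exposed as
 * /proc/sys/vm/dirty_writeback_centisecs), hence the multiplication by
 * 10 above to convert it to milliseconds for msecs_to_jiffies().
 */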

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long nr_pages, wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		enum writeback_sync_modes sync_mode;

		nr_pages = work->nr_pages;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->sync_mode = sync_mode = WB_SYNC_ALL;
		else
			sync_mode = work->sync_mode;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}

/*
 * Schedule writeback for all backing devices. Expensive! If this is a data
 * integrity operation, writeback will be complete when this returns. If
 * we are simply called for WB_SYNC_NONE, then writeback will merely be
 * scheduled to run.
 */
static void bdi_writeback_all(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct backing_dev_info *bdi;
	struct bdi_work *work;
	LIST_HEAD(list);

restart:
	spin_lock(&bdi_lock);

	list_for_each_entry(bdi, &bdi_list, bdi_list) {
		struct bdi_work *work;

		if (!bdi_has_dirty_io(bdi))
			continue;

		/*
		 * If work allocation fails, do the writes inline. We drop
		 * the lock and restart the list writeout. This should be OK,
		 * since this happens rarely and because the writeout should
		 * eventually make more free memory available.
		 */
		work = bdi_alloc_work(wbc);
		if (!work) {
			struct writeback_control __wbc;

			/*
			 * Not a data integrity writeout, just continue
			 */
			if (!must_wait)
				continue;

			spin_unlock(&bdi_lock);
			__wbc = *wbc;
			__wbc.bdi = bdi;
			writeback_inodes_wbc(&__wbc);
			goto restart;
		}
		if (must_wait)
			list_add_tail(&work->wait_list, &list);

		bdi_queue_work(bdi, work);
	}

	spin_unlock(&bdi_lock);

	/*
	 * If this is for WB_SYNC_ALL, wait for pending work to complete
	 * before returning.
	 */
	while (!list_empty(&list)) {
		work = list_entry(list.next, struct bdi_work, wait_list);
		list_del(&work->wait_list);
		bdi_wait_on_work_clear(work);
		call_rcu(&work->rcu_head, bdi_work_free);
	}
}

/*
 * Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.range_cyclic	= 1,
	};

	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	wbc.nr_to_write = nr_pages;
	bdi_writeback_all(&wbc);
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}
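
/*
 * block_dump is the vm.block_dump sysctl; when it is enabled, each inode
 * dirtying is logged as above so that (for example) laptop users can see
 * which process keeps waking the disk up.
 */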

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages. This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list. Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (hlist_unhashed(&inode->i_hash))
				goto out;
		}
		if (inode->i_state & (I_FREEING|I_CLEAR))
			goto out;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
out:
	spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);
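
/*
 * Callers normally reach __mark_inode_dirty() through the wrappers in
 * include/linux/fs.h: mark_inode_dirty(inode) passes I_DIRTY, and
 * mark_inode_dirty_sync(inode) passes I_DIRTY_SYNC.
 */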

/*
 * Write out a superblock's list of dirty inodes. A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched. For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io. They are moved back onto
 * bdi->b_dirty as they are selected for writing. This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct writeback_control *wbc)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));

	spin_lock(&inode_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
		struct address_space *mapping;

		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
			continue;
		mapping = inode->i_mapping;
		if (mapping->nrpages == 0)
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have
		 * been removed from s_inodes list while we dropped the
		 * inode_lock. We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it
		 * under inode_lock. So we keep the reference and iput
		 * it later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_lock);
	}
	spin_unlock(&inode_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO. The number of pages submitted is
 * returned.
 */
long writeback_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_NONE,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
	long nr_to_write;

	nr_to_write = nr_dirty + nr_unstable +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);
	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block. The number of pages synced is returned.
 */
long sync_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	long nr_to_write = LONG_MAX; /* doesn't actually matter */

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);
	wait_sb_inodes(&wbc);
	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(sync_inodes_sb);

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk. It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, wbc);
	spin_unlock(&inode_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);

/**
 * generic_osync_inode - flush all dirty data for a given inode to disk
 * @inode: inode to write
 * @mapping: the address_space that should be flushed
 * @what: what to write and wait upon
 *
 * This can be called by file_write functions for files which have the
 * O_SYNC flag set, to flush dirty writes to disk.
 *
 * @what is a bitmask, specifying which part of the inode's data should be
 * written and waited upon.
 *
 * OSYNC_DATA: i_mapping's dirty data
 * OSYNC_METADATA: the buffers at i_mapping->private_list
 * OSYNC_INODE: the inode itself
 */

int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
{
	int err = 0;
	int need_write_inode_now = 0;
	int err2;

	if (what & OSYNC_DATA)
		err = filemap_fdatawrite(mapping);
	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
		err2 = sync_mapping_buffers(mapping);
		if (!err)
			err = err2;
	}
	if (what & OSYNC_DATA) {
		err2 = filemap_fdatawait(mapping);
		if (!err)
			err = err2;
	}

	spin_lock(&inode_lock);
	if ((inode->i_state & I_DIRTY) &&
	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
		need_write_inode_now = 1;
	spin_unlock(&inode_lock);

	if (need_write_inode_now) {
		err2 = write_inode_now(inode, 1);
		if (!err)
			err = err2;
	} else
		inode_sync_wait(inode);

	return err;
}
EXPORT_SYMBOL(generic_osync_inode);