Blame - fs/fs-writeback.c - kernel/msm-4.9

blob: 55f0d4e51b59081d7d2c3bc58e5533eaef41ae1e [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* fs/fs-writeback.c
				3	*
				4	* Copyright (C) 2002, Linus Torvalds.
				5	*
				6	* Contains all the functions related to writing back and waiting
				7	* upon dirty inodes against superblocks, and writing back dirty
				8	* pages against inodes. ie: data writeback. Writeout of the
				9	* inode itself is not handled here.
				10	*
Francois Cami	e1f8e87	2008-10-15 22:01:59 -0700	[diff] [blame]	11	* 10Apr2002 Andrew Morton
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	12	* Split out of fs/inode.c
				13	* Additions for address_space-based writeback
				14	*/
				15
				16	#include <linux/kernel.h>
Jens Axboe	f5ff842	2007-09-21 09:19:54 +0200	[diff] [blame]	17	#include <linux/module.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	18	#include <linux/spinlock.h>
				19	#include <linux/sched.h>
				20	#include <linux/fs.h>
				21	#include <linux/mm.h>
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	22	#include <linux/kthread.h>
				23	#include <linux/freezer.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	24	#include <linux/writeback.h>
				25	#include <linux/blkdev.h>
				26	#include <linux/backing-dev.h>
				27	#include <linux/buffer_head.h>
David Howells	07f3f05	2006-09-30 20:52:18 +0200	[diff] [blame]	28	#include "internal.h"
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	29
/* Resolve an inode to the backing_dev_info recorded on its i_mapping. */
#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	31
/*
 * We don't actually have pdflush, but this one is exported through /proc...
 */
int nr_pdflush_threads;
				36
				37	/*
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	38	* Passed into wb_writeback(), essentially a subset of writeback_control
				39	*/
				40	struct wb_writeback_args {
				41	long nr_pages;
				42	struct super_block *sb;
				43	enum writeback_sync_modes sync_mode;
				44	int for_kupdate;
				45	int range_cyclic;
				46	};
				47
				48	/*
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	49	* Work items for the bdi_writeback threads
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	50	*/
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	51	struct bdi_work {
Jens Axboe	8010c3b	2009-09-15 20:04:57 +0200	[diff] [blame^]	52	struct list_head list; /* pending work list */
				53	struct rcu_head rcu_head; /* for RCU free/clear of work */
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	54
Jens Axboe	8010c3b	2009-09-15 20:04:57 +0200	[diff] [blame^]	55	unsigned long seen; /* threads that have seen this work */
				56	atomic_t pending; /* number of threads still to do work */
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	57
Jens Axboe	8010c3b	2009-09-15 20:04:57 +0200	[diff] [blame^]	58	struct wb_writeback_args args; /* writeback arguments */
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	59
Jens Axboe	8010c3b	2009-09-15 20:04:57 +0200	[diff] [blame^]	60	unsigned long state; /* flag bits, see WS_* */
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	61	};
				62
				63	enum {
				64	WS_USED_B = 0,
				65	WS_ONSTACK_B,
				66	};
				67
				68	#define WS_USED (1 << WS_USED_B)
				69	#define WS_ONSTACK (1 << WS_ONSTACK_B)
				70
				71	static inline bool bdi_work_on_stack(struct bdi_work *work)
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	72	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	73	return test_bit(WS_ONSTACK_B, &work->state);
				74	}
				75
				76	static inline void bdi_work_init(struct bdi_work *work,
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	77	struct wb_writeback_args *args)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	78	{
				79	INIT_RCU_HEAD(&work->rcu_head);
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	80	work->args = *args;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	81	work->state = WS_USED;
				82	}
				83
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	84	/**
				85	* writeback_in_progress - determine whether there is writeback in progress
				86	* @bdi: the device's backing_dev_info structure.
				87	*
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	88	* Determine whether there is writeback waiting to be handled against a
				89	* backing device.
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	90	*/
				91	int writeback_in_progress(struct backing_dev_info *bdi)
				92	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	93	return !list_empty(&bdi->work_list);
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	94	}
				95
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	96	static void bdi_work_clear(struct bdi_work *work)
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	97	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	98	clear_bit(WS_USED_B, &work->state);
				99	smp_mb__after_clear_bit();
				100	wake_up_bit(&work->state, WS_USED_B);
Adrian Bunk	f11b00f	2008-04-29 00:58:56 -0700	[diff] [blame]	101	}
				102
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	103	static void bdi_work_free(struct rcu_head *head)
Nick Piggin	4195f73	2009-05-28 09:01:15 +0200	[diff] [blame]	104	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	105	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);
Nick Piggin	4195f73	2009-05-28 09:01:15 +0200	[diff] [blame]	106
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	107	if (!bdi_work_on_stack(work))
				108	kfree(work);
				109	else
				110	bdi_work_clear(work);
				111	}
				112
				113	static void wb_work_complete(struct bdi_work *work)
				114	{
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	115	const enum writeback_sync_modes sync_mode = work->args.sync_mode;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	116
				117	/*
				118	* For allocated work, we can clear the done/seen bit right here.
				119	* For on-stack work, we need to postpone both the clear and free
				120	* to after the RCU grace period, since the stack could be invalidated
				121	* as soon as bdi_work_clear() has done the wakeup.
				122	*/
				123	if (!bdi_work_on_stack(work))
				124	bdi_work_clear(work);
				125	if (sync_mode == WB_SYNC_NONE \|\| bdi_work_on_stack(work))
				126	call_rcu(&work->rcu_head, bdi_work_free);
				127	}
				128
				129	static void wb_clear_pending(struct bdi_writeback wb, struct bdi_work work)
				130	{
				131	/*
				132	* The caller has retrieved the work arguments from this work,
				133	* drop our reference. If this is the last ref, delete and free it
				134	*/
				135	if (atomic_dec_and_test(&work->pending)) {
				136	struct backing_dev_info *bdi = wb->bdi;
				137
				138	spin_lock(&bdi->wb_lock);
				139	list_del_rcu(&work->list);
				140	spin_unlock(&bdi->wb_lock);
				141
				142	wb_work_complete(work);
Nick Piggin	4195f73	2009-05-28 09:01:15 +0200	[diff] [blame]	143	}
				144	}
				145
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	146	static void bdi_queue_work(struct backing_dev_info bdi, struct bdi_work work)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	147	{
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	148	work->seen = bdi->wb_mask;
				149	BUG_ON(!work->seen);
				150	atomic_set(&work->pending, bdi->wb_cnt);
				151	BUG_ON(!bdi->wb_cnt);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	152
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	153	/*
				154	* Make sure stores are seen before it appears on the list
				155	*/
				156	smp_mb();
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	157
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	158	spin_lock(&bdi->wb_lock);
				159	list_add_tail_rcu(&work->list, &bdi->work_list);
				160	spin_unlock(&bdi->wb_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	161
				162	/*
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	163	* If the default thread isn't there, make sure we add it. When
				164	* it gets created and wakes up, we'll run this work.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	165	*/
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	166	if (unlikely(list_empty_careful(&bdi->wb_list)))
				167	wake_up_process(default_backing_dev_info.wb.task);
				168	else {
				169	struct bdi_writeback *wb = &bdi->wb;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	170
				171	/*
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	172	* End work now if this wb has no dirty IO pending. Otherwise
				173	* wakeup the handling thread
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	174	*/
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	175	if (!wb_has_dirty_io(wb))
				176	wb_clear_pending(wb, work);
				177	else if (wb->task)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	178	wake_up_process(wb->task);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	179	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	180	}
				181
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	182	/*
				183	* Used for on-stack allocated work items. The caller needs to wait until
				184	* the wb threads have acked the work before it's safe to continue.
				185	*/
				186	static void bdi_wait_on_work_clear(struct bdi_work *work)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	187	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	188	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
				189	TASK_UNINTERRUPTIBLE);
				190	}
				191
Jens Axboe	f11fcae	2009-09-15 09:53:35 +0200	[diff] [blame]	192	static void bdi_alloc_queue_work(struct backing_dev_info *bdi,
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	193	struct wb_writeback_args *args)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	194	{
				195	struct bdi_work *work;
				196
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	197	/*
				198	* This is WB_SYNC_NONE writeback, so if allocation fails just
				199	* wakeup the thread for old dirty data writeback
				200	*/
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	201	work = kmalloc(sizeof(*work), GFP_ATOMIC);
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	202	if (work) {
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	203	bdi_work_init(work, args);
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	204	bdi_queue_work(bdi, work);
				205	} else {
				206	struct bdi_writeback *wb = &bdi->wb;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	207
Jens Axboe	bcddc3f	2009-09-13 20:07:36 +0200	[diff] [blame]	208	if (wb->task)
				209	wake_up_process(wb->task);
				210	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	211	}
				212
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	213	/**
				214	* bdi_sync_writeback - start and wait for writeback
				215	* @bdi: the backing device to write from
				216	* @sb: write inodes from this super_block
				217	*
				218	* Description:
				219	* This does WB_SYNC_ALL data integrity writeback and waits for the
				220	* IO to complete. Callers must hold the sb s_umount semaphore for
				221	* reading, to avoid having the super disappear before we are done.
				222	*/
				223	static void bdi_sync_writeback(struct backing_dev_info *bdi,
				224	struct super_block *sb)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	225	{
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	226	struct wb_writeback_args args = {
				227	.sb = sb,
				228	.sync_mode = WB_SYNC_ALL,
				229	.nr_pages = LONG_MAX,
				230	.range_cyclic = 0,
				231	};
				232	struct bdi_work work;
Christoph Hellwig	f0fad8a	2009-09-11 09:47:56 +0200	[diff] [blame]	233
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	234	bdi_work_init(&work, &args);
				235	work.state \|= WS_ONSTACK;
Christoph Hellwig	f0fad8a	2009-09-11 09:47:56 +0200	[diff] [blame]	236
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	237	bdi_queue_work(bdi, &work);
				238	bdi_wait_on_work_clear(&work);
				239	}
				240
				241	/**
				242	* bdi_start_writeback - start writeback
				243	* @bdi: the backing device to write from
				244	* @nr_pages: the number of pages to write
				245	*
				246	* Description:
				247	* This does WB_SYNC_NONE opportunistic writeback. The IO is only
				248	* started when this function returns, we make no guarentees on
				249	* completion. Caller need not hold sb s_umount semaphore.
				250	*
				251	*/
				252	void bdi_start_writeback(struct backing_dev_info *bdi, long nr_pages)
				253	{
				254	struct wb_writeback_args args = {
				255	.sync_mode = WB_SYNC_NONE,
				256	.nr_pages = nr_pages,
				257	.range_cyclic = 1,
				258	};
				259
				260	bdi_alloc_queue_work(bdi, &args);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	261	}
				262
				263	/*
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	264	* Redirty an inode: set its when-it-was dirtied timestamp and move it to the
				265	* furthest end of its superblock's dirty-inode list.
				266	*
				267	* Before stamping the inode's ->dirtied_when, we check to see whether it is
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	268	* already the most-recently-dirtied inode on the b_dirty list. If that is
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	269	* the case then the inode must have been redirtied while it was being written
				270	* out and we don't reset its dirtied_when.
				271	*/
				272	static void redirty_tail(struct inode *inode)
				273	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	274	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	275
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	276	if (!list_empty(&wb->b_dirty)) {
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	277	struct inode *tail;
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	278
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	279	tail = list_entry(wb->b_dirty.next, struct inode, i_list);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	280	if (time_before(inode->dirtied_when, tail->dirtied_when))
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	281	inode->dirtied_when = jiffies;
				282	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	283	list_move(&inode->i_list, &wb->b_dirty);
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	284	}
				285
				286	/*
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	287	* requeue inode for re-scanning after bdi->b_io list is exhausted.
Andrew Morton	c986d1e	2007-10-16 23:30:34 -0700	[diff] [blame]	288	*/
Ken Chen	0e0f4fc	2007-10-16 23:30:38 -0700	[diff] [blame]	289	static void requeue_io(struct inode *inode)
Andrew Morton	c986d1e	2007-10-16 23:30:34 -0700	[diff] [blame]	290	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	291	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
				292
				293	list_move(&inode->i_list, &wb->b_more_io);
Andrew Morton	c986d1e	2007-10-16 23:30:34 -0700	[diff] [blame]	294	}
				295
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	296	static void inode_sync_complete(struct inode *inode)
				297	{
				298	/*
				299	* Prevent speculative execution through spin_unlock(&inode_lock);
				300	*/
				301	smp_mb();
				302	wake_up_bit(&inode->i_state, __I_SYNC);
				303	}
				304
Jeff Layton	d2caa3c5	2009-04-02 16:56:37 -0700	[diff] [blame]	305	static bool inode_dirtied_after(struct inode *inode, unsigned long t)
				306	{
				307	bool ret = time_after(inode->dirtied_when, t);
				308	#ifndef CONFIG_64BIT
				309	/*
				310	* For inodes being constantly redirtied, dirtied_when can get stuck.
				311	* It _appears_ to be in the future, but is actually in distant past.
				312	* This test is necessary to prevent such wrapped-around relative times
				313	* from permanently stopping the whole pdflush writeback.
				314	*/
				315	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
				316	#endif
				317	return ret;
				318	}
				319
Andrew Morton	c986d1e	2007-10-16 23:30:34 -0700	[diff] [blame]	320	/*
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	321	* Move expired dirty inodes from @delaying_queue to @dispatch_queue.
				322	*/
				323	static void move_expired_inodes(struct list_head *delaying_queue,
				324	struct list_head *dispatch_queue,
				325	unsigned long *older_than_this)
				326	{
				327	while (!list_empty(delaying_queue)) {
				328	struct inode *inode = list_entry(delaying_queue->prev,
				329	struct inode, i_list);
				330	if (older_than_this &&
Jeff Layton	d2caa3c5	2009-04-02 16:56:37 -0700	[diff] [blame]	331	inode_dirtied_after(inode, *older_than_this))
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	332	break;
				333	list_move(&inode->i_list, dispatch_queue);
				334	}
				335	}
				336
				337	/*
				338	* Queue all expired dirty inodes for io, eldest first.
				339	*/
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	340	static void queue_io(struct bdi_writeback wb, unsigned long older_than_this)
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	341	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	342	list_splice_init(&wb->b_more_io, wb->b_io.prev);
				343	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	344	}
				345
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	346	static int write_inode(struct inode *inode, int sync)
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	347	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	348	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
				349	return inode->i_sb->s_op->write_inode(inode, sync);
				350	return 0;
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	351	}
				352
				353	/*
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	354	* Wait for writeback on an inode to complete.
				355	*/
				356	static void inode_wait_for_writeback(struct inode *inode)
				357	{
				358	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
				359	wait_queue_head_t *wqh;
				360
				361	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
				362	do {
				363	spin_unlock(&inode_lock);
				364	__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
				365	spin_lock(&inode_lock);
				366	} while (inode->i_state & I_SYNC);
				367	}
				368
				369	/*
				370	* Write out an inode's dirty pages. Called under inode_lock. Either the
				371	* caller has ref on the inode (either via __iget or via syscall against an fd)
				372	* or the inode has I_WILL_FREE set (via generic_forget_inode)
				373	*
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	374	* If `wait' is set, wait on the writeout.
				375	*
				376	* The whole writeout design is quite complex and fragile. We want to avoid
				377	* starvation of particular inodes when others are being redirtied, prevent
				378	* livelocks, etc.
				379	*
				380	* Called under inode_lock.
				381	*/
				382	static int
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	383	writeback_single_inode(struct inode inode, struct writeback_control wbc)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	384	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	385	struct address_space *mapping = inode->i_mapping;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	386	int wait = wbc->sync_mode == WB_SYNC_ALL;
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	387	unsigned dirty;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	388	int ret;
				389
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	390	if (!atomic_read(&inode->i_count))
				391	WARN_ON(!(inode->i_state & (I_WILL_FREE\|I_FREEING)));
				392	else
				393	WARN_ON(inode->i_state & I_WILL_FREE);
				394
				395	if (inode->i_state & I_SYNC) {
				396	/*
				397	* If this inode is locked for writeback and we are not doing
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	398	* writeback-for-data-integrity, move it to b_more_io so that
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	399	* writeback can proceed with the other inodes on s_io.
				400	*
				401	* We'll have another go at writing back this inode when we
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	402	* completed a full scan of b_io.
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	403	*/
				404	if (!wait) {
				405	requeue_io(inode);
				406	return 0;
				407	}
				408
				409	/*
				410	* It's a data-integrity sync. We must wait.
				411	*/
				412	inode_wait_for_writeback(inode);
				413	}
				414
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	415	BUG_ON(inode->i_state & I_SYNC);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	416
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	417	/* Set I_SYNC, reset I_DIRTY */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	418	dirty = inode->i_state & I_DIRTY;
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	419	inode->i_state \|= I_SYNC;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	420	inode->i_state &= ~I_DIRTY;
				421
				422	spin_unlock(&inode_lock);
				423
				424	ret = do_writepages(mapping, wbc);
				425
				426	/* Don't write the inode if only I_DIRTY_PAGES was set */
				427	if (dirty & (I_DIRTY_SYNC \| I_DIRTY_DATASYNC)) {
				428	int err = write_inode(inode, wait);
				429	if (ret == 0)
				430	ret = err;
				431	}
				432
				433	if (wait) {
				434	int err = filemap_fdatawait(mapping);
				435	if (ret == 0)
				436	ret = err;
				437	}
				438
				439	spin_lock(&inode_lock);
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	440	inode->i_state &= ~I_SYNC;
Wu Fengguang	84a8924	2009-06-16 15:33:17 -0700	[diff] [blame]	441	if (!(inode->i_state & (I_FREEING \| I_CLEAR))) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	442	if (!(inode->i_state & I_DIRTY) &&
				443	mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
				444	/*
				445	* We didn't write back all the pages. nfs_writepages()
				446	* sometimes bales out without doing anything. Redirty
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	447	* the inode; Move it from b_io onto b_more_io/b_dirty.
Andrew Morton	1b43ef9	2007-10-16 23:30:35 -0700	[diff] [blame]	448	*/
				449	/*
				450	* akpm: if the caller was the kupdate function we put
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	451	* this inode at the head of b_dirty so it gets first
Andrew Morton	1b43ef9	2007-10-16 23:30:35 -0700	[diff] [blame]	452	* consideration. Otherwise, move it to the tail, for
				453	* the reasons described there. I'm not really sure
				454	* how much sense this makes. Presumably I had a good
				455	* reasons for doing it this way, and I'd rather not
				456	* muck with it at present.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	457	*/
				458	if (wbc->for_kupdate) {
				459	/*
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	460	* For the kupdate function we move the inode
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	461	* to b_more_io so it will get more writeout as
Fengguang Wu	2c13657	2007-10-16 23:30:39 -0700	[diff] [blame]	462	* soon as the queue becomes uncongested.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	463	*/
				464	inode->i_state \|= I_DIRTY_PAGES;
Fengguang Wu	8bc3be2	2008-02-04 22:29:36 -0800	[diff] [blame]	465	if (wbc->nr_to_write <= 0) {
				466	/*
				467	* slice used up: queue for next turn
				468	*/
				469	requeue_io(inode);
				470	} else {
				471	/*
				472	* somehow blocked: retry later
				473	*/
				474	redirty_tail(inode);
				475	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476	} else {
				477	/*
				478	* Otherwise fully redirty the inode so that
				479	* other inodes on this superblock will get some
				480	* writeout. Otherwise heavy writing to one
				481	* file would indefinitely suspend writeout of
				482	* all the other files.
				483	*/
				484	inode->i_state \|= I_DIRTY_PAGES;
Andrew Morton	1b43ef9	2007-10-16 23:30:35 -0700	[diff] [blame]	485	redirty_tail(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	486	}
				487	} else if (inode->i_state & I_DIRTY) {
				488	/*
				489	* Someone redirtied the inode while were writing back
				490	* the pages.
				491	*/
Andrew Morton	6610a0b	2007-10-16 23:30:32 -0700	[diff] [blame]	492	redirty_tail(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	493	} else if (atomic_read(&inode->i_count)) {
				494	/*
				495	* The inode is clean, inuse
				496	*/
				497	list_move(&inode->i_list, &inode_in_use);
				498	} else {
				499	/*
				500	* The inode is clean, unused
				501	*/
				502	list_move(&inode->i_list, &inode_unused);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	503	}
				504	}
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	505	inode_sync_complete(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	506	return ret;
				507	}
				508
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	509	/*
				510	* For WB_SYNC_NONE writeback, the caller does not have the sb pinned
				511	* before calling writeback. So make sure that we do pin it, so it doesn't
				512	* go away while we are writing inodes from it.
				513	*
				514	* Returns 0 if the super was successfully pinned (or pinning wasn't needed),
				515	* 1 if we failed.
				516	*/
				517	static int pin_sb_for_writeback(struct writeback_control *wbc,
				518	struct inode *inode)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	520	struct super_block *sb = inode->i_sb;
				521
				522	/*
				523	* Caller must already hold the ref for this
				524	*/
				525	if (wbc->sync_mode == WB_SYNC_ALL) {
				526	WARN_ON(!rwsem_is_locked(&sb->s_umount));
				527	return 0;
				528	}
				529
				530	spin_lock(&sb_lock);
				531	sb->s_count++;
				532	if (down_read_trylock(&sb->s_umount)) {
				533	if (sb->s_root) {
				534	spin_unlock(&sb_lock);
				535	return 0;
				536	}
				537	/*
				538	* umounted, drop rwsem again and fall through to failure
				539	*/
				540	up_read(&sb->s_umount);
				541	}
				542
				543	sb->s_count--;
				544	spin_unlock(&sb_lock);
				545	return 1;
				546	}
				547
				548	static void unpin_sb_for_writeback(struct writeback_control *wbc,
				549	struct inode *inode)
				550	{
				551	struct super_block *sb = inode->i_sb;
				552
				553	if (wbc->sync_mode == WB_SYNC_ALL)
				554	return;
				555
				556	up_read(&sb->s_umount);
				557	put_super(sb);
				558	}
				559
				560	static void writeback_inodes_wb(struct bdi_writeback *wb,
				561	struct writeback_control *wbc)
				562	{
				563	struct super_block *sb = wbc->sb;
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	564	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	565	const unsigned long start = jiffies; /* livelock avoidance */
				566
Hans Reiser	ae8547b	2008-05-07 15:48:57 +0300	[diff] [blame]	567	spin_lock(&inode_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	568
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	569	if (!wbc->for_kupdate \|\| list_empty(&wb->b_io))
				570	queue_io(wb, wbc->older_than_this);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	571
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	572	while (!list_empty(&wb->b_io)) {
				573	struct inode *inode = list_entry(wb->b_io.prev,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	574	struct inode, i_list);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	575	long pages_skipped;
				576
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	577	/*
				578	* super block given and doesn't match, skip this inode
				579	*/
				580	if (sb && sb != inode->i_sb) {
				581	redirty_tail(inode);
				582	continue;
				583	}
				584
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	585	if (!bdi_cap_writeback_dirty(wb->bdi)) {
Andrew Morton	9852a0e7	2007-10-16 23:30:33 -0700	[diff] [blame]	586	redirty_tail(inode);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	587	if (is_blkdev_sb) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	588	/*
				589	* Dirty memory-backed blockdev: the ramdisk
				590	* driver does this. Skip just this inode
				591	*/
				592	continue;
				593	}
				594	/*
				595	* Dirty memory-backed inode against a filesystem other
				596	* than the kernel-internal bdev filesystem. Skip the
				597	* entire superblock.
				598	*/
				599	break;
				600	}
				601
Wu Fengguang	84a8924	2009-06-16 15:33:17 -0700	[diff] [blame]	602	if (inode->i_state & (I_NEW \| I_WILL_FREE)) {
Nick Piggin	7ef0d73	2009-03-12 14:31:38 -0700	[diff] [blame]	603	requeue_io(inode);
				604	continue;
				605	}
				606
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	607	if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	608	wbc->encountered_congestion = 1;
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	609	if (!is_blkdev_sb)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	610	break; /* Skip a congested fs */
Ken Chen	0e0f4fc	2007-10-16 23:30:38 -0700	[diff] [blame]	611	requeue_io(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	612	continue; /* Skip a congested blockdev */
				613	}
				614
Jeff Layton	d2caa3c5	2009-04-02 16:56:37 -0700	[diff] [blame]	615	/*
				616	* Was this inode dirtied after sync_sb_inodes was called?
				617	* This keeps sync from extra jobs and livelock.
				618	*/
				619	if (inode_dirtied_after(inode, start))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	620	break;
				621
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	622	if (pin_sb_for_writeback(wbc, inode)) {
				623	requeue_io(inode);
				624	continue;
				625	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	626
Wu Fengguang	84a8924	2009-06-16 15:33:17 -0700	[diff] [blame]	627	BUG_ON(inode->i_state & (I_FREEING \| I_CLEAR));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	628	__iget(inode);
				629	pages_skipped = wbc->pages_skipped;
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	630	writeback_single_inode(inode, wbc);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	631	unpin_sb_for_writeback(wbc, inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	632	if (wbc->pages_skipped != pages_skipped) {
				633	/*
				634	* writeback is not making progress due to locked
				635	* buffers. Skip this inode for now.
				636	*/
Andrew Morton	f57b9b7	2007-10-16 23:30:34 -0700	[diff] [blame]	637	redirty_tail(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	638	}
				639	spin_unlock(&inode_lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	640	iput(inode);
OGAWA Hirofumi	4ffc844	2006-03-25 03:07:44 -0800	[diff] [blame]	641	cond_resched();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	642	spin_lock(&inode_lock);
Fengguang Wu	8bc3be2	2008-02-04 22:29:36 -0800	[diff] [blame]	643	if (wbc->nr_to_write <= 0) {
				644	wbc->more_io = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	645	break;
Fengguang Wu	8bc3be2	2008-02-04 22:29:36 -0800	[diff] [blame]	646	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	647	if (!list_empty(&wb->b_more_io))
Fengguang Wu	8bc3be2	2008-02-04 22:29:36 -0800	[diff] [blame]	648	wbc->more_io = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	649	}
Nick Piggin	38f2197	2009-01-06 14:40:25 -0800	[diff] [blame]	650
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	651	spin_unlock(&inode_lock);
				652	/* Leave any unwritten inodes on b_io */
				653	}
				654
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	655	void writeback_inodes_wbc(struct writeback_control *wbc)
				656	{
				657	struct backing_dev_info *bdi = wbc->bdi;
				658
				659	writeback_inodes_wb(&bdi->wb, wbc);
				660	}
				661
				662	/*
				663	* The maximum number of pages to writeout in a single bdi flush/kupdate
				664	* operation. We do this so we don't hold I_SYNC against an inode for
				665	* enormous amounts of time, which would block a userspace task which has
				666	* been forced to throttle against that inode. Also, the code reevaluates
				667	* the dirty state each time it has written this many pages.
				668	*/
				669	#define MAX_WRITEBACK_PAGES 1024
				670
				671	static inline bool over_bground_thresh(void)
				672	{
				673	unsigned long background_thresh, dirty_thresh;
				674
				675	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);
				676
				677	return (global_page_state(NR_FILE_DIRTY) +
				678	global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
				679	}
				680
				681	/*
				682	* Explicit flushing or periodic writeback of "old" data.
				683	*
				684	* Define "old": the first time one of an inode's pages is dirtied, we mark the
				685	* dirtying-time in the inode's address_space. So this periodic writeback code
				686	* just walks the superblock inode list, writing back any inodes which are
				687	* older than a specific point in time.
				688	*
				689	* Try to run once per dirty_writeback_interval. But if a writeback event
				690	* takes longer than a dirty_writeback_interval interval, then leave a
				691	* one-second gap.
				692	*
				693	* older_than_this takes precedence over nr_to_write. So we'll only write back
				694	* all dirty pages if they are all attached to "old" mappings.
				695	*/
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	696	static long wb_writeback(struct bdi_writeback *wb,
				697	struct wb_writeback_args *args)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	698	{
				699	struct writeback_control wbc = {
				700	.bdi = wb->bdi,
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	701	.sb = args->sb,
				702	.sync_mode = args->sync_mode,
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	703	.older_than_this = NULL,
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	704	.for_kupdate = args->for_kupdate,
				705	.range_cyclic = args->range_cyclic,
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	706	};
				707	unsigned long oldest_jif;
				708	long wrote = 0;
				709
				710	if (wbc.for_kupdate) {
				711	wbc.older_than_this = &oldest_jif;
				712	oldest_jif = jiffies -
				713	msecs_to_jiffies(dirty_expire_interval * 10);
				714	}
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	715	if (!wbc.range_cyclic) {
				716	wbc.range_start = 0;
				717	wbc.range_end = LLONG_MAX;
				718	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	719
				720	for (;;) {
				721	/*
				722	* Don't flush anything for non-integrity writeback where
				723	* no nr_pages was given
				724	*/
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	725	if (!args->for_kupdate && args->nr_pages <= 0 &&
				726	args->sync_mode == WB_SYNC_NONE)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	727	break;
				728
				729	/*
				730	* If no specific pages were given and this is just a
				731	* periodic background writeout and we are below the
				732	* background dirty threshold, don't do anything
				733	*/
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	734	if (args->for_kupdate && args->nr_pages <= 0 &&
				735	!over_bground_thresh())
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	736	break;
				737
				738	wbc.more_io = 0;
				739	wbc.encountered_congestion = 0;
				740	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
				741	wbc.pages_skipped = 0;
				742	writeback_inodes_wb(wb, &wbc);
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	743	args->nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	744	wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;
				745
				746	/*
				747	* If we ran out of stuff to write, bail unless more_io got set
				748	*/
				749	if (wbc.nr_to_write > 0 \|\| wbc.pages_skipped > 0) {
				750	if (wbc.more_io && !wbc.for_kupdate)
				751	continue;
				752	break;
				753	}
				754	}
				755
				756	return wrote;
				757	}
				758
				759	/*
				760	* Return the next bdi_work struct that hasn't been processed by this
Jens Axboe	8010c3b	2009-09-15 20:04:57 +0200	[diff] [blame^]	761	* wb thread yet. ->seen is initially set for each thread that exists
				762	* for this device, when a thread first notices a piece of work it
				763	* clears its bit. Depending on writeback type, the thread will notify
				764	* completion on either receiving the work (WB_SYNC_NONE) or after
				765	* it is done (WB_SYNC_ALL).
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	766	*/
				767	static struct bdi_work get_next_work_item(struct backing_dev_info bdi,
				768	struct bdi_writeback *wb)
				769	{
				770	struct bdi_work work, ret = NULL;
				771
				772	rcu_read_lock();
				773
				774	list_for_each_entry_rcu(work, &bdi->work_list, list) {
				775	if (!test_and_clear_bit(wb->nr, &work->seen))
				776	continue;
				777
				778	ret = work;
				779	break;
				780	}
				781
				782	rcu_read_unlock();
				783	return ret;
				784	}
				785
				786	static long wb_check_old_data_flush(struct bdi_writeback *wb)
				787	{
				788	unsigned long expired;
				789	long nr_pages;
				790
				791	expired = wb->last_old_flush +
				792	msecs_to_jiffies(dirty_writeback_interval * 10);
				793	if (time_before(jiffies, expired))
				794	return 0;
				795
				796	wb->last_old_flush = jiffies;
				797	nr_pages = global_page_state(NR_FILE_DIRTY) +
				798	global_page_state(NR_UNSTABLE_NFS) +
				799	(inodes_stat.nr_inodes - inodes_stat.nr_unused);
				800
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	801	if (nr_pages) {
				802	struct wb_writeback_args args = {
				803	.nr_pages = nr_pages,
				804	.sync_mode = WB_SYNC_NONE,
				805	.for_kupdate = 1,
				806	.range_cyclic = 1,
				807	};
				808
				809	return wb_writeback(wb, &args);
				810	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	811
				812	return 0;
				813	}
				814
				815	/*
				816	* Retrieve work items and do the writeback they describe
				817	*/
				818	long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
				819	{
				820	struct backing_dev_info *bdi = wb->bdi;
				821	struct bdi_work *work;
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	822	long wrote = 0;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	823
				824	while ((work = get_next_work_item(bdi, wb)) != NULL) {
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	825	struct wb_writeback_args args = work->args;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	826
				827	/*
				828	* Override sync mode, in case we must wait for completion
				829	*/
				830	if (force_wait)
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	831	work->args.sync_mode = args.sync_mode = WB_SYNC_ALL;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	832
				833	/*
				834	* If this isn't a data integrity operation, just notify
				835	* that we have seen this work and we are now starting it.
				836	*/
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	837	if (args.sync_mode == WB_SYNC_NONE)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	838	wb_clear_pending(wb, work);
				839
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	840	wrote += wb_writeback(wb, &args);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	841
				842	/*
				843	* This is a data integrity writeback, so only do the
				844	* notification when we have completed the work.
				845	*/
Jens Axboe	c4a77a6	2009-09-16 15:18:25 +0200	[diff] [blame]	846	if (args.sync_mode == WB_SYNC_ALL)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	847	wb_clear_pending(wb, work);
				848	}
				849
				850	/*
				851	* Check for periodic writeback, kupdated() style
				852	*/
				853	wrote += wb_check_old_data_flush(wb);
				854
				855	return wrote;
				856	}
				857
				858	/*
				859	* Handle writeback of dirty data for the device backed by this bdi. Also
				860	* wakes up periodically and does kupdated style flushing.
				861	*/
				862	int bdi_writeback_task(struct bdi_writeback *wb)
				863	{
				864	unsigned long last_active = jiffies;
				865	unsigned long wait_jiffies = -1UL;
				866	long pages_written;
				867
				868	while (!kthread_should_stop()) {
				869	pages_written = wb_do_writeback(wb, 0);
				870
				871	if (pages_written)
				872	last_active = jiffies;
				873	else if (wait_jiffies != -1UL) {
				874	unsigned long max_idle;
				875
				876	/*
				877	* Longest period of inactivity that we tolerate. If we
				878	* see dirty data again later, the task will get
				879	* recreated automatically.
				880	*/
				881	max_idle = max(5UL * 60 * HZ, wait_jiffies);
				882	if (time_after(jiffies, max_idle + last_active))
				883	break;
				884	}
				885
				886	wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
				887	set_current_state(TASK_INTERRUPTIBLE);
				888	schedule_timeout(wait_jiffies);
				889	try_to_freeze();
				890	}
				891
				892	return 0;
				893	}
				894
				895	/*
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	896	* Schedule writeback for all backing devices. This does WB_SYNC_NONE
				897	* writeback, for integrity writeback see bdi_sync_writeback().
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	898	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	899	static void bdi_writeback_all(struct super_block *sb, long nr_pages)
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	900	{
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	901	struct wb_writeback_args args = {
				902	.sb = sb,
				903	.nr_pages = nr_pages,
				904	.sync_mode = WB_SYNC_NONE,
				905	};
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	906	struct backing_dev_info *bdi;
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	907
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	908	rcu_read_lock();
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	909
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	910	list_for_each_entry_rcu(bdi, &bdi_list, bdi_list) {
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	911	if (!bdi_has_dirty_io(bdi))
				912	continue;
				913
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	914	bdi_alloc_queue_work(bdi, &args);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	915	}
				916
Jens Axboe	cfc4ba5	2009-09-14 13:12:40 +0200	[diff] [blame]	917	rcu_read_unlock();
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	918	}
				919
				920	/*
				921	* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
				922	* the whole world.
				923	*/
				924	void wakeup_flusher_threads(long nr_pages)
				925	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	926	if (nr_pages == 0)
				927	nr_pages = global_page_state(NR_FILE_DIRTY) +
				928	global_page_state(NR_UNSTABLE_NFS);
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	929	bdi_writeback_all(NULL, nr_pages);
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	930	}
				931
				932	static noinline void block_dump___mark_inode_dirty(struct inode *inode)
				933	{
				934	if (inode->i_ino \|\| strcmp(inode->i_sb->s_id, "bdev")) {
				935	struct dentry *dentry;
				936	const char *name = "?";
				937
				938	dentry = d_find_alias(inode);
				939	if (dentry) {
				940	spin_lock(&dentry->d_lock);
				941	name = (const char *) dentry->d_name.name;
				942	}
				943	printk(KERN_DEBUG
				944	"%s(%d): dirtied inode %lu (%s) on %s\n",
				945	current->comm, task_pid_nr(current), inode->i_ino,
				946	name, inode->i_sb->s_id);
				947	if (dentry) {
				948	spin_unlock(&dentry->d_lock);
				949	dput(dentry);
				950	}
				951	}
				952	}
				953
				954	/**
				955	* __mark_inode_dirty - internal function
				956	* @inode: inode to mark
				957	* @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
				958	* Mark an inode as dirty. Callers should use mark_inode_dirty or
				959	* mark_inode_dirty_sync.
				960	*
				961	* Put the inode on the super block's dirty list.
				962	*
				963	* CAREFUL! We mark it dirty unconditionally, but move it onto the
				964	* dirty list only if it is hashed or if it refers to a blockdev.
				965	* If it was not hashed, it will never be added to the dirty list
				966	* even if it is later hashed, as it will have been marked dirty already.
				967	*
				968	* In short, make sure you hash any inodes _before_ you start marking
				969	* them dirty.
				970	*
				971	* This function must be atomic for the I_DIRTY_PAGES case -
				972	* set_page_dirty() is called under spinlock in several places.
				973	*
				974	* Note that for blockdevs, inode->dirtied_when represents the dirtying time of
				975	* the block-special inode (/dev/hda1) itself. And the ->dirtied_when field of
				976	* the kernel-internal blockdev inode represents the dirtying time of the
				977	* blockdev's pages. This is why for I_DIRTY_PAGES we always use
				978	* page->mapping->host, so the page-dirtying time is recorded in the internal
				979	* blockdev inode.
				980	*/
				981	void __mark_inode_dirty(struct inode *inode, int flags)
				982	{
				983	struct super_block *sb = inode->i_sb;
				984
				985	/*
				986	* Don't do this for I_DIRTY_PAGES - that doesn't actually
				987	* dirty the inode itself
				988	*/
				989	if (flags & (I_DIRTY_SYNC \| I_DIRTY_DATASYNC)) {
				990	if (sb->s_op->dirty_inode)
				991	sb->s_op->dirty_inode(inode);
				992	}
				993
				994	/*
				995	* make sure that changes are seen by all cpus before we test i_state
				996	* -- mikulas
				997	*/
				998	smp_mb();
				999
				1000	/* avoid the locking if we can */
				1001	if ((inode->i_state & flags) == flags)
				1002	return;
				1003
				1004	if (unlikely(block_dump))
				1005	block_dump___mark_inode_dirty(inode);
				1006
				1007	spin_lock(&inode_lock);
				1008	if ((inode->i_state & flags) != flags) {
				1009	const int was_dirty = inode->i_state & I_DIRTY;
				1010
				1011	inode->i_state \|= flags;
				1012
				1013	/*
				1014	* If the inode is being synced, just update its dirty state.
				1015	* The unlocker will place the inode on the appropriate
				1016	* superblock list, based upon its state.
				1017	*/
				1018	if (inode->i_state & I_SYNC)
				1019	goto out;
				1020
				1021	/*
				1022	* Only add valid (hashed) inodes to the superblock's
				1023	* dirty list. Add blockdev inodes as well.
				1024	*/
				1025	if (!S_ISBLK(inode->i_mode)) {
				1026	if (hlist_unhashed(&inode->i_hash))
				1027	goto out;
				1028	}
				1029	if (inode->i_state & (I_FREEING\|I_CLEAR))
				1030	goto out;
				1031
				1032	/*
				1033	* If the inode was already on b_dirty/b_io/b_more_io, don't
				1034	* reposition it (that would break b_dirty time-ordering).
				1035	*/
				1036	if (!was_dirty) {
				1037	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;
Jens Axboe	500b067	2009-09-09 09:10:25 +0200	[diff] [blame]	1038	struct backing_dev_info *bdi = wb->bdi;
				1039
				1040	if (bdi_cap_writeback_dirty(bdi) &&
				1041	!test_bit(BDI_registered, &bdi->state)) {
				1042	WARN_ON(1);
				1043	printk(KERN_ERR "bdi-%s not registered\n",
				1044	bdi->name);
				1045	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1046
				1047	inode->dirtied_when = jiffies;
				1048	list_move(&inode->i_list, &wb->b_dirty);
				1049	}
				1050	}
				1051	out:
				1052	spin_unlock(&inode_lock);
				1053	}
				1054	EXPORT_SYMBOL(__mark_inode_dirty);
				1055
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1056	/*
				1057	* Write out a superblock's list of dirty inodes. A wait will be performed
				1058	* upon no inodes, all inodes or the final one, depending upon sync_mode.
				1059	*
				1060	* If older_than_this is non-NULL, then only write out inodes which
				1061	* had their first dirtying at a time earlier than *older_than_this.
				1062	*
				1063	* If we're a pdlfush thread, then implement pdflush collision avoidance
				1064	* against the entire list.
				1065	*
				1066	* If `bdi' is non-zero then we're being asked to writeback a specific queue.
				1067	* This function assumes that the blockdev superblock's inodes are backed by
				1068	* a variety of queues, so all inodes are searched. For other superblocks,
				1069	* assume that all inodes are backed by the same queue.
				1070	*
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1071	* The inodes to be written are parked on bdi->b_io. They are moved back onto
				1072	* bdi->b_dirty as they are selected for writing. This way, none can be missed
				1073	* on the writer throttling path, and we get decent balancing between many
				1074	* throttled threads: we don't want them all piling up on inode_sync_wait.
				1075	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1076	static void wait_sb_inodes(struct super_block *sb)
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1077	{
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1078	struct inode inode, old_inode = NULL;
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1079
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1080	/*
				1081	* We need to be protected against the filesystem going from
				1082	* r/o to r/w or vice versa.
				1083	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1084	WARN_ON(!rwsem_is_locked(&sb->s_umount));
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1085
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1086	spin_lock(&inode_lock);
				1087
				1088	/*
				1089	* Data integrity sync. Must wait for all pages under writeback,
				1090	* because there may have been pages dirtied before our sync
				1091	* call, but which had writeout started before we write it out.
				1092	* In which case, the inode may not be on the dirty list, but
				1093	* we still have to wait for that writeout.
				1094	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1095	list_for_each_entry(inode, &sb->s_inodes, i_sb_list) {
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1096	struct address_space *mapping;
				1097
				1098	if (inode->i_state & (I_FREEING\|I_CLEAR\|I_WILL_FREE\|I_NEW))
				1099	continue;
				1100	mapping = inode->i_mapping;
				1101	if (mapping->nrpages == 0)
				1102	continue;
				1103	__iget(inode);
				1104	spin_unlock(&inode_lock);
				1105	/*
				1106	* We hold a reference to 'inode' so it couldn't have
				1107	* been removed from s_inodes list while we dropped the
				1108	* inode_lock. We cannot iput the inode now as we can
				1109	* be holding the last reference and we cannot iput it
				1110	* under inode_lock. So we keep the reference and iput
				1111	* it later.
				1112	*/
				1113	iput(old_inode);
				1114	old_inode = inode;
				1115
				1116	filemap_fdatawait(mapping);
				1117
				1118	cond_resched();
Nick Piggin	38f2197	2009-01-06 14:40:25 -0800	[diff] [blame]	1119
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1120	spin_lock(&inode_lock);
Jens Axboe	66f3b8e	2009-09-02 09:19:46 +0200	[diff] [blame]	1121	}
Jens Axboe	03ba378	2009-09-09 09:08:54 +0200	[diff] [blame]	1122	spin_unlock(&inode_lock);
				1123	iput(old_inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1124	}
				1125
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1126	/**
				1127	* writeback_inodes_sb - writeback dirty inodes from given super_block
				1128	* @sb: the superblock
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1129	*
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1130	* Start writeback on some inodes on this super_block. No guarantees are made
				1131	* on how many (if any) will be written, and this function does not wait
				1132	* for IO completion of submitted IO. The number of pages submitted is
				1133	* returned.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1134	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1135	void writeback_inodes_sb(struct super_block *sb)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1136	{
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1137	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
				1138	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
				1139	long nr_to_write;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1140
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1141	nr_to_write = nr_dirty + nr_unstable +
Nick Piggin	38f2197	2009-01-06 14:40:25 -0800	[diff] [blame]	1142	(inodes_stat.nr_inodes - inodes_stat.nr_unused);
Nick Piggin	38f2197	2009-01-06 14:40:25 -0800	[diff] [blame]	1143
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1144	bdi_writeback_all(sb, nr_to_write);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1145	}
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1146	EXPORT_SYMBOL(writeback_inodes_sb);
				1147
				1148	/**
				1149	* sync_inodes_sb - sync sb inode pages
				1150	* @sb: the superblock
				1151	*
				1152	* This function writes and waits on any dirty inode belonging to this
				1153	* super_block. The number of pages synced is returned.
				1154	*/
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1155	void sync_inodes_sb(struct super_block *sb)
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1156	{
Jens Axboe	b6e5131	2009-09-16 15:13:54 +0200	[diff] [blame]	1157	bdi_sync_writeback(sb->s_bdi, sb);
				1158	wait_sb_inodes(sb);
Jens Axboe	d8a8559	2009-09-02 12:34:32 +0200	[diff] [blame]	1159	}
				1160	EXPORT_SYMBOL(sync_inodes_sb);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1161
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1162	/**
Andrea Arcangeli	7f04c26	2005-10-30 15:03:05 -0800	[diff] [blame]	1163	* write_inode_now - write an inode to disk
				1164	* @inode: inode to write to disk
				1165	* @sync: whether the write should be synchronous or not
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1166	*
Andrea Arcangeli	7f04c26	2005-10-30 15:03:05 -0800	[diff] [blame]	1167	* This function commits an inode to disk immediately if it is dirty. This is
				1168	* primarily needed by knfsd.
				1169	*
				1170	* The caller must either have a ref on the inode or must have set I_WILL_FREE.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1171	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1172	int write_inode_now(struct inode *inode, int sync)
				1173	{
				1174	int ret;
				1175	struct writeback_control wbc = {
				1176	.nr_to_write = LONG_MAX,
Mike Galbraith	18914b1	2008-02-08 04:20:23 -0800	[diff] [blame]	1177	.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
OGAWA Hirofumi	111ebb6	2006-06-23 02:03:26 -0700	[diff] [blame]	1178	.range_start = 0,
				1179	.range_end = LLONG_MAX,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1180	};
				1181
				1182	if (!mapping_cap_writeback_dirty(inode->i_mapping))
Andrew Morton	49364ce	2005-11-07 00:59:15 -0800	[diff] [blame]	1183	wbc.nr_to_write = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1184
				1185	might_sleep();
				1186	spin_lock(&inode_lock);
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	1187	ret = writeback_single_inode(inode, &wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1188	spin_unlock(&inode_lock);
				1189	if (sync)
Joern Engel	1c0eeaf	2007-10-16 23:30:44 -0700	[diff] [blame]	1190	inode_sync_wait(inode);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1191	return ret;
				1192	}
				1193	EXPORT_SYMBOL(write_inode_now);
				1194
				1195	/**
				1196	* sync_inode - write an inode and its pages to disk.
				1197	* @inode: the inode to sync
				1198	* @wbc: controls the writeback mode
				1199	*
				1200	* sync_inode() will write an inode and its pages to disk. It will also
				1201	* correctly update the inode on its superblock's dirty inode lists and will
				1202	* update inode->i_state.
				1203	*
				1204	* The caller must have a ref on the inode.
				1205	*/
				1206	int sync_inode(struct inode inode, struct writeback_control wbc)
				1207	{
				1208	int ret;
				1209
				1210	spin_lock(&inode_lock);
Christoph Hellwig	01c0319	2009-06-08 13:35:40 +0200	[diff] [blame]	1211	ret = writeback_single_inode(inode, wbc);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1212	spin_unlock(&inode_lock);
				1213	return ret;
				1214	}
				1215	EXPORT_SYMBOL(sync_inode);