/*
 * fs/fs-writeback.c
 *
 * Copyright (C) 2002, Linus Torvalds.
 *
 * Contains all the functions related to writing back and waiting
 * upon dirty inodes against superblocks, and writing back dirty
 * pages against inodes.  ie: data writeback.  Writeout of the
 * inode itself is not handled here.
 *
 * 10Apr2002	Andrew Morton
 *		Split out of fs/inode.c
 *		Additions for address_space-based writeback
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/spinlock.h>
#include <linux/sched.h>
#include <linux/fs.h>
#include <linux/mm.h>
#include <linux/kthread.h>
#include <linux/freezer.h>
#include <linux/writeback.h>
#include <linux/blkdev.h>
#include <linux/backing-dev.h>
#include <linux/buffer_head.h>
#include "internal.h"

#define inode_to_bdi(inode)	((inode)->i_mapping->backing_dev_info)

/*
 * Work items for the bdi_writeback threads
 */
struct bdi_work {
	struct list_head list;
	struct list_head wait_list;
	struct rcu_head rcu_head;

	unsigned long seen;
	atomic_t pending;

	struct super_block *sb;
	unsigned long nr_pages;
	enum writeback_sync_modes sync_mode;

	unsigned long state;
};

enum {
	WS_USED_B = 0,
	WS_ONSTACK_B,
};

#define WS_USED (1 << WS_USED_B)
#define WS_ONSTACK (1 << WS_ONSTACK_B)

static inline bool bdi_work_on_stack(struct bdi_work *work)
{
	return test_bit(WS_ONSTACK_B, &work->state);
}

static inline void bdi_work_init(struct bdi_work *work,
				 struct writeback_control *wbc)
{
	INIT_RCU_HEAD(&work->rcu_head);
	work->sb = wbc->sb;
	work->nr_pages = wbc->nr_to_write;
	work->sync_mode = wbc->sync_mode;
	work->state = WS_USED;
}

static inline void bdi_work_init_on_stack(struct bdi_work *work,
					  struct writeback_control *wbc)
{
	bdi_work_init(work, wbc);
	work->state |= WS_ONSTACK;
}

/**
 * writeback_in_progress - determine whether there is writeback in progress
 * @bdi: the device's backing_dev_info structure.
 *
 * Determine whether there is writeback waiting to be handled against a
 * backing device.
 */
int writeback_in_progress(struct backing_dev_info *bdi)
{
	return !list_empty(&bdi->work_list);
}

static void bdi_work_clear(struct bdi_work *work)
{
	clear_bit(WS_USED_B, &work->state);
	smp_mb__after_clear_bit();
	wake_up_bit(&work->state, WS_USED_B);
}

static void bdi_work_free(struct rcu_head *head)
{
	struct bdi_work *work = container_of(head, struct bdi_work, rcu_head);

	if (!bdi_work_on_stack(work))
		kfree(work);
	else
		bdi_work_clear(work);
}

static void wb_work_complete(struct bdi_work *work)
{
	const enum writeback_sync_modes sync_mode = work->sync_mode;

	/*
	 * For allocated work, we can clear the done/seen bit right here.
	 * For on-stack work, we need to postpone both the clear and free
	 * to after the RCU grace period, since the stack could be invalidated
	 * as soon as bdi_work_clear() has done the wakeup.
	 */
	if (!bdi_work_on_stack(work))
		bdi_work_clear(work);
	if (sync_mode == WB_SYNC_NONE || bdi_work_on_stack(work))
		call_rcu(&work->rcu_head, bdi_work_free);
}

static void wb_clear_pending(struct bdi_writeback *wb, struct bdi_work *work)
{
	/*
	 * The caller has retrieved the work arguments from this work,
	 * drop our reference. If this is the last ref, delete and free it
	 */
	if (atomic_dec_and_test(&work->pending)) {
		struct backing_dev_info *bdi = wb->bdi;

		spin_lock(&bdi->wb_lock);
		list_del_rcu(&work->list);
		spin_unlock(&bdi->wb_lock);

		wb_work_complete(work);
	}
}

static void bdi_queue_work(struct backing_dev_info *bdi, struct bdi_work *work)
{
	if (work) {
		work->seen = bdi->wb_mask;
		BUG_ON(!work->seen);
		atomic_set(&work->pending, bdi->wb_cnt);
		BUG_ON(!bdi->wb_cnt);

		/*
		 * Make sure stores are seen before it appears on the list
		 */
		smp_mb();

		spin_lock(&bdi->wb_lock);
		list_add_tail_rcu(&work->list, &bdi->work_list);
		spin_unlock(&bdi->wb_lock);
	}

	/*
	 * If the default thread isn't there, make sure we add it. When
	 * it gets created and wakes up, we'll run this work.
	 */
	if (unlikely(list_empty_careful(&bdi->wb_list)))
		wake_up_process(default_backing_dev_info.wb.task);
	else {
		struct bdi_writeback *wb = &bdi->wb;

		/*
		 * If we failed allocating the bdi work item, wake up the wb
		 * thread always. As a safety precaution, it'll flush out
		 * everything
		 */
		if (!wb_has_dirty_io(wb)) {
			if (work)
				wb_clear_pending(wb, work);
		} else if (wb->task)
			wake_up_process(wb->task);
	}
}

/*
 * Used for on-stack allocated work items. The caller needs to wait until
 * the wb threads have acked the work before it's safe to continue.
 */
static void bdi_wait_on_work_clear(struct bdi_work *work)
{
	wait_on_bit(&work->state, WS_USED_B, bdi_sched_wait,
		    TASK_UNINTERRUPTIBLE);
}

static struct bdi_work *bdi_alloc_work(struct writeback_control *wbc)
{
	struct bdi_work *work;

	work = kmalloc(sizeof(*work), GFP_ATOMIC);
	if (work)
		bdi_work_init(work, wbc);

	return work;
}

void bdi_start_writeback(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct bdi_work work_stack, *work = NULL;

	if (!must_wait)
		work = bdi_alloc_work(wbc);

	if (!work) {
		work = &work_stack;
		bdi_work_init_on_stack(work, wbc);
	}

	bdi_queue_work(wbc->bdi, work);

	/*
	 * If the sync mode is WB_SYNC_ALL, block waiting for the work to
	 * complete. If not, we only need to wait for the work to be started,
	 * if we allocated it on-stack. We use the same mechanism, if the
	 * wait bit is set in the bdi_work struct, then threads will not
	 * clear pending until after they are done.
	 *
	 * Note that work == &work_stack if must_wait is true, so we don't
	 * need to do call_rcu() here ever, since the completion path will
	 * have done that for us.
	 */
	if (must_wait || work == &work_stack) {
		bdi_wait_on_work_clear(work);
		if (work != &work_stack)
			call_rcu(&work->rcu_head, bdi_work_free);
	}
}
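
/*
 * Illustrative sketch only (the caller and wbc values below are
 * hypothetical, not copied from an in-tree user): a caller kicks off
 * writeback for a bdi by filling in the writeback_control fields that
 * bdi_work_init() copies (sb, nr_to_write, sync_mode) plus the target
 * bdi, then calling bdi_start_writeback().  Per the comment above,
 * WB_SYNC_NONE returns once the work item is queued (or, for an
 * on-stack item, once a flusher thread has started it), while
 * WB_SYNC_ALL blocks until the work has completed.
 *
 *	struct writeback_control wbc = {
 *		.bdi		= bdi,			// some backing_dev_info
 *		.sb		= NULL,			// no specific superblock
 *		.sync_mode	= WB_SYNC_NONE,
 *		.nr_to_write	= 1024,			// becomes work->nr_pages
 *	};
 *
 *	bdi_start_writeback(&wbc);
 */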

/*
 * Redirty an inode: set its when-it-was dirtied timestamp and move it to the
 * furthest end of its superblock's dirty-inode list.
 *
 * Before stamping the inode's ->dirtied_when, we check to see whether it is
 * already the most-recently-dirtied inode on the b_dirty list.  If that is
 * the case then the inode must have been redirtied while it was being written
 * out and we don't reset its dirtied_when.
 */
static void redirty_tail(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	if (!list_empty(&wb->b_dirty)) {
		struct inode *tail;

		tail = list_entry(wb->b_dirty.next, struct inode, i_list);
		if (time_before(inode->dirtied_when, tail->dirtied_when))
			inode->dirtied_when = jiffies;
	}
	list_move(&inode->i_list, &wb->b_dirty);
}

/*
 * requeue inode for re-scanning after bdi->b_io list is exhausted.
 */
static void requeue_io(struct inode *inode)
{
	struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

	list_move(&inode->i_list, &wb->b_more_io);
}

static void inode_sync_complete(struct inode *inode)
{
	/*
	 * Prevent speculative execution through spin_unlock(&inode_lock);
	 */
	smp_mb();
	wake_up_bit(&inode->i_state, __I_SYNC);
}

static bool inode_dirtied_after(struct inode *inode, unsigned long t)
{
	bool ret = time_after(inode->dirtied_when, t);
#ifndef CONFIG_64BIT
	/*
	 * For inodes being constantly redirtied, dirtied_when can get stuck.
	 * It _appears_ to be in the future, but is actually in distant past.
	 * This test is necessary to prevent such wrapped-around relative times
	 * from permanently stopping the whole pdflush writeback.
	 */
	ret = ret && time_before_eq(inode->dirtied_when, jiffies);
#endif
	return ret;
}
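
/*
 * Worked example of the wrap-around above (figures are approximate and
 * assume a 32-bit unsigned long with HZ=1000): jiffies wraps every
 * 2^32 ticks, i.e. roughly every 49.7 days, and time_after() can only
 * tell "before" from "after" within half that window (~24.8 days).  So
 * an inode whose dirtied_when is stuck more than ~25 days in the past
 * starts to look as if it were dirtied in the future, and without the
 * time_before_eq(dirtied_when, jiffies) check it would never be
 * considered expired again.
 */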

/*
 * Move expired dirty inodes from @delaying_queue to @dispatch_queue.
 */
static void move_expired_inodes(struct list_head *delaying_queue,
				struct list_head *dispatch_queue,
				unsigned long *older_than_this)
{
	while (!list_empty(delaying_queue)) {
		struct inode *inode = list_entry(delaying_queue->prev,
						 struct inode, i_list);
		if (older_than_this &&
		    inode_dirtied_after(inode, *older_than_this))
			break;
		list_move(&inode->i_list, dispatch_queue);
	}
}

/*
 * Queue all expired dirty inodes for io, eldest first.
 */
static void queue_io(struct bdi_writeback *wb, unsigned long *older_than_this)
{
	list_splice_init(&wb->b_more_io, wb->b_io.prev);
	move_expired_inodes(&wb->b_dirty, &wb->b_io, older_than_this);
}

static int write_inode(struct inode *inode, int sync)
{
	if (inode->i_sb->s_op->write_inode && !is_bad_inode(inode))
		return inode->i_sb->s_op->write_inode(inode, sync);
	return 0;
}

/*
 * Wait for writeback on an inode to complete.
 */
static void inode_wait_for_writeback(struct inode *inode)
{
	DEFINE_WAIT_BIT(wq, &inode->i_state, __I_SYNC);
	wait_queue_head_t *wqh;

	wqh = bit_waitqueue(&inode->i_state, __I_SYNC);
	do {
		spin_unlock(&inode_lock);
		__wait_on_bit(wqh, &wq, inode_wait, TASK_UNINTERRUPTIBLE);
		spin_lock(&inode_lock);
	} while (inode->i_state & I_SYNC);
}

/*
 * Write out an inode's dirty pages.  Called under inode_lock.  Either the
 * caller has ref on the inode (either via __iget or via syscall against an fd)
 * or the inode has I_WILL_FREE set (via generic_forget_inode)
 *
 * If `wait' is set, wait on the writeout.
 *
 * The whole writeout design is quite complex and fragile.  We want to avoid
 * starvation of particular inodes when others are being redirtied, prevent
 * livelocks, etc.
 *
 * Called under inode_lock.
 */
static int
writeback_single_inode(struct inode *inode, struct writeback_control *wbc)
{
	struct address_space *mapping = inode->i_mapping;
	int wait = wbc->sync_mode == WB_SYNC_ALL;
	unsigned dirty;
	int ret;

	if (!atomic_read(&inode->i_count))
		WARN_ON(!(inode->i_state & (I_WILL_FREE|I_FREEING)));
	else
		WARN_ON(inode->i_state & I_WILL_FREE);

	if (inode->i_state & I_SYNC) {
		/*
		 * If this inode is locked for writeback and we are not doing
		 * writeback-for-data-integrity, move it to b_more_io so that
		 * writeback can proceed with the other inodes on s_io.
		 *
		 * We'll have another go at writing back this inode when we
		 * completed a full scan of b_io.
		 */
		if (!wait) {
			requeue_io(inode);
			return 0;
		}

		/*
		 * It's a data-integrity sync.  We must wait.
		 */
		inode_wait_for_writeback(inode);
	}

	BUG_ON(inode->i_state & I_SYNC);

	/* Set I_SYNC, reset I_DIRTY */
	dirty = inode->i_state & I_DIRTY;
	inode->i_state |= I_SYNC;
	inode->i_state &= ~I_DIRTY;

	spin_unlock(&inode_lock);

	ret = do_writepages(mapping, wbc);

	/* Don't write the inode if only I_DIRTY_PAGES was set */
	if (dirty & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		int err = write_inode(inode, wait);
		if (ret == 0)
			ret = err;
	}

	if (wait) {
		int err = filemap_fdatawait(mapping);
		if (ret == 0)
			ret = err;
	}

	spin_lock(&inode_lock);
	inode->i_state &= ~I_SYNC;
	if (!(inode->i_state & (I_FREEING | I_CLEAR))) {
		if (!(inode->i_state & I_DIRTY) &&
		    mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) {
			/*
			 * We didn't write back all the pages.  nfs_writepages()
			 * sometimes bales out without doing anything. Redirty
			 * the inode; Move it from b_io onto b_more_io/b_dirty.
			 */
			/*
			 * akpm: if the caller was the kupdate function we put
			 * this inode at the head of b_dirty so it gets first
			 * consideration.  Otherwise, move it to the tail, for
			 * the reasons described there.  I'm not really sure
			 * how much sense this makes.  Presumably I had a good
			 * reason for doing it this way, and I'd rather not
			 * muck with it at present.
			 */
			if (wbc->for_kupdate) {
				/*
				 * For the kupdate function we move the inode
				 * to b_more_io so it will get more writeout as
				 * soon as the queue becomes uncongested.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				if (wbc->nr_to_write <= 0) {
					/*
					 * slice used up: queue for next turn
					 */
					requeue_io(inode);
				} else {
					/*
					 * somehow blocked: retry later
					 */
					redirty_tail(inode);
				}
			} else {
				/*
				 * Otherwise fully redirty the inode so that
				 * other inodes on this superblock will get some
				 * writeout.  Otherwise heavy writing to one
				 * file would indefinitely suspend writeout of
				 * all the other files.
				 */
				inode->i_state |= I_DIRTY_PAGES;
				redirty_tail(inode);
			}
		} else if (inode->i_state & I_DIRTY) {
			/*
			 * Someone redirtied the inode while we were writing
			 * back the pages.
			 */
			redirty_tail(inode);
		} else if (atomic_read(&inode->i_count)) {
			/*
			 * The inode is clean, inuse
			 */
			list_move(&inode->i_list, &inode_in_use);
		} else {
			/*
			 * The inode is clean, unused
			 */
			list_move(&inode->i_list, &inode_unused);
		}
	}
	inode_sync_complete(inode);
	return ret;
}

/*
 * For WB_SYNC_NONE writeback, the caller does not have the sb pinned
 * before calling writeback. So make sure that we do pin it, so it doesn't
 * go away while we are writing inodes from it.
 *
 * Returns 0 if the super was successfully pinned (or pinning wasn't needed),
 * 1 if we failed.
 */
static int pin_sb_for_writeback(struct writeback_control *wbc,
				struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Caller must already hold the ref for this
	 */
	if (wbc->sync_mode == WB_SYNC_ALL) {
		WARN_ON(!rwsem_is_locked(&sb->s_umount));
		return 0;
	}

	spin_lock(&sb_lock);
	sb->s_count++;
	if (down_read_trylock(&sb->s_umount)) {
		if (sb->s_root) {
			spin_unlock(&sb_lock);
			return 0;
		}
		/*
		 * umounted, drop rwsem again and fall through to failure
		 */
		up_read(&sb->s_umount);
	}

	sb->s_count--;
	spin_unlock(&sb_lock);
	return 1;
}

static void unpin_sb_for_writeback(struct writeback_control *wbc,
				   struct inode *inode)
{
	struct super_block *sb = inode->i_sb;

	if (wbc->sync_mode == WB_SYNC_ALL)
		return;

	up_read(&sb->s_umount);
	put_super(sb);
}

static void writeback_inodes_wb(struct bdi_writeback *wb,
				struct writeback_control *wbc)
{
	struct super_block *sb = wbc->sb;
	const int is_blkdev_sb = sb_is_blkdev_sb(sb);
	const unsigned long start = jiffies;	/* livelock avoidance */

	spin_lock(&inode_lock);

	if (!wbc->for_kupdate || list_empty(&wb->b_io))
		queue_io(wb, wbc->older_than_this);

	while (!list_empty(&wb->b_io)) {
		struct inode *inode = list_entry(wb->b_io.prev,
						 struct inode, i_list);
		long pages_skipped;

		/*
		 * super block given and doesn't match, skip this inode
		 */
		if (sb && sb != inode->i_sb) {
			redirty_tail(inode);
			continue;
		}

		if (!bdi_cap_writeback_dirty(wb->bdi)) {
			redirty_tail(inode);
			if (is_blkdev_sb) {
				/*
				 * Dirty memory-backed blockdev: the ramdisk
				 * driver does this.  Skip just this inode
				 */
				continue;
			}
			/*
			 * Dirty memory-backed inode against a filesystem other
			 * than the kernel-internal bdev filesystem.  Skip the
			 * entire superblock.
			 */
			break;
		}

		if (inode->i_state & (I_NEW | I_WILL_FREE)) {
			requeue_io(inode);
			continue;
		}

		if (wbc->nonblocking && bdi_write_congested(wb->bdi)) {
			wbc->encountered_congestion = 1;
			if (!is_blkdev_sb)
				break;		/* Skip a congested fs */
			requeue_io(inode);
			continue;		/* Skip a congested blockdev */
		}

		/*
		 * Was this inode dirtied after sync_sb_inodes was called?
		 * This keeps sync from extra jobs and livelock.
		 */
		if (inode_dirtied_after(inode, start))
			break;

		if (pin_sb_for_writeback(wbc, inode)) {
			requeue_io(inode);
			continue;
		}

		BUG_ON(inode->i_state & (I_FREEING | I_CLEAR));
		__iget(inode);
		pages_skipped = wbc->pages_skipped;
		writeback_single_inode(inode, wbc);
		unpin_sb_for_writeback(wbc, inode);
		if (wbc->pages_skipped != pages_skipped) {
			/*
			 * writeback is not making progress due to locked
			 * buffers.  Skip this inode for now.
			 */
			redirty_tail(inode);
		}
		spin_unlock(&inode_lock);
		iput(inode);
		cond_resched();
		spin_lock(&inode_lock);
		if (wbc->nr_to_write <= 0) {
			wbc->more_io = 1;
			break;
		}
		if (!list_empty(&wb->b_more_io))
			wbc->more_io = 1;
	}

	spin_unlock(&inode_lock);
	/* Leave any unwritten inodes on b_io */
}

void writeback_inodes_wbc(struct writeback_control *wbc)
{
	struct backing_dev_info *bdi = wbc->bdi;

	writeback_inodes_wb(&bdi->wb, wbc);
}

/*
 * The maximum number of pages to writeout in a single bdi flush/kupdate
 * operation.  We do this so we don't hold I_SYNC against an inode for
 * enormous amounts of time, which would block a userspace task which has
 * been forced to throttle against that inode.  Also, the code reevaluates
 * the dirty each time it has written this many pages.
 */
#define MAX_WRITEBACK_PAGES	1024

static inline bool over_bground_thresh(void)
{
	unsigned long background_thresh, dirty_thresh;

	get_dirty_limits(&background_thresh, &dirty_thresh, NULL, NULL);

	return (global_page_state(NR_FILE_DIRTY) +
		global_page_state(NR_UNSTABLE_NFS) >= background_thresh);
}

/*
 * Explicit flushing or periodic writeback of "old" data.
 *
 * Define "old": the first time one of an inode's pages is dirtied, we mark the
 * dirtying-time in the inode's address_space.  So this periodic writeback code
 * just walks the superblock inode list, writing back any inodes which are
 * older than a specific point in time.
 *
 * Try to run once per dirty_writeback_interval.  But if a writeback event
 * takes longer than a dirty_writeback_interval interval, then leave a
 * one-second gap.
 *
 * older_than_this takes precedence over nr_to_write.  So we'll only write back
 * all dirty pages if they are all attached to "old" mappings.
 */
static long wb_writeback(struct bdi_writeback *wb, long nr_pages,
			 struct super_block *sb,
			 enum writeback_sync_modes sync_mode, int for_kupdate)
{
	struct writeback_control wbc = {
		.bdi			= wb->bdi,
		.sb			= sb,
		.sync_mode		= sync_mode,
		.older_than_this	= NULL,
		.for_kupdate		= for_kupdate,
		.range_cyclic		= 1,
	};
	unsigned long oldest_jif;
	long wrote = 0;

	if (wbc.for_kupdate) {
		wbc.older_than_this = &oldest_jif;
		oldest_jif = jiffies -
				msecs_to_jiffies(dirty_expire_interval * 10);
	}

	for (;;) {
		/*
		 * Don't flush anything for non-integrity writeback where
		 * no nr_pages was given
		 */
		if (!for_kupdate && nr_pages <= 0 && sync_mode == WB_SYNC_NONE)
			break;

		/*
		 * If no specific pages were given and this is just a
		 * periodic background writeout and we are below the
		 * background dirty threshold, don't do anything
		 */
		if (for_kupdate && nr_pages <= 0 && !over_bground_thresh())
			break;

		wbc.more_io = 0;
		wbc.encountered_congestion = 0;
		wbc.nr_to_write = MAX_WRITEBACK_PAGES;
		wbc.pages_skipped = 0;
		writeback_inodes_wb(wb, &wbc);
		nr_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
		wrote += MAX_WRITEBACK_PAGES - wbc.nr_to_write;

		/*
		 * If we ran out of stuff to write, bail unless more_io got set
		 */
		if (wbc.nr_to_write > 0 || wbc.pages_skipped > 0) {
			if (wbc.more_io && !wbc.for_kupdate)
				continue;
			break;
		}
	}

	return wrote;
}

/*
 * Return the next bdi_work struct that hasn't been processed by this
 * wb thread yet
 */
static struct bdi_work *get_next_work_item(struct backing_dev_info *bdi,
					   struct bdi_writeback *wb)
{
	struct bdi_work *work, *ret = NULL;

	rcu_read_lock();

	list_for_each_entry_rcu(work, &bdi->work_list, list) {
		if (!test_and_clear_bit(wb->nr, &work->seen))
			continue;

		ret = work;
		break;
	}

	rcu_read_unlock();
	return ret;
}

static long wb_check_old_data_flush(struct bdi_writeback *wb)
{
	unsigned long expired;
	long nr_pages;

	expired = wb->last_old_flush +
			msecs_to_jiffies(dirty_writeback_interval * 10);
	if (time_before(jiffies, expired))
		return 0;

	wb->last_old_flush = jiffies;
	nr_pages = global_page_state(NR_FILE_DIRTY) +
			global_page_state(NR_UNSTABLE_NFS) +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	if (nr_pages)
		return wb_writeback(wb, nr_pages, NULL, WB_SYNC_NONE, 1);

	return 0;
}

/*
 * Retrieve work items and do the writeback they describe
 */
long wb_do_writeback(struct bdi_writeback *wb, int force_wait)
{
	struct backing_dev_info *bdi = wb->bdi;
	struct bdi_work *work;
	long nr_pages, wrote = 0;

	while ((work = get_next_work_item(bdi, wb)) != NULL) {
		enum writeback_sync_modes sync_mode;

		nr_pages = work->nr_pages;

		/*
		 * Override sync mode, in case we must wait for completion
		 */
		if (force_wait)
			work->sync_mode = sync_mode = WB_SYNC_ALL;
		else
			sync_mode = work->sync_mode;

		/*
		 * If this isn't a data integrity operation, just notify
		 * that we have seen this work and we are now starting it.
		 */
		if (sync_mode == WB_SYNC_NONE)
			wb_clear_pending(wb, work);

		wrote += wb_writeback(wb, nr_pages, work->sb, sync_mode, 0);

		/*
		 * This is a data integrity writeback, so only do the
		 * notification when we have completed the work.
		 */
		if (sync_mode == WB_SYNC_ALL)
			wb_clear_pending(wb, work);
	}

	/*
	 * Check for periodic writeback, kupdated() style
	 */
	wrote += wb_check_old_data_flush(wb);

	return wrote;
}

/*
 * Handle writeback of dirty data for the device backed by this bdi. Also
 * wakes up periodically and does kupdated style flushing.
 */
int bdi_writeback_task(struct bdi_writeback *wb)
{
	unsigned long last_active = jiffies;
	unsigned long wait_jiffies = -1UL;
	long pages_written;

	while (!kthread_should_stop()) {
		pages_written = wb_do_writeback(wb, 0);

		if (pages_written)
			last_active = jiffies;
		else if (wait_jiffies != -1UL) {
			unsigned long max_idle;

			/*
			 * Longest period of inactivity that we tolerate. If we
			 * see dirty data again later, the task will get
			 * recreated automatically.
			 */
			max_idle = max(5UL * 60 * HZ, wait_jiffies);
			if (time_after(jiffies, max_idle + last_active))
				break;
		}

		wait_jiffies = msecs_to_jiffies(dirty_writeback_interval * 10);
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(wait_jiffies);
		try_to_freeze();
	}

	return 0;
}

/*
 * Schedule writeback for all backing devices. Expensive! If this is a data
 * integrity operation, writeback will be complete when this returns. If
 * we are simply called for WB_SYNC_NONE, then writeback will merely be
 * scheduled to run.
 */
static void bdi_writeback_all(struct writeback_control *wbc)
{
	const bool must_wait = wbc->sync_mode == WB_SYNC_ALL;
	struct backing_dev_info *bdi;
	struct bdi_work *work;
	LIST_HEAD(list);

restart:
	spin_lock(&bdi_lock);

	list_for_each_entry(bdi, &bdi_list, bdi_list) {
		struct bdi_work *work;

		if (!bdi_has_dirty_io(bdi))
			continue;

		/*
		 * If work allocation fails, do the writes inline. We drop
		 * the lock and restart the list writeout. This should be OK,
		 * since this happens rarely and because the writeout should
		 * eventually make more free memory available.
		 */
		work = bdi_alloc_work(wbc);
		if (!work) {
			struct writeback_control __wbc;

			/*
			 * Not a data integrity writeout, just continue
			 */
			if (!must_wait)
				continue;

			spin_unlock(&bdi_lock);
			__wbc = *wbc;
			__wbc.bdi = bdi;
			writeback_inodes_wbc(&__wbc);
			goto restart;
		}
		if (must_wait)
			list_add_tail(&work->wait_list, &list);

		bdi_queue_work(bdi, work);
	}

	spin_unlock(&bdi_lock);

	/*
	 * If this is for WB_SYNC_ALL, wait for pending work to complete
	 * before returning.
	 */
	while (!list_empty(&list)) {
		work = list_entry(list.next, struct bdi_work, wait_list);
		list_del(&work->wait_list);
		bdi_wait_on_work_clear(work);
		call_rcu(&work->rcu_head, bdi_work_free);
	}
}

/*
 * Start writeback of `nr_pages' pages.  If `nr_pages' is zero, write back
 * the whole world.
 */
void wakeup_flusher_threads(long nr_pages)
{
	struct writeback_control wbc = {
		.sync_mode	= WB_SYNC_NONE,
		.older_than_this = NULL,
		.range_cyclic	= 1,
	};

	if (nr_pages == 0)
		nr_pages = global_page_state(NR_FILE_DIRTY) +
				global_page_state(NR_UNSTABLE_NFS);
	wbc.nr_to_write = nr_pages;
	bdi_writeback_all(&wbc);
}

static noinline void block_dump___mark_inode_dirty(struct inode *inode)
{
	if (inode->i_ino || strcmp(inode->i_sb->s_id, "bdev")) {
		struct dentry *dentry;
		const char *name = "?";

		dentry = d_find_alias(inode);
		if (dentry) {
			spin_lock(&dentry->d_lock);
			name = (const char *) dentry->d_name.name;
		}
		printk(KERN_DEBUG
		       "%s(%d): dirtied inode %lu (%s) on %s\n",
		       current->comm, task_pid_nr(current), inode->i_ino,
		       name, inode->i_sb->s_id);
		if (dentry) {
			spin_unlock(&dentry->d_lock);
			dput(dentry);
		}
	}
}

/**
 * __mark_inode_dirty - internal function
 * @inode: inode to mark
 * @flags: what kind of dirty (i.e. I_DIRTY_SYNC)
 * Mark an inode as dirty. Callers should use mark_inode_dirty or
 * mark_inode_dirty_sync.
 *
 * Put the inode on the super block's dirty list.
 *
 * CAREFUL! We mark it dirty unconditionally, but move it onto the
 * dirty list only if it is hashed or if it refers to a blockdev.
 * If it was not hashed, it will never be added to the dirty list
 * even if it is later hashed, as it will have been marked dirty already.
 *
 * In short, make sure you hash any inodes _before_ you start marking
 * them dirty.
 *
 * This function *must* be atomic for the I_DIRTY_PAGES case -
 * set_page_dirty() is called under spinlock in several places.
 *
 * Note that for blockdevs, inode->dirtied_when represents the dirtying time of
 * the block-special inode (/dev/hda1) itself.  And the ->dirtied_when field of
 * the kernel-internal blockdev inode represents the dirtying time of the
 * blockdev's pages.  This is why for I_DIRTY_PAGES we always use
 * page->mapping->host, so the page-dirtying time is recorded in the internal
 * blockdev inode.
 */
void __mark_inode_dirty(struct inode *inode, int flags)
{
	struct super_block *sb = inode->i_sb;

	/*
	 * Don't do this for I_DIRTY_PAGES - that doesn't actually
	 * dirty the inode itself
	 */
	if (flags & (I_DIRTY_SYNC | I_DIRTY_DATASYNC)) {
		if (sb->s_op->dirty_inode)
			sb->s_op->dirty_inode(inode);
	}

	/*
	 * make sure that changes are seen by all cpus before we test i_state
	 * -- mikulas
	 */
	smp_mb();

	/* avoid the locking if we can */
	if ((inode->i_state & flags) == flags)
		return;

	if (unlikely(block_dump))
		block_dump___mark_inode_dirty(inode);

	spin_lock(&inode_lock);
	if ((inode->i_state & flags) != flags) {
		const int was_dirty = inode->i_state & I_DIRTY;

		inode->i_state |= flags;

		/*
		 * If the inode is being synced, just update its dirty state.
		 * The unlocker will place the inode on the appropriate
		 * superblock list, based upon its state.
		 */
		if (inode->i_state & I_SYNC)
			goto out;

		/*
		 * Only add valid (hashed) inodes to the superblock's
		 * dirty list.  Add blockdev inodes as well.
		 */
		if (!S_ISBLK(inode->i_mode)) {
			if (hlist_unhashed(&inode->i_hash))
				goto out;
		}
		if (inode->i_state & (I_FREEING|I_CLEAR))
			goto out;

		/*
		 * If the inode was already on b_dirty/b_io/b_more_io, don't
		 * reposition it (that would break b_dirty time-ordering).
		 */
		if (!was_dirty) {
			struct bdi_writeback *wb = &inode_to_bdi(inode)->wb;

			inode->dirtied_when = jiffies;
			list_move(&inode->i_list, &wb->b_dirty);
		}
	}
out:
	spin_unlock(&inode_lock);
}
EXPORT_SYMBOL(__mark_inode_dirty);

/*
 * Write out a superblock's list of dirty inodes.  A wait will be performed
 * upon no inodes, all inodes or the final one, depending upon sync_mode.
 *
 * If older_than_this is non-NULL, then only write out inodes which
 * had their first dirtying at a time earlier than *older_than_this.
 *
 * If we're a pdflush thread, then implement pdflush collision avoidance
 * against the entire list.
 *
 * If `bdi' is non-zero then we're being asked to writeback a specific queue.
 * This function assumes that the blockdev superblock's inodes are backed by
 * a variety of queues, so all inodes are searched.  For other superblocks,
 * assume that all inodes are backed by the same queue.
 *
 * The inodes to be written are parked on bdi->b_io.  They are moved back onto
 * bdi->b_dirty as they are selected for writing.  This way, none can be missed
 * on the writer throttling path, and we get decent balancing between many
 * throttled threads: we don't want them all piling up on inode_sync_wait.
 */
static void wait_sb_inodes(struct writeback_control *wbc)
{
	struct inode *inode, *old_inode = NULL;

	/*
	 * We need to be protected against the filesystem going from
	 * r/o to r/w or vice versa.
	 */
	WARN_ON(!rwsem_is_locked(&wbc->sb->s_umount));

	spin_lock(&inode_lock);

	/*
	 * Data integrity sync. Must wait for all pages under writeback,
	 * because there may have been pages dirtied before our sync
	 * call, but which had writeout started before we write it out.
	 * In which case, the inode may not be on the dirty list, but
	 * we still have to wait for that writeout.
	 */
	list_for_each_entry(inode, &wbc->sb->s_inodes, i_sb_list) {
		struct address_space *mapping;

		if (inode->i_state & (I_FREEING|I_CLEAR|I_WILL_FREE|I_NEW))
			continue;
		mapping = inode->i_mapping;
		if (mapping->nrpages == 0)
			continue;
		__iget(inode);
		spin_unlock(&inode_lock);
		/*
		 * We hold a reference to 'inode' so it couldn't have
		 * been removed from s_inodes list while we dropped the
		 * inode_lock.  We cannot iput the inode now as we can
		 * be holding the last reference and we cannot iput it
		 * under inode_lock. So we keep the reference and iput
		 * it later.
		 */
		iput(old_inode);
		old_inode = inode;

		filemap_fdatawait(mapping);

		cond_resched();

		spin_lock(&inode_lock);
	}
	spin_unlock(&inode_lock);
	iput(old_inode);
}

/**
 * writeback_inodes_sb - writeback dirty inodes from given super_block
 * @sb: the superblock
 *
 * Start writeback on some inodes on this super_block. No guarantees are made
 * on how many (if any) will be written, and this function does not wait
 * for IO completion of submitted IO. The number of pages submitted is
 * returned.
 */
long writeback_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_NONE,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	unsigned long nr_dirty = global_page_state(NR_FILE_DIRTY);
	unsigned long nr_unstable = global_page_state(NR_UNSTABLE_NFS);
	long nr_to_write;

	nr_to_write = nr_dirty + nr_unstable +
			(inodes_stat.nr_inodes - inodes_stat.nr_unused);

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);
	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(writeback_inodes_sb);

/**
 * sync_inodes_sb - sync sb inode pages
 * @sb: the superblock
 *
 * This function writes and waits on any dirty inode belonging to this
 * super_block. The number of pages synced is returned.
 */
long sync_inodes_sb(struct super_block *sb)
{
	struct writeback_control wbc = {
		.sb		= sb,
		.sync_mode	= WB_SYNC_ALL,
		.range_start	= 0,
		.range_end	= LLONG_MAX,
	};
	long nr_to_write = LONG_MAX; /* doesn't actually matter */

	wbc.nr_to_write = nr_to_write;
	bdi_writeback_all(&wbc);
	wait_sb_inodes(&wbc);
	return nr_to_write - wbc.nr_to_write;
}
EXPORT_SYMBOL(sync_inodes_sb);
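
/*
 * Illustrative usage sketch (the caller below is hypothetical, not taken
 * from this file): a filesystem that only wants to push dirty data for its
 * superblock without waiting can use writeback_inodes_sb(), while a
 * data-integrity sync that must also wait on the pages uses
 * sync_inodes_sb().  Note that sync_inodes_sb() ends up in
 * wait_sb_inodes(), which asserts that the caller holds sb->s_umount.
 *
 *	long submitted = writeback_inodes_sb(sb);  // start writeback, don't wait
 *	long synced    = sync_inodes_sb(sb);       // write out and wait on everything
 */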

/**
 * write_inode_now - write an inode to disk
 * @inode: inode to write to disk
 * @sync: whether the write should be synchronous or not
 *
 * This function commits an inode to disk immediately if it is dirty. This is
 * primarily needed by knfsd.
 *
 * The caller must either have a ref on the inode or must have set I_WILL_FREE.
 */
int write_inode_now(struct inode *inode, int sync)
{
	int ret;
	struct writeback_control wbc = {
		.nr_to_write = LONG_MAX,
		.sync_mode = sync ? WB_SYNC_ALL : WB_SYNC_NONE,
		.range_start = 0,
		.range_end = LLONG_MAX,
	};

	if (!mapping_cap_writeback_dirty(inode->i_mapping))
		wbc.nr_to_write = 0;

	might_sleep();
	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, &wbc);
	spin_unlock(&inode_lock);
	if (sync)
		inode_sync_wait(inode);
	return ret;
}
EXPORT_SYMBOL(write_inode_now);

/**
 * sync_inode - write an inode and its pages to disk.
 * @inode: the inode to sync
 * @wbc: controls the writeback mode
 *
 * sync_inode() will write an inode and its pages to disk.  It will also
 * correctly update the inode on its superblock's dirty inode lists and will
 * update inode->i_state.
 *
 * The caller must have a ref on the inode.
 */
int sync_inode(struct inode *inode, struct writeback_control *wbc)
{
	int ret;

	spin_lock(&inode_lock);
	ret = writeback_single_inode(inode, wbc);
	spin_unlock(&inode_lock);
	return ret;
}
EXPORT_SYMBOL(sync_inode);

/**
 * generic_osync_inode - flush all dirty data for a given inode to disk
 * @inode: inode to write
 * @mapping: the address_space that should be flushed
 * @what: what to write and wait upon
 *
 * This can be called by file_write functions for files which have the
 * O_SYNC flag set, to flush dirty writes to disk.
 *
 * @what is a bitmask, specifying which part of the inode's data should be
 * written and waited upon.
 *
 * OSYNC_DATA:     i_mapping's dirty data
 * OSYNC_METADATA: the buffers at i_mapping->private_list
 * OSYNC_INODE:    the inode itself
 */

int generic_osync_inode(struct inode *inode, struct address_space *mapping, int what)
{
	int err = 0;
	int need_write_inode_now = 0;
	int err2;

	if (what & OSYNC_DATA)
		err = filemap_fdatawrite(mapping);
	if (what & (OSYNC_METADATA|OSYNC_DATA)) {
		err2 = sync_mapping_buffers(mapping);
		if (!err)
			err = err2;
	}
	if (what & OSYNC_DATA) {
		err2 = filemap_fdatawait(mapping);
		if (!err)
			err = err2;
	}

	spin_lock(&inode_lock);
	if ((inode->i_state & I_DIRTY) &&
	    ((what & OSYNC_INODE) || (inode->i_state & I_DIRTY_DATASYNC)))
		need_write_inode_now = 1;
	spin_unlock(&inode_lock);

	if (need_write_inode_now) {
		err2 = write_inode_now(inode, 1);
		if (!err)
			err = err2;
	} else
		inode_sync_wait(inode);

	return err;
}
EXPORT_SYMBOL(generic_osync_inode);