Blame - mm/page-writeback.c - kernel/msm-4.9

blob: a6329fa8f862da5aea02238c687e93843ba2a80e [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* mm/page-writeback.c.
				3	*
				4	* Copyright (C) 2002, Linus Torvalds.
				5	*
				6	* Contains functions related to writing back dirty pages at the
				7	* address_space level.
				8	*
				9	* 10Apr2002 akpm@zip.com.au
				10	* Initial version
				11	*/
				12
				13	#include <linux/kernel.h>
				14	#include <linux/module.h>
				15	#include <linux/spinlock.h>
				16	#include <linux/fs.h>
				17	#include <linux/mm.h>
				18	#include <linux/swap.h>
				19	#include <linux/slab.h>
				20	#include <linux/pagemap.h>
				21	#include <linux/writeback.h>
				22	#include <linux/init.h>
				23	#include <linux/backing-dev.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/mpage.h>
				26	#include <linux/percpu.h>
				27	#include <linux/notifier.h>
				28	#include <linux/smp.h>
				29	#include <linux/sysctl.h>
				30	#include <linux/cpu.h>
				31	#include <linux/syscalls.h>
				32
				/*
				 * Cap on the number of pages written out in one bdflush/kupdate pass.
				 * Keeping a pass short avoids holding I_LOCK against an inode for a long
				 * time, which would stall a userspace task that has been forced to throttle
				 * against that inode.  The dirty state is also re-evaluated each time this
				 * many pages have been written.
				 */
				#define MAX_WRITEBACK_PAGES	1024

				/*
				 * Once a CPU has dirtied this many pages, balance_dirty_pages_ratelimited()
				 * checks whether writeback or throttling must be forced.  The value is
				 * recomputed by set_ratelimit().
				 */
				static long ratelimit_pages = 32;

				static long total_pages;	/* Total number of pages in the machine */
				static int dirty_exceeded;	/* Nonzero while dirty memory may be over the limit */
				50
				51	/*
				52	* When balance_dirty_pages decides that the caller needs to perform some
				53	* non-background writeback, this is how many pages it will attempt to write.
				54	* It should be somewhat larger than RATELIMIT_PAGES to ensure that reasonably
				55	* large amounts of I/O are submitted.
				56	*/
				57	static inline long sync_writeback_pages(void)
				58	{
				59	return ratelimit_pages + ratelimit_pages / 2;
				60	}
				61
				/* Tunables exported through /proc/sys/vm start here. */

				/*
				 * Percentage at which background writeback (via pdflush) is started.
				 */
				int dirty_background_ratio = 10;

				/*
				 * Percentage at which the task generating dirty data starts writeback
				 * itself.
				 */
				int vm_dirty_ratio = 40;

				/*
				 * Interval between `kupdate'-style writebacks, in centiseconds
				 * (hundredths of a second).
				 */
				int dirty_writeback_centisecs = 5 * 100;

				/*
				 * Maximum time, in centiseconds, that data may stay dirty.
				 */
				int dirty_expire_centisecs = 30 * 100;

				/*
				 * When set, the machine dumps writes/reads and block dirtyings.
				 */
				int block_dump;

				/*
				 * When set, the machine is in "laptop mode".
				 */
				int laptop_mode;

				EXPORT_SYMBOL(laptop_mode);

				/* Tunables exported through /proc/sys/vm end here. */
				98
				99
				/* Defined further down; needed earlier as a pdflush_operation() callback. */
				static void background_writeout(unsigned long _min_pages);

				/*
				 * Snapshot of the page-state counters consulted by the writeback code.
				 * Filled in by get_writeback_state().
				 */
				struct writeback_state {
					unsigned long nr_dirty;		/* read_page_state(nr_dirty) */
					unsigned long nr_unstable;	/* read_page_state(nr_unstable) */
					unsigned long nr_mapped;	/* read_page_state(nr_mapped) */
					unsigned long nr_writeback;	/* read_page_state(nr_writeback) */
				};
				109
				110	static void get_writeback_state(struct writeback_state *wbs)
				111	{
				112	wbs->nr_dirty = read_page_state(nr_dirty);
				113	wbs->nr_unstable = read_page_state(nr_unstable);
				114	wbs->nr_mapped = read_page_state(nr_mapped);
				115	wbs->nr_writeback = read_page_state(nr_writeback);
				116	}
				117
				118	/*
				119	* Work out the current dirty-memory clamping and background writeout
				120	* thresholds.
				121	*
				122	* The main aim here is to lower them aggressively if there is a lot of mapped
				123	* memory around. To avoid stressing page reclaim with lots of unreclaimable
				124	* pages. It is better to clamp down on writers than to start swapping, and
				125	* performing lots of scanning.
				126	*
				127	* We only allow 1/2 of the currently-unmapped memory to be dirtied.
				128	*
				129	* We don't permit the clamping level to fall below 5% - that is getting rather
				130	* excessive.
				131	*
				132	* We make sure that the background writeout level is below the adjusted
				133	* clamping level.
				134	*/
				135	static void
				136	get_dirty_limits(struct writeback_state wbs, long pbackground, long *pdirty,
				137	struct address_space *mapping)
				138	{
				139	int background_ratio; /* Percentages */
				140	int dirty_ratio;
				141	int unmapped_ratio;
				142	long background;
				143	long dirty;
				144	unsigned long available_memory = total_pages;
				145	struct task_struct *tsk;
				146
				147	get_writeback_state(wbs);
				148
				149	#ifdef CONFIG_HIGHMEM
				150	/*
				151	* If this mapping can only allocate from low memory,
				152	* we exclude high memory from our count.
				153	*/
				154	if (mapping && !(mapping_gfp_mask(mapping) & __GFP_HIGHMEM))
				155	available_memory -= totalhigh_pages;
				156	#endif
				157
				158
				159	unmapped_ratio = 100 - (wbs->nr_mapped * 100) / total_pages;
				160
				161	dirty_ratio = vm_dirty_ratio;
				162	if (dirty_ratio > unmapped_ratio / 2)
				163	dirty_ratio = unmapped_ratio / 2;
				164
				165	if (dirty_ratio < 5)
				166	dirty_ratio = 5;
				167
				168	background_ratio = dirty_background_ratio;
				169	if (background_ratio >= dirty_ratio)
				170	background_ratio = dirty_ratio / 2;
				171
				172	background = (background_ratio * available_memory) / 100;
				173	dirty = (dirty_ratio * available_memory) / 100;
				174	tsk = current;
				175	if (tsk->flags & PF_LESS_THROTTLE \|\| rt_task(tsk)) {
				176	background += background / 4;
				177	dirty += dirty / 4;
				178	}
				179	*pbackground = background;
				180	*pdirty = dirty;
				181	}
				182
				183	/*
				184	* balance_dirty_pages() must be called by processes which are generating dirty
				185	* data. It looks at the number of dirty pages in the machine and will force
				186	* the caller to perform writeback if the system is over `vm_dirty_ratio'.
				187	* If we're over `background_thresh' then pdflush is woken to perform some
				188	* writeout.
				189	*/
				190	static void balance_dirty_pages(struct address_space *mapping)
				191	{
				192	struct writeback_state wbs;
				193	long nr_reclaimable;
				194	long background_thresh;
				195	long dirty_thresh;
				196	unsigned long pages_written = 0;
				197	unsigned long write_chunk = sync_writeback_pages();
				198
				199	struct backing_dev_info *bdi = mapping->backing_dev_info;
				200
				201	for (;;) {
				202	struct writeback_control wbc = {
				203	.bdi = bdi,
				204	.sync_mode = WB_SYNC_NONE,
				205	.older_than_this = NULL,
				206	.nr_to_write = write_chunk,
				207	};
				208
				209	get_dirty_limits(&wbs, &background_thresh,
				210	&dirty_thresh, mapping);
				211	nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
				212	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
				213	break;
				214
				215	dirty_exceeded = 1;
				216
				217	/* Note: nr_reclaimable denotes nr_dirty + nr_unstable.
				218	* Unstable writes are a feature of certain networked
				219	* filesystems (i.e. NFS) in which data may have been
				220	* written to the server's write cache, but has not yet
				221	* been flushed to permanent storage.
				222	*/
				223	if (nr_reclaimable) {
				224	writeback_inodes(&wbc);
				225	get_dirty_limits(&wbs, &background_thresh,
				226	&dirty_thresh, mapping);
				227	nr_reclaimable = wbs.nr_dirty + wbs.nr_unstable;
				228	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
				229	break;
				230	pages_written += write_chunk - wbc.nr_to_write;
				231	if (pages_written >= write_chunk)
				232	break; /* We've done our duty */
				233	}
				234	blk_congestion_wait(WRITE, HZ/10);
				235	}
				236
				237	if (nr_reclaimable + wbs.nr_writeback <= dirty_thresh)
				238	dirty_exceeded = 0;
				239
				240	if (writeback_in_progress(bdi))
				241	return; /* pdflush is already working this queue */
				242
				243	/*
				244	* In laptop mode, we wait until hitting the higher threshold before
				245	* starting background writeout, and then write out all the way down
				246	* to the lower threshold. So slow writers cause minimal disk activity.
				247	*
				248	* In normal mode, we start background writeout at the lower
				249	* background_thresh, to keep the amount of dirty memory low.
				250	*/
				251	if ((laptop_mode && pages_written) \|\|
				252	(!laptop_mode && (nr_reclaimable > background_thresh)))
				253	pdflush_operation(background_writeout, 0);
				254	}
				255
				256	/**
				257	* balance_dirty_pages_ratelimited - balance dirty memory state
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	258	* @mapping: address_space which was dirtied
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	259	*
				260	* Processes which are dirtying memory should call in here once for each page
				261	* which was newly dirtied. The function will periodically check the system's
				262	* dirty state and will initiate writeback if needed.
				263	*
				264	* On really big machines, get_writeback_state is expensive, so try to avoid
				265	* calling it too often (ratelimiting). But once we're over the dirty memory
				266	* limit we decrease the ratelimiting by a lot, to prevent individual processes
				267	* from overshooting the limit by (ratelimit_pages) each.
				268	*/
				269	void balance_dirty_pages_ratelimited(struct address_space *mapping)
				270	{
				271	static DEFINE_PER_CPU(int, ratelimits) = 0;
				272	long ratelimit;
				273
				274	ratelimit = ratelimit_pages;
				275	if (dirty_exceeded)
				276	ratelimit = 8;
				277
				278	/*
				279	* Check the rate limiting. Also, we do not want to throttle real-time
				280	* tasks in balance_dirty_pages(). Period.
				281	*/
				282	if (get_cpu_var(ratelimits)++ >= ratelimit) {
				283	__get_cpu_var(ratelimits) = 0;
				284	put_cpu_var(ratelimits);
				285	balance_dirty_pages(mapping);
				286	return;
				287	}
				288	put_cpu_var(ratelimits);
				289	}
				290	EXPORT_SYMBOL(balance_dirty_pages_ratelimited);
				291
				292	void throttle_vm_writeout(void)
				293	{
				294	struct writeback_state wbs;
				295	long background_thresh;
				296	long dirty_thresh;
				297
				298	for ( ; ; ) {
				299	get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
				300
				301	/*
				302	* Boost the allowable dirty threshold a bit for page
				303	* allocators so they don't get DoS'ed by heavy writers
				304	*/
				305	dirty_thresh += dirty_thresh / 10; /* wheeee... */
				306
				307	if (wbs.nr_unstable + wbs.nr_writeback <= dirty_thresh)
				308	break;
				309	blk_congestion_wait(WRITE, HZ/10);
				310	}
				311	}
				312
				313
				314	/*
				315	* writeback at least _min_pages, and keep writing until the amount of dirty
				316	* memory is less than the background threshold, or until we're all clean.
				317	*/
				318	static void background_writeout(unsigned long _min_pages)
				319	{
				320	long min_pages = _min_pages;
				321	struct writeback_control wbc = {
				322	.bdi = NULL,
				323	.sync_mode = WB_SYNC_NONE,
				324	.older_than_this = NULL,
				325	.nr_to_write = 0,
				326	.nonblocking = 1,
				327	};
				328
				329	for ( ; ; ) {
				330	struct writeback_state wbs;
				331	long background_thresh;
				332	long dirty_thresh;
				333
				334	get_dirty_limits(&wbs, &background_thresh, &dirty_thresh, NULL);
				335	if (wbs.nr_dirty + wbs.nr_unstable < background_thresh
				336	&& min_pages <= 0)
				337	break;
				338	wbc.encountered_congestion = 0;
				339	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
				340	wbc.pages_skipped = 0;
				341	writeback_inodes(&wbc);
				342	min_pages -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
				343	if (wbc.nr_to_write > 0 \|\| wbc.pages_skipped > 0) {
				344	/* Wrote less than expected */
				345	blk_congestion_wait(WRITE, HZ/10);
				346	if (!wbc.encountered_congestion)
				347	break;
				348	}
				349	}
				350	}
				351
				352	/*
				353	* Start writeback of `nr_pages' pages. If `nr_pages' is zero, write back
				354	* the whole world. Returns 0 if a pdflush thread was dispatched. Returns
				355	* -1 if all pdflush threads were busy.
				356	*/
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame^]	357	int wakeup_pdflush(long nr_pages)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	358	{
				359	if (nr_pages == 0) {
				360	struct writeback_state wbs;
				361
				362	get_writeback_state(&wbs);
				363	nr_pages = wbs.nr_dirty + wbs.nr_unstable;
				364	}
				365	return pdflush_operation(background_writeout, nr_pages);
				366	}
				367
				368	static void wb_timer_fn(unsigned long unused);
				369	static void laptop_timer_fn(unsigned long unused);
				370
				371	static struct timer_list wb_timer =
				372	TIMER_INITIALIZER(wb_timer_fn, 0, 0);
				373	static struct timer_list laptop_mode_wb_timer =
				374	TIMER_INITIALIZER(laptop_timer_fn, 0, 0);
				375
				376	/*
				377	* Periodic writeback of "old" data.
				378	*
				379	* Define "old": the first time one of an inode's pages is dirtied, we mark the
				380	* dirtying-time in the inode's address_space. So this periodic writeback code
				381	* just walks the superblock inode list, writing back any inodes which are
				382	* older than a specific point in time.
				383	*
				384	* Try to run once per dirty_writeback_centisecs. But if a writeback event
				385	* takes longer than a dirty_writeback_centisecs interval, then leave a
				386	* one-second gap.
				387	*
				388	* older_than_this takes precedence over nr_to_write. So we'll only write back
				389	* all dirty pages if they are all attached to "old" mappings.
				390	*/
				391	static void wb_kupdate(unsigned long arg)
				392	{
				393	unsigned long oldest_jif;
				394	unsigned long start_jif;
				395	unsigned long next_jif;
				396	long nr_to_write;
				397	struct writeback_state wbs;
				398	struct writeback_control wbc = {
				399	.bdi = NULL,
				400	.sync_mode = WB_SYNC_NONE,
				401	.older_than_this = &oldest_jif,
				402	.nr_to_write = 0,
				403	.nonblocking = 1,
				404	.for_kupdate = 1,
				405	};
				406
				407	sync_supers();
				408
				409	get_writeback_state(&wbs);
				410	oldest_jif = jiffies - (dirty_expire_centisecs * HZ) / 100;
				411	start_jif = jiffies;
				412	next_jif = start_jif + (dirty_writeback_centisecs * HZ) / 100;
				413	nr_to_write = wbs.nr_dirty + wbs.nr_unstable +
				414	(inodes_stat.nr_inodes - inodes_stat.nr_unused);
				415	while (nr_to_write > 0) {
				416	wbc.encountered_congestion = 0;
				417	wbc.nr_to_write = MAX_WRITEBACK_PAGES;
				418	writeback_inodes(&wbc);
				419	if (wbc.nr_to_write > 0) {
				420	if (wbc.encountered_congestion)
				421	blk_congestion_wait(WRITE, HZ/10);
				422	else
				423	break; /* All the old data is written */
				424	}
				425	nr_to_write -= MAX_WRITEBACK_PAGES - wbc.nr_to_write;
				426	}
				427	if (time_before(next_jif, jiffies + HZ))
				428	next_jif = jiffies + HZ;
				429	if (dirty_writeback_centisecs)
				430	mod_timer(&wb_timer, next_jif);
				431	}
				432
				433	/*
				434	* sysctl handler for /proc/sys/vm/dirty_writeback_centisecs
				435	*/
				436	int dirty_writeback_centisecs_handler(ctl_table *table, int write,
				437	struct file file, void __user buffer, size_t length, loff_t ppos)
				438	{
				439	proc_dointvec(table, write, file, buffer, length, ppos);
				440	if (dirty_writeback_centisecs) {
				441	mod_timer(&wb_timer,
				442	jiffies + (dirty_writeback_centisecs * HZ) / 100);
				443	} else {
				444	del_timer(&wb_timer);
				445	}
				446	return 0;
				447	}
				448
				449	static void wb_timer_fn(unsigned long unused)
				450	{
				451	if (pdflush_operation(wb_kupdate, 0) < 0)
				452	mod_timer(&wb_timer, jiffies + HZ); /* delay 1 second */
				453	}
				454
				/* pdflush callback used in laptop mode: performs a full sys_sync(). */
				static void laptop_flush(unsigned long unused)
				{
					sys_sync();
				}
				459
				460	static void laptop_timer_fn(unsigned long unused)
				461	{
				462	pdflush_operation(laptop_flush, 0);
				463	}
				464
				465	/*
				466	* We've spun up the disk and we're in laptop mode: schedule writeback
				467	* of all dirty data a few seconds from now. If the flush is already scheduled
				468	* then push it back - the user is still using the disk.
				469	*/
				470	void laptop_io_completion(void)
				471	{
				472	mod_timer(&laptop_mode_wb_timer, jiffies + laptop_mode * HZ);
				473	}
				474
				475	/*
				476	* We're in laptop mode and we've just synced. The sync's writes will have
				477	* caused another writeback to be scheduled by laptop_io_completion.
				478	* Nothing needs to be written back anymore, so we unschedule the writeback.
				479	*/
				480	void laptop_sync_completion(void)
				481	{
				482	del_timer(&laptop_mode_wb_timer);
				483	}
				484
				485	/*
				486	* If ratelimit_pages is too high then we can get into dirty-data overload
				487	* if a large number of processes all perform writes at the same time.
				488	* If it is too low then SMP machines will call the (expensive)
				489	* get_writeback_state too often.
				490	*
				491	* Here we set ratelimit_pages to a level which ensures that when all CPUs are
				492	* dirtying in parallel, we cannot go more than 3% (1/32) over the dirty memory
				493	* thresholds before writeback cuts in.
				494	*
				495	* But the limit should not be set too high. Because it also controls the
				496	* amount of memory which the balance_dirty_pages() caller has to write back.
				497	* If this is too large then the caller will block on the IO queue all the
				498	* time. So limit it to four megabytes - the balance_dirty_pages() caller
				499	* will write six megabyte chunks, max.
				500	*/
				501
				502	static void set_ratelimit(void)
				503	{
				504	ratelimit_pages = total_pages / (num_online_cpus() * 32);
				505	if (ratelimit_pages < 16)
				506	ratelimit_pages = 16;
				507	if (ratelimit_pages * PAGE_CACHE_SIZE > 4096 * 1024)
				508	ratelimit_pages = (4096 * 1024) / PAGE_CACHE_SIZE;
				509	}
				510
				/*
				 * Notifier callback (registered through ratelimit_nb): recompute
				 * ratelimit_pages.  All three arguments are ignored.  Always returns 0.
				 */
				static int
				ratelimit_handler(struct notifier_block *self, unsigned long u, void *v)
				{
					set_ratelimit();
					return 0;
				}
				517
				518	static struct notifier_block ratelimit_nb = {
				519	.notifier_call = ratelimit_handler,
				520	.next = NULL,
				521	};
				522
				523	/*
				524	* If the machine has a large highmem:lowmem ratio then scale back the default
				525	* dirty memory thresholds: allowing too much dirty highmem pins an excessive
				526	* number of buffer_heads.
				527	*/
				528	void __init page_writeback_init(void)
				529	{
				530	long buffer_pages = nr_free_buffer_pages();
				531	long correction;
				532
				533	total_pages = nr_free_pagecache_pages();
				534
				535	correction = (100 * 4 * buffer_pages) / total_pages;
				536
				537	if (correction < 100) {
				538	dirty_background_ratio *= correction;
				539	dirty_background_ratio /= 100;
				540	vm_dirty_ratio *= correction;
				541	vm_dirty_ratio /= 100;
				542
				543	if (dirty_background_ratio <= 0)
				544	dirty_background_ratio = 1;
				545	if (vm_dirty_ratio <= 0)
				546	vm_dirty_ratio = 1;
				547	}
				548	mod_timer(&wb_timer, jiffies + (dirty_writeback_centisecs * HZ) / 100);
				549	set_ratelimit();
				550	register_cpu_notifier(&ratelimit_nb);
				551	}
				552
				553	int do_writepages(struct address_space mapping, struct writeback_control wbc)
				554	{
				555	if (wbc->nr_to_write <= 0)
				556	return 0;
				557	if (mapping->a_ops->writepages)
				558	return mapping->a_ops->writepages(mapping, wbc);
				559	return generic_writepages(mapping, wbc);
				560	}
				561
				562	/**
				563	* write_one_page - write out a single page and optionally wait on I/O
				564	*
Martin Waitz	67be2dd	2005-05-01 08:59:26 -0700	[diff] [blame]	565	* @page: the page to write
				566	* @wait: if true, wait on writeout
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	567	*
				568	* The page must be locked by the caller and will be unlocked upon return.
				569	*
				570	* write_one_page() returns a negative error code if I/O failed.
				571	*/
				572	int write_one_page(struct page *page, int wait)
				573	{
				574	struct address_space *mapping = page->mapping;
				575	int ret = 0;
				576	struct writeback_control wbc = {
				577	.sync_mode = WB_SYNC_ALL,
				578	.nr_to_write = 1,
				579	};
				580
				581	BUG_ON(!PageLocked(page));
				582
				583	if (wait)
				584	wait_on_page_writeback(page);
				585
				586	if (clear_page_dirty_for_io(page)) {
				587	page_cache_get(page);
				588	ret = mapping->a_ops->writepage(page, &wbc);
				589	if (ret == 0 && wait) {
				590	wait_on_page_writeback(page);
				591	if (PageError(page))
				592	ret = -EIO;
				593	}
				594	page_cache_release(page);
				595	} else {
				596	unlock_page(page);
				597	}
				598	return ret;
				599	}
				600	EXPORT_SYMBOL(write_one_page);
				601
				602	/*
				603	* For address_spaces which do not use buffers. Just tag the page as dirty in
				604	* its radix tree.
				605	*
				606	* This is also used when a single buffer is being dirtied: we want to set the
				607	* page dirty in that case, but not all the buffers. This is a "bottom-up"
				608	* dirtying, whereas __set_page_dirty_buffers() is a "top-down" dirtying.
				609	*
				610	* Most callers have locked the page, which pins the address_space in memory.
				611	* But zap_pte_range() does not lock the page, however in that case the
				612	* mapping is pinned by the vma's ->vm_file reference.
				613	*
				614	* We take care to handle the case where the page was truncated from the
				615	* mapping by re-checking page_mapping() insode tree_lock.
				616	*/
				617	int __set_page_dirty_nobuffers(struct page *page)
				618	{
				619	int ret = 0;
				620
				621	if (!TestSetPageDirty(page)) {
				622	struct address_space *mapping = page_mapping(page);
				623	struct address_space *mapping2;
				624
				625	if (mapping) {
				626	write_lock_irq(&mapping->tree_lock);
				627	mapping2 = page_mapping(page);
				628	if (mapping2) { /* Race with truncate? */
				629	BUG_ON(mapping2 != mapping);
				630	if (mapping_cap_account_dirty(mapping))
				631	inc_page_state(nr_dirty);
				632	radix_tree_tag_set(&mapping->page_tree,
				633	page_index(page), PAGECACHE_TAG_DIRTY);
				634	}
				635	write_unlock_irq(&mapping->tree_lock);
				636	if (mapping->host) {
				637	/* !PageAnon && !swapper_space */
				638	__mark_inode_dirty(mapping->host,
				639	I_DIRTY_PAGES);
				640	}
				641	}
				642	}
				643	return ret;
				644	}
				645	EXPORT_SYMBOL(__set_page_dirty_nobuffers);
				646
				647	/*
				648	* When a writepage implementation decides that it doesn't want to write this
				649	* page for some reason, it should redirty the locked page via
				650	* redirty_page_for_writepage() and it should then unlock the page and return 0
				651	*/
				652	int redirty_page_for_writepage(struct writeback_control wbc, struct page page)
				653	{
				654	wbc->pages_skipped++;
				655	return __set_page_dirty_nobuffers(page);
				656	}
				657	EXPORT_SYMBOL(redirty_page_for_writepage);
				658
				659	/*
				660	* If the mapping doesn't provide a set_page_dirty a_op, then
				661	* just fall through and assume that it wants buffer_heads.
				662	*/
				663	int fastcall set_page_dirty(struct page *page)
				664	{
				665	struct address_space *mapping = page_mapping(page);
				666
				667	if (likely(mapping)) {
				668	int (spd)(struct page ) = mapping->a_ops->set_page_dirty;
				669	if (spd)
				670	return (*spd)(page);
				671	return __set_page_dirty_buffers(page);
				672	}
				673	if (!PageDirty(page))
				674	SetPageDirty(page);
				675	return 0;
				676	}
				677	EXPORT_SYMBOL(set_page_dirty);
				678
				/*
				 * set_page_dirty() is racy if the caller has no reference against
				 * page->mapping->host, and if the page is unlocked: another CPU could
				 * truncate the page off the mapping and then free the mapping.
				 *
				 * Usually the page _is_ locked, or the caller is a user-space process which
				 * holds a reference on the inode by having an open file.
				 *
				 * For the remaining cases this helper takes the page lock around
				 * set_page_dirty() and returns its result.
				 */
				int set_page_dirty_lock(struct page *page)
				{
					int rc;

					lock_page(page);
					rc = set_page_dirty(page);
					unlock_page(page);

					return rc;
				}
				EXPORT_SYMBOL(set_page_dirty_lock);
				699
				700	/*
				701	* Clear a page's dirty flag, while caring for dirty memory accounting.
				702	* Returns true if the page was previously dirty.
				703	*/
				704	int test_clear_page_dirty(struct page *page)
				705	{
				706	struct address_space *mapping = page_mapping(page);
				707	unsigned long flags;
				708
				709	if (mapping) {
				710	write_lock_irqsave(&mapping->tree_lock, flags);
				711	if (TestClearPageDirty(page)) {
				712	radix_tree_tag_clear(&mapping->page_tree,
				713	page_index(page),
				714	PAGECACHE_TAG_DIRTY);
				715	write_unlock_irqrestore(&mapping->tree_lock, flags);
				716	if (mapping_cap_account_dirty(mapping))
				717	dec_page_state(nr_dirty);
				718	return 1;
				719	}
				720	write_unlock_irqrestore(&mapping->tree_lock, flags);
				721	return 0;
				722	}
				723	return TestClearPageDirty(page);
				724	}
				725	EXPORT_SYMBOL(test_clear_page_dirty);
				726
				727	/*
				728	* Clear a page's dirty flag, while caring for dirty memory accounting.
				729	* Returns true if the page was previously dirty.
				730	*
				731	* This is for preparing to put the page under writeout. We leave the page
				732	* tagged as dirty in the radix tree so that a concurrent write-for-sync
				733	* can discover it via a PAGECACHE_TAG_DIRTY walk. The ->writepage
				734	* implementation will run either set_page_writeback() or set_page_dirty(),
				735	* at which stage we bring the page's dirty flag and radix-tree dirty tag
				736	* back into sync.
				737	*
				738	* This incoherency between the page's dirty flag and radix-tree tag is
				739	* unfortunate, but it only exists while the page is locked.
				740	*/
				741	int clear_page_dirty_for_io(struct page *page)
				742	{
				743	struct address_space *mapping = page_mapping(page);
				744
				745	if (mapping) {
				746	if (TestClearPageDirty(page)) {
				747	if (mapping_cap_account_dirty(mapping))
				748	dec_page_state(nr_dirty);
				749	return 1;
				750	}
				751	return 0;
				752	}
				753	return TestClearPageDirty(page);
				754	}
				755	EXPORT_SYMBOL(clear_page_dirty_for_io);
				756
				757	int test_clear_page_writeback(struct page *page)
				758	{
				759	struct address_space *mapping = page_mapping(page);
				760	int ret;
				761
				762	if (mapping) {
				763	unsigned long flags;
				764
				765	write_lock_irqsave(&mapping->tree_lock, flags);
				766	ret = TestClearPageWriteback(page);
				767	if (ret)
				768	radix_tree_tag_clear(&mapping->page_tree,
				769	page_index(page),
				770	PAGECACHE_TAG_WRITEBACK);
				771	write_unlock_irqrestore(&mapping->tree_lock, flags);
				772	} else {
				773	ret = TestClearPageWriteback(page);
				774	}
				775	return ret;
				776	}
				777
				778	int test_set_page_writeback(struct page *page)
				779	{
				780	struct address_space *mapping = page_mapping(page);
				781	int ret;
				782
				783	if (mapping) {
				784	unsigned long flags;
				785
				786	write_lock_irqsave(&mapping->tree_lock, flags);
				787	ret = TestSetPageWriteback(page);
				788	if (!ret)
				789	radix_tree_tag_set(&mapping->page_tree,
				790	page_index(page),
				791	PAGECACHE_TAG_WRITEBACK);
				792	if (!PageDirty(page))
				793	radix_tree_tag_clear(&mapping->page_tree,
				794	page_index(page),
				795	PAGECACHE_TAG_DIRTY);
				796	write_unlock_irqrestore(&mapping->tree_lock, flags);
				797	} else {
				798	ret = TestSetPageWriteback(page);
				799	}
				800	return ret;
				801
				802	}
				803	EXPORT_SYMBOL(test_set_page_writeback);
				804
				805	/*
				806	* Return true if any of the pages in the mapping are marged with the
				807	* passed tag.
				808	*/
				809	int mapping_tagged(struct address_space *mapping, int tag)
				810	{
				811	unsigned long flags;
				812	int ret;
				813
				814	read_lock_irqsave(&mapping->tree_lock, flags);
				815	ret = radix_tree_tagged(&mapping->page_tree, tag);
				816	read_unlock_irqrestore(&mapping->tree_lock, flags);
				817	return ret;
				818	}
				819	EXPORT_SYMBOL(mapping_tagged);