| 1 | /*
| 2 | * linux/mm/vmscan.c |
| 3 | * |
| 4 | * Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds |
| 5 | * |
| 6 | * Swap reorganised 29.12.95, Stephen Tweedie. |
| 7 | * kswapd added: 7.1.96 sct |
| 8 | * Removed kswapd_ctl limits, and swap out as many pages as needed |
| 9 | * to bring the system back to freepages.high: 2.4.97, Rik van Riel. |
| 10 | * Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com). |
| 11 | * Multiqueue VM started 5.8.00, Rik van Riel. |
| 12 | */ |
| 13 | |
| 14 | #include <linux/mm.h> |
| 15 | #include <linux/module.h> |
| 16 | #include <linux/slab.h> |
| 17 | #include <linux/kernel_stat.h> |
| 18 | #include <linux/swap.h> |
| 19 | #include <linux/pagemap.h> |
| 20 | #include <linux/init.h> |
| 21 | #include <linux/highmem.h> |
| 22 | #include <linux/file.h> |
| 23 | #include <linux/writeback.h> |
| 24 | #include <linux/blkdev.h> |
| 25 | #include <linux/buffer_head.h> /* for try_to_release_page(), |
| 26 | buffer_heads_over_limit */ |
| 27 | #include <linux/mm_inline.h> |
| 28 | #include <linux/pagevec.h> |
| 29 | #include <linux/backing-dev.h> |
| 30 | #include <linux/rmap.h> |
| 31 | #include <linux/topology.h> |
| 32 | #include <linux/cpu.h> |
| 33 | #include <linux/cpuset.h> |
| 34 | #include <linux/notifier.h> |
| 35 | #include <linux/rwsem.h> |
| 36 | |
| 37 | #include <asm/tlbflush.h> |
| 38 | #include <asm/div64.h> |
| 39 | |
| 40 | #include <linux/swapops.h> |
| 41 | |
| 42 | /* possible outcome of pageout() */ |
| 43 | typedef enum { |
| 44 | /* failed to write page out, page is locked */ |
| 45 | PAGE_KEEP, |
| 46 | /* move page to the active list, page is locked */ |
| 47 | PAGE_ACTIVATE, |
| 48 | /* page has been sent to the disk successfully, page is unlocked */ |
| 49 | PAGE_SUCCESS, |
| 50 | /* page is clean and locked */ |
| 51 | PAGE_CLEAN, |
| 52 | } pageout_t; |
| 53 | |
| 54 | struct scan_control { |
| 55 | /* Ask refill_inactive_zone() or shrink_cache() to scan this many pages */
| 56 | unsigned long nr_to_scan; |
| 57 | |
| 58 | /* Incremented by the number of inactive pages that were scanned */ |
| 59 | unsigned long nr_scanned; |
| 60 | |
| 61 | /* Incremented by the number of pages reclaimed */ |
| 62 | unsigned long nr_reclaimed; |
| 63 | |
| 64 | unsigned long nr_mapped; /* From page_state */ |
| 65 | |
| 66 | /* How many pages shrink_cache() should reclaim */ |
| 67 | int nr_to_reclaim; |
| 68 | |
| 69 | /* Ask shrink_caches() or shrink_zone() to scan at this priority */
| 70 | unsigned int priority; |
| 71 | |
| 72 | /* This context's GFP mask */ |
| 73 | unsigned int gfp_mask; |
| 74 | |
| 75 | int may_writepage; |
| 76 | |
| 77 | /* Can pages be swapped as part of reclaim? */
| 78 | int may_swap; |
| 79 | |
| 80 | /* This context's SWAP_CLUSTER_MAX. If freeing memory for
| 81 | * suspend, we effectively ignore SWAP_CLUSTER_MAX. |
| 82 | * In this context, it doesn't matter that we scan the |
| 83 | * whole list at once. */ |
| 84 | int swap_cluster_max; |
| 85 | }; |
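/*
 * A scan_control lives on the caller's stack for the duration of one reclaim
 * attempt (see try_to_free_pages(), balance_pgdat() and zone_reclaim() below):
 * callers fill in the policy fields and read nr_scanned/nr_reclaimed back out.
 */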
| 86 | |
| 87 | /* |
| 88 | * The list of shrinker callbacks used to apply pressure to
| 89 | * ageable caches.
| 90 | */ |
| 91 | struct shrinker { |
| 92 | shrinker_t shrinker; |
| 93 | struct list_head list; |
| 94 | int seeks; /* seeks to recreate an obj */ |
| 95 | long nr; /* objs pending delete */ |
| 96 | }; |
| 97 | |
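/*
 * The LRU lists are filled at the head, so lru_to_page() returns the entry at
 * the tail of the list (head->prev): the page that has been on the list longest.
 */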
| 98 | #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru)) |
| 99 | |
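/*
 * While walking an LRU list tail-to-head, these macros prefetch the given
 * field of the page that will be visited next (the lru.prev neighbour of the
 * page currently being handled), unless that neighbour is the list head itself.
 */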
| 100 | #ifdef ARCH_HAS_PREFETCH |
| 101 | #define prefetch_prev_lru_page(_page, _base, _field) \ |
| 102 | do { \ |
| 103 | if ((_page)->lru.prev != _base) { \ |
| 104 | struct page *prev; \ |
| 105 | \ |
| 106 | prev = lru_to_page(&(_page->lru)); \ |
| 107 | prefetch(&prev->_field); \ |
| 108 | } \ |
| 109 | } while (0) |
| 110 | #else |
| 111 | #define prefetch_prev_lru_page(_page, _base, _field) do { } while (0) |
| 112 | #endif |
| 113 | |
| 114 | #ifdef ARCH_HAS_PREFETCHW |
| 115 | #define prefetchw_prev_lru_page(_page, _base, _field) \ |
| 116 | do { \ |
| 117 | if ((_page)->lru.prev != _base) { \ |
| 118 | struct page *prev; \ |
| 119 | \ |
| 120 | prev = lru_to_page(&(_page->lru)); \ |
| 121 | prefetchw(&prev->_field); \ |
| 122 | } \ |
| 123 | } while (0) |
| 124 | #else |
| 125 | #define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0) |
| 126 | #endif |
| 127 | |
| 128 | /* |
| 129 | * From 0 .. 100. Higher means more swappy. |
| 130 | */ |
| 131 | int vm_swappiness = 60; |
| 132 | static long total_memory; |
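/*
 * vm_swappiness is tunable at runtime via /proc/sys/vm/swappiness;
 * total_memory is filled in once at boot by kswapd_init() below.
 */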
| 133 | |
| 134 | static LIST_HEAD(shrinker_list); |
| 135 | static DECLARE_RWSEM(shrinker_rwsem); |
| 136 | |
| 137 | /* |
| 138 | * Add a shrinker callback to be called from the vm |
| 139 | */ |
| 140 | struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker) |
| 141 | { |
| 142 | struct shrinker *shrinker; |
| 143 | |
| 144 | shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL); |
| 145 | if (shrinker) { |
| 146 | shrinker->shrinker = theshrinker; |
| 147 | shrinker->seeks = seeks; |
| 148 | shrinker->nr = 0; |
| 149 | down_write(&shrinker_rwsem); |
| 150 | list_add_tail(&shrinker->list, &shrinker_list); |
| 151 | up_write(&shrinker_rwsem); |
| 152 | } |
| 153 | return shrinker; |
| 154 | } |
| 155 | EXPORT_SYMBOL(set_shrinker); |
| 156 | |
| 157 | /* |
| 158 | * Remove a shrinker from the list and free it
| 159 | */ |
| 160 | void remove_shrinker(struct shrinker *shrinker) |
| 161 | { |
| 162 | down_write(&shrinker_rwsem); |
| 163 | list_del(&shrinker->list); |
| 164 | up_write(&shrinker_rwsem); |
| 165 | kfree(shrinker); |
| 166 | } |
| 167 | EXPORT_SYMBOL(remove_shrinker); |
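/*
 * Illustrative sketch (hypothetical my_* names, not from this file) of the
 * callback contract that shrink_slab() below relies on: called with
 * nr_to_scan == 0 the shrinker only reports how many objects it holds;
 * called with a positive count it frees up to that many objects and returns
 * how many remain; returning -1 tells shrink_slab() to leave this cache
 * alone for the current pass.
 *
 *	static int my_cache_shrink(int nr_to_scan, unsigned int gfp_mask)
 *	{
 *		if (nr_to_scan)
 *			prune_my_cache(nr_to_scan, gfp_mask);
 *		return my_cache_count;
 *	}
 *
 *	my_shrinker = set_shrinker(DEFAULT_SEEKS, my_cache_shrink);
 *	...
 *	remove_shrinker(my_shrinker);
 */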
| 168 | |
| 169 | #define SHRINK_BATCH 128 |
| 170 | /* |
| 171 | * Call the shrink functions to age shrinkable caches |
| 172 | * |
| 173 | * Here we assume it costs one seek to replace a lru page and that it also |
| 174 | * takes a seek to recreate a cache object. With this in mind we age equal |
| 175 | * percentages of the lru and ageable caches. This should balance the seeks |
| 176 | * generated by these structures. |
| 177 | * |
| 178 | * If the VM encountered mapped pages on the LRU it increases the pressure on
| 179 | * slab to avoid swapping. |
| 180 | * |
| 181 | * We do weird things to avoid (scanned*seeks*entries) overflowing 32 bits. |
| 182 | * |
| 183 | * `lru_pages' represents the number of on-LRU pages in all the zones which |
| 184 | * are eligible for the caller's allocation attempt. It is used for balancing |
| 185 | * slab reclaim versus page reclaim. |
| 186 | *
| 187 | * Returns the number of slab objects which we shrunk.
| 188 | */
| 189 | static int shrink_slab(unsigned long scanned, unsigned int gfp_mask, |
| 190 | unsigned long lru_pages) |
| 191 | { |
| 192 | struct shrinker *shrinker; |
| 193 | int ret = 0;
| 194 |
| 195 | if (scanned == 0) |
| 196 | scanned = SWAP_CLUSTER_MAX; |
| 197 | |
| 198 | if (!down_read_trylock(&shrinker_rwsem)) |
| 199 | return 1; /* Assume we'll be able to shrink next time */
| 200 |
| 201 | list_for_each_entry(shrinker, &shrinker_list, list) { |
| 202 | unsigned long long delta; |
| 203 | unsigned long total_scan; |
| 204 | |
| 205 | delta = (4 * scanned) / shrinker->seeks; |
| 206 | delta *= (*shrinker->shrinker)(0, gfp_mask); |
| 207 | do_div(delta, lru_pages + 1); |
| 208 | shrinker->nr += delta; |
| 209 | if (shrinker->nr < 0) |
| 210 | shrinker->nr = LONG_MAX; /* It wrapped! */ |
| 211 | |
| 212 | total_scan = shrinker->nr; |
| 213 | shrinker->nr = 0; |
| 214 | |
| 215 | while (total_scan >= SHRINK_BATCH) { |
| 216 | long this_scan = SHRINK_BATCH; |
| 217 | int shrink_ret; |
| 218 | int nr_before;
| 219 |
| 220 | nr_before = (*shrinker->shrinker)(0, gfp_mask);
| 221 | shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
| 222 | if (shrink_ret == -1) |
| 223 | break; |
| 224 | if (shrink_ret < nr_before)
| 225 | ret += nr_before - shrink_ret;
| 226 | mod_page_state(slabs_scanned, this_scan);
| 227 | total_scan -= this_scan; |
| 228 | |
| 229 | cond_resched(); |
| 230 | } |
| 231 | |
| 232 | shrinker->nr += total_scan; |
| 233 | } |
| 234 | up_read(&shrinker_rwsem); |
| 235 | return ret;
| 236 | }
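/*
 * Worked example for the arithmetic in shrink_slab() (illustrative numbers,
 * not from the source): scanned = 1000 LRU pages, lru_pages = 99999, and a
 * shrinker with seeks = 2 reporting 10000 objects gives
 *
 *	delta = (4 * 1000 / 2) * 10000 / (99999 + 1) = 200
 *
 * so shrinker->nr grows by 200 objects; one batch of SHRINK_BATCH (128) is
 * scanned now and the remaining 72 carry over to the next call.  A larger
 * `seeks' value shrinks `delta' proportionally, so caches whose objects are
 * expensive to recreate are scanned less aggressively.
 */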
| 237 | |
| 238 | /* Called without holding a lock that would stabilize page_mapped(), so the answer is unstable */
| 239 | static inline int page_mapping_inuse(struct page *page) |
| 240 | { |
| 241 | struct address_space *mapping; |
| 242 | |
| 243 | /* Page is in somebody's page tables. */ |
| 244 | if (page_mapped(page)) |
| 245 | return 1; |
| 246 | |
| 247 | /* Be more reluctant to reclaim swapcache than pagecache */ |
| 248 | if (PageSwapCache(page)) |
| 249 | return 1; |
| 250 | |
| 251 | mapping = page_mapping(page); |
| 252 | if (!mapping) |
| 253 | return 0; |
| 254 | |
| 255 | /* File is mmap'd by somebody? */ |
| 256 | return mapping_mapped(mapping); |
| 257 | } |
| 258 | |
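/*
 * A pagecache page is considered freeable when, ignoring one reference
 * attributed to attached buffers (PagePrivate), exactly two references
 * remain: the page cache's own and the one taken by the isolating caller
 * (see isolate_lru_pages()).
 */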
| 259 | static inline int is_page_cache_freeable(struct page *page) |
| 260 | { |
| 261 | return page_count(page) - !!PagePrivate(page) == 2; |
| 262 | } |
| 263 | |
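/*
 * Reclaim only starts writeback against a queue that is not write-congested,
 * with these exceptions: kswapd and pdflush must always be able to queue
 * writes to make progress, and a task may write to the queue it is itself
 * dirtying against (current->backing_dev_info), where blocking is acceptable.
 */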
| 264 | static int may_write_to_queue(struct backing_dev_info *bdi) |
| 265 | { |
| 266 | if (current_is_kswapd()) |
| 267 | return 1; |
| 268 | if (current_is_pdflush()) /* This is unlikely, but why not... */ |
| 269 | return 1; |
| 270 | if (!bdi_write_congested(bdi)) |
| 271 | return 1; |
| 272 | if (bdi == current->backing_dev_info) |
| 273 | return 1; |
| 274 | return 0; |
| 275 | } |
| 276 | |
| 277 | /* |
| 278 | * We detected a synchronous write error writing a page out. Probably |
| 279 | * -ENOSPC. We need to propagate that into the address_space for a subsequent |
| 280 | * fsync(), msync() or close(). |
| 281 | * |
| 282 | * The tricky part is that after writepage we cannot touch the mapping: nothing |
| 283 | * prevents it from being freed up. But we have a ref on the page and once |
| 284 | * that page is locked, the mapping is pinned. |
| 285 | * |
| 286 | * We're allowed to run sleeping lock_page() here because we know the caller has |
| 287 | * __GFP_FS. |
| 288 | */ |
| 289 | static void handle_write_error(struct address_space *mapping, |
| 290 | struct page *page, int error) |
| 291 | { |
| 292 | lock_page(page); |
| 293 | if (page_mapping(page) == mapping) { |
| 294 | if (error == -ENOSPC) |
| 295 | set_bit(AS_ENOSPC, &mapping->flags); |
| 296 | else |
| 297 | set_bit(AS_EIO, &mapping->flags); |
| 298 | } |
| 299 | unlock_page(page); |
| 300 | } |
| 301 | |
| 302 | /* |
| 303 | * pageout is called by shrink_list() for each dirty page. Calls ->writepage(). |
| 304 | */ |
| 305 | static pageout_t pageout(struct page *page, struct address_space *mapping) |
| 306 | { |
| 307 | /* |
| 308 | * If the page is dirty, only perform writeback if that write |
| 309 | * will be non-blocking, to prevent this allocation from being
| 310 | * stalled by pagecache activity. But note that there may be |
| 311 | * stalls if we need to run get_block(). We could test |
| 312 | * PagePrivate for that. |
| 313 | * |
| 314 | * If this process is currently in generic_file_write() against |
| 315 | * this page's queue, we can perform writeback even if that |
| 316 | * will block. |
| 317 | * |
| 318 | * If the page is swapcache, write it back even if that would |
| 319 | * block, for some throttling. This happens by accident, because |
| 320 | * swap_backing_dev_info is bust: it doesn't reflect the |
| 321 | * congestion state of the swapdevs. Easy to fix, if needed. |
| 322 | * See swapfile.c:page_queue_congested(). |
| 323 | */ |
| 324 | if (!is_page_cache_freeable(page)) |
| 325 | return PAGE_KEEP; |
| 326 | if (!mapping) { |
| 327 | /* |
| 328 | * Some data journaling orphaned pages can have |
| 329 | * page->mapping == NULL while being dirty with clean buffers. |
| 330 | */ |
| 331 | if (PagePrivate(page)) {
| 332 | if (try_to_free_buffers(page)) {
| 333 | ClearPageDirty(page); |
| 334 | printk("%s: orphaned page\n", __FUNCTION__); |
| 335 | return PAGE_CLEAN; |
| 336 | } |
| 337 | } |
| 338 | return PAGE_KEEP; |
| 339 | } |
| 340 | if (mapping->a_ops->writepage == NULL) |
| 341 | return PAGE_ACTIVATE; |
| 342 | if (!may_write_to_queue(mapping->backing_dev_info)) |
| 343 | return PAGE_KEEP; |
| 344 | |
| 345 | if (clear_page_dirty_for_io(page)) { |
| 346 | int res; |
| 347 | struct writeback_control wbc = { |
| 348 | .sync_mode = WB_SYNC_NONE, |
| 349 | .nr_to_write = SWAP_CLUSTER_MAX, |
| 350 | .nonblocking = 1, |
| 351 | .for_reclaim = 1, |
| 352 | }; |
| 353 | |
| 354 | SetPageReclaim(page); |
| 355 | res = mapping->a_ops->writepage(page, &wbc); |
| 356 | if (res < 0) |
| 357 | handle_write_error(mapping, page, res); |
| 358 | if (res == WRITEPAGE_ACTIVATE) { |
| 359 | ClearPageReclaim(page); |
| 360 | return PAGE_ACTIVATE; |
| 361 | } |
| 362 | if (!PageWriteback(page)) { |
| 363 | /* synchronous write or broken a_ops? */ |
| 364 | ClearPageReclaim(page); |
| 365 | } |
| 366 | |
| 367 | return PAGE_SUCCESS; |
| 368 | } |
| 369 | |
| 370 | return PAGE_CLEAN; |
| 371 | } |
| 372 | |
| 373 | /* |
| 374 | * shrink_list adds the number of reclaimed pages to sc->nr_reclaimed |
| 375 | */ |
| 376 | static int shrink_list(struct list_head *page_list, struct scan_control *sc) |
| 377 | { |
| 378 | LIST_HEAD(ret_pages); |
| 379 | struct pagevec freed_pvec; |
| 380 | int pgactivate = 0; |
| 381 | int reclaimed = 0; |
| 382 | |
| 383 | cond_resched(); |
| 384 | |
| 385 | pagevec_init(&freed_pvec, 1); |
| 386 | while (!list_empty(page_list)) { |
| 387 | struct address_space *mapping; |
| 388 | struct page *page; |
| 389 | int may_enter_fs; |
| 390 | int referenced; |
| 391 | |
| 392 | cond_resched(); |
| 393 | |
| 394 | page = lru_to_page(page_list); |
| 395 | list_del(&page->lru); |
| 396 | |
| 397 | if (TestSetPageLocked(page)) |
| 398 | goto keep; |
| 399 | |
| 400 | BUG_ON(PageActive(page)); |
| 401 | |
| 402 | sc->nr_scanned++; |
| 403 | /* Double the slab pressure for mapped and swapcache pages */ |
| 404 | if (page_mapped(page) || PageSwapCache(page)) |
| 405 | sc->nr_scanned++; |
| 406 | |
| 407 | if (PageWriteback(page)) |
| 408 | goto keep_locked; |
| 409 | |
| 410 | referenced = page_referenced(page, 1, sc->priority <= 0); |
| 411 | /* In active use or really unfreeable? Activate it. */ |
| 412 | if (referenced && page_mapping_inuse(page)) |
| 413 | goto activate_locked; |
| 414 | |
| 415 | #ifdef CONFIG_SWAP |
| 416 | /* |
| 417 | * Anonymous process memory has backing store? |
| 418 | * Try to allocate it some swap space here. |
| 419 | */ |
| 420 | if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
| 421 | if (!add_to_swap(page))
| 422 | goto activate_locked; |
| 423 | } |
| 424 | #endif /* CONFIG_SWAP */ |
| 425 | |
| 426 | mapping = page_mapping(page); |
| 427 | may_enter_fs = (sc->gfp_mask & __GFP_FS) || |
| 428 | (PageSwapCache(page) && (sc->gfp_mask & __GFP_IO)); |
| 429 | |
| 430 | /* |
| 431 | * The page is mapped into the page tables of one or more |
| 432 | * processes. Try to unmap it here. |
| 433 | */ |
| 434 | if (page_mapped(page) && mapping) { |
| 435 | switch (try_to_unmap(page)) { |
| 436 | case SWAP_FAIL: |
| 437 | goto activate_locked; |
| 438 | case SWAP_AGAIN: |
| 439 | goto keep_locked; |
| 440 | case SWAP_SUCCESS: |
| 441 | ; /* try to free the page below */ |
| 442 | } |
| 443 | } |
| 444 | |
| 445 | if (PageDirty(page)) { |
| 446 | if (referenced) |
| 447 | goto keep_locked; |
| 448 | if (!may_enter_fs) |
| 449 | goto keep_locked; |
| 450 | if (laptop_mode && !sc->may_writepage) |
| 451 | goto keep_locked; |
| 452 | |
| 453 | /* Page is dirty, try to write it out here */ |
| 454 | switch(pageout(page, mapping)) { |
| 455 | case PAGE_KEEP: |
| 456 | goto keep_locked; |
| 457 | case PAGE_ACTIVATE: |
| 458 | goto activate_locked; |
| 459 | case PAGE_SUCCESS: |
| 460 | if (PageWriteback(page) || PageDirty(page)) |
| 461 | goto keep; |
| 462 | /* |
| 463 | * A synchronous write - probably a ramdisk. Go |
| 464 | * ahead and try to reclaim the page. |
| 465 | */ |
| 466 | if (TestSetPageLocked(page)) |
| 467 | goto keep; |
| 468 | if (PageDirty(page) || PageWriteback(page)) |
| 469 | goto keep_locked; |
| 470 | mapping = page_mapping(page); |
| 471 | case PAGE_CLEAN: |
| 472 | ; /* try to free the page below */ |
| 473 | } |
| 474 | } |
| 475 | |
| 476 | /* |
| 477 | * If the page has buffers, try to free the buffer mappings |
| 478 | * associated with this page. If we succeed we try to free |
| 479 | * the page as well. |
| 480 | * |
| 481 | * We do this even if the page is PageDirty(). |
| 482 | * try_to_release_page() does not perform I/O, but it is |
| 483 | * possible for a page to have PageDirty set, but it is actually |
| 484 | * clean (all its buffers are clean). This happens if the |
| 485 | * buffers were written out directly, with submit_bh(). ext3 |
| 486 | * will do this, as well as the blockdev mapping. |
| 487 | * try_to_release_page() will discover that cleanness and will |
| 488 | * drop the buffers and mark the page clean - it can be freed. |
| 489 | * |
| 490 | * Rarely, pages can have buffers and no ->mapping. These are |
| 491 | * the pages which were not successfully invalidated in |
| 492 | * truncate_complete_page(). We try to drop those buffers here |
| 493 | * and if that worked, and the page is no longer mapped into |
| 494 | * process address space (page_count == 1) it can be freed. |
| 495 | * Otherwise, leave the page on the LRU so it is swappable. |
| 496 | */ |
| 497 | if (PagePrivate(page)) { |
| 498 | if (!try_to_release_page(page, sc->gfp_mask)) |
| 499 | goto activate_locked; |
| 500 | if (!mapping && page_count(page) == 1) |
| 501 | goto free_it; |
| 502 | } |
| 503 | |
| 504 | if (!mapping) |
| 505 | goto keep_locked; /* truncate got there first */ |
| 506 | |
| 507 | write_lock_irq(&mapping->tree_lock); |
| 508 | |
| 509 | /* |
| 510 | * The non-racy check for busy page. It is critical to check |
| 511 | * PageDirty _after_ making sure that the page is freeable and |
| 512 | * not in use by anybody. (pagecache + us == 2) |
| 513 | */ |
| 514 | if (page_count(page) != 2 || PageDirty(page)) { |
| 515 | write_unlock_irq(&mapping->tree_lock); |
| 516 | goto keep_locked; |
| 517 | } |
| 518 | |
| 519 | #ifdef CONFIG_SWAP |
| 520 | if (PageSwapCache(page)) { |
| 521 | swp_entry_t swap = { .val = page->private }; |
| 522 | __delete_from_swap_cache(page); |
| 523 | write_unlock_irq(&mapping->tree_lock); |
| 524 | swap_free(swap); |
| 525 | __put_page(page); /* The pagecache ref */ |
| 526 | goto free_it; |
| 527 | } |
| 528 | #endif /* CONFIG_SWAP */ |
| 529 | |
| 530 | __remove_from_page_cache(page); |
| 531 | write_unlock_irq(&mapping->tree_lock); |
| 532 | __put_page(page); |
| 533 | |
| 534 | free_it: |
| 535 | unlock_page(page); |
| 536 | reclaimed++; |
| 537 | if (!pagevec_add(&freed_pvec, page)) |
| 538 | __pagevec_release_nonlru(&freed_pvec); |
| 539 | continue; |
| 540 | |
| 541 | activate_locked: |
| 542 | SetPageActive(page); |
| 543 | pgactivate++; |
| 544 | keep_locked: |
| 545 | unlock_page(page); |
| 546 | keep: |
| 547 | list_add(&page->lru, &ret_pages); |
| 548 | BUG_ON(PageLRU(page)); |
| 549 | } |
| 550 | list_splice(&ret_pages, page_list); |
| 551 | if (pagevec_count(&freed_pvec)) |
| 552 | __pagevec_release_nonlru(&freed_pvec); |
| 553 | mod_page_state(pgactivate, pgactivate); |
| 554 | sc->nr_reclaimed += reclaimed; |
| 555 | return reclaimed; |
| 556 | } |
| 557 | |
| 558 | /* |
| 559 | * zone->lru_lock is heavily contended. Some of the functions that |
| 560 | * shrink the lists perform better by taking out a batch of pages |
| 561 | * and working on them outside the LRU lock. |
| 562 | * |
| 563 | * For pagecache intensive workloads, this function is the hottest |
| 564 | * spot in the kernel (apart from copy_*_user functions). |
| 565 | * |
| 566 | * Appropriate locks must be held before calling this function. |
| 567 | * |
| 568 | * @nr_to_scan: The number of pages to look through on the list. |
| 569 | * @src: The LRU list to pull pages off. |
| 570 | * @dst: The temp list to put pages on to. |
| 571 | * @scanned: The number of pages that were scanned. |
| 572 | * |
| 573 | * returns how many pages were moved onto *@dst. |
| 574 | */ |
| 575 | static int isolate_lru_pages(int nr_to_scan, struct list_head *src, |
| 576 | struct list_head *dst, int *scanned) |
| 577 | { |
| 578 | int nr_taken = 0; |
| 579 | struct page *page; |
| 580 | int scan = 0; |
| 581 | |
| 582 | while (scan++ < nr_to_scan && !list_empty(src)) { |
| 583 | page = lru_to_page(src); |
| 584 | prefetchw_prev_lru_page(page, src, flags); |
| 585 | |
| 586 | if (!TestClearPageLRU(page)) |
| 587 | BUG(); |
| 588 | list_del(&page->lru); |
| 589 | if (get_page_testone(page)) { |
| 590 | /* |
| 591 | * It is being freed elsewhere |
| 592 | */ |
| 593 | __put_page(page); |
| 594 | SetPageLRU(page); |
| 595 | list_add(&page->lru, src); |
| 596 | continue; |
| 597 | } else { |
| 598 | list_add(&page->lru, dst); |
| 599 | nr_taken++; |
| 600 | } |
| 601 | } |
| 602 | |
| 603 | *scanned = scan; |
| 604 | return nr_taken; |
| 605 | } |
| 606 | |
| 607 | /* |
| 608 | * shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed |
| 609 | */ |
| 610 | static void shrink_cache(struct zone *zone, struct scan_control *sc) |
| 611 | { |
| 612 | LIST_HEAD(page_list); |
| 613 | struct pagevec pvec; |
| 614 | int max_scan = sc->nr_to_scan; |
| 615 | |
| 616 | pagevec_init(&pvec, 1); |
| 617 | |
| 618 | lru_add_drain(); |
| 619 | spin_lock_irq(&zone->lru_lock); |
| 620 | while (max_scan > 0) { |
| 621 | struct page *page; |
| 622 | int nr_taken; |
| 623 | int nr_scan; |
| 624 | int nr_freed; |
| 625 | |
| 626 | nr_taken = isolate_lru_pages(sc->swap_cluster_max, |
| 627 | &zone->inactive_list, |
| 628 | &page_list, &nr_scan); |
| 629 | zone->nr_inactive -= nr_taken; |
| 630 | zone->pages_scanned += nr_scan; |
| 631 | spin_unlock_irq(&zone->lru_lock); |
| 632 | |
| 633 | if (nr_taken == 0) |
| 634 | goto done; |
| 635 | |
| 636 | max_scan -= nr_scan; |
| 637 | if (current_is_kswapd()) |
| 638 | mod_page_state_zone(zone, pgscan_kswapd, nr_scan); |
| 639 | else |
| 640 | mod_page_state_zone(zone, pgscan_direct, nr_scan); |
| 641 | nr_freed = shrink_list(&page_list, sc); |
| 642 | if (current_is_kswapd()) |
| 643 | mod_page_state(kswapd_steal, nr_freed); |
| 644 | mod_page_state_zone(zone, pgsteal, nr_freed); |
| 645 | sc->nr_to_reclaim -= nr_freed; |
| 646 | |
| 647 | spin_lock_irq(&zone->lru_lock); |
| 648 | /* |
| 649 | * Put back any unfreeable pages. |
| 650 | */ |
| 651 | while (!list_empty(&page_list)) { |
| 652 | page = lru_to_page(&page_list); |
| 653 | if (TestSetPageLRU(page)) |
| 654 | BUG(); |
| 655 | list_del(&page->lru); |
| 656 | if (PageActive(page)) |
| 657 | add_page_to_active_list(zone, page); |
| 658 | else |
| 659 | add_page_to_inactive_list(zone, page); |
| 660 | if (!pagevec_add(&pvec, page)) { |
| 661 | spin_unlock_irq(&zone->lru_lock); |
| 662 | __pagevec_release(&pvec); |
| 663 | spin_lock_irq(&zone->lru_lock); |
| 664 | } |
| 665 | } |
| 666 | } |
| 667 | spin_unlock_irq(&zone->lru_lock); |
| 668 | done: |
| 669 | pagevec_release(&pvec); |
| 670 | } |
| 671 | |
| 672 | /* |
| 673 | * This moves pages from the active list to the inactive list. |
| 674 | * |
| 675 | * We move them the other way if the page is referenced by one or more |
| 676 | * processes, from rmap. |
| 677 | * |
| 678 | * If the pages are mostly unmapped, the processing is fast and it is |
| 679 | * appropriate to hold zone->lru_lock across the whole operation. But if |
| 680 | * the pages are mapped, the processing is slow (page_referenced()) so we |
| 681 | * should drop zone->lru_lock around each page. It's impossible to balance |
| 682 | * this, so instead we remove the pages from the LRU while processing them. |
| 683 | * It is safe to rely on PG_active against the non-LRU pages in here because |
| 684 | * nobody will play with that bit on a non-LRU page. |
| 685 | * |
| 686 | * The downside is that we have to touch page->_count against each page. |
| 687 | * But we had to alter page->flags anyway. |
| 688 | */ |
| 689 | static void |
| 690 | refill_inactive_zone(struct zone *zone, struct scan_control *sc) |
| 691 | { |
| 692 | int pgmoved; |
| 693 | int pgdeactivate = 0; |
| 694 | int pgscanned; |
| 695 | int nr_pages = sc->nr_to_scan; |
| 696 | LIST_HEAD(l_hold); /* The pages which were snipped off */ |
| 697 | LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */ |
| 698 | LIST_HEAD(l_active); /* Pages to go onto the active_list */ |
| 699 | struct page *page; |
| 700 | struct pagevec pvec; |
| 701 | int reclaim_mapped = 0; |
| 702 | long mapped_ratio; |
| 703 | long distress; |
| 704 | long swap_tendency; |
| 705 | |
| 706 | lru_add_drain(); |
| 707 | spin_lock_irq(&zone->lru_lock); |
| 708 | pgmoved = isolate_lru_pages(nr_pages, &zone->active_list, |
| 709 | &l_hold, &pgscanned); |
| 710 | zone->pages_scanned += pgscanned; |
| 711 | zone->nr_active -= pgmoved; |
| 712 | spin_unlock_irq(&zone->lru_lock); |
| 713 | |
| 714 | /* |
| 715 | * `distress' is a measure of how much trouble we're having reclaiming |
| 716 | * pages. 0 -> no problems. 100 -> great trouble. |
| 717 | */ |
| 718 | distress = 100 >> zone->prev_priority; |
| 719 | |
| 720 | /* |
| 721 | * The point of this algorithm is to decide when to start reclaiming |
| 722 | * mapped memory instead of just pagecache. Work out how much memory |
| 723 | * is mapped. |
| 724 | */ |
| 725 | mapped_ratio = (sc->nr_mapped * 100) / total_memory; |
| 726 | |
| 727 | /* |
| 728 | * Now decide how much we really want to unmap some pages. The mapped |
| 729 | * ratio is downgraded - just because there's a lot of mapped memory |
| 730 | * doesn't necessarily mean that page reclaim isn't succeeding. |
| 731 | * |
| 732 | * The distress ratio is important - we don't want to start going oom. |
| 733 | * |
| 734 | * A 100% value of vm_swappiness overrides this algorithm altogether. |
| 735 | */ |
| 736 | swap_tendency = mapped_ratio / 2 + distress + vm_swappiness; |
| 737 | |
| 738 | /* |
| 739 | * Now use this metric to decide whether to start moving mapped memory |
| 740 | * onto the inactive list. |
| 741 | */ |
| 742 | if (swap_tendency >= 100) |
| 743 | reclaim_mapped = 1; |
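	/*
	 * Worked example (illustrative numbers, not from the source): with
	 * vm_swappiness = 60 and half of memory mapped (mapped_ratio = 50),
	 * an unstressed zone (prev_priority == DEF_PRIORITY) has distress = 0,
	 * so swap_tendency = 25 + 0 + 60 = 85 < 100 and mapped pages stay on
	 * the active list.  Once reclaim has fought its way down to
	 * prev_priority = 0, distress = 100 and swap_tendency = 185, so mapped
	 * pages are deactivated too.  With vm_swappiness = 100 the sum reaches
	 * 100 for any mapped_ratio, which is the "overrides" case above.
	 */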
| 744 | |
| 745 | while (!list_empty(&l_hold)) { |
| 746 | cond_resched(); |
| 747 | page = lru_to_page(&l_hold); |
| 748 | list_del(&page->lru); |
| 749 | if (page_mapped(page)) { |
| 750 | if (!reclaim_mapped || |
| 751 | (total_swap_pages == 0 && PageAnon(page)) || |
| 752 | page_referenced(page, 0, sc->priority <= 0)) { |
| 753 | list_add(&page->lru, &l_active); |
| 754 | continue; |
| 755 | } |
| 756 | } |
| 757 | list_add(&page->lru, &l_inactive); |
| 758 | } |
| 759 | |
| 760 | pagevec_init(&pvec, 1); |
| 761 | pgmoved = 0; |
| 762 | spin_lock_irq(&zone->lru_lock); |
| 763 | while (!list_empty(&l_inactive)) { |
| 764 | page = lru_to_page(&l_inactive); |
| 765 | prefetchw_prev_lru_page(page, &l_inactive, flags); |
| 766 | if (TestSetPageLRU(page)) |
| 767 | BUG(); |
| 768 | if (!TestClearPageActive(page)) |
| 769 | BUG(); |
| 770 | list_move(&page->lru, &zone->inactive_list); |
| 771 | pgmoved++; |
| 772 | if (!pagevec_add(&pvec, page)) { |
| 773 | zone->nr_inactive += pgmoved; |
| 774 | spin_unlock_irq(&zone->lru_lock); |
| 775 | pgdeactivate += pgmoved; |
| 776 | pgmoved = 0; |
| 777 | if (buffer_heads_over_limit) |
| 778 | pagevec_strip(&pvec); |
| 779 | __pagevec_release(&pvec); |
| 780 | spin_lock_irq(&zone->lru_lock); |
| 781 | } |
| 782 | } |
| 783 | zone->nr_inactive += pgmoved; |
| 784 | pgdeactivate += pgmoved; |
| 785 | if (buffer_heads_over_limit) { |
| 786 | spin_unlock_irq(&zone->lru_lock); |
| 787 | pagevec_strip(&pvec); |
| 788 | spin_lock_irq(&zone->lru_lock); |
| 789 | } |
| 790 | |
| 791 | pgmoved = 0; |
| 792 | while (!list_empty(&l_active)) { |
| 793 | page = lru_to_page(&l_active); |
| 794 | prefetchw_prev_lru_page(page, &l_active, flags); |
| 795 | if (TestSetPageLRU(page)) |
| 796 | BUG(); |
| 797 | BUG_ON(!PageActive(page)); |
| 798 | list_move(&page->lru, &zone->active_list); |
| 799 | pgmoved++; |
| 800 | if (!pagevec_add(&pvec, page)) { |
| 801 | zone->nr_active += pgmoved; |
| 802 | pgmoved = 0; |
| 803 | spin_unlock_irq(&zone->lru_lock); |
| 804 | __pagevec_release(&pvec); |
| 805 | spin_lock_irq(&zone->lru_lock); |
| 806 | } |
| 807 | } |
| 808 | zone->nr_active += pgmoved; |
| 809 | spin_unlock_irq(&zone->lru_lock); |
| 810 | pagevec_release(&pvec); |
| 811 | |
| 812 | mod_page_state_zone(zone, pgrefill, pgscanned); |
| 813 | mod_page_state(pgdeactivate, pgdeactivate); |
| 814 | } |
| 815 | |
| 816 | /* |
| 817 | * This is a basic per-zone page freer. Used by both kswapd and direct reclaim. |
| 818 | */ |
| 819 | static void |
| 820 | shrink_zone(struct zone *zone, struct scan_control *sc) |
| 821 | { |
| 822 | unsigned long nr_active; |
| 823 | unsigned long nr_inactive; |
| 824 | |
| 825 | atomic_inc(&zone->reclaim_in_progress);
| 826 |
| 827 | /*
| 828 | * Add one to the zone's scan counters below just to make sure that the
| 829 | * kernel will slowly sift through the active list.
| 830 | */ |
| 831 | zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1; |
| 832 | nr_active = zone->nr_scan_active; |
| 833 | if (nr_active >= sc->swap_cluster_max) |
| 834 | zone->nr_scan_active = 0; |
| 835 | else |
| 836 | nr_active = 0; |
| 837 | |
| 838 | zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1; |
| 839 | nr_inactive = zone->nr_scan_inactive; |
| 840 | if (nr_inactive >= sc->swap_cluster_max) |
| 841 | zone->nr_scan_inactive = 0; |
| 842 | else |
| 843 | nr_inactive = 0; |
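	/*
	 * Note (summary of the accumulation above): zone->nr_scan_active and
	 * zone->nr_scan_inactive accumulate "list_size >> priority" worth of
	 * scan credit on every call, and a list is only actually scanned once
	 * its credit reaches swap_cluster_max.  Light pressure therefore still
	 * ages both lists, just in larger and less frequent batches.
	 */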
| 844 | |
| 845 | sc->nr_to_reclaim = sc->swap_cluster_max; |
| 846 | |
| 847 | while (nr_active || nr_inactive) { |
| 848 | if (nr_active) { |
| 849 | sc->nr_to_scan = min(nr_active, |
| 850 | (unsigned long)sc->swap_cluster_max); |
| 851 | nr_active -= sc->nr_to_scan; |
| 852 | refill_inactive_zone(zone, sc); |
| 853 | } |
| 854 | |
| 855 | if (nr_inactive) { |
| 856 | sc->nr_to_scan = min(nr_inactive, |
| 857 | (unsigned long)sc->swap_cluster_max); |
| 858 | nr_inactive -= sc->nr_to_scan; |
| 859 | shrink_cache(zone, sc); |
| 860 | if (sc->nr_to_reclaim <= 0) |
| 861 | break; |
| 862 | } |
| 863 | } |
| 864 | |
| 865 | throttle_vm_writeout(); |
| 866 |
| 867 | atomic_dec(&zone->reclaim_in_progress);
| 868 | }
| 869 | |
| 870 | /* |
| 871 | * This is the direct reclaim path, for page-allocating processes. We only |
| 872 | * try to reclaim pages from zones which will satisfy the caller's allocation |
| 873 | * request. |
| 874 | * |
| 875 | * We reclaim from a zone even if that zone is over pages_high. Because: |
| 876 | * a) The caller may be trying to free *extra* pages to satisfy a higher-order |
| 877 | * allocation or |
| 878 | * b) The zones may be over pages_high but they must go *over* pages_high to |
| 879 | * satisfy the `incremental min' zone defense algorithm. |
| 880 | * |
| 881 | * Returns the number of reclaimed pages. |
| 882 | * |
| 883 | * If a zone is deemed to be full of pinned pages then just give it a light |
| 884 | * scan then give up on it. |
| 885 | */ |
| 886 | static void |
| 887 | shrink_caches(struct zone **zones, struct scan_control *sc) |
| 888 | { |
| 889 | int i; |
| 890 | |
| 891 | for (i = 0; zones[i] != NULL; i++) { |
| 892 | struct zone *zone = zones[i]; |
| 893 | |
| 894 | if (zone->present_pages == 0) |
| 895 | continue; |
| 896 | |
| 897 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
| 898 | continue;
| 899 | |
| 900 | zone->temp_priority = sc->priority; |
| 901 | if (zone->prev_priority > sc->priority) |
| 902 | zone->prev_priority = sc->priority; |
| 903 | |
| 904 | if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY) |
| 905 | continue; /* Let kswapd poll it */ |
| 906 | |
| 907 | shrink_zone(zone, sc); |
| 908 | } |
| 909 | } |
| 910 | |
| 911 | /* |
| 912 | * This is the main entry point to direct page reclaim. |
| 913 | * |
| 914 | * If a full scan of the inactive list fails to free enough memory then we |
| 915 | * are "out of memory" and something needs to be killed. |
| 916 | * |
| 917 | * If the caller is !__GFP_FS then the probability of a failure is reasonably |
| 918 | * high - the zone may be full of dirty or under-writeback pages, which this |
| 919 | * caller can't do much about. We kick pdflush and take explicit naps in the |
| 920 | * hope that some of these pages can be written. But if the allocating task |
| 921 | * holds filesystem locks which prevent writeout this might not work, and the |
| 922 | * allocation attempt will fail. |
| 923 | */ |
| 924 | int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
| 925 | {
| 926 | int priority; |
| 927 | int ret = 0; |
| 928 | int total_scanned = 0, total_reclaimed = 0; |
| 929 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 930 | struct scan_control sc; |
| 931 | unsigned long lru_pages = 0; |
| 932 | int i; |
| 933 | |
| 934 | sc.gfp_mask = gfp_mask; |
| 935 | sc.may_writepage = 0; |
| 936 | sc.may_swap = 1;
| 937 |
| 938 | inc_page_state(allocstall); |
| 939 | |
| 940 | for (i = 0; zones[i] != NULL; i++) { |
| 941 | struct zone *zone = zones[i]; |
| 942 | |
| 943 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
| 944 | continue;
| 945 | |
| 946 | zone->temp_priority = DEF_PRIORITY; |
| 947 | lru_pages += zone->nr_active + zone->nr_inactive; |
| 948 | } |
| 949 | |
| 950 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 951 | sc.nr_mapped = read_page_state(nr_mapped); |
| 952 | sc.nr_scanned = 0; |
| 953 | sc.nr_reclaimed = 0; |
| 954 | sc.priority = priority; |
| 955 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; |
| 956 | shrink_caches(zones, &sc); |
| 957 | shrink_slab(sc.nr_scanned, gfp_mask, lru_pages); |
| 958 | if (reclaim_state) { |
| 959 | sc.nr_reclaimed += reclaim_state->reclaimed_slab; |
| 960 | reclaim_state->reclaimed_slab = 0; |
| 961 | } |
| 962 | total_scanned += sc.nr_scanned; |
| 963 | total_reclaimed += sc.nr_reclaimed; |
| 964 | if (total_reclaimed >= sc.swap_cluster_max) { |
| 965 | ret = 1; |
| 966 | goto out; |
| 967 | } |
| 968 | |
| 969 | /* |
| 970 | * Try to write back as many pages as we just scanned. This |
| 971 | * tends to cause slow streaming writers to write data to the |
| 972 | * disk smoothly, at the dirtying rate, which is nice. But |
| 973 | * that's undesirable in laptop mode, where we *want* lumpy |
| 974 | * writeout. So in laptop mode, write out the whole world. |
| 975 | */ |
| 976 | if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) { |
| 977 | wakeup_pdflush(laptop_mode ? 0 : total_scanned);
| 978 | sc.may_writepage = 1;
| 979 | } |
| 980 | |
| 981 | /* Take a nap, wait for some writeback to complete */ |
| 982 | if (sc.nr_scanned && priority < DEF_PRIORITY - 2) |
| 983 | blk_congestion_wait(WRITE, HZ/10); |
| 984 | } |
| 985 | out: |
| 986 | for (i = 0; zones[i] != 0; i++) { |
| 987 | struct zone *zone = zones[i]; |
| 988 | |
| 989 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
| 990 | continue;
| 991 | |
| 992 | zone->prev_priority = zone->temp_priority; |
| 993 | } |
| 994 | return ret; |
| 995 | } |
| 996 | |
| 997 | /* |
| 998 | * For kswapd, balance_pgdat() will work across all this node's zones until |
| 999 | * they are all at pages_high. |
| 1000 | * |
| 1001 | * If `nr_pages' is non-zero then it is the number of pages which are to be |
| 1002 | * reclaimed, regardless of the zone occupancies. This is a software suspend |
| 1003 | * special. |
| 1004 | * |
| 1005 | * Returns the number of pages which were actually freed. |
| 1006 | * |
| 1007 | * There is special handling here for zones which are full of pinned pages. |
| 1008 | * This can happen if the pages are all mlocked, or if they are all used by |
| 1009 | * device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb. |
| 1010 | * What we do is to detect the case where all pages in the zone have been |
| 1011 | * scanned twice and there has been zero successful reclaim. Mark the zone as |
| 1012 | * dead and from now on, only perform a short scan. Basically we're polling |
| 1013 | * the zone for when the problem goes away. |
| 1014 | * |
| 1015 | * kswapd scans the zones in the highmem->normal->dma direction. It skips |
| 1016 | * zones which have free_pages > pages_high, but once a zone is found to have |
| 1017 | * free_pages <= pages_high, we scan that zone and the lower zones regardless |
| 1018 | * of the number of free pages in the lower zones. This interoperates with |
| 1019 | * the page allocator fallback scheme to ensure that aging of pages is balanced |
| 1020 | * across the zones. |
| 1021 | */ |
| 1022 | static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order) |
| 1023 | { |
| 1024 | int to_free = nr_pages; |
| 1025 | int all_zones_ok; |
| 1026 | int priority; |
| 1027 | int i; |
| 1028 | int total_scanned, total_reclaimed; |
| 1029 | struct reclaim_state *reclaim_state = current->reclaim_state; |
| 1030 | struct scan_control sc; |
| 1031 | |
| 1032 | loop_again: |
| 1033 | total_scanned = 0; |
| 1034 | total_reclaimed = 0; |
| 1035 | sc.gfp_mask = GFP_KERNEL; |
| 1036 | sc.may_writepage = 0; |
| 1037 | sc.may_swap = 1;
| 1038 | sc.nr_mapped = read_page_state(nr_mapped);
| 1039 | |
| 1040 | inc_page_state(pageoutrun); |
| 1041 | |
| 1042 | for (i = 0; i < pgdat->nr_zones; i++) { |
| 1043 | struct zone *zone = pgdat->node_zones + i; |
| 1044 | |
| 1045 | zone->temp_priority = DEF_PRIORITY; |
| 1046 | } |
| 1047 | |
| 1048 | for (priority = DEF_PRIORITY; priority >= 0; priority--) { |
| 1049 | int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */ |
| 1050 | unsigned long lru_pages = 0; |
| 1051 | |
| 1052 | all_zones_ok = 1; |
| 1053 | |
| 1054 | if (nr_pages == 0) { |
| 1055 | /* |
| 1056 | * Scan in the highmem->dma direction for the highest |
| 1057 | * zone which needs scanning |
| 1058 | */ |
| 1059 | for (i = pgdat->nr_zones - 1; i >= 0; i--) { |
| 1060 | struct zone *zone = pgdat->node_zones + i; |
| 1061 | |
| 1062 | if (zone->present_pages == 0) |
| 1063 | continue; |
| 1064 | |
| 1065 | if (zone->all_unreclaimable && |
| 1066 | priority != DEF_PRIORITY) |
| 1067 | continue; |
| 1068 | |
| 1069 | if (!zone_watermark_ok(zone, order, |
| 1070 | zone->pages_high, 0, 0, 0)) { |
| 1071 | end_zone = i; |
| 1072 | goto scan; |
| 1073 | } |
| 1074 | } |
| 1075 | goto out; |
| 1076 | } else { |
| 1077 | end_zone = pgdat->nr_zones - 1; |
| 1078 | } |
| 1079 | scan: |
| 1080 | for (i = 0; i <= end_zone; i++) { |
| 1081 | struct zone *zone = pgdat->node_zones + i; |
| 1082 | |
| 1083 | lru_pages += zone->nr_active + zone->nr_inactive; |
| 1084 | } |
| 1085 | |
| 1086 | /* |
| 1087 | * Now scan the zone in the dma->highmem direction, stopping |
| 1088 | * at the last zone which needs scanning. |
| 1089 | * |
| 1090 | * We do this because the page allocator works in the opposite |
| 1091 | * direction. This prevents the page allocator from allocating |
| 1092 | * pages behind kswapd's direction of progress, which would |
| 1093 | * cause too much scanning of the lower zones. |
| 1094 | */ |
| 1095 | for (i = 0; i <= end_zone; i++) { |
| 1096 | struct zone *zone = pgdat->node_zones + i; |
| 1097 | int nr_slab;
| 1098 |
| 1099 | if (zone->present_pages == 0) |
| 1100 | continue; |
| 1101 | |
| 1102 | if (zone->all_unreclaimable && priority != DEF_PRIORITY) |
| 1103 | continue; |
| 1104 | |
| 1105 | if (nr_pages == 0) { /* Not software suspend */ |
| 1106 | if (!zone_watermark_ok(zone, order, |
| 1107 | zone->pages_high, end_zone, 0, 0)) |
| 1108 | all_zones_ok = 0; |
| 1109 | } |
| 1110 | zone->temp_priority = priority; |
| 1111 | if (zone->prev_priority > priority) |
| 1112 | zone->prev_priority = priority; |
| 1113 | sc.nr_scanned = 0; |
| 1114 | sc.nr_reclaimed = 0; |
| 1115 | sc.priority = priority; |
| 1116 | sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX; |
| 1117 | atomic_inc(&zone->reclaim_in_progress);
| 1118 | shrink_zone(zone, &sc);
| 1119 | atomic_dec(&zone->reclaim_in_progress);
| 1120 | reclaim_state->reclaimed_slab = 0;
| 1121 | nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
| 1122 | lru_pages);
| 1123 | sc.nr_reclaimed += reclaim_state->reclaimed_slab;
| 1124 | total_reclaimed += sc.nr_reclaimed;
| 1125 | total_scanned += sc.nr_scanned;
| 1126 | if (zone->all_unreclaimable)
| 1127 | continue;
| 1128 | if (nr_slab == 0 && zone->pages_scanned >=
| 1129 | (zone->nr_active + zone->nr_inactive) * 4)
| 1130 | zone->all_unreclaimable = 1;
| 1131 | /* |
| 1132 | * If we've done a decent amount of scanning and |
| 1133 | * the reclaim ratio is low, start doing writepage |
| 1134 | * even in laptop mode |
| 1135 | */ |
| 1136 | if (total_scanned > SWAP_CLUSTER_MAX * 2 && |
| 1137 | total_scanned > total_reclaimed+total_reclaimed/2) |
| 1138 | sc.may_writepage = 1; |
| 1139 | } |
| 1140 | if (nr_pages && to_free > total_reclaimed) |
| 1141 | continue; /* swsusp: need to do more work */ |
| 1142 | if (all_zones_ok) |
| 1143 | break; /* kswapd: all done */ |
| 1144 | /* |
| 1145 | * OK, kswapd is getting into trouble. Take a nap, then take |
| 1146 | * another pass across the zones. |
| 1147 | */ |
| 1148 | if (total_scanned && priority < DEF_PRIORITY - 2) |
| 1149 | blk_congestion_wait(WRITE, HZ/10); |
| 1150 | |
| 1151 | /* |
| 1152 | * We do this so kswapd doesn't build up large priorities for |
| 1153 | * example when it is freeing in parallel with allocators. It |
| 1154 | * matches the direct reclaim path behaviour in terms of impact |
| 1155 | * on zone->*_priority. |
| 1156 | */ |
| 1157 | if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages)) |
| 1158 | break; |
| 1159 | } |
| 1160 | out: |
| 1161 | for (i = 0; i < pgdat->nr_zones; i++) { |
| 1162 | struct zone *zone = pgdat->node_zones + i; |
| 1163 | |
| 1164 | zone->prev_priority = zone->temp_priority; |
| 1165 | } |
| 1166 | if (!all_zones_ok) { |
| 1167 | cond_resched(); |
| 1168 | goto loop_again; |
| 1169 | } |
| 1170 | |
| 1171 | return total_reclaimed; |
| 1172 | } |
| 1173 | |
| 1174 | /* |
| 1175 | * The background pageout daemon, started as a kernel thread |
| 1176 | * from the init process. |
| 1177 | * |
| 1178 | * This basically trickles out pages so that we have _some_ |
| 1179 | * free memory available even if there is no other activity |
| 1180 | * that frees anything up. This is needed for things like routing |
| 1181 | * etc, where we otherwise might have all activity going on in |
| 1182 | * asynchronous contexts that cannot page things out. |
| 1183 | * |
| 1184 | * If there are applications that are active memory-allocators |
| 1185 | * (most normal use), this basically shouldn't matter. |
| 1186 | */ |
| 1187 | static int kswapd(void *p) |
| 1188 | { |
| 1189 | unsigned long order; |
| 1190 | pg_data_t *pgdat = (pg_data_t*)p; |
| 1191 | struct task_struct *tsk = current; |
| 1192 | DEFINE_WAIT(wait); |
| 1193 | struct reclaim_state reclaim_state = { |
| 1194 | .reclaimed_slab = 0, |
| 1195 | }; |
| 1196 | cpumask_t cpumask; |
| 1197 | |
| 1198 | daemonize("kswapd%d", pgdat->node_id); |
| 1199 | cpumask = node_to_cpumask(pgdat->node_id); |
| 1200 | if (!cpus_empty(cpumask)) |
| 1201 | set_cpus_allowed(tsk, cpumask); |
| 1202 | current->reclaim_state = &reclaim_state; |
| 1203 | |
| 1204 | /* |
| 1205 | * Tell the memory management that we're a "memory allocator", |
| 1206 | * and that if we need more memory we should get access to it |
| 1207 | * regardless (see "__alloc_pages()"). "kswapd" should |
| 1208 | * never get caught in the normal page freeing logic. |
| 1209 | * |
| 1210 | * (Kswapd normally doesn't need memory anyway, but sometimes |
| 1211 | * you need a small amount of memory in order to be able to |
| 1212 | * page out something else, and this flag essentially protects |
| 1213 | * us from recursively trying to free more memory as we're |
| 1214 | * trying to free the first piece of memory in the first place). |
| 1215 | */ |
| 1216 | tsk->flags |= PF_MEMALLOC|PF_KSWAPD; |
| 1217 | |
| 1218 | order = 0; |
| 1219 | for ( ; ; ) { |
| 1220 | unsigned long new_order; |
| 1221 |
| 1222 | try_to_freeze();
| 1223 |
| 1224 | prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE); |
| 1225 | new_order = pgdat->kswapd_max_order; |
| 1226 | pgdat->kswapd_max_order = 0; |
| 1227 | if (order < new_order) { |
| 1228 | /* |
| 1229 | * Don't sleep if someone wants a larger 'order' |
| 1230 | * allocation |
| 1231 | */ |
| 1232 | order = new_order; |
| 1233 | } else { |
| 1234 | schedule(); |
| 1235 | order = pgdat->kswapd_max_order; |
| 1236 | } |
| 1237 | finish_wait(&pgdat->kswapd_wait, &wait); |
| 1238 | |
| 1239 | balance_pgdat(pgdat, 0, order); |
| 1240 | } |
| 1241 | return 0; |
| 1242 | } |
| 1243 | |
| 1244 | /* |
| 1245 | * A zone is low on free memory, so wake its kswapd task to service it. |
| 1246 | */ |
| 1247 | void wakeup_kswapd(struct zone *zone, int order) |
| 1248 | { |
| 1249 | pg_data_t *pgdat; |
| 1250 | |
| 1251 | if (zone->present_pages == 0) |
| 1252 | return; |
| 1253 | |
| 1254 | pgdat = zone->zone_pgdat; |
| 1255 | if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0)) |
| 1256 | return; |
| 1257 | if (pgdat->kswapd_max_order < order) |
| 1258 | pgdat->kswapd_max_order = order; |
| 1259 | if (!cpuset_zone_allowed(zone, __GFP_HARDWALL))
| 1260 | return;
| 1261 | if (!waitqueue_active(&pgdat->kswapd_wait))
| 1262 | return;
| 1263 | wake_up_interruptible(&pgdat->kswapd_wait);
| 1264 | }
| 1265 | |
| 1266 | #ifdef CONFIG_PM |
| 1267 | /* |
| 1268 | * Try to free `nr_pages' of memory, system-wide. Returns the number of freed |
| 1269 | * pages. |
| 1270 | */ |
| 1271 | int shrink_all_memory(int nr_pages) |
| 1272 | { |
| 1273 | pg_data_t *pgdat; |
| 1274 | int nr_to_free = nr_pages; |
| 1275 | int ret = 0; |
| 1276 | struct reclaim_state reclaim_state = { |
| 1277 | .reclaimed_slab = 0, |
| 1278 | }; |
| 1279 | |
| 1280 | current->reclaim_state = &reclaim_state; |
| 1281 | for_each_pgdat(pgdat) { |
| 1282 | int freed; |
| 1283 | freed = balance_pgdat(pgdat, nr_to_free, 0); |
| 1284 | ret += freed; |
| 1285 | nr_to_free -= freed; |
| 1286 | if (nr_to_free <= 0) |
| 1287 | break; |
| 1288 | } |
| 1289 | current->reclaim_state = NULL; |
| 1290 | return ret; |
| 1291 | } |
| 1292 | #endif |
| 1293 | |
| 1294 | #ifdef CONFIG_HOTPLUG_CPU |
| 1295 | /* It's optimal to keep kswapds on the same CPUs as their memory, but |
| 1296 | not required for correctness. So if the last cpu in a node goes |
| 1297 | away, we get changed to run anywhere: as the first one comes back, |
| 1298 | restore their cpu bindings. */ |
| 1299 | static int __devinit cpu_callback(struct notifier_block *nfb, |
| 1300 | unsigned long action, |
| 1301 | void *hcpu) |
| 1302 | { |
| 1303 | pg_data_t *pgdat; |
| 1304 | cpumask_t mask; |
| 1305 | |
| 1306 | if (action == CPU_ONLINE) { |
| 1307 | for_each_pgdat(pgdat) { |
| 1308 | mask = node_to_cpumask(pgdat->node_id); |
| 1309 | if (any_online_cpu(mask) != NR_CPUS) |
| 1310 | /* One of our CPUs online: restore mask */ |
| 1311 | set_cpus_allowed(pgdat->kswapd, mask); |
| 1312 | } |
| 1313 | } |
| 1314 | return NOTIFY_OK; |
| 1315 | } |
| 1316 | #endif /* CONFIG_HOTPLUG_CPU */ |
| 1317 | |
| 1318 | static int __init kswapd_init(void) |
| 1319 | { |
| 1320 | pg_data_t *pgdat; |
| 1321 | swap_setup(); |
| 1322 | for_each_pgdat(pgdat) |
| 1323 | pgdat->kswapd |
| 1324 | = find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL)); |
| 1325 | total_memory = nr_free_pagecache_pages(); |
| 1326 | hotcpu_notifier(cpu_callback, 0); |
| 1327 | return 0; |
| 1328 | } |
| 1329 | |
| 1330 | module_init(kswapd_init) |
| 1331 |
| 1332 | |
| 1333 | /* |
| 1334 | * Try to free up some pages from this zone through reclaim. |
| 1335 | */ |
| 1336 | int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order) |
| 1337 | { |
| 1338 | struct scan_control sc; |
| 1339 | int nr_pages = 1 << order; |
| 1340 | int total_reclaimed = 0; |
| 1341 | |
| 1342 | /* The reclaim may sleep, so don't do it if sleep isn't allowed */ |
| 1343 | if (!(gfp_mask & __GFP_WAIT)) |
| 1344 | return 0; |
| 1345 | if (zone->all_unreclaimable) |
| 1346 | return 0; |
| 1347 | |
| 1348 | sc.gfp_mask = gfp_mask; |
| 1349 | sc.may_writepage = 0; |
| 1350 | sc.may_swap = 0; |
| 1351 | sc.nr_mapped = read_page_state(nr_mapped); |
| 1352 | sc.nr_scanned = 0; |
| 1353 | sc.nr_reclaimed = 0; |
| 1354 | /* scan at the highest priority */ |
| 1355 | sc.priority = 0; |
| 1356 | |
| 1357 | if (nr_pages > SWAP_CLUSTER_MAX) |
| 1358 | sc.swap_cluster_max = nr_pages; |
| 1359 | else |
| 1360 | sc.swap_cluster_max = SWAP_CLUSTER_MAX; |
| 1361 | |
| 1362 | /* Don't reclaim the zone if there are other reclaimers active */
| 1363 | if (atomic_read(&zone->reclaim_in_progress) > 0)
| 1364 | goto out;
| 1365 |
| 1366 | shrink_zone(zone, &sc);
| 1367 | total_reclaimed = sc.nr_reclaimed; |
| 1368 | |
| 1369 | out:
| 1370 | return total_reclaimed;
| 1371 | } |
| 1372 | |
| 1373 | asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone, |
| 1374 | unsigned int state) |
| 1375 | { |
| 1376 | struct zone *z; |
| 1377 | int i; |
| 1378 | |
| 1379 | if (!capable(CAP_SYS_ADMIN))
| 1380 | return -EACCES;
| 1381 |
| 1382 | if (node >= MAX_NUMNODES || !node_online(node))
| 1383 | return -EINVAL; |
| 1384 | |
| 1385 | /* This will break if we ever add more zones */ |
| 1386 | if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM))) |
| 1387 | return -EINVAL; |
| 1388 | |
| 1389 | for (i = 0; i < MAX_NR_ZONES; i++) { |
| 1390 | if (!(zone & 1<<i)) |
| 1391 | continue; |
| 1392 | |
| 1393 | z = &NODE_DATA(node)->node_zones[i]; |
| 1394 | |
| 1395 | if (state) |
| 1396 | z->reclaim_pages = 1; |
| 1397 | else |
| 1398 | z->reclaim_pages = 0; |
| 1399 | } |
| 1400 | |
| 1401 | return 0; |
| 1402 | } |