Blame - mm/vmscan.c - kernel/msm

blob: cfffe5098d538e6d54d1954c523d455924cbf7fc [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/vmscan.c
				3	*
				4	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				5	*
				6	* Swap reorganised 29.12.95, Stephen Tweedie.
				7	* kswapd added: 7.1.96 sct
				8	* Removed kswapd_ctl limits, and swap out as many pages as needed
				9	* to bring the system back to freepages.high: 2.4.97, Rik van Riel.
				10	* Zone aware kswapd started 02/00, Kanoj Sarcar (kanoj@sgi.com).
				11	* Multiqueue VM started 5.8.00, Rik van Riel.
				12	*/
				13
				14	#include <linux/mm.h>
				15	#include <linux/module.h>
				16	#include <linux/slab.h>
				17	#include <linux/kernel_stat.h>
				18	#include <linux/swap.h>
				19	#include <linux/pagemap.h>
				20	#include <linux/init.h>
				21	#include <linux/highmem.h>
				22	#include <linux/file.h>
				23	#include <linux/writeback.h>
				24	#include <linux/blkdev.h>
				25	#include <linux/buffer_head.h> /* for try_to_release_page(),
				26	buffer_heads_over_limit */
				27	#include <linux/mm_inline.h>
				28	#include <linux/pagevec.h>
				29	#include <linux/backing-dev.h>
				30	#include <linux/rmap.h>
				31	#include <linux/topology.h>
				32	#include <linux/cpu.h>
				33	#include <linux/cpuset.h>
				34	#include <linux/notifier.h>
				35	#include <linux/rwsem.h>
				36
				37	#include <asm/tlbflush.h>
				38	#include <asm/div64.h>
				39
				40	#include <linux/swapops.h>
				41
				/*
				 * Possible outcomes of pageout().  Each value also tells the caller
				 * whether the page is still locked when pageout() hands it back.
				 */
				typedef enum {
					/* failed to write page out, page is locked */
					PAGE_KEEP,
					/* move page to the active list, page is locked */
					PAGE_ACTIVATE,
					/* page has been sent to the disk successfully, page is unlocked */
					PAGE_SUCCESS,
					/* page is clean and locked */
					PAGE_CLEAN,
				} pageout_t;
				53
				/*
				 * Per-invocation reclaim state.  One of these is filled in by the
				 * reclaim entry point and passed by pointer down through
				 * shrink_caches() / shrink_zone() / shrink_cache() / shrink_list(),
				 * which both read its parameters and accumulate their results in it.
				 */
				struct scan_control {
					/* Ask refill_inactive_zone, or shrink_cache to scan this many pages */
					unsigned long nr_to_scan;

					/* Incremented by the number of inactive pages that were scanned */
					unsigned long nr_scanned;

					/* Incremented by the number of pages reclaimed */
					unsigned long nr_reclaimed;

					unsigned long nr_mapped;	/* From page_state */

					/*
					 * How many pages shrink_cache() should reclaim.  Signed on
					 * purpose: shrink_cache() subtracts from it and shrink_zone()
					 * stops once it is <= 0.
					 */
					int nr_to_reclaim;

					/* Ask shrink_caches, or shrink_zone to scan at this priority */
					unsigned int priority;

					/* This context's GFP mask */
					unsigned int gfp_mask;

					/* Non-zero allows writing dirty pages out in laptop_mode */
					int may_writepage;

					/* Can pages be swapped as part of reclaim? */
					int may_swap;

					/*
					 * This context's SWAP_CLUSTER_MAX. If freeing memory for
					 * suspend, we effectively ignore SWAP_CLUSTER_MAX.
					 * In this context, it doesn't matter that we scan the
					 * whole list at once.
					 */
					int swap_cluster_max;
				};
				86
				87	/*
				88	* The list of shrinker callbacks used by to apply pressure to
				89	* ageable caches.
				90	*/
				91	struct shrinker {
				92	shrinker_t shrinker;
				93	struct list_head list;
				94	int seeks; /* seeks to recreate an obj */
				95	long nr; /* objs pending delete */
				96	};
				97
				/*
				 * The page at the tail of an LRU list: the entry reached through
				 * (_head)->prev, converted from its embedded `lru' member.
				 */
				#define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))

				/*
				 * prefetch_prev_lru_page / prefetchw_prev_lru_page
				 *
				 * If _page is not the last entry before the list head _base, prefetch
				 * (for read, respectively for write) member _field of the page that
				 * precedes it on the list.  Both compile to an empty statement when
				 * the architecture does not define the matching ARCH_HAS_PREFETCH*.
				 *
				 * Macro arguments are parenthesised so that expressions may be passed.
				 */
				#ifdef ARCH_HAS_PREFETCH
				#define prefetch_prev_lru_page(_page, _base, _field)			\
					do {								\
						if ((_page)->lru.prev != (_base)) {			\
							struct page *prev;				\
											\
							prev = lru_to_page(&((_page)->lru));		\
							prefetch(&prev->_field);			\
						}							\
					} while (0)
				#else
				#define prefetch_prev_lru_page(_page, _base, _field) do { } while (0)
				#endif

				#ifdef ARCH_HAS_PREFETCHW
				#define prefetchw_prev_lru_page(_page, _base, _field)			\
					do {								\
						if ((_page)->lru.prev != (_base)) {			\
							struct page *prev;				\
											\
							prev = lru_to_page(&((_page)->lru));		\
							prefetchw(&prev->_field);			\
						}							\
					} while (0)
				#else
				#define prefetchw_prev_lru_page(_page, _base, _field) do { } while (0)
				#endif
				127
				/*
				 * From 0 .. 100.  Higher means more swappy.
				 * Added into swap_tendency by refill_inactive_zone().
				 */
				int vm_swappiness = 60;

				/*
				 * Divisor used for mapped_ratio in refill_inactive_zone().
				 * NOTE(review): not assigned anywhere in this part of the file;
				 * presumably initialised elsewhere - confirm.
				 */
				static long total_memory;

				/* Registered shrinkers, and the rwsem that protects the list. */
				static LIST_HEAD(shrinker_list);
				static DECLARE_RWSEM(shrinker_rwsem);
				136
				137	/*
				138	* Add a shrinker callback to be called from the vm
				139	*/
				140	struct shrinker *set_shrinker(int seeks, shrinker_t theshrinker)
				141	{
				142	struct shrinker *shrinker;
				143
				144	shrinker = kmalloc(sizeof(*shrinker), GFP_KERNEL);
				145	if (shrinker) {
				146	shrinker->shrinker = theshrinker;
				147	shrinker->seeks = seeks;
				148	shrinker->nr = 0;
				149	down_write(&shrinker_rwsem);
				150	list_add_tail(&shrinker->list, &shrinker_list);
				151	up_write(&shrinker_rwsem);
				152	}
				153	return shrinker;
				154	}
				155	EXPORT_SYMBOL(set_shrinker);
				156
				157	/*
				158	* Remove one
				159	*/
				160	void remove_shrinker(struct shrinker *shrinker)
				161	{
				162	down_write(&shrinker_rwsem);
				163	list_del(&shrinker->list);
				164	up_write(&shrinker_rwsem);
				165	kfree(shrinker);
				166	}
				167	EXPORT_SYMBOL(remove_shrinker);
				168
				169	#define SHRINK_BATCH 128
				170	/*
				171	* Call the shrink functions to age shrinkable caches
				172	*
				173	* Here we assume it costs one seek to replace a lru page and that it also
				174	* takes a seek to recreate a cache object. With this in mind we age equal
				175	* percentages of the lru and ageable caches. This should balance the seeks
				176	* generated by these structures.
				177	*
				178	* If the vm encounted mapped pages on the LRU it increase the pressure on
				179	* slab to avoid swapping.
				180	*
				181	* We do weird things to avoid (scannedseeksentries) overflowing 32 bits.
				182	*
				183	* `lru_pages' represents the number of on-LRU pages in all the zones which
				184	* are eligible for the caller's allocation attempt. It is used for balancing
				185	* slab reclaim versus page reclaim.
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	186	*
				187	* Returns the number of slab objects which we shrunk.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	188	*/
				189	static int shrink_slab(unsigned long scanned, unsigned int gfp_mask,
				190	unsigned long lru_pages)
				191	{
				192	struct shrinker *shrinker;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	193	int ret = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	194
				195	if (scanned == 0)
				196	scanned = SWAP_CLUSTER_MAX;
				197
				198	if (!down_read_trylock(&shrinker_rwsem))
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	199	return 1; /* Assume we'll be able to shrink next time */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	200
				201	list_for_each_entry(shrinker, &shrinker_list, list) {
				202	unsigned long long delta;
				203	unsigned long total_scan;
				204
				205	delta = (4 * scanned) / shrinker->seeks;
				206	delta = (shrinker->shrinker)(0, gfp_mask);
				207	do_div(delta, lru_pages + 1);
				208	shrinker->nr += delta;
				209	if (shrinker->nr < 0)
				210	shrinker->nr = LONG_MAX; /* It wrapped! */
				211
				212	total_scan = shrinker->nr;
				213	shrinker->nr = 0;
				214
				215	while (total_scan >= SHRINK_BATCH) {
				216	long this_scan = SHRINK_BATCH;
				217	int shrink_ret;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	218	int nr_before;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	219
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	220	nr_before = (*shrinker->shrinker)(0, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	221	shrink_ret = (*shrinker->shrinker)(this_scan, gfp_mask);
				222	if (shrink_ret == -1)
				223	break;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	224	if (shrink_ret < nr_before)
				225	ret += nr_before - shrink_ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	226	mod_page_state(slabs_scanned, this_scan);
				227	total_scan -= this_scan;
				228
				229	cond_resched();
				230	}
				231
				232	shrinker->nr += total_scan;
				233	}
				234	up_read(&shrinker_rwsem);
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	235	return ret;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	236	}
				237
				/*
				 * Decide whether a page's mapping counts as "in use".
				 *
				 * Called without lock on whether page is mapped, so answer is unstable.
				 *
				 * Returns non-zero if the page is in somebody's page tables, is in the
				 * swap cache, or belongs to a mapping that mapping_mapped() reports as
				 * mapped; returns 0 otherwise, including when the page has no mapping.
				 */
				static inline int page_mapping_inuse(struct page *page)
				{
					struct address_space *mapping;

					/* Page is in somebody's page tables. */
					if (page_mapped(page))
						return 1;

					/* Be more reluctant to reclaim swapcache than pagecache */
					if (PageSwapCache(page))
						return 1;

					mapping = page_mapping(page);
					if (!mapping)
						return 0;

					/* File is mmap'd by somebody? */
					return mapping_mapped(mapping);
				}
				258
				/*
				 * True when the page's reference count, less one if PagePrivate is
				 * set, is exactly 2.
				 * NOTE(review): the two references are presumably the page cache's and
				 * the caller's (cf. "pagecache + us == 2" in shrink_list) - confirm.
				 */
				static inline int is_page_cache_freeable(struct page *page)
				{
					return page_count(page) - !!PagePrivate(page) == 2;
				}
				263
				264	static int may_write_to_queue(struct backing_dev_info *bdi)
				265	{
				266	if (current_is_kswapd())
				267	return 1;
				268	if (current_is_pdflush()) /* This is unlikely, but why not... */
				269	return 1;
				270	if (!bdi_write_congested(bdi))
				271	return 1;
				272	if (bdi == current->backing_dev_info)
				273	return 1;
				274	return 0;
				275	}
				276
				277	/*
				278	* We detected a synchronous write error writing a page out. Probably
				279	* -ENOSPC. We need to propagate that into the address_space for a subsequent
				280	* fsync(), msync() or close().
				281	*
				282	* The tricky part is that after writepage we cannot touch the mapping: nothing
				283	* prevents it from being freed up. But we have a ref on the page and once
				284	* that page is locked, the mapping is pinned.
				285	*
				286	* We're allowed to run sleeping lock_page() here because we know the caller has
				287	* __GFP_FS.
				288	*/
				289	static void handle_write_error(struct address_space *mapping,
				290	struct page *page, int error)
				291	{
				292	lock_page(page);
				293	if (page_mapping(page) == mapping) {
				294	if (error == -ENOSPC)
				295	set_bit(AS_ENOSPC, &mapping->flags);
				296	else
				297	set_bit(AS_EIO, &mapping->flags);
				298	}
				299	unlock_page(page);
				300	}
				301
				302	/*
				303	* pageout is called by shrink_list() for each dirty page. Calls ->writepage().
				304	*/
				305	static pageout_t pageout(struct page page, struct address_space mapping)
				306	{
				307	/*
				308	* If the page is dirty, only perform writeback if that write
				309	* will be non-blocking. To prevent this allocation from being
				310	* stalled by pagecache activity. But note that there may be
				311	* stalls if we need to run get_block(). We could test
				312	* PagePrivate for that.
				313	*
				314	* If this process is currently in generic_file_write() against
				315	* this page's queue, we can perform writeback even if that
				316	* will block.
				317	*
				318	* If the page is swapcache, write it back even if that would
				319	* block, for some throttling. This happens by accident, because
				320	* swap_backing_dev_info is bust: it doesn't reflect the
				321	* congestion state of the swapdevs. Easy to fix, if needed.
				322	* See swapfile.c:page_queue_congested().
				323	*/
				324	if (!is_page_cache_freeable(page))
				325	return PAGE_KEEP;
				326	if (!mapping) {
				327	/*
				328	* Some data journaling orphaned pages can have
				329	* page->mapping == NULL while being dirty with clean buffers.
				330	*/
akpm@osdl.org	323aca6	2005-04-16 15:24:06 -0700	[diff] [blame]	331	if (PagePrivate(page)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	332	if (try_to_free_buffers(page)) {
				333	ClearPageDirty(page);
				334	printk("%s: orphaned page\n", __FUNCTION__);
				335	return PAGE_CLEAN;
				336	}
				337	}
				338	return PAGE_KEEP;
				339	}
				340	if (mapping->a_ops->writepage == NULL)
				341	return PAGE_ACTIVATE;
				342	if (!may_write_to_queue(mapping->backing_dev_info))
				343	return PAGE_KEEP;
				344
				345	if (clear_page_dirty_for_io(page)) {
				346	int res;
				347	struct writeback_control wbc = {
				348	.sync_mode = WB_SYNC_NONE,
				349	.nr_to_write = SWAP_CLUSTER_MAX,
				350	.nonblocking = 1,
				351	.for_reclaim = 1,
				352	};
				353
				354	SetPageReclaim(page);
				355	res = mapping->a_ops->writepage(page, &wbc);
				356	if (res < 0)
				357	handle_write_error(mapping, page, res);
				358	if (res == WRITEPAGE_ACTIVATE) {
				359	ClearPageReclaim(page);
				360	return PAGE_ACTIVATE;
				361	}
				362	if (!PageWriteback(page)) {
				363	/* synchronous write or broken a_ops? */
				364	ClearPageReclaim(page);
				365	}
				366
				367	return PAGE_SUCCESS;
				368	}
				369
				370	return PAGE_CLEAN;
				371	}
				372
				373	/*
				374	* shrink_list adds the number of reclaimed pages to sc->nr_reclaimed
				375	*/
				376	static int shrink_list(struct list_head page_list, struct scan_control sc)
				377	{
				378	LIST_HEAD(ret_pages);
				379	struct pagevec freed_pvec;
				380	int pgactivate = 0;
				381	int reclaimed = 0;
				382
				383	cond_resched();
				384
				385	pagevec_init(&freed_pvec, 1);
				386	while (!list_empty(page_list)) {
				387	struct address_space *mapping;
				388	struct page *page;
				389	int may_enter_fs;
				390	int referenced;
				391
				392	cond_resched();
				393
				394	page = lru_to_page(page_list);
				395	list_del(&page->lru);
				396
				397	if (TestSetPageLocked(page))
				398	goto keep;
				399
				400	BUG_ON(PageActive(page));
				401
				402	sc->nr_scanned++;
				403	/* Double the slab pressure for mapped and swapcache pages */
				404	if (page_mapped(page) \|\| PageSwapCache(page))
				405	sc->nr_scanned++;
				406
				407	if (PageWriteback(page))
				408	goto keep_locked;
				409
				410	referenced = page_referenced(page, 1, sc->priority <= 0);
				411	/* In active use or really unfreeable? Activate it. */
				412	if (referenced && page_mapping_inuse(page))
				413	goto activate_locked;
				414
				415	#ifdef CONFIG_SWAP
				416	/*
				417	* Anonymous process memory has backing store?
				418	* Try to allocate it some swap space here.
				419	*/
Martin Hicks	bfbb38f	2005-06-21 17:14:40 -0700	[diff] [blame]	420	if (PageAnon(page) && !PageSwapCache(page) && sc->may_swap) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	421	if (!add_to_swap(page))
				422	goto activate_locked;
				423	}
				424	#endif /* CONFIG_SWAP */
				425
				426	mapping = page_mapping(page);
				427	may_enter_fs = (sc->gfp_mask & __GFP_FS) \|\|
				428	(PageSwapCache(page) && (sc->gfp_mask & __GFP_IO));
				429
				430	/*
				431	* The page is mapped into the page tables of one or more
				432	* processes. Try to unmap it here.
				433	*/
				434	if (page_mapped(page) && mapping) {
				435	switch (try_to_unmap(page)) {
				436	case SWAP_FAIL:
				437	goto activate_locked;
				438	case SWAP_AGAIN:
				439	goto keep_locked;
				440	case SWAP_SUCCESS:
				441	; /* try to free the page below */
				442	}
				443	}
				444
				445	if (PageDirty(page)) {
				446	if (referenced)
				447	goto keep_locked;
				448	if (!may_enter_fs)
				449	goto keep_locked;
				450	if (laptop_mode && !sc->may_writepage)
				451	goto keep_locked;
				452
				453	/* Page is dirty, try to write it out here */
				454	switch(pageout(page, mapping)) {
				455	case PAGE_KEEP:
				456	goto keep_locked;
				457	case PAGE_ACTIVATE:
				458	goto activate_locked;
				459	case PAGE_SUCCESS:
				460	if (PageWriteback(page) \|\| PageDirty(page))
				461	goto keep;
				462	/*
				463	* A synchronous write - probably a ramdisk. Go
				464	* ahead and try to reclaim the page.
				465	*/
				466	if (TestSetPageLocked(page))
				467	goto keep;
				468	if (PageDirty(page) \|\| PageWriteback(page))
				469	goto keep_locked;
				470	mapping = page_mapping(page);
				471	case PAGE_CLEAN:
				472	; /* try to free the page below */
				473	}
				474	}
				475
				476	/*
				477	* If the page has buffers, try to free the buffer mappings
				478	* associated with this page. If we succeed we try to free
				479	* the page as well.
				480	*
				481	* We do this even if the page is PageDirty().
				482	* try_to_release_page() does not perform I/O, but it is
				483	* possible for a page to have PageDirty set, but it is actually
				484	* clean (all its buffers are clean). This happens if the
				485	* buffers were written out directly, with submit_bh(). ext3
				486	* will do this, as well as the blockdev mapping.
				487	* try_to_release_page() will discover that cleanness and will
				488	* drop the buffers and mark the page clean - it can be freed.
				489	*
				490	* Rarely, pages can have buffers and no ->mapping. These are
				491	* the pages which were not successfully invalidated in
				492	* truncate_complete_page(). We try to drop those buffers here
				493	* and if that worked, and the page is no longer mapped into
				494	* process address space (page_count == 1) it can be freed.
				495	* Otherwise, leave the page on the LRU so it is swappable.
				496	*/
				497	if (PagePrivate(page)) {
				498	if (!try_to_release_page(page, sc->gfp_mask))
				499	goto activate_locked;
				500	if (!mapping && page_count(page) == 1)
				501	goto free_it;
				502	}
				503
				504	if (!mapping)
				505	goto keep_locked; /* truncate got there first */
				506
				507	write_lock_irq(&mapping->tree_lock);
				508
				509	/*
				510	* The non-racy check for busy page. It is critical to check
				511	* PageDirty _after_ making sure that the page is freeable and
				512	* not in use by anybody. (pagecache + us == 2)
				513	*/
				514	if (page_count(page) != 2 \|\| PageDirty(page)) {
				515	write_unlock_irq(&mapping->tree_lock);
				516	goto keep_locked;
				517	}
				518
				519	#ifdef CONFIG_SWAP
				520	if (PageSwapCache(page)) {
				521	swp_entry_t swap = { .val = page->private };
				522	__delete_from_swap_cache(page);
				523	write_unlock_irq(&mapping->tree_lock);
				524	swap_free(swap);
				525	__put_page(page); /* The pagecache ref */
				526	goto free_it;
				527	}
				528	#endif /* CONFIG_SWAP */
				529
				530	__remove_from_page_cache(page);
				531	write_unlock_irq(&mapping->tree_lock);
				532	__put_page(page);
				533
				534	free_it:
				535	unlock_page(page);
				536	reclaimed++;
				537	if (!pagevec_add(&freed_pvec, page))
				538	__pagevec_release_nonlru(&freed_pvec);
				539	continue;
				540
				541	activate_locked:
				542	SetPageActive(page);
				543	pgactivate++;
				544	keep_locked:
				545	unlock_page(page);
				546	keep:
				547	list_add(&page->lru, &ret_pages);
				548	BUG_ON(PageLRU(page));
				549	}
				550	list_splice(&ret_pages, page_list);
				551	if (pagevec_count(&freed_pvec))
				552	__pagevec_release_nonlru(&freed_pvec);
				553	mod_page_state(pgactivate, pgactivate);
				554	sc->nr_reclaimed += reclaimed;
				555	return reclaimed;
				556	}
				557
				558	/*
				559	* zone->lru_lock is heavily contended. Some of the functions that
				560	* shrink the lists perform better by taking out a batch of pages
				561	* and working on them outside the LRU lock.
				562	*
				563	* For pagecache intensive workloads, this function is the hottest
				564	* spot in the kernel (apart from copy_*_user functions).
				565	*
				566	* Appropriate locks must be held before calling this function.
				567	*
				568	* @nr_to_scan: The number of pages to look through on the list.
				569	* @src: The LRU list to pull pages off.
				570	* @dst: The temp list to put pages on to.
				571	* @scanned: The number of pages that were scanned.
				572	*
				573	* returns how many pages were moved onto *@dst.
				574	*/
				575	static int isolate_lru_pages(int nr_to_scan, struct list_head *src,
				576	struct list_head dst, int scanned)
				577	{
				578	int nr_taken = 0;
				579	struct page *page;
				580	int scan = 0;
				581
				582	while (scan++ < nr_to_scan && !list_empty(src)) {
				583	page = lru_to_page(src);
				584	prefetchw_prev_lru_page(page, src, flags);
				585
				586	if (!TestClearPageLRU(page))
				587	BUG();
				588	list_del(&page->lru);
				589	if (get_page_testone(page)) {
				590	/*
				591	* It is being freed elsewhere
				592	*/
				593	__put_page(page);
				594	SetPageLRU(page);
				595	list_add(&page->lru, src);
				596	continue;
				597	} else {
				598	list_add(&page->lru, dst);
				599	nr_taken++;
				600	}
				601	}
				602
				603	*scanned = scan;
				604	return nr_taken;
				605	}
				606
				607	/*
				608	* shrink_cache() adds the number of pages reclaimed to sc->nr_reclaimed
				609	*/
				610	static void shrink_cache(struct zone zone, struct scan_control sc)
				611	{
				612	LIST_HEAD(page_list);
				613	struct pagevec pvec;
				614	int max_scan = sc->nr_to_scan;
				615
				616	pagevec_init(&pvec, 1);
				617
				618	lru_add_drain();
				619	spin_lock_irq(&zone->lru_lock);
				620	while (max_scan > 0) {
				621	struct page *page;
				622	int nr_taken;
				623	int nr_scan;
				624	int nr_freed;
				625
				626	nr_taken = isolate_lru_pages(sc->swap_cluster_max,
				627	&zone->inactive_list,
				628	&page_list, &nr_scan);
				629	zone->nr_inactive -= nr_taken;
				630	zone->pages_scanned += nr_scan;
				631	spin_unlock_irq(&zone->lru_lock);
				632
				633	if (nr_taken == 0)
				634	goto done;
				635
				636	max_scan -= nr_scan;
				637	if (current_is_kswapd())
				638	mod_page_state_zone(zone, pgscan_kswapd, nr_scan);
				639	else
				640	mod_page_state_zone(zone, pgscan_direct, nr_scan);
				641	nr_freed = shrink_list(&page_list, sc);
				642	if (current_is_kswapd())
				643	mod_page_state(kswapd_steal, nr_freed);
				644	mod_page_state_zone(zone, pgsteal, nr_freed);
				645	sc->nr_to_reclaim -= nr_freed;
				646
				647	spin_lock_irq(&zone->lru_lock);
				648	/*
				649	* Put back any unfreeable pages.
				650	*/
				651	while (!list_empty(&page_list)) {
				652	page = lru_to_page(&page_list);
				653	if (TestSetPageLRU(page))
				654	BUG();
				655	list_del(&page->lru);
				656	if (PageActive(page))
				657	add_page_to_active_list(zone, page);
				658	else
				659	add_page_to_inactive_list(zone, page);
				660	if (!pagevec_add(&pvec, page)) {
				661	spin_unlock_irq(&zone->lru_lock);
				662	__pagevec_release(&pvec);
				663	spin_lock_irq(&zone->lru_lock);
				664	}
				665	}
				666	}
				667	spin_unlock_irq(&zone->lru_lock);
				668	done:
				669	pagevec_release(&pvec);
				670	}
				671
				672	/*
				673	* This moves pages from the active list to the inactive list.
				674	*
				675	* We move them the other way if the page is referenced by one or more
				676	* processes, from rmap.
				677	*
				678	* If the pages are mostly unmapped, the processing is fast and it is
				679	* appropriate to hold zone->lru_lock across the whole operation. But if
				680	* the pages are mapped, the processing is slow (page_referenced()) so we
				681	* should drop zone->lru_lock around each page. It's impossible to balance
				682	* this, so instead we remove the pages from the LRU while processing them.
				683	* It is safe to rely on PG_active against the non-LRU pages in here because
				684	* nobody will play with that bit on a non-LRU page.
				685	*
				686	* The downside is that we have to touch page->_count against each page.
				687	* But we had to alter page->flags anyway.
				688	*/
				689	static void
				690	refill_inactive_zone(struct zone zone, struct scan_control sc)
				691	{
				692	int pgmoved;
				693	int pgdeactivate = 0;
				694	int pgscanned;
				695	int nr_pages = sc->nr_to_scan;
				696	LIST_HEAD(l_hold); /* The pages which were snipped off */
				697	LIST_HEAD(l_inactive); /* Pages to go onto the inactive_list */
				698	LIST_HEAD(l_active); /* Pages to go onto the active_list */
				699	struct page *page;
				700	struct pagevec pvec;
				701	int reclaim_mapped = 0;
				702	long mapped_ratio;
				703	long distress;
				704	long swap_tendency;
				705
				706	lru_add_drain();
				707	spin_lock_irq(&zone->lru_lock);
				708	pgmoved = isolate_lru_pages(nr_pages, &zone->active_list,
				709	&l_hold, &pgscanned);
				710	zone->pages_scanned += pgscanned;
				711	zone->nr_active -= pgmoved;
				712	spin_unlock_irq(&zone->lru_lock);
				713
				714	/*
				715	* `distress' is a measure of how much trouble we're having reclaiming
				716	* pages. 0 -> no problems. 100 -> great trouble.
				717	*/
				718	distress = 100 >> zone->prev_priority;
				719
				720	/*
				721	* The point of this algorithm is to decide when to start reclaiming
				722	* mapped memory instead of just pagecache. Work out how much memory
				723	* is mapped.
				724	*/
				725	mapped_ratio = (sc->nr_mapped * 100) / total_memory;
				726
				727	/*
				728	* Now decide how much we really want to unmap some pages. The mapped
				729	* ratio is downgraded - just because there's a lot of mapped memory
				730	* doesn't necessarily mean that page reclaim isn't succeeding.
				731	*
				732	* The distress ratio is important - we don't want to start going oom.
				733	*
				734	* A 100% value of vm_swappiness overrides this algorithm altogether.
				735	*/
				736	swap_tendency = mapped_ratio / 2 + distress + vm_swappiness;
				737
				738	/*
				739	* Now use this metric to decide whether to start moving mapped memory
				740	* onto the inactive list.
				741	*/
				742	if (swap_tendency >= 100)
				743	reclaim_mapped = 1;
				744
				745	while (!list_empty(&l_hold)) {
				746	cond_resched();
				747	page = lru_to_page(&l_hold);
				748	list_del(&page->lru);
				749	if (page_mapped(page)) {
				750	if (!reclaim_mapped \|\|
				751	(total_swap_pages == 0 && PageAnon(page)) \|\|
				752	page_referenced(page, 0, sc->priority <= 0)) {
				753	list_add(&page->lru, &l_active);
				754	continue;
				755	}
				756	}
				757	list_add(&page->lru, &l_inactive);
				758	}
				759
				760	pagevec_init(&pvec, 1);
				761	pgmoved = 0;
				762	spin_lock_irq(&zone->lru_lock);
				763	while (!list_empty(&l_inactive)) {
				764	page = lru_to_page(&l_inactive);
				765	prefetchw_prev_lru_page(page, &l_inactive, flags);
				766	if (TestSetPageLRU(page))
				767	BUG();
				768	if (!TestClearPageActive(page))
				769	BUG();
				770	list_move(&page->lru, &zone->inactive_list);
				771	pgmoved++;
				772	if (!pagevec_add(&pvec, page)) {
				773	zone->nr_inactive += pgmoved;
				774	spin_unlock_irq(&zone->lru_lock);
				775	pgdeactivate += pgmoved;
				776	pgmoved = 0;
				777	if (buffer_heads_over_limit)
				778	pagevec_strip(&pvec);
				779	__pagevec_release(&pvec);
				780	spin_lock_irq(&zone->lru_lock);
				781	}
				782	}
				783	zone->nr_inactive += pgmoved;
				784	pgdeactivate += pgmoved;
				785	if (buffer_heads_over_limit) {
				786	spin_unlock_irq(&zone->lru_lock);
				787	pagevec_strip(&pvec);
				788	spin_lock_irq(&zone->lru_lock);
				789	}
				790
				791	pgmoved = 0;
				792	while (!list_empty(&l_active)) {
				793	page = lru_to_page(&l_active);
				794	prefetchw_prev_lru_page(page, &l_active, flags);
				795	if (TestSetPageLRU(page))
				796	BUG();
				797	BUG_ON(!PageActive(page));
				798	list_move(&page->lru, &zone->active_list);
				799	pgmoved++;
				800	if (!pagevec_add(&pvec, page)) {
				801	zone->nr_active += pgmoved;
				802	pgmoved = 0;
				803	spin_unlock_irq(&zone->lru_lock);
				804	__pagevec_release(&pvec);
				805	spin_lock_irq(&zone->lru_lock);
				806	}
				807	}
				808	zone->nr_active += pgmoved;
				809	spin_unlock_irq(&zone->lru_lock);
				810	pagevec_release(&pvec);
				811
				812	mod_page_state_zone(zone, pgrefill, pgscanned);
				813	mod_page_state(pgdeactivate, pgdeactivate);
				814	}
				815
				816	/*
				817	* This is a basic per-zone page freer. Used by both kswapd and direct reclaim.
				818	*/
				819	static void
				820	shrink_zone(struct zone zone, struct scan_control sc)
				821	{
				822	unsigned long nr_active;
				823	unsigned long nr_inactive;
				824
				825	/*
				826	* Add one to `nr_to_scan' just to make sure that the kernel will
				827	* slowly sift through the active list.
				828	*/
				829	zone->nr_scan_active += (zone->nr_active >> sc->priority) + 1;
				830	nr_active = zone->nr_scan_active;
				831	if (nr_active >= sc->swap_cluster_max)
				832	zone->nr_scan_active = 0;
				833	else
				834	nr_active = 0;
				835
				836	zone->nr_scan_inactive += (zone->nr_inactive >> sc->priority) + 1;
				837	nr_inactive = zone->nr_scan_inactive;
				838	if (nr_inactive >= sc->swap_cluster_max)
				839	zone->nr_scan_inactive = 0;
				840	else
				841	nr_inactive = 0;
				842
				843	sc->nr_to_reclaim = sc->swap_cluster_max;
				844
				845	while (nr_active \|\| nr_inactive) {
				846	if (nr_active) {
				847	sc->nr_to_scan = min(nr_active,
				848	(unsigned long)sc->swap_cluster_max);
				849	nr_active -= sc->nr_to_scan;
				850	refill_inactive_zone(zone, sc);
				851	}
				852
				853	if (nr_inactive) {
				854	sc->nr_to_scan = min(nr_inactive,
				855	(unsigned long)sc->swap_cluster_max);
				856	nr_inactive -= sc->nr_to_scan;
				857	shrink_cache(zone, sc);
				858	if (sc->nr_to_reclaim <= 0)
				859	break;
				860	}
				861	}
				862
				863	throttle_vm_writeout();
				864	}
				865
				866	/*
				867	* This is the direct reclaim path, for page-allocating processes. We only
				868	* try to reclaim pages from zones which will satisfy the caller's allocation
				869	* request.
				870	*
				871	* We reclaim from a zone even if that zone is over pages_high. Because:
				872	* a) The caller may be trying to free extra pages to satisfy a higher-order
				873	* allocation or
				874	* b) The zones may be over pages_high but they must go over pages_high to
				875	* satisfy the `incremental min' zone defense algorithm.
				876	*
				877	* Returns the number of reclaimed pages.
				878	*
				879	* If a zone is deemed to be full of pinned pages then just give it a light
				880	* scan then give up on it.
				881	*/
				882	static void
				883	shrink_caches(struct zone *zones, struct scan_control sc)
				884	{
				885	int i;
				886
				887	for (i = 0; zones[i] != NULL; i++) {
				888	struct zone *zone = zones[i];
				889
				890	if (zone->present_pages == 0)
				891	continue;
				892
				893	if (!cpuset_zone_allowed(zone))
				894	continue;
				895
				896	zone->temp_priority = sc->priority;
				897	if (zone->prev_priority > sc->priority)
				898	zone->prev_priority = sc->priority;
				899
				900	if (zone->all_unreclaimable && sc->priority != DEF_PRIORITY)
				901	continue; /* Let kswapd poll it */
				902
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	903	atomic_inc(&zone->reclaim_in_progress);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	904	shrink_zone(zone, sc);
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	905	atomic_dec(&zone->reclaim_in_progress);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	906	}
				907	}
				908
				909	/*
				910	* This is the main entry point to direct page reclaim.
				911	*
				912	* If a full scan of the inactive list fails to free enough memory then we
				913	* are "out of memory" and something needs to be killed.
				914	*
				915	* If the caller is !__GFP_FS then the probability of a failure is reasonably
				916	* high - the zone may be full of dirty or under-writeback pages, which this
				917	* caller can't do much about. We kick pdflush and take explicit naps in the
				918	* hope that some of these pages can be written. But if the allocating task
				919	* holds filesystem locks which prevent writeout this might not work, and the
				920	* allocation attempt will fail.
				921	*/
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	922	int try_to_free_pages(struct zone **zones, unsigned int gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	923	{
				924	int priority;
				925	int ret = 0;
				926	int total_scanned = 0, total_reclaimed = 0;
				927	struct reclaim_state *reclaim_state = current->reclaim_state;
				928	struct scan_control sc;
				929	unsigned long lru_pages = 0;
				930	int i;
				931
				932	sc.gfp_mask = gfp_mask;
				933	sc.may_writepage = 0;
Martin Hicks	bfbb38f	2005-06-21 17:14:40 -0700	[diff] [blame]	934	sc.may_swap = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	935
				936	inc_page_state(allocstall);
				937
				938	for (i = 0; zones[i] != NULL; i++) {
				939	struct zone *zone = zones[i];
				940
				941	if (!cpuset_zone_allowed(zone))
				942	continue;
				943
				944	zone->temp_priority = DEF_PRIORITY;
				945	lru_pages += zone->nr_active + zone->nr_inactive;
				946	}
				947
				948	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
				949	sc.nr_mapped = read_page_state(nr_mapped);
				950	sc.nr_scanned = 0;
				951	sc.nr_reclaimed = 0;
				952	sc.priority = priority;
				953	sc.swap_cluster_max = SWAP_CLUSTER_MAX;
				954	shrink_caches(zones, &sc);
				955	shrink_slab(sc.nr_scanned, gfp_mask, lru_pages);
				956	if (reclaim_state) {
				957	sc.nr_reclaimed += reclaim_state->reclaimed_slab;
				958	reclaim_state->reclaimed_slab = 0;
				959	}
				960	total_scanned += sc.nr_scanned;
				961	total_reclaimed += sc.nr_reclaimed;
				962	if (total_reclaimed >= sc.swap_cluster_max) {
				963	ret = 1;
				964	goto out;
				965	}
				966
				967	/*
				968	* Try to write back as many pages as we just scanned. This
				969	* tends to cause slow streaming writers to write data to the
				970	* disk smoothly, at the dirtying rate, which is nice. But
				971	* that's undesirable in laptop mode, where we want lumpy
				972	* writeout. So in laptop mode, write out the whole world.
				973	*/
				974	if (total_scanned > sc.swap_cluster_max + sc.swap_cluster_max/2) {
Pekka J Enberg	687a21c	2005-06-28 20:44:55 -0700	[diff] [blame]	975	wakeup_pdflush(laptop_mode ? 0 : total_scanned);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	976	sc.may_writepage = 1;
				977	}
				978
				979	/* Take a nap, wait for some writeback to complete */
				980	if (sc.nr_scanned && priority < DEF_PRIORITY - 2)
				981	blk_congestion_wait(WRITE, HZ/10);
				982	}
				983	out:
				984	for (i = 0; zones[i] != 0; i++) {
				985	struct zone *zone = zones[i];
				986
				987	if (!cpuset_zone_allowed(zone))
				988	continue;
				989
				990	zone->prev_priority = zone->temp_priority;
				991	}
				992	return ret;
				993	}
				994
				995	/*
				996	* For kswapd, balance_pgdat() will work across all this node's zones until
				997	* they are all at pages_high.
				998	*
				999	* If `nr_pages' is non-zero then it is the number of pages which are to be
				1000	* reclaimed, regardless of the zone occupancies. This is a software suspend
				1001	* special.
				1002	*
				1003	* Returns the number of pages which were actually freed.
				1004	*
				1005	* There is special handling here for zones which are full of pinned pages.
				1006	* This can happen if the pages are all mlocked, or if they are all used by
				1007	* device drivers (say, ZONE_DMA). Or if they are all in use by hugetlb.
				1008	* What we do is to detect the case where all pages in the zone have been
				1009	* scanned twice and there has been zero successful reclaim. Mark the zone as
				1010	* dead and from now on, only perform a short scan. Basically we're polling
				1011	* the zone for when the problem goes away.
				1012	*
				1013	* kswapd scans the zones in the highmem->normal->dma direction. It skips
				1014	* zones which have free_pages > pages_high, but once a zone is found to have
				1015	* free_pages <= pages_high, we scan that zone and the lower zones regardless
				1016	* of the number of free pages in the lower zones. This interoperates with
				1017	* the page allocator fallback scheme to ensure that aging of pages is balanced
				1018	* across the zones.
				1019	*/
				1020	static int balance_pgdat(pg_data_t *pgdat, int nr_pages, int order)
				1021	{
				1022	int to_free = nr_pages;
				1023	int all_zones_ok;
				1024	int priority;
				1025	int i;
				1026	int total_scanned, total_reclaimed;
				1027	struct reclaim_state *reclaim_state = current->reclaim_state;
				1028	struct scan_control sc;
				1029
				1030	loop_again:
				1031	total_scanned = 0;
				1032	total_reclaimed = 0;
				1033	sc.gfp_mask = GFP_KERNEL;
				1034	sc.may_writepage = 0;
Martin Hicks	bfbb38f	2005-06-21 17:14:40 -0700	[diff] [blame]	1035	sc.may_swap = 1;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1036	sc.nr_mapped = read_page_state(nr_mapped);
				1037
				1038	inc_page_state(pageoutrun);
				1039
				1040	for (i = 0; i < pgdat->nr_zones; i++) {
				1041	struct zone *zone = pgdat->node_zones + i;
				1042
				1043	zone->temp_priority = DEF_PRIORITY;
				1044	}
				1045
				1046	for (priority = DEF_PRIORITY; priority >= 0; priority--) {
				1047	int end_zone = 0; /* Inclusive. 0 = ZONE_DMA */
				1048	unsigned long lru_pages = 0;
				1049
				1050	all_zones_ok = 1;
				1051
				1052	if (nr_pages == 0) {
				1053	/*
				1054	* Scan in the highmem->dma direction for the highest
				1055	* zone which needs scanning
				1056	*/
				1057	for (i = pgdat->nr_zones - 1; i >= 0; i--) {
				1058	struct zone *zone = pgdat->node_zones + i;
				1059
				1060	if (zone->present_pages == 0)
				1061	continue;
				1062
				1063	if (zone->all_unreclaimable &&
				1064	priority != DEF_PRIORITY)
				1065	continue;
				1066
				1067	if (!zone_watermark_ok(zone, order,
				1068	zone->pages_high, 0, 0, 0)) {
				1069	end_zone = i;
				1070	goto scan;
				1071	}
				1072	}
				1073	goto out;
				1074	} else {
				1075	end_zone = pgdat->nr_zones - 1;
				1076	}
				1077	scan:
				1078	for (i = 0; i <= end_zone; i++) {
				1079	struct zone *zone = pgdat->node_zones + i;
				1080
				1081	lru_pages += zone->nr_active + zone->nr_inactive;
				1082	}
				1083
				1084	/*
				1085	* Now scan the zone in the dma->highmem direction, stopping
				1086	* at the last zone which needs scanning.
				1087	*
				1088	* We do this because the page allocator works in the opposite
				1089	* direction. This prevents the page allocator from allocating
				1090	* pages behind kswapd's direction of progress, which would
				1091	* cause too much scanning of the lower zones.
				1092	*/
				1093	for (i = 0; i <= end_zone; i++) {
				1094	struct zone *zone = pgdat->node_zones + i;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	1095	int nr_slab;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1096
				1097	if (zone->present_pages == 0)
				1098	continue;
				1099
				1100	if (zone->all_unreclaimable && priority != DEF_PRIORITY)
				1101	continue;
				1102
				1103	if (nr_pages == 0) { /* Not software suspend */
				1104	if (!zone_watermark_ok(zone, order,
				1105	zone->pages_high, end_zone, 0, 0))
				1106	all_zones_ok = 0;
				1107	}
				1108	zone->temp_priority = priority;
				1109	if (zone->prev_priority > priority)
				1110	zone->prev_priority = priority;
				1111	sc.nr_scanned = 0;
				1112	sc.nr_reclaimed = 0;
				1113	sc.priority = priority;
				1114	sc.swap_cluster_max = nr_pages? nr_pages : SWAP_CLUSTER_MAX;
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	1115	atomic_inc(&zone->reclaim_in_progress);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1116	shrink_zone(zone, &sc);
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	1117	atomic_dec(&zone->reclaim_in_progress);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1118	reclaim_state->reclaimed_slab = 0;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	1119	nr_slab = shrink_slab(sc.nr_scanned, GFP_KERNEL,
				1120	lru_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1121	sc.nr_reclaimed += reclaim_state->reclaimed_slab;
				1122	total_reclaimed += sc.nr_reclaimed;
				1123	total_scanned += sc.nr_scanned;
				1124	if (zone->all_unreclaimable)
				1125	continue;
akpm@osdl.org	b15e090	2005-06-21 17:14:35 -0700	[diff] [blame]	1126	if (nr_slab == 0 && zone->pages_scanned >=
				1127	(zone->nr_active + zone->nr_inactive) * 4)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1128	zone->all_unreclaimable = 1;
				1129	/*
				1130	* If we've done a decent amount of scanning and
				1131	* the reclaim ratio is low, start doing writepage
				1132	* even in laptop mode
				1133	*/
				1134	if (total_scanned > SWAP_CLUSTER_MAX * 2 &&
				1135	total_scanned > total_reclaimed+total_reclaimed/2)
				1136	sc.may_writepage = 1;
				1137	}
				1138	if (nr_pages && to_free > total_reclaimed)
				1139	continue; /* swsusp: need to do more work */
				1140	if (all_zones_ok)
				1141	break; /* kswapd: all done */
				1142	/*
				1143	* OK, kswapd is getting into trouble. Take a nap, then take
				1144	* another pass across the zones.
				1145	*/
				1146	if (total_scanned && priority < DEF_PRIORITY - 2)
				1147	blk_congestion_wait(WRITE, HZ/10);
				1148
				1149	/*
				1150	* We do this so kswapd doesn't build up large priorities for
				1151	* example when it is freeing in parallel with allocators. It
				1152	* matches the direct reclaim path behaviour in terms of impact
				1153	* on zone->*_priority.
				1154	*/
				1155	if ((total_reclaimed >= SWAP_CLUSTER_MAX) && (!nr_pages))
				1156	break;
				1157	}
				1158	out:
				1159	for (i = 0; i < pgdat->nr_zones; i++) {
				1160	struct zone *zone = pgdat->node_zones + i;
				1161
				1162	zone->prev_priority = zone->temp_priority;
				1163	}
				1164	if (!all_zones_ok) {
				1165	cond_resched();
				1166	goto loop_again;
				1167	}
				1168
				1169	return total_reclaimed;
				1170	}
				1171
				1172	/*
				1173	* The background pageout daemon, started as a kernel thread
				1174	* from the init process.
				1175	*
				1176	* This basically trickles out pages so that we have _some_
				1177	* free memory available even if there is no other activity
				1178	* that frees anything up. This is needed for things like routing
				1179	* etc, where we otherwise might have all activity going on in
				1180	* asynchronous contexts that cannot page things out.
				1181	*
				1182	* If there are applications that are active memory-allocators
				1183	* (most normal use), this basically shouldn't matter.
				1184	*/
				1185	static int kswapd(void *p)
				1186	{
				1187	unsigned long order;
				1188	pg_data_t pgdat = (pg_data_t)p;
				1189	struct task_struct *tsk = current;
				1190	DEFINE_WAIT(wait);
				1191	struct reclaim_state reclaim_state = {
				1192	.reclaimed_slab = 0,
				1193	};
				1194	cpumask_t cpumask;
				1195
				1196	daemonize("kswapd%d", pgdat->node_id);
				1197	cpumask = node_to_cpumask(pgdat->node_id);
				1198	if (!cpus_empty(cpumask))
				1199	set_cpus_allowed(tsk, cpumask);
				1200	current->reclaim_state = &reclaim_state;
				1201
				1202	/*
				1203	* Tell the memory management that we're a "memory allocator",
				1204	* and that if we need more memory we should get access to it
				1205	* regardless (see "__alloc_pages()"). "kswapd" should
				1206	* never get caught in the normal page freeing logic.
				1207	*
				1208	* (Kswapd normally doesn't need memory anyway, but sometimes
				1209	* you need a small amount of memory in order to be able to
				1210	* page out something else, and this flag essentially protects
				1211	* us from recursively trying to free more memory as we're
				1212	* trying to free the first piece of memory in the first place).
				1213	*/
				1214	tsk->flags \|= PF_MEMALLOC\|PF_KSWAPD;
				1215
				1216	order = 0;
				1217	for ( ; ; ) {
				1218	unsigned long new_order;
Christoph Lameter	3e1d1d2	2005-06-24 23:13:50 -0700	[diff] [blame]	1219
				1220	try_to_freeze();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1221
				1222	prepare_to_wait(&pgdat->kswapd_wait, &wait, TASK_INTERRUPTIBLE);
				1223	new_order = pgdat->kswapd_max_order;
				1224	pgdat->kswapd_max_order = 0;
				1225	if (order < new_order) {
				1226	/*
				1227	* Don't sleep if someone wants a larger 'order'
				1228	* allocation
				1229	*/
				1230	order = new_order;
				1231	} else {
				1232	schedule();
				1233	order = pgdat->kswapd_max_order;
				1234	}
				1235	finish_wait(&pgdat->kswapd_wait, &wait);
				1236
				1237	balance_pgdat(pgdat, 0, order);
				1238	}
				1239	return 0;
				1240	}
				1241
				1242	/*
				1243	* A zone is low on free memory, so wake its kswapd task to service it.
				1244	*/
				1245	void wakeup_kswapd(struct zone *zone, int order)
				1246	{
				1247	pg_data_t *pgdat;
				1248
				1249	if (zone->present_pages == 0)
				1250	return;
				1251
				1252	pgdat = zone->zone_pgdat;
				1253	if (zone_watermark_ok(zone, order, zone->pages_low, 0, 0, 0))
				1254	return;
				1255	if (pgdat->kswapd_max_order < order)
				1256	pgdat->kswapd_max_order = order;
				1257	if (!cpuset_zone_allowed(zone))
				1258	return;
				1259	if (!waitqueue_active(&zone->zone_pgdat->kswapd_wait))
				1260	return;
				1261	wake_up_interruptible(&zone->zone_pgdat->kswapd_wait);
				1262	}
				1263
				1264	#ifdef CONFIG_PM
				1265	/*
				1266	* Try to free `nr_pages' of memory, system-wide. Returns the number of freed
				1267	* pages.
				1268	*/
				1269	int shrink_all_memory(int nr_pages)
				1270	{
				1271	pg_data_t *pgdat;
				1272	int nr_to_free = nr_pages;
				1273	int ret = 0;
				1274	struct reclaim_state reclaim_state = {
				1275	.reclaimed_slab = 0,
				1276	};
				1277
				1278	current->reclaim_state = &reclaim_state;
				1279	for_each_pgdat(pgdat) {
				1280	int freed;
				1281	freed = balance_pgdat(pgdat, nr_to_free, 0);
				1282	ret += freed;
				1283	nr_to_free -= freed;
				1284	if (nr_to_free <= 0)
				1285	break;
				1286	}
				1287	current->reclaim_state = NULL;
				1288	return ret;
				1289	}
				1290	#endif
				1291
				1292	#ifdef CONFIG_HOTPLUG_CPU
				1293	/* It's optimal to keep kswapds on the same CPUs as their memory, but
				1294	not required for correctness. So if the last cpu in a node goes
				1295	away, we get changed to run anywhere: as the first one comes back,
				1296	restore their cpu bindings. */
				1297	static int __devinit cpu_callback(struct notifier_block *nfb,
				1298	unsigned long action,
				1299	void *hcpu)
				1300	{
				1301	pg_data_t *pgdat;
				1302	cpumask_t mask;
				1303
				1304	if (action == CPU_ONLINE) {
				1305	for_each_pgdat(pgdat) {
				1306	mask = node_to_cpumask(pgdat->node_id);
				1307	if (any_online_cpu(mask) != NR_CPUS)
				1308	/* One of our CPUs online: restore mask */
				1309	set_cpus_allowed(pgdat->kswapd, mask);
				1310	}
				1311	}
				1312	return NOTIFY_OK;
				1313	}
				1314	#endif /* CONFIG_HOTPLUG_CPU */
				1315
				1316	static int __init kswapd_init(void)
				1317	{
				1318	pg_data_t *pgdat;
				1319	swap_setup();
				1320	for_each_pgdat(pgdat)
				1321	pgdat->kswapd
				1322	= find_task_by_pid(kernel_thread(kswapd, pgdat, CLONE_KERNEL));
				1323	total_memory = nr_free_pagecache_pages();
				1324	hotcpu_notifier(cpu_callback, 0);
				1325	return 0;
				1326	}
				1327
				1328	module_init(kswapd_init)
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	1329
				1330
				1331	/*
				1332	* Try to free up some pages from this zone through reclaim.
				1333	*/
				1334	int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
				1335	{
				1336	struct scan_control sc;
				1337	int nr_pages = 1 << order;
				1338	int total_reclaimed = 0;
				1339
				1340	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
				1341	if (!(gfp_mask & __GFP_WAIT))
				1342	return 0;
				1343	if (zone->all_unreclaimable)
				1344	return 0;
				1345
				1346	sc.gfp_mask = gfp_mask;
				1347	sc.may_writepage = 0;
				1348	sc.may_swap = 0;
				1349	sc.nr_mapped = read_page_state(nr_mapped);
				1350	sc.nr_scanned = 0;
				1351	sc.nr_reclaimed = 0;
				1352	/* scan at the highest priority */
				1353	sc.priority = 0;
				1354
				1355	if (nr_pages > SWAP_CLUSTER_MAX)
				1356	sc.swap_cluster_max = nr_pages;
				1357	else
				1358	sc.swap_cluster_max = SWAP_CLUSTER_MAX;
				1359
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	1360	/* Don't reclaim the zone if there are other reclaimers active */
				1361	if (!atomic_inc_and_test(&zone->reclaim_in_progress))
				1362	goto out;
				1363
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	1364	shrink_zone(zone, &sc);
				1365	total_reclaimed = sc.nr_reclaimed;
				1366
Martin Hicks	1e7e5a9	2005-06-21 17:14:43 -0700	[diff] [blame]	1367	out:
				1368	atomic_dec(&zone->reclaim_in_progress);
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	1369	return total_reclaimed;
				1370	}
				1371
				1372	asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
				1373	unsigned int state)
				1374	{
				1375	struct zone *z;
				1376	int i;
				1377
				1378	if (node >= MAX_NUMNODES \|\| !node_online(node))
				1379	return -EINVAL;
				1380
				1381	/* This will break if we ever add more zones */
				1382	if (!(zone & (1<<ZONE_DMA\|1<<ZONE_NORMAL\|1<<ZONE_HIGHMEM)))
				1383	return -EINVAL;
				1384
				1385	for (i = 0; i < MAX_NR_ZONES; i++) {
				1386	if (!(zone & 1<<i))
				1387	continue;
				1388
				1389	z = &NODE_DATA(node)->node_zones[i];
				1390
				1391	if (state)
				1392	z->reclaim_pages = 1;
				1393	else
				1394	z->reclaim_pages = 0;
				1395	}
				1396
				1397	return 0;
				1398	}