Blame - mm/page_alloc.c - kernel/msm-4.9

blob: b9fd2c238f137ba135234dc8f2b1a4dbb749a462 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/page_alloc.c
				3	*
				4	* Manages the free list, the system allocates free pages here.
				5	* Note that kmalloc() lives in slab.c
				6	*
				7	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				8	* Swap reorganised 29.12.95, Stephen Tweedie
				9	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
				10	* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
				11	* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
				12	* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
				13	* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
				14	* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
				15	*/
				16
				17	#include <linux/config.h>
				18	#include <linux/stddef.h>
				19	#include <linux/mm.h>
				20	#include <linux/swap.h>
				21	#include <linux/interrupt.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/bootmem.h>
				24	#include <linux/compiler.h>
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	25	#include <linux/kernel.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	26	#include <linux/module.h>
				27	#include <linux/suspend.h>
				28	#include <linux/pagevec.h>
				29	#include <linux/blkdev.h>
				30	#include <linux/slab.h>
				31	#include <linux/notifier.h>
				32	#include <linux/topology.h>
				33	#include <linux/sysctl.h>
				34	#include <linux/cpu.h>
				35	#include <linux/cpuset.h>
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	36	#include <linux/memory_hotplug.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	37	#include <linux/nodemask.h>
				38	#include <linux/vmalloc.h>
				39
				40	#include <asm/tlbflush.h>
				41	#include "internal.h"
				42
				43	/*
				44	* MCD - HACK: Find somewhere to initialize this EARLY, or make this
				45	* initializer cleaner
				46	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	47	nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	48	EXPORT_SYMBOL(node_online_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	49	nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	50	EXPORT_SYMBOL(node_possible_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	51	struct pglist_data *pgdat_list __read_mostly;
Ravikiran G Thirumalai	6c231b7	2005-09-06 15:17:45 -0700	[diff] [blame]	52	unsigned long totalram_pages __read_mostly;
				53	unsigned long totalhigh_pages __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	54	long nr_swap_pages;
				55
David Howells	a226f6c	2006-01-06 00:11:08 -0800	[diff] [blame]	56	static void fastcall free_hot_cold_page(struct page *page, int cold);
				57
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	58	/*
				59	* results with 256, 32 in the lowmem_reserve sysctl:
				60	* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
				61	* 1G machine -> (16M dma, 784M normal, 224M high)
				62	* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
				63	* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
				64	* HIGHMEM allocation will (224M+784M)/256 of ram reserved in ZONE_DMA
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	65	*
				66	* TBD: should special case ZONE_DMA32 machines here - in those we normally
				67	* don't need any ZONE_NORMAL reservation
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	68	*/
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	69	int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	70
				71	EXPORT_SYMBOL(totalram_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	72
				73	/*
				74	* Used by page_zone() to look up the address of the struct zone whose
				75	* id is encoded in the upper bits of page->flags
				76	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	77	struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	78	EXPORT_SYMBOL(zone_table);
				79
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	80	static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	81	int min_free_kbytes = 1024;
				82
				83	unsigned long __initdata nr_kernel_pages;
				84	unsigned long __initdata nr_all_pages;
				85
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	86	#ifdef CONFIG_DEBUG_VM
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	87	static int page_outside_zone_boundaries(struct zone zone, struct page page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	88	{
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	89	int ret = 0;
				90	unsigned seq;
				91	unsigned long pfn = page_to_pfn(page);
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	92
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	93	do {
				94	seq = zone_span_seqbegin(zone);
				95	if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
				96	ret = 1;
				97	else if (pfn < zone->zone_start_pfn)
				98	ret = 1;
				99	} while (zone_span_seqretry(zone, seq));
				100
				101	return ret;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	102	}
				103
				104	static int page_is_consistent(struct zone zone, struct page page)
				105	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	106	#ifdef CONFIG_HOLES_IN_ZONE
				107	if (!pfn_valid(page_to_pfn(page)))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	108	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	109	#endif
				110	if (zone != page_zone(page))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	111	return 0;
				112
				113	return 1;
				114	}
				115	/*
				116	* Temporary debugging check for pages not lying within a given zone.
				117	*/
				118	static int bad_range(struct zone zone, struct page page)
				119	{
				120	if (page_outside_zone_boundaries(zone, page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	121	return 1;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	122	if (!page_is_consistent(zone, page))
				123	return 1;
				124
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	125	return 0;
				126	}
				127
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	128	#else
				129	static inline int bad_range(struct zone zone, struct page page)
				130	{
				131	return 0;
				132	}
				133	#endif
				134
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	135	static void bad_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	136	{
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	137	printk(KERN_EMERG "Bad page state in process '%s'\n"
				138	"page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
				139	"Trying to fix it up, but a reboot is needed\n"
				140	"Backtrace:\n",
				141	current->comm, page, (int)(2*sizeof(unsigned long)),
				142	(unsigned long)page->flags, page->mapping,
				143	page_mapcount(page), page_count(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	144	dump_stack();
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	145	page->flags &= ~(1 << PG_lru \|
				146	1 << PG_private \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	147	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	148	1 << PG_active \|
				149	1 << PG_dirty \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	150	1 << PG_reclaim \|
				151	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	152	1 << PG_swapcache \|
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	153	1 << PG_writeback );
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	154	set_page_count(page, 0);
				155	reset_page_mapcount(page);
				156	page->mapping = NULL;
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	157	add_taint(TAINT_BAD_PAGE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	158	}
				159
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	160	/*
				161	* Higher-order pages are called "compound pages". They are structured thusly:
				162	*
				163	* The first PAGE_SIZE page is called the "head page".
				164	*
				165	* The remaining PAGE_SIZE pages are called "tail pages".
				166	*
				167	* All pages have PG_compound set. All pages have their ->private pointing at
				168	* the head page (even the head page has this).
				169	*
				170	* The first tail page's ->mapping, if non-zero, holds the address of the
				171	* compound page's put_page() function.
				172	*
				173	* The order of the allocation is stored in the first tail page's ->index
				174	* This is only for debug at present. This usage means that zero-order pages
				175	* may not be compound.
				176	*/
				177	static void prep_compound_page(struct page *page, unsigned long order)
				178	{
				179	int i;
				180	int nr_pages = 1 << order;
				181
				182	page[1].mapping = NULL;
				183	page[1].index = order;
				184	for (i = 0; i < nr_pages; i++) {
				185	struct page *p = page + i;
				186
				187	SetPageCompound(p);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	188	set_page_private(p, (unsigned long)page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	189	}
				190	}
				191
				192	static void destroy_compound_page(struct page *page, unsigned long order)
				193	{
				194	int i;
				195	int nr_pages = 1 << order;
				196
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	197	if (unlikely(page[1].index != order))
				198	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	199
				200	for (i = 0; i < nr_pages; i++) {
				201	struct page *p = page + i;
				202
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	203	if (unlikely(!PageCompound(p) \|
				204	(page_private(p) != (unsigned long)page)))
				205	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	206	ClearPageCompound(p);
				207	}
				208	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	209
				210	/*
				211	* function for dealing with page's order in buddy system.
				212	* zone->lock is already acquired when we use these.
				213	* So, we don't need atomic page->flags operations here.
				214	*/
				215	static inline unsigned long page_order(struct page *page) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	216	return page_private(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	217	}
				218
				219	static inline void set_page_order(struct page *page, int order) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	220	set_page_private(page, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	221	__SetPagePrivate(page);
				222	}
				223
				224	static inline void rmv_page_order(struct page *page)
				225	{
				226	__ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	227	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	228	}
				229
				230	/*
				231	* Locate the struct page for both the matching buddy in our
				232	* pair (buddy1) and the combined O(n+1) page they form (page).
				233	*
				234	* 1) Any buddy B1 will have an order O twin B2 which satisfies
				235	* the following equation:
				236	* B2 = B1 ^ (1 << O)
				237	* For example, if the starting buddy (buddy2) is #8 its order
				238	* 1 buddy is #10:
				239	* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
				240	*
				241	* 2) Any buddy B will have an order O+1 parent P which
				242	* satisfies the following equation:
				243	* P = B & ~(1 << O)
				244	*
				245	* Assumption: *_mem_map is contigious at least up to MAX_ORDER
				246	*/
				247	static inline struct page *
				248	__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
				249	{
				250	unsigned long buddy_idx = page_idx ^ (1 << order);
				251
				252	return page + (buddy_idx - page_idx);
				253	}
				254
				255	static inline unsigned long
				256	__find_combined_index(unsigned long page_idx, unsigned int order)
				257	{
				258	return (page_idx & ~(1 << order));
				259	}
				260
				261	/*
				262	* This function checks whether a page is free && is the buddy
				263	* we can do coalesce a page and its buddy if
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	264	* (a) the buddy is not in a hole &&
				265	* (b) the buddy is free &&
				266	* (c) the buddy is on the buddy system &&
				267	* (d) a page and its buddy have the same order.
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	268	* for recording page's order, we use page_private(page) and PG_private.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	269	*
				270	*/
				271	static inline int page_is_buddy(struct page *page, int order)
				272	{
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	273	#ifdef CONFIG_HOLES_IN_ZONE
				274	if (!pfn_valid(page_to_pfn(page)))
				275	return 0;
				276	#endif
				277
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	278	if (PagePrivate(page) &&
				279	(page_order(page) == order) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	280	page_count(page) == 0)
				281	return 1;
				282	return 0;
				283	}
				284
				285	/*
				286	* Freeing function for a buddy system allocator.
				287	*
				288	* The concept of a buddy system is to maintain direct-mapped table
				289	* (containing bit values) for memory blocks of various "orders".
				290	* The bottom level table contains the map for the smallest allocatable
				291	* units of memory (here, pages), and each level above it describes
				292	* pairs of units from the levels below, hence, "buddies".
				293	* At a high level, all that happens here is marking the table entry
				294	* at the bottom level available, and propagating the changes upward
				295	* as necessary, plus some accounting needed to play nicely with other
				296	* parts of the VM system.
				297	* At each level, we keep a list of pages, which are heads of continuous
				298	* free pages of length of (1 << order) and marked with PG_Private.Page's
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	299	* order is recorded in page_private(page) field.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	300	* So when we are allocating or freeing one, we can derive the state of the
				301	* other. That is, if we allocate a small block, and both were
				302	* free, the remainder of the region must be split into blocks.
				303	* If a block is freed, and its buddy is also free, then this
				304	* triggers coalescing into a block of larger size.
				305	*
				306	* -- wli
				307	*/
				308
				309	static inline void __free_pages_bulk (struct page *page,
				310	struct zone *zone, unsigned int order)
				311	{
				312	unsigned long page_idx;
				313	int order_size = 1 << order;
				314
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	315	if (unlikely(PageCompound(page)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	316	destroy_compound_page(page, order);
				317
				318	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
				319
				320	BUG_ON(page_idx & (order_size - 1));
				321	BUG_ON(bad_range(zone, page));
				322
				323	zone->free_pages += order_size;
				324	while (order < MAX_ORDER-1) {
				325	unsigned long combined_idx;
				326	struct free_area *area;
				327	struct page *buddy;
				328
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	329	buddy = __page_find_buddy(page, page_idx, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	330	if (!page_is_buddy(buddy, order))
				331	break; /* Move the buddy up one level. */
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	332
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	333	list_del(&buddy->lru);
				334	area = zone->free_area + order;
				335	area->nr_free--;
				336	rmv_page_order(buddy);
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	337	combined_idx = __find_combined_index(page_idx, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	338	page = page + (combined_idx - page_idx);
				339	page_idx = combined_idx;
				340	order++;
				341	}
				342	set_page_order(page, order);
				343	list_add(&page->lru, &zone->free_area[order].free_list);
				344	zone->free_area[order].nr_free++;
				345	}
				346
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	347	static inline int free_pages_check(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	{
Nick Piggin	92be2e33	2006-01-06 00:10:57 -0800	[diff] [blame]	349	if (unlikely(page_mapcount(page) \|
				350	(page->mapping != NULL) \|
				351	(page_count(page) != 0) \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	352	(page->flags & (
				353	1 << PG_lru \|
				354	1 << PG_private \|
				355	1 << PG_locked \|
				356	1 << PG_active \|
				357	1 << PG_reclaim \|
				358	1 << PG_slab \|
				359	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	360	1 << PG_writeback \|
Nick Piggin	92be2e33	2006-01-06 00:10:57 -0800	[diff] [blame]	361	1 << PG_reserved ))))
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	362	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	363	if (PageDirty(page))
Nick Piggin	242e546	2005-09-03 15:54:50 -0700	[diff] [blame]	364	__ClearPageDirty(page);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	365	/*
				366	* For now, we report if PG_reserved was found set, but do not
				367	* clear it, and do not free the page. But we shall soon need
				368	* to do more, for when the ZERO_PAGE count wraps negative.
				369	*/
				370	return PageReserved(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	371	}
				372
				373	/*
				374	* Frees a list of pages.
				375	* Assumes all pages on list are in same zone, and of same order.
Renaud Lienhart	207f36e	2005-09-10 00:26:59 -0700	[diff] [blame]	376	* count is the number of pages to free.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	377	*
				378	* If the zone was previously in an "all pages pinned" state then look to
				379	* see if this freeing clears that state.
				380	*
				381	* And clear the zone's pages_scanned counter, to hold off the "all pages are
				382	* pinned" detection logic.
				383	*/
				384	static int
				385	free_pages_bulk(struct zone *zone, int count,
				386	struct list_head *list, unsigned int order)
				387	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	388	struct page *page = NULL;
				389	int ret = 0;
				390
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	391	spin_lock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	392	zone->all_unreclaimable = 0;
				393	zone->pages_scanned = 0;
				394	while (!list_empty(list) && count--) {
				395	page = list_entry(list->prev, struct page, lru);
				396	/* have to delete it as __free_pages_bulk list manipulates */
				397	list_del(&page->lru);
				398	__free_pages_bulk(page, zone, order);
				399	ret++;
				400	}
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	401	spin_unlock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	402	return ret;
				403	}
				404
				405	void __free_pages_ok(struct page *page, unsigned int order)
				406	{
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	407	unsigned long flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	408	LIST_HEAD(list);
				409	int i;
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	410	int reserved = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	411
				412	arch_free_page(page, order);
				413
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	414	#ifndef CONFIG_MMU
				415	if (order > 0)
				416	for (i = 1 ; i < (1 << order) ; ++i)
				417	__put_page(page + i);
				418	#endif
				419
				420	for (i = 0 ; i < (1 << order) ; ++i)
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	421	reserved += free_pages_check(page + i);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	422	if (reserved)
				423	return;
				424
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	425	list_add(&page->lru, &list);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	426	mod_page_state(pgfree, 1 << order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	427	kernel_map_pages(page, 1<<order, 0);
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	428	local_irq_save(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	429	free_pages_bulk(page_zone(page), 1, &list, order);
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	430	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	431	}
				432
David Howells	a226f6c	2006-01-06 00:11:08 -0800	[diff] [blame]	433	/*
				434	* permit the bootmem allocator to evade page validation on high-order frees
				435	*/
				436	void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
				437	{
				438	if (order == 0) {
				439	__ClearPageReserved(page);
				440	set_page_count(page, 0);
				441
				442	free_hot_cold_page(page, 0);
				443	} else {
				444	LIST_HEAD(list);
				445	int loop;
				446
				447	for (loop = 0; loop < BITS_PER_LONG; loop++) {
				448	struct page *p = &page[loop];
				449
				450	if (loop + 16 < BITS_PER_LONG)
				451	prefetchw(p + 16);
				452	__ClearPageReserved(p);
				453	set_page_count(p, 0);
				454	}
				455
				456	arch_free_page(page, order);
				457
				458	mod_page_state(pgfree, 1 << order);
				459
				460	list_add(&page->lru, &list);
				461	kernel_map_pages(page, 1 << order, 0);
				462	free_pages_bulk(page_zone(page), 1, &list, order);
				463	}
				464	}
				465
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	466
				467	/*
				468	* The order of subdivision here is critical for the IO subsystem.
				469	* Please do not alter this order without good reasons and regression
				470	* testing. Specifically, as large blocks of memory are subdivided,
				471	* the order in which smaller blocks are delivered depends on the order
				472	* they're subdivided in this function. This is the primary factor
				473	* influencing the order in which pages are delivered to the IO
				474	* subsystem according to empirical testing, and this is also justified
				475	* by considering the behavior of a buddy system containing a single
				476	* large block of memory acted on by a series of small allocations.
				477	* This behavior is a critical factor in sglist merging's success.
				478	*
				479	* -- wli
				480	*/
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	481	static inline void expand(struct zone zone, struct page page,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	482	int low, int high, struct free_area *area)
				483	{
				484	unsigned long size = 1 << high;
				485
				486	while (high > low) {
				487	area--;
				488	high--;
				489	size >>= 1;
				490	BUG_ON(bad_range(zone, &page[size]));
				491	list_add(&page[size].lru, &area->free_list);
				492	area->nr_free++;
				493	set_page_order(&page[size], high);
				494	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	495	}
				496
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	497	/*
				498	* This page is about to be returned from the page allocator
				499	*/
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	500	static int prep_new_page(struct page *page, int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	501	{
Nick Piggin	92be2e33	2006-01-06 00:10:57 -0800	[diff] [blame]	502	if (unlikely(page_mapcount(page) \|
				503	(page->mapping != NULL) \|
				504	(page_count(page) != 0) \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	505	(page->flags & (
				506	1 << PG_lru \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	507	1 << PG_private \|
				508	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	509	1 << PG_active \|
				510	1 << PG_dirty \|
				511	1 << PG_reclaim \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	512	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	513	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	514	1 << PG_writeback \|
Nick Piggin	92be2e33	2006-01-06 00:10:57 -0800	[diff] [blame]	515	1 << PG_reserved ))))
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	516	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	517
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	518	/*
				519	* For now, we report if PG_reserved was found set, but do not
				520	* clear it, and do not allocate the page: as a safety net.
				521	*/
				522	if (PageReserved(page))
				523	return 1;
				524
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	525	page->flags &= ~(1 << PG_uptodate \| 1 << PG_error \|
				526	1 << PG_referenced \| 1 << PG_arch_1 \|
				527	1 << PG_checked \| 1 << PG_mappedtodisk);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	528	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	529	set_page_refs(page, order);
				530	kernel_map_pages(page, 1 << order, 1);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	531	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	532	}
				533
				534	/*
				535	* Do the hard work of removing an element from the buddy allocator.
				536	* Call me with the zone->lock already held.
				537	*/
				538	static struct page __rmqueue(struct zone zone, unsigned int order)
				539	{
				540	struct free_area * area;
				541	unsigned int current_order;
				542	struct page *page;
				543
				544	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
				545	area = zone->free_area + current_order;
				546	if (list_empty(&area->free_list))
				547	continue;
				548
				549	page = list_entry(area->free_list.next, struct page, lru);
				550	list_del(&page->lru);
				551	rmv_page_order(page);
				552	area->nr_free--;
				553	zone->free_pages -= 1UL << order;
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	554	expand(zone, page, order, current_order, area);
				555	return page;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	556	}
				557
				558	return NULL;
				559	}
				560
				561	/*
				562	* Obtain a specified number of elements from the buddy allocator, all under
				563	* a single hold of the lock, for efficiency. Add them to the supplied list.
				564	* Returns the number of new pages which were placed at *list.
				565	*/
				566	static int rmqueue_bulk(struct zone *zone, unsigned int order,
				567	unsigned long count, struct list_head *list)
				568	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	569	int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	570
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	571	spin_lock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	572	for (i = 0; i < count; ++i) {
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	573	struct page *page = __rmqueue(zone, order);
				574	if (unlikely(page == NULL))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	575	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	576	list_add_tail(&page->lru, list);
				577	}
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	578	spin_unlock(&zone->lock);
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	579	return i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	580	}
				581
#ifdef CONFIG_NUMA
/*
 * Called from the slab reaper to drain remote pagesets.
 *
 * With local interrupts off, walks every zone that does not belong to the
 * current node and returns this CPU's per-cpu pages for that zone to the
 * buddy allocator.
 */
void drain_remote_pages(void)
{
	unsigned long flags;
	struct zone *zone;
	int idx;

	local_irq_save(flags);
	for_each_zone(zone) {
		struct per_cpu_pageset *pageset;

		/* Do not drain local pagesets */
		if (zone->zone_pgdat->node_id == numa_node_id())
			continue;

		pageset = zone->pageset[smp_processor_id()];
		for (idx = 0; idx < ARRAY_SIZE(pageset->pcp); idx++) {
			struct per_cpu_pages *pcp = &pageset->pcp[idx];

			if (!pcp->count)
				continue;

			pcp->count -= free_pages_bulk(zone, pcp->count,
						&pcp->list, 0);
		}
	}
	local_irq_restore(flags);
}
#endif
				611
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
/*
 * __drain_pages - return @cpu's per-cpu pages, in every zone, to the buddy
 * allocator.  Interrupts are disabled around each individual list drain.
 */
static void __drain_pages(unsigned int cpu)
{
	unsigned long flags;
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		pset = zone_pcp(zone, cpu);
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			local_irq_save(flags);
			pcp->count -= free_pages_bulk(zone, pcp->count,
						&pcp->list, 0);
			local_irq_restore(flags);
		}
	}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
				635
				636	#ifdef CONFIG_PM
				637
				638	void mark_free_pages(struct zone *zone)
				639	{
				640	unsigned long zone_pfn, flags;
				641	int order;
				642	struct list_head *curr;
				643
				644	if (!zone->spanned_pages)
				645	return;
				646
				647	spin_lock_irqsave(&zone->lock, flags);
				648	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
				649	ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
				650
				651	for (order = MAX_ORDER - 1; order >= 0; --order)
				652	list_for_each(curr, &zone->free_area[order].free_list) {
				653	unsigned long start_pfn, i;
				654
				655	start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
				656
				657	for (i=0; i < (1<<order); i++)
				658	SetPageNosaveFree(pfn_to_page(start_pfn+i));
				659	}
				660	spin_unlock_irqrestore(&zone->lock, flags);
				661	}
				662
				663	/*
				664	* Spill all of this CPU's per-cpu pages back into the buddy allocator.
				665	*/
				666	void drain_local_pages(void)
				667	{
				668	unsigned long flags;
				669
				670	local_irq_save(flags);
				671	__drain_pages(smp_processor_id());
				672	local_irq_restore(flags);
				673	}
				674	#endif /* CONFIG_PM */
				675
				676	static void zone_statistics(struct zonelist zonelist, struct zone z)
				677	{
				678	#ifdef CONFIG_NUMA
				679	unsigned long flags;
				680	int cpu;
				681	pg_data_t *pg = z->zone_pgdat;
				682	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
				683	struct per_cpu_pageset *p;
				684
				685	local_irq_save(flags);
				686	cpu = smp_processor_id();
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	687	p = zone_pcp(z,cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	688	if (pg == orig) {
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	689	p->numa_hit++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	690	} else {
				691	p->numa_miss++;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	692	zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	693	}
				694	if (pg == NODE_DATA(numa_node_id()))
				695	p->local_node++;
				696	else
				697	p->other_node++;
				698	local_irq_restore(flags);
				699	#endif
				700	}
				701
				702	/*
				703	* Free a 0-order page
				704	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	705	static void fastcall free_hot_cold_page(struct page *page, int cold)
				706	{
				707	struct zone *zone = page_zone(page);
				708	struct per_cpu_pages *pcp;
				709	unsigned long flags;
				710
				711	arch_free_page(page, 0);
				712
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	713	if (PageAnon(page))
				714	page->mapping = NULL;
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame^]	715	if (free_pages_check(page))
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	716	return;
				717
				718	inc_page_state(pgfree);
				719	kernel_map_pages(page, 1, 0);
				720
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	721	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	722	local_irq_save(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	723	list_add(&page->lru, &pcp->list);
				724	pcp->count++;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	725	if (pcp->count >= pcp->high)
				726	pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	727	local_irq_restore(flags);
				728	put_cpu();
				729	}
				730
				731	void fastcall free_hot_page(struct page *page)
				732	{
				733	free_hot_cold_page(page, 0);
				734	}
				735
				736	void fastcall free_cold_page(struct page *page)
				737	{
				738	free_hot_cold_page(page, 1);
				739	}
				740
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	741	static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	742	{
				743	int i;
				744
				745	BUG_ON((gfp_flags & (__GFP_WAIT \| __GFP_HIGHMEM)) == __GFP_HIGHMEM);
				746	for(i = 0; i < (1 << order); i++)
				747	clear_highpage(page + i);
				748	}
				749
				750	/*
				751	* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
				752	* we cheat by calling it from here, in the order > 0 path. Saves a branch
				753	* or two.
				754	*/
				755	static struct page *
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	756	buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	757	{
				758	unsigned long flags;
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	759	struct page *page;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	760	int cold = !!(gfp_flags & __GFP_COLD);
				761
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	762	again:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	763	if (order == 0) {
				764	struct per_cpu_pages *pcp;
				765
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	766	page = NULL;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	767	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	768	local_irq_save(flags);
Nick Piggin	2d92c5c	2006-01-06 00:10:59 -0800	[diff] [blame]	769	if (!pcp->count)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	770	pcp->count += rmqueue_bulk(zone, 0,
				771	pcp->batch, &pcp->list);
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	772	if (likely(pcp->count)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	773	page = list_entry(pcp->list.next, struct page, lru);
				774	list_del(&page->lru);
				775	pcp->count--;
				776	}
				777	local_irq_restore(flags);
				778	put_cpu();
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	779	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	780	spin_lock_irqsave(&zone->lock, flags);
				781	page = __rmqueue(zone, order);
				782	spin_unlock_irqrestore(&zone->lock, flags);
				783	}
				784
				785	if (page != NULL) {
				786	BUG_ON(bad_range(zone, page));
				787	mod_page_state_zone(zone, pgalloc, 1 << order);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	788	if (prep_new_page(page, order))
				789	goto again;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	790
				791	if (gfp_flags & __GFP_ZERO)
				792	prep_zero_page(page, order, gfp_flags);
				793
				794	if (order && (gfp_flags & __GFP_COMP))
				795	prep_compound_page(page, order);
				796	}
				797	return page;
				798	}
				799
/* Bit flags passed as alloc_flags to the allocation helpers below. */
#define ALLOC_NO_WATERMARKS	(1 << 0) /* don't check watermarks at all */
#define ALLOC_WMARK_MIN		(1 << 1) /* use pages_min watermark */
#define ALLOC_WMARK_LOW		(1 << 2) /* use pages_low watermark */
#define ALLOC_WMARK_HIGH	(1 << 3) /* use pages_high watermark */
#define ALLOC_HARDER		(1 << 4) /* try to alloc harder */
#define ALLOC_HIGH		(1 << 5) /* __GFP_HIGH set */
#define ALLOC_CPUSET		(1 << 6) /* check for correct cpuset */
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	807
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	808	/*
				809	* Return 1 if free pages are above 'mark'. This takes into account the order
				810	* of the allocation.
				811	*/
				812	int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	813	int classzone_idx, int alloc_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	814	{
				815	/* free_pages my go negative - that's OK */
				816	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
				817	int o;
				818
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	819	if (alloc_flags & ALLOC_HIGH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	820	min -= min / 2;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	821	if (alloc_flags & ALLOC_HARDER)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	822	min -= min / 4;
				823
				824	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
				825	return 0;
				826	for (o = 0; o < order; o++) {
				827	/* At the next order, this order's pages become unavailable */
				828	free_pages -= z->free_area[o].nr_free << o;
				829
				830	/* Require fewer higher order pages to be free */
				831	min >>= 1;
				832
				833	if (free_pages <= min)
				834	return 0;
				835	}
				836	return 1;
				837	}
				838
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	839	/*
				840	* get_page_from_freelist goes through the zonelist trying to allocate
				841	* a page.
				842	*/
				843	static struct page *
				844	get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
				845	struct zonelist *zonelist, int alloc_flags)
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	846	{
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	847	struct zone **z = zonelist->zones;
				848	struct page *page = NULL;
				849	int classzone_idx = zone_idx(*z);
				850
				851	/*
				852	* Go through the zonelist once, looking for a zone with enough free.
				853	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
				854	*/
				855	do {
				856	if ((alloc_flags & ALLOC_CPUSET) &&
				857	!cpuset_zone_allowed(*z, gfp_mask))
				858	continue;
				859
				860	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	861	unsigned long mark;
				862	if (alloc_flags & ALLOC_WMARK_MIN)
				863	mark = (*z)->pages_min;
				864	else if (alloc_flags & ALLOC_WMARK_LOW)
				865	mark = (*z)->pages_low;
				866	else
				867	mark = (*z)->pages_high;
				868	if (!zone_watermark_ok(*z, order, mark,
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	869	classzone_idx, alloc_flags))
				870	continue;
				871	}
				872
				873	page = buffered_rmqueue(*z, order, gfp_mask);
				874	if (page) {
				875	zone_statistics(zonelist, *z);
				876	break;
				877	}
				878	} while (*(++z) != NULL);
				879	return page;
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	880	}
				881
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	882	/*
				883	* This is the 'heart' of the zoned buddy allocator.
				884	*/
				885	struct page * fastcall
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	886	__alloc_pages(gfp_t gfp_mask, unsigned int order,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	887	struct zonelist *zonelist)
				888	{
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	889	const gfp_t wait = gfp_mask & __GFP_WAIT;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	890	struct zone **z;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	891	struct page *page;
				892	struct reclaim_state reclaim_state;
				893	struct task_struct *p = current;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	894	int do_retry;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	895	int alloc_flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	896	int did_some_progress;
				897
				898	might_sleep_if(wait);
				899
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	900	restart:
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	901	z = zonelist->zones; /* the list of zones suitable for gfp_mask */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	902
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	903	if (unlikely(*z == NULL)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	904	/* Should this ever happen?? */
				905	return NULL;
				906	}
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	907
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	908	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	909	zonelist, ALLOC_WMARK_LOW\|ALLOC_CPUSET);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	910	if (page)
				911	goto got_pg;
				912
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	913	do {
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	914	wakeup_kswapd(*z, order);
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	915	} while (*(++z));
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	916
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	917	/*
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	918	* OK, we're below the kswapd watermark and have kicked background
				919	* reclaim. Now things get more complex, so set up alloc_flags according
				920	* to how we want to proceed.
				921	*
				922	* The caller may dip into page reserves a bit more if the caller
				923	* cannot run direct reclaim, or if the caller has realtime scheduling
				924	* policy.
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	925	*/
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	926	alloc_flags = ALLOC_WMARK_MIN;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	927	if ((unlikely(rt_task(p)) && !in_interrupt()) \|\| !wait)
				928	alloc_flags \|= ALLOC_HARDER;
				929	if (gfp_mask & __GFP_HIGH)
				930	alloc_flags \|= ALLOC_HIGH;
Paul Jackson	47f3a86	2006-01-06 00:10:32 -0800	[diff] [blame]	931	alloc_flags \|= ALLOC_CPUSET;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	932
				933	/*
				934	* Go through the zonelist again. Let __GFP_HIGH and allocations
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	935	* coming from realtime tasks go deeper into reserves.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	936	*
				937	* This is the last chance, in general, before the goto nopage.
				938	* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	939	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	940	*/
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	941	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
				942	if (page)
				943	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	944
				945	/* This allocation should allow future memory freeing. */
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	946
				947	if (((p->flags & PF_MEMALLOC) \|\| unlikely(test_thread_flag(TIF_MEMDIE)))
				948	&& !in_interrupt()) {
				949	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
Kirill Korotaev	885036d	2005-11-13 16:06:41 -0800	[diff] [blame]	950	nofail_alloc:
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	951	/* go through the zonelist yet again, ignoring mins */
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	952	page = get_page_from_freelist(gfp_mask, order,
Paul Jackson	47f3a86	2006-01-06 00:10:32 -0800	[diff] [blame]	953	zonelist, ALLOC_NO_WATERMARKS);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	954	if (page)
				955	goto got_pg;
Kirill Korotaev	885036d	2005-11-13 16:06:41 -0800	[diff] [blame]	956	if (gfp_mask & __GFP_NOFAIL) {
				957	blk_congestion_wait(WRITE, HZ/50);
				958	goto nofail_alloc;
				959	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	960	}
				961	goto nopage;
				962	}
				963
				964	/* Atomic allocations - we can't balance anything */
				965	if (!wait)
				966	goto nopage;
				967
				968	rebalance:
				969	cond_resched();
				970
				971	/* We now go into synchronous reclaim */
				972	p->flags \|= PF_MEMALLOC;
				973	reclaim_state.reclaimed_slab = 0;
				974	p->reclaim_state = &reclaim_state;
				975
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	976	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	977
				978	p->reclaim_state = NULL;
				979	p->flags &= ~PF_MEMALLOC;
				980
				981	cond_resched();
				982
				983	if (likely(did_some_progress)) {
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	984	page = get_page_from_freelist(gfp_mask, order,
				985	zonelist, alloc_flags);
				986	if (page)
				987	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	988	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
				989	/*
				990	* Go through the zonelist yet one more time, keep
				991	* very high watermark here, this is only to catch
				992	* a parallel oom killing, we must fail if we're still
				993	* under heavy pressure.
				994	*/
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	995	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	996	zonelist, ALLOC_WMARK_HIGH\|ALLOC_CPUSET);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	997	if (page)
				998	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	999
Marcelo Tosatti	79b9ce3	2005-07-07 17:56:04 -0700	[diff] [blame]	1000	out_of_memory(gfp_mask, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1001	goto restart;
				1002	}
				1003
				1004	/*
				1005	* Don't let big-order allocations loop unless the caller explicitly
				1006	* requests that. Wait for some write requests to complete then retry.
				1007	*
				1008	* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
				1009	* <= 3, but that may not be true in other implementations.
				1010	*/
				1011	do_retry = 0;
				1012	if (!(gfp_mask & __GFP_NORETRY)) {
				1013	if ((order <= 3) \|\| (gfp_mask & __GFP_REPEAT))
				1014	do_retry = 1;
				1015	if (gfp_mask & __GFP_NOFAIL)
				1016	do_retry = 1;
				1017	}
				1018	if (do_retry) {
				1019	blk_congestion_wait(WRITE, HZ/50);
				1020	goto rebalance;
				1021	}
				1022
				1023	nopage:
				1024	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
				1025	printk(KERN_WARNING "%s: page allocation failure."
				1026	" order:%d, mode:0x%x\n",
				1027	p->comm, order, gfp_mask);
				1028	dump_stack();
Janet Morgan	578c2fd	2005-06-21 17:14:56 -0700	[diff] [blame]	1029	show_mem();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1030	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1031	got_pg:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1032	return page;
				1033	}
				1034
				1035	EXPORT_SYMBOL(__alloc_pages);
				1036
				1037	/*
				1038	* Common helper functions.
				1039	*/
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	1040	fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1041	{
				1042	struct page * page;
				1043	page = alloc_pages(gfp_mask, order);
				1044	if (!page)
				1045	return 0;
				1046	return (unsigned long) page_address(page);
				1047	}
				1048
				1049	EXPORT_SYMBOL(__get_free_pages);
				1050
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	1051	fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1052	{
				1053	struct page * page;
				1054
				1055	/*
				1056	* get_zeroed_page() returns a 32-bit address, which cannot represent
				1057	* a highmem page
				1058	*/
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1059	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1060
				1061	page = alloc_pages(gfp_mask \| __GFP_ZERO, 0);
				1062	if (page)
				1063	return (unsigned long) page_address(page);
				1064	return 0;
				1065	}
				1066
				1067	EXPORT_SYMBOL(get_zeroed_page);
				1068
				1069	void __pagevec_free(struct pagevec *pvec)
				1070	{
				1071	int i = pagevec_count(pvec);
				1072
				1073	while (--i >= 0)
				1074	free_hot_cold_page(pvec->pages[i], pvec->cold);
				1075	}
				1076
				1077	fastcall void __free_pages(struct page *page, unsigned int order)
				1078	{
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1079	if (put_page_testzero(page)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1080	if (order == 0)
				1081	free_hot_page(page);
				1082	else
				1083	__free_pages_ok(page, order);
				1084	}
				1085	}
				1086
				1087	EXPORT_SYMBOL(__free_pages);
				1088
				1089	fastcall void free_pages(unsigned long addr, unsigned int order)
				1090	{
				1091	if (addr != 0) {
				1092	BUG_ON(!virt_addr_valid((void *)addr));
				1093	__free_pages(virt_to_page((void *)addr), order);
				1094	}
				1095	}
				1096
				1097	EXPORT_SYMBOL(free_pages);
				1098
				1099	/*
				1100	* Total amount of free (allocatable) RAM:
				1101	*/
				1102	unsigned int nr_free_pages(void)
				1103	{
				1104	unsigned int sum = 0;
				1105	struct zone *zone;
				1106
				1107	for_each_zone(zone)
				1108	sum += zone->free_pages;
				1109
				1110	return sum;
				1111	}
				1112
				1113	EXPORT_SYMBOL(nr_free_pages);
				1114
				1115	#ifdef CONFIG_NUMA
				1116	unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
				1117	{
				1118	unsigned int i, sum = 0;
				1119
				1120	for (i = 0; i < MAX_NR_ZONES; i++)
				1121	sum += pgdat->node_zones[i].free_pages;
				1122
				1123	return sum;
				1124	}
				1125	#endif
				1126
				1127	static unsigned int nr_free_zone_pages(int offset)
				1128	{
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1129	/* Just pick one node, since fallback list is circular */
				1130	pg_data_t *pgdat = NODE_DATA(numa_node_id());
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1131	unsigned int sum = 0;
				1132
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1133	struct zonelist *zonelist = pgdat->node_zonelists + offset;
				1134	struct zone **zonep = zonelist->zones;
				1135	struct zone *zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1136
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1137	for (zone = zonep++; zone; zone = zonep++) {
				1138	unsigned long size = zone->present_pages;
				1139	unsigned long high = zone->pages_high;
				1140	if (size > high)
				1141	sum += size - high;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1142	}
				1143
				1144	return sum;
				1145	}
				1146
				1147	/*
				1148	* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
				1149	*/
				1150	unsigned int nr_free_buffer_pages(void)
				1151	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1152	return nr_free_zone_pages(gfp_zone(GFP_USER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1153	}
				1154
				1155	/*
				1156	* Amount of free RAM allocatable within all zones
				1157	*/
				1158	unsigned int nr_free_pagecache_pages(void)
				1159	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1160	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1161	}
				1162
				1163	#ifdef CONFIG_HIGHMEM
				1164	unsigned int nr_free_highpages (void)
				1165	{
				1166	pg_data_t *pgdat;
				1167	unsigned int pages = 0;
				1168
				1169	for_each_pgdat(pgdat)
				1170	pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1171
				1172	return pages;
				1173	}
				1174	#endif
				1175
				1176	#ifdef CONFIG_NUMA
				1177	static void show_node(struct zone *zone)
				1178	{
				1179	printk("Node %d ", zone->zone_pgdat->node_id);
				1180	}
				1181	#else
				1182	#define show_node(zone) do { } while (0)
				1183	#endif
				1184
				1185	/*
				1186	* Accumulate the page_state information across all CPUs.
				1187	* The result is unavoidably approximate - it can change
				1188	* during and after execution of this function.
				1189	*/
				1190	static DEFINE_PER_CPU(struct page_state, page_states) = {0};
				1191
				1192	atomic_t nr_pagecache = ATOMIC_INIT(0);
				1193	EXPORT_SYMBOL(nr_pagecache);
				1194	#ifdef CONFIG_SMP
				1195	DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
				1196	#endif
				1197
Nick Piggin	a86b1f5	2006-01-06 00:11:00 -0800	[diff] [blame]	1198	static void __get_page_state(struct page_state ret, int nr, cpumask_t cpumask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1199	{
				1200	int cpu = 0;
				1201
				1202	memset(ret, 0, sizeof(*ret));
				1203
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1204	cpu = first_cpu(*cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1205	while (cpu < NR_CPUS) {
				1206	unsigned long in, out, off;
				1207
				1208	in = (unsigned long *)&per_cpu(page_states, cpu);
				1209
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1210	cpu = next_cpu(cpu, *cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1211
				1212	if (cpu < NR_CPUS)
				1213	prefetch(&per_cpu(page_states, cpu));
				1214
				1215	out = (unsigned long *)ret;
				1216	for (off = 0; off < nr; off++)
				1217	out++ += in++;
				1218	}
				1219	}
				1220
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1221	void get_page_state_node(struct page_state *ret, int node)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1222	{
				1223	int nr;
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1224	cpumask_t mask = node_to_cpumask(node);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1225
				1226	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1227	nr /= sizeof(unsigned long);
				1228
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1229	__get_page_state(ret, nr+1, &mask);
				1230	}
				1231
				1232	void get_page_state(struct page_state *ret)
				1233	{
				1234	int nr;
				1235	cpumask_t mask = CPU_MASK_ALL;
				1236
				1237	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1238	nr /= sizeof(unsigned long);
				1239
				1240	__get_page_state(ret, nr + 1, &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1241	}
				1242
				1243	void get_full_page_state(struct page_state *ret)
				1244	{
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1245	cpumask_t mask = CPU_MASK_ALL;
				1246
				1247	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1248	}
				1249
Benjamin LaHaise	c2f29ea	2005-06-21 17:14:55 -0700	[diff] [blame]	1250	unsigned long __read_page_state(unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1251	{
				1252	unsigned long ret = 0;
				1253	int cpu;
				1254
Nick Piggin	a86b1f5	2006-01-06 00:11:00 -0800	[diff] [blame]	1255	for_each_cpu(cpu) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1256	unsigned long in;
				1257
				1258	in = (unsigned long)&per_cpu(page_states, cpu) + offset;
				1259	ret += ((unsigned long )in);
				1260	}
				1261	return ret;
				1262	}
				1263
Benjamin LaHaise	83e5d8f	2005-06-21 17:14:54 -0700	[diff] [blame]	1264	void __mod_page_state(unsigned long offset, unsigned long delta)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1265	{
				1266	unsigned long flags;
				1267	void* ptr;
				1268
				1269	local_irq_save(flags);
				1270	ptr = &__get_cpu_var(page_states);
				1271	(unsigned long)(ptr + offset) += delta;
				1272	local_irq_restore(flags);
				1273	}
				1274
				1275	EXPORT_SYMBOL(__mod_page_state);
				1276
				1277	void __get_zone_counts(unsigned long active, unsigned long inactive,
				1278	unsigned long free, struct pglist_data pgdat)
				1279	{
				1280	struct zone *zones = pgdat->node_zones;
				1281	int i;
				1282
				1283	*active = 0;
				1284	*inactive = 0;
				1285	*free = 0;
				1286	for (i = 0; i < MAX_NR_ZONES; i++) {
				1287	*active += zones[i].nr_active;
				1288	*inactive += zones[i].nr_inactive;
				1289	*free += zones[i].free_pages;
				1290	}
				1291	}
				1292
				1293	void get_zone_counts(unsigned long *active,
				1294	unsigned long inactive, unsigned long free)
				1295	{
				1296	struct pglist_data *pgdat;
				1297
				1298	*active = 0;
				1299	*inactive = 0;
				1300	*free = 0;
				1301	for_each_pgdat(pgdat) {
				1302	unsigned long l, m, n;
				1303	__get_zone_counts(&l, &m, &n, pgdat);
				1304	*active += l;
				1305	*inactive += m;
				1306	*free += n;
				1307	}
				1308	}
				1309
				1310	void si_meminfo(struct sysinfo *val)
				1311	{
				1312	val->totalram = totalram_pages;
				1313	val->sharedram = 0;
				1314	val->freeram = nr_free_pages();
				1315	val->bufferram = nr_blockdev_pages();
				1316	#ifdef CONFIG_HIGHMEM
				1317	val->totalhigh = totalhigh_pages;
				1318	val->freehigh = nr_free_highpages();
				1319	#else
				1320	val->totalhigh = 0;
				1321	val->freehigh = 0;
				1322	#endif
				1323	val->mem_unit = PAGE_SIZE;
				1324	}
				1325
				1326	EXPORT_SYMBOL(si_meminfo);
				1327
				1328	#ifdef CONFIG_NUMA
				1329	void si_meminfo_node(struct sysinfo *val, int nid)
				1330	{
				1331	pg_data_t *pgdat = NODE_DATA(nid);
				1332
				1333	val->totalram = pgdat->node_present_pages;
				1334	val->freeram = nr_free_pages_pgdat(pgdat);
				1335	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1336	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1337	val->mem_unit = PAGE_SIZE;
				1338	}
				1339	#endif
				1340
				1341	#define K(x) ((x) << (PAGE_SHIFT-10))
				1342
				1343	/*
				1344	* Show free area list (used inside shift_scroll-lock stuff)
				1345	* We also calculate the percentage fragmentation. We do this by counting the
				1346	* memory on each free list with the exception of the first item on the list.
				1347	*/
				1348	void show_free_areas(void)
				1349	{
				1350	struct page_state ps;
				1351	int cpu, temperature;
				1352	unsigned long active;
				1353	unsigned long inactive;
				1354	unsigned long free;
				1355	struct zone *zone;
				1356
				1357	for_each_zone(zone) {
				1358	show_node(zone);
				1359	printk("%s per-cpu:", zone->name);
				1360
				1361	if (!zone->present_pages) {
				1362	printk(" empty\n");
				1363	continue;
				1364	} else
				1365	printk("\n");
				1366
Dave Jones	6b482c6	2005-11-10 15:45:56 -0500	[diff] [blame]	1367	for_each_online_cpu(cpu) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1368	struct per_cpu_pageset *pageset;
				1369
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1370	pageset = zone_pcp(zone, cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1371
				1372	for (temperature = 0; temperature < 2; temperature++)
Nick Piggin	2d92c5c	2006-01-06 00:10:59 -0800	[diff] [blame]	1373	printk("cpu %d %s: high %d, batch %d used:%d\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1374	cpu,
				1375	temperature ? "cold" : "hot",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1376	pageset->pcp[temperature].high,
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	1377	pageset->pcp[temperature].batch,
				1378	pageset->pcp[temperature].count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1379	}
				1380	}
				1381
				1382	get_page_state(&ps);
				1383	get_zone_counts(&active, &inactive, &free);
				1384
Denis Vlasenko	c0d6221	2005-06-21 17:15:14 -0700	[diff] [blame]	1385	printk("Free pages: %11ukB (%ukB HighMem)\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1386	K(nr_free_pages()),
				1387	K(nr_free_highpages()));
				1388
				1389	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
				1390	"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
				1391	active,
				1392	inactive,
				1393	ps.nr_dirty,
				1394	ps.nr_writeback,
				1395	ps.nr_unstable,
				1396	nr_free_pages(),
				1397	ps.nr_slab,
				1398	ps.nr_mapped,
				1399	ps.nr_page_table_pages);
				1400
				1401	for_each_zone(zone) {
				1402	int i;
				1403
				1404	show_node(zone);
				1405	printk("%s"
				1406	" free:%lukB"
				1407	" min:%lukB"
				1408	" low:%lukB"
				1409	" high:%lukB"
				1410	" active:%lukB"
				1411	" inactive:%lukB"
				1412	" present:%lukB"
				1413	" pages_scanned:%lu"
				1414	" all_unreclaimable? %s"
				1415	"\n",
				1416	zone->name,
				1417	K(zone->free_pages),
				1418	K(zone->pages_min),
				1419	K(zone->pages_low),
				1420	K(zone->pages_high),
				1421	K(zone->nr_active),
				1422	K(zone->nr_inactive),
				1423	K(zone->present_pages),
				1424	zone->pages_scanned,
				1425	(zone->all_unreclaimable ? "yes" : "no")
				1426	);
				1427	printk("lowmem_reserve[]:");
				1428	for (i = 0; i < MAX_NR_ZONES; i++)
				1429	printk(" %lu", zone->lowmem_reserve[i]);
				1430	printk("\n");
				1431	}
				1432
				1433	for_each_zone(zone) {
				1434	unsigned long nr, flags, order, total = 0;
				1435
				1436	show_node(zone);
				1437	printk("%s: ", zone->name);
				1438	if (!zone->present_pages) {
				1439	printk("empty\n");
				1440	continue;
				1441	}
				1442
				1443	spin_lock_irqsave(&zone->lock, flags);
				1444	for (order = 0; order < MAX_ORDER; order++) {
				1445	nr = zone->free_area[order].nr_free;
				1446	total += nr << order;
				1447	printk("%lu*%lukB ", nr, K(1UL) << order);
				1448	}
				1449	spin_unlock_irqrestore(&zone->lock, flags);
				1450	printk("= %lukB\n", K(total));
				1451	}
				1452
				1453	show_swap_cache_info();
				1454	}
				1455
				1456	/*
				1457	* Builds allocation fallback zone lists.
				1458	*/
				1459	static int __init build_zonelists_node(pg_data_t pgdat, struct zonelist zonelist, int j, int k)
				1460	{
				1461	switch (k) {
				1462	struct zone *zone;
				1463	default:
				1464	BUG();
				1465	case ZONE_HIGHMEM:
				1466	zone = pgdat->node_zones + ZONE_HIGHMEM;
				1467	if (zone->present_pages) {
				1468	#ifndef CONFIG_HIGHMEM
				1469	BUG();
				1470	#endif
				1471	zonelist->zones[j++] = zone;
				1472	}
				1473	case ZONE_NORMAL:
				1474	zone = pgdat->node_zones + ZONE_NORMAL;
				1475	if (zone->present_pages)
				1476	zonelist->zones[j++] = zone;
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	1477	case ZONE_DMA32:
				1478	zone = pgdat->node_zones + ZONE_DMA32;
				1479	if (zone->present_pages)
				1480	zonelist->zones[j++] = zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1481	case ZONE_DMA:
				1482	zone = pgdat->node_zones + ZONE_DMA;
				1483	if (zone->present_pages)
				1484	zonelist->zones[j++] = zone;
				1485	}
				1486
				1487	return j;
				1488	}
				1489
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1490	static inline int highest_zone(int zone_bits)
				1491	{
				1492	int res = ZONE_NORMAL;
				1493	if (zone_bits & (__force int)__GFP_HIGHMEM)
				1494	res = ZONE_HIGHMEM;
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	1495	if (zone_bits & (__force int)__GFP_DMA32)
				1496	res = ZONE_DMA32;
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1497	if (zone_bits & (__force int)__GFP_DMA)
				1498	res = ZONE_DMA;
				1499	return res;
				1500	}
				1501
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1502	#ifdef CONFIG_NUMA
				1503	#define MAX_NODE_LOAD (num_online_nodes())
				1504	static int __initdata node_load[MAX_NUMNODES];
				1505	/**
Pavel Pisa	4dc3b16	2005-05-01 08:59:25 -0700	[diff] [blame]	1506	* find_next_best_node - find the next node that should appear in a given node's fallback list
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1507	* @node: node whose fallback list we're appending
				1508	* @used_node_mask: nodemask_t of already used nodes
				1509	*
				1510	* We use a number of factors to determine which is the next node that should
				1511	* appear on a given node's fallback list. The node should not have appeared
				1512	* already in @node's fallback list, and it should be the next closest node
				1513	* according to the distance array (which contains arbitrary distance values
				1514	* from each node to each node in the system), and should also prefer nodes
				1515	* with no CPUs, since presumably they'll have very little allocation pressure
				1516	* on them otherwise.
				1517	* It returns -1 if no node is found.
				1518	*/
				1519	static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
				1520	{
				1521	int i, n, val;
				1522	int min_val = INT_MAX;
				1523	int best_node = -1;
				1524
				1525	for_each_online_node(i) {
				1526	cpumask_t tmp;
				1527
				1528	/* Start from local node */
				1529	n = (node+i) % num_online_nodes();
				1530
				1531	/* Don't want a node to appear more than once */
				1532	if (node_isset(n, *used_node_mask))
				1533	continue;
				1534
				1535	/* Use the local node if we haven't already */
				1536	if (!node_isset(node, *used_node_mask)) {
				1537	best_node = node;
				1538	break;
				1539	}
				1540
				1541	/* Use the distance array to find the distance */
				1542	val = node_distance(node, n);
				1543
				1544	/* Give preference to headless and unused nodes */
				1545	tmp = node_to_cpumask(n);
				1546	if (!cpus_empty(tmp))
				1547	val += PENALTY_FOR_NODE_WITH_CPUS;
				1548
				1549	/* Slight preference for less loaded node */
				1550	val = (MAX_NODE_LOADMAX_NUMNODES);
				1551	val += node_load[n];
				1552
				1553	if (val < min_val) {
				1554	min_val = val;
				1555	best_node = n;
				1556	}
				1557	}
				1558
				1559	if (best_node >= 0)
				1560	node_set(best_node, *used_node_mask);
				1561
				1562	return best_node;
				1563	}
				1564
				1565	static void __init build_zonelists(pg_data_t *pgdat)
				1566	{
				1567	int i, j, k, node, local_node;
				1568	int prev_node, load;
				1569	struct zonelist *zonelist;
				1570	nodemask_t used_mask;
				1571
				1572	/* initialize zonelists */
				1573	for (i = 0; i < GFP_ZONETYPES; i++) {
				1574	zonelist = pgdat->node_zonelists + i;
				1575	zonelist->zones[0] = NULL;
				1576	}
				1577
				1578	/* NUMA-aware ordering of nodes */
				1579	local_node = pgdat->node_id;
				1580	load = num_online_nodes();
				1581	prev_node = local_node;
				1582	nodes_clear(used_mask);
				1583	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
				1584	/*
				1585	* We don't want to pressure a particular node.
				1586	* So adding penalty to the first node in same
				1587	* distance group to make it round-robin.
				1588	*/
				1589	if (node_distance(local_node, node) !=
				1590	node_distance(local_node, prev_node))
				1591	node_load[node] += load;
				1592	prev_node = node;
				1593	load--;
				1594	for (i = 0; i < GFP_ZONETYPES; i++) {
				1595	zonelist = pgdat->node_zonelists + i;
				1596	for (j = 0; zonelist->zones[j] != NULL; j++);
				1597
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1598	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1599
				1600	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1601	zonelist->zones[j] = NULL;
				1602	}
				1603	}
				1604	}
				1605
				1606	#else /* CONFIG_NUMA */
				1607
				1608	static void __init build_zonelists(pg_data_t *pgdat)
				1609	{
				1610	int i, j, k, node, local_node;
				1611
				1612	local_node = pgdat->node_id;
				1613	for (i = 0; i < GFP_ZONETYPES; i++) {
				1614	struct zonelist *zonelist;
				1615
				1616	zonelist = pgdat->node_zonelists + i;
				1617
				1618	j = 0;
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1619	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1620	j = build_zonelists_node(pgdat, zonelist, j, k);
				1621	/*
				1622	* Now we build the zonelist so that it contains the zones
				1623	* of all the other nodes.
				1624	* We don't want to pressure a particular node, so when
				1625	* building the zones for node N, we make sure that the
				1626	* zones coming right after the local ones are those from
				1627	* node N+1 (modulo N)
				1628	*/
				1629	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
				1630	if (!node_online(node))
				1631	continue;
				1632	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1633	}
				1634	for (node = 0; node < local_node; node++) {
				1635	if (!node_online(node))
				1636	continue;
				1637	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1638	}
				1639
				1640	zonelist->zones[j] = NULL;
				1641	}
				1642	}
				1643
				1644	#endif /* CONFIG_NUMA */
				1645
				1646	void __init build_all_zonelists(void)
				1647	{
				1648	int i;
				1649
				1650	for_each_online_node(i)
				1651	build_zonelists(NODE_DATA(i));
				1652	printk("Built %i zonelists\n", num_online_nodes());
				1653	cpuset_init_current_mems_allowed();
				1654	}
				1655
				1656	/*
				1657	* Helper functions to size the waitqueue hash table.
				1658	* Essentially these want to choose hash table sizes sufficiently
				1659	* large so that collisions trying to wait on pages are rare.
				1660	* But in fact, the number of active page waitqueues on typical
				1661	* systems is ridiculously low, less than 200. So this is even
				1662	* conservative, even though it seems large.
				1663	*
				1664	* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
				1665	* waitqueues, i.e. the size of the waitq table given the number of pages.
				1666	*/
				1667	#define PAGES_PER_WAITQUEUE 256
				1668
				1669	static inline unsigned long wait_table_size(unsigned long pages)
				1670	{
				1671	unsigned long size = 1;
				1672
				1673	pages /= PAGES_PER_WAITQUEUE;
				1674
				1675	while (size < pages)
				1676	size <<= 1;
				1677
				1678	/*
				1679	* Once we have dozens or even hundreds of threads sleeping
				1680	* on IO we've got bigger problems than wait queue collision.
				1681	* Limit the size of the wait table to a reasonable size.
				1682	*/
				1683	size = min(size, 4096UL);
				1684
				1685	return max(size, 4UL);
				1686	}
				1687
				1688	/*
				1689	* This is an integer logarithm so that shifts can be used later
				1690	* to extract the more random high bits from the multiplicative
				1691	* hash function before the remainder is taken.
				1692	*/
				1693	static inline unsigned long wait_table_bits(unsigned long size)
				1694	{
				1695	return ffz(~size);
				1696	}
				1697
				1698	#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
				1699
				1700	static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
				1701	unsigned long zones_size, unsigned long zholes_size)
				1702	{
				1703	unsigned long realtotalpages, totalpages = 0;
				1704	int i;
				1705
				1706	for (i = 0; i < MAX_NR_ZONES; i++)
				1707	totalpages += zones_size[i];
				1708	pgdat->node_spanned_pages = totalpages;
				1709
				1710	realtotalpages = totalpages;
				1711	if (zholes_size)
				1712	for (i = 0; i < MAX_NR_ZONES; i++)
				1713	realtotalpages -= zholes_size[i];
				1714	pgdat->node_present_pages = realtotalpages;
				1715	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
				1716	}
				1717
				1718
				1719	/*
				1720	* Initially all pages are reserved - free ones are freed
				1721	* up by free_all_bootmem() once the early boot process is
				1722	* done. Non-atomic initialization, single-pass.
				1723	*/
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	1724	void __devinit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1725	unsigned long start_pfn)
				1726	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1727	struct page *page;
Andy Whitcroft	29751f6	2005-06-23 00:08:00 -0700	[diff] [blame]	1728	unsigned long end_pfn = start_pfn + size;
				1729	unsigned long pfn;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1730
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1731	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
				1732	if (!early_pfn_valid(pfn))
				1733	continue;
				1734	page = pfn_to_page(pfn);
				1735	set_page_links(page, zone, nid, pfn);
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1736	set_page_count(page, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1737	reset_page_mapcount(page);
				1738	SetPageReserved(page);
				1739	INIT_LIST_HEAD(&page->lru);
				1740	#ifdef WANT_PAGE_VIRTUAL
				1741	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
				1742	if (!is_highmem_idx(zone))
Bob Picco	3212c6b	2005-06-27 14:36:28 -0700	[diff] [blame]	1743	set_page_address(page, __va(pfn << PAGE_SHIFT));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1744	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1745	}
				1746	}
				1747
				1748	void zone_init_free_lists(struct pglist_data pgdat, struct zone zone,
				1749	unsigned long size)
				1750	{
				1751	int order;
				1752	for (order = 0; order < MAX_ORDER ; order++) {
				1753	INIT_LIST_HEAD(&zone->free_area[order].free_list);
				1754	zone->free_area[order].nr_free = 0;
				1755	}
				1756	}
				1757
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1758	#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) \| zone_nr)
				1759	void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
				1760	unsigned long size)
				1761	{
				1762	unsigned long snum = pfn_to_section_nr(pfn);
				1763	unsigned long end = pfn_to_section_nr(pfn + size);
				1764
				1765	if (FLAGS_HAS_NODE)
				1766	zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
				1767	else
				1768	for (; snum <= end; snum++)
				1769	zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
				1770	}
				1771
#ifndef __HAVE_ARCH_MEMMAP_INIT
/* No architecture override: use the generic memmap_init_zone() */
#define memmap_init(size, nid, zone, start_pfn)				\
	memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
				1776
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1777	static int __devinit zone_batchsize(struct zone *zone)
				1778	{
				1779	int batch;
				1780
				1781	/*
				1782	* The per-cpu-pages pools are set to around 1000th of the
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1783	* size of the zone. But no more than 1/2 of a meg.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1784	*
				1785	* OK, so we don't know how big the cache is. So guess.
				1786	*/
				1787	batch = zone->present_pages / 1024;
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1788	if (batch * PAGE_SIZE > 512 * 1024)
				1789	batch = (512 * 1024) / PAGE_SIZE;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1790	batch /= 4; /* We effectively = 4 below /
				1791	if (batch < 1)
				1792	batch = 1;
				1793
				1794	/*
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1795	* Clamp the batch to a 2^n - 1 value. Having a power
				1796	* of 2 value was found to be more likely to have
				1797	* suboptimal cache aliasing properties in some cases.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1798	*
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1799	* For example if 2 tasks are alternately allocating
				1800	* batches of pages, one task can end up with a lot
				1801	* of pages of one half of the possible page colors
				1802	* and the other with pages of the other colors.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1803	*/
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1804	batch = (1 << (fls(batch + batch/2)-1)) - 1;
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1805
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1806	return batch;
				1807	}
				1808
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1809	inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
				1810	{
				1811	struct per_cpu_pages *pcp;
				1812
Magnus Damm	1c6fe94	2005-10-26 01:58:59 -0700	[diff] [blame]	1813	memset(p, 0, sizeof(*p));
				1814
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1815	pcp = &p->pcp[0]; /* hot */
				1816	pcp->count = 0;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1817	pcp->high = 6 * batch;
				1818	pcp->batch = max(1UL, 1 * batch);
				1819	INIT_LIST_HEAD(&pcp->list);
				1820
				1821	pcp = &p->pcp[1]; /* cold*/
				1822	pcp->count = 0;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1823	pcp->high = 2 * batch;
Seth, Rohit	e46a5e2	2005-10-29 18:15:48 -0700	[diff] [blame]	1824	pcp->batch = max(1UL, batch/2);
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1825	INIT_LIST_HEAD(&pcp->list);
				1826	}
				1827
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1828	#ifdef CONFIG_NUMA
				1829	/*
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1830	* Boot pageset table. One per cpu which is going to be used for all
				1831	* zones and all nodes. The parameters will be set in such a way
				1832	* that an item put on a list will immediately be handed over to
				1833	* the buddy list. This is safe since pageset manipulation is done
				1834	* with interrupts disabled.
				1835	*
				1836	* Some NUMA counter updates may also be caught by the boot pagesets.
Christoph Lameter	b7c84c6	2005-06-22 20:26:07 -0700	[diff] [blame]	1837	*
				1838	* The boot_pagesets must be kept even after bootup is complete for
				1839	* unused processors and/or zones. They do play a role for bootstrapping
				1840	* hotplugged processors.
				1841	*
				1842	* zoneinfo_show() and maybe other functions do
				1843	* not check if the processor is online before following the pageset pointer.
				1844	* Other parts of the kernel may not check if the zone is available.
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1845	*/
				1846	static struct per_cpu_pageset
Christoph Lameter	b7c84c6	2005-06-22 20:26:07 -0700	[diff] [blame]	1847	boot_pageset[NR_CPUS];
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1848
				1849	/*
				1850	* Dynamically allocate memory for the
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1851	* per cpu pageset array in struct zone.
				1852	*/
				1853	static int __devinit process_zones(int cpu)
				1854	{
				1855	struct zone zone, dzone;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1856
				1857	for_each_zone(zone) {
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1858
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1859	zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1860	GFP_KERNEL, cpu_to_node(cpu));
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1861	if (!zone->pageset[cpu])
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1862	goto bad;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1863
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1864	setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1865	}
				1866
				1867	return 0;
				1868	bad:
				1869	for_each_zone(dzone) {
				1870	if (dzone == zone)
				1871	break;
				1872	kfree(dzone->pageset[cpu]);
				1873	dzone->pageset[cpu] = NULL;
				1874	}
				1875	return -ENOMEM;
				1876	}
				1877
				1878	static inline void free_zone_pagesets(int cpu)
				1879	{
				1880	#ifdef CONFIG_NUMA
				1881	struct zone *zone;
				1882
				1883	for_each_zone(zone) {
				1884	struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
				1885
				1886	zone_pcp(zone, cpu) = NULL;
				1887	kfree(pset);
				1888	}
				1889	#endif
				1890	}
				1891
				1892	static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
				1893	unsigned long action,
				1894	void *hcpu)
				1895	{
				1896	int cpu = (long)hcpu;
				1897	int ret = NOTIFY_OK;
				1898
				1899	switch (action) {
				1900	case CPU_UP_PREPARE:
				1901	if (process_zones(cpu))
				1902	ret = NOTIFY_BAD;
				1903	break;
Andi Kleen	b0d4169	2005-11-05 17:25:53 +0100	[diff] [blame]	1904	case CPU_UP_CANCELED:
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1905	case CPU_DEAD:
				1906	free_zone_pagesets(cpu);
				1907	break;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1908	default:
				1909	break;
				1910	}
				1911	return ret;
				1912	}
				1913
				1914	static struct notifier_block pageset_notifier =
				1915	{ &pageset_cpuup_callback, NULL, 0 };
				1916
Al Viro	78d9955	2005-12-15 09:18:25 +0000	[diff] [blame]	1917	void __init setup_per_cpu_pageset(void)
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1918	{
				1919	int err;
				1920
				1921	/* Initialize per_cpu_pageset for cpu 0.
				1922	* A cpuup callback will do this for every cpu
				1923	* as it comes online
				1924	*/
				1925	err = process_zones(smp_processor_id());
				1926	BUG_ON(err);
				1927	register_cpu_notifier(&pageset_notifier);
				1928	}
				1929
				1930	#endif
				1931
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1932	static __devinit
				1933	void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
				1934	{
				1935	int i;
				1936	struct pglist_data *pgdat = zone->zone_pgdat;
				1937
				1938	/*
				1939	* The per-page waitqueue mechanism uses hashed waitqueues
				1940	* per zone.
				1941	*/
				1942	zone->wait_table_size = wait_table_size(zone_size_pages);
				1943	zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
				1944	zone->wait_table = (wait_queue_head_t *)
				1945	alloc_bootmem_node(pgdat, zone->wait_table_size
				1946	* sizeof(wait_queue_head_t));
				1947
				1948	for(i = 0; i < zone->wait_table_size; ++i)
				1949	init_waitqueue_head(zone->wait_table + i);
				1950	}
				1951
				1952	static __devinit void zone_pcp_init(struct zone *zone)
				1953	{
				1954	int cpu;
				1955	unsigned long batch = zone_batchsize(zone);
				1956
				1957	for (cpu = 0; cpu < NR_CPUS; cpu++) {
				1958	#ifdef CONFIG_NUMA
				1959	/* Early boot. Slab allocator not functional yet */
				1960	zone->pageset[cpu] = &boot_pageset[cpu];
				1961	setup_pageset(&boot_pageset[cpu],0);
				1962	#else
				1963	setup_pageset(zone_pcp(zone,cpu), batch);
				1964	#endif
				1965	}
				1966	printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
				1967	zone->name, zone->present_pages, batch);
				1968	}
				1969
				1970	static __devinit void init_currently_empty_zone(struct zone *zone,
				1971	unsigned long zone_start_pfn, unsigned long size)
				1972	{
				1973	struct pglist_data *pgdat = zone->zone_pgdat;
				1974
				1975	zone_wait_table_init(zone, size);
				1976	pgdat->nr_zones = zone_idx(zone) + 1;
				1977
				1978	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
				1979	zone->zone_start_pfn = zone_start_pfn;
				1980
				1981	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
				1982
				1983	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
				1984	}
				1985
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1986	/*
				1987	* Set up the zone data structures:
				1988	* - mark all pages reserved
				1989	* - mark all memory queues empty
				1990	* - clear the memory bitmaps
				1991	*/
				1992	static void __init free_area_init_core(struct pglist_data *pgdat,
				1993	unsigned long zones_size, unsigned long zholes_size)
				1994	{
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1995	unsigned long j;
				1996	int nid = pgdat->node_id;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1997	unsigned long zone_start_pfn = pgdat->node_start_pfn;
				1998
Dave Hansen	208d54e	2005-10-29 18:16:52 -0700	[diff] [blame]	1999	pgdat_resize_init(pgdat);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2000	pgdat->nr_zones = 0;
				2001	init_waitqueue_head(&pgdat->kswapd_wait);
				2002	pgdat->kswapd_max_order = 0;
				2003
				2004	for (j = 0; j < MAX_NR_ZONES; j++) {
				2005	struct zone *zone = pgdat->node_zones + j;
				2006	unsigned long size, realsize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2007
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2008	realsize = size = zones_size[j];
				2009	if (zholes_size)
				2010	realsize -= zholes_size[j];
				2011
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	2012	if (j < ZONE_HIGHMEM)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2013	nr_kernel_pages += realsize;
				2014	nr_all_pages += realsize;
				2015
				2016	zone->spanned_pages = size;
				2017	zone->present_pages = realsize;
				2018	zone->name = zone_names[j];
				2019	spin_lock_init(&zone->lock);
				2020	spin_lock_init(&zone->lru_lock);
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	2021	zone_seqlock_init(zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2022	zone->zone_pgdat = pgdat;
				2023	zone->free_pages = 0;
				2024
				2025	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
				2026
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2027	zone_pcp_init(zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2028	INIT_LIST_HEAD(&zone->active_list);
				2029	INIT_LIST_HEAD(&zone->inactive_list);
				2030	zone->nr_scan_active = 0;
				2031	zone->nr_scan_inactive = 0;
				2032	zone->nr_active = 0;
				2033	zone->nr_inactive = 0;
Martin Hicks	53e9a61	2005-09-03 15:54:51 -0700	[diff] [blame]	2034	atomic_set(&zone->reclaim_in_progress, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2035	if (!size)
				2036	continue;
				2037
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2038	zonetable_add(zone, nid, j, zone_start_pfn, size);
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2039	init_currently_empty_zone(zone, zone_start_pfn, size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2040	zone_start_pfn += size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2041	}
				2042	}
				2043
				2044	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
				2045	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2046	/* Skip empty nodes */
				2047	if (!pgdat->node_spanned_pages)
				2048	return;
				2049
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2050	#ifdef CONFIG_FLAT_NODE_MEM_MAP
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2051	/* ia64 gets its own node_mem_map, before this, without bootmem */
				2052	if (!pgdat->node_mem_map) {
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2053	unsigned long size;
				2054	struct page *map;
				2055
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2056	size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
Dave Hansen	6f167ec	2005-06-23 00:07:39 -0700	[diff] [blame]	2057	map = alloc_remap(pgdat->node_id, size);
				2058	if (!map)
				2059	map = alloc_bootmem_node(pgdat, size);
				2060	pgdat->node_mem_map = map;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2061	}
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2062	#ifdef CONFIG_FLATMEM
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2063	/*
				2064	* With no DISCONTIG, the global mem_map is just set as node 0's
				2065	*/
				2066	if (pgdat == NODE_DATA(0))
				2067	mem_map = NODE_DATA(0)->node_mem_map;
				2068	#endif
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2069	#endif /* CONFIG_FLAT_NODE_MEM_MAP */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2070	}
				2071
				2072	void __init free_area_init_node(int nid, struct pglist_data *pgdat,
				2073	unsigned long *zones_size, unsigned long node_start_pfn,
				2074	unsigned long *zholes_size)
				2075	{
				2076	pgdat->node_id = nid;
				2077	pgdat->node_start_pfn = node_start_pfn;
				2078	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
				2079
				2080	alloc_node_mem_map(pgdat);
				2081
				2082	free_area_init_core(pgdat, zones_size, zholes_size);
				2083	}
				2084
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2085	#ifndef CONFIG_NEED_MULTIPLE_NODES
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2086	static bootmem_data_t contig_bootmem_data;
				2087	struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
				2088
				2089	EXPORT_SYMBOL(contig_page_data);
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2090	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2091
				2092	void __init free_area_init(unsigned long *zones_size)
				2093	{
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2094	free_area_init_node(0, NODE_DATA(0), zones_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2095	__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
				2096	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2097
				2098	#ifdef CONFIG_PROC_FS
				2099
				2100	#include <linux/seq_file.h>
				2101
				2102	static void frag_start(struct seq_file m, loff_t *pos)
				2103	{
				2104	pg_data_t *pgdat;
				2105	loff_t node = *pos;
				2106
				2107	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
				2108	--node;
				2109
				2110	return pgdat;
				2111	}
				2112
				2113	static void frag_next(struct seq_file m, void arg, loff_t pos)
				2114	{
				2115	pg_data_t pgdat = (pg_data_t )arg;
				2116
				2117	(*pos)++;
				2118	return pgdat->pgdat_next;
				2119	}
				2120
				2121	static void frag_stop(struct seq_file m, void arg)
				2122	{
				2123	}
				2124
				2125	/*
				2126	* This walks the free areas for each zone.
				2127	*/
				2128	static int frag_show(struct seq_file m, void arg)
				2129	{
				2130	pg_data_t pgdat = (pg_data_t )arg;
				2131	struct zone *zone;
				2132	struct zone *node_zones = pgdat->node_zones;
				2133	unsigned long flags;
				2134	int order;
				2135
				2136	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
				2137	if (!zone->present_pages)
				2138	continue;
				2139
				2140	spin_lock_irqsave(&zone->lock, flags);
				2141	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
				2142	for (order = 0; order < MAX_ORDER; ++order)
				2143	seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
				2144	spin_unlock_irqrestore(&zone->lock, flags);
				2145	seq_putc(m, '\n');
				2146	}
				2147	return 0;
				2148	}
				2149
				2150	struct seq_operations fragmentation_op = {
				2151	.start = frag_start,
				2152	.next = frag_next,
				2153	.stop = frag_stop,
				2154	.show = frag_show,
				2155	};
				2156
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2157	/*
				2158	* Output information about zones in @pgdat.
				2159	*/
				2160	static int zoneinfo_show(struct seq_file m, void arg)
				2161	{
				2162	pg_data_t *pgdat = arg;
				2163	struct zone *zone;
				2164	struct zone *node_zones = pgdat->node_zones;
				2165	unsigned long flags;
				2166
				2167	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
				2168	int i;
				2169
				2170	if (!zone->present_pages)
				2171	continue;
				2172
				2173	spin_lock_irqsave(&zone->lock, flags);
				2174	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
				2175	seq_printf(m,
				2176	"\n pages free %lu"
				2177	"\n min %lu"
				2178	"\n low %lu"
				2179	"\n high %lu"
				2180	"\n active %lu"
				2181	"\n inactive %lu"
				2182	"\n scanned %lu (a: %lu i: %lu)"
				2183	"\n spanned %lu"
				2184	"\n present %lu",
				2185	zone->free_pages,
				2186	zone->pages_min,
				2187	zone->pages_low,
				2188	zone->pages_high,
				2189	zone->nr_active,
				2190	zone->nr_inactive,
				2191	zone->pages_scanned,
				2192	zone->nr_scan_active, zone->nr_scan_inactive,
				2193	zone->spanned_pages,
				2194	zone->present_pages);
				2195	seq_printf(m,
				2196	"\n protection: (%lu",
				2197	zone->lowmem_reserve[0]);
				2198	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
				2199	seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
				2200	seq_printf(m,
				2201	")"
				2202	"\n pagesets");
				2203	for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
				2204	struct per_cpu_pageset *pageset;
				2205	int j;
				2206
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	2207	pageset = zone_pcp(zone, i);
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2208	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2209	if (pageset->pcp[j].count)
				2210	break;
				2211	}
				2212	if (j == ARRAY_SIZE(pageset->pcp))
				2213	continue;
				2214	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2215	seq_printf(m,
				2216	"\n cpu: %i pcp: %i"
				2217	"\n count: %i"
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2218	"\n high: %i"
				2219	"\n batch: %i",
				2220	i, j,
				2221	pageset->pcp[j].count,
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2222	pageset->pcp[j].high,
				2223	pageset->pcp[j].batch);
				2224	}
				2225	#ifdef CONFIG_NUMA
				2226	seq_printf(m,
				2227	"\n numa_hit: %lu"
				2228	"\n numa_miss: %lu"
				2229	"\n numa_foreign: %lu"
				2230	"\n interleave_hit: %lu"
				2231	"\n local_node: %lu"
				2232	"\n other_node: %lu",
				2233	pageset->numa_hit,
				2234	pageset->numa_miss,
				2235	pageset->numa_foreign,
				2236	pageset->interleave_hit,
				2237	pageset->local_node,
				2238	pageset->other_node);
				2239	#endif
				2240	}
				2241	seq_printf(m,
				2242	"\n all_unreclaimable: %u"
				2243	"\n prev_priority: %i"
				2244	"\n temp_priority: %i"
				2245	"\n start_pfn: %lu",
				2246	zone->all_unreclaimable,
				2247	zone->prev_priority,
				2248	zone->temp_priority,
				2249	zone->zone_start_pfn);
				2250	spin_unlock_irqrestore(&zone->lock, flags);
				2251	seq_putc(m, '\n');
				2252	}
				2253	return 0;
				2254	}
				2255
				2256	struct seq_operations zoneinfo_op = {
				2257	.start = frag_start, /* iterate over all zones. The same as in
				2258	* fragmentation. */
				2259	.next = frag_next,
				2260	.stop = frag_stop,
				2261	.show = zoneinfo_show,
				2262	};
				2263
/*
 * Names printed by vmstat_show().  vmstat_show() indexes this table by
 * the word offset of a counter inside the struct page_state snapshot,
 * so entry N must name the N-th unsigned long of that structure: keep
 * the order and the number of entries in step with it.
 */
static char *vmstat_text[] = {
	"nr_dirty", "nr_writeback", "nr_unstable",
	"nr_page_table_pages", "nr_mapped", "nr_slab",

	"pgpgin", "pgpgout",
	"pswpin", "pswpout",

	"pgalloc_high", "pgalloc_normal", "pgalloc_dma32", "pgalloc_dma",

	"pgfree", "pgactivate", "pgdeactivate",

	"pgfault", "pgmajfault",

	"pgrefill_high", "pgrefill_normal", "pgrefill_dma32", "pgrefill_dma",

	"pgsteal_high", "pgsteal_normal", "pgsteal_dma32", "pgsteal_dma",

	"pgscan_kswapd_high", "pgscan_kswapd_normal",
	"pgscan_kswapd_dma32", "pgscan_kswapd_dma",

	"pgscan_direct_high", "pgscan_direct_normal",
	"pgscan_direct_dma32", "pgscan_direct_dma",

	"pginodesteal", "slabs_scanned",
	"kswapd_steal", "kswapd_inodesteal",
	"pageoutrun", "allocstall",

	"pgrotated", "nr_bounce",
};
				2319
				2320	static void vmstat_start(struct seq_file m, loff_t *pos)
				2321	{
				2322	struct page_state *ps;
				2323
				2324	if (*pos >= ARRAY_SIZE(vmstat_text))
				2325	return NULL;
				2326
				2327	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
				2328	m->private = ps;
				2329	if (!ps)
				2330	return ERR_PTR(-ENOMEM);
				2331	get_full_page_state(ps);
				2332	ps->pgpgin /= 2; /* sectors -> kbytes */
				2333	ps->pgpgout /= 2;
				2334	return (unsigned long )ps + pos;
				2335	}
				2336
				2337	static void vmstat_next(struct seq_file m, void arg, loff_t pos)
				2338	{
				2339	(*pos)++;
				2340	if (*pos >= ARRAY_SIZE(vmstat_text))
				2341	return NULL;
				2342	return (unsigned long )m->private + pos;
				2343	}
				2344
				2345	static int vmstat_show(struct seq_file m, void arg)
				2346	{
				2347	unsigned long *l = arg;
				2348	unsigned long off = l - (unsigned long *)m->private;
				2349
				2350	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
				2351	return 0;
				2352	}
				2353
				2354	static void vmstat_stop(struct seq_file m, void arg)
				2355	{
				2356	kfree(m->private);
				2357	m->private = NULL;
				2358	}
				2359
				2360	struct seq_operations vmstat_op = {
				2361	.start = vmstat_start,
				2362	.next = vmstat_next,
				2363	.stop = vmstat_stop,
				2364	.show = vmstat_show,
				2365	};
				2366
				2367	#endif /* CONFIG_PROC_FS */
				2368
				2369	#ifdef CONFIG_HOTPLUG_CPU
				2370	static int page_alloc_cpu_notify(struct notifier_block *self,
				2371	unsigned long action, void *hcpu)
				2372	{
				2373	int cpu = (unsigned long)hcpu;
				2374	long *count;
				2375	unsigned long src, dest;
				2376
				2377	if (action == CPU_DEAD) {
				2378	int i;
				2379
				2380	/* Drain local pagecache count. */
				2381	count = &per_cpu(nr_pagecache_local, cpu);
				2382	atomic_add(*count, &nr_pagecache);
				2383	*count = 0;
				2384	local_irq_disable();
				2385	__drain_pages(cpu);
				2386
				2387	/* Add dead cpu's page_states to our own. */
				2388	dest = (unsigned long *)&__get_cpu_var(page_states);
				2389	src = (unsigned long *)&per_cpu(page_states, cpu);
				2390
				2391	for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				2392	i++) {
				2393	dest[i] += src[i];
				2394	src[i] = 0;
				2395	}
				2396
				2397	local_irq_enable();
				2398	}
				2399	return NOTIFY_OK;
				2400	}
				2401	#endif /* CONFIG_HOTPLUG_CPU */
				2402
				2403	void __init page_alloc_init(void)
				2404	{
				2405	hotcpu_notifier(page_alloc_cpu_notify, 0);
				2406	}
				2407
				2408	/*
				2409	* setup_per_zone_lowmem_reserve - called whenever
				2410	* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
				2411	* has a correct pages reserved value, so an adequate number of
				2412	* pages are left in the zone after a successful __alloc_pages().
				2413	*/
				2414	static void setup_per_zone_lowmem_reserve(void)
				2415	{
				2416	struct pglist_data *pgdat;
				2417	int j, idx;
				2418
				2419	for_each_pgdat(pgdat) {
				2420	for (j = 0; j < MAX_NR_ZONES; j++) {
				2421	struct zone *zone = pgdat->node_zones + j;
				2422	unsigned long present_pages = zone->present_pages;
				2423
				2424	zone->lowmem_reserve[j] = 0;
				2425
				2426	for (idx = j-1; idx >= 0; idx--) {
				2427	struct zone *lower_zone;
				2428
				2429	if (sysctl_lowmem_reserve_ratio[idx] < 1)
				2430	sysctl_lowmem_reserve_ratio[idx] = 1;
				2431
				2432	lower_zone = pgdat->node_zones + idx;
				2433	lower_zone->lowmem_reserve[j] = present_pages /
				2434	sysctl_lowmem_reserve_ratio[idx];
				2435	present_pages += lower_zone->present_pages;
				2436	}
				2437	}
				2438	}
				2439	}
				2440
				2441	/*
				2442	* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
				2443	* that the pages_{min,low,high} values for each zone are set correctly
				2444	* with respect to min_free_kbytes.
				2445	*/
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2446	void setup_per_zone_pages_min(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2447	{
				2448	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
				2449	unsigned long lowmem_pages = 0;
				2450	struct zone *zone;
				2451	unsigned long flags;
				2452
				2453	/* Calculate total number of !ZONE_HIGHMEM pages */
				2454	for_each_zone(zone) {
				2455	if (!is_highmem(zone))
				2456	lowmem_pages += zone->present_pages;
				2457	}
				2458
				2459	for_each_zone(zone) {
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2460	unsigned long tmp;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2461	spin_lock_irqsave(&zone->lru_lock, flags);
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2462	tmp = (pages_min * zone->present_pages) / lowmem_pages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2463	if (is_highmem(zone)) {
				2464	/*
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2465	* __GFP_HIGH and PF_MEMALLOC allocations usually don't
				2466	* need highmem pages, so cap pages_min to a small
				2467	* value here.
				2468	*
				2469	* The (pages_high-pages_low) and (pages_low-pages_min)
				2470	* deltas controls asynch page reclaim, and so should
				2471	* not be capped for highmem.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2472	*/
				2473	int min_pages;
				2474
				2475	min_pages = zone->present_pages / 1024;
				2476	if (min_pages < SWAP_CLUSTER_MAX)
				2477	min_pages = SWAP_CLUSTER_MAX;
				2478	if (min_pages > 128)
				2479	min_pages = 128;
				2480	zone->pages_min = min_pages;
				2481	} else {
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2482	/*
				2483	* If it's a lowmem zone, reserve a number of pages
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2484	* proportionate to the zone's size.
				2485	*/
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2486	zone->pages_min = tmp;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2487	}
				2488
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2489	zone->pages_low = zone->pages_min + tmp / 4;
				2490	zone->pages_high = zone->pages_min + tmp / 2;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2491	spin_unlock_irqrestore(&zone->lru_lock, flags);
				2492	}
				2493	}
				2494
				2495	/*
				2496	* Initialise min_free_kbytes.
				2497	*
				2498	* For small machines we want it small (128k min). For large machines
				2499	* we want it large (64MB max). But it is not linear, because network
				2500	* bandwidth does not increase linearly with machine size. We use
				2501	*
				2502	* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
				2503	* min_free_kbytes = sqrt(lowmem_kbytes * 16)
				2504	*
				2505	* which yields
				2506	*
				2507	* 16MB: 512k
				2508	* 32MB: 724k
				2509	* 64MB: 1024k
				2510	* 128MB: 1448k
				2511	* 256MB: 2048k
				2512	* 512MB: 2896k
				2513	* 1024MB: 4096k
				2514	* 2048MB: 5792k
				2515	* 4096MB: 8192k
				2516	* 8192MB: 11584k
				2517	* 16384MB: 16384k
				2518	*/
				2519	static int __init init_per_zone_pages_min(void)
				2520	{
				2521	unsigned long lowmem_kbytes;
				2522
				2523	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
				2524
				2525	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
				2526	if (min_free_kbytes < 128)
				2527	min_free_kbytes = 128;
				2528	if (min_free_kbytes > 65536)
				2529	min_free_kbytes = 65536;
				2530	setup_per_zone_pages_min();
				2531	setup_per_zone_lowmem_reserve();
				2532	return 0;
				2533	}
				2534	module_init(init_per_zone_pages_min)
				2535
				2536	/*
				2537	* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
				2538	* that we can call two helper functions whenever min_free_kbytes
				2539	* changes.
				2540	*/
				2541	int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
				2542	struct file file, void __user buffer, size_t length, loff_t ppos)
				2543	{
				2544	proc_dointvec(table, write, file, buffer, length, ppos);
				2545	setup_per_zone_pages_min();
				2546	return 0;
				2547	}
				2548
				2549	/*
				2550	* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
				2551	* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
				2552	* whenever sysctl_lowmem_reserve_ratio changes.
				2553	*
				2554	* The reserve ratio obviously has absolutely no relation with the
				2555	* pages_min watermarks. The lowmem reserve ratio can only make sense
				2556	* if in function of the boot time zone sizes.
				2557	*/
				2558	int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
				2559	struct file file, void __user buffer, size_t length, loff_t ppos)
				2560	{
				2561	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
				2562	setup_per_zone_lowmem_reserve();
				2563	return 0;
				2564	}
				2565
				2566	__initdata int hashdist = HASHDIST_DEFAULT;
				2567
				2568	#ifdef CONFIG_NUMA
				2569	static int __init set_hashdist(char *str)
				2570	{
				2571	if (!str)
				2572	return 0;
				2573	hashdist = simple_strtoul(str, &str, 0);
				2574	return 1;
				2575	}
				2576	__setup("hashdist=", set_hashdist);
				2577	#endif
				2578
				2579	/*
				2580	* allocate a large system hash table from bootmem
				2581	* - it is assumed that the hash table must contain an exact power-of-2
				2582	* quantity of entries
				2583	* - limit is the number of hash buckets, not the total allocation size
				2584	*/
				2585	void __init alloc_large_system_hash(const char tablename,
				2586	unsigned long bucketsize,
				2587	unsigned long numentries,
				2588	int scale,
				2589	int flags,
				2590	unsigned int *_hash_shift,
				2591	unsigned int *_hash_mask,
				2592	unsigned long limit)
				2593	{
				2594	unsigned long long max = limit;
				2595	unsigned long log2qty, size;
				2596	void *table = NULL;
				2597
				2598	/* allow the kernel cmdline to have a say */
				2599	if (!numentries) {
				2600	/* round applicable memory size up to nearest megabyte */
				2601	numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
				2602	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
				2603	numentries >>= 20 - PAGE_SHIFT;
				2604	numentries <<= 20 - PAGE_SHIFT;
				2605
				2606	/* limit to 1 bucket per 2^scale bytes of low memory */
				2607	if (scale > PAGE_SHIFT)
				2608	numentries >>= (scale - PAGE_SHIFT);
				2609	else
				2610	numentries <<= (PAGE_SHIFT - scale);
				2611	}
				2612	/* rounded up to nearest power of 2 in size */
				2613	numentries = 1UL << (long_log2(numentries) + 1);
				2614
				2615	/* limit allocation size to 1/16 total memory by default */
				2616	if (max == 0) {
				2617	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
				2618	do_div(max, bucketsize);
				2619	}
				2620
				2621	if (numentries > max)
				2622	numentries = max;
				2623
				2624	log2qty = long_log2(numentries);
				2625
				2626	do {
				2627	size = bucketsize << log2qty;
				2628	if (flags & HASH_EARLY)
				2629	table = alloc_bootmem(size);
				2630	else if (hashdist)
				2631	table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
				2632	else {
				2633	unsigned long order;
				2634	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				2635	;
				2636	table = (void*) __get_free_pages(GFP_ATOMIC, order);
				2637	}
				2638	} while (!table && size > PAGE_SIZE && --log2qty);
				2639
				2640	if (!table)
				2641	panic("Failed to allocate %s hash table\n", tablename);
				2642
				2643	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
				2644	tablename,
				2645	(1U << log2qty),
				2646	long_log2(size) - PAGE_SHIFT,
				2647	size);
				2648
				2649	if (_hash_shift)
				2650	*_hash_shift = log2qty;
				2651	if (_hash_mask)
				2652	*_hash_mask = (1 << log2qty) - 1;
				2653
				2654	return table;
				2655	}