Blame - mm/page_alloc.c - kernel/msm

blob: 791690d7d3fa02e4e5956af69fffedf654088adc [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/page_alloc.c
				3	*
				4	* Manages the free list, the system allocates free pages here.
				5	* Note that kmalloc() lives in slab.c
				6	*
				7	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				8	* Swap reorganised 29.12.95, Stephen Tweedie
				9	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
				10	* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
				11	* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
				12	* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
				13	* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
				14	* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
				15	*/
				16
				17	#include <linux/config.h>
				18	#include <linux/stddef.h>
				19	#include <linux/mm.h>
				20	#include <linux/swap.h>
				21	#include <linux/interrupt.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/bootmem.h>
				24	#include <linux/compiler.h>
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	25	#include <linux/kernel.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	26	#include <linux/module.h>
				27	#include <linux/suspend.h>
				28	#include <linux/pagevec.h>
				29	#include <linux/blkdev.h>
				30	#include <linux/slab.h>
				31	#include <linux/notifier.h>
				32	#include <linux/topology.h>
				33	#include <linux/sysctl.h>
				34	#include <linux/cpu.h>
				35	#include <linux/cpuset.h>
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	36	#include <linux/memory_hotplug.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	37	#include <linux/nodemask.h>
				38	#include <linux/vmalloc.h>
Christoph Lameter	4be38e3	2006-01-06 00:11:17 -0800	[diff] [blame]	39	#include <linux/mempolicy.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	40
				41	#include <asm/tlbflush.h>
				42	#include "internal.h"
				43
				44	/*
				45	* MCD - HACK: Find somewhere to initialize this EARLY, or make this
				46	* initializer cleaner
				47	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	48	nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	49	EXPORT_SYMBOL(node_online_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	50	nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	51	EXPORT_SYMBOL(node_possible_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	52	struct pglist_data *pgdat_list __read_mostly;
Ravikiran G Thirumalai	6c231b7	2005-09-06 15:17:45 -0700	[diff] [blame]	53	unsigned long totalram_pages __read_mostly;
				54	unsigned long totalhigh_pages __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	55	long nr_swap_pages;
Rohit Seth	8ad4b1f	2006-01-08 01:00:40 -0800	[diff] [blame]	56	int percpu_pagelist_fraction;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	57
David Howells	a226f6c	2006-01-06 00:11:08 -0800	[diff] [blame]	58	static void fastcall free_hot_cold_page(struct page *page, int cold);
Hugh Dickins	d98c7a0	2006-02-14 13:52:59 -0800	[diff] [blame]	59	static void __free_pages_ok(struct page *page, unsigned int order);
David Howells	a226f6c	2006-01-06 00:11:08 -0800	[diff] [blame]	60
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	61	/*
				62	* results with 256, 32 in the lowmem_reserve sysctl:
				63	* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
				64	* 1G machine -> (16M dma, 784M normal, 224M high)
				65	* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
				66	* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
				67	* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	68	*
				69	* TBD: should special case ZONE_DMA32 machines here - in those we normally
				70	* don't need any ZONE_NORMAL reservation
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	71	*/
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	72	int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 256, 32 };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	73
				74	EXPORT_SYMBOL(totalram_pages);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	75
				76	/*
				77	* Used by page_zone() to look up the address of the struct zone whose
				78	* id is encoded in the upper bits of page->flags
				79	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	80	struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	81	EXPORT_SYMBOL(zone_table);
				82
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	83	static char *zone_names[MAX_NR_ZONES] = { "DMA", "DMA32", "Normal", "HighMem" };
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	84	int min_free_kbytes = 1024;
				85
				86	unsigned long __initdata nr_kernel_pages;
				87	unsigned long __initdata nr_all_pages;
				88
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	89	#ifdef CONFIG_DEBUG_VM
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	90	static int page_outside_zone_boundaries(struct zone zone, struct page page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	91	{
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	92	int ret = 0;
				93	unsigned seq;
				94	unsigned long pfn = page_to_pfn(page);
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	95
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	96	do {
				97	seq = zone_span_seqbegin(zone);
				98	if (pfn >= zone->zone_start_pfn + zone->spanned_pages)
				99	ret = 1;
				100	else if (pfn < zone->zone_start_pfn)
				101	ret = 1;
				102	} while (zone_span_seqretry(zone, seq));
				103
				104	return ret;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	105	}
				106
				107	static int page_is_consistent(struct zone zone, struct page page)
				108	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	109	#ifdef CONFIG_HOLES_IN_ZONE
				110	if (!pfn_valid(page_to_pfn(page)))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	111	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	112	#endif
				113	if (zone != page_zone(page))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	114	return 0;
				115
				116	return 1;
				117	}
				118	/*
				119	* Temporary debugging check for pages not lying within a given zone.
				120	*/
				121	static int bad_range(struct zone zone, struct page page)
				122	{
				123	if (page_outside_zone_boundaries(zone, page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	124	return 1;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame]	125	if (!page_is_consistent(zone, page))
				126	return 1;
				127
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	return 0;
				129	}
				130
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	131	#else
				132	static inline int bad_range(struct zone zone, struct page page)
				133	{
				134	return 0;
				135	}
				136	#endif
				137
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	138	static void bad_page(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	139	{
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	140	printk(KERN_EMERG "Bad page state in process '%s'\n"
Hugh Dickins	7365f3d	2006-01-11 12:17:18 -0800	[diff] [blame]	141	KERN_EMERG "page:%p flags:0x%0*lx mapping:%p mapcount:%d count:%d\n"
				142	KERN_EMERG "Trying to fix it up, but a reboot is needed\n"
				143	KERN_EMERG "Backtrace:\n",
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	144	current->comm, page, (int)(2*sizeof(unsigned long)),
				145	(unsigned long)page->flags, page->mapping,
				146	page_mapcount(page), page_count(page));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	147	dump_stack();
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	148	page->flags &= ~(1 << PG_lru \|
				149	1 << PG_private \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	150	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	151	1 << PG_active \|
				152	1 << PG_dirty \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	153	1 << PG_reclaim \|
				154	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	155	1 << PG_swapcache \|
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	156	1 << PG_writeback );
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	157	set_page_count(page, 0);
				158	reset_page_mapcount(page);
				159	page->mapping = NULL;
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	160	add_taint(TAINT_BAD_PAGE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	161	}
				162
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	163	/*
				164	* Higher-order pages are called "compound pages". They are structured thusly:
				165	*
				166	* The first PAGE_SIZE page is called the "head page".
				167	*
				168	* The remaining PAGE_SIZE pages are called "tail pages".
				169	*
				170	* All pages have PG_compound set. All pages have their ->private pointing at
				171	* the head page (even the head page has this).
				172	*
Hugh Dickins	41d78ba	2006-02-14 13:52:58 -0800	[diff] [blame]	173	* The first tail page's ->lru.next holds the address of the compound page's
				174	* put_page() function. Its ->lru.prev holds the order of allocation.
				175	* This usage means that zero-order pages may not be compound.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	176	*/
Hugh Dickins	d98c7a0	2006-02-14 13:52:59 -0800	[diff] [blame]	177
				178	static void free_compound_page(struct page *page)
				179	{
				180	__free_pages_ok(page, (unsigned long)page[1].lru.prev);
				181	}
				182
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	183	static void prep_compound_page(struct page *page, unsigned long order)
				184	{
				185	int i;
				186	int nr_pages = 1 << order;
				187
Hugh Dickins	d98c7a0	2006-02-14 13:52:59 -0800	[diff] [blame]	188	page[1].lru.next = (void )free_compound_page; / set dtor */
Hugh Dickins	41d78ba	2006-02-14 13:52:58 -0800	[diff] [blame]	189	page[1].lru.prev = (void *)order;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	190	for (i = 0; i < nr_pages; i++) {
				191	struct page *p = page + i;
				192
				193	SetPageCompound(p);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	194	set_page_private(p, (unsigned long)page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	195	}
				196	}
				197
				198	static void destroy_compound_page(struct page *page, unsigned long order)
				199	{
				200	int i;
				201	int nr_pages = 1 << order;
				202
Hugh Dickins	41d78ba	2006-02-14 13:52:58 -0800	[diff] [blame]	203	if (unlikely((unsigned long)page[1].lru.prev != order))
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	204	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	205
				206	for (i = 0; i < nr_pages; i++) {
				207	struct page *p = page + i;
				208
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	209	if (unlikely(!PageCompound(p) \|
				210	(page_private(p) != (unsigned long)page)))
				211	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	212	ClearPageCompound(p);
				213	}
				214	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	215
				216	/*
				217	* function for dealing with page's order in buddy system.
				218	* zone->lock is already acquired when we use these.
				219	* So, we don't need atomic page->flags operations here.
				220	*/
				221	static inline unsigned long page_order(struct page *page) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	222	return page_private(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	223	}
				224
				225	static inline void set_page_order(struct page *page, int order) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	226	set_page_private(page, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	227	__SetPagePrivate(page);
				228	}
				229
				230	static inline void rmv_page_order(struct page *page)
				231	{
				232	__ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	233	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	234	}
				235
				236	/*
				237	* Locate the struct page for both the matching buddy in our
				238	* pair (buddy1) and the combined O(n+1) page they form (page).
				239	*
				240	* 1) Any buddy B1 will have an order O twin B2 which satisfies
				241	* the following equation:
				242	* B2 = B1 ^ (1 << O)
				243	* For example, if the starting buddy (buddy2) is #8 its order
				244	* 1 buddy is #10:
				245	* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
				246	*
				247	* 2) Any buddy B will have an order O+1 parent P which
				248	* satisfies the following equation:
				249	* P = B & ~(1 << O)
				250	*
				251	* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
				252	*/
				253	static inline struct page *
				254	__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
				255	{
				256	unsigned long buddy_idx = page_idx ^ (1 << order);
				257
				258	return page + (buddy_idx - page_idx);
				259	}
				260
				261	static inline unsigned long
				262	__find_combined_index(unsigned long page_idx, unsigned int order)
				263	{
				264	return (page_idx & ~(1 << order));
				265	}
				266
				267	/*
				268	* This function checks whether a page is free && is the buddy
				269	* we can coalesce a page and its buddy if
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	270	* (a) the buddy is not in a hole &&
				271	* (b) the buddy is free &&
				272	* (c) the buddy is on the buddy system &&
				273	* (d) a page and its buddy have the same order.
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	274	* for recording page's order, we use page_private(page) and PG_private.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	275	*
				276	*/
				277	static inline int page_is_buddy(struct page *page, int order)
				278	{
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	279	#ifdef CONFIG_HOLES_IN_ZONE
				280	if (!pfn_valid(page_to_pfn(page)))
				281	return 0;
				282	#endif
				283
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	284	if (PagePrivate(page) &&
				285	(page_order(page) == order) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	286	page_count(page) == 0)
				287	return 1;
				288	return 0;
				289	}
				290
				291	/*
				292	* Freeing function for a buddy system allocator.
				293	*
				294	* The concept of a buddy system is to maintain direct-mapped table
				295	* (containing bit values) for memory blocks of various "orders".
				296	* The bottom level table contains the map for the smallest allocatable
				297	* units of memory (here, pages), and each level above it describes
				298	* pairs of units from the levels below, hence, "buddies".
				299	* At a high level, all that happens here is marking the table entry
				300	* at the bottom level available, and propagating the changes upward
				301	* as necessary, plus some accounting needed to play nicely with other
				302	* parts of the VM system.
				303	* At each level, we keep a list of pages, which are heads of continuous
				304	* free pages of length of (1 << order) and marked with PG_private. A page's
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	305	* order is recorded in page_private(page) field.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	306	* So when we are allocating or freeing one, we can derive the state of the
				307	* other. That is, if we allocate a small block, and both were
				308	* free, the remainder of the region must be split into blocks.
				309	* If a block is freed, and its buddy is also free, then this
				310	* triggers coalescing into a block of larger size.
				311	*
				312	* -- wli
				313	*/
				314
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	315	static inline void __free_one_page(struct page *page,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	316	struct zone *zone, unsigned int order)
				317	{
				318	unsigned long page_idx;
				319	int order_size = 1 << order;
				320
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	321	if (unlikely(PageCompound(page)))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	322	destroy_compound_page(page, order);
				323
				324	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
				325
				326	BUG_ON(page_idx & (order_size - 1));
				327	BUG_ON(bad_range(zone, page));
				328
				329	zone->free_pages += order_size;
				330	while (order < MAX_ORDER-1) {
				331	unsigned long combined_idx;
				332	struct free_area *area;
				333	struct page *buddy;
				334
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	335	buddy = __page_find_buddy(page, page_idx, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	336	if (!page_is_buddy(buddy, order))
				337	break; /* Move the buddy up one level. */
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	338
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	339	list_del(&buddy->lru);
				340	area = zone->free_area + order;
				341	area->nr_free--;
				342	rmv_page_order(buddy);
Nick Piggin	13e7444	2006-01-06 00:10:58 -0800	[diff] [blame]	343	combined_idx = __find_combined_index(page_idx, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	344	page = page + (combined_idx - page_idx);
				345	page_idx = combined_idx;
				346	order++;
				347	}
				348	set_page_order(page, order);
				349	list_add(&page->lru, &zone->free_area[order].free_list);
				350	zone->free_area[order].nr_free++;
				351	}
				352
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	353	static inline int free_pages_check(struct page *page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	354	{
Nick Piggin	92be2e3	2006-01-06 00:10:57 -0800	[diff] [blame]	355	if (unlikely(page_mapcount(page) \|
				356	(page->mapping != NULL) \|
				357	(page_count(page) != 0) \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	358	(page->flags & (
				359	1 << PG_lru \|
				360	1 << PG_private \|
				361	1 << PG_locked \|
				362	1 << PG_active \|
				363	1 << PG_reclaim \|
				364	1 << PG_slab \|
				365	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	366	1 << PG_writeback \|
Nick Piggin	92be2e3	2006-01-06 00:10:57 -0800	[diff] [blame]	367	1 << PG_reserved ))))
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	368	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	369	if (PageDirty(page))
Nick Piggin	242e546	2005-09-03 15:54:50 -0700	[diff] [blame]	370	__ClearPageDirty(page);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	371	/*
				372	* For now, we report if PG_reserved was found set, but do not
				373	* clear it, and do not free the page. But we shall soon need
				374	* to do more, for when the ZERO_PAGE count wraps negative.
				375	*/
				376	return PageReserved(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	377	}
				378
				379	/*
				380	* Frees a list of pages.
				381	* Assumes all pages on list are in same zone, and of same order.
Renaud Lienhart	207f36e	2005-09-10 00:26:59 -0700	[diff] [blame]	382	* count is the number of pages to free.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	383	*
				384	* If the zone was previously in an "all pages pinned" state then look to
				385	* see if this freeing clears that state.
				386	*
				387	* And clear the zone's pages_scanned counter, to hold off the "all pages are
				388	* pinned" detection logic.
				389	*/
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	390	static void free_pages_bulk(struct zone *zone, int count,
				391	struct list_head *list, int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	392	{
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	393	spin_lock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	394	zone->all_unreclaimable = 0;
				395	zone->pages_scanned = 0;
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	396	while (count--) {
				397	struct page *page;
				398
				399	BUG_ON(list_empty(list));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	400	page = list_entry(list->prev, struct page, lru);
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	401	/* have to delete it as __free_one_page list manipulates */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	402	list_del(&page->lru);
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	403	__free_one_page(page, zone, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	404	}
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	405	spin_unlock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	406	}
				407
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	408	static void free_one_page(struct zone zone, struct page page, int order)
				409	{
				410	LIST_HEAD(list);
				411	list_add(&page->lru, &list);
				412	free_pages_bulk(zone, 1, &list, order);
				413	}
				414
				415	static void __free_pages_ok(struct page *page, unsigned int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	416	{
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	417	unsigned long flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	418	int i;
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	419	int reserved = 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	420
				421	arch_free_page(page, order);
Ingo Molnar	de5097c	2006-01-09 15:59:21 -0800	[diff] [blame]	422	if (!PageHighMem(page))
				423	mutex_debug_check_no_locks_freed(page_address(page),
David Woodhouse	a4fc7ab	2006-01-11 14:41:26 +0000	[diff] [blame]	424	PAGE_SIZE<<order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	425
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	426	#ifndef CONFIG_MMU
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	427	for (i = 1 ; i < (1 << order) ; ++i)
				428	__put_page(page + i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	429	#endif
				430
				431	for (i = 0 ; i < (1 << order) ; ++i)
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	432	reserved += free_pages_check(page + i);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	433	if (reserved)
				434	return;
				435
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	436	kernel_map_pages(page, 1 << order, 0);
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	437	local_irq_save(flags);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	438	__mod_page_state(pgfree, 1 << order);
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	439	free_one_page(page_zone(page), page, order);
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	440	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	441	}
				442
David Howells	a226f6c	2006-01-06 00:11:08 -0800	[diff] [blame]	443	/*
				444	* permit the bootmem allocator to evade page validation on high-order frees
				445	*/
				446	void fastcall __init __free_pages_bootmem(struct page *page, unsigned int order)
				447	{
				448	if (order == 0) {
				449	__ClearPageReserved(page);
				450	set_page_count(page, 0);
				451
				452	free_hot_cold_page(page, 0);
				453	} else {
				454	LIST_HEAD(list);
				455	int loop;
				456
				457	for (loop = 0; loop < BITS_PER_LONG; loop++) {
				458	struct page *p = &page[loop];
				459
				460	if (loop + 16 < BITS_PER_LONG)
				461	prefetchw(p + 16);
				462	__ClearPageReserved(p);
				463	set_page_count(p, 0);
				464	}
				465
				466	arch_free_page(page, order);
				467
				468	mod_page_state(pgfree, 1 << order);
				469
				470	list_add(&page->lru, &list);
				471	kernel_map_pages(page, 1 << order, 0);
				472	free_pages_bulk(page_zone(page), 1, &list, order);
				473	}
				474	}
				475
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	476
				477	/*
				478	* The order of subdivision here is critical for the IO subsystem.
				479	* Please do not alter this order without good reasons and regression
				480	* testing. Specifically, as large blocks of memory are subdivided,
				481	* the order in which smaller blocks are delivered depends on the order
				482	* they're subdivided in this function. This is the primary factor
				483	* influencing the order in which pages are delivered to the IO
				484	* subsystem according to empirical testing, and this is also justified
				485	* by considering the behavior of a buddy system containing a single
				486	* large block of memory acted on by a series of small allocations.
				487	* This behavior is a critical factor in sglist merging's success.
				488	*
				489	* -- wli
				490	*/
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	491	static inline void expand(struct zone zone, struct page page,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	492	int low, int high, struct free_area *area)
				493	{
				494	unsigned long size = 1 << high;
				495
				496	while (high > low) {
				497	area--;
				498	high--;
				499	size >>= 1;
				500	BUG_ON(bad_range(zone, &page[size]));
				501	list_add(&page[size].lru, &area->free_list);
				502	area->nr_free++;
				503	set_page_order(&page[size], high);
				504	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	505	}
				506
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	507	/*
				508	* This page is about to be returned from the page allocator
				509	*/
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	510	static int prep_new_page(struct page *page, int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	511	{
Nick Piggin	92be2e3	2006-01-06 00:10:57 -0800	[diff] [blame]	512	if (unlikely(page_mapcount(page) \|
				513	(page->mapping != NULL) \|
				514	(page_count(page) != 0) \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	515	(page->flags & (
				516	1 << PG_lru \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	517	1 << PG_private \|
				518	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	519	1 << PG_active \|
				520	1 << PG_dirty \|
				521	1 << PG_reclaim \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	522	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	523	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	524	1 << PG_writeback \|
Nick Piggin	92be2e3	2006-01-06 00:10:57 -0800	[diff] [blame]	525	1 << PG_reserved ))))
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	526	bad_page(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	527
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	528	/*
				529	* For now, we report if PG_reserved was found set, but do not
				530	* clear it, and do not allocate the page: as a safety net.
				531	*/
				532	if (PageReserved(page))
				533	return 1;
				534
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	535	page->flags &= ~(1 << PG_uptodate \| 1 << PG_error \|
				536	1 << PG_referenced \| 1 << PG_arch_1 \|
				537	1 << PG_checked \| 1 << PG_mappedtodisk);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	538	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	539	set_page_refs(page, order);
				540	kernel_map_pages(page, 1 << order, 1);
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	541	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	542	}
				543
				544	/*
				545	* Do the hard work of removing an element from the buddy allocator.
				546	* Call me with the zone->lock already held.
				547	*/
				548	static struct page __rmqueue(struct zone zone, unsigned int order)
				549	{
				550	struct free_area * area;
				551	unsigned int current_order;
				552	struct page *page;
				553
				554	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
				555	area = zone->free_area + current_order;
				556	if (list_empty(&area->free_list))
				557	continue;
				558
				559	page = list_entry(area->free_list.next, struct page, lru);
				560	list_del(&page->lru);
				561	rmv_page_order(page);
				562	area->nr_free--;
				563	zone->free_pages -= 1UL << order;
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	564	expand(zone, page, order, current_order, area);
				565	return page;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	566	}
				567
				568	return NULL;
				569	}
				570
				571	/*
				572	* Obtain a specified number of elements from the buddy allocator, all under
				573	* a single hold of the lock, for efficiency. Add them to the supplied list.
				574	* Returns the number of new pages which were placed at *list.
				575	*/
				576	static int rmqueue_bulk(struct zone *zone, unsigned int order,
				577	unsigned long count, struct list_head *list)
				578	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	579	int i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	580
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	581	spin_lock(&zone->lock);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	582	for (i = 0; i < count; ++i) {
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	583	struct page *page = __rmqueue(zone, order);
				584	if (unlikely(page == NULL))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	585	break;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	586	list_add_tail(&page->lru, list);
				587	}
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	588	spin_unlock(&zone->lock);
Nick Piggin	085cc7d	2006-01-06 00:11:01 -0800	[diff] [blame]	589	return i;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	590	}
				591
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	592	#ifdef CONFIG_NUMA
				593	/* Called from the slab reaper to drain remote pagesets */
				594	void drain_remote_pages(void)
				595	{
				596	struct zone *zone;
				597	int i;
				598	unsigned long flags;
				599
				600	local_irq_save(flags);
				601	for_each_zone(zone) {
				602	struct per_cpu_pageset *pset;
				603
				604	/* Do not drain local pagesets */
				605	if (zone->zone_pgdat->node_id == numa_node_id())
				606	continue;
				607
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	608	pset = zone_pcp(zone, smp_processor_id());
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	609	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
				610	struct per_cpu_pages *pcp;
				611
				612	pcp = &pset->pcp[i];
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	613	free_pages_bulk(zone, pcp->count, &pcp->list, 0);
				614	pcp->count = 0;
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	615	}
				616	}
				617	local_irq_restore(flags);
				618	}
				619	#endif
				620
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	621	#if defined(CONFIG_PM) \|\| defined(CONFIG_HOTPLUG_CPU)
				622	static void __drain_pages(unsigned int cpu)
				623	{
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	624	unsigned long flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	625	struct zone *zone;
				626	int i;
				627
				628	for_each_zone(zone) {
				629	struct per_cpu_pageset *pset;
				630
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	631	pset = zone_pcp(zone, cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	632	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
				633	struct per_cpu_pages *pcp;
				634
				635	pcp = &pset->pcp[i];
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	636	local_irq_save(flags);
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	637	free_pages_bulk(zone, pcp->count, &pcp->list, 0);
				638	pcp->count = 0;
Nick Piggin	c54ad30	2006-01-06 00:10:56 -0800	[diff] [blame]	639	local_irq_restore(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	640	}
				641	}
				642	}
				643	#endif /* CONFIG_PM \|\| CONFIG_HOTPLUG_CPU */
				644
				645	#ifdef CONFIG_PM
				646
				647	void mark_free_pages(struct zone *zone)
				648	{
				649	unsigned long zone_pfn, flags;
				650	int order;
				651	struct list_head *curr;
				652
				653	if (!zone->spanned_pages)
				654	return;
				655
				656	spin_lock_irqsave(&zone->lock, flags);
				657	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
				658	ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
				659
				660	for (order = MAX_ORDER - 1; order >= 0; --order)
				661	list_for_each(curr, &zone->free_area[order].free_list) {
				662	unsigned long start_pfn, i;
				663
				664	start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
				665
				666	for (i=0; i < (1<<order); i++)
				667	SetPageNosaveFree(pfn_to_page(start_pfn+i));
				668	}
				669	spin_unlock_irqrestore(&zone->lock, flags);
				670	}
				671
				672	/*
				673	* Spill all of this CPU's per-cpu pages back into the buddy allocator.
				674	*/
				675	void drain_local_pages(void)
				676	{
				677	unsigned long flags;
				678
				679	local_irq_save(flags);
				680	__drain_pages(smp_processor_id());
				681	local_irq_restore(flags);
				682	}
				683	#endif /* CONFIG_PM */
				684
/*
 * Update the NUMA allocation counters kept in the per-cpu pageset of 'z'
 * for 'cpu'.  Compiles to nothing without CONFIG_NUMA.
 *
 * @zonelist: zonelist the allocation was attempted against; its first zone
 *            identifies the node that was asked for.
 * @z:        zone the page was actually taken from.
 * @cpu:      CPU whose pageset counters are bumped.
 *
 *  - numa_hit:     z is on the same node as the first zone of the zonelist
 *  - numa_miss:    z is on a different node; in that case numa_foreign is
 *                  also bumped on the first zone's pageset
 *  - local_node /
 *    other_node:   whether z's node is the node we are running on
 */
static void zone_statistics(struct zonelist *zonelist, struct zone *z, int cpu)
{
#ifdef CONFIG_NUMA
	pg_data_t *pg = z->zone_pgdat;
	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
	struct per_cpu_pageset *p;

	p = zone_pcp(z, cpu);
	if (pg == orig) {
		p->numa_hit++;
	} else {
		p->numa_miss++;
		zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
	}
	if (pg == NODE_DATA(numa_node_id()))
		p->local_node++;
	else
		p->other_node++;
#endif
}
				705
				706	/*
				707	* Free a 0-order page
				708	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	709	static void fastcall free_hot_cold_page(struct page *page, int cold)
				710	{
				711	struct zone *zone = page_zone(page);
				712	struct per_cpu_pages *pcp;
				713	unsigned long flags;
				714
				715	arch_free_page(page, 0);
				716
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	717	if (PageAnon(page))
				718	page->mapping = NULL;
Nick Piggin	224abf9	2006-01-06 00:11:11 -0800	[diff] [blame]	719	if (free_pages_check(page))
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	720	return;
				721
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	722	kernel_map_pages(page, 1, 0);
				723
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	724	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	725	local_irq_save(flags);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	726	__inc_page_state(pgfree);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	727	list_add(&page->lru, &pcp->list);
				728	pcp->count++;
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	729	if (pcp->count >= pcp->high) {
				730	free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
				731	pcp->count -= pcp->batch;
				732	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	733	local_irq_restore(flags);
				734	put_cpu();
				735	}
				736
				737	void fastcall free_hot_page(struct page *page)
				738	{
				739	free_hot_cold_page(page, 0);
				740	}
				741
				742	void fastcall free_cold_page(struct page *page)
				743	{
				744	free_hot_cold_page(page, 1);
				745	}
				746
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	747	static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	748	{
				749	int i;
				750
				751	BUG_ON((gfp_flags & (__GFP_WAIT \| __GFP_HIGHMEM)) == __GFP_HIGHMEM);
				752	for(i = 0; i < (1 << order); i++)
				753	clear_highpage(page + i);
				754	}
				755
				756	/*
				757	* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
				758	* we cheat by calling it from here, in the order > 0 path. Saves a branch
				759	* or two.
				760	*/
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	761	static struct page buffered_rmqueue(struct zonelist zonelist,
				762	struct zone *zone, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	763	{
				764	unsigned long flags;
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	765	struct page *page;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	766	int cold = !!(gfp_flags & __GFP_COLD);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	767	int cpu;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	768
Hugh Dickins	689bceb	2005-11-21 21:32:20 -0800	[diff] [blame]	769	again:
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	770	cpu = get_cpu();
Nick Piggin	48db57f	2006-01-08 01:00:42 -0800	[diff] [blame]	771	if (likely(order == 0)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	772	struct per_cpu_pages *pcp;
				773
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	774	pcp = &zone_pcp(zone, cpu)->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	775	local_irq_save(flags);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	776	if (!pcp->count) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	777	pcp->count += rmqueue_bulk(zone, 0,
				778	pcp->batch, &pcp->list);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	779	if (unlikely(!pcp->count))
				780	goto failed;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	781	}
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	782	page = list_entry(pcp->list.next, struct page, lru);
				783	list_del(&page->lru);
				784	pcp->count--;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	785	} else {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	786	spin_lock_irqsave(&zone->lock, flags);
				787	page = __rmqueue(zone, order);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	788	spin_unlock(&zone->lock);
				789	if (!page)
				790	goto failed;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	791	}
				792
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	793	__mod_page_state_zone(zone, pgalloc, 1 << order);
				794	zone_statistics(zonelist, zone, cpu);
				795	local_irq_restore(flags);
				796	put_cpu();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	797
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	798	BUG_ON(bad_range(zone, page));
				799	if (prep_new_page(page, order))
				800	goto again;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	801
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	802	if (gfp_flags & __GFP_ZERO)
				803	prep_zero_page(page, order, gfp_flags);
				804
				805	if (order && (gfp_flags & __GFP_COMP))
				806	prep_compound_page(page, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	807	return page;
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	808
				809	failed:
				810	local_irq_restore(flags);
				811	put_cpu();
				812	return NULL;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	813	}
				814
/* alloc_flags bits understood by get_page_from_freelist() and friends */
#define ALLOC_NO_WATERMARKS	0x01	/* skip the watermark check entirely */
#define ALLOC_WMARK_MIN		0x02	/* check against pages_min */
#define ALLOC_WMARK_LOW		0x04	/* check against pages_low */
#define ALLOC_WMARK_HIGH	0x08	/* check against pages_high */
#define ALLOC_HARDER		0x10	/* try harder to allocate */
#define ALLOC_HIGH		0x20	/* caller passed __GFP_HIGH */
#define ALLOC_CPUSET		0x40	/* enforce cpuset placement */
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	822
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	823	/*
				824	* Return 1 if free pages are above 'mark'. This takes into account the order
				825	* of the allocation.
				826	*/
				827	int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	828	int classzone_idx, int alloc_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	829	{
				830	/* free_pages my go negative - that's OK */
				831	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
				832	int o;
				833
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	834	if (alloc_flags & ALLOC_HIGH)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	835	min -= min / 2;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	836	if (alloc_flags & ALLOC_HARDER)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	837	min -= min / 4;
				838
				839	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
				840	return 0;
				841	for (o = 0; o < order; o++) {
				842	/* At the next order, this order's pages become unavailable */
				843	free_pages -= z->free_area[o].nr_free << o;
				844
				845	/* Require fewer higher order pages to be free */
				846	min >>= 1;
				847
				848	if (free_pages <= min)
				849	return 0;
				850	}
				851	return 1;
				852	}
				853
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	854	/*
				855	* get_page_from_freeliest goes through the zonelist trying to allocate
				856	* a page.
				857	*/
				858	static struct page *
				859	get_page_from_freelist(gfp_t gfp_mask, unsigned int order,
				860	struct zonelist *zonelist, int alloc_flags)
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	861	{
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	862	struct zone **z = zonelist->zones;
				863	struct page *page = NULL;
				864	int classzone_idx = zone_idx(*z);
				865
				866	/*
				867	* Go through the zonelist once, looking for a zone with enough free.
				868	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
				869	*/
				870	do {
				871	if ((alloc_flags & ALLOC_CPUSET) &&
				872	!cpuset_zone_allowed(*z, gfp_mask))
				873	continue;
				874
				875	if (!(alloc_flags & ALLOC_NO_WATERMARKS)) {
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	876	unsigned long mark;
				877	if (alloc_flags & ALLOC_WMARK_MIN)
				878	mark = (*z)->pages_min;
				879	else if (alloc_flags & ALLOC_WMARK_LOW)
				880	mark = (*z)->pages_low;
				881	else
				882	mark = (*z)->pages_high;
				883	if (!zone_watermark_ok(*z, order, mark,
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	884	classzone_idx, alloc_flags))
Christoph Lameter	9eeff23	2006-01-18 17:42:31 -0800	[diff] [blame]	885	if (!zone_reclaim_mode \|\|
				886	!zone_reclaim(*z, gfp_mask, order))
				887	continue;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	888	}
				889
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	890	page = buffered_rmqueue(zonelist, *z, order, gfp_mask);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	891	if (page) {
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	892	break;
				893	}
				894	} while (*(++z) != NULL);
				895	return page;
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	896	}
				897
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	898	/*
				899	* This is the 'heart' of the zoned buddy allocator.
				900	*/
				901	struct page * fastcall
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	902	__alloc_pages(gfp_t gfp_mask, unsigned int order,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	903	struct zonelist *zonelist)
				904	{
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	905	const gfp_t wait = gfp_mask & __GFP_WAIT;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	906	struct zone **z;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	907	struct page *page;
				908	struct reclaim_state reclaim_state;
				909	struct task_struct *p = current;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	910	int do_retry;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	911	int alloc_flags;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	912	int did_some_progress;
				913
				914	might_sleep_if(wait);
				915
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	916	restart:
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	917	z = zonelist->zones; /* the list of zones suitable for gfp_mask */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	918
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	919	if (unlikely(*z == NULL)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	920	/* Should this ever happen?? */
				921	return NULL;
				922	}
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	923
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	924	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	925	zonelist, ALLOC_WMARK_LOW\|ALLOC_CPUSET);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	926	if (page)
				927	goto got_pg;
				928
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	929	do {
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	930	wakeup_kswapd(*z, order);
Jens Axboe	6b1de91	2005-11-17 21:35:02 +0100	[diff] [blame]	931	} while (*(++z));
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	932
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	933	/*
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	934	* OK, we're below the kswapd watermark and have kicked background
				935	* reclaim. Now things get more complex, so set up alloc_flags according
				936	* to how we want to proceed.
				937	*
				938	* The caller may dip into page reserves a bit more if the caller
				939	* cannot run direct reclaim, or if the caller has realtime scheduling
Paul Jackson	4eac915	2006-01-11 12:17:19 -0800	[diff] [blame]	940	* policy or is asking for __GFP_HIGH memory. GFP_ATOMIC requests will
				941	* set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	942	*/
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	943	alloc_flags = ALLOC_WMARK_MIN;
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	944	if ((unlikely(rt_task(p)) && !in_interrupt()) \|\| !wait)
				945	alloc_flags \|= ALLOC_HARDER;
				946	if (gfp_mask & __GFP_HIGH)
				947	alloc_flags \|= ALLOC_HIGH;
Paul Jackson	47f3a86	2006-01-06 00:10:32 -0800	[diff] [blame]	948	alloc_flags \|= ALLOC_CPUSET;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	949
				950	/*
				951	* Go through the zonelist again. Let __GFP_HIGH and allocations
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	952	* coming from realtime tasks go deeper into reserves.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	953	*
				954	* This is the last chance, in general, before the goto nopage.
				955	* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	956	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	957	*/
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	958	page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
				959	if (page)
				960	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	961
				962	/* This allocation should allow future memory freeing. */
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	963
				964	if (((p->flags & PF_MEMALLOC) \|\| unlikely(test_thread_flag(TIF_MEMDIE)))
				965	&& !in_interrupt()) {
				966	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
Kirill Korotaev	885036d	2005-11-13 16:06:41 -0800	[diff] [blame]	967	nofail_alloc:
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	968	/* go through the zonelist yet again, ignoring mins */
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	969	page = get_page_from_freelist(gfp_mask, order,
Paul Jackson	47f3a86	2006-01-06 00:10:32 -0800	[diff] [blame]	970	zonelist, ALLOC_NO_WATERMARKS);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	971	if (page)
				972	goto got_pg;
Kirill Korotaev	885036d	2005-11-13 16:06:41 -0800	[diff] [blame]	973	if (gfp_mask & __GFP_NOFAIL) {
				974	blk_congestion_wait(WRITE, HZ/50);
				975	goto nofail_alloc;
				976	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	977	}
				978	goto nopage;
				979	}
				980
				981	/* Atomic allocations - we can't balance anything */
				982	if (!wait)
				983	goto nopage;
				984
				985	rebalance:
				986	cond_resched();
				987
				988	/* We now go into synchronous reclaim */
Paul Jackson	3e0d98b	2006-01-08 01:01:49 -0800	[diff] [blame]	989	cpuset_memory_pressure_bump();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	990	p->flags \|= PF_MEMALLOC;
				991	reclaim_state.reclaimed_slab = 0;
				992	p->reclaim_state = &reclaim_state;
				993
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	994	did_some_progress = try_to_free_pages(zonelist->zones, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	995
				996	p->reclaim_state = NULL;
				997	p->flags &= ~PF_MEMALLOC;
				998
				999	cond_resched();
				1000
				1001	if (likely(did_some_progress)) {
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	1002	page = get_page_from_freelist(gfp_mask, order,
				1003	zonelist, alloc_flags);
				1004	if (page)
				1005	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1006	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
				1007	/*
				1008	* Go through the zonelist yet one more time, keep
				1009	* very high watermark here, this is only to catch
				1010	* a parallel oom killing, we must fail if we're still
				1011	* under heavy pressure.
				1012	*/
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	1013	page = get_page_from_freelist(gfp_mask\|__GFP_HARDWALL, order,
Nick Piggin	3148890	2005-11-28 13:44:03 -0800	[diff] [blame]	1014	zonelist, ALLOC_WMARK_HIGH\|ALLOC_CPUSET);
Rohit Seth	7fb1d9f	2005-11-13 16:06:43 -0800	[diff] [blame]	1015	if (page)
				1016	goto got_pg;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1017
Christoph Lameter	9b0f8b0	2006-02-20 18:27:52 -0800	[diff] [blame]	1018	out_of_memory(zonelist, gfp_mask, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1019	goto restart;
				1020	}
				1021
				1022	/*
				1023	* Don't let big-order allocations loop unless the caller explicitly
				1024	* requests that. Wait for some write requests to complete then retry.
				1025	*
				1026	* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
				1027	* <= 3, but that may not be true in other implementations.
				1028	*/
				1029	do_retry = 0;
				1030	if (!(gfp_mask & __GFP_NORETRY)) {
				1031	if ((order <= 3) \|\| (gfp_mask & __GFP_REPEAT))
				1032	do_retry = 1;
				1033	if (gfp_mask & __GFP_NOFAIL)
				1034	do_retry = 1;
				1035	}
				1036	if (do_retry) {
				1037	blk_congestion_wait(WRITE, HZ/50);
				1038	goto rebalance;
				1039	}
				1040
				1041	nopage:
				1042	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
				1043	printk(KERN_WARNING "%s: page allocation failure."
				1044	" order:%d, mode:0x%x\n",
				1045	p->comm, order, gfp_mask);
				1046	dump_stack();
Janet Morgan	578c2fd	2005-06-21 17:14:56 -0700	[diff] [blame]	1047	show_mem();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1048	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1049	got_pg:
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1050	return page;
				1051	}
				1052
				1053	EXPORT_SYMBOL(__alloc_pages);
				1054
				1055	/*
				1056	* Common helper functions.
				1057	*/
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	1058	fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1059	{
				1060	struct page * page;
				1061	page = alloc_pages(gfp_mask, order);
				1062	if (!page)
				1063	return 0;
				1064	return (unsigned long) page_address(page);
				1065	}
				1066
				1067	EXPORT_SYMBOL(__get_free_pages);
				1068
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	1069	fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1070	{
				1071	struct page * page;
				1072
				1073	/*
				1074	* get_zeroed_page() returns a 32-bit address, which cannot represent
				1075	* a highmem page
				1076	*/
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1077	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1078
				1079	page = alloc_pages(gfp_mask \| __GFP_ZERO, 0);
				1080	if (page)
				1081	return (unsigned long) page_address(page);
				1082	return 0;
				1083	}
				1084
				1085	EXPORT_SYMBOL(get_zeroed_page);
				1086
				1087	void __pagevec_free(struct pagevec *pvec)
				1088	{
				1089	int i = pagevec_count(pvec);
				1090
				1091	while (--i >= 0)
				1092	free_hot_cold_page(pvec->pages[i], pvec->cold);
				1093	}
				1094
				1095	fastcall void __free_pages(struct page *page, unsigned int order)
				1096	{
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1097	if (put_page_testzero(page)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1098	if (order == 0)
				1099	free_hot_page(page);
				1100	else
				1101	__free_pages_ok(page, order);
				1102	}
				1103	}
				1104
				1105	EXPORT_SYMBOL(__free_pages);
				1106
				1107	fastcall void free_pages(unsigned long addr, unsigned int order)
				1108	{
				1109	if (addr != 0) {
				1110	BUG_ON(!virt_addr_valid((void *)addr));
				1111	__free_pages(virt_to_page((void *)addr), order);
				1112	}
				1113	}
				1114
				1115	EXPORT_SYMBOL(free_pages);
				1116
				1117	/*
				1118	* Total amount of free (allocatable) RAM:
				1119	*/
				1120	unsigned int nr_free_pages(void)
				1121	{
				1122	unsigned int sum = 0;
				1123	struct zone *zone;
				1124
				1125	for_each_zone(zone)
				1126	sum += zone->free_pages;
				1127
				1128	return sum;
				1129	}
				1130
				1131	EXPORT_SYMBOL(nr_free_pages);
				1132
				1133	#ifdef CONFIG_NUMA
				1134	unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
				1135	{
				1136	unsigned int i, sum = 0;
				1137
				1138	for (i = 0; i < MAX_NR_ZONES; i++)
				1139	sum += pgdat->node_zones[i].free_pages;
				1140
				1141	return sum;
				1142	}
				1143	#endif
				1144
				1145	static unsigned int nr_free_zone_pages(int offset)
				1146	{
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1147	/* Just pick one node, since fallback list is circular */
				1148	pg_data_t *pgdat = NODE_DATA(numa_node_id());
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1149	unsigned int sum = 0;
				1150
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1151	struct zonelist *zonelist = pgdat->node_zonelists + offset;
				1152	struct zone **zonep = zonelist->zones;
				1153	struct zone *zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1154
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1155	for (zone = zonep++; zone; zone = zonep++) {
				1156	unsigned long size = zone->present_pages;
				1157	unsigned long high = zone->pages_high;
				1158	if (size > high)
				1159	sum += size - high;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1160	}
				1161
				1162	return sum;
				1163	}
				1164
				1165	/*
				1166	* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
				1167	*/
				1168	unsigned int nr_free_buffer_pages(void)
				1169	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1170	return nr_free_zone_pages(gfp_zone(GFP_USER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1171	}
				1172
				1173	/*
				1174	* Amount of free RAM allocatable within all zones
				1175	*/
				1176	unsigned int nr_free_pagecache_pages(void)
				1177	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1178	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1179	}
				1180
				1181	#ifdef CONFIG_HIGHMEM
				1182	unsigned int nr_free_highpages (void)
				1183	{
				1184	pg_data_t *pgdat;
				1185	unsigned int pages = 0;
				1186
				1187	for_each_pgdat(pgdat)
				1188	pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1189
				1190	return pages;
				1191	}
				1192	#endif
				1193
				1194	#ifdef CONFIG_NUMA
				1195	static void show_node(struct zone *zone)
				1196	{
				1197	printk("Node %d ", zone->zone_pgdat->node_id);
				1198	}
				1199	#else
				1200	#define show_node(zone) do { } while (0)
				1201	#endif
				1202
				1203	/*
				1204	* Accumulate the page_state information across all CPUs.
				1205	* The result is unavoidably approximate - it can change
				1206	* during and after execution of this function.
				1207	*/
				1208	static DEFINE_PER_CPU(struct page_state, page_states) = {0};
				1209
				1210	atomic_t nr_pagecache = ATOMIC_INIT(0);
				1211	EXPORT_SYMBOL(nr_pagecache);
				1212	#ifdef CONFIG_SMP
				1213	DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
				1214	#endif
				1215
Nick Piggin	a86b1f5	2006-01-06 00:11:00 -0800	[diff] [blame]	1216	static void __get_page_state(struct page_state ret, int nr, cpumask_t cpumask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1217	{
				1218	int cpu = 0;
				1219
Eric Dumazet	88a2a4ac	2006-02-04 23:27:36 -0800	[diff] [blame]	1220	memset(ret, 0, nr * sizeof(unsigned long));
Andrew Morton	84c2008	2006-01-08 01:00:28 -0800	[diff] [blame]	1221	cpus_and(cpumask, cpumask, cpu_online_map);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1222
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1223	cpu = first_cpu(*cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1224	while (cpu < NR_CPUS) {
				1225	unsigned long in, out, off;
				1226
Eric Dumazet	88a2a4ac	2006-02-04 23:27:36 -0800	[diff] [blame]	1227	if (!cpu_isset(cpu, *cpumask))
				1228	continue;
				1229
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1230	in = (unsigned long *)&per_cpu(page_states, cpu);
				1231
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1232	cpu = next_cpu(cpu, *cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1233
Eric Dumazet	88a2a4ac	2006-02-04 23:27:36 -0800	[diff] [blame]	1234	if (likely(cpu < NR_CPUS))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1235	prefetch(&per_cpu(page_states, cpu));
				1236
				1237	out = (unsigned long *)ret;
				1238	for (off = 0; off < nr; off++)
				1239	out++ += in++;
				1240	}
				1241	}
				1242
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1243	void get_page_state_node(struct page_state *ret, int node)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1244	{
				1245	int nr;
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1246	cpumask_t mask = node_to_cpumask(node);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1247
				1248	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1249	nr /= sizeof(unsigned long);
				1250
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1251	__get_page_state(ret, nr+1, &mask);
				1252	}
				1253
				1254	void get_page_state(struct page_state *ret)
				1255	{
				1256	int nr;
				1257	cpumask_t mask = CPU_MASK_ALL;
				1258
				1259	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1260	nr /= sizeof(unsigned long);
				1261
				1262	__get_page_state(ret, nr + 1, &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1263	}
				1264
				1265	void get_full_page_state(struct page_state *ret)
				1266	{
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1267	cpumask_t mask = CPU_MASK_ALL;
				1268
				1269	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1270	}
				1271
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	1272	unsigned long read_page_state_offset(unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1273	{
				1274	unsigned long ret = 0;
				1275	int cpu;
				1276
Andrew Morton	84c2008	2006-01-08 01:00:28 -0800	[diff] [blame]	1277	for_each_online_cpu(cpu) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1278	unsigned long in;
				1279
				1280	in = (unsigned long)&per_cpu(page_states, cpu) + offset;
				1281	ret += ((unsigned long )in);
				1282	}
				1283	return ret;
				1284	}
				1285
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	1286	void __mod_page_state_offset(unsigned long offset, unsigned long delta)
				1287	{
				1288	void *ptr;
				1289
				1290	ptr = &__get_cpu_var(page_states);
				1291	(unsigned long )(ptr + offset) += delta;
				1292	}
				1293	EXPORT_SYMBOL(__mod_page_state_offset);
				1294
				1295	void mod_page_state_offset(unsigned long offset, unsigned long delta)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1296	{
				1297	unsigned long flags;
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	1298	void *ptr;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1299
				1300	local_irq_save(flags);
				1301	ptr = &__get_cpu_var(page_states);
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	1302	(unsigned long )(ptr + offset) += delta;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1303	local_irq_restore(flags);
				1304	}
Nick Piggin	a74609f	2006-01-06 00:11:20 -0800	[diff] [blame]	1305	EXPORT_SYMBOL(mod_page_state_offset);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1306
				1307	void __get_zone_counts(unsigned long active, unsigned long inactive,
				1308	unsigned long free, struct pglist_data pgdat)
				1309	{
				1310	struct zone *zones = pgdat->node_zones;
				1311	int i;
				1312
				1313	*active = 0;
				1314	*inactive = 0;
				1315	*free = 0;
				1316	for (i = 0; i < MAX_NR_ZONES; i++) {
				1317	*active += zones[i].nr_active;
				1318	*inactive += zones[i].nr_inactive;
				1319	*free += zones[i].free_pages;
				1320	}
				1321	}
				1322
				1323	void get_zone_counts(unsigned long *active,
				1324	unsigned long inactive, unsigned long free)
				1325	{
				1326	struct pglist_data *pgdat;
				1327
				1328	*active = 0;
				1329	*inactive = 0;
				1330	*free = 0;
				1331	for_each_pgdat(pgdat) {
				1332	unsigned long l, m, n;
				1333	__get_zone_counts(&l, &m, &n, pgdat);
				1334	*active += l;
				1335	*inactive += m;
				1336	*free += n;
				1337	}
				1338	}
				1339
				1340	void si_meminfo(struct sysinfo *val)
				1341	{
				1342	val->totalram = totalram_pages;
				1343	val->sharedram = 0;
				1344	val->freeram = nr_free_pages();
				1345	val->bufferram = nr_blockdev_pages();
				1346	#ifdef CONFIG_HIGHMEM
				1347	val->totalhigh = totalhigh_pages;
				1348	val->freehigh = nr_free_highpages();
				1349	#else
				1350	val->totalhigh = 0;
				1351	val->freehigh = 0;
				1352	#endif
				1353	val->mem_unit = PAGE_SIZE;
				1354	}
				1355
				1356	EXPORT_SYMBOL(si_meminfo);
				1357
				1358	#ifdef CONFIG_NUMA
				1359	void si_meminfo_node(struct sysinfo *val, int nid)
				1360	{
				1361	pg_data_t *pgdat = NODE_DATA(nid);
				1362
				1363	val->totalram = pgdat->node_present_pages;
				1364	val->freeram = nr_free_pages_pgdat(pgdat);
				1365	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1366	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1367	val->mem_unit = PAGE_SIZE;
				1368	}
				1369	#endif
				1370
				1371	#define K(x) ((x) << (PAGE_SHIFT-10))
				1372
				1373	/*
				1374	* Show free area list (used inside shift_scroll-lock stuff)
				1375	* We also calculate the percentage fragmentation. We do this by counting the
				1376	* memory on each free list with the exception of the first item on the list.
				1377	*/
				1378	void show_free_areas(void)
				1379	{
				1380	struct page_state ps;
				1381	int cpu, temperature;
				1382	unsigned long active;
				1383	unsigned long inactive;
				1384	unsigned long free;
				1385	struct zone *zone;
				1386
				1387	for_each_zone(zone) {
				1388	show_node(zone);
				1389	printk("%s per-cpu:", zone->name);
				1390
Con Kolivas	f3fe651	2006-01-06 00:11:15 -0800	[diff] [blame]	1391	if (!populated_zone(zone)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1392	printk(" empty\n");
				1393	continue;
				1394	} else
				1395	printk("\n");
				1396
Dave Jones	6b482c6	2005-11-10 15:45:56 -0500	[diff] [blame]	1397	for_each_online_cpu(cpu) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1398	struct per_cpu_pageset *pageset;
				1399
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1400	pageset = zone_pcp(zone, cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1401
				1402	for (temperature = 0; temperature < 2; temperature++)
Nick Piggin	2d92c5c	2006-01-06 00:10:59 -0800	[diff] [blame]	1403	printk("cpu %d %s: high %d, batch %d used:%d\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1404	cpu,
				1405	temperature ? "cold" : "hot",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1406	pageset->pcp[temperature].high,
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	1407	pageset->pcp[temperature].batch,
				1408	pageset->pcp[temperature].count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1409	}
				1410	}
				1411
				1412	get_page_state(&ps);
				1413	get_zone_counts(&active, &inactive, &free);
				1414
Denis Vlasenko	c0d6221	2005-06-21 17:15:14 -0700	[diff] [blame]	1415	printk("Free pages: %11ukB (%ukB HighMem)\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1416	K(nr_free_pages()),
				1417	K(nr_free_highpages()));
				1418
				1419	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
				1420	"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
				1421	active,
				1422	inactive,
				1423	ps.nr_dirty,
				1424	ps.nr_writeback,
				1425	ps.nr_unstable,
				1426	nr_free_pages(),
				1427	ps.nr_slab,
				1428	ps.nr_mapped,
				1429	ps.nr_page_table_pages);
				1430
				1431	for_each_zone(zone) {
				1432	int i;
				1433
				1434	show_node(zone);
				1435	printk("%s"
				1436	" free:%lukB"
				1437	" min:%lukB"
				1438	" low:%lukB"
				1439	" high:%lukB"
				1440	" active:%lukB"
				1441	" inactive:%lukB"
				1442	" present:%lukB"
				1443	" pages_scanned:%lu"
				1444	" all_unreclaimable? %s"
				1445	"\n",
				1446	zone->name,
				1447	K(zone->free_pages),
				1448	K(zone->pages_min),
				1449	K(zone->pages_low),
				1450	K(zone->pages_high),
				1451	K(zone->nr_active),
				1452	K(zone->nr_inactive),
				1453	K(zone->present_pages),
				1454	zone->pages_scanned,
				1455	(zone->all_unreclaimable ? "yes" : "no")
				1456	);
				1457	printk("lowmem_reserve[]:");
				1458	for (i = 0; i < MAX_NR_ZONES; i++)
				1459	printk(" %lu", zone->lowmem_reserve[i]);
				1460	printk("\n");
				1461	}
				1462
				1463	for_each_zone(zone) {
				1464	unsigned long nr, flags, order, total = 0;
				1465
				1466	show_node(zone);
				1467	printk("%s: ", zone->name);
Con Kolivas	f3fe651	2006-01-06 00:11:15 -0800	[diff] [blame]	1468	if (!populated_zone(zone)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1469	printk("empty\n");
				1470	continue;
				1471	}
				1472
				1473	spin_lock_irqsave(&zone->lock, flags);
				1474	for (order = 0; order < MAX_ORDER; order++) {
				1475	nr = zone->free_area[order].nr_free;
				1476	total += nr << order;
				1477	printk("%lu*%lukB ", nr, K(1UL) << order);
				1478	}
				1479	spin_unlock_irqrestore(&zone->lock, flags);
				1480	printk("= %lukB\n", K(total));
				1481	}
				1482
				1483	show_swap_cache_info();
				1484	}
				1485
				1486	/*
				1487	* Builds allocation fallback zone lists.
Christoph Lameter	1a93205	2006-01-06 00:11:16 -0800	[diff] [blame]	1488	*
				1489	* Add all populated zones of a node to the zonelist.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1490	*/
Christoph Lameter	1a93205	2006-01-06 00:11:16 -0800	[diff] [blame]	1491	static int __init build_zonelists_node(pg_data_t *pgdat,
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1492	struct zonelist *zonelist, int nr_zones, int zone_type)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1493	{
Christoph Lameter	1a93205	2006-01-06 00:11:16 -0800	[diff] [blame]	1494	struct zone *zone;
				1495
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1496	BUG_ON(zone_type > ZONE_HIGHMEM);
Christoph Lameter	02a68a5	2006-01-06 00:11:18 -0800	[diff] [blame]	1497
				1498	do {
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1499	zone = pgdat->node_zones + zone_type;
Christoph Lameter	1a93205	2006-01-06 00:11:16 -0800	[diff] [blame]	1500	if (populated_zone(zone)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1501	#ifndef CONFIG_HIGHMEM
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1502	BUG_ON(zone_type > ZONE_NORMAL);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1503	#endif
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1504	zonelist->zones[nr_zones++] = zone;
				1505	check_highest_zone(zone_type);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1506	}
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1507	zone_type--;
Christoph Lameter	02a68a5	2006-01-06 00:11:18 -0800	[diff] [blame]	1508
Christoph Lameter	070f803	2006-01-06 00:11:19 -0800	[diff] [blame]	1509	} while (zone_type >= 0);
				1510	return nr_zones;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1511	}
				1512
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1513	static inline int highest_zone(int zone_bits)
				1514	{
				1515	int res = ZONE_NORMAL;
				1516	if (zone_bits & (__force int)__GFP_HIGHMEM)
				1517	res = ZONE_HIGHMEM;
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	1518	if (zone_bits & (__force int)__GFP_DMA32)
				1519	res = ZONE_DMA32;
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1520	if (zone_bits & (__force int)__GFP_DMA)
				1521	res = ZONE_DMA;
				1522	return res;
				1523	}
				1524
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1525	#ifdef CONFIG_NUMA
				1526	#define MAX_NODE_LOAD (num_online_nodes())
				1527	static int __initdata node_load[MAX_NUMNODES];
				1528	/**
Pavel Pisa	4dc3b16	2005-05-01 08:59:25 -0700	[diff] [blame]	1529	* find_next_best_node - find the next node that should appear in a given node's fallback list
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1530	* @node: node whose fallback list we're appending
				1531	* @used_node_mask: nodemask_t of already used nodes
				1532	*
				1533	* We use a number of factors to determine which is the next node that should
				1534	* appear on a given node's fallback list. The node should not have appeared
				1535	* already in @node's fallback list, and it should be the next closest node
				1536	* according to the distance array (which contains arbitrary distance values
				1537	* from each node to each node in the system), and should also prefer nodes
				1538	* with no CPUs, since presumably they'll have very little allocation pressure
				1539	* on them otherwise.
				1540	* It returns -1 if no node is found.
				1541	*/
				1542	static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
				1543	{
Linus Torvalds	4cf808e	2006-02-17 20:38:21 +0100	[diff] [blame]	1544	int n, val;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1545	int min_val = INT_MAX;
				1546	int best_node = -1;
				1547
Linus Torvalds	4cf808e	2006-02-17 20:38:21 +0100	[diff] [blame]	1548	/* Use the local node if we haven't already */
				1549	if (!node_isset(node, *used_node_mask)) {
				1550	node_set(node, *used_node_mask);
				1551	return node;
				1552	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1553
Linus Torvalds	4cf808e	2006-02-17 20:38:21 +0100	[diff] [blame]	1554	for_each_online_node(n) {
				1555	cpumask_t tmp;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1556
				1557	/* Don't want a node to appear more than once */
				1558	if (node_isset(n, *used_node_mask))
				1559	continue;
				1560
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1561	/* Use the distance array to find the distance */
				1562	val = node_distance(node, n);
				1563
Linus Torvalds	4cf808e	2006-02-17 20:38:21 +0100	[diff] [blame]	1564	/* Penalize nodes under us ("prefer the next node") */
				1565	val += (n < node);
				1566
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1567	/* Give preference to headless and unused nodes */
				1568	tmp = node_to_cpumask(n);
				1569	if (!cpus_empty(tmp))
				1570	val += PENALTY_FOR_NODE_WITH_CPUS;
				1571
				1572	/* Slight preference for less loaded node */
				1573	val = (MAX_NODE_LOADMAX_NUMNODES);
				1574	val += node_load[n];
				1575
				1576	if (val < min_val) {
				1577	min_val = val;
				1578	best_node = n;
				1579	}
				1580	}
				1581
				1582	if (best_node >= 0)
				1583	node_set(best_node, *used_node_mask);
				1584
				1585	return best_node;
				1586	}
				1587
				1588	static void __init build_zonelists(pg_data_t *pgdat)
				1589	{
				1590	int i, j, k, node, local_node;
				1591	int prev_node, load;
				1592	struct zonelist *zonelist;
				1593	nodemask_t used_mask;
				1594
				1595	/* initialize zonelists */
				1596	for (i = 0; i < GFP_ZONETYPES; i++) {
				1597	zonelist = pgdat->node_zonelists + i;
				1598	zonelist->zones[0] = NULL;
				1599	}
				1600
				1601	/* NUMA-aware ordering of nodes */
				1602	local_node = pgdat->node_id;
				1603	load = num_online_nodes();
				1604	prev_node = local_node;
				1605	nodes_clear(used_mask);
				1606	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
Christoph Lameter	9eeff23	2006-01-18 17:42:31 -0800	[diff] [blame]	1607	int distance = node_distance(local_node, node);
				1608
				1609	/*
				1610	* If another node is sufficiently far away then it is better
				1611	* to reclaim pages in a zone before going off node.
				1612	*/
				1613	if (distance > RECLAIM_DISTANCE)
				1614	zone_reclaim_mode = 1;
				1615
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1616	/*
				1617	* We don't want to pressure a particular node.
				1618	* So adding penalty to the first node in same
				1619	* distance group to make it round-robin.
				1620	*/
Christoph Lameter	9eeff23	2006-01-18 17:42:31 -0800	[diff] [blame]	1621
				1622	if (distance != node_distance(local_node, prev_node))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1623	node_load[node] += load;
				1624	prev_node = node;
				1625	load--;
				1626	for (i = 0; i < GFP_ZONETYPES; i++) {
				1627	zonelist = pgdat->node_zonelists + i;
				1628	for (j = 0; zonelist->zones[j] != NULL; j++);
				1629
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1630	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1631
				1632	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1633	zonelist->zones[j] = NULL;
				1634	}
				1635	}
				1636	}
				1637
				1638	#else /* CONFIG_NUMA */
				1639
				1640	static void __init build_zonelists(pg_data_t *pgdat)
				1641	{
				1642	int i, j, k, node, local_node;
				1643
				1644	local_node = pgdat->node_id;
				1645	for (i = 0; i < GFP_ZONETYPES; i++) {
				1646	struct zonelist *zonelist;
				1647
				1648	zonelist = pgdat->node_zonelists + i;
				1649
				1650	j = 0;
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1651	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1652	j = build_zonelists_node(pgdat, zonelist, j, k);
				1653	/*
				1654	* Now we build the zonelist so that it contains the zones
				1655	* of all the other nodes.
				1656	* We don't want to pressure a particular node, so when
				1657	* building the zones for node N, we make sure that the
				1658	* zones coming right after the local ones are those from
				1659	* node N+1 (modulo N)
				1660	*/
				1661	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
				1662	if (!node_online(node))
				1663	continue;
				1664	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1665	}
				1666	for (node = 0; node < local_node; node++) {
				1667	if (!node_online(node))
				1668	continue;
				1669	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1670	}
				1671
				1672	zonelist->zones[j] = NULL;
				1673	}
				1674	}
				1675
				1676	#endif /* CONFIG_NUMA */
				1677
				1678	void __init build_all_zonelists(void)
				1679	{
				1680	int i;
				1681
				1682	for_each_online_node(i)
				1683	build_zonelists(NODE_DATA(i));
				1684	printk("Built %i zonelists\n", num_online_nodes());
				1685	cpuset_init_current_mems_allowed();
				1686	}
				1687
				1688	/*
				1689	* Helper functions to size the waitqueue hash table.
				1690	* Essentially these want to choose hash table sizes sufficiently
				1691	* large so that collisions trying to wait on pages are rare.
				1692	* But in fact, the number of active page waitqueues on typical
				1693	* systems is ridiculously low, less than 200. So this is even
				1694	* conservative, even though it seems large.
				1695	*
				1696	* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
				1697	* waitqueues, i.e. the size of the waitq table given the number of pages.
				1698	*/
				1699	#define PAGES_PER_WAITQUEUE 256
				1700
				1701	static inline unsigned long wait_table_size(unsigned long pages)
				1702	{
				1703	unsigned long size = 1;
				1704
				1705	pages /= PAGES_PER_WAITQUEUE;
				1706
				1707	while (size < pages)
				1708	size <<= 1;
				1709
				1710	/*
				1711	* Once we have dozens or even hundreds of threads sleeping
				1712	* on IO we've got bigger problems than wait queue collision.
				1713	* Limit the size of the wait table to a reasonable size.
				1714	*/
				1715	size = min(size, 4096UL);
				1716
				1717	return max(size, 4UL);
				1718	}
				1719
				1720	/*
				1721	* This is an integer logarithm so that shifts can be used later
				1722	* to extract the more random high bits from the multiplicative
				1723	* hash function before the remainder is taken.
				1724	*/
				1725	static inline unsigned long wait_table_bits(unsigned long size)
				1726	{
				1727	return ffz(~size);
				1728	}
				1729
				1730	#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
				1731
				1732	static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
				1733	unsigned long zones_size, unsigned long zholes_size)
				1734	{
				1735	unsigned long realtotalpages, totalpages = 0;
				1736	int i;
				1737
				1738	for (i = 0; i < MAX_NR_ZONES; i++)
				1739	totalpages += zones_size[i];
				1740	pgdat->node_spanned_pages = totalpages;
				1741
				1742	realtotalpages = totalpages;
				1743	if (zholes_size)
				1744	for (i = 0; i < MAX_NR_ZONES; i++)
				1745	realtotalpages -= zholes_size[i];
				1746	pgdat->node_present_pages = realtotalpages;
				1747	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
				1748	}
				1749
				1750
				1751	/*
				1752	* Initially all pages are reserved - free ones are freed
				1753	* up by free_all_bootmem() once the early boot process is
				1754	* done. Non-atomic initialization, single-pass.
				1755	*/
Matt Tolentino	c09b424	2006-01-17 07:03:44 +0100	[diff] [blame]	1756	void __meminit memmap_init_zone(unsigned long size, int nid, unsigned long zone,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1757	unsigned long start_pfn)
				1758	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1759	struct page *page;
Andy Whitcroft	29751f6	2005-06-23 00:08:00 -0700	[diff] [blame]	1760	unsigned long end_pfn = start_pfn + size;
				1761	unsigned long pfn;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1762
Greg Ungerer	cbe8dd4	2006-01-12 01:05:24 -0800	[diff] [blame]	1763	for (pfn = start_pfn; pfn < end_pfn; pfn++) {
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1764	if (!early_pfn_valid(pfn))
				1765	continue;
				1766	page = pfn_to_page(pfn);
				1767	set_page_links(page, zone, nid, pfn);
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1768	set_page_count(page, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1769	reset_page_mapcount(page);
				1770	SetPageReserved(page);
				1771	INIT_LIST_HEAD(&page->lru);
				1772	#ifdef WANT_PAGE_VIRTUAL
				1773	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
				1774	if (!is_highmem_idx(zone))
Bob Picco	3212c6b	2005-06-27 14:36:28 -0700	[diff] [blame]	1775	set_page_address(page, __va(pfn << PAGE_SHIFT));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1776	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1777	}
				1778	}
				1779
				1780	void zone_init_free_lists(struct pglist_data pgdat, struct zone zone,
				1781	unsigned long size)
				1782	{
				1783	int order;
				1784	for (order = 0; order < MAX_ORDER ; order++) {
				1785	INIT_LIST_HEAD(&zone->free_area[order].free_list);
				1786	zone->free_area[order].nr_free = 0;
				1787	}
				1788	}
				1789
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1790	#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) \| zone_nr)
				1791	void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
				1792	unsigned long size)
				1793	{
				1794	unsigned long snum = pfn_to_section_nr(pfn);
				1795	unsigned long end = pfn_to_section_nr(pfn + size);
				1796
				1797	if (FLAGS_HAS_NODE)
				1798	zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
				1799	else
				1800	for (; snum <= end; snum++)
				1801	zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
				1802	}
				1803
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1804	#ifndef __HAVE_ARCH_MEMMAP_INIT
				1805	#define memmap_init(size, nid, zone, start_pfn) \
				1806	memmap_init_zone((size), (nid), (zone), (start_pfn))
				1807	#endif
				1808
Ashok Raj	6292d9a	2006-02-01 03:04:44 -0800	[diff] [blame]	1809	static int __cpuinit zone_batchsize(struct zone *zone)
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1810	{
				1811	int batch;
				1812
				1813	/*
				1814	* The per-cpu-pages pools are set to around 1000th of the
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1815	* size of the zone. But no more than 1/2 of a meg.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1816	*
				1817	* OK, so we don't know how big the cache is. So guess.
				1818	*/
				1819	batch = zone->present_pages / 1024;
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1820	if (batch * PAGE_SIZE > 512 * 1024)
				1821	batch = (512 * 1024) / PAGE_SIZE;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1822	batch /= 4; /* We effectively = 4 below /
				1823	if (batch < 1)
				1824	batch = 1;
				1825
				1826	/*
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1827	* Clamp the batch to a 2^n - 1 value. Having a power
				1828	* of 2 value was found to be more likely to have
				1829	* suboptimal cache aliasing properties in some cases.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1830	*
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1831	* For example if 2 tasks are alternately allocating
				1832	* batches of pages, one task can end up with a lot
				1833	* of pages of one half of the possible page colors
				1834	* and the other with pages of the other colors.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1835	*/
Nick Piggin	0ceaacc	2005-12-04 13:55:25 +1100	[diff] [blame]	1836	batch = (1 << (fls(batch + batch/2)-1)) - 1;
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1837
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1838	return batch;
				1839	}
				1840
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1841	inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
				1842	{
				1843	struct per_cpu_pages *pcp;
				1844
Magnus Damm	1c6fe94	2005-10-26 01:58:59 -0700	[diff] [blame]	1845	memset(p, 0, sizeof(*p));
				1846
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1847	pcp = &p->pcp[0]; /* hot */
				1848	pcp->count = 0;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1849	pcp->high = 6 * batch;
				1850	pcp->batch = max(1UL, 1 * batch);
				1851	INIT_LIST_HEAD(&pcp->list);
				1852
				1853	pcp = &p->pcp[1]; /* cold*/
				1854	pcp->count = 0;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1855	pcp->high = 2 * batch;
Seth, Rohit	e46a5e2	2005-10-29 18:15:48 -0700	[diff] [blame]	1856	pcp->batch = max(1UL, batch/2);
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1857	INIT_LIST_HEAD(&pcp->list);
				1858	}
				1859
Rohit Seth	8ad4b1f	2006-01-08 01:00:40 -0800	[diff] [blame]	1860	/*
				1861	* setup_pagelist_highmark() sets the high water mark for hot per_cpu_pagelist
				1862	* to the value high for the pageset p.
				1863	*/
				1864
				1865	static void setup_pagelist_highmark(struct per_cpu_pageset *p,
				1866	unsigned long high)
				1867	{
				1868	struct per_cpu_pages *pcp;
				1869
				1870	pcp = &p->pcp[0]; /* hot list */
				1871	pcp->high = high;
				1872	pcp->batch = max(1UL, high/4);
				1873	if ((high/4) > (PAGE_SHIFT * 8))
				1874	pcp->batch = PAGE_SHIFT * 8;
				1875	}
				1876
				1877
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1878	#ifdef CONFIG_NUMA
				1879	/*
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1880	* Boot pageset table. One per cpu which is going to be used for all
				1881	* zones and all nodes. The parameters will be set in such a way
				1882	* that an item put on a list will immediately be handed over to
				1883	* the buddy list. This is safe since pageset manipulation is done
				1884	* with interrupts disabled.
				1885	*
				1886	* Some NUMA counter updates may also be caught by the boot pagesets.
Christoph Lameter	b7c84c6	2005-06-22 20:26:07 -0700	[diff] [blame]	1887	*
				1888	* The boot_pagesets must be kept even after bootup is complete for
				1889	* unused processors and/or zones. They do play a role for bootstrapping
				1890	* hotplugged processors.
				1891	*
				1892	* zoneinfo_show() and maybe other functions do
				1893	* not check if the processor is online before following the pageset pointer.
				1894	* Other parts of the kernel may not check if the zone is available.
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1895	*/
Eric Dumazet	88a2a4ac	2006-02-04 23:27:36 -0800	[diff] [blame]	1896	static struct per_cpu_pageset boot_pageset[NR_CPUS];
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1897
				1898	/*
				1899	* Dynamically allocate memory for the
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1900	* per cpu pageset array in struct zone.
				1901	*/
Ashok Raj	6292d9a	2006-02-01 03:04:44 -0800	[diff] [blame]	1902	static int __cpuinit process_zones(int cpu)
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1903	{
				1904	struct zone zone, dzone;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1905
				1906	for_each_zone(zone) {
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1907
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	1908	zone_pcp(zone, cpu) = kmalloc_node(sizeof(struct per_cpu_pageset),
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1909	GFP_KERNEL, cpu_to_node(cpu));
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	1910	if (!zone_pcp(zone, cpu))
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1911	goto bad;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1912
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	1913	setup_pageset(zone_pcp(zone, cpu), zone_batchsize(zone));
Rohit Seth	8ad4b1f	2006-01-08 01:00:40 -0800	[diff] [blame]	1914
				1915	if (percpu_pagelist_fraction)
				1916	setup_pagelist_highmark(zone_pcp(zone, cpu),
				1917	(zone->present_pages / percpu_pagelist_fraction));
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1918	}
				1919
				1920	return 0;
				1921	bad:
				1922	for_each_zone(dzone) {
				1923	if (dzone == zone)
				1924	break;
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	1925	kfree(zone_pcp(dzone, cpu));
				1926	zone_pcp(dzone, cpu) = NULL;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1927	}
				1928	return -ENOMEM;
				1929	}
				1930
				1931	static inline void free_zone_pagesets(int cpu)
				1932	{
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1933	struct zone *zone;
				1934
				1935	for_each_zone(zone) {
				1936	struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
				1937
				1938	zone_pcp(zone, cpu) = NULL;
				1939	kfree(pset);
				1940	}
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1941	}
				1942
Ashok Raj	6292d9a	2006-02-01 03:04:44 -0800	[diff] [blame]	1943	static int __cpuinit pageset_cpuup_callback(struct notifier_block *nfb,
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1944	unsigned long action,
				1945	void *hcpu)
				1946	{
				1947	int cpu = (long)hcpu;
				1948	int ret = NOTIFY_OK;
				1949
				1950	switch (action) {
				1951	case CPU_UP_PREPARE:
				1952	if (process_zones(cpu))
				1953	ret = NOTIFY_BAD;
				1954	break;
Andi Kleen	b0d4169	2005-11-05 17:25:53 +0100	[diff] [blame]	1955	case CPU_UP_CANCELED:
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1956	case CPU_DEAD:
				1957	free_zone_pagesets(cpu);
				1958	break;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1959	default:
				1960	break;
				1961	}
				1962	return ret;
				1963	}
				1964
				1965	static struct notifier_block pageset_notifier =
				1966	{ &pageset_cpuup_callback, NULL, 0 };
				1967
Al Viro	78d9955	2005-12-15 09:18:25 +0000	[diff] [blame]	1968	void __init setup_per_cpu_pageset(void)
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1969	{
				1970	int err;
				1971
				1972	/* Initialize per_cpu_pageset for cpu 0.
				1973	* A cpuup callback will do this for every cpu
				1974	* as it comes online
				1975	*/
				1976	err = process_zones(smp_processor_id());
				1977	BUG_ON(err);
				1978	register_cpu_notifier(&pageset_notifier);
				1979	}
				1980
				1981	#endif
				1982
Matt Tolentino	c09b424	2006-01-17 07:03:44 +0100	[diff] [blame]	1983	static __meminit
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1984	void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
				1985	{
				1986	int i;
				1987	struct pglist_data *pgdat = zone->zone_pgdat;
				1988
				1989	/*
				1990	* The per-page waitqueue mechanism uses hashed waitqueues
				1991	* per zone.
				1992	*/
				1993	zone->wait_table_size = wait_table_size(zone_size_pages);
				1994	zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
				1995	zone->wait_table = (wait_queue_head_t *)
				1996	alloc_bootmem_node(pgdat, zone->wait_table_size
				1997	* sizeof(wait_queue_head_t));
				1998
				1999	for(i = 0; i < zone->wait_table_size; ++i)
				2000	init_waitqueue_head(zone->wait_table + i);
				2001	}
				2002
Matt Tolentino	c09b424	2006-01-17 07:03:44 +0100	[diff] [blame]	2003	static __meminit void zone_pcp_init(struct zone *zone)
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2004	{
				2005	int cpu;
				2006	unsigned long batch = zone_batchsize(zone);
				2007
				2008	for (cpu = 0; cpu < NR_CPUS; cpu++) {
				2009	#ifdef CONFIG_NUMA
				2010	/* Early boot. Slab allocator not functional yet */
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	2011	zone_pcp(zone, cpu) = &boot_pageset[cpu];
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2012	setup_pageset(&boot_pageset[cpu],0);
				2013	#else
				2014	setup_pageset(zone_pcp(zone,cpu), batch);
				2015	#endif
				2016	}
				2017	printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
				2018	zone->name, zone->present_pages, batch);
				2019	}
				2020
Matt Tolentino	c09b424	2006-01-17 07:03:44 +0100	[diff] [blame]	2021	static __meminit void init_currently_empty_zone(struct zone *zone,
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2022	unsigned long zone_start_pfn, unsigned long size)
				2023	{
				2024	struct pglist_data *pgdat = zone->zone_pgdat;
				2025
				2026	zone_wait_table_init(zone, size);
				2027	pgdat->nr_zones = zone_idx(zone) + 1;
				2028
				2029	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
				2030	zone->zone_start_pfn = zone_start_pfn;
				2031
				2032	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
				2033
				2034	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
				2035	}
				2036
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2037	/*
				2038	* Set up the zone data structures:
				2039	* - mark all pages reserved
				2040	* - mark all memory queues empty
				2041	* - clear the memory bitmaps
				2042	*/
				2043	static void __init free_area_init_core(struct pglist_data *pgdat,
				2044	unsigned long zones_size, unsigned long zholes_size)
				2045	{
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2046	unsigned long j;
				2047	int nid = pgdat->node_id;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2048	unsigned long zone_start_pfn = pgdat->node_start_pfn;
				2049
Dave Hansen	208d54e	2005-10-29 18:16:52 -0700	[diff] [blame]	2050	pgdat_resize_init(pgdat);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2051	pgdat->nr_zones = 0;
				2052	init_waitqueue_head(&pgdat->kswapd_wait);
				2053	pgdat->kswapd_max_order = 0;
				2054
				2055	for (j = 0; j < MAX_NR_ZONES; j++) {
				2056	struct zone *zone = pgdat->node_zones + j;
				2057	unsigned long size, realsize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2058
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2059	realsize = size = zones_size[j];
				2060	if (zholes_size)
				2061	realsize -= zholes_size[j];
				2062
Andi Kleen	a2f1b42	2005-11-05 17:25:53 +0100	[diff] [blame]	2063	if (j < ZONE_HIGHMEM)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2064	nr_kernel_pages += realsize;
				2065	nr_all_pages += realsize;
				2066
				2067	zone->spanned_pages = size;
				2068	zone->present_pages = realsize;
				2069	zone->name = zone_names[j];
				2070	spin_lock_init(&zone->lock);
				2071	spin_lock_init(&zone->lru_lock);
Dave Hansen	bdc8cb9	2005-10-29 18:16:53 -0700	[diff] [blame]	2072	zone_seqlock_init(zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2073	zone->zone_pgdat = pgdat;
				2074	zone->free_pages = 0;
				2075
				2076	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
				2077
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2078	zone_pcp_init(zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2079	INIT_LIST_HEAD(&zone->active_list);
				2080	INIT_LIST_HEAD(&zone->inactive_list);
				2081	zone->nr_scan_active = 0;
				2082	zone->nr_scan_inactive = 0;
				2083	zone->nr_active = 0;
				2084	zone->nr_inactive = 0;
Martin Hicks	53e9a61	2005-09-03 15:54:51 -0700	[diff] [blame]	2085	atomic_set(&zone->reclaim_in_progress, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2086	if (!size)
				2087	continue;
				2088
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2089	zonetable_add(zone, nid, j, zone_start_pfn, size);
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	2090	init_currently_empty_zone(zone, zone_start_pfn, size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2091	zone_start_pfn += size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2092	}
				2093	}
				2094
				2095	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
				2096	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2097	/* Skip empty nodes */
				2098	if (!pgdat->node_spanned_pages)
				2099	return;
				2100
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2101	#ifdef CONFIG_FLAT_NODE_MEM_MAP
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2102	/* ia64 gets its own node_mem_map, before this, without bootmem */
				2103	if (!pgdat->node_mem_map) {
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2104	unsigned long size;
				2105	struct page *map;
				2106
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2107	size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
Dave Hansen	6f167ec	2005-06-23 00:07:39 -0700	[diff] [blame]	2108	map = alloc_remap(pgdat->node_id, size);
				2109	if (!map)
				2110	map = alloc_bootmem_node(pgdat, size);
				2111	pgdat->node_mem_map = map;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2112	}
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2113	#ifdef CONFIG_FLATMEM
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2114	/*
				2115	* With no DISCONTIG, the global mem_map is just set as node 0's
				2116	*/
				2117	if (pgdat == NODE_DATA(0))
				2118	mem_map = NODE_DATA(0)->node_mem_map;
				2119	#endif
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2120	#endif /* CONFIG_FLAT_NODE_MEM_MAP */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2121	}
				2122
				2123	void __init free_area_init_node(int nid, struct pglist_data *pgdat,
				2124	unsigned long *zones_size, unsigned long node_start_pfn,
				2125	unsigned long *zholes_size)
				2126	{
				2127	pgdat->node_id = nid;
				2128	pgdat->node_start_pfn = node_start_pfn;
				2129	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
				2130
				2131	alloc_node_mem_map(pgdat);
				2132
				2133	free_area_init_core(pgdat, zones_size, zholes_size);
				2134	}
				2135
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2136	#ifndef CONFIG_NEED_MULTIPLE_NODES
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2137	static bootmem_data_t contig_bootmem_data;
				2138	struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
				2139
				2140	EXPORT_SYMBOL(contig_page_data);
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2141	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2142
				2143	void __init free_area_init(unsigned long *zones_size)
				2144	{
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2145	free_area_init_node(0, NODE_DATA(0), zones_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2146	__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
				2147	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2148
				2149	#ifdef CONFIG_PROC_FS
				2150
				2151	#include <linux/seq_file.h>
				2152
				2153	static void frag_start(struct seq_file m, loff_t *pos)
				2154	{
				2155	pg_data_t *pgdat;
				2156	loff_t node = *pos;
				2157
				2158	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
				2159	--node;
				2160
				2161	return pgdat;
				2162	}
				2163
				2164	static void frag_next(struct seq_file m, void arg, loff_t pos)
				2165	{
				2166	pg_data_t pgdat = (pg_data_t )arg;
				2167
				2168	(*pos)++;
				2169	return pgdat->pgdat_next;
				2170	}
				2171
				2172	static void frag_stop(struct seq_file m, void arg)
				2173	{
				2174	}
				2175
				2176	/*
				2177	* This walks the free areas for each zone.
				2178	*/
				2179	static int frag_show(struct seq_file m, void arg)
				2180	{
				2181	pg_data_t pgdat = (pg_data_t )arg;
				2182	struct zone *zone;
				2183	struct zone *node_zones = pgdat->node_zones;
				2184	unsigned long flags;
				2185	int order;
				2186
				2187	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
Con Kolivas	f3fe651	2006-01-06 00:11:15 -0800	[diff] [blame]	2188	if (!populated_zone(zone))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2189	continue;
				2190
				2191	spin_lock_irqsave(&zone->lock, flags);
				2192	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
				2193	for (order = 0; order < MAX_ORDER; ++order)
				2194	seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
				2195	spin_unlock_irqrestore(&zone->lock, flags);
				2196	seq_putc(m, '\n');
				2197	}
				2198	return 0;
				2199	}
				2200
				2201	struct seq_operations fragmentation_op = {
				2202	.start = frag_start,
				2203	.next = frag_next,
				2204	.stop = frag_stop,
				2205	.show = frag_show,
				2206	};
				2207
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2208	/*
				2209	* Output information about zones in @pgdat.
				2210	*/
				2211	static int zoneinfo_show(struct seq_file m, void arg)
				2212	{
				2213	pg_data_t *pgdat = arg;
				2214	struct zone *zone;
				2215	struct zone *node_zones = pgdat->node_zones;
				2216	unsigned long flags;
				2217
				2218	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
				2219	int i;
				2220
Con Kolivas	f3fe651	2006-01-06 00:11:15 -0800	[diff] [blame]	2221	if (!populated_zone(zone))
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2222	continue;
				2223
				2224	spin_lock_irqsave(&zone->lock, flags);
				2225	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
				2226	seq_printf(m,
				2227	"\n pages free %lu"
				2228	"\n min %lu"
				2229	"\n low %lu"
				2230	"\n high %lu"
				2231	"\n active %lu"
				2232	"\n inactive %lu"
				2233	"\n scanned %lu (a: %lu i: %lu)"
				2234	"\n spanned %lu"
				2235	"\n present %lu",
				2236	zone->free_pages,
				2237	zone->pages_min,
				2238	zone->pages_low,
				2239	zone->pages_high,
				2240	zone->nr_active,
				2241	zone->nr_inactive,
				2242	zone->pages_scanned,
				2243	zone->nr_scan_active, zone->nr_scan_inactive,
				2244	zone->spanned_pages,
				2245	zone->present_pages);
				2246	seq_printf(m,
				2247	"\n protection: (%lu",
				2248	zone->lowmem_reserve[0]);
				2249	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
				2250	seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
				2251	seq_printf(m,
				2252	")"
				2253	"\n pagesets");
Nick Piggin	23316bc	2006-01-08 01:00:41 -0800	[diff] [blame]	2254	for_each_online_cpu(i) {
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2255	struct per_cpu_pageset *pageset;
				2256	int j;
				2257
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	2258	pageset = zone_pcp(zone, i);
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2259	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2260	if (pageset->pcp[j].count)
				2261	break;
				2262	}
				2263	if (j == ARRAY_SIZE(pageset->pcp))
				2264	continue;
				2265	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2266	seq_printf(m,
				2267	"\n cpu: %i pcp: %i"
				2268	"\n count: %i"
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2269	"\n high: %i"
				2270	"\n batch: %i",
				2271	i, j,
				2272	pageset->pcp[j].count,
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2273	pageset->pcp[j].high,
				2274	pageset->pcp[j].batch);
				2275	}
				2276	#ifdef CONFIG_NUMA
				2277	seq_printf(m,
				2278	"\n numa_hit: %lu"
				2279	"\n numa_miss: %lu"
				2280	"\n numa_foreign: %lu"
				2281	"\n interleave_hit: %lu"
				2282	"\n local_node: %lu"
				2283	"\n other_node: %lu",
				2284	pageset->numa_hit,
				2285	pageset->numa_miss,
				2286	pageset->numa_foreign,
				2287	pageset->interleave_hit,
				2288	pageset->local_node,
				2289	pageset->other_node);
				2290	#endif
				2291	}
				2292	seq_printf(m,
				2293	"\n all_unreclaimable: %u"
				2294	"\n prev_priority: %i"
				2295	"\n temp_priority: %i"
				2296	"\n start_pfn: %lu",
				2297	zone->all_unreclaimable,
				2298	zone->prev_priority,
				2299	zone->temp_priority,
				2300	zone->zone_start_pfn);
				2301	spin_unlock_irqrestore(&zone->lock, flags);
				2302	seq_putc(m, '\n');
				2303	}
				2304	return 0;
				2305	}
				2306
				2307	struct seq_operations zoneinfo_op = {
				2308	.start = frag_start, /* iterate over all zones. The same as in
				2309	* fragmentation. */
				2310	.next = frag_next,
				2311	.stop = frag_stop,
				2312	.show = zoneinfo_show,
				2313	};
				2314
/*
 * Names printed by vmstat_show(). Entry N labels the N-th unsigned long
 * of the struct page_state snapshot, so the order here must follow the
 * field order of that structure. The table is read-only.
 */
static const char * const vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",

	"pgalloc_high",
	"pgalloc_normal",
	"pgalloc_dma32",
	"pgalloc_dma",

	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",

	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma32",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma32",
	"pgsteal_dma",

	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",
	"pgscan_kswapd_dma32",
	"pgscan_kswapd_dma",

	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma32",
	"pgscan_direct_dma",

	"pginodesteal",
	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};
				2370
				2371	static void vmstat_start(struct seq_file m, loff_t *pos)
				2372	{
				2373	struct page_state *ps;
				2374
				2375	if (*pos >= ARRAY_SIZE(vmstat_text))
				2376	return NULL;
				2377
				2378	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
				2379	m->private = ps;
				2380	if (!ps)
				2381	return ERR_PTR(-ENOMEM);
				2382	get_full_page_state(ps);
				2383	ps->pgpgin /= 2; /* sectors -> kbytes */
				2384	ps->pgpgout /= 2;
				2385	return (unsigned long )ps + pos;
				2386	}
				2387
				2388	static void vmstat_next(struct seq_file m, void arg, loff_t pos)
				2389	{
				2390	(*pos)++;
				2391	if (*pos >= ARRAY_SIZE(vmstat_text))
				2392	return NULL;
				2393	return (unsigned long )m->private + pos;
				2394	}
				2395
				2396	static int vmstat_show(struct seq_file m, void arg)
				2397	{
				2398	unsigned long *l = arg;
				2399	unsigned long off = l - (unsigned long *)m->private;
				2400
				2401	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
				2402	return 0;
				2403	}
				2404
				2405	static void vmstat_stop(struct seq_file m, void arg)
				2406	{
				2407	kfree(m->private);
				2408	m->private = NULL;
				2409	}
				2410
				2411	struct seq_operations vmstat_op = {
				2412	.start = vmstat_start,
				2413	.next = vmstat_next,
				2414	.stop = vmstat_stop,
				2415	.show = vmstat_show,
				2416	};
				2417
				2418	#endif /* CONFIG_PROC_FS */
				2419
				2420	#ifdef CONFIG_HOTPLUG_CPU
				2421	static int page_alloc_cpu_notify(struct notifier_block *self,
				2422	unsigned long action, void *hcpu)
				2423	{
				2424	int cpu = (unsigned long)hcpu;
				2425	long *count;
				2426	unsigned long src, dest;
				2427
				2428	if (action == CPU_DEAD) {
				2429	int i;
				2430
				2431	/* Drain local pagecache count. */
				2432	count = &per_cpu(nr_pagecache_local, cpu);
				2433	atomic_add(*count, &nr_pagecache);
				2434	*count = 0;
				2435	local_irq_disable();
				2436	__drain_pages(cpu);
				2437
				2438	/* Add dead cpu's page_states to our own. */
				2439	dest = (unsigned long *)&__get_cpu_var(page_states);
				2440	src = (unsigned long *)&per_cpu(page_states, cpu);
				2441
				2442	for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				2443	i++) {
				2444	dest[i] += src[i];
				2445	src[i] = 0;
				2446	}
				2447
				2448	local_irq_enable();
				2449	}
				2450	return NOTIFY_OK;
				2451	}
				2452	#endif /* CONFIG_HOTPLUG_CPU */
				2453
				2454	void __init page_alloc_init(void)
				2455	{
				2456	hotcpu_notifier(page_alloc_cpu_notify, 0);
				2457	}
				2458
				2459	/*
				2460	* setup_per_zone_lowmem_reserve - called whenever
				2461	* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
				2462	* has a correct pages reserved value, so an adequate number of
				2463	* pages are left in the zone after a successful __alloc_pages().
				2464	*/
				2465	static void setup_per_zone_lowmem_reserve(void)
				2466	{
				2467	struct pglist_data *pgdat;
				2468	int j, idx;
				2469
				2470	for_each_pgdat(pgdat) {
				2471	for (j = 0; j < MAX_NR_ZONES; j++) {
				2472	struct zone *zone = pgdat->node_zones + j;
				2473	unsigned long present_pages = zone->present_pages;
				2474
				2475	zone->lowmem_reserve[j] = 0;
				2476
				2477	for (idx = j-1; idx >= 0; idx--) {
				2478	struct zone *lower_zone;
				2479
				2480	if (sysctl_lowmem_reserve_ratio[idx] < 1)
				2481	sysctl_lowmem_reserve_ratio[idx] = 1;
				2482
				2483	lower_zone = pgdat->node_zones + idx;
				2484	lower_zone->lowmem_reserve[j] = present_pages /
				2485	sysctl_lowmem_reserve_ratio[idx];
				2486	present_pages += lower_zone->present_pages;
				2487	}
				2488	}
				2489	}
				2490	}
				2491
				2492	/*
				2493	* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
				2494	* that the pages_{min,low,high} values for each zone are set correctly
				2495	* with respect to min_free_kbytes.
				2496	*/
Dave Hansen	3947be1	2005-10-29 18:16:54 -0700	[diff] [blame]	2497	void setup_per_zone_pages_min(void)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2498	{
				2499	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
				2500	unsigned long lowmem_pages = 0;
				2501	struct zone *zone;
				2502	unsigned long flags;
				2503
				2504	/* Calculate total number of !ZONE_HIGHMEM pages */
				2505	for_each_zone(zone) {
				2506	if (!is_highmem(zone))
				2507	lowmem_pages += zone->present_pages;
				2508	}
				2509
				2510	for_each_zone(zone) {
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2511	unsigned long tmp;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2512	spin_lock_irqsave(&zone->lru_lock, flags);
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2513	tmp = (pages_min * zone->present_pages) / lowmem_pages;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2514	if (is_highmem(zone)) {
				2515	/*
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2516	* __GFP_HIGH and PF_MEMALLOC allocations usually don't
				2517	* need highmem pages, so cap pages_min to a small
				2518	* value here.
				2519	*
				2520	* The (pages_high-pages_low) and (pages_low-pages_min)
				2521	* deltas controls asynch page reclaim, and so should
				2522	* not be capped for highmem.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2523	*/
				2524	int min_pages;
				2525
				2526	min_pages = zone->present_pages / 1024;
				2527	if (min_pages < SWAP_CLUSTER_MAX)
				2528	min_pages = SWAP_CLUSTER_MAX;
				2529	if (min_pages > 128)
				2530	min_pages = 128;
				2531	zone->pages_min = min_pages;
				2532	} else {
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2533	/*
				2534	* If it's a lowmem zone, reserve a number of pages
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2535	* proportionate to the zone's size.
				2536	*/
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2537	zone->pages_min = tmp;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2538	}
				2539
Nick Piggin	669ed17	2005-11-13 16:06:45 -0800	[diff] [blame]	2540	zone->pages_low = zone->pages_min + tmp / 4;
				2541	zone->pages_high = zone->pages_min + tmp / 2;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2542	spin_unlock_irqrestore(&zone->lru_lock, flags);
				2543	}
				2544	}
				2545
				2546	/*
				2547	* Initialise min_free_kbytes.
				2548	*
				2549	* For small machines we want it small (128k min). For large machines
				2550	* we want it large (64MB max). But it is not linear, because network
				2551	* bandwidth does not increase linearly with machine size. We use
				2552	*
				2553	* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
				2554	* min_free_kbytes = sqrt(lowmem_kbytes * 16)
				2555	*
				2556	* which yields
				2557	*
				2558	* 16MB: 512k
				2559	* 32MB: 724k
				2560	* 64MB: 1024k
				2561	* 128MB: 1448k
				2562	* 256MB: 2048k
				2563	* 512MB: 2896k
				2564	* 1024MB: 4096k
				2565	* 2048MB: 5792k
				2566	* 4096MB: 8192k
				2567	* 8192MB: 11584k
				2568	* 16384MB: 16384k
				2569	*/
				2570	static int __init init_per_zone_pages_min(void)
				2571	{
				2572	unsigned long lowmem_kbytes;
				2573
				2574	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
				2575
				2576	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
				2577	if (min_free_kbytes < 128)
				2578	min_free_kbytes = 128;
				2579	if (min_free_kbytes > 65536)
				2580	min_free_kbytes = 65536;
				2581	setup_per_zone_pages_min();
				2582	setup_per_zone_lowmem_reserve();
				2583	return 0;
				2584	}
				2585	module_init(init_per_zone_pages_min)
				2586
				2587	/*
				2588	* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
				2589	* that we can call two helper functions whenever min_free_kbytes
				2590	* changes.
				2591	*/
				2592	int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
				2593	struct file file, void __user buffer, size_t length, loff_t ppos)
				2594	{
				2595	proc_dointvec(table, write, file, buffer, length, ppos);
				2596	setup_per_zone_pages_min();
				2597	return 0;
				2598	}
				2599
				2600	/*
				2601	* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
				2602	* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
				2603	* whenever sysctl_lowmem_reserve_ratio changes.
				2604	*
				2605	* The reserve ratio obviously has absolutely no relation with the
				2606	* pages_min watermarks. The lowmem reserve ratio can only make sense
				2607	* if in function of the boot time zone sizes.
				2608	*/
				2609	int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
				2610	struct file file, void __user buffer, size_t length, loff_t ppos)
				2611	{
				2612	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
				2613	setup_per_zone_lowmem_reserve();
				2614	return 0;
				2615	}
				2616
Rohit Seth	8ad4b1f	2006-01-08 01:00:40 -0800	[diff] [blame]	2617	/*
				2618	* percpu_pagelist_fraction - changes the pcp->high for each zone on each
				2619	* cpu. It is the fraction of total pages in each zone that a hot per cpu pagelist
				2620	* can have before it gets flushed back to buddy allocator.
				2621	*/
				2622
				2623	int percpu_pagelist_fraction_sysctl_handler(ctl_table *table, int write,
				2624	struct file file, void __user buffer, size_t length, loff_t ppos)
				2625	{
				2626	struct zone *zone;
				2627	unsigned int cpu;
				2628	int ret;
				2629
				2630	ret = proc_dointvec_minmax(table, write, file, buffer, length, ppos);
				2631	if (!write \|\| (ret == -EINVAL))
				2632	return ret;
				2633	for_each_zone(zone) {
				2634	for_each_online_cpu(cpu) {
				2635	unsigned long high;
				2636	high = zone->present_pages / percpu_pagelist_fraction;
				2637	setup_pagelist_highmark(zone_pcp(zone, cpu), high);
				2638	}
				2639	}
				2640	return 0;
				2641	}
				2642
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2643	__initdata int hashdist = HASHDIST_DEFAULT;
				2644
				2645	#ifdef CONFIG_NUMA
				2646	static int __init set_hashdist(char *str)
				2647	{
				2648	if (!str)
				2649	return 0;
				2650	hashdist = simple_strtoul(str, &str, 0);
				2651	return 1;
				2652	}
				2653	__setup("hashdist=", set_hashdist);
				2654	#endif
				2655
				2656	/*
				2657	* allocate a large system hash table from bootmem
				2658	* - it is assumed that the hash table must contain an exact power-of-2
				2659	* quantity of entries
				2660	* - limit is the number of hash buckets, not the total allocation size
				2661	*/
				2662	void __init alloc_large_system_hash(const char tablename,
				2663	unsigned long bucketsize,
				2664	unsigned long numentries,
				2665	int scale,
				2666	int flags,
				2667	unsigned int *_hash_shift,
				2668	unsigned int *_hash_mask,
				2669	unsigned long limit)
				2670	{
				2671	unsigned long long max = limit;
				2672	unsigned long log2qty, size;
				2673	void *table = NULL;
				2674
				2675	/* allow the kernel cmdline to have a say */
				2676	if (!numentries) {
				2677	/* round applicable memory size up to nearest megabyte */
				2678	numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
				2679	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
				2680	numentries >>= 20 - PAGE_SHIFT;
				2681	numentries <<= 20 - PAGE_SHIFT;
				2682
				2683	/* limit to 1 bucket per 2^scale bytes of low memory */
				2684	if (scale > PAGE_SHIFT)
				2685	numentries >>= (scale - PAGE_SHIFT);
				2686	else
				2687	numentries <<= (PAGE_SHIFT - scale);
				2688	}
				2689	/* rounded up to nearest power of 2 in size */
				2690	numentries = 1UL << (long_log2(numentries) + 1);
				2691
				2692	/* limit allocation size to 1/16 total memory by default */
				2693	if (max == 0) {
				2694	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
				2695	do_div(max, bucketsize);
				2696	}
				2697
				2698	if (numentries > max)
				2699	numentries = max;
				2700
				2701	log2qty = long_log2(numentries);
				2702
				2703	do {
				2704	size = bucketsize << log2qty;
				2705	if (flags & HASH_EARLY)
				2706	table = alloc_bootmem(size);
				2707	else if (hashdist)
				2708	table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
				2709	else {
				2710	unsigned long order;
				2711	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				2712	;
				2713	table = (void*) __get_free_pages(GFP_ATOMIC, order);
				2714	}
				2715	} while (!table && size > PAGE_SIZE && --log2qty);
				2716
				2717	if (!table)
				2718	panic("Failed to allocate %s hash table\n", tablename);
				2719
				2720	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
				2721	tablename,
				2722	(1U << log2qty),
				2723	long_log2(size) - PAGE_SHIFT,
				2724	size);
				2725
				2726	if (_hash_shift)
				2727	*_hash_shift = log2qty;
				2728	if (_hash_mask)
				2729	*_hash_mask = (1 << log2qty) - 1;
				2730
				2731	return table;
				2732	}