Blame - mm/page_alloc.c - kernel/msm-4.9

blob: a51ef94eec33970e41f64d4b327d7e4686f70f15 [file] [log] [blame]

Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1	/*
				2	* linux/mm/page_alloc.c
				3	*
				4	* Manages the free list, the system allocates free pages here.
				5	* Note that kmalloc() lives in slab.c
				6	*
				7	* Copyright (C) 1991, 1992, 1993, 1994 Linus Torvalds
				8	* Swap reorganised 29.12.95, Stephen Tweedie
				9	* Support of BIGMEM added by Gerhard Wichert, Siemens AG, July 1999
				10	* Reshaped it to be a zoned allocator, Ingo Molnar, Red Hat, 1999
				11	* Discontiguous memory support, Kanoj Sarcar, SGI, Nov 1999
				12	* Zone balancing, Kanoj Sarcar, SGI, Jan 2000
				13	* Per cpu hot/cold page lists, bulk allocation, Martin J. Bligh, Sept 2002
				14	* (lots of bits borrowed from Ingo Molnar & Andrew Morton)
				15	*/
				16
				17	#include <linux/config.h>
				18	#include <linux/stddef.h>
				19	#include <linux/mm.h>
				20	#include <linux/swap.h>
				21	#include <linux/interrupt.h>
				22	#include <linux/pagemap.h>
				23	#include <linux/bootmem.h>
				24	#include <linux/compiler.h>
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	25	#include <linux/kernel.h>
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	26	#include <linux/module.h>
				27	#include <linux/suspend.h>
				28	#include <linux/pagevec.h>
				29	#include <linux/blkdev.h>
				30	#include <linux/slab.h>
				31	#include <linux/notifier.h>
				32	#include <linux/topology.h>
				33	#include <linux/sysctl.h>
				34	#include <linux/cpu.h>
				35	#include <linux/cpuset.h>
				36	#include <linux/nodemask.h>
				37	#include <linux/vmalloc.h>
				38
				39	#include <asm/tlbflush.h>
				40	#include "internal.h"
				41
				42	/*
				43	* MCD - HACK: Find somewhere to initialize this EARLY, or make this
				44	* initializer cleaner
				45	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	46	nodemask_t node_online_map __read_mostly = { { [0] = 1UL } };
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	47	EXPORT_SYMBOL(node_online_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	48	nodemask_t node_possible_map __read_mostly = NODE_MASK_ALL;
Dean Nelson	7223a93	2005-03-23 19:00:00 -0700	[diff] [blame]	49	EXPORT_SYMBOL(node_possible_map);
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	50	struct pglist_data *pgdat_list __read_mostly;
Ravikiran G Thirumalai	6c231b7	2005-09-06 15:17:45 -0700	[diff] [blame]	51	unsigned long totalram_pages __read_mostly;
				52	unsigned long totalhigh_pages __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	53	long nr_swap_pages;
				54
				55	/*
				56	* results with 256, 32 in the lowmem_reserve sysctl:
				57	* 1G machine -> (16M dma, 800M-16M normal, 1G-800M high)
				58	* 1G machine -> (16M dma, 784M normal, 224M high)
				59	* NORMAL allocation will leave 784M/256 of ram reserved in the ZONE_DMA
				60	* HIGHMEM allocation will leave 224M/32 of ram reserved in ZONE_NORMAL
				61	* HIGHMEM allocation will leave (224M+784M)/256 of ram reserved in ZONE_DMA
				62	*/
				63	int sysctl_lowmem_reserve_ratio[MAX_NR_ZONES-1] = { 256, 32 };
				64
				65	EXPORT_SYMBOL(totalram_pages);
				66	EXPORT_SYMBOL(nr_swap_pages);
				67
				68	/*
				69	* Used by page_zone() to look up the address of the struct zone whose
				70	* id is encoded in the upper bits of page->flags
				71	*/
Christoph Lameter	c3d8c14	2005-09-06 15:16:33 -0700	[diff] [blame]	72	struct zone *zone_table[1 << ZONETABLE_SHIFT] __read_mostly;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	73	EXPORT_SYMBOL(zone_table);
				74
				75	static char *zone_names[MAX_NR_ZONES] = { "DMA", "Normal", "HighMem" };
				76	int min_free_kbytes = 1024;
				77
				78	unsigned long __initdata nr_kernel_pages;
				79	unsigned long __initdata nr_all_pages;
				80
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame^]	81	static int page_outside_zone_boundaries(struct zone zone, struct page page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	82	{
				83	if (page_to_pfn(page) >= zone->zone_start_pfn + zone->spanned_pages)
				84	return 1;
				85	if (page_to_pfn(page) < zone->zone_start_pfn)
				86	return 1;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame^]	87
				88	return 0;
				89	}
				90
				91	static int page_is_consistent(struct zone zone, struct page page)
				92	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	93	#ifdef CONFIG_HOLES_IN_ZONE
				94	if (!pfn_valid(page_to_pfn(page)))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame^]	95	return 0;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	96	#endif
				97	if (zone != page_zone(page))
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame^]	98	return 0;
				99
				100	return 1;
				101	}
				102	/*
				103	* Temporary debugging check for pages not lying within a given zone.
				104	*/
				105	static int bad_range(struct zone zone, struct page page)
				106	{
				107	if (page_outside_zone_boundaries(zone, page))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	108	return 1;
Dave Hansen	c6a57e1	2005-10-29 18:16:52 -0700	[diff] [blame^]	109	if (!page_is_consistent(zone, page))
				110	return 1;
				111
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	112	return 0;
				113	}
				114
				115	static void bad_page(const char function, struct page page)
				116	{
				117	printk(KERN_EMERG "Bad page state at %s (in process '%s', page %p)\n",
				118	function, current->comm, page);
				119	printk(KERN_EMERG "flags:0x%0*lx mapping:%p mapcount:%d count:%d\n",
				120	(int)(2*sizeof(page_flags_t)), (unsigned long)page->flags,
				121	page->mapping, page_mapcount(page), page_count(page));
				122	printk(KERN_EMERG "Backtrace:\n");
				123	dump_stack();
				124	printk(KERN_EMERG "Trying to fix it up, but a reboot is needed\n");
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	125	page->flags &= ~(1 << PG_lru \|
				126	1 << PG_private \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	127	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	128	1 << PG_active \|
				129	1 << PG_dirty \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	130	1 << PG_reclaim \|
				131	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	132	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	133	1 << PG_writeback \|
				134	1 << PG_reserved );
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	135	set_page_count(page, 0);
				136	reset_page_mapcount(page);
				137	page->mapping = NULL;
Randy Dunlap	9f15833	2005-09-13 01:25:16 -0700	[diff] [blame]	138	add_taint(TAINT_BAD_PAGE);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	139	}
				140
				141	#ifndef CONFIG_HUGETLB_PAGE
				142	#define prep_compound_page(page, order) do { } while (0)
				143	#define destroy_compound_page(page, order) do { } while (0)
				144	#else
				145	/*
				146	* Higher-order pages are called "compound pages". They are structured thusly:
				147	*
				148	* The first PAGE_SIZE page is called the "head page".
				149	*
				150	* The remaining PAGE_SIZE pages are called "tail pages".
				151	*
				152	* All pages have PG_compound set. All pages have their ->private pointing at
				153	* the head page (even the head page has this).
				154	*
				155	* The first tail page's ->mapping, if non-zero, holds the address of the
				156	* compound page's put_page() function.
				157	*
				158	* The order of the allocation is stored in the first tail page's ->index
				159	* This is only for debug at present. This usage means that zero-order pages
				160	* may not be compound.
				161	*/
				162	static void prep_compound_page(struct page *page, unsigned long order)
				163	{
				164	int i;
				165	int nr_pages = 1 << order;
				166
				167	page[1].mapping = NULL;
				168	page[1].index = order;
				169	for (i = 0; i < nr_pages; i++) {
				170	struct page *p = page + i;
				171
				172	SetPageCompound(p);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	173	set_page_private(p, (unsigned long)page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	174	}
				175	}
				176
				177	static void destroy_compound_page(struct page *page, unsigned long order)
				178	{
				179	int i;
				180	int nr_pages = 1 << order;
				181
				182	if (!PageCompound(page))
				183	return;
				184
				185	if (page[1].index != order)
				186	bad_page(__FUNCTION__, page);
				187
				188	for (i = 0; i < nr_pages; i++) {
				189	struct page *p = page + i;
				190
				191	if (!PageCompound(p))
				192	bad_page(__FUNCTION__, page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	193	if (page_private(p) != (unsigned long)page)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	194	bad_page(__FUNCTION__, page);
				195	ClearPageCompound(p);
				196	}
				197	}
				198	#endif /* CONFIG_HUGETLB_PAGE */
				199
				200	/*
				201	* function for dealing with page's order in buddy system.
				202	* zone->lock is already acquired when we use these.
				203	* So, we don't need atomic page->flags operations here.
				204	*/
				205	static inline unsigned long page_order(struct page *page) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	206	return page_private(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	207	}
				208
				209	static inline void set_page_order(struct page *page, int order) {
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	210	set_page_private(page, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	211	__SetPagePrivate(page);
				212	}
				213
				214	static inline void rmv_page_order(struct page *page)
				215	{
				216	__ClearPagePrivate(page);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	217	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	218	}
				219
				220	/*
				221	* Locate the struct page for both the matching buddy in our
				222	* pair (buddy1) and the combined O(n+1) page they form (page).
				223	*
				224	* 1) Any buddy B1 will have an order O twin B2 which satisfies
				225	* the following equation:
				226	* B2 = B1 ^ (1 << O)
				227	* For example, if the starting buddy (buddy2) is #8 its order
				228	* 1 buddy is #10:
				229	* B2 = 8 ^ (1 << 1) = 8 ^ 2 = 10
				230	*
				231	* 2) Any buddy B will have an order O+1 parent P which
				232	* satisfies the following equation:
				233	* P = B & ~(1 << O)
				234	*
				235	* Assumption: *_mem_map is contiguous at least up to MAX_ORDER
				236	*/
				237	static inline struct page *
				238	__page_find_buddy(struct page *page, unsigned long page_idx, unsigned int order)
				239	{
				240	unsigned long buddy_idx = page_idx ^ (1 << order);
				241
				242	return page + (buddy_idx - page_idx);
				243	}
				244
				245	static inline unsigned long
				246	__find_combined_index(unsigned long page_idx, unsigned int order)
				247	{
				248	return (page_idx & ~(1 << order));
				249	}
				250
				251	/*
				252	* This function checks whether a page is free && is the buddy.
				253	* We can coalesce a page and its buddy if
				254	* (a) the buddy is free &&
				255	* (b) the buddy is on the buddy system &&
				256	* (c) a page and its buddy have the same order.
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	257	* for recording page's order, we use page_private(page) and PG_private.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	258	*
				259	*/
				260	static inline int page_is_buddy(struct page *page, int order)
				261	{
				262	if (PagePrivate(page) &&
				263	(page_order(page) == order) &&
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	264	page_count(page) == 0)
				265	return 1;
				266	return 0;
				267	}
				268
				269	/*
				270	* Freeing function for a buddy system allocator.
				271	*
				272	* The concept of a buddy system is to maintain direct-mapped table
				273	* (containing bit values) for memory blocks of various "orders".
				274	* The bottom level table contains the map for the smallest allocatable
				275	* units of memory (here, pages), and each level above it describes
				276	* pairs of units from the levels below, hence, "buddies".
				277	* At a high level, all that happens here is marking the table entry
				278	* at the bottom level available, and propagating the changes upward
				279	* as necessary, plus some accounting needed to play nicely with other
				280	* parts of the VM system.
				281	* At each level, we keep a list of pages, which are heads of continuous
				282	* free pages of length of (1 << order) and marked with PG_private. Page's
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	283	* order is recorded in page_private(page) field.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	284	* So when we are allocating or freeing one, we can derive the state of the
				285	* other. That is, if we allocate a small block, and both were
				286	* free, the remainder of the region must be split into blocks.
				287	* If a block is freed, and its buddy is also free, then this
				288	* triggers coalescing into a block of larger size.
				289	*
				290	* -- wli
				291	*/
				292
				293	static inline void __free_pages_bulk (struct page *page,
				294	struct zone *zone, unsigned int order)
				295	{
				296	unsigned long page_idx;
				297	int order_size = 1 << order;
				298
				299	if (unlikely(order))
				300	destroy_compound_page(page, order);
				301
				302	page_idx = page_to_pfn(page) & ((1 << MAX_ORDER) - 1);
				303
				304	BUG_ON(page_idx & (order_size - 1));
				305	BUG_ON(bad_range(zone, page));
				306
				307	zone->free_pages += order_size;
				308	while (order < MAX_ORDER-1) {
				309	unsigned long combined_idx;
				310	struct free_area *area;
				311	struct page *buddy;
				312
				313	combined_idx = __find_combined_index(page_idx, order);
				314	buddy = __page_find_buddy(page, page_idx, order);
				315
				316	if (bad_range(zone, buddy))
				317	break;
				318	if (!page_is_buddy(buddy, order))
				319	break; /* Move the buddy up one level. */
				320	list_del(&buddy->lru);
				321	area = zone->free_area + order;
				322	area->nr_free--;
				323	rmv_page_order(buddy);
				324	page = page + (combined_idx - page_idx);
				325	page_idx = combined_idx;
				326	order++;
				327	}
				328	set_page_order(page, order);
				329	list_add(&page->lru, &zone->free_area[order].free_list);
				330	zone->free_area[order].nr_free++;
				331	}
				332
				333	static inline void free_pages_check(const char function, struct page page)
				334	{
				335	if ( page_mapcount(page) \|\|
				336	page->mapping != NULL \|\|
				337	page_count(page) != 0 \|\|
				338	(page->flags & (
				339	1 << PG_lru \|
				340	1 << PG_private \|
				341	1 << PG_locked \|
				342	1 << PG_active \|
				343	1 << PG_reclaim \|
				344	1 << PG_slab \|
				345	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	346	1 << PG_writeback \|
				347	1 << PG_reserved )))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	348	bad_page(function, page);
				349	if (PageDirty(page))
Nick Piggin	242e546	2005-09-03 15:54:50 -0700	[diff] [blame]	350	__ClearPageDirty(page);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	351	}
				352
				353	/*
				354	* Frees a list of pages.
				355	* Assumes all pages on list are in same zone, and of same order.
Renaud Lienhart	207f36e	2005-09-10 00:26:59 -0700	[diff] [blame]	356	* count is the number of pages to free.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	357	*
				358	* If the zone was previously in an "all pages pinned" state then look to
				359	* see if this freeing clears that state.
				360	*
				361	* And clear the zone's pages_scanned counter, to hold off the "all pages are
				362	* pinned" detection logic.
				363	*/
				364	static int
				365	free_pages_bulk(struct zone *zone, int count,
				366	struct list_head *list, unsigned int order)
				367	{
				368	unsigned long flags;
				369	struct page *page = NULL;
				370	int ret = 0;
				371
				372	spin_lock_irqsave(&zone->lock, flags);
				373	zone->all_unreclaimable = 0;
				374	zone->pages_scanned = 0;
				375	while (!list_empty(list) && count--) {
				376	page = list_entry(list->prev, struct page, lru);
				377	/* have to delete it as __free_pages_bulk list manipulates */
				378	list_del(&page->lru);
				379	__free_pages_bulk(page, zone, order);
				380	ret++;
				381	}
				382	spin_unlock_irqrestore(&zone->lock, flags);
				383	return ret;
				384	}
				385
				386	void __free_pages_ok(struct page *page, unsigned int order)
				387	{
				388	LIST_HEAD(list);
				389	int i;
				390
				391	arch_free_page(page, order);
				392
				393	mod_page_state(pgfree, 1 << order);
				394
				395	#ifndef CONFIG_MMU
				396	if (order > 0)
				397	for (i = 1 ; i < (1 << order) ; ++i)
				398	__put_page(page + i);
				399	#endif
				400
				401	for (i = 0 ; i < (1 << order) ; ++i)
				402	free_pages_check(__FUNCTION__, page + i);
				403	list_add(&page->lru, &list);
				404	kernel_map_pages(page, 1<<order, 0);
				405	free_pages_bulk(page_zone(page), 1, &list, order);
				406	}
				407
				408
				409	/*
				410	* The order of subdivision here is critical for the IO subsystem.
				411	* Please do not alter this order without good reasons and regression
				412	* testing. Specifically, as large blocks of memory are subdivided,
				413	* the order in which smaller blocks are delivered depends on the order
				414	* they're subdivided in this function. This is the primary factor
				415	* influencing the order in which pages are delivered to the IO
				416	* subsystem according to empirical testing, and this is also justified
				417	* by considering the behavior of a buddy system containing a single
				418	* large block of memory acted on by a series of small allocations.
				419	* This behavior is a critical factor in sglist merging's success.
				420	*
				421	* -- wli
				422	*/
				423	static inline struct page *
				424	expand(struct zone zone, struct page page,
				425	int low, int high, struct free_area *area)
				426	{
				427	unsigned long size = 1 << high;
				428
				429	while (high > low) {
				430	area--;
				431	high--;
				432	size >>= 1;
				433	BUG_ON(bad_range(zone, &page[size]));
				434	list_add(&page[size].lru, &area->free_list);
				435	area->nr_free++;
				436	set_page_order(&page[size], high);
				437	}
				438	return page;
				439	}
				440
				441	void set_page_refs(struct page *page, int order)
				442	{
				443	#ifdef CONFIG_MMU
				444	set_page_count(page, 1);
				445	#else
				446	int i;
				447
				448	/*
				449	* We need to reference all the pages for this order, otherwise if
				450	* anyone accesses one of the pages with (get/put) it will be freed.
				451	* - eg: access_process_vm()
				452	*/
				453	for (i = 0; i < (1 << order); i++)
				454	set_page_count(page + i, 1);
				455	#endif /* CONFIG_MMU */
				456	}
				457
				458	/*
				459	* This page is about to be returned from the page allocator
				460	*/
				461	static void prep_new_page(struct page *page, int order)
				462	{
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	463	if ( page_mapcount(page) \|\|
				464	page->mapping != NULL \|\|
				465	page_count(page) != 0 \|\|
				466	(page->flags & (
				467	1 << PG_lru \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	468	1 << PG_private \|
				469	1 << PG_locked \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	470	1 << PG_active \|
				471	1 << PG_dirty \|
				472	1 << PG_reclaim \|
Hugh Dickins	334795e	2005-06-21 17:15:08 -0700	[diff] [blame]	473	1 << PG_slab \|
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	474	1 << PG_swapcache \|
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	475	1 << PG_writeback \|
				476	1 << PG_reserved )))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	477	bad_page(__FUNCTION__, page);
				478
				479	page->flags &= ~(1 << PG_uptodate \| 1 << PG_error \|
				480	1 << PG_referenced \| 1 << PG_arch_1 \|
				481	1 << PG_checked \| 1 << PG_mappedtodisk);
Hugh Dickins	4c21e2f	2005-10-29 18:16:40 -0700	[diff] [blame]	482	set_page_private(page, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	483	set_page_refs(page, order);
				484	kernel_map_pages(page, 1 << order, 1);
				485	}
				486
				487	/*
				488	* Do the hard work of removing an element from the buddy allocator.
				489	* Call me with the zone->lock already held.
				490	*/
				491	static struct page __rmqueue(struct zone zone, unsigned int order)
				492	{
				493	struct free_area * area;
				494	unsigned int current_order;
				495	struct page *page;
				496
				497	for (current_order = order; current_order < MAX_ORDER; ++current_order) {
				498	area = zone->free_area + current_order;
				499	if (list_empty(&area->free_list))
				500	continue;
				501
				502	page = list_entry(area->free_list.next, struct page, lru);
				503	list_del(&page->lru);
				504	rmv_page_order(page);
				505	area->nr_free--;
				506	zone->free_pages -= 1UL << order;
				507	return expand(zone, page, order, current_order, area);
				508	}
				509
				510	return NULL;
				511	}
				512
				513	/*
				514	* Obtain a specified number of elements from the buddy allocator, all under
				515	* a single hold of the lock, for efficiency. Add them to the supplied list.
				516	* Returns the number of new pages which were placed at *list.
				517	*/
				518	static int rmqueue_bulk(struct zone *zone, unsigned int order,
				519	unsigned long count, struct list_head *list)
				520	{
				521	unsigned long flags;
				522	int i;
				523	int allocated = 0;
				524	struct page *page;
				525
				526	spin_lock_irqsave(&zone->lock, flags);
				527	for (i = 0; i < count; ++i) {
				528	page = __rmqueue(zone, order);
				529	if (page == NULL)
				530	break;
				531	allocated++;
				532	list_add_tail(&page->lru, list);
				533	}
				534	spin_unlock_irqrestore(&zone->lock, flags);
				535	return allocated;
				536	}
				537
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	538	#ifdef CONFIG_NUMA
				539	/* Called from the slab reaper to drain remote pagesets */
				540	void drain_remote_pages(void)
				541	{
				542	struct zone *zone;
				543	int i;
				544	unsigned long flags;
				545
				546	local_irq_save(flags);
				547	for_each_zone(zone) {
				548	struct per_cpu_pageset *pset;
				549
				550	/* Do not drain local pagesets */
				551	if (zone->zone_pgdat->node_id == numa_node_id())
				552	continue;
				553
				554	pset = zone->pageset[smp_processor_id()];
				555	for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
				556	struct per_cpu_pages *pcp;
				557
				558	pcp = &pset->pcp[i];
				559	if (pcp->count)
				560	pcp->count -= free_pages_bulk(zone, pcp->count,
				561	&pcp->list, 0);
				562	}
				563	}
				564	local_irq_restore(flags);
				565	}
				566	#endif
				567
#if defined(CONFIG_PM) || defined(CONFIG_HOTPLUG_CPU)
/*
 * Free every page held on @cpu's per-cpu lists, in every zone, back to
 * the buddy lists.  The per-list count is reduced by however many pages
 * free_pages_bulk() reports as freed.
 */
static void __drain_pages(unsigned int cpu)
{
	struct zone *zone;
	int i;

	for_each_zone(zone) {
		struct per_cpu_pageset *pset;

		pset = zone_pcp(zone, cpu);
		for (i = 0; i < ARRAY_SIZE(pset->pcp); i++) {
			struct per_cpu_pages *pcp;

			pcp = &pset->pcp[i];
			pcp->count -= free_pages_bulk(zone, pcp->count,
						&pcp->list, 0);
		}
	}
}
#endif /* CONFIG_PM || CONFIG_HOTPLUG_CPU */
				588
				589	#ifdef CONFIG_PM
				590
				591	void mark_free_pages(struct zone *zone)
				592	{
				593	unsigned long zone_pfn, flags;
				594	int order;
				595	struct list_head *curr;
				596
				597	if (!zone->spanned_pages)
				598	return;
				599
				600	spin_lock_irqsave(&zone->lock, flags);
				601	for (zone_pfn = 0; zone_pfn < zone->spanned_pages; ++zone_pfn)
				602	ClearPageNosaveFree(pfn_to_page(zone_pfn + zone->zone_start_pfn));
				603
				604	for (order = MAX_ORDER - 1; order >= 0; --order)
				605	list_for_each(curr, &zone->free_area[order].free_list) {
				606	unsigned long start_pfn, i;
				607
				608	start_pfn = page_to_pfn(list_entry(curr, struct page, lru));
				609
				610	for (i=0; i < (1<<order); i++)
				611	SetPageNosaveFree(pfn_to_page(start_pfn+i));
				612	}
				613	spin_unlock_irqrestore(&zone->lock, flags);
				614	}
				615
				616	/*
				617	* Spill all of this CPU's per-cpu pages back into the buddy allocator.
				618	*/
				619	void drain_local_pages(void)
				620	{
				621	unsigned long flags;
				622
				623	local_irq_save(flags);
				624	__drain_pages(smp_processor_id());
				625	local_irq_restore(flags);
				626	}
				627	#endif /* CONFIG_PM */
				628
				629	static void zone_statistics(struct zonelist zonelist, struct zone z)
				630	{
				631	#ifdef CONFIG_NUMA
				632	unsigned long flags;
				633	int cpu;
				634	pg_data_t *pg = z->zone_pgdat;
				635	pg_data_t *orig = zonelist->zones[0]->zone_pgdat;
				636	struct per_cpu_pageset *p;
				637
				638	local_irq_save(flags);
				639	cpu = smp_processor_id();
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	640	p = zone_pcp(z,cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	641	if (pg == orig) {
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	642	p->numa_hit++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	643	} else {
				644	p->numa_miss++;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	645	zone_pcp(zonelist->zones[0], cpu)->numa_foreign++;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	646	}
				647	if (pg == NODE_DATA(numa_node_id()))
				648	p->local_node++;
				649	else
				650	p->other_node++;
				651	local_irq_restore(flags);
				652	#endif
				653	}
				654
				655	/*
				656	* Free a 0-order page
				657	*/
				658	static void FASTCALL(free_hot_cold_page(struct page *page, int cold));
				659	static void fastcall free_hot_cold_page(struct page *page, int cold)
				660	{
				661	struct zone *zone = page_zone(page);
				662	struct per_cpu_pages *pcp;
				663	unsigned long flags;
				664
				665	arch_free_page(page, 0);
				666
				667	kernel_map_pages(page, 1, 0);
				668	inc_page_state(pgfree);
				669	if (PageAnon(page))
				670	page->mapping = NULL;
				671	free_pages_check(__FUNCTION__, page);
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	672	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	673	local_irq_save(flags);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	674	list_add(&page->lru, &pcp->list);
				675	pcp->count++;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	676	if (pcp->count >= pcp->high)
				677	pcp->count -= free_pages_bulk(zone, pcp->batch, &pcp->list, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	678	local_irq_restore(flags);
				679	put_cpu();
				680	}
				681
				682	void fastcall free_hot_page(struct page *page)
				683	{
				684	free_hot_cold_page(page, 0);
				685	}
				686
				687	void fastcall free_cold_page(struct page *page)
				688	{
				689	free_hot_cold_page(page, 1);
				690	}
				691
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	692	static inline void prep_zero_page(struct page *page, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	693	{
				694	int i;
				695
				696	BUG_ON((gfp_flags & (__GFP_WAIT \| __GFP_HIGHMEM)) == __GFP_HIGHMEM);
				697	for(i = 0; i < (1 << order); i++)
				698	clear_highpage(page + i);
				699	}
				700
				701	/*
				702	* Really, prep_compound_page() should be called from __rmqueue_bulk(). But
				703	* we cheat by calling it from here, in the order > 0 path. Saves a branch
				704	* or two.
				705	*/
				706	static struct page *
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	707	buffered_rmqueue(struct zone *zone, int order, gfp_t gfp_flags)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	708	{
				709	unsigned long flags;
				710	struct page *page = NULL;
				711	int cold = !!(gfp_flags & __GFP_COLD);
				712
				713	if (order == 0) {
				714	struct per_cpu_pages *pcp;
				715
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	716	pcp = &zone_pcp(zone, get_cpu())->pcp[cold];
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	717	local_irq_save(flags);
				718	if (pcp->count <= pcp->low)
				719	pcp->count += rmqueue_bulk(zone, 0,
				720	pcp->batch, &pcp->list);
				721	if (pcp->count) {
				722	page = list_entry(pcp->list.next, struct page, lru);
				723	list_del(&page->lru);
				724	pcp->count--;
				725	}
				726	local_irq_restore(flags);
				727	put_cpu();
				728	}
				729
				730	if (page == NULL) {
				731	spin_lock_irqsave(&zone->lock, flags);
				732	page = __rmqueue(zone, order);
				733	spin_unlock_irqrestore(&zone->lock, flags);
				734	}
				735
				736	if (page != NULL) {
				737	BUG_ON(bad_range(zone, page));
				738	mod_page_state_zone(zone, pgalloc, 1 << order);
				739	prep_new_page(page, order);
				740
				741	if (gfp_flags & __GFP_ZERO)
				742	prep_zero_page(page, order, gfp_flags);
				743
				744	if (order && (gfp_flags & __GFP_COMP))
				745	prep_compound_page(page, order);
				746	}
				747	return page;
				748	}
				749
				750	/*
				751	* Return 1 if free pages are above 'mark'. This takes into account the order
				752	* of the allocation.
				753	*/
				754	int zone_watermark_ok(struct zone *z, int order, unsigned long mark,
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	755	int classzone_idx, int can_try_harder, gfp_t gfp_high)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	756	{
				757	/* free_pages my go negative - that's OK */
				758	long min = mark, free_pages = z->free_pages - (1 << order) + 1;
				759	int o;
				760
				761	if (gfp_high)
				762	min -= min / 2;
				763	if (can_try_harder)
				764	min -= min / 4;
				765
				766	if (free_pages <= min + z->lowmem_reserve[classzone_idx])
				767	return 0;
				768	for (o = 0; o < order; o++) {
				769	/* At the next order, this order's pages become unavailable */
				770	free_pages -= z->free_area[o].nr_free << o;
				771
				772	/* Require fewer higher order pages to be free */
				773	min >>= 1;
				774
				775	if (free_pages <= min)
				776	return 0;
				777	}
				778	return 1;
				779	}
				780
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	781	static inline int
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	782	should_reclaim_zone(struct zone *z, gfp_t gfp_mask)
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	783	{
				784	if (!z->reclaim_pages)
				785	return 0;
Martin Hicks	0c35bba	2005-06-21 17:14:42 -0700	[diff] [blame]	786	if (gfp_mask & __GFP_NORECLAIM)
				787	return 0;
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	788	return 1;
				789	}
				790
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	791	/*
				792	* This is the 'heart' of the zoned buddy allocator.
				793	*/
				794	struct page * fastcall
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	795	__alloc_pages(gfp_t gfp_mask, unsigned int order,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	796	struct zonelist *zonelist)
				797	{
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	798	const gfp_t wait = gfp_mask & __GFP_WAIT;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	799	struct zone *zones, z;
				800	struct page *page;
				801	struct reclaim_state reclaim_state;
				802	struct task_struct *p = current;
				803	int i;
				804	int classzone_idx;
				805	int do_retry;
				806	int can_try_harder;
				807	int did_some_progress;
				808
				809	might_sleep_if(wait);
				810
				811	/*
				812	* The caller may dip into page reserves a bit more if the caller
				813	* cannot run direct reclaim, or is the caller has realtime scheduling
				814	* policy
				815	*/
				816	can_try_harder = (unlikely(rt_task(p)) && !in_interrupt()) \|\| !wait;
				817
				818	zones = zonelist->zones; /* the list of zones suitable for gfp_mask */
				819
				820	if (unlikely(zones[0] == NULL)) {
				821	/* Should this ever happen?? */
				822	return NULL;
				823	}
				824
				825	classzone_idx = zone_idx(zones[0]);
				826
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	827	restart:
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	828	/*
				829	* Go through the zonelist once, looking for a zone with enough free.
				830	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
				831	*/
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	832	for (i = 0; (z = zones[i]) != NULL; i++) {
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	833	int do_reclaim = should_reclaim_zone(z, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	834
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	835	if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	836	continue;
				837
Martin Hicks	753ee72	2005-06-21 17:14:41 -0700	[diff] [blame]	838	/*
				839	* If the zone is to attempt early page reclaim then this loop
				840	* will try to reclaim pages and check the watermark a second
				841	* time before giving up and falling back to the next zone.
				842	*/
				843	zone_reclaim_retry:
				844	if (!zone_watermark_ok(z, order, z->pages_low,
				845	classzone_idx, 0, 0)) {
				846	if (!do_reclaim)
				847	continue;
				848	else {
				849	zone_reclaim(z, gfp_mask, order);
				850	/* Only try reclaim once */
				851	do_reclaim = 0;
				852	goto zone_reclaim_retry;
				853	}
				854	}
				855
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	856	page = buffered_rmqueue(z, order, gfp_mask);
				857	if (page)
				858	goto got_pg;
				859	}
				860
				861	for (i = 0; (z = zones[i]) != NULL; i++)
				862	wakeup_kswapd(z, order);
				863
				864	/*
				865	* Go through the zonelist again. Let __GFP_HIGH and allocations
				866	* coming from realtime tasks to go deeper into reserves
				867	*
				868	* This is the last chance, in general, before the goto nopage.
				869	* Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	870	* See also cpuset_zone_allowed() comment in kernel/cpuset.c.
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	871	*/
				872	for (i = 0; (z = zones[i]) != NULL; i++) {
				873	if (!zone_watermark_ok(z, order, z->pages_min,
				874	classzone_idx, can_try_harder,
				875	gfp_mask & __GFP_HIGH))
				876	continue;
				877
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	878	if (wait && !cpuset_zone_allowed(z, gfp_mask))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	879	continue;
				880
				881	page = buffered_rmqueue(z, order, gfp_mask);
				882	if (page)
				883	goto got_pg;
				884	}
				885
				886	/* This allocation should allow future memory freeing. */
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	887
				888	if (((p->flags & PF_MEMALLOC) \|\| unlikely(test_thread_flag(TIF_MEMDIE)))
				889	&& !in_interrupt()) {
				890	if (!(gfp_mask & __GFP_NOMEMALLOC)) {
				891	/* go through the zonelist yet again, ignoring mins */
				892	for (i = 0; (z = zones[i]) != NULL; i++) {
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	893	if (!cpuset_zone_allowed(z, gfp_mask))
Nick Piggin	b84a35b	2005-05-01 08:58:36 -0700	[diff] [blame]	894	continue;
				895	page = buffered_rmqueue(z, order, gfp_mask);
				896	if (page)
				897	goto got_pg;
				898	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	899	}
				900	goto nopage;
				901	}
				902
				903	/* Atomic allocations - we can't balance anything */
				904	if (!wait)
				905	goto nopage;
				906
				907	rebalance:
				908	cond_resched();
				909
				910	/* We now go into synchronous reclaim */
				911	p->flags \|= PF_MEMALLOC;
				912	reclaim_state.reclaimed_slab = 0;
				913	p->reclaim_state = &reclaim_state;
				914
Darren Hart	1ad539b	2005-06-21 17:14:53 -0700	[diff] [blame]	915	did_some_progress = try_to_free_pages(zones, gfp_mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	916
				917	p->reclaim_state = NULL;
				918	p->flags &= ~PF_MEMALLOC;
				919
				920	cond_resched();
				921
				922	if (likely(did_some_progress)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	923	for (i = 0; (z = zones[i]) != NULL; i++) {
				924	if (!zone_watermark_ok(z, order, z->pages_min,
				925	classzone_idx, can_try_harder,
				926	gfp_mask & __GFP_HIGH))
				927	continue;
				928
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	929	if (!cpuset_zone_allowed(z, gfp_mask))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	930	continue;
				931
				932	page = buffered_rmqueue(z, order, gfp_mask);
				933	if (page)
				934	goto got_pg;
				935	}
				936	} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
				937	/*
				938	* Go through the zonelist yet one more time, keep
				939	* very high watermark here, this is only to catch
				940	* a parallel oom killing, we must fail if we're still
				941	* under heavy pressure.
				942	*/
				943	for (i = 0; (z = zones[i]) != NULL; i++) {
				944	if (!zone_watermark_ok(z, order, z->pages_high,
				945	classzone_idx, 0, 0))
				946	continue;
				947
Paul Jackson	9bf2229	2005-09-06 15:18:12 -0700	[diff] [blame]	948	if (!cpuset_zone_allowed(z, __GFP_HARDWALL))
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	949	continue;
				950
				951	page = buffered_rmqueue(z, order, gfp_mask);
				952	if (page)
				953	goto got_pg;
				954	}
				955
Marcelo Tosatti	79b9ce3	2005-07-07 17:56:04 -0700	[diff] [blame]	956	out_of_memory(gfp_mask, order);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	957	goto restart;
				958	}
				959
				960	/*
				961	* Don't let big-order allocations loop unless the caller explicitly
				962	* requests that. Wait for some write requests to complete then retry.
				963	*
				964	* In this implementation, __GFP_REPEAT means __GFP_NOFAIL for order
				965	* <= 3, but that may not be true in other implementations.
				966	*/
				967	do_retry = 0;
				968	if (!(gfp_mask & __GFP_NORETRY)) {
				969	if ((order <= 3) \|\| (gfp_mask & __GFP_REPEAT))
				970	do_retry = 1;
				971	if (gfp_mask & __GFP_NOFAIL)
				972	do_retry = 1;
				973	}
				974	if (do_retry) {
				975	blk_congestion_wait(WRITE, HZ/50);
				976	goto rebalance;
				977	}
				978
				979	nopage:
				980	if (!(gfp_mask & __GFP_NOWARN) && printk_ratelimit()) {
				981	printk(KERN_WARNING "%s: page allocation failure."
				982	" order:%d, mode:0x%x\n",
				983	p->comm, order, gfp_mask);
				984	dump_stack();
Janet Morgan	578c2fd	2005-06-21 17:14:56 -0700	[diff] [blame]	985	show_mem();
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	986	}
				987	return NULL;
				988	got_pg:
				989	zone_statistics(zonelist, z);
				990	return page;
				991	}
				992
				993	EXPORT_SYMBOL(__alloc_pages);
				994
				995	/*
				996	* Common helper functions.
				997	*/
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	998	fastcall unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	999	{
				1000	struct page * page;
				1001	page = alloc_pages(gfp_mask, order);
				1002	if (!page)
				1003	return 0;
				1004	return (unsigned long) page_address(page);
				1005	}
				1006
				1007	EXPORT_SYMBOL(__get_free_pages);
				1008
Al Viro	dd0fc66	2005-10-07 07:46:04 +0100	[diff] [blame]	1009	fastcall unsigned long get_zeroed_page(gfp_t gfp_mask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1010	{
				1011	struct page * page;
				1012
				1013	/*
				1014	* get_zeroed_page() returns a 32-bit address, which cannot represent
				1015	* a highmem page
				1016	*/
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1017	BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1018
				1019	page = alloc_pages(gfp_mask \| __GFP_ZERO, 0);
				1020	if (page)
				1021	return (unsigned long) page_address(page);
				1022	return 0;
				1023	}
				1024
				1025	EXPORT_SYMBOL(get_zeroed_page);
				1026
				1027	void __pagevec_free(struct pagevec *pvec)
				1028	{
				1029	int i = pagevec_count(pvec);
				1030
				1031	while (--i >= 0)
				1032	free_hot_cold_page(pvec->pages[i], pvec->cold);
				1033	}
				1034
				1035	fastcall void __free_pages(struct page *page, unsigned int order)
				1036	{
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1037	if (put_page_testzero(page)) {
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1038	if (order == 0)
				1039	free_hot_page(page);
				1040	else
				1041	__free_pages_ok(page, order);
				1042	}
				1043	}
				1044
				1045	EXPORT_SYMBOL(__free_pages);
				1046
				1047	fastcall void free_pages(unsigned long addr, unsigned int order)
				1048	{
				1049	if (addr != 0) {
				1050	BUG_ON(!virt_addr_valid((void *)addr));
				1051	__free_pages(virt_to_page((void *)addr), order);
				1052	}
				1053	}
				1054
				1055	EXPORT_SYMBOL(free_pages);
				1056
				1057	/*
				1058	* Total amount of free (allocatable) RAM:
				1059	*/
				1060	unsigned int nr_free_pages(void)
				1061	{
				1062	unsigned int sum = 0;
				1063	struct zone *zone;
				1064
				1065	for_each_zone(zone)
				1066	sum += zone->free_pages;
				1067
				1068	return sum;
				1069	}
				1070
				1071	EXPORT_SYMBOL(nr_free_pages);
				1072
				1073	#ifdef CONFIG_NUMA
				1074	unsigned int nr_free_pages_pgdat(pg_data_t *pgdat)
				1075	{
				1076	unsigned int i, sum = 0;
				1077
				1078	for (i = 0; i < MAX_NR_ZONES; i++)
				1079	sum += pgdat->node_zones[i].free_pages;
				1080
				1081	return sum;
				1082	}
				1083	#endif
				1084
				1085	static unsigned int nr_free_zone_pages(int offset)
				1086	{
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1087	/* Just pick one node, since fallback list is circular */
				1088	pg_data_t *pgdat = NODE_DATA(numa_node_id());
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1089	unsigned int sum = 0;
				1090
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1091	struct zonelist *zonelist = pgdat->node_zonelists + offset;
				1092	struct zone **zonep = zonelist->zones;
				1093	struct zone *zone;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1094
Martin J. Bligh	e310fd4	2005-07-29 22:59:18 -0700	[diff] [blame]	1095	for (zone = zonep++; zone; zone = zonep++) {
				1096	unsigned long size = zone->present_pages;
				1097	unsigned long high = zone->pages_high;
				1098	if (size > high)
				1099	sum += size - high;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1100	}
				1101
				1102	return sum;
				1103	}
				1104
				1105	/*
				1106	* Amount of free RAM allocatable within ZONE_DMA and ZONE_NORMAL
				1107	*/
				1108	unsigned int nr_free_buffer_pages(void)
				1109	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1110	return nr_free_zone_pages(gfp_zone(GFP_USER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1111	}
				1112
				1113	/*
				1114	* Amount of free RAM allocatable within all zones
				1115	*/
				1116	unsigned int nr_free_pagecache_pages(void)
				1117	{
Al Viro	af4ca45	2005-10-21 02:55:38 -0400	[diff] [blame]	1118	return nr_free_zone_pages(gfp_zone(GFP_HIGHUSER));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1119	}
				1120
				1121	#ifdef CONFIG_HIGHMEM
				1122	unsigned int nr_free_highpages (void)
				1123	{
				1124	pg_data_t *pgdat;
				1125	unsigned int pages = 0;
				1126
				1127	for_each_pgdat(pgdat)
				1128	pages += pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1129
				1130	return pages;
				1131	}
				1132	#endif
				1133
				1134	#ifdef CONFIG_NUMA
				1135	static void show_node(struct zone *zone)
				1136	{
				1137	printk("Node %d ", zone->zone_pgdat->node_id);
				1138	}
				1139	#else
				1140	#define show_node(zone) do { } while (0)
				1141	#endif
				1142
				1143	/*
				1144	* Accumulate the page_state information across all CPUs.
				1145	* The result is unavoidably approximate - it can change
				1146	* during and after execution of this function.
				1147	*/
				1148	static DEFINE_PER_CPU(struct page_state, page_states) = {0};
				1149
				1150	atomic_t nr_pagecache = ATOMIC_INIT(0);
				1151	EXPORT_SYMBOL(nr_pagecache);
				1152	#ifdef CONFIG_SMP
				1153	DEFINE_PER_CPU(long, nr_pagecache_local) = 0;
				1154	#endif
				1155
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1156	void __get_page_state(struct page_state ret, int nr, cpumask_t cpumask)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1157	{
				1158	int cpu = 0;
				1159
				1160	memset(ret, 0, sizeof(*ret));
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1161	cpus_and(cpumask, cpumask, cpu_online_map);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1162
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1163	cpu = first_cpu(*cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1164	while (cpu < NR_CPUS) {
				1165	unsigned long in, out, off;
				1166
				1167	in = (unsigned long *)&per_cpu(page_states, cpu);
				1168
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1169	cpu = next_cpu(cpu, *cpumask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1170
				1171	if (cpu < NR_CPUS)
				1172	prefetch(&per_cpu(page_states, cpu));
				1173
				1174	out = (unsigned long *)ret;
				1175	for (off = 0; off < nr; off++)
				1176	out++ += in++;
				1177	}
				1178	}
				1179
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1180	void get_page_state_node(struct page_state *ret, int node)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1181	{
				1182	int nr;
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1183	cpumask_t mask = node_to_cpumask(node);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1184
				1185	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1186	nr /= sizeof(unsigned long);
				1187
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1188	__get_page_state(ret, nr+1, &mask);
				1189	}
				1190
				1191	void get_page_state(struct page_state *ret)
				1192	{
				1193	int nr;
				1194	cpumask_t mask = CPU_MASK_ALL;
				1195
				1196	nr = offsetof(struct page_state, GET_PAGE_STATE_LAST);
				1197	nr /= sizeof(unsigned long);
				1198
				1199	__get_page_state(ret, nr + 1, &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1200	}
				1201
				1202	void get_full_page_state(struct page_state *ret)
				1203	{
Martin Hicks	c07e02d	2005-09-03 15:55:11 -0700	[diff] [blame]	1204	cpumask_t mask = CPU_MASK_ALL;
				1205
				1206	__get_page_state(ret, sizeof(*ret) / sizeof(unsigned long), &mask);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1207	}
				1208
Benjamin LaHaise	c2f29ea	2005-06-21 17:14:55 -0700	[diff] [blame]	1209	unsigned long __read_page_state(unsigned long offset)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1210	{
				1211	unsigned long ret = 0;
				1212	int cpu;
				1213
				1214	for_each_online_cpu(cpu) {
				1215	unsigned long in;
				1216
				1217	in = (unsigned long)&per_cpu(page_states, cpu) + offset;
				1218	ret += ((unsigned long )in);
				1219	}
				1220	return ret;
				1221	}
				1222
Benjamin LaHaise	83e5d8f	2005-06-21 17:14:54 -0700	[diff] [blame]	1223	void __mod_page_state(unsigned long offset, unsigned long delta)
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1224	{
				1225	unsigned long flags;
				1226	void* ptr;
				1227
				1228	local_irq_save(flags);
				1229	ptr = &__get_cpu_var(page_states);
				1230	(unsigned long)(ptr + offset) += delta;
				1231	local_irq_restore(flags);
				1232	}
				1233
				1234	EXPORT_SYMBOL(__mod_page_state);
				1235
				1236	void __get_zone_counts(unsigned long active, unsigned long inactive,
				1237	unsigned long free, struct pglist_data pgdat)
				1238	{
				1239	struct zone *zones = pgdat->node_zones;
				1240	int i;
				1241
				1242	*active = 0;
				1243	*inactive = 0;
				1244	*free = 0;
				1245	for (i = 0; i < MAX_NR_ZONES; i++) {
				1246	*active += zones[i].nr_active;
				1247	*inactive += zones[i].nr_inactive;
				1248	*free += zones[i].free_pages;
				1249	}
				1250	}
				1251
				1252	void get_zone_counts(unsigned long *active,
				1253	unsigned long inactive, unsigned long free)
				1254	{
				1255	struct pglist_data *pgdat;
				1256
				1257	*active = 0;
				1258	*inactive = 0;
				1259	*free = 0;
				1260	for_each_pgdat(pgdat) {
				1261	unsigned long l, m, n;
				1262	__get_zone_counts(&l, &m, &n, pgdat);
				1263	*active += l;
				1264	*inactive += m;
				1265	*free += n;
				1266	}
				1267	}
				1268
				1269	void si_meminfo(struct sysinfo *val)
				1270	{
				1271	val->totalram = totalram_pages;
				1272	val->sharedram = 0;
				1273	val->freeram = nr_free_pages();
				1274	val->bufferram = nr_blockdev_pages();
				1275	#ifdef CONFIG_HIGHMEM
				1276	val->totalhigh = totalhigh_pages;
				1277	val->freehigh = nr_free_highpages();
				1278	#else
				1279	val->totalhigh = 0;
				1280	val->freehigh = 0;
				1281	#endif
				1282	val->mem_unit = PAGE_SIZE;
				1283	}
				1284
				1285	EXPORT_SYMBOL(si_meminfo);
				1286
				1287	#ifdef CONFIG_NUMA
				1288	void si_meminfo_node(struct sysinfo *val, int nid)
				1289	{
				1290	pg_data_t *pgdat = NODE_DATA(nid);
				1291
				1292	val->totalram = pgdat->node_present_pages;
				1293	val->freeram = nr_free_pages_pgdat(pgdat);
				1294	val->totalhigh = pgdat->node_zones[ZONE_HIGHMEM].present_pages;
				1295	val->freehigh = pgdat->node_zones[ZONE_HIGHMEM].free_pages;
				1296	val->mem_unit = PAGE_SIZE;
				1297	}
				1298	#endif
				1299
				1300	#define K(x) ((x) << (PAGE_SHIFT-10))
				1301
				1302	/*
				1303	* Show free area list (used inside shift_scroll-lock stuff)
				1304	* We also calculate the percentage fragmentation. We do this by counting the
				1305	* memory on each free list with the exception of the first item on the list.
				1306	*/
				1307	void show_free_areas(void)
				1308	{
				1309	struct page_state ps;
				1310	int cpu, temperature;
				1311	unsigned long active;
				1312	unsigned long inactive;
				1313	unsigned long free;
				1314	struct zone *zone;
				1315
				1316	for_each_zone(zone) {
				1317	show_node(zone);
				1318	printk("%s per-cpu:", zone->name);
				1319
				1320	if (!zone->present_pages) {
				1321	printk(" empty\n");
				1322	continue;
				1323	} else
				1324	printk("\n");
				1325
				1326	for (cpu = 0; cpu < NR_CPUS; ++cpu) {
				1327	struct per_cpu_pageset *pageset;
				1328
				1329	if (!cpu_possible(cpu))
				1330	continue;
				1331
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1332	pageset = zone_pcp(zone, cpu);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1333
				1334	for (temperature = 0; temperature < 2; temperature++)
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	1335	printk("cpu %d %s: low %d, high %d, batch %d used:%d\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1336	cpu,
				1337	temperature ? "cold" : "hot",
				1338	pageset->pcp[temperature].low,
				1339	pageset->pcp[temperature].high,
Christoph Lameter	4ae7c03	2005-06-21 17:14:57 -0700	[diff] [blame]	1340	pageset->pcp[temperature].batch,
				1341	pageset->pcp[temperature].count);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1342	}
				1343	}
				1344
				1345	get_page_state(&ps);
				1346	get_zone_counts(&active, &inactive, &free);
				1347
Denis Vlasenko	c0d6221	2005-06-21 17:15:14 -0700	[diff] [blame]	1348	printk("Free pages: %11ukB (%ukB HighMem)\n",
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1349	K(nr_free_pages()),
				1350	K(nr_free_highpages()));
				1351
				1352	printk("Active:%lu inactive:%lu dirty:%lu writeback:%lu "
				1353	"unstable:%lu free:%u slab:%lu mapped:%lu pagetables:%lu\n",
				1354	active,
				1355	inactive,
				1356	ps.nr_dirty,
				1357	ps.nr_writeback,
				1358	ps.nr_unstable,
				1359	nr_free_pages(),
				1360	ps.nr_slab,
				1361	ps.nr_mapped,
				1362	ps.nr_page_table_pages);
				1363
				1364	for_each_zone(zone) {
				1365	int i;
				1366
				1367	show_node(zone);
				1368	printk("%s"
				1369	" free:%lukB"
				1370	" min:%lukB"
				1371	" low:%lukB"
				1372	" high:%lukB"
				1373	" active:%lukB"
				1374	" inactive:%lukB"
				1375	" present:%lukB"
				1376	" pages_scanned:%lu"
				1377	" all_unreclaimable? %s"
				1378	"\n",
				1379	zone->name,
				1380	K(zone->free_pages),
				1381	K(zone->pages_min),
				1382	K(zone->pages_low),
				1383	K(zone->pages_high),
				1384	K(zone->nr_active),
				1385	K(zone->nr_inactive),
				1386	K(zone->present_pages),
				1387	zone->pages_scanned,
				1388	(zone->all_unreclaimable ? "yes" : "no")
				1389	);
				1390	printk("lowmem_reserve[]:");
				1391	for (i = 0; i < MAX_NR_ZONES; i++)
				1392	printk(" %lu", zone->lowmem_reserve[i]);
				1393	printk("\n");
				1394	}
				1395
				1396	for_each_zone(zone) {
				1397	unsigned long nr, flags, order, total = 0;
				1398
				1399	show_node(zone);
				1400	printk("%s: ", zone->name);
				1401	if (!zone->present_pages) {
				1402	printk("empty\n");
				1403	continue;
				1404	}
				1405
				1406	spin_lock_irqsave(&zone->lock, flags);
				1407	for (order = 0; order < MAX_ORDER; order++) {
				1408	nr = zone->free_area[order].nr_free;
				1409	total += nr << order;
				1410	printk("%lu*%lukB ", nr, K(1UL) << order);
				1411	}
				1412	spin_unlock_irqrestore(&zone->lock, flags);
				1413	printk("= %lukB\n", K(total));
				1414	}
				1415
				1416	show_swap_cache_info();
				1417	}
				1418
				1419	/*
				1420	* Builds allocation fallback zone lists.
				1421	*/
				1422	static int __init build_zonelists_node(pg_data_t pgdat, struct zonelist zonelist, int j, int k)
				1423	{
				1424	switch (k) {
				1425	struct zone *zone;
				1426	default:
				1427	BUG();
				1428	case ZONE_HIGHMEM:
				1429	zone = pgdat->node_zones + ZONE_HIGHMEM;
				1430	if (zone->present_pages) {
				1431	#ifndef CONFIG_HIGHMEM
				1432	BUG();
				1433	#endif
				1434	zonelist->zones[j++] = zone;
				1435	}
				1436	case ZONE_NORMAL:
				1437	zone = pgdat->node_zones + ZONE_NORMAL;
				1438	if (zone->present_pages)
				1439	zonelist->zones[j++] = zone;
				1440	case ZONE_DMA:
				1441	zone = pgdat->node_zones + ZONE_DMA;
				1442	if (zone->present_pages)
				1443	zonelist->zones[j++] = zone;
				1444	}
				1445
				1446	return j;
				1447	}
				1448
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1449	static inline int highest_zone(int zone_bits)
				1450	{
				1451	int res = ZONE_NORMAL;
				1452	if (zone_bits & (__force int)__GFP_HIGHMEM)
				1453	res = ZONE_HIGHMEM;
				1454	if (zone_bits & (__force int)__GFP_DMA)
				1455	res = ZONE_DMA;
				1456	return res;
				1457	}
				1458
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1459	#ifdef CONFIG_NUMA
				1460	#define MAX_NODE_LOAD (num_online_nodes())
				1461	static int __initdata node_load[MAX_NUMNODES];
				1462	/**
Pavel Pisa	4dc3b16	2005-05-01 08:59:25 -0700	[diff] [blame]	1463	* find_next_best_node - find the next node that should appear in a given node's fallback list
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1464	* @node: node whose fallback list we're appending
				1465	* @used_node_mask: nodemask_t of already used nodes
				1466	*
				1467	* We use a number of factors to determine which is the next node that should
				1468	* appear on a given node's fallback list. The node should not have appeared
				1469	* already in @node's fallback list, and it should be the next closest node
				1470	* according to the distance array (which contains arbitrary distance values
				1471	* from each node to each node in the system), and should also prefer nodes
				1472	* with no CPUs, since presumably they'll have very little allocation pressure
				1473	* on them otherwise.
				1474	* It returns -1 if no node is found.
				1475	*/
				1476	static int __init find_next_best_node(int node, nodemask_t *used_node_mask)
				1477	{
				1478	int i, n, val;
				1479	int min_val = INT_MAX;
				1480	int best_node = -1;
				1481
				1482	for_each_online_node(i) {
				1483	cpumask_t tmp;
				1484
				1485	/* Start from local node */
				1486	n = (node+i) % num_online_nodes();
				1487
				1488	/* Don't want a node to appear more than once */
				1489	if (node_isset(n, *used_node_mask))
				1490	continue;
				1491
				1492	/* Use the local node if we haven't already */
				1493	if (!node_isset(node, *used_node_mask)) {
				1494	best_node = node;
				1495	break;
				1496	}
				1497
				1498	/* Use the distance array to find the distance */
				1499	val = node_distance(node, n);
				1500
				1501	/* Give preference to headless and unused nodes */
				1502	tmp = node_to_cpumask(n);
				1503	if (!cpus_empty(tmp))
				1504	val += PENALTY_FOR_NODE_WITH_CPUS;
				1505
				1506	/* Slight preference for less loaded node */
				1507	val = (MAX_NODE_LOADMAX_NUMNODES);
				1508	val += node_load[n];
				1509
				1510	if (val < min_val) {
				1511	min_val = val;
				1512	best_node = n;
				1513	}
				1514	}
				1515
				1516	if (best_node >= 0)
				1517	node_set(best_node, *used_node_mask);
				1518
				1519	return best_node;
				1520	}
				1521
				1522	static void __init build_zonelists(pg_data_t *pgdat)
				1523	{
				1524	int i, j, k, node, local_node;
				1525	int prev_node, load;
				1526	struct zonelist *zonelist;
				1527	nodemask_t used_mask;
				1528
				1529	/* initialize zonelists */
				1530	for (i = 0; i < GFP_ZONETYPES; i++) {
				1531	zonelist = pgdat->node_zonelists + i;
				1532	zonelist->zones[0] = NULL;
				1533	}
				1534
				1535	/* NUMA-aware ordering of nodes */
				1536	local_node = pgdat->node_id;
				1537	load = num_online_nodes();
				1538	prev_node = local_node;
				1539	nodes_clear(used_mask);
				1540	while ((node = find_next_best_node(local_node, &used_mask)) >= 0) {
				1541	/*
				1542	* We don't want to pressure a particular node.
				1543	* So adding penalty to the first node in same
				1544	* distance group to make it round-robin.
				1545	*/
				1546	if (node_distance(local_node, node) !=
				1547	node_distance(local_node, prev_node))
				1548	node_load[node] += load;
				1549	prev_node = node;
				1550	load--;
				1551	for (i = 0; i < GFP_ZONETYPES; i++) {
				1552	zonelist = pgdat->node_zonelists + i;
				1553	for (j = 0; zonelist->zones[j] != NULL; j++);
				1554
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1555	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1556
				1557	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1558	zonelist->zones[j] = NULL;
				1559	}
				1560	}
				1561	}
				1562
				1563	#else /* CONFIG_NUMA */
				1564
				1565	static void __init build_zonelists(pg_data_t *pgdat)
				1566	{
				1567	int i, j, k, node, local_node;
				1568
				1569	local_node = pgdat->node_id;
				1570	for (i = 0; i < GFP_ZONETYPES; i++) {
				1571	struct zonelist *zonelist;
				1572
				1573	zonelist = pgdat->node_zonelists + i;
				1574
				1575	j = 0;
Al Viro	260b236	2005-10-21 03:22:44 -0400	[diff] [blame]	1576	k = highest_zone(i);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1577	j = build_zonelists_node(pgdat, zonelist, j, k);
				1578	/*
				1579	* Now we build the zonelist so that it contains the zones
				1580	* of all the other nodes.
				1581	* We don't want to pressure a particular node, so when
				1582	* building the zones for node N, we make sure that the
				1583	* zones coming right after the local ones are those from
				1584	* node N+1 (modulo N)
				1585	*/
				1586	for (node = local_node + 1; node < MAX_NUMNODES; node++) {
				1587	if (!node_online(node))
				1588	continue;
				1589	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1590	}
				1591	for (node = 0; node < local_node; node++) {
				1592	if (!node_online(node))
				1593	continue;
				1594	j = build_zonelists_node(NODE_DATA(node), zonelist, j, k);
				1595	}
				1596
				1597	zonelist->zones[j] = NULL;
				1598	}
				1599	}
				1600
				1601	#endif /* CONFIG_NUMA */
				1602
				1603	void __init build_all_zonelists(void)
				1604	{
				1605	int i;
				1606
				1607	for_each_online_node(i)
				1608	build_zonelists(NODE_DATA(i));
				1609	printk("Built %i zonelists\n", num_online_nodes());
				1610	cpuset_init_current_mems_allowed();
				1611	}
				1612
				1613	/*
				1614	* Helper functions to size the waitqueue hash table.
				1615	* Essentially these want to choose hash table sizes sufficiently
				1616	* large so that collisions trying to wait on pages are rare.
				1617	* But in fact, the number of active page waitqueues on typical
				1618	* systems is ridiculously low, less than 200. So this is even
				1619	* conservative, even though it seems large.
				1620	*
				1621	* The constant PAGES_PER_WAITQUEUE specifies the ratio of pages to
				1622	* waitqueues, i.e. the size of the waitq table given the number of pages.
				1623	*/
				1624	#define PAGES_PER_WAITQUEUE 256
				1625
				1626	static inline unsigned long wait_table_size(unsigned long pages)
				1627	{
				1628	unsigned long size = 1;
				1629
				1630	pages /= PAGES_PER_WAITQUEUE;
				1631
				1632	while (size < pages)
				1633	size <<= 1;
				1634
				1635	/*
				1636	* Once we have dozens or even hundreds of threads sleeping
				1637	* on IO we've got bigger problems than wait queue collision.
				1638	* Limit the size of the wait table to a reasonable size.
				1639	*/
				1640	size = min(size, 4096UL);
				1641
				1642	return max(size, 4UL);
				1643	}
				1644
				1645	/*
				1646	* This is an integer logarithm so that shifts can be used later
				1647	* to extract the more random high bits from the multiplicative
				1648	* hash function before the remainder is taken.
				1649	*/
				1650	static inline unsigned long wait_table_bits(unsigned long size)
				1651	{
				1652	return ffz(~size);
				1653	}
				1654
				1655	#define LONG_ALIGN(x) (((x)+(sizeof(long))-1)&~((sizeof(long))-1))
				1656
				1657	static void __init calculate_zone_totalpages(struct pglist_data *pgdat,
				1658	unsigned long zones_size, unsigned long zholes_size)
				1659	{
				1660	unsigned long realtotalpages, totalpages = 0;
				1661	int i;
				1662
				1663	for (i = 0; i < MAX_NR_ZONES; i++)
				1664	totalpages += zones_size[i];
				1665	pgdat->node_spanned_pages = totalpages;
				1666
				1667	realtotalpages = totalpages;
				1668	if (zholes_size)
				1669	for (i = 0; i < MAX_NR_ZONES; i++)
				1670	realtotalpages -= zholes_size[i];
				1671	pgdat->node_present_pages = realtotalpages;
				1672	printk(KERN_DEBUG "On node %d totalpages: %lu\n", pgdat->node_id, realtotalpages);
				1673	}
				1674
				1675
				1676	/*
				1677	* Initially all pages are reserved - free ones are freed
				1678	* up by free_all_bootmem() once the early boot process is
				1679	* done. Non-atomic initialization, single-pass.
				1680	*/
				1681	void __init memmap_init_zone(unsigned long size, int nid, unsigned long zone,
				1682	unsigned long start_pfn)
				1683	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1684	struct page *page;
Andy Whitcroft	29751f6	2005-06-23 00:08:00 -0700	[diff] [blame]	1685	unsigned long end_pfn = start_pfn + size;
				1686	unsigned long pfn;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1687
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1688	for (pfn = start_pfn; pfn < end_pfn; pfn++, page++) {
				1689	if (!early_pfn_valid(pfn))
				1690	continue;
Andy Whitcroft	641c767	2005-06-23 00:07:59 -0700	[diff] [blame]	1691	if (!early_pfn_in_nid(pfn, nid))
				1692	continue;
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1693	page = pfn_to_page(pfn);
				1694	set_page_links(page, zone, nid, pfn);
Nick Piggin	b581003	2005-10-29 18:16:12 -0700	[diff] [blame]	1695	set_page_count(page, 1);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1696	reset_page_mapcount(page);
				1697	SetPageReserved(page);
				1698	INIT_LIST_HEAD(&page->lru);
				1699	#ifdef WANT_PAGE_VIRTUAL
				1700	/* The shift won't overflow because ZONE_NORMAL is below 4G. */
				1701	if (!is_highmem_idx(zone))
Bob Picco	3212c6b	2005-06-27 14:36:28 -0700	[diff] [blame]	1702	set_page_address(page, __va(pfn << PAGE_SHIFT));
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1703	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1704	}
				1705	}
				1706
				1707	void zone_init_free_lists(struct pglist_data pgdat, struct zone zone,
				1708	unsigned long size)
				1709	{
				1710	int order;
				1711	for (order = 0; order < MAX_ORDER ; order++) {
				1712	INIT_LIST_HEAD(&zone->free_area[order].free_list);
				1713	zone->free_area[order].nr_free = 0;
				1714	}
				1715	}
				1716
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1717	#define ZONETABLE_INDEX(x, zone_nr) ((x << ZONES_SHIFT) \| zone_nr)
				1718	void zonetable_add(struct zone *zone, int nid, int zid, unsigned long pfn,
				1719	unsigned long size)
				1720	{
				1721	unsigned long snum = pfn_to_section_nr(pfn);
				1722	unsigned long end = pfn_to_section_nr(pfn + size);
				1723
				1724	if (FLAGS_HAS_NODE)
				1725	zone_table[ZONETABLE_INDEX(nid, zid)] = zone;
				1726	else
				1727	for (; snum <= end; snum++)
				1728	zone_table[ZONETABLE_INDEX(snum, zid)] = zone;
				1729	}
				1730
/* Default memmap_init(): used unless the architecture supplies its own. */
#ifndef __HAVE_ARCH_MEMMAP_INIT
#define memmap_init(size, nid, zone, start_pfn) \
	memmap_init_zone((size), (nid), (zone), (start_pfn))
#endif
				1735
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1736	static int __devinit zone_batchsize(struct zone *zone)
				1737	{
				1738	int batch;
				1739
				1740	/*
				1741	* The per-cpu-pages pools are set to around 1000th of the
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1742	* size of the zone. But no more than 1/2 of a meg.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1743	*
				1744	* OK, so we don't know how big the cache is. So guess.
				1745	*/
				1746	batch = zone->present_pages / 1024;
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1747	if (batch * PAGE_SIZE > 512 * 1024)
				1748	batch = (512 * 1024) / PAGE_SIZE;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1749	batch /= 4; /* We effectively = 4 below /
				1750	if (batch < 1)
				1751	batch = 1;
				1752
				1753	/*
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1754	* We will be trying to allcoate bigger chunks of contiguous
				1755	* memory of the order of fls(batch). This should result in
				1756	* better cache coloring.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1757	*
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1758	* A sanity check also to ensure that batch is still in limits.
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1759	*/
Seth, Rohit	ba56e91	2005-10-29 18:15:47 -0700	[diff] [blame]	1760	batch = (1 << fls(batch + batch/2));
				1761
				1762	if (fls(batch) >= (PAGE_SHIFT + MAX_ORDER - 2))
				1763	batch = PAGE_SHIFT + ((MAX_ORDER - 1 - PAGE_SHIFT)/2);
				1764
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1765	return batch;
				1766	}
				1767
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1768	inline void setup_pageset(struct per_cpu_pageset *p, unsigned long batch)
				1769	{
				1770	struct per_cpu_pages *pcp;
				1771
Magnus Damm	1c6fe94	2005-10-26 01:58:59 -0700	[diff] [blame]	1772	memset(p, 0, sizeof(*p));
				1773
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1774	pcp = &p->pcp[0]; /* hot */
				1775	pcp->count = 0;
Seth, Rohit	e46a5e2	2005-10-29 18:15:48 -0700	[diff] [blame]	1776	pcp->low = 0;
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1777	pcp->high = 6 * batch;
				1778	pcp->batch = max(1UL, 1 * batch);
				1779	INIT_LIST_HEAD(&pcp->list);
				1780
				1781	pcp = &p->pcp[1]; /* cold*/
				1782	pcp->count = 0;
				1783	pcp->low = 0;
				1784	pcp->high = 2 * batch;
Seth, Rohit	e46a5e2	2005-10-29 18:15:48 -0700	[diff] [blame]	1785	pcp->batch = max(1UL, batch/2);
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1786	INIT_LIST_HEAD(&pcp->list);
				1787	}
				1788
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1789	#ifdef CONFIG_NUMA
				1790	/*
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1791	* Boot pageset table. One per cpu which is going to be used for all
				1792	* zones and all nodes. The parameters will be set in such a way
				1793	* that an item put on a list will immediately be handed over to
				1794	* the buddy list. This is safe since pageset manipulation is done
				1795	* with interrupts disabled.
				1796	*
				1797	* Some NUMA counter updates may also be caught by the boot pagesets.
Christoph Lameter	b7c84c6	2005-06-22 20:26:07 -0700	[diff] [blame]	1798	*
				1799	* The boot_pagesets must be kept even after bootup is complete for
				1800	* unused processors and/or zones. They do play a role for bootstrapping
				1801	* hotplugged processors.
				1802	*
				1803	* zoneinfo_show() and maybe other functions do
				1804	* not check if the processor is online before following the pageset pointer.
				1805	* Other parts of the kernel may not check if the zone is available.
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1806	*/
				1807	static struct per_cpu_pageset
Christoph Lameter	b7c84c6	2005-06-22 20:26:07 -0700	[diff] [blame]	1808	boot_pageset[NR_CPUS];
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1809
				1810	/*
				1811	* Dynamically allocate memory for the
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1812	* per cpu pageset array in struct zone.
				1813	*/
				1814	static int __devinit process_zones(int cpu)
				1815	{
				1816	struct zone zone, dzone;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1817
				1818	for_each_zone(zone) {
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1819
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1820	zone->pageset[cpu] = kmalloc_node(sizeof(struct per_cpu_pageset),
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1821	GFP_KERNEL, cpu_to_node(cpu));
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1822	if (!zone->pageset[cpu])
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1823	goto bad;
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1824
Christoph Lameter	2caaad4	2005-06-21 17:15:00 -0700	[diff] [blame]	1825	setup_pageset(zone->pageset[cpu], zone_batchsize(zone));
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	1826	}
				1827
				1828	return 0;
				1829	bad:
				1830	for_each_zone(dzone) {
				1831	if (dzone == zone)
				1832	break;
				1833	kfree(dzone->pageset[cpu]);
				1834	dzone->pageset[cpu] = NULL;
				1835	}
				1836	return -ENOMEM;
				1837	}
				1838
				1839	static inline void free_zone_pagesets(int cpu)
				1840	{
				1841	#ifdef CONFIG_NUMA
				1842	struct zone *zone;
				1843
				1844	for_each_zone(zone) {
				1845	struct per_cpu_pageset *pset = zone_pcp(zone, cpu);
				1846
				1847	zone_pcp(zone, cpu) = NULL;
				1848	kfree(pset);
				1849	}
				1850	#endif
				1851	}
				1852
				1853	static int __devinit pageset_cpuup_callback(struct notifier_block *nfb,
				1854	unsigned long action,
				1855	void *hcpu)
				1856	{
				1857	int cpu = (long)hcpu;
				1858	int ret = NOTIFY_OK;
				1859
				1860	switch (action) {
				1861	case CPU_UP_PREPARE:
				1862	if (process_zones(cpu))
				1863	ret = NOTIFY_BAD;
				1864	break;
				1865	#ifdef CONFIG_HOTPLUG_CPU
				1866	case CPU_DEAD:
				1867	free_zone_pagesets(cpu);
				1868	break;
				1869	#endif
				1870	default:
				1871	break;
				1872	}
				1873	return ret;
				1874	}
				1875
				1876	static struct notifier_block pageset_notifier =
				1877	{ &pageset_cpuup_callback, NULL, 0 };
				1878
				1879	void __init setup_per_cpu_pageset()
				1880	{
				1881	int err;
				1882
				1883	/* Initialize per_cpu_pageset for cpu 0.
				1884	* A cpuup callback will do this for every cpu
				1885	* as it comes online
				1886	*/
				1887	err = process_zones(smp_processor_id());
				1888	BUG_ON(err);
				1889	register_cpu_notifier(&pageset_notifier);
				1890	}
				1891
				1892	#endif
				1893
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1894	static __devinit
				1895	void zone_wait_table_init(struct zone *zone, unsigned long zone_size_pages)
				1896	{
				1897	int i;
				1898	struct pglist_data *pgdat = zone->zone_pgdat;
				1899
				1900	/*
				1901	* The per-page waitqueue mechanism uses hashed waitqueues
				1902	* per zone.
				1903	*/
				1904	zone->wait_table_size = wait_table_size(zone_size_pages);
				1905	zone->wait_table_bits = wait_table_bits(zone->wait_table_size);
				1906	zone->wait_table = (wait_queue_head_t *)
				1907	alloc_bootmem_node(pgdat, zone->wait_table_size
				1908	* sizeof(wait_queue_head_t));
				1909
				1910	for(i = 0; i < zone->wait_table_size; ++i)
				1911	init_waitqueue_head(zone->wait_table + i);
				1912	}
				1913
				1914	static __devinit void zone_pcp_init(struct zone *zone)
				1915	{
				1916	int cpu;
				1917	unsigned long batch = zone_batchsize(zone);
				1918
				1919	for (cpu = 0; cpu < NR_CPUS; cpu++) {
				1920	#ifdef CONFIG_NUMA
				1921	/* Early boot. Slab allocator not functional yet */
				1922	zone->pageset[cpu] = &boot_pageset[cpu];
				1923	setup_pageset(&boot_pageset[cpu],0);
				1924	#else
				1925	setup_pageset(zone_pcp(zone,cpu), batch);
				1926	#endif
				1927	}
				1928	printk(KERN_DEBUG " %s zone: %lu pages, LIFO batch:%lu\n",
				1929	zone->name, zone->present_pages, batch);
				1930	}
				1931
				1932	static __devinit void init_currently_empty_zone(struct zone *zone,
				1933	unsigned long zone_start_pfn, unsigned long size)
				1934	{
				1935	struct pglist_data *pgdat = zone->zone_pgdat;
				1936
				1937	zone_wait_table_init(zone, size);
				1938	pgdat->nr_zones = zone_idx(zone) + 1;
				1939
				1940	zone->zone_mem_map = pfn_to_page(zone_start_pfn);
				1941	zone->zone_start_pfn = zone_start_pfn;
				1942
				1943	memmap_init(size, pgdat->node_id, zone_idx(zone), zone_start_pfn);
				1944
				1945	zone_init_free_lists(pgdat, zone, zone->spanned_pages);
				1946	}
				1947
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1948	/*
				1949	* Set up the zone data structures:
				1950	* - mark all pages reserved
				1951	* - mark all memory queues empty
				1952	* - clear the memory bitmaps
				1953	*/
				1954	static void __init free_area_init_core(struct pglist_data *pgdat,
				1955	unsigned long zones_size, unsigned long zholes_size)
				1956	{
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1957	unsigned long j;
				1958	int nid = pgdat->node_id;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1959	unsigned long zone_start_pfn = pgdat->node_start_pfn;
				1960
				1961	pgdat->nr_zones = 0;
				1962	init_waitqueue_head(&pgdat->kswapd_wait);
				1963	pgdat->kswapd_max_order = 0;
				1964
				1965	for (j = 0; j < MAX_NR_ZONES; j++) {
				1966	struct zone *zone = pgdat->node_zones + j;
				1967	unsigned long size, realsize;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1968
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1969	realsize = size = zones_size[j];
				1970	if (zholes_size)
				1971	realsize -= zholes_size[j];
				1972
				1973	if (j == ZONE_DMA \|\| j == ZONE_NORMAL)
				1974	nr_kernel_pages += realsize;
				1975	nr_all_pages += realsize;
				1976
				1977	zone->spanned_pages = size;
				1978	zone->present_pages = realsize;
				1979	zone->name = zone_names[j];
				1980	spin_lock_init(&zone->lock);
				1981	spin_lock_init(&zone->lru_lock);
				1982	zone->zone_pgdat = pgdat;
				1983	zone->free_pages = 0;
				1984
				1985	zone->temp_priority = zone->prev_priority = DEF_PRIORITY;
				1986
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1987	zone_pcp_init(zone);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1988	INIT_LIST_HEAD(&zone->active_list);
				1989	INIT_LIST_HEAD(&zone->inactive_list);
				1990	zone->nr_scan_active = 0;
				1991	zone->nr_scan_inactive = 0;
				1992	zone->nr_active = 0;
				1993	zone->nr_inactive = 0;
Martin Hicks	53e9a61	2005-09-03 15:54:51 -0700	[diff] [blame]	1994	atomic_set(&zone->reclaim_in_progress, 0);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	1995	if (!size)
				1996	continue;
				1997
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	1998	zonetable_add(zone, nid, j, zone_start_pfn, size);
Dave Hansen	ed8ece2	2005-10-29 18:16:50 -0700	[diff] [blame]	1999	init_currently_empty_zone(zone, zone_start_pfn, size);
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2000	zone_start_pfn += size;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2001	}
				2002	}
				2003
				2004	static void __init alloc_node_mem_map(struct pglist_data *pgdat)
				2005	{
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2006	/* Skip empty nodes */
				2007	if (!pgdat->node_spanned_pages)
				2008	return;
				2009
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2010	#ifdef CONFIG_FLAT_NODE_MEM_MAP
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2011	/* ia64 gets its own node_mem_map, before this, without bootmem */
				2012	if (!pgdat->node_mem_map) {
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2013	unsigned long size;
				2014	struct page *map;
				2015
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2016	size = (pgdat->node_spanned_pages + 1) * sizeof(struct page);
Dave Hansen	6f167ec	2005-06-23 00:07:39 -0700	[diff] [blame]	2017	map = alloc_remap(pgdat->node_id, size);
				2018	if (!map)
				2019	map = alloc_bootmem_node(pgdat, size);
				2020	pgdat->node_mem_map = map;
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2021	}
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2022	#ifdef CONFIG_FLATMEM
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2023	/*
				2024	* With no DISCONTIG, the global mem_map is just set as node 0's
				2025	*/
				2026	if (pgdat == NODE_DATA(0))
				2027	mem_map = NODE_DATA(0)->node_mem_map;
				2028	#endif
Andy Whitcroft	d41dee3	2005-06-23 00:07:54 -0700	[diff] [blame]	2029	#endif /* CONFIG_FLAT_NODE_MEM_MAP */
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2030	}
				2031
				2032	void __init free_area_init_node(int nid, struct pglist_data *pgdat,
				2033	unsigned long *zones_size, unsigned long node_start_pfn,
				2034	unsigned long *zholes_size)
				2035	{
				2036	pgdat->node_id = nid;
				2037	pgdat->node_start_pfn = node_start_pfn;
				2038	calculate_zone_totalpages(pgdat, zones_size, zholes_size);
				2039
				2040	alloc_node_mem_map(pgdat);
				2041
				2042	free_area_init_core(pgdat, zones_size, zholes_size);
				2043	}
				2044
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2045	#ifndef CONFIG_NEED_MULTIPLE_NODES
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2046	static bootmem_data_t contig_bootmem_data;
				2047	struct pglist_data contig_page_data = { .bdata = &contig_bootmem_data };
				2048
				2049	EXPORT_SYMBOL(contig_page_data);
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2050	#endif
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2051
				2052	void __init free_area_init(unsigned long *zones_size)
				2053	{
Dave Hansen	93b7504	2005-06-23 00:07:47 -0700	[diff] [blame]	2054	free_area_init_node(0, NODE_DATA(0), zones_size,
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2055	__pa(PAGE_OFFSET) >> PAGE_SHIFT, NULL);
				2056	}
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2057
				2058	#ifdef CONFIG_PROC_FS
				2059
				2060	#include <linux/seq_file.h>
				2061
				2062	static void frag_start(struct seq_file m, loff_t *pos)
				2063	{
				2064	pg_data_t *pgdat;
				2065	loff_t node = *pos;
				2066
				2067	for (pgdat = pgdat_list; pgdat && node; pgdat = pgdat->pgdat_next)
				2068	--node;
				2069
				2070	return pgdat;
				2071	}
				2072
				2073	static void frag_next(struct seq_file m, void arg, loff_t pos)
				2074	{
				2075	pg_data_t pgdat = (pg_data_t )arg;
				2076
				2077	(*pos)++;
				2078	return pgdat->pgdat_next;
				2079	}
				2080
				2081	static void frag_stop(struct seq_file m, void arg)
				2082	{
				2083	}
				2084
				2085	/*
				2086	* This walks the free areas for each zone.
				2087	*/
				2088	static int frag_show(struct seq_file m, void arg)
				2089	{
				2090	pg_data_t pgdat = (pg_data_t )arg;
				2091	struct zone *zone;
				2092	struct zone *node_zones = pgdat->node_zones;
				2093	unsigned long flags;
				2094	int order;
				2095
				2096	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; ++zone) {
				2097	if (!zone->present_pages)
				2098	continue;
				2099
				2100	spin_lock_irqsave(&zone->lock, flags);
				2101	seq_printf(m, "Node %d, zone %8s ", pgdat->node_id, zone->name);
				2102	for (order = 0; order < MAX_ORDER; ++order)
				2103	seq_printf(m, "%6lu ", zone->free_area[order].nr_free);
				2104	spin_unlock_irqrestore(&zone->lock, flags);
				2105	seq_putc(m, '\n');
				2106	}
				2107	return 0;
				2108	}
				2109
				2110	struct seq_operations fragmentation_op = {
				2111	.start = frag_start,
				2112	.next = frag_next,
				2113	.stop = frag_stop,
				2114	.show = frag_show,
				2115	};
				2116
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2117	/*
				2118	* Output information about zones in @pgdat.
				2119	*/
				2120	static int zoneinfo_show(struct seq_file m, void arg)
				2121	{
				2122	pg_data_t *pgdat = arg;
				2123	struct zone *zone;
				2124	struct zone *node_zones = pgdat->node_zones;
				2125	unsigned long flags;
				2126
				2127	for (zone = node_zones; zone - node_zones < MAX_NR_ZONES; zone++) {
				2128	int i;
				2129
				2130	if (!zone->present_pages)
				2131	continue;
				2132
				2133	spin_lock_irqsave(&zone->lock, flags);
				2134	seq_printf(m, "Node %d, zone %8s", pgdat->node_id, zone->name);
				2135	seq_printf(m,
				2136	"\n pages free %lu"
				2137	"\n min %lu"
				2138	"\n low %lu"
				2139	"\n high %lu"
				2140	"\n active %lu"
				2141	"\n inactive %lu"
				2142	"\n scanned %lu (a: %lu i: %lu)"
				2143	"\n spanned %lu"
				2144	"\n present %lu",
				2145	zone->free_pages,
				2146	zone->pages_min,
				2147	zone->pages_low,
				2148	zone->pages_high,
				2149	zone->nr_active,
				2150	zone->nr_inactive,
				2151	zone->pages_scanned,
				2152	zone->nr_scan_active, zone->nr_scan_inactive,
				2153	zone->spanned_pages,
				2154	zone->present_pages);
				2155	seq_printf(m,
				2156	"\n protection: (%lu",
				2157	zone->lowmem_reserve[0]);
				2158	for (i = 1; i < ARRAY_SIZE(zone->lowmem_reserve); i++)
				2159	seq_printf(m, ", %lu", zone->lowmem_reserve[i]);
				2160	seq_printf(m,
				2161	")"
				2162	"\n pagesets");
				2163	for (i = 0; i < ARRAY_SIZE(zone->pageset); i++) {
				2164	struct per_cpu_pageset *pageset;
				2165	int j;
				2166
Christoph Lameter	e7c8d5c	2005-06-21 17:14:47 -0700	[diff] [blame]	2167	pageset = zone_pcp(zone, i);
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2168	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2169	if (pageset->pcp[j].count)
				2170	break;
				2171	}
				2172	if (j == ARRAY_SIZE(pageset->pcp))
				2173	continue;
				2174	for (j = 0; j < ARRAY_SIZE(pageset->pcp); j++) {
				2175	seq_printf(m,
				2176	"\n cpu: %i pcp: %i"
				2177	"\n count: %i"
				2178	"\n low: %i"
				2179	"\n high: %i"
				2180	"\n batch: %i",
				2181	i, j,
				2182	pageset->pcp[j].count,
				2183	pageset->pcp[j].low,
				2184	pageset->pcp[j].high,
				2185	pageset->pcp[j].batch);
				2186	}
				2187	#ifdef CONFIG_NUMA
				2188	seq_printf(m,
				2189	"\n numa_hit: %lu"
				2190	"\n numa_miss: %lu"
				2191	"\n numa_foreign: %lu"
				2192	"\n interleave_hit: %lu"
				2193	"\n local_node: %lu"
				2194	"\n other_node: %lu",
				2195	pageset->numa_hit,
				2196	pageset->numa_miss,
				2197	pageset->numa_foreign,
				2198	pageset->interleave_hit,
				2199	pageset->local_node,
				2200	pageset->other_node);
				2201	#endif
				2202	}
				2203	seq_printf(m,
				2204	"\n all_unreclaimable: %u"
				2205	"\n prev_priority: %i"
				2206	"\n temp_priority: %i"
				2207	"\n start_pfn: %lu",
				2208	zone->all_unreclaimable,
				2209	zone->prev_priority,
				2210	zone->temp_priority,
				2211	zone->zone_start_pfn);
				2212	spin_unlock_irqrestore(&zone->lock, flags);
				2213	seq_putc(m, '\n');
				2214	}
				2215	return 0;
				2216	}
				2217
				2218	struct seq_operations zoneinfo_op = {
				2219	.start = frag_start, /* iterate over all zones. The same as in
				2220	* fragmentation. */
				2221	.next = frag_next,
				2222	.stop = frag_stop,
				2223	.show = zoneinfo_show,
				2224	};
				2225
/*
 * Names of the counters printed by vmstat_show().  The table is indexed
 * by the offset, in unsigned longs, of a counter within the page_state
 * snapshot, so the order here has to follow that structure's layout.
 */
static char *vmstat_text[] = {
	"nr_dirty",
	"nr_writeback",
	"nr_unstable",
	"nr_page_table_pages",
	"nr_mapped",
	"nr_slab",

	"pgpgin",
	"pgpgout",
	"pswpin",
	"pswpout",
	"pgalloc_high",

	"pgalloc_normal",
	"pgalloc_dma",
	"pgfree",
	"pgactivate",
	"pgdeactivate",

	"pgfault",
	"pgmajfault",
	"pgrefill_high",
	"pgrefill_normal",
	"pgrefill_dma",

	"pgsteal_high",
	"pgsteal_normal",
	"pgsteal_dma",
	"pgscan_kswapd_high",
	"pgscan_kswapd_normal",

	"pgscan_kswapd_dma",
	"pgscan_direct_high",
	"pgscan_direct_normal",
	"pgscan_direct_dma",
	"pginodesteal",

	"slabs_scanned",
	"kswapd_steal",
	"kswapd_inodesteal",
	"pageoutrun",
	"allocstall",

	"pgrotated",
	"nr_bounce",
};
				2273
				2274	static void vmstat_start(struct seq_file m, loff_t *pos)
				2275	{
				2276	struct page_state *ps;
				2277
				2278	if (*pos >= ARRAY_SIZE(vmstat_text))
				2279	return NULL;
				2280
				2281	ps = kmalloc(sizeof(*ps), GFP_KERNEL);
				2282	m->private = ps;
				2283	if (!ps)
				2284	return ERR_PTR(-ENOMEM);
				2285	get_full_page_state(ps);
				2286	ps->pgpgin /= 2; /* sectors -> kbytes */
				2287	ps->pgpgout /= 2;
				2288	return (unsigned long )ps + pos;
				2289	}
				2290
				2291	static void vmstat_next(struct seq_file m, void arg, loff_t pos)
				2292	{
				2293	(*pos)++;
				2294	if (*pos >= ARRAY_SIZE(vmstat_text))
				2295	return NULL;
				2296	return (unsigned long )m->private + pos;
				2297	}
				2298
				2299	static int vmstat_show(struct seq_file m, void arg)
				2300	{
				2301	unsigned long *l = arg;
				2302	unsigned long off = l - (unsigned long *)m->private;
				2303
				2304	seq_printf(m, "%s %lu\n", vmstat_text[off], *l);
				2305	return 0;
				2306	}
				2307
				2308	static void vmstat_stop(struct seq_file m, void arg)
				2309	{
				2310	kfree(m->private);
				2311	m->private = NULL;
				2312	}
				2313
				2314	struct seq_operations vmstat_op = {
				2315	.start = vmstat_start,
				2316	.next = vmstat_next,
				2317	.stop = vmstat_stop,
				2318	.show = vmstat_show,
				2319	};
				2320
				2321	#endif /* CONFIG_PROC_FS */
				2322
				2323	#ifdef CONFIG_HOTPLUG_CPU
				2324	static int page_alloc_cpu_notify(struct notifier_block *self,
				2325	unsigned long action, void *hcpu)
				2326	{
				2327	int cpu = (unsigned long)hcpu;
				2328	long *count;
				2329	unsigned long src, dest;
				2330
				2331	if (action == CPU_DEAD) {
				2332	int i;
				2333
				2334	/* Drain local pagecache count. */
				2335	count = &per_cpu(nr_pagecache_local, cpu);
				2336	atomic_add(*count, &nr_pagecache);
				2337	*count = 0;
				2338	local_irq_disable();
				2339	__drain_pages(cpu);
				2340
				2341	/* Add dead cpu's page_states to our own. */
				2342	dest = (unsigned long *)&__get_cpu_var(page_states);
				2343	src = (unsigned long *)&per_cpu(page_states, cpu);
				2344
				2345	for (i = 0; i < sizeof(struct page_state)/sizeof(unsigned long);
				2346	i++) {
				2347	dest[i] += src[i];
				2348	src[i] = 0;
				2349	}
				2350
				2351	local_irq_enable();
				2352	}
				2353	return NOTIFY_OK;
				2354	}
				2355	#endif /* CONFIG_HOTPLUG_CPU */
				2356
				2357	void __init page_alloc_init(void)
				2358	{
				2359	hotcpu_notifier(page_alloc_cpu_notify, 0);
				2360	}
				2361
				2362	/*
				2363	* setup_per_zone_lowmem_reserve - called whenever
				2364	* sysctl_lower_zone_reserve_ratio changes. Ensures that each zone
				2365	* has a correct pages reserved value, so an adequate number of
				2366	* pages are left in the zone after a successful __alloc_pages().
				2367	*/
				2368	static void setup_per_zone_lowmem_reserve(void)
				2369	{
				2370	struct pglist_data *pgdat;
				2371	int j, idx;
				2372
				2373	for_each_pgdat(pgdat) {
				2374	for (j = 0; j < MAX_NR_ZONES; j++) {
				2375	struct zone *zone = pgdat->node_zones + j;
				2376	unsigned long present_pages = zone->present_pages;
				2377
				2378	zone->lowmem_reserve[j] = 0;
				2379
				2380	for (idx = j-1; idx >= 0; idx--) {
				2381	struct zone *lower_zone;
				2382
				2383	if (sysctl_lowmem_reserve_ratio[idx] < 1)
				2384	sysctl_lowmem_reserve_ratio[idx] = 1;
				2385
				2386	lower_zone = pgdat->node_zones + idx;
				2387	lower_zone->lowmem_reserve[j] = present_pages /
				2388	sysctl_lowmem_reserve_ratio[idx];
				2389	present_pages += lower_zone->present_pages;
				2390	}
				2391	}
				2392	}
				2393	}
				2394
				2395	/*
				2396	* setup_per_zone_pages_min - called when min_free_kbytes changes. Ensures
				2397	* that the pages_{min,low,high} values for each zone are set correctly
				2398	* with respect to min_free_kbytes.
				2399	*/
				2400	static void setup_per_zone_pages_min(void)
				2401	{
				2402	unsigned long pages_min = min_free_kbytes >> (PAGE_SHIFT - 10);
				2403	unsigned long lowmem_pages = 0;
				2404	struct zone *zone;
				2405	unsigned long flags;
				2406
				2407	/* Calculate total number of !ZONE_HIGHMEM pages */
				2408	for_each_zone(zone) {
				2409	if (!is_highmem(zone))
				2410	lowmem_pages += zone->present_pages;
				2411	}
				2412
				2413	for_each_zone(zone) {
				2414	spin_lock_irqsave(&zone->lru_lock, flags);
				2415	if (is_highmem(zone)) {
				2416	/*
				2417	* Often, highmem doesn't need to reserve any pages.
				2418	* But the pages_min/low/high values are also used for
				2419	* batching up page reclaim activity so we need a
				2420	* decent value here.
				2421	*/
				2422	int min_pages;
				2423
				2424	min_pages = zone->present_pages / 1024;
				2425	if (min_pages < SWAP_CLUSTER_MAX)
				2426	min_pages = SWAP_CLUSTER_MAX;
				2427	if (min_pages > 128)
				2428	min_pages = 128;
				2429	zone->pages_min = min_pages;
				2430	} else {
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2431	/* if it's a lowmem zone, reserve a number of pages
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2432	* proportionate to the zone's size.
				2433	*/
Nikita Danilov	295ab93	2005-06-21 17:14:38 -0700	[diff] [blame]	2434	zone->pages_min = (pages_min * zone->present_pages) /
Linus Torvalds	1da177e	2005-04-16 15:20:36 -0700	[diff] [blame]	2435	lowmem_pages;
				2436	}
				2437
				2438	/*
				2439	* When interpreting these watermarks, just keep in mind that:
				2440	* zone->pages_min == (zone->pages_min * 4) / 4;
				2441	*/
				2442	zone->pages_low = (zone->pages_min * 5) / 4;
				2443	zone->pages_high = (zone->pages_min * 6) / 4;
				2444	spin_unlock_irqrestore(&zone->lru_lock, flags);
				2445	}
				2446	}
				2447
				2448	/*
				2449	* Initialise min_free_kbytes.
				2450	*
				2451	* For small machines we want it small (128k min). For large machines
				2452	* we want it large (64MB max). But it is not linear, because network
				2453	* bandwidth does not increase linearly with machine size. We use
				2454	*
				2455	* min_free_kbytes = 4 * sqrt(lowmem_kbytes), for better accuracy:
				2456	* min_free_kbytes = sqrt(lowmem_kbytes * 16)
				2457	*
				2458	* which yields
				2459	*
				2460	* 16MB: 512k
				2461	* 32MB: 724k
				2462	* 64MB: 1024k
				2463	* 128MB: 1448k
				2464	* 256MB: 2048k
				2465	* 512MB: 2896k
				2466	* 1024MB: 4096k
				2467	* 2048MB: 5792k
				2468	* 4096MB: 8192k
				2469	* 8192MB: 11584k
				2470	* 16384MB: 16384k
				2471	*/
				2472	static int __init init_per_zone_pages_min(void)
				2473	{
				2474	unsigned long lowmem_kbytes;
				2475
				2476	lowmem_kbytes = nr_free_buffer_pages() * (PAGE_SIZE >> 10);
				2477
				2478	min_free_kbytes = int_sqrt(lowmem_kbytes * 16);
				2479	if (min_free_kbytes < 128)
				2480	min_free_kbytes = 128;
				2481	if (min_free_kbytes > 65536)
				2482	min_free_kbytes = 65536;
				2483	setup_per_zone_pages_min();
				2484	setup_per_zone_lowmem_reserve();
				2485	return 0;
				2486	}
				2487	module_init(init_per_zone_pages_min)
				2488
				2489	/*
				2490	* min_free_kbytes_sysctl_handler - just a wrapper around proc_dointvec() so
				2491	* that we can call two helper functions whenever min_free_kbytes
				2492	* changes.
				2493	*/
				2494	int min_free_kbytes_sysctl_handler(ctl_table *table, int write,
				2495	struct file file, void __user buffer, size_t length, loff_t ppos)
				2496	{
				2497	proc_dointvec(table, write, file, buffer, length, ppos);
				2498	setup_per_zone_pages_min();
				2499	return 0;
				2500	}
				2501
				2502	/*
				2503	* lowmem_reserve_ratio_sysctl_handler - just a wrapper around
				2504	* proc_dointvec() so that we can call setup_per_zone_lowmem_reserve()
				2505	* whenever sysctl_lowmem_reserve_ratio changes.
				2506	*
				2507	* The reserve ratio obviously has absolutely no relation with the
				2508	* pages_min watermarks. The lowmem reserve ratio can only make sense
				2509	* if in function of the boot time zone sizes.
				2510	*/
				2511	int lowmem_reserve_ratio_sysctl_handler(ctl_table *table, int write,
				2512	struct file file, void __user buffer, size_t length, loff_t ppos)
				2513	{
				2514	proc_dointvec_minmax(table, write, file, buffer, length, ppos);
				2515	setup_per_zone_lowmem_reserve();
				2516	return 0;
				2517	}
				2518
				2519	__initdata int hashdist = HASHDIST_DEFAULT;
				2520
				#ifdef CONFIG_NUMA
				/*
				 * Handler for the "hashdist=" boot parameter: stores the numeric value
				 * of the argument (base auto-detected by simple_strtoul()) in hashdist.
				 * Returns 0 when no argument string was supplied, 1 otherwise.
				 */
				static int __init set_hashdist(char *str)
				{
					char *end;

					if (str == NULL)
						return 0;

					hashdist = simple_strtoul(str, &end, 0);
					return 1;
				}
				__setup("hashdist=", set_hashdist);
				#endif
				2531
				2532	/*
				2533	* allocate a large system hash table from bootmem
				2534	* - it is assumed that the hash table must contain an exact power-of-2
				2535	* quantity of entries
				2536	* - limit is the number of hash buckets, not the total allocation size
				2537	*/
				2538	void __init alloc_large_system_hash(const char tablename,
				2539	unsigned long bucketsize,
				2540	unsigned long numentries,
				2541	int scale,
				2542	int flags,
				2543	unsigned int *_hash_shift,
				2544	unsigned int *_hash_mask,
				2545	unsigned long limit)
				2546	{
				2547	unsigned long long max = limit;
				2548	unsigned long log2qty, size;
				2549	void *table = NULL;
				2550
				2551	/* allow the kernel cmdline to have a say */
				2552	if (!numentries) {
				2553	/* round applicable memory size up to nearest megabyte */
				2554	numentries = (flags & HASH_HIGHMEM) ? nr_all_pages : nr_kernel_pages;
				2555	numentries += (1UL << (20 - PAGE_SHIFT)) - 1;
				2556	numentries >>= 20 - PAGE_SHIFT;
				2557	numentries <<= 20 - PAGE_SHIFT;
				2558
				2559	/* limit to 1 bucket per 2^scale bytes of low memory */
				2560	if (scale > PAGE_SHIFT)
				2561	numentries >>= (scale - PAGE_SHIFT);
				2562	else
				2563	numentries <<= (PAGE_SHIFT - scale);
				2564	}
				2565	/* rounded up to nearest power of 2 in size */
				2566	numentries = 1UL << (long_log2(numentries) + 1);
				2567
				2568	/* limit allocation size to 1/16 total memory by default */
				2569	if (max == 0) {
				2570	max = ((unsigned long long)nr_all_pages << PAGE_SHIFT) >> 4;
				2571	do_div(max, bucketsize);
				2572	}
				2573
				2574	if (numentries > max)
				2575	numentries = max;
				2576
				2577	log2qty = long_log2(numentries);
				2578
				2579	do {
				2580	size = bucketsize << log2qty;
				2581	if (flags & HASH_EARLY)
				2582	table = alloc_bootmem(size);
				2583	else if (hashdist)
				2584	table = __vmalloc(size, GFP_ATOMIC, PAGE_KERNEL);
				2585	else {
				2586	unsigned long order;
				2587	for (order = 0; ((1UL << order) << PAGE_SHIFT) < size; order++)
				2588	;
				2589	table = (void*) __get_free_pages(GFP_ATOMIC, order);
				2590	}
				2591	} while (!table && size > PAGE_SIZE && --log2qty);
				2592
				2593	if (!table)
				2594	panic("Failed to allocate %s hash table\n", tablename);
				2595
				2596	printk("%s hash table entries: %d (order: %d, %lu bytes)\n",
				2597	tablename,
				2598	(1U << log2qty),
				2599	long_log2(size) - PAGE_SHIFT,
				2600	size);
				2601
				2602	if (_hash_shift)
				2603	*_hash_shift = log2qty;
				2604	if (_hash_mask)
				2605	*_hash_mask = (1 << log2qty) - 1;
				2606
				2607	return table;
				2608	}