[PATCH] VM: early zone reclaim

This is the core of the (much simplified) early reclaim. The goal of this patch is to reclaim some easily-freed pages from a zone before falling back onto another zone.

One of the major uses of this is NUMA machines. With the default allocator behavior the allocator would look for memory in another zone, which might be off-node, before trying to reclaim from the current zone.

This adds a zone tuneable to enable early zone reclaim. It is selected on a per-zone basis and is turned on/off via syscall.

Adding some extra throttling on the reclaim was also required (patch 4/4). Without it, the machine would grind to a crawl when doing a "make -j" kernel build. Even with this patch the System Time is higher on average, but it seems tolerable. Here are some numbers for kernbench runs on a 2-node, 4cpu, 8Gig RAM Altix in the "make -j" run:

                      wall  user  sys  %cpu  ctx sw.  sleeps
                      ----  ----  ---  ----  -------  ------
No patch              1009  1384  847   258   298170  504402
w/patch, no reclaim    880  1376  667   288   254064  396745
w/patch & reclaim     1079  1385  926   252   291625  548873

These numbers are the average of 2 runs of 3 "make -j" runs done right after system boot. Run-to-run variability for "make -j" is huge, so these numbers aren't terribly useful except to see that with reclaim the benchmark still finishes in a reasonable amount of time.

I also looked at the NUMA hit/miss stats for the "make -j" runs and the reclaim doesn't make any difference when the machine is thrashing away.

Doing a "make -j8" on a single node that is filled with page cache pages takes 700 seconds with reclaim turned on and 735 seconds without reclaim (due to remote memory accesses).

The simple zone_reclaim syscall program is at http://www.bork.org/~mort/sgi/zone_reclaim.c

Signed-off-by: Martin Hicks <mort@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>

commit: 753ee728964e5afb80c17659cc6c3a6fd0a42fe0 [log] [tgz]
author: Martin Hicks <mort@sgi.com> Tue Jun 21 17:14:41 2005 -0700
committer: Linus Torvalds <torvalds@ppc970.osdl.org> Tue Jun 21 18:46:14 2005 -0700
tree: 41c9a7700d0858c1f77c5bdaba97e5b636f69b06
parent: bfbb38fb808ac23ef44472d05d9bb36edfb49ed0 [diff]
diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index 6cd1ed3..d408afa 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S

@@ -251,7 +251,7 @@
 	.long sys_io_submit
 	.long sys_io_cancel
 	.long sys_fadvise64	/* 250 */
-	.long sys_ni_syscall
+	.long sys_set_zone_reclaim
 	.long sys_exit_group
 	.long sys_lookup_dcookie
 	.long sys_epoll_create

diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index d99316c..b1d5d3d 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S

@@ -1579,7 +1579,7 @@
 	data8 sys_keyctl
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall			// 1275
-	data8 sys_ni_syscall
+	data8 sys_set_zone_reclaim
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall
 	data8 sys_ni_syscall

diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index 61bcc1b..176413f 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h

@@ -256,7 +256,7 @@
 #define __NR_io_submit		248
 #define __NR_io_cancel		249
 #define __NR_fadvise64		250
-
+#define __NR_set_zone_reclaim	251
 #define __NR_exit_group		252
 #define __NR_lookup_dcookie	253
 #define __NR_epoll_create	254

diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 33e26c5..f7f43ec 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h

@@ -263,6 +263,7 @@
 #define __NR_add_key			1271
 #define __NR_request_key		1272
 #define __NR_keyctl			1273
+#define __NR_set_zone_reclaim		1276
 
 #ifdef __KERNEL__
 

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index beacd93..dfc2452 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h

@@ -145,6 +145,12 @@
 	int			all_unreclaimable; /* All pages pinned */
 
 	/*
+	 * Does the allocator try to reclaim pages from the zone as soon
+	 * as it fails a watermark_ok() in __alloc_pages?
+	 */
+	int			reclaim_pages;
+
+	/*
 	 * prev_priority holds the scanning priority for this zone.  It is
 	 * defined as the scanning priority at which we achieved our reclaim
 	 * target at the previous try_to_free_pages() or balance_pgdat()

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 3bbc41b..0d21e682 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h

@@ -173,6 +173,7 @@
 
 /* linux/mm/vmscan.c */
 extern int try_to_free_pages(struct zone **, unsigned int, unsigned int);
+extern int zone_reclaim(struct zone *, unsigned int, unsigned int);
 extern int shrink_all_memory(int);
 extern int vm_swappiness;
 

diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 0dda70e..6f15bea 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c

@@ -77,6 +77,7 @@
 cond_syscall(sys_keyctl);
 cond_syscall(compat_sys_keyctl);
 cond_syscall(compat_sys_socketcall);
+cond_syscall(sys_set_zone_reclaim);
 
 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 40169f0..3c0f69d 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c

@@ -724,6 +724,14 @@
 	return 1;
 }
 
+static inline int
+should_reclaim_zone(struct zone *z, unsigned int gfp_mask)
+{
+	if (!z->reclaim_pages)
+		return 0;
+	return 1;
+}
+
 /*
  * This is the 'heart' of the zoned buddy allocator.
  */
@@ -760,17 +768,32 @@
 
 	classzone_idx = zone_idx(zones[0]);
 
- restart:
+restart:
 	/* Go through the zonelist once, looking for a zone with enough free */
 	for (i = 0; (z = zones[i]) != NULL; i++) {
-
-		if (!zone_watermark_ok(z, order, z->pages_low,
-				       classzone_idx, 0, 0))
-			continue;
+		int do_reclaim = should_reclaim_zone(z, gfp_mask);
 
 		if (!cpuset_zone_allowed(z))
 			continue;
 
+		/*
+		 * If the zone is to attempt early page reclaim then this loop
+		 * will try to reclaim pages and check the watermark a second
+		 * time before giving up and falling back to the next zone.
+		 */
+zone_reclaim_retry:
+		if (!zone_watermark_ok(z, order, z->pages_low,
+				       classzone_idx, 0, 0)) {
+			if (!do_reclaim)
+				continue;
+			else {
+				zone_reclaim(z, gfp_mask, order);
+				/* Only try reclaim once */
+				do_reclaim = 0;
+				goto zone_reclaim_retry;
+			}
+		}
+
 		page = buffered_rmqueue(z, order, gfp_mask);
 		if (page)
 			goto got_pg;

diff --git a/mm/vmscan.c b/mm/vmscan.c
index 6379ddb..7da8469 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c

@@ -1323,3 +1323,67 @@
 }
 
 module_init(kswapd_init)
+
+
+/*
+ * Try to free up some pages from this zone through reclaim.
+ */
+int zone_reclaim(struct zone *zone, unsigned int gfp_mask, unsigned int order)
+{
+	struct scan_control sc;
+	int nr_pages = 1 << order;
+	int total_reclaimed = 0;
+
+	/* The reclaim may sleep, so don't do it if sleep isn't allowed */
+	if (!(gfp_mask & __GFP_WAIT))
+		return 0;
+	if (zone->all_unreclaimable)
+		return 0;
+
+	sc.gfp_mask = gfp_mask;
+	sc.may_writepage = 0;
+	sc.may_swap = 0;
+	sc.nr_mapped = read_page_state(nr_mapped);
+	sc.nr_scanned = 0;
+	sc.nr_reclaimed = 0;
+	/* scan at the highest priority */
+	sc.priority = 0;
+
+	if (nr_pages > SWAP_CLUSTER_MAX)
+		sc.swap_cluster_max = nr_pages;
+	else
+		sc.swap_cluster_max = SWAP_CLUSTER_MAX;
+
+	shrink_zone(zone, &sc);
+	total_reclaimed = sc.nr_reclaimed;
+
+	return total_reclaimed;
+}
+
+asmlinkage long sys_set_zone_reclaim(unsigned int node, unsigned int zone,
+				     unsigned int state)
+{
+	struct zone *z;
+	int i;
+
+	if (node >= MAX_NUMNODES || !node_online(node))
+		return -EINVAL;
+
+	/* This will break if we ever add more zones */
+	if (!(zone & (1<<ZONE_DMA|1<<ZONE_NORMAL|1<<ZONE_HIGHMEM)))
+		return -EINVAL;
+
+	for (i = 0; i < MAX_NR_ZONES; i++) {
+		if (!(zone & 1<<i))
+			continue;
+
+		z = &NODE_DATA(node)->node_zones[i];
+
+		if (state)
+			z->reclaim_pages = 1;
+		else
+			z->reclaim_pages = 0;
+	}
+
+	return 0;
+}
commit	753ee728964e5afb80c17659cc6c3a6fd0a42fe0	[log] [tgz]
author	Martin Hicks <mort@sgi.com>	Tue Jun 21 17:14:41 2005 -0700
committer	Linus Torvalds <torvalds@ppc970.osdl.org>	Tue Jun 21 18:46:14 2005 -0700
tree	41c9a7700d0858c1f77c5bdaba97e5b636f69b06
parent	bfbb38fb808ac23ef44472d05d9bb36edfb49ed0 [diff]