[IA64] Percpu quicklist for combined allocator for pgd/pmd/pte.
This patch introduces using the quicklists for pgd, pmd, and pte levels
by combining the alloc and free functions into a common set of routines.
This greatly simplifies the reading of this header file.
This patch is simple but necessary for large numa configurations.
It simply ensures that only pages from the local node are added to a
cpus quicklist. This prevents the trapping of pages on a remote nodes
quicklist by starting a process, touching a large number of pages to
fill pmd and pte entries, migrating to another node, and then unmapping
or exiting. With those conditions, the pages get trapped and if the
machine has more than 100 nodes of the same size, the calculation of
the pgtable high water mark will be larger than any single node so page
table cache flushing will never occur.
I ran lmbench lat_proc fork and lat_proc exec on a zx1 with and without
this patch and did not notice any change.
On an sn2 machine, there was a slight improvement which is possibly
due to pages from other nodes trapped on the test node before starting
the run. I did not investigate further.
This patch shrinks the quicklist based upon free memory on the node
instead of the high/low water marks. I have written it to enable
preemption periodically and recalculate the amount to shrink every time
we have freed enough pages that the quicklist size should have grown.
I rescan the nodes zones each pass because other processess may be
draining node memory at the same time as we are adding.
Signed-off-by: Robin Holt <holt@sgi.com>
Signed-off-by: Tony Luck <tony.luck@intel.com>
diff --git a/arch/ia64/mm/contig.c b/arch/ia64/mm/contig.c
index 6daf15a..91a055f 100644
--- a/arch/ia64/mm/contig.c
+++ b/arch/ia64/mm/contig.c
@@ -61,7 +61,8 @@
printk("%d reserved pages\n", reserved);
printk("%d pages shared\n", shared);
printk("%d pages swap cached\n", cached);
- printk("%ld pages in page table cache\n", pgtable_cache_size);
+ printk("%ld pages in page table cache\n",
+ pgtable_quicklist_total_size());
}
/* physical address where the bootmem map is located */
diff --git a/arch/ia64/mm/discontig.c b/arch/ia64/mm/discontig.c
index 3456a9b..c007109 100644
--- a/arch/ia64/mm/discontig.c
+++ b/arch/ia64/mm/discontig.c
@@ -582,7 +582,8 @@
printk("%d reserved pages\n", total_reserved);
printk("%d pages shared\n", total_shared);
printk("%d pages swap cached\n", total_cached);
- printk("Total of %ld pages in page table cache\n", pgtable_cache_size);
+ printk("Total of %ld pages in page table cache\n",
+ pgtable_quicklist_total_size());
printk("%d free buffer pages\n", nr_free_buffer_pages());
}
diff --git a/arch/ia64/mm/init.c b/arch/ia64/mm/init.c
index 65cf839..4892be5 100644
--- a/arch/ia64/mm/init.c
+++ b/arch/ia64/mm/init.c
@@ -39,6 +39,9 @@
DEFINE_PER_CPU(struct mmu_gather, mmu_gathers);
+DEFINE_PER_CPU(unsigned long *, __pgtable_quicklist);
+DEFINE_PER_CPU(long, __pgtable_quicklist_size);
+
extern void ia64_tlb_init (void);
unsigned long MAX_DMA_ADDRESS = PAGE_OFFSET + 0x100000000UL;
@@ -50,27 +53,53 @@
EXPORT_SYMBOL(vmem_map);
#endif
-static int pgt_cache_water[2] = { 25, 50 };
-
-struct page *zero_page_memmap_ptr; /* map entry for zero page */
+struct page *zero_page_memmap_ptr; /* map entry for zero page */
EXPORT_SYMBOL(zero_page_memmap_ptr);
-void
-check_pgt_cache (void)
-{
- int low, high;
+#define MIN_PGT_PAGES 25UL
+#define MAX_PGT_FREES_PER_PASS 16
+#define PGT_FRACTION_OF_NODE_MEM 16
- low = pgt_cache_water[0];
- high = pgt_cache_water[1];
+static inline long
+max_pgt_pages(void)
+{
+ u64 node_free_pages, max_pgt_pages;
+
+#ifndef CONFIG_NUMA
+ node_free_pages = nr_free_pages();
+#else
+ node_free_pages = nr_free_pages_pgdat(NODE_DATA(numa_node_id()));
+#endif
+ max_pgt_pages = node_free_pages / PGT_FRACTION_OF_NODE_MEM;
+ max_pgt_pages = max(max_pgt_pages, MIN_PGT_PAGES);
+ return max_pgt_pages;
+}
+
+static inline long
+min_pages_to_free(void)
+{
+ long pages_to_free;
+
+ pages_to_free = pgtable_quicklist_size - max_pgt_pages();
+ pages_to_free = min(pages_to_free, MAX_PGT_FREES_PER_PASS);
+ return pages_to_free;
+}
+
+void
+check_pgt_cache(void)
+{
+ long pages_to_free;
+
+ if (unlikely(pgtable_quicklist_size <= MIN_PGT_PAGES))
+ return;
preempt_disable();
- if (pgtable_cache_size > (u64) high) {
- do {
- if (pgd_quicklist)
- free_page((unsigned long)pgd_alloc_one_fast(NULL));
- if (pmd_quicklist)
- free_page((unsigned long)pmd_alloc_one_fast(NULL, 0));
- } while (pgtable_cache_size > (u64) low);
+ while (unlikely((pages_to_free = min_pages_to_free()) > 0)) {
+ while (pages_to_free--) {
+ free_page((unsigned long)pgtable_quicklist_alloc());
+ }
+ preempt_enable();
+ preempt_disable();
}
preempt_enable();
}
@@ -523,11 +552,14 @@
mem_init (void)
{
long reserved_pages, codesize, datasize, initsize;
- unsigned long num_pgt_pages;
pg_data_t *pgdat;
int i;
static struct kcore_list kcore_mem, kcore_vmem, kcore_kernel;
+ BUG_ON(PTRS_PER_PGD * sizeof(pgd_t) != PAGE_SIZE);
+ BUG_ON(PTRS_PER_PMD * sizeof(pmd_t) != PAGE_SIZE);
+ BUG_ON(PTRS_PER_PTE * sizeof(pte_t) != PAGE_SIZE);
+
#ifdef CONFIG_PCI
/*
* This needs to be called _after_ the command line has been parsed but _before_
@@ -564,18 +596,6 @@
num_physpages << (PAGE_SHIFT - 10), codesize >> 10,
reserved_pages << (PAGE_SHIFT - 10), datasize >> 10, initsize >> 10);
- /*
- * Allow for enough (cached) page table pages so that we can map the entire memory
- * at least once. Each task also needs a couple of page tables pages, so add in a
- * fudge factor for that (don't use "threads-max" here; that would be wrong!).
- * Don't allow the cache to be more than 10% of total memory, though.
- */
-# define NUM_TASKS 500 /* typical number of tasks */
- num_pgt_pages = nr_free_pages() / PTRS_PER_PGD + NUM_TASKS;
- if (num_pgt_pages > nr_free_pages() / 10)
- num_pgt_pages = nr_free_pages() / 10;
- if (num_pgt_pages > (u64) pgt_cache_water[1])
- pgt_cache_water[1] = num_pgt_pages;
/*
* For fsyscall entrpoints with no light-weight handler, use the ordinary