x86: alloc dyn_array all together

Allocating them together lets us save some bootmem memory when the arrays have small alignment.

Also tighten the alignment checking, and print less debug info.

Signed-off-by: Yinghai Lu <yhlu.kernel@gmail.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
diff --git a/arch/x86/kernel/setup_percpu.c b/arch/x86/kernel/setup_percpu.c
index 13ba7a8..2b7dab6 100644
--- a/arch/x86/kernel/setup_percpu.c
+++ b/arch/x86/kernel/setup_percpu.c
@@ -140,26 +140,31 @@
  */
 void __init setup_per_cpu_areas(void)
 {
-	ssize_t size, old_size;
+	ssize_t size, old_size, da_size;
 	char *ptr;
 	int cpu;
+	unsigned long align = 1;
 
 	/* Setup cpu_pda map */
 	setup_cpu_pda_map();
 
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
-	size = old_size + per_cpu_dyn_array_size();
+	da_size = per_cpu_dyn_array_size(&align);
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = roundup(old_size + da_size, align);
 	printk(KERN_INFO "PERCPU: Allocating %zd bytes of per cpu data\n",
 			  size);
 
 	for_each_possible_cpu(cpu) {
 #ifndef CONFIG_NEED_MULTIPLE_NODES
-		ptr = alloc_bootmem_pages(size);
+		ptr = __alloc_bootmem(size, align,
+				 __pa(MAX_DMA_ADDRESS));
 #else
 		int node = early_cpu_to_node(cpu);
 		if (!node_online(node) || !NODE_DATA(node)) {
-			ptr = alloc_bootmem_pages(size);
+			ptr = __alloc_bootmem(size, align,
+					 __pa(MAX_DMA_ADDRESS));
 			printk(KERN_INFO
 			       "cpu %d has no node %d or node-local memory\n",
 				cpu, node);
@@ -168,7 +173,8 @@
 					 cpu, __pa(ptr));
 		}
 		else {
-			ptr = alloc_bootmem_pages_node(NODE_DATA(node), size);
+			ptr = __alloc_bootmem_node(NODE_DATA(node), size, align,
+							__pa(MAX_DMA_ADDRESS));
 			if (ptr)
 				printk(KERN_DEBUG "per cpu data for cpu%d on node%d at %016lx\n",
 					 cpu, node, __pa(ptr));
diff --git a/include/linux/init.h b/include/linux/init.h
index 3328068..59fbb4a 100644
--- a/include/linux/init.h
+++ b/include/linux/init.h
@@ -288,7 +288,7 @@
 	DEFINE_PER_CPU_DYN_ARRAY_ADDR(nameX, nameX, nrX, alignX, init_workX)
 
 extern void pre_alloc_dyn_array(void);
-extern unsigned long per_cpu_dyn_array_size(void);
+extern unsigned long per_cpu_dyn_array_size(unsigned long *align);
 extern void per_cpu_alloc_dyn_array(int cpu, char *ptr);
 #endif /* __ASSEMBLY__ */
 
diff --git a/init/main.c b/init/main.c
index 416bca4..ab97d08 100644
--- a/init/main.c
+++ b/init/main.c
@@ -394,10 +394,14 @@
 	unsigned long size, i, old_size;
 	char *ptr;
 	unsigned long nr_possible_cpus = num_possible_cpus();
+	unsigned long align = 1;
+	unsigned da_size;
 
 	/* Copy section for each CPU (we discard the original) */
 	old_size = PERCPU_ENOUGH_ROOM;
-	size = ALIGN(old_size + per_cpu_dyn_array_size(), PAGE_SIZE);
+	da_size = per_cpu_dyn_array_size(&align);
+	align = max_t(unsigned long, PAGE_SIZE, align);
+	size = ALIGN(old_size + da_size, align);
 	ptr = alloc_bootmem_pages(size * nr_possible_cpus);
 
 	for_each_possible_cpu(i) {
@@ -541,45 +545,78 @@
 void pre_alloc_dyn_array(void)
 {
 #ifdef CONFIG_HAVE_DYN_ARRAY
-	unsigned long size, phys = 0;
+	unsigned long total_size = 0, size, phys;
+	unsigned long max_align = 1;
 	struct dyn_array **daa;
+	char *ptr;
 
+	/* get the total size at first */
 	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("dyna_array %s ", da->name);
-		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
+		print_fn_descriptor_symbol("dyn_array %s ", da->name);
+		printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
 			da->size, *da->nr, da->align);
-		*da->name = __alloc_bootmem(size, da->align, phys);
-		phys = virt_to_phys(*da->name);
+		total_size += roundup(size, da->align);
+		if (da->align > max_align)
+			max_align = da->align;
+	}
+	if (total_size)
+		printk(KERN_DEBUG "dyn_array total_size: %#lx\n",
+			 total_size);
+	else
+		return;
+
+	/* allocate them all together */
+	max_align = max_t(unsigned long, max_align, PAGE_SIZE);
+	ptr = __alloc_bootmem_nopanic(total_size, max_align, 0);
+	if (!ptr)
+		panic("Can not alloc dyn_alloc\n");
+
+	phys = virt_to_phys(ptr);
+	for (daa = __dyn_array_start ; daa < __dyn_array_end; daa++) {
+		struct dyn_array *da = *daa;
+
+		size = da->size * (*da->nr);
+		print_fn_descriptor_symbol("dyn_array %s ", da->name);
+
+		phys = roundup(phys, da->align);
+		*da->name = phys_to_virt(phys);
 		printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
 
+		phys += size;
+
 		if (da->init_work)
 			da->init_work(da);
 	}
 #endif
 }
 
-unsigned long per_cpu_dyn_array_size(void)
+unsigned long per_cpu_dyn_array_size(unsigned long *align)
 {
 	unsigned long total_size = 0;
 #ifdef CONFIG_HAVE_DYN_ARRAY
 	unsigned long size;
 	struct dyn_array **daa;
+	unsigned max_align = 1;
 
 	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
+		print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
 		printk(KERN_CONT "size:%#lx nr:%d align:%#lx\n",
 			da->size, *da->nr, da->align);
 		total_size += roundup(size, da->align);
+		if (da->align > max_align)
+			max_align = da->align;
 	}
-	if (total_size)
-		printk(KERN_DEBUG "per_cpu_dyna_array total_size: %#lx\n",
+	if (total_size) {
+		printk(KERN_DEBUG "per_cpu_dyn_array total_size: %#lx\n",
 			 total_size);
+		*align = max_align;
+	}
 #endif
 	return total_size;
 }
@@ -593,14 +630,11 @@
 	void **array;
 
 	phys = virt_to_phys(ptr);
-
 	for (daa = __per_cpu_dyn_array_start ; daa < __per_cpu_dyn_array_end; daa++) {
 		struct dyn_array *da = *daa;
 
 		size = da->size * (*da->nr);
-		print_fn_descriptor_symbol("per_cpu_dyna_array %s ", da->name);
-		printk(KERN_CONT "size:%#lx nr:%d align:%#lx",
-			da->size, *da->nr, da->align);
+		print_fn_descriptor_symbol("per_cpu_dyn_array %s ", da->name);
 
 		phys = roundup(phys, da->align);
 		addr = (unsigned long)da->name;
@@ -608,7 +642,8 @@
 		array = (void **)addr;
 		*array = phys_to_virt(phys);
 		*da->name = *array; /* so init_work could use it directly */
-		printk(KERN_CONT " %p ==> [%#lx - %#lx]\n", array, phys, phys + size);
+		printk(KERN_CONT " ==> [%#lx - %#lx]\n", phys, phys + size);
+
 		phys += size;
 
 		if (da->init_work) {