[SPARC64]: Eliminate NR_CPUS limitations.

Cheetah systems can have cpuids as large as 1023, although physical
systems don't have that many cpus.

Only three limitations existed in the kernel preventing arbitrary
NR_CPUS values:

1) dcache dirty cpu state stored in page->flags on
   D-cache aliasing platforms.  With some build time
   calculations and some build-time BUG checks on
   page->flags layout, this one was easily solved.

2) The cheetah XCALL delivery code could only handle
   a cpumask with up to 32 cpus set.  Some simple looping
   logic clears that up too.

3) thread_info->cpu was a u8, easily changed to a u16.

There are a few spots in the kernel that still put NR_CPUS
sized arrays on the kernel stack, but that's not a sparc64
specific problem.

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/arch/sparc64/mm/init.c b/arch/sparc64/mm/init.c
index 9776982..087cbf0 100644
--- a/arch/sparc64/mm/init.c
+++ b/arch/sparc64/mm/init.c
@@ -191,12 +191,9 @@
 }
 
 #define PG_dcache_dirty		PG_arch_1
-#define PG_dcache_cpu_shift	24UL
-#define PG_dcache_cpu_mask	(256UL - 1UL)
-
-#if NR_CPUS > 256
-#error D-cache dirty tracking and thread_info->cpu need fixing for > 256 cpus
-#endif
+#define PG_dcache_cpu_shift	32UL
+#define PG_dcache_cpu_mask	\
+	((1UL<<ilog2(roundup_pow_of_two(NR_CPUS)))-1UL)
 
 #define dcache_dirty_cpu(page) \
 	(((page)->flags >> PG_dcache_cpu_shift) & PG_dcache_cpu_mask)
@@ -1349,6 +1346,19 @@
 	unsigned long end_pfn, pages_avail, shift, phys_base;
 	unsigned long real_end, i;
 
+	/* These build time checkes make sure that the dcache_dirty_cpu()
+	 * page->flags usage will work.
+	 *
+	 * When a page gets marked as dcache-dirty, we store the
+	 * cpu number starting at bit 32 in the page->flags.  Also,
+	 * functions like clear_dcache_dirty_cpu use the cpu mask
+	 * in 13-bit signed-immediate instruction fields.
+	 */
+	BUILD_BUG_ON(FLAGS_RESERVED != 32);
+	BUILD_BUG_ON(SECTIONS_WIDTH + NODES_WIDTH + ZONES_WIDTH +
+		     ilog2(roundup_pow_of_two(NR_CPUS)) > FLAGS_RESERVED);
+	BUILD_BUG_ON(NR_CPUS > 4096);
+
 	kern_base = (prom_boot_mapping_phys_low >> 22UL) << 22UL;
 	kern_size = (unsigned long)&_end - (unsigned long)KERNBASE;