[IPV4]: rt_cache_stat can be statically defined

Using __get_cpu_var(obj) is slightly faster than per_cpu_ptr(obj, 
raw_smp_processor_id()).

1) Smaller code and memory use
For static and small objects, DEFINE_PER_CPU(type, object) is preferred over a 
alloc_percpu() : Better and smaller code to access them, and no extra memory 
(storing the pointer, and the percpu array of pointers)

x86_64 code before patch

mov    1237577(%rip),%rax        # ffffffff803e5990 <rt_cache_stat>
not    %rax  # part of per_cpu machinery
mov    %gs:0x3c,%edx # get cpu number
movslq %edx,%rdx # extend 32 bits cpu number to 64 bits
mov    (%rax,%rdx,8),%rax # get the pointer for this cpu
incl   0x38(%rax)

x86_64 code after patch

mov    $per_cpu__rt_cache_stat,%rdx
mov    %gs:0x48,%rax # get percpu data offset
incl   0x38(%rax,%rdx,1)

2) False sharing avoidance for SMP :
For a small NR_CPUS, the array of per cpu pointers allocated in alloc_percpu() 
can be <= 32 bytes. This let slab code gives a part of a cache line. If the 
other part of this 64 bytes (or 128 bytes) cache line is used by a mostly 
written object, we can have false sharing and expensive per_cpu_ptr() operations.

Size of rt_cache_stat is 64 bytes, so this patch is not a danger of a too big 
increase of bss (in UP mode) or static per_cpu data for SMP 
(PERCPU_ENOUGH_ROOM is currently 32768 bytes)

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f701a13..f2e82af 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -240,9 +240,8 @@
 static int			rt_hash_log;
 static unsigned int		rt_hash_rnd;
 
-static struct rt_cache_stat *rt_cache_stat;
-#define RT_CACHE_STAT_INC(field)					  \
-		(per_cpu_ptr(rt_cache_stat, raw_smp_processor_id())->field++)
+static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
+#define RT_CACHE_STAT_INC(field) (__get_cpu_var(rt_cache_stat).field++)
 
 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 				struct rtable **res);
@@ -401,7 +400,7 @@
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return per_cpu_ptr(rt_cache_stat, cpu);
+		return &per_cpu(rt_cache_stat, cpu);
 	}
 	return NULL;
 }
@@ -414,7 +413,7 @@
 		if (!cpu_possible(cpu))
 			continue;
 		*pos = cpu+1;
-		return per_cpu_ptr(rt_cache_stat, cpu);
+		return &per_cpu(rt_cache_stat, cpu);
 	}
 	return NULL;
 	
@@ -3160,10 +3159,6 @@
 	ipv4_dst_ops.gc_thresh = (rt_hash_mask + 1);
 	ip_rt_max_size = (rt_hash_mask + 1) * 16;
 
-	rt_cache_stat = alloc_percpu(struct rt_cache_stat);
-	if (!rt_cache_stat)
-		return -ENOMEM;
-
 	devinet_init();
 	ip_fib_init();
 
@@ -3191,7 +3186,6 @@
 	if (!proc_net_fops_create("rt_cache", S_IRUGO, &rt_cache_seq_fops) ||
 	    !(rtstat_pde = create_proc_entry("rt_cache", S_IRUGO, 
 			    		     proc_net_stat))) {
-		free_percpu(rt_cache_stat);
 		return -ENOMEM;
 	}
 	rtstat_pde->proc_fops = &rt_cpu_seq_fops;