[NET]: Define infrastructure to keep 'inuse' changes in an efficent SMP/NUMA way.

"struct proto" currently uses an array stats[NR_CPUS] to track change on
'inuse' sockets per protocol.

If NR_CPUS is big, this means we use a big memory area for this.
Moreover, all this memory area is located on a single node on NUMA
machines, increasing memory pressure on the boot node.

In this patch, I tried to :

- Keep a fast !CONFIG_SMP implementation
- Keep a fast CONFIG_SMP implementation for often used protocols
(tcp,udp,raw,...)
- Introduce a NUMA efficient implementation

Some helper macros are defined in include/net/sock.h
These macros take into account CONFIG_SMP

If a "struct proto" is declared without using DEFINE_PROTO_INUSE /
REF_PROTO_INUSE
macros, it will automatically use a default implementation, using a
dynamically allocated percpu zone.
This default implementation will be NUMA efficient, but might use 32/64
bytes per possible cpu
because of current alloc_percpu() implementation.
However it still should be better than previous implementation based on
stats[NR_CPUS] field.

When a "struct proto" is changed to use the new macros, we use a single
static "int" percpu variable,
lowering the memory and cpu costs, still preserving NUMA efficiency.

Signed-off-by: Eric Dumazet <dada1@cosmosbay.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/core/sock.c b/net/core/sock.c
index 12ad206..e077f26 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1801,12 +1801,41 @@
 static DEFINE_RWLOCK(proto_list_lock);
 static LIST_HEAD(proto_list);
 
+#ifdef CONFIG_SMP
+/*
+ * Define default functions to keep track of inuse sockets per protocol
+ * Note that often used protocols use dedicated functions to get a speed increase.
+ * (see DEFINE_PROTO_INUSE/REF_PROTO_INUSE)
+ */
+static void inuse_add(struct proto *prot, int inc)
+{
+	per_cpu_ptr(prot->inuse_ptr, smp_processor_id())[0] += inc;
+}
+
+static int inuse_get(const struct proto *prot)
+{
+	int res = 0, cpu;
+	for_each_possible_cpu(cpu)
+		res += per_cpu_ptr(prot->inuse_ptr, cpu)[0];
+	return res;
+}
+#endif
+
 int proto_register(struct proto *prot, int alloc_slab)
 {
 	char *request_sock_slab_name = NULL;
 	char *timewait_sock_slab_name;
 	int rc = -ENOBUFS;
 
+#ifdef CONFIG_SMP
+	if (!prot->inuse_getval || !prot->inuse_add) {
+		prot->inuse_ptr = alloc_percpu(int);
+		if (prot->inuse_ptr == NULL)
+			goto out;
+		prot->inuse_getval = inuse_get;
+		prot->inuse_add = inuse_add;
+	}
+#endif
 	if (alloc_slab) {
 		prot->slab = kmem_cache_create(prot->name, prot->obj_size, 0,
 					       SLAB_HWCACHE_ALIGN, NULL);
@@ -1814,7 +1843,7 @@
 		if (prot->slab == NULL) {
 			printk(KERN_CRIT "%s: Can't create sock SLAB cache!\n",
 			       prot->name);
-			goto out;
+			goto out_free_inuse;
 		}
 
 		if (prot->rsk_prot != NULL) {
@@ -1873,6 +1902,15 @@
 out_free_sock_slab:
 	kmem_cache_destroy(prot->slab);
 	prot->slab = NULL;
+out_free_inuse:
+#ifdef CONFIG_SMP
+	if (prot->inuse_ptr != NULL) {
+		free_percpu(prot->inuse_ptr);
+		prot->inuse_ptr = NULL;
+		prot->inuse_getval = NULL;
+		prot->inuse_add = NULL;
+	}
+#endif
 	goto out;
 }
 
@@ -1884,6 +1922,14 @@
 	list_del(&prot->node);
 	write_unlock(&proto_list_lock);
 
+#ifdef CONFIG_SMP
+	if (prot->inuse_ptr != NULL) {
+		free_percpu(prot->inuse_ptr);
+		prot->inuse_ptr = NULL;
+		prot->inuse_getval = NULL;
+		prot->inuse_add = NULL;
+	}
+#endif
 	if (prot->slab != NULL) {
 		kmem_cache_destroy(prot->slab);
 		prot->slab = NULL;