udp: no longer use SLAB_DESTROY_BY_RCU

Tom Herbert would like not touching UDP socket refcnt for encapsulated
traffic. For this to happen, we need to use normal RCU rules, with a grace
period before freeing a socket. UDP sockets are not short lived in the
high usage case, so the added cost of call_rcu() should not be a concern.

This actually removes a lot of complexity in UDP stack.

Multicast receives no longer need to hold a bucket spinlock.

Note that ip early demux still needs to take a reference on the socket.

Same remark for functions used by xt_socket and xt_PROXY netfilter modules,
but this might be changed later.

Performance for a single UDP socket receiving flood traffic from
many RX queues/cpus.

Simple udp_rx using simple recvfrom() loop :
438 kpps instead of 374 kpps : 17 % increase of the peak rate.

v2: Addressed Willem de Bruijn feedback in multicast handling
 - keep early demux break in __udp4_lib_demux_lookup()

Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Tom Herbert <tom@herbertland.com>
Cc: Willem de Bruijn <willemb@google.com>
Tested-by: Tom Herbert <tom@herbertland.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/include/linux/udp.h b/include/linux/udp.h
index 87c0949..3234275 100644
--- a/include/linux/udp.h
+++ b/include/linux/udp.h
@@ -98,11 +98,11 @@
 	return udp_sk(sk)->no_check6_rx;
 }
 
-#define udp_portaddr_for_each_entry(__sk, node, list) \
-	hlist_nulls_for_each_entry(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry(__sk, list) \
+	hlist_for_each_entry(__sk, list, __sk_common.skc_portaddr_node)
 
-#define udp_portaddr_for_each_entry_rcu(__sk, node, list) \
-	hlist_nulls_for_each_entry_rcu(__sk, node, list, __sk_common.skc_portaddr_node)
+#define udp_portaddr_for_each_entry_rcu(__sk, list) \
+	hlist_for_each_entry_rcu(__sk, list, __sk_common.skc_portaddr_node)
 
 #define IS_UDPLITE(__sk) (udp_sk(__sk)->pcflag)
 
diff --git a/include/net/sock.h b/include/net/sock.h
index 9e77353..7ad73db 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -178,7 +178,7 @@
 	int			skc_bound_dev_if;
 	union {
 		struct hlist_node	skc_bind_node;
-		struct hlist_nulls_node skc_portaddr_node;
+		struct hlist_node	skc_portaddr_node;
 	};
 	struct proto		*skc_prot;
 	possible_net_t		skc_net;
@@ -670,18 +670,18 @@
 	hlist_for_each_entry(__sk, list, sk_bind_node)
 
 /**
- * sk_nulls_for_each_entry_offset - iterate over a list at a given struct offset
+ * sk_for_each_entry_offset_rcu - iterate over a list at a given struct offset
  * @tpos:	the type * to use as a loop cursor.
  * @pos:	the &struct hlist_node to use as a loop cursor.
  * @head:	the head for your list.
  * @offset:	offset of hlist_node within the struct.
  *
  */
-#define sk_nulls_for_each_entry_offset(tpos, pos, head, offset)		       \
-	for (pos = (head)->first;					       \
-	     (!is_a_nulls(pos)) &&					       \
+#define sk_for_each_entry_offset_rcu(tpos, pos, head, offset)		       \
+	for (pos = rcu_dereference((head)->first);			       \
+	     pos != NULL &&						       \
 		({ tpos = (typeof(*tpos) *)((void *)pos - offset); 1;});       \
-	     pos = pos->next)
+	     pos = rcu_dereference(pos->next))
 
 static inline struct user_namespace *sk_user_ns(struct sock *sk)
 {
diff --git a/include/net/udp.h b/include/net/udp.h
index 92927f7..d870ec1 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -59,7 +59,7 @@
  *	@lock:	spinlock protecting changes to head/count
  */
 struct udp_hslot {
-	struct hlist_nulls_head	head;
+	struct hlist_head	head;
 	int			count;
 	spinlock_t		lock;
 } __attribute__((aligned(2 * sizeof(long))));