net: align sk_refcnt on 128 bytes boundary sk->sk_refcnt is dirtied for every TCP/UDP incoming packet. This is a performance issue if multiple cpus hit a common socket, or multiple sockets are chained due to SO_REUSEPORT. By moving sk_refcnt 8 bytes further, first 128 bytes of sockets are mostly read. As they contain the lookup keys, this has a considerable performance impact, as cpus can cache them. These 8 bytes are not wasted, we use them as a place holder for various fields, depending on the socket type. Tested: SYN flood hitting a 16 RX queues NIC. TCP listener using 16 sockets and SO_REUSEPORT and SO_INCOMING_CPU for proper siloing. Could process 6.0 Mpps SYN instead of 4.2 Mpps Kernel profile looked like : 11.68% [kernel] [k] sha_transform 6.51% [kernel] [k] __inet_lookup_listener 5.07% [kernel] [k] __inet_lookup_established 4.15% [kernel] [k] memcpy_erms 3.46% [kernel] [k] ipt_do_table 2.74% [kernel] [k] fib_table_lookup 2.54% [kernel] [k] tcp_make_synack 2.34% [kernel] [k] tcp_conn_request 2.05% [kernel] [k] __netif_receive_skb_core 2.03% [kernel] [k] kmem_cache_alloc Signed-off-by: Eric Dumazet <edumazet@google.com> Signed-off-by: David S. Miller <davem@davemloft.net>

commit: 8e5eb54d303b7cb1174977ca79030e135728c95e [log] [tgz]
author: Eric Dumazet <edumazet@google.com> Thu Oct 08 19:33:22 2015 -0700
committer: David S. Miller <davem@davemloft.net> Mon Oct 12 19:28:22 2015 -0700
tree: 9aebc1d9e60ccbb559dac5d0d8d11f41b95df3c2
parent: 70da268b569d32a9fddeea85dc18043de9d89f89 [diff] [blame]
diff --git a/include/net/sock.h b/include/net/sock.h
index cf54739..6571240 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h

@@ -150,6 +150,9 @@
  *	@skc_node: main hash linkage for various protocol lookup tables
  *	@skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
  *	@skc_tx_queue_mapping: tx queue number for this connection
+ *	@skc_flags: place holder for sk_flags
+ *		%SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
+ *		%SO_OOBINLINE settings, %SO_TIMESTAMPING settings
  *	@skc_incoming_cpu: record/match cpu processing incoming packets
  *	@skc_refcnt: reference count
  *
@@ -201,6 +204,16 @@
 
 	atomic64_t		skc_cookie;
 
+	/* following fields are padding to force
+	 * offset(struct sock, sk_refcnt) == 128 on 64bit arches
+	 * assuming IPV6 is enabled. We use this padding differently
+	 * for different kind of 'sockets'
+	 */
+	union {
+		unsigned long	skc_flags;
+		struct sock	*skc_listener; /* request_sock */
+		struct inet_timewait_death_row *skc_tw_dr; /* inet_timewait_sock */
+	};
 	/*
 	 * fields between dontcopy_begin/dontcopy_end
 	 * are not copied in sock_copy()
@@ -246,8 +259,6 @@
   *	@sk_pacing_rate: Pacing rate (if supported by transport/packet scheduler)
   *	@sk_max_pacing_rate: Maximum pacing rate (%SO_MAX_PACING_RATE)
   *	@sk_sndbuf: size of send buffer in bytes
-  *	@sk_flags: %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
-  *		   %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
   *	@sk_no_check_tx: %SO_NO_CHECK setting, set checksum in TX packets
   *	@sk_no_check_rx: allow zero checksum in RX packets
   *	@sk_route_caps: route capabilities (e.g. %NETIF_F_TSO)
@@ -334,6 +345,7 @@
 #define sk_v6_rcv_saddr	__sk_common.skc_v6_rcv_saddr
 #define sk_cookie		__sk_common.skc_cookie
 #define sk_incoming_cpu		__sk_common.skc_incoming_cpu
+#define sk_flags		__sk_common.skc_flags
 
 	socket_lock_t		sk_lock;
 	struct sk_buff_head	sk_receive_queue;
@@ -371,7 +383,6 @@
 #ifdef CONFIG_XFRM
 	struct xfrm_policy	*sk_policy[2];
 #endif
-	unsigned long 		sk_flags;
 	struct dst_entry	*sk_rx_dst;
 	struct dst_entry __rcu	*sk_dst_cache;
 	spinlock_t		sk_dst_lock;
commit	8e5eb54d303b7cb1174977ca79030e135728c95e	[log] [tgz]
author	Eric Dumazet <edumazet@google.com>	Thu Oct 08 19:33:22 2015 -0700
committer	David S. Miller <davem@davemloft.net>	Mon Oct 12 19:28:22 2015 -0700
tree	9aebc1d9e60ccbb559dac5d0d8d11f41b95df3c2
parent	70da268b569d32a9fddeea85dc18043de9d89f89 [diff] [blame]