net: better pcpu data alignment

Tunnels can force an alignment of their percpu data to reduce number of
cache lines used in fast path, or read in .ndo_get_stats()

percpu_alloc() is a very fine grained allocator, so any small hole will
be used anyway.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index d55110e..38f7c07 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -171,7 +171,7 @@
 	unsigned long	rx_bytes;
 	unsigned long	tx_packets;
 	unsigned long	tx_bytes;
-};
+} __attribute__((aligned(4*sizeof(unsigned long))));
 
 static struct net_device_stats *ipgre_get_stats(struct net_device *dev)
 {
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index 065effd..9490690 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -148,7 +148,7 @@
 	unsigned long	rx_bytes;
 	unsigned long	tx_packets;
 	unsigned long	tx_bytes;
-};
+} __attribute__((aligned(4*sizeof(unsigned long))));
 
 static struct net_device_stats *ipip_get_stats(struct net_device *dev)
 {
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index bdc15c9..f36ca13 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -93,7 +93,7 @@
 	unsigned long	rx_bytes;
 	unsigned long	tx_packets;
 	unsigned long	tx_bytes;
-};
+} __attribute__((aligned(4*sizeof(unsigned long))));
 
 static struct net_device_stats *ip6_get_stats(struct net_device *dev)
 {
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a7a1860..cec0938 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -91,7 +91,7 @@
 	unsigned long	rx_bytes;
 	unsigned long	tx_packets;
 	unsigned long	tx_bytes;
-};
+} __attribute__((aligned(4*sizeof(unsigned long))));
 
 static struct net_device_stats *ipip6_get_stats(struct net_device *dev)
 {