vlan: Precise RX stats accounting

With multi queue devices, its possible that several cpus call
vlan RX routines simultaneously for the same vlan device.

We update RX stats counter without any locking, so we can
get slightly wrong counters.

One possible fix is to use percpu counters, to get precise
accounting and also get guarantee of no cache line ping pongs
between cpus.

Note: this adds 16 bytes (32 bytes on 64bit arches) of percpu
data per vlan device.

Signed-off-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index 9159659..de0dc6b 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -140,7 +140,7 @@
 		  struct packet_type *ptype, struct net_device *orig_dev)
 {
 	struct vlan_hdr *vhdr;
-	struct net_device_stats *stats;
+	struct vlan_rx_stats *rx_stats;
 	u16 vlan_id;
 	u16 vlan_tci;
 
@@ -163,9 +163,10 @@
 		goto err_unlock;
 	}
 
-	stats = &skb->dev->stats;
-	stats->rx_packets++;
-	stats->rx_bytes += skb->len;
+	rx_stats = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats,
+			       smp_processor_id());
+	rx_stats->rx_packets++;
+	rx_stats->rx_bytes += skb->len;
 
 	skb_pull_rcsum(skb, VLAN_HLEN);
 
@@ -180,7 +181,7 @@
 		break;
 
 	case PACKET_MULTICAST:
-		stats->multicast++;
+		rx_stats->multicast++;
 		break;
 
 	case PACKET_OTHERHOST:
@@ -200,7 +201,7 @@
 
 	skb = vlan_check_reorder_header(skb);
 	if (!skb) {
-		stats->rx_errors++;
+		rx_stats->rx_errors++;
 		goto err_unlock;
 	}
 
@@ -731,6 +732,11 @@
 		subclass = 1;
 
 	vlan_dev_set_lockdep_class(dev, subclass);
+
+	vlan_dev_info(dev)->vlan_rx_stats = alloc_percpu(struct vlan_rx_stats);
+	if (!vlan_dev_info(dev)->vlan_rx_stats)
+		return -ENOMEM;
+
 	return 0;
 }
 
@@ -740,6 +746,8 @@
 	struct vlan_dev_info *vlan = vlan_dev_info(dev);
 	int i;
 
+	free_percpu(vlan->vlan_rx_stats);
+	vlan->vlan_rx_stats = NULL;
 	for (i = 0; i < ARRAY_SIZE(vlan->egress_priority_map); i++) {
 		while ((pm = vlan->egress_priority_map[i]) != NULL) {
 			vlan->egress_priority_map[i] = pm->next;
@@ -775,6 +783,31 @@
 	return dev_ethtool_get_flags(vlan->real_dev);
 }
 
+static struct net_device_stats *vlan_dev_get_stats(struct net_device *dev)
+{
+	struct net_device_stats *stats = &dev->stats;
+
+	dev_txq_stats_fold(dev, stats);
+
+	if (vlan_dev_info(dev)->vlan_rx_stats) {
+		struct vlan_rx_stats *p, rx = {0};
+		int i;
+
+		for_each_possible_cpu(i) {
+			p = per_cpu_ptr(vlan_dev_info(dev)->vlan_rx_stats, i);
+			rx.rx_packets += p->rx_packets;
+			rx.rx_bytes   += p->rx_bytes;
+			rx.rx_errors  += p->rx_errors;
+			rx.multicast  += p->multicast;
+		}
+		stats->rx_packets = rx.rx_packets;
+		stats->rx_bytes   = rx.rx_bytes;
+		stats->rx_errors  = rx.rx_errors;
+		stats->multicast  = rx.multicast;
+	}
+	return stats;
+}
+
 static const struct ethtool_ops vlan_ethtool_ops = {
 	.get_settings	        = vlan_ethtool_get_settings,
 	.get_drvinfo	        = vlan_ethtool_get_drvinfo,
@@ -797,6 +830,7 @@
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
+	.ndo_get_stats		= vlan_dev_get_stats,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,
@@ -820,6 +854,7 @@
 	.ndo_change_rx_flags	= vlan_dev_change_rx_flags,
 	.ndo_do_ioctl		= vlan_dev_ioctl,
 	.ndo_neigh_setup	= vlan_dev_neigh_setup,
+	.ndo_get_stats		= vlan_dev_get_stats,
 #if defined(CONFIG_FCOE) || defined(CONFIG_FCOE_MODULE)
 	.ndo_fcoe_ddp_setup	= vlan_dev_fcoe_ddp_setup,
 	.ndo_fcoe_ddp_done	= vlan_dev_fcoe_ddp_done,