Merge branch 'master' of git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/net-next

Jeff Kirsher says:

====================
Intel Wired LAN Driver Updates 2015-03-06

This series contains updates to e1000, e1000e and igb.

Yanir provides updates to e1000e based on the patches provided by John
Linville.  First updates the code comment to better describe the changes
and the impact on the driver.  Second removed calls to ioremap/unmap for
i219 since this is only relevant to older hardware only.  Starting with
i219, the NVM will not be mapped to its one BAR but to a address region
in another bar.

Alex Duyck provides two fixes for igb, first fixes a compile warning
where a variable may be used uninitialized, so Alex initializes it.
Second fixes an issue where all of the pin register values were having
to be pushed onto the stack each time the function was called, so to
avoid this, Alex made them static const so that they should only need
to be allocated once and we can avoid all the instructions to get them
onto the stack.

Eliezer found an issue in e1000 where we needed to be calling
netif_carrier_off earlier in the down() to prevent the stack from
queuing more packets to the interface.

Sabrina Dubroca resolved a potential race condition by adding a
dummy allocator.  There was a race condition between e1000_change_mtu()
cleanups and netpoll, when changing the MTU across jumbo sizes.
====================

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/Documentation/networking/ip-sysctl.txt b/Documentation/networking/ip-sysctl.txt
index 1b8c964..4412f69 100644
--- a/Documentation/networking/ip-sysctl.txt
+++ b/Documentation/networking/ip-sysctl.txt
@@ -388,6 +388,16 @@
 	  1 - Disabled by default, enabled when an ICMP black hole detected
 	  2 - Always enabled, use initial MSS of tcp_base_mss.
 
+tcp_probe_interval - INTEGER
+	Controls how often to start TCP Packetization-Layer Path MTU
+	Discovery reprobe. The default is reprobing every 10 minutes as
+	per RFC4821.
+
+tcp_probe_threshold - INTEGER
+	Controls when TCP Packetization-Layer Path MTU Discovery probing
+	will stop in respect to the width of search range in bytes. Default
+	is 8 bytes.
+
 tcp_no_metrics_save - BOOLEAN
 	By default, TCP saves various connection metrics in the route cache
 	when the connection closes, so that connections established in the
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 675b082..c026ce9 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -928,6 +928,39 @@
 
 static void bond_poll_controller(struct net_device *bond_dev)
 {
+	struct bonding *bond = netdev_priv(bond_dev);
+	struct slave *slave = NULL;
+	struct list_head *iter;
+	struct ad_info ad_info;
+	struct netpoll_info *ni;
+	const struct net_device_ops *ops;
+
+	if (BOND_MODE(bond) == BOND_MODE_8023AD)
+		if (bond_3ad_get_active_agg_info(bond, &ad_info))
+			return;
+
+	rcu_read_lock_bh();
+	bond_for_each_slave_rcu(bond, slave, iter) {
+		ops = slave->dev->netdev_ops;
+		if (!bond_slave_is_up(slave) || !ops->ndo_poll_controller)
+			continue;
+
+		if (BOND_MODE(bond) == BOND_MODE_8023AD) {
+			struct aggregator *agg =
+			    SLAVE_AD_INFO(slave)->port.aggregator;
+
+			if (agg &&
+			    agg->aggregator_identifier != ad_info.aggregator_id)
+				continue;
+		}
+
+		ni = rcu_dereference_bh(slave->dev->npinfo);
+		if (down_trylock(&ni->dev_lock))
+			continue;
+		ops->ndo_poll_controller(slave->dev);
+		up(&ni->dev_lock);
+	}
+	rcu_read_unlock_bh();
 }
 
 static void bond_netpoll_cleanup(struct net_device *bond_dev)
diff --git a/drivers/net/ethernet/rocker/rocker.c b/drivers/net/ethernet/rocker/rocker.c
index d04d3b3..cc1bbfd 100644
--- a/drivers/net/ethernet/rocker/rocker.c
+++ b/drivers/net/ethernet/rocker/rocker.c
@@ -2737,7 +2737,7 @@
 {
 	struct rocker_neigh_tbl_entry *found;
 
-	hash_for_each_possible(rocker->neigh_tbl, found, entry, ip_addr)
+	hash_for_each_possible(rocker->neigh_tbl, found, entry, (u32)ip_addr)
 		if (found->ip_addr == ip_addr)
 			return found;
 
@@ -2749,7 +2749,7 @@
 {
 	entry->index = rocker->neigh_tbl_next_index++;
 	entry->ref_count++;
-	hash_add(rocker->neigh_tbl, &entry->entry, entry->ip_addr);
+	hash_add(rocker->neigh_tbl, &entry->entry, (u32)entry->ip_addr);
 }
 
 static void _rocker_neigh_del(struct rocker *rocker,
@@ -2868,7 +2868,7 @@
 				    __be32 ip_addr)
 {
 	struct net_device *dev = rocker_port->dev;
-	struct neighbour *n = __ipv4_neigh_lookup(dev, ip_addr);
+	struct neighbour *n = __ipv4_neigh_lookup(dev, (u32)ip_addr);
 	int err = 0;
 
 	if (!n)
diff --git a/include/net/dn_neigh.h b/include/net/dn_neigh.h
index fac4e3f..0f26aa7 100644
--- a/include/net/dn_neigh.h
+++ b/include/net/dn_neigh.h
@@ -22,6 +22,7 @@
 int dn_neigh_endnode_hello(struct sk_buff *skb);
 void dn_neigh_pointopoint_hello(struct sk_buff *skb);
 int dn_neigh_elist(struct net_device *dev, unsigned char *ptr, int n);
+int dn_to_neigh_output(struct sk_buff *skb);
 
 extern struct neigh_table dn_neigh_table;
 
diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
index 5976bde..b9a6b0a 100644
--- a/include/net/inet_connection_sock.h
+++ b/include/net/inet_connection_sock.h
@@ -126,6 +126,8 @@
 
 		/* Information on the current probe. */
 		int		  probe_size;
+
+		u32		  probe_timestamp;
 	} icsk_mtup;
 	u32			  icsk_ca_priv[16];
 	u32			  icsk_user_timeout;
diff --git a/include/net/netns/ipv4.h b/include/net/netns/ipv4.h
index 1085e12f..8f3a1a1 100644
--- a/include/net/netns/ipv4.h
+++ b/include/net/netns/ipv4.h
@@ -87,6 +87,8 @@
 	int sysctl_tcp_fwmark_accept;
 	int sysctl_tcp_mtu_probing;
 	int sysctl_tcp_base_mss;
+	int sysctl_tcp_probe_threshold;
+	u32 sysctl_tcp_probe_interval;
 
 	struct ping_group_range ping_group_range;
 
diff --git a/include/net/tcp.h b/include/net/tcp.h
index f87599d..2e11e38 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -65,7 +65,13 @@
 #define TCP_MIN_MSS		88U
 
 /* The least MTU to use for probing */
-#define TCP_BASE_MSS		512
+#define TCP_BASE_MSS		1024
+
+/* probing interval, default to 10 minutes as per RFC4821 */
+#define TCP_PROBE_INTERVAL	600
+
+/* Specify interval when tcp mtu probing will stop */
+#define TCP_PROBE_THRESHOLD	8
 
 /* After receiving this amount of duplicate ACKs fast retransmit starts. */
 #define TCP_FASTRETRANS_THRESH 3
diff --git a/net/decnet/dn_neigh.c b/net/decnet/dn_neigh.c
index ee7d1ce..be1f08c 100644
--- a/net/decnet/dn_neigh.c
+++ b/net/decnet/dn_neigh.c
@@ -49,41 +49,17 @@
 #include <net/dn_route.h>
 
 static int dn_neigh_construct(struct neighbour *);
-static void dn_long_error_report(struct neighbour *, struct sk_buff *);
-static void dn_short_error_report(struct neighbour *, struct sk_buff *);
-static int dn_long_output(struct neighbour *, struct sk_buff *);
-static int dn_short_output(struct neighbour *, struct sk_buff *);
-static int dn_phase3_output(struct neighbour *, struct sk_buff *);
-
+static void dn_neigh_error_report(struct neighbour *, struct sk_buff *);
+static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb);
 
 /*
- * For talking to broadcast devices: Ethernet & PPP
+ * Operations for adding the link layer header.
  */
-static const struct neigh_ops dn_long_ops = {
+static const struct neigh_ops dn_neigh_ops = {
 	.family =		AF_DECnet,
-	.error_report =		dn_long_error_report,
-	.output =		dn_long_output,
-	.connected_output =	dn_long_output,
-};
-
-/*
- * For talking to pointopoint and multidrop devices: DDCMP and X.25
- */
-static const struct neigh_ops dn_short_ops = {
-	.family =		AF_DECnet,
-	.error_report =		dn_short_error_report,
-	.output =		dn_short_output,
-	.connected_output =	dn_short_output,
-};
-
-/*
- * For talking to DECnet phase III nodes
- */
-static const struct neigh_ops dn_phase3_ops = {
-	.family =		AF_DECnet,
-	.error_report =		dn_short_error_report, /* Can use short version here */
-	.output =		dn_phase3_output,
-	.connected_output =	dn_phase3_output,
+	.error_report =		dn_neigh_error_report,
+	.output =		dn_neigh_output,
+	.connected_output =	dn_neigh_output,
 };
 
 static u32 dn_neigh_hash(const void *pkey,
@@ -153,16 +129,9 @@
 
 	__neigh_parms_put(neigh->parms);
 	neigh->parms = neigh_parms_clone(parms);
-
-	if (dn_db->use_long)
-		neigh->ops = &dn_long_ops;
-	else
-		neigh->ops = &dn_short_ops;
 	rcu_read_unlock();
 
-	if (dn->flags & DN_NDFLAG_P3)
-		neigh->ops = &dn_phase3_ops;
-
+	neigh->ops = &dn_neigh_ops;
 	neigh->nud_state = NUD_NOARP;
 	neigh->output = neigh->ops->connected_output;
 
@@ -194,24 +163,16 @@
 	return 0;
 }
 
-static void dn_long_error_report(struct neighbour *neigh, struct sk_buff *skb)
+static void dn_neigh_error_report(struct neighbour *neigh, struct sk_buff *skb)
 {
-	printk(KERN_DEBUG "dn_long_error_report: called\n");
+	printk(KERN_DEBUG "dn_neigh_error_report: called\n");
 	kfree_skb(skb);
 }
 
-
-static void dn_short_error_report(struct neighbour *neigh, struct sk_buff *skb)
-{
-	printk(KERN_DEBUG "dn_short_error_report: called\n");
-	kfree_skb(skb);
-}
-
-static int dn_neigh_output_packet(struct sk_buff *skb)
+static int dn_neigh_output(struct neighbour *neigh, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
 	struct dn_route *rt = (struct dn_route *)dst;
-	struct neighbour *neigh = rt->n;
 	struct net_device *dev = neigh->dev;
 	char mac_addr[ETH_ALEN];
 	unsigned int seq;
@@ -233,6 +194,18 @@
 	return err;
 }
 
+static int dn_neigh_output_packet(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct dn_route *rt = (struct dn_route *)dst;
+	struct neighbour *neigh = rt->n;
+
+	return neigh->output(neigh, skb);
+}
+
+/*
+ * For talking to broadcast devices: Ethernet & PPP
+ */
 static int dn_long_output(struct neighbour *neigh, struct sk_buff *skb)
 {
 	struct net_device *dev = neigh->dev;
@@ -276,6 +249,9 @@
 		       neigh->dev, dn_neigh_output_packet);
 }
 
+/*
+ * For talking to pointopoint and multidrop devices: DDCMP and X.25
+ */
 static int dn_short_output(struct neighbour *neigh, struct sk_buff *skb)
 {
 	struct net_device *dev = neigh->dev;
@@ -313,7 +289,8 @@
 }
 
 /*
- * Phase 3 output is the same is short output, execpt that
+ * For talking to DECnet phase III nodes
+ * Phase 3 output is the same as short output, execpt that
  * it clears the area bits before transmission.
  */
 static int dn_phase3_output(struct neighbour *neigh, struct sk_buff *skb)
@@ -351,6 +328,32 @@
 		       neigh->dev, dn_neigh_output_packet);
 }
 
+int dn_to_neigh_output(struct sk_buff *skb)
+{
+	struct dst_entry *dst = skb_dst(skb);
+	struct dn_route *rt = (struct dn_route *) dst;
+	struct neighbour *neigh = rt->n;
+	struct dn_neigh *dn = (struct dn_neigh *)neigh;
+	struct dn_dev *dn_db;
+	bool use_long;
+
+	rcu_read_lock();
+	dn_db = rcu_dereference(neigh->dev->dn_ptr);
+	if (dn_db == NULL) {
+		rcu_read_unlock();
+		return -EINVAL;
+	}
+	use_long = dn_db->use_long;
+	rcu_read_unlock();
+
+	if (dn->flags & DN_NDFLAG_P3)
+		return dn_phase3_output(neigh, skb);
+	if (use_long)
+		return dn_long_output(neigh, skb);
+	else
+		return dn_short_output(neigh, skb);
+}
+
 /*
  * Unfortunately, the neighbour code uses the device in its hash
  * function, so we don't get any advantage from it. This function
diff --git a/net/decnet/dn_route.c b/net/decnet/dn_route.c
index 3b81092..7718155 100644
--- a/net/decnet/dn_route.c
+++ b/net/decnet/dn_route.c
@@ -743,15 +743,6 @@
 	return NET_RX_DROP;
 }
 
-static int dn_to_neigh_output(struct sk_buff *skb)
-{
-	struct dst_entry *dst = skb_dst(skb);
-	struct dn_route *rt = (struct dn_route *) dst;
-	struct neighbour *n = rt->n;
-
-	return n->output(n, skb);
-}
-
 static int dn_output(struct sock *sk, struct sk_buff *skb)
 {
 	struct dst_entry *dst = skb_dst(skb);
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index d151539..fdf8991 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -883,6 +883,20 @@
 		.mode		= 0644,
 		.proc_handler	= proc_dointvec,
 	},
+	{
+		.procname	= "tcp_probe_threshold",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_threshold,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
+	{
+		.procname	= "tcp_probe_interval",
+		.data		= &init_net.ipv4.sysctl_tcp_probe_interval,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec,
+	},
 	{ }
 };
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 5a2dfed..f0c6fc3 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -2460,6 +2460,8 @@
 	}
 	net->ipv4.sysctl_tcp_ecn = 2;
 	net->ipv4.sysctl_tcp_base_mss = TCP_BASE_MSS;
+	net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
+	net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
 	return 0;
 
 fail:
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index 8bbd86c..5a73ad5 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -1354,6 +1354,8 @@
 			       icsk->icsk_af_ops->net_header_len;
 	icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, net->ipv4.sysctl_tcp_base_mss);
 	icsk->icsk_mtup.probe_size = 0;
+	if (icsk->icsk_mtup.enabled)
+		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 }
 EXPORT_SYMBOL(tcp_mtup_init);
 
@@ -1828,6 +1830,31 @@
 	return false;
 }
 
+static inline void tcp_mtu_check_reprobe(struct sock *sk)
+{
+	struct inet_connection_sock *icsk = inet_csk(sk);
+	struct tcp_sock *tp = tcp_sk(sk);
+	struct net *net = sock_net(sk);
+	u32 interval;
+	s32 delta;
+
+	interval = net->ipv4.sysctl_tcp_probe_interval;
+	delta = tcp_time_stamp - icsk->icsk_mtup.probe_timestamp;
+	if (unlikely(delta >= interval * HZ)) {
+		int mss = tcp_current_mss(sk);
+
+		/* Update current search range */
+		icsk->icsk_mtup.probe_size = 0;
+		icsk->icsk_mtup.search_high = tp->rx_opt.mss_clamp +
+			sizeof(struct tcphdr) +
+			icsk->icsk_af_ops->net_header_len;
+		icsk->icsk_mtup.search_low = tcp_mss_to_mtu(sk, mss);
+
+		/* Update probe time stamp */
+		icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
+	}
+}
+
 /* Create a new MTU probe if we are ready.
  * MTU probe is regularly attempting to increase the path MTU by
  * deliberately sending larger packets.  This discovers routing
@@ -1842,11 +1869,13 @@
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct inet_connection_sock *icsk = inet_csk(sk);
 	struct sk_buff *skb, *nskb, *next;
+	struct net *net = sock_net(sk);
 	int len;
 	int probe_size;
 	int size_needed;
 	int copy;
 	int mss_now;
+	int interval;
 
 	/* Not currently probing/verifying,
 	 * not in recovery,
@@ -1859,12 +1888,25 @@
 	    tp->rx_opt.num_sacks || tp->rx_opt.dsack)
 		return -1;
 
-	/* Very simple search strategy: just double the MSS. */
+	/* Use binary search for probe_size between tcp_mss_base,
+	 * and current mss_clamp. if (search_high - search_low)
+	 * smaller than a threshold, backoff from probing.
+	 */
 	mss_now = tcp_current_mss(sk);
-	probe_size = 2 * tp->mss_cache;
+	probe_size = tcp_mtu_to_mss(sk, (icsk->icsk_mtup.search_high +
+				    icsk->icsk_mtup.search_low) >> 1);
 	size_needed = probe_size + (tp->reordering + 1) * tp->mss_cache;
-	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high)) {
-		/* TODO: set timer for probe_converge_event */
+	interval = icsk->icsk_mtup.search_high - icsk->icsk_mtup.search_low;
+	/* When misfortune happens, we are reprobing actively,
+	 * and then reprobe timer has expired. We stick with current
+	 * probing process by not resetting search range to its orignal.
+	 */
+	if (probe_size > tcp_mtu_to_mss(sk, icsk->icsk_mtup.search_high) ||
+		interval < net->ipv4.sysctl_tcp_probe_threshold) {
+		/* Check whether enough time has elaplased for
+		 * another round of probing.
+		 */
+		tcp_mtu_check_reprobe(sk);
 		return -1;
 	}
 
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index 0732b78..1550593 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -107,6 +107,7 @@
 	if (net->ipv4.sysctl_tcp_mtu_probing) {
 		if (!icsk->icsk_mtup.enabled) {
 			icsk->icsk_mtup.enabled = 1;
+			icsk->icsk_mtup.probe_timestamp = tcp_time_stamp;
 			tcp_sync_mss(sk, icsk->icsk_pmtu_cookie);
 		} else {
 			struct net *net = sock_net(sk);
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index f4fd575..19e4e72 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -309,8 +309,12 @@
 	 * IPv4 FIB offloading has been disabled completely.
 	 */
 
-	if (fi->fib_net->ipv4.fib_has_custom_rules |
-	    fi->fib_net->ipv4.fib_offload_disabled)
+#ifdef CONFIG_IP_MULTIPLE_TABLES
+	if (fi->fib_net->ipv4.fib_has_custom_rules)
+		return 0;
+#endif
+
+	if (fi->fib_net->ipv4.fib_offload_disabled)
 		return 0;
 
 	dev = netdev_switch_get_dev_by_nhs(fi);