[PATCH] bonding: Validate probe replies in ARP monitor

	Add logic to check ARP request / reply packets used for ARP
monitor link integrity checking.

	The current method simply examines the slave device to see if it
has sent and received traffic; this can be fooled by extraneous traffic.
For example, if multiple hosts running bonding are behind a common
switch, the probe traffic from the multiple instances of bonding will
update the tx/rx times on each other's slave devices.

Signed-off-by: Jay Vosburgh <fubar@us.ibm.com>
Signed-off-by: Jeff Garzik <jeff@garzik.org>
diff --git a/Documentation/networking/bonding.txt b/Documentation/networking/bonding.txt
index afac780..dc942ea 100644
--- a/Documentation/networking/bonding.txt
+++ b/Documentation/networking/bonding.txt
@@ -192,6 +192,17 @@
 arp_interval
 
 	Specifies the ARP link monitoring frequency in milliseconds.
+
+	The ARP monitor works by periodically checking the slave
+	devices to determine whether they have sent or received
+	traffic recently (the precise criteria depend upon the
+	bonding mode, and the state of the slave).  Regular traffic is
+	generated via ARP probes issued for the addresses specified by
+	the arp_ip_target option.
+
+	This behavior can be modified by the arp_validate option,
+	below.
+
 	If ARP monitoring is used in an etherchannel compatible mode
 	(modes 0 and 2), the switch should be configured in a mode
 	that evenly distributes packets across all links. If the
@@ -213,6 +224,54 @@
 	maximum number of targets that can be specified is 16.  The
 	default value is no IP addresses.
 
+arp_validate
+
+	Specifies whether or not ARP probes and replies should be
+	validated in the active-backup mode.  This causes the ARP
+	monitor to examine the incoming ARP requests and replies, and
+	only consider a slave to be up if it is receiving the
+	appropriate ARP traffic.
+
+	Possible values are:
+
+	none or 0
+
+		No validation is performed.  This is the default.
+
+	active or 1
+
+		Validation is performed only for the active slave.
+
+	backup or 2
+
+		Validation is performed only for backup slaves.
+
+	all or 3
+
+		Validation is performed for all slaves.
+
+	For the active slave, the validation checks ARP replies to
+	confirm that they were generated by an arp_ip_target.  Since
+	backup slaves do not typically receive these replies, the
+	validation performed for backup slaves is on the ARP request
+	sent out via the active slave.  It is possible that some
+	switch or network configurations may result in situations
+	wherein the backup slaves do not receive the ARP requests; in
+	such a situation, validation of backup slaves must be
+	disabled.
+
+	This option is useful in network configurations in which
+	multiple bonding hosts are concurrently issuing ARPs to one or
+	more targets beyond a common switch.  Should the link between
+	the switch and target fail (but not the switch itself), the
+	probe traffic generated by the multiple bonding instances will
+	fool the standard ARP monitor into considering the links as
+	still up.  Use of the arp_validate option can resolve this, as
+	the ARP monitor will only consider ARP requests and replies
+	associated with its own instance of bonding.
+
+	This option was added in bonding version 3.1.0.
+
 downdelay
 
 	Specifies the time, in milliseconds, to wait before disabling
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index bafe62f..fd521b0 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -96,6 +96,7 @@
 static char *xmit_hash_policy = NULL;
 static int arp_interval = BOND_LINK_ARP_INTERV;
 static char *arp_ip_target[BOND_MAX_ARP_TARGETS] = { NULL, };
+static char *arp_validate = NULL;
 struct bond_params bonding_defaults;
 
 module_param(max_bonds, int, 0);
@@ -127,6 +128,8 @@
 MODULE_PARM_DESC(arp_interval, "arp interval in milliseconds");
 module_param_array(arp_ip_target, charp, NULL, 0);
 MODULE_PARM_DESC(arp_ip_target, "arp targets in n.n.n.n form");
+module_param(arp_validate, charp, 0);
+MODULE_PARM_DESC(arp_validate, "validate src/dst of ARP probes: none (default), active, backup or all");
 
 /*----------------------------- Global variables ----------------------------*/
 
@@ -170,6 +173,14 @@
 {	NULL,			-1},
 };
 
+struct bond_parm_tbl arp_validate_tbl[] = {	/* rows are indexed by mode value */
+{	"none",			BOND_ARP_VALIDATE_NONE},
+{	"active",		BOND_ARP_VALIDATE_ACTIVE},
+{	"backup",		BOND_ARP_VALIDATE_BACKUP},
+{	"all",			BOND_ARP_VALIDATE_ALL},
+{	NULL,			-1},	/* terminating entry */
+};
+
 /*-------------------------- Forward declarations ---------------------------*/
 
 static void bond_send_gratuitous_arp(struct bonding *bond);
@@ -1424,6 +1435,8 @@
 
 	bond_compute_features(bond);
 
+	new_slave->last_arp_rx = jiffies;
+
 	if (bond->params.miimon && !bond->params.use_carrier) {
 		link_reporting = bond_check_dev_link(bond, slave_dev, 1);
 
@@ -1785,7 +1798,8 @@
 	dev_set_mac_address(slave_dev, &addr);
 
 	slave_dev->priv_flags &= ~(IFF_MASTER_8023AD | IFF_MASTER_ALB |
-				   IFF_SLAVE_INACTIVE | IFF_BONDING);
+				   IFF_SLAVE_INACTIVE | IFF_BONDING |
+				   IFF_SLAVE_NEEDARP);
 
 	kfree(slave);
 
@@ -2298,6 +2312,25 @@
 	return 0;
 }
 
+/* Return 1 if @ip is bond->master_ip or the vlan_ip of a vlan_list entry. */
+static int bond_has_this_ip(struct bonding *bond, u32 ip)
+{
+	struct vlan_entry *vlan;
+
+	if (ip == bond->master_ip)
+		return 1;
+
+	/* Nothing is unlinked during the walk, so the plain iterator is
+	 * enough; an empty list just runs the loop body zero times.
+	 */
+	list_for_each_entry(vlan, &bond->vlan_list, vlan_list) {
+		if (ip == vlan->vlan_ip)
+			return 1;
+	}
+
+	return 0;
+}
+
 /*
  * We go to the (large) trouble of VLAN tagging ARP frames because
  * switches in VLAN mode (especially if ports are configured as
@@ -2436,6 +2469,93 @@
 	}
 }
 
+/* Stamp slave->last_arp_rx when sip is an arp target and tip is ours. */
+static void bond_validate_arp(struct bonding *bond, struct slave *slave, u32 sip, u32 tip)
+{
+	int i;
+	u32 *targets = bond->params.arp_targets;
+
+	for (i = 0; (i < BOND_MAX_ARP_TARGETS) && targets[i]; i++) {
+		dprintk("bva: sip %u.%u.%u.%u tip %u.%u.%u.%u t[%d] "
+			"%u.%u.%u.%u bhti(tip) %d\n",
+		       NIPQUAD(sip), NIPQUAD(tip), i, NIPQUAD(targets[i]),
+		       bond_has_this_ip(bond, tip));
+		if (sip == targets[i]) {
+			if (bond_has_this_ip(bond, tip))
+				slave->last_arp_rx = jiffies;
+			return;	/* sip matched a target: stop, stamped or not */
+		}
+	}
+}
+
+/* ARP packet handler: hands sip/tip of acceptable frames to bond_validate_arp(). */
+static int bond_arp_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev)
+{
+	struct arphdr *arp;
+	struct slave *slave;
+	struct bonding *bond;
+	unsigned char *arp_ptr;
+	u32 sip, tip;
+
+	if (!(dev->priv_flags & IFF_BONDING) || !(dev->flags & IFF_MASTER))
+		goto out;	/* dev is not a bonding master */
+
+	bond = dev->priv;
+	read_lock(&bond->lock);
+
+	dprintk("bond_arp_rcv: bond %s skb->dev %s orig_dev %s\n",
+		bond->dev->name, skb->dev ? skb->dev->name : "NULL",
+		orig_dev ? orig_dev->name : "NULL");
+
+	slave = bond_get_slave_by_dev(bond, orig_dev);
+	if (!slave || !slave_do_arp_validate(bond, slave))
+		goto out_unlock;	/* no slave found, or validation off for its state */
+
+	/* ARP header, plus 2 device addresses, plus 2 IP addresses.  */
+	if (!pskb_may_pull(skb, (sizeof(struct arphdr) +
+				 (2 * dev->addr_len) +
+				 (2 * sizeof(u32)))))
+		goto out_unlock;
+
+	arp = skb->nh.arph;
+	if (arp->ar_hln != dev->addr_len ||
+	    skb->pkt_type == PACKET_OTHERHOST ||
+	    skb->pkt_type == PACKET_LOOPBACK ||
+	    arp->ar_hrd != htons(ARPHRD_ETHER) ||
+	    arp->ar_pro != htons(ETH_P_IP) ||
+	    arp->ar_pln != 4)
+		goto out_unlock;
+
+	arp_ptr = (unsigned char *)(arp + 1);
+	arp_ptr += dev->addr_len;	/* skip sender hardware address */
+	memcpy(&sip, arp_ptr, 4);
+	arp_ptr += 4 + dev->addr_len;	/* skip sender IP, target hardware address */
+	memcpy(&tip, arp_ptr, 4);
+
+	dprintk("bond_arp_rcv: %s %s/%d av %d sv %d sip %u.%u.%u.%u"
+		" tip %u.%u.%u.%u\n", bond->dev->name, slave->dev->name,
+		slave->state, bond->params.arp_validate,
+		slave_do_arp_validate(bond, slave), NIPQUAD(sip), NIPQUAD(tip));
+
+	/*
+	 * Backup slaves won't see the ARP reply, but do come through here
+	 * for each ARP probe, so sip/tip are swapped to validate the probe.
+	 * In a "redundant switch, common router" type of configuration,
+	 * the probe will (hopefully) travel from the active, through one
+	 * switch, the router, then the other switch to reach the backup.
+	 */
+	if (slave->state == BOND_STATE_ACTIVE)
+		bond_validate_arp(bond, slave, sip, tip);
+	else
+		bond_validate_arp(bond, slave, tip, sip);
+
+out_unlock:
+	read_unlock(&bond->lock);
+out:
+	dev_kfree_skb(skb);	/* the skb is released on every path */
+	return NET_RX_SUCCESS;
+}
+
 /*
  * this function is called regularly to monitor each slave's link
  * ensuring that traffic is being sent and received when arp monitoring
@@ -2600,7 +2720,8 @@
 	 */
 	bond_for_each_slave(bond, slave, i) {
 		if (slave->link != BOND_LINK_UP) {
-			if ((jiffies - slave->dev->last_rx) <= delta_in_ticks) {
+			if ((jiffies - slave_last_rx(bond, slave)) <=
+			     delta_in_ticks) {
 
 				slave->link = BOND_LINK_UP;
 
@@ -2645,7 +2766,7 @@
 
 			if ((slave != bond->curr_active_slave) &&
 			    (!bond->current_arp_slave) &&
-			    (((jiffies - slave->dev->last_rx) >= 3*delta_in_ticks) &&
+			    (((jiffies - slave_last_rx(bond, slave)) >= 3*delta_in_ticks) &&
 			     bond_has_ip(bond))) {
 				/* a backup slave has gone down; three times
 				 * the delta allows the current slave to be
@@ -2692,7 +2813,7 @@
 		 * if it is up and needs to take over as the curr_active_slave
 		 */
 		if ((((jiffies - slave->dev->trans_start) >= (2*delta_in_ticks)) ||
-	    (((jiffies - slave->dev->last_rx) >= (2*delta_in_ticks)) &&
+	    (((jiffies - slave_last_rx(bond, slave)) >= (2*delta_in_ticks)) &&
 	     bond_has_ip(bond))) &&
 		    ((jiffies - slave->jiffies) >= 2*delta_in_ticks)) {
 
@@ -3315,6 +3436,44 @@
 	dev_remove_pack(&(BOND_AD_INFO(bond).ad_pkt_type));
 }
 
+/*
+ * Install bond_arp_rcv() as the ETH_P_ARP packet handler for this bond.
+ *
+ * Callers do not coordinate with each other (the arp_validate sysfs
+ * store registers without knowing whether the handler was already
+ * installed when the ARP monitor timer was armed), and adding the same
+ * packet_type twice corrupts the handler list.  pt->type is therefore
+ * used as the "installed" marker; this assumes arp_mon_pt starts out
+ * zeroed -- TODO confirm where struct bonding is allocated.
+ */
+void bond_register_arp(struct bonding *bond)
+{
+	struct packet_type *pt = &bond->arp_mon_pt;
+
+	if (pt->type)
+		return;
+
+	pt->type = htons(ETH_P_ARP);
+	pt->dev = NULL;	/* XXX any device (not bond->dev); bond_arp_rcv() filters */
+	pt->func = bond_arp_rcv;
+	dev_add_pack(pt);
+}
+
+/*
+ * Remove the handler installed by bond_register_arp().  Does nothing if
+ * it is not currently installed, so unbalanced calls are harmless.
+ */
+void bond_unregister_arp(struct bonding *bond)
+{
+	struct packet_type *pt = &bond->arp_mon_pt;
+
+	if (!pt->type)
+		return;
+
+	dev_remove_pack(pt);
+	pt->type = 0;
+}
+
 /*---------------------------- Hashing Policies -----------------------------*/
 
 /*
@@ -3401,6 +3537,9 @@
 		} else {
 			arp_timer->function = (void *)&bond_loadbalance_arp_mon;
 		}
+		if (bond->params.arp_validate)
+			bond_register_arp(bond);
+
 		add_timer(arp_timer);
 	}
 
@@ -3428,6 +3567,9 @@
 		bond_unregister_lacpdu(bond);
 	}
 
+	if (bond->params.arp_validate)
+		bond_unregister_arp(bond);
+
 	write_lock_bh(&bond->lock);
 
 
@@ -4281,6 +4423,8 @@
 
 static int bond_check_params(struct bond_params *params)
 {
+	int arp_validate_value;
+
 	/*
 	 * Convert string parameters.
 	 */
@@ -4484,6 +4628,29 @@
 		arp_interval = 0;
 	}
 
+	if (arp_validate) {
+		if (bond_mode != BOND_MODE_ACTIVEBACKUP) {
+			printk(KERN_ERR DRV_NAME
+	       ": arp_validate only supported in active-backup mode\n");
+			return -EINVAL;
+		}
+		if (!arp_interval) {
+			printk(KERN_ERR DRV_NAME
+			       ": arp_validate requires arp_interval\n");
+			return -EINVAL;
+		}
+
+		arp_validate_value = bond_parse_parm(arp_validate,
+						     arp_validate_tbl);
+		if (arp_validate_value == -1) {
+			printk(KERN_ERR DRV_NAME
+			       ": Error: invalid arp_validate \"%s\"\n",
+			       arp_validate == NULL ? "NULL" : arp_validate);
+			return -EINVAL;
+		}
+	} else
+		arp_validate_value = 0;
+
 	if (miimon) {
 		printk(KERN_INFO DRV_NAME
 		       ": MII link monitoring set to %d ms\n",
@@ -4492,8 +4659,10 @@
 		int i;
 
 		printk(KERN_INFO DRV_NAME
-		       ": ARP monitoring set to %d ms with %d target(s):",
-		       arp_interval, arp_ip_count);
+		       ": ARP monitoring set to %d ms, validate %s, with %d target(s):",
+		       arp_interval,
+		       arp_validate_tbl[arp_validate_value].modename,
+		       arp_ip_count);
 
 		for (i = 0; i < arp_ip_count; i++)
 			printk (" %s", arp_ip_target[i]);
@@ -4527,6 +4696,7 @@
 	params->xmit_policy = xmit_hashtype;
 	params->miimon = miimon;
 	params->arp_interval = arp_interval;
+	params->arp_validate = arp_validate_value;
 	params->updelay = updelay;
 	params->downdelay = downdelay;
 	params->use_carrier = use_carrier;
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 15b6a29..ced9ed8 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -51,6 +51,7 @@
 extern struct bond_parm_tbl bond_mode_tbl[];
 extern struct bond_parm_tbl bond_lacp_tbl[];
 extern struct bond_parm_tbl xmit_hashtype_tbl[];
+extern struct bond_parm_tbl arp_validate_tbl[];
 
 static int expected_refcount = -1;
 static struct class *netdev_class;
@@ -503,6 +504,53 @@
 static CLASS_DEVICE_ATTR(xmit_hash_policy, S_IRUGO | S_IWUSR, bonding_show_xmit_hash, bonding_store_xmit_hash);
 
 /*
+ * Show and set arp_validate.
+ */
+static ssize_t bonding_show_arp_validate(struct class_device *cd, char *buf)
+{
+	struct bonding *bond = to_bond(cd);
+
+	return sprintf(buf, "%s %d\n",	/* byte count excludes the NUL */
+		       arp_validate_tbl[bond->params.arp_validate].modename,
+		       bond->params.arp_validate);
+}
+
+/* Store handler for arp_validate: parse, vet, then (un)hook the ARP handler. */
+static ssize_t bonding_store_arp_validate(struct class_device *cd, const char *buf, size_t count)
+{
+	struct bonding *bond = to_bond(cd);
+	int requested = bond_parse_parm((char *)buf, arp_validate_tbl);
+
+	if (requested < 0) {
+		printk(KERN_ERR DRV_NAME
+		       ": %s: Ignoring invalid arp_validate value %s\n",
+		       bond->dev->name, buf);
+		return -EINVAL;
+	}
+	if (bond->params.mode != BOND_MODE_ACTIVEBACKUP && requested) {
+		printk(KERN_ERR DRV_NAME
+		       ": %s: arp_validate only supported in active-backup mode.\n",
+		       bond->dev->name);
+		return -EINVAL;
+	}
+	printk(KERN_INFO DRV_NAME ": %s: setting arp_validate to %s (%d).\n",
+	       bond->dev->name, arp_validate_tbl[requested].modename,
+	       requested);
+
+	/* Only an off<->on transition changes whether the handler is needed. */
+	if (!bond->params.arp_validate && requested)
+		bond_register_arp(bond);
+	else if (bond->params.arp_validate && !requested)
+		bond_unregister_arp(bond);
+
+	bond->params.arp_validate = requested;
+
+	return count;
+}
+
+static CLASS_DEVICE_ATTR(arp_validate, S_IRUGO | S_IWUSR, bonding_show_arp_validate, bonding_store_arp_validate);
+
+/*
  * Show and set the arp timer interval.  There are two tricky bits
  * here.  First, if ARP monitoring is activated, then we must disable
  * MII monitoring.  Second, if the ARP timer isn't running, we must
@@ -914,6 +962,11 @@
 			       "ARP monitoring. Disabling ARP monitoring...\n",
 			       bond->dev->name);
 			bond->params.arp_interval = 0;
+			if (bond->params.arp_validate) {
+				bond_unregister_arp(bond);
+				bond->params.arp_validate =
+					BOND_ARP_VALIDATE_NONE;
+			}
 			/* Kill ARP timer, else it brings bond's link down */
 			if (bond->mii_timer.function) {
 				printk(KERN_INFO DRV_NAME
@@ -1273,6 +1326,7 @@
 static struct attribute *per_bond_attrs[] = {
 	&class_device_attr_slaves.attr,
 	&class_device_attr_mode.attr,
+	&class_device_attr_arp_validate.attr,
 	&class_device_attr_arp_interval.attr,
 	&class_device_attr_arp_ip_target.attr,
 	&class_device_attr_downdelay.attr,
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 17caafe..db16fee 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -22,8 +22,8 @@
 #include "bond_3ad.h"
 #include "bond_alb.h"
 
-#define DRV_VERSION	"3.0.3"
-#define DRV_RELDATE	"March 23, 2006"
+#define DRV_VERSION	"3.1.0-test"
+#define DRV_RELDATE	"September 9, 2006"
 #define DRV_NAME	"bonding"
 #define DRV_DESCRIPTION	"Ethernet Channel Bonding Driver"
 
@@ -126,6 +126,7 @@
 	int xmit_policy;
 	int miimon;
 	int arp_interval;
+	int arp_validate;
 	int use_carrier;
 	int updelay;
 	int downdelay;
@@ -151,6 +152,7 @@
 	struct slave *prev;
 	int    delay;
 	u32    jiffies;
+	u32    last_arp_rx;
 	s8     link;    /* one of BOND_LINK_XXXX */
 	s8     state;   /* one of BOND_STATE_XXXX */
 	u32    original_flags;
@@ -198,6 +200,7 @@
 	struct   bond_params params;
 	struct   list_head vlan_list;
 	struct   vlan_group *vlgrp;
+	struct   packet_type arp_mon_pt;
 };
 
 /**
@@ -228,6 +231,40 @@
 	return (struct bonding *)slave->dev->master->priv;
 }
 
+/*
+ * arp_validate is a bitmask keyed by slave state: bit (1 << state) being
+ * set means ARP traffic is validated for slaves in that state.
+ */
+#define BOND_ARP_VALIDATE_NONE		0
+#define BOND_ARP_VALIDATE_ACTIVE	(1 << BOND_STATE_ACTIVE)
+#define BOND_ARP_VALIDATE_BACKUP	(1 << BOND_STATE_BACKUP)
+#define BOND_ARP_VALIDATE_ALL		(BOND_ARP_VALIDATE_ACTIVE | \
+					 BOND_ARP_VALIDATE_BACKUP)
+
+/* Nonzero if ARP validation is enabled for @slave's current state. */
+static inline int slave_do_arp_validate(struct bonding *bond, struct slave *slave)
+{
+	return bond->params.arp_validate & (1 << slave->state);
+}
+
+/*
+ * Timestamp (in jiffies) the ARP monitor should treat as @slave's last
+ * receive: last_arp_rx when validation applies to the slave, otherwise
+ * the device's own last_rx.
+ *
+ * Returns unsigned long so that dev->last_rx is not truncated before
+ * callers subtract it from jiffies on 64-bit builds.
+ * NOTE(review): slave->last_arp_rx is declared u32 and has the same
+ * truncation problem there; it should be widened to unsigned long too.
+ */
+static inline unsigned long slave_last_rx(struct bonding *bond, struct slave *slave)
+{
+	if (slave_do_arp_validate(bond, slave))
+		return slave->last_arp_rx;
+
+	return slave->dev->last_rx;
+}
+
 static inline void bond_set_slave_inactive_flags(struct slave *slave)
 {
 	struct bonding *bond = slave->dev->master->priv;
@@ -235,12 +257,14 @@
 	    bond->params.mode != BOND_MODE_ALB)
 		slave->state = BOND_STATE_BACKUP;
 	slave->dev->priv_flags |= IFF_SLAVE_INACTIVE;
+	if (slave_do_arp_validate(bond, slave))
+		slave->dev->priv_flags |= IFF_SLAVE_NEEDARP;
 }
 
 static inline void bond_set_slave_active_flags(struct slave *slave)
 {
 	slave->state = BOND_STATE_ACTIVE;
-	slave->dev->priv_flags &= ~IFF_SLAVE_INACTIVE;
+	slave->dev->priv_flags &= ~(IFF_SLAVE_INACTIVE | IFF_SLAVE_NEEDARP);
 }
 
 static inline void bond_set_master_3ad_flags(struct bonding *bond)
@@ -284,6 +308,8 @@
 const char *bond_mode_name(int mode);
 void bond_select_active_slave(struct bonding *bond);
 void bond_change_active_slave(struct bonding *bond, struct slave *new_active);
+void bond_register_arp(struct bonding *);
+void bond_unregister_arp(struct bonding *);
 
 #endif /* _LINUX_BONDING_H */
 
diff --git a/include/linux/if.h b/include/linux/if.h
index a023ec1..8018c2e 100644
--- a/include/linux/if.h
+++ b/include/linux/if.h
@@ -60,6 +60,7 @@
 #define IFF_MASTER_8023AD	0x8	/* bonding master, 802.3ad. 	*/
 #define IFF_MASTER_ALB	0x10		/* bonding master, balance-alb.	*/
 #define IFF_BONDING	0x20		/* bonding master or slave	*/
+#define IFF_SLAVE_NEEDARP 0x40		/* need ARPs for validation	*/
 
 #define IF_GET_IFACE	0x0001		/* for querying only */
 #define IF_GET_PROTO	0x0002
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 4328912..afd80ef 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1016,7 +1016,8 @@
 }
 
 /* On bonding slaves other than the currently active slave, suppress
- * duplicates except for 802.3ad ETH_P_SLOW and alb non-mcast/bcast.
+ * duplicates except for 802.3ad ETH_P_SLOW, alb non-mcast/bcast, and
+ * ARP on active-backup slaves with arp_validate enabled.
  */
 static inline int skb_bond_should_drop(struct sk_buff *skb)
 {
@@ -1025,6 +1026,10 @@
 
 	if (master &&
 	    (dev->priv_flags & IFF_SLAVE_INACTIVE)) {
+		if ((dev->priv_flags & IFF_SLAVE_NEEDARP) &&
+		    skb->protocol == __constant_htons(ETH_P_ARP))
+			return 0;
+
 		if (master->priv_flags & IFF_MASTER_ALB) {
 			if (skb->pkt_type != PACKET_BROADCAST &&
 			    skb->pkt_type != PACKET_MULTICAST)