[IPV4]: Aggregate route entries with different TOS values

When we get an ICMP need-to-frag message, the original TOS value in the
ICMP payload cannot be used as a key to look up the routes to update.
This is because the TOS field may have been modified by routers on the
way.  Similarly, ip_rt_redirect should also ignore the TOS as the router
that gave us the message may have modified the TOS value.

The patch achieves this objective by aggregating entries that differ
only in their TOS values into the same bucket.  This
makes it easy to update them at the same time when an ICMP message is
received.

In future we should use a twin-hashing scheme where the aggregation
occurs at the entry level.  That is, the TOS goes back into the hash
for normal lookups while ICMP lookups will end up with a node that
gives us a list that contains all other route entries that differ
only by TOS.

Signed-off-by: Ilia Sotnikov <hostcc@gmail.com>
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index fca5fe0..94fcbc5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -55,6 +55,8 @@
  *	Robert Olsson		:	Added rt_cache statistics
  *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
  *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
+ * 	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
+ * 	Ilia Sotnikov		:	Removed TOS from hash calculations
  *
  *		This program is free software; you can redistribute it and/or
  *		modify it under the terms of the GNU General Public License
@@ -247,9 +249,9 @@
 static int rt_intern_hash(unsigned hash, struct rtable *rth,
 				struct rtable **res);
 
-static unsigned int rt_hash_code(u32 daddr, u32 saddr, u8 tos)
+static unsigned int rt_hash_code(u32 daddr, u32 saddr)
 {
-	return (jhash_3words(daddr, saddr, (u32) tos, rt_hash_rnd)
+	return (jhash_2words(daddr, saddr, rt_hash_rnd)
 		& rt_hash_mask);
 }
 
@@ -1111,7 +1113,7 @@
 }
 
 void ip_rt_redirect(u32 old_gw, u32 daddr, u32 new_gw,
-		    u32 saddr, u8 tos, struct net_device *dev)
+		    u32 saddr, struct net_device *dev)
 {
 	int i, k;
 	struct in_device *in_dev = in_dev_get(dev);
@@ -1119,8 +1121,6 @@
 	u32  skeys[2] = { saddr, 0 };
 	int  ikeys[2] = { dev->ifindex, 0 };
 
-	tos &= IPTOS_RT_MASK;
-
 	if (!in_dev)
 		return;
 
@@ -1141,8 +1141,7 @@
 	for (i = 0; i < 2; i++) {
 		for (k = 0; k < 2; k++) {
 			unsigned hash = rt_hash_code(daddr,
-						     skeys[i] ^ (ikeys[k] << 5),
-						     tos);
+						     skeys[i] ^ (ikeys[k] << 5));
 
 			rthp=&rt_hash_table[hash].chain;
 
@@ -1152,7 +1151,6 @@
 
 				if (rth->fl.fl4_dst != daddr ||
 				    rth->fl.fl4_src != skeys[i] ||
-				    rth->fl.fl4_tos != tos ||
 				    rth->fl.oif != ikeys[k] ||
 				    rth->fl.iif != 0) {
 					rthp = &rth->u.rt_next;
@@ -1232,10 +1230,9 @@
 	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit())
 		printk(KERN_INFO "Redirect from %u.%u.%u.%u on %s about "
 			"%u.%u.%u.%u ignored.\n"
-			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u, "
-			"tos %02x\n",
+			"  Advised path = %u.%u.%u.%u -> %u.%u.%u.%u\n",
 		       NIPQUAD(old_gw), dev->name, NIPQUAD(new_gw),
-		       NIPQUAD(saddr), NIPQUAD(daddr), tos);
+		       NIPQUAD(saddr), NIPQUAD(daddr));
 #endif
 	in_dev_put(in_dev);
 }
@@ -1253,8 +1250,7 @@
 			   rt->u.dst.expires) {
 			unsigned hash = rt_hash_code(rt->fl.fl4_dst,
 						     rt->fl.fl4_src ^
-							(rt->fl.oif << 5),
-						     rt->fl.fl4_tos);
+							(rt->fl.oif << 5));
 #if RT_CACHE_DEBUG >= 1
 			printk(KERN_DEBUG "ip_rt_advice: redirect to "
 					  "%u.%u.%u.%u/%02x dropped\n",
@@ -1391,14 +1387,13 @@
 	struct rtable *rth;
 	u32  skeys[2] = { iph->saddr, 0, };
 	u32  daddr = iph->daddr;
-	u8   tos = iph->tos & IPTOS_RT_MASK;
 	unsigned short est_mtu = 0;
 
 	if (ipv4_config.no_pmtu_disc)
 		return 0;
 
 	for (i = 0; i < 2; i++) {
-		unsigned hash = rt_hash_code(daddr, skeys[i], tos);
+		unsigned hash = rt_hash_code(daddr, skeys[i]);
 
 		rcu_read_lock();
 		for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -1407,7 +1402,6 @@
 			    rth->fl.fl4_src == skeys[i] &&
 			    rth->rt_dst  == daddr &&
 			    rth->rt_src  == iph->saddr &&
-			    rth->fl.fl4_tos == tos &&
 			    rth->fl.iif == 0 &&
 			    !(dst_metric_locked(&rth->u.dst, RTAX_MTU))) {
 				unsigned short mtu = new_mtu;
@@ -1658,7 +1652,7 @@
 	RT_CACHE_STAT_INC(in_slow_mc);
 
 	in_dev_put(in_dev);
-	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5), tos);
+	hash = rt_hash_code(daddr, saddr ^ (dev->ifindex << 5));
 	return rt_intern_hash(hash, rth, (struct rtable**) &skb->dst);
 
 e_nobufs:
@@ -1823,7 +1817,7 @@
 		return err;
 
 	/* put it into the cache */
-	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+	hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
 	return rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);	
 }
 
@@ -1864,7 +1858,7 @@
 			return err;
 
 		/* put it into the cache */
-		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5), tos);
+		hash = rt_hash_code(daddr, saddr ^ (fl->iif << 5));
 		err = rt_intern_hash(hash, rth, &rtres);
 		if (err)
 			return err;
@@ -2041,7 +2035,7 @@
 		rth->rt_flags 	&= ~RTCF_LOCAL;
 	}
 	rth->rt_type	= res.type;
-	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5), tos);
+	hash = rt_hash_code(daddr, saddr ^ (fl.iif << 5));
 	err = rt_intern_hash(hash, rth, (struct rtable**)&skb->dst);
 	goto done;
 
@@ -2088,7 +2082,7 @@
 	int iif = dev->ifindex;
 
 	tos &= IPTOS_RT_MASK;
-	hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos);
+	hash = rt_hash_code(daddr, saddr ^ (iif << 5));
 
 	rcu_read_lock();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;
@@ -2286,10 +2280,8 @@
 	int err = __mkroute_output(&rth, res, fl, oldflp, dev_out, flags);
 	unsigned hash;
 	if (err == 0) {
-		u32 tos = RT_FL_TOS(oldflp);
-
 		hash = rt_hash_code(oldflp->fl4_dst, 
-				    oldflp->fl4_src ^ (oldflp->oif << 5), tos);
+				    oldflp->fl4_src ^ (oldflp->oif << 5));
 		err = rt_intern_hash(hash, rth, rp);
 	}
 	
@@ -2304,7 +2296,6 @@
 				    unsigned flags)
 {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH_CACHED
-	u32 tos = RT_FL_TOS(oldflp);
 	unsigned char hop;
 	unsigned hash;
 	int err = -EINVAL;
@@ -2334,7 +2325,7 @@
 
 			hash = rt_hash_code(oldflp->fl4_dst, 
 					    oldflp->fl4_src ^
-					    (oldflp->oif << 5), tos);
+					    (oldflp->oif << 5));
 			err = rt_intern_hash(hash, rth, rp);
 
 			/* forward hop information to multipath impl. */
@@ -2563,7 +2554,7 @@
 	unsigned hash;
 	struct rtable *rth;
 
-	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos);
+	hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5));
 
 	rcu_read_lock_bh();
 	for (rth = rcu_dereference(rt_hash_table[hash].chain); rth;