geneve: Add support to collect tunnel metadata.

Following patch create new tunnel flag which enable
tunnel metadata collection on given device. These devices
can be used by tunnel metadata based routing or by OVS.
Geneve Consolidation patch get rid of collect_md_tun to
simplify tunnel lookup further.

Signed-off-by: Pravin B Shelar <pshelar@nicira.com>
Reviewed-by: Jesse Gross <jesse@nicira.com>
Acked-by: Thomas Graf <tgraf@suug.ch>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 0a6d974..d05150c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -15,6 +15,7 @@
 #include <linux/netdevice.h>
 #include <linux/etherdevice.h>
 #include <linux/hash.h>
+#include <net/dst_metadata.h>
 #include <net/rtnetlink.h>
 #include <net/geneve.h>
 
@@ -36,6 +37,7 @@
 struct geneve_net {
 	struct list_head  geneve_list;
 	struct hlist_head vni_list[VNI_HASH_SIZE];
+	struct geneve_dev __rcu *collect_md_tun;
 };
 
 /* Pseudo network device */
@@ -50,6 +52,7 @@
 	struct sockaddr_in remote;	/* IPv4 address for link partner */
 	struct list_head   next;	/* geneve's per namespace list */
 	__be16		   dst_port;
+	bool		   collect_md;
 };
 
 static int geneve_net_id;
@@ -62,48 +65,95 @@
 	return hash_32(vnid, VNI_HASH_BITS);
 }
 
-/* geneve receive/decap routine */
-static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+static __be64 vni_to_tunnel_id(const __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	return (vni[0] << 16) | (vni[1] << 8) | vni[2];
+#else
+	return (__force __be64)(((__force u64)vni[0] << 40) |
+				((__force u64)vni[1] << 48) |
+				((__force u64)vni[2] << 56));
+#endif
+}
+
+static struct geneve_dev *geneve_lookup(struct geneve_net *gn,
+					struct geneve_sock *gs,
+					struct iphdr *iph,
+					struct genevehdr *gnvh)
 {
 	struct inet_sock *sk = inet_sk(gs->sock->sk);
-	struct genevehdr *gnvh = geneve_hdr(skb);
-	struct geneve_dev *dummy, *geneve = NULL;
-	struct geneve_net *gn;
-	struct iphdr *iph = NULL;
-	struct pcpu_sw_netstats *stats;
 	struct hlist_head *vni_list_head;
-	int err = 0;
+	struct geneve_dev *geneve;
 	__u32 hash;
 
-	iph = ip_hdr(skb); /* Still outer IP header... */
-
-	gn = gs->rcv_data;
+	geneve = rcu_dereference(gn->collect_md_tun);
+	if (geneve)
+		return geneve;
 
 	/* Find the device for this VNI */
 	hash = geneve_net_vni_hash(gnvh->vni);
 	vni_list_head = &gn->vni_list[hash];
-	hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
-		if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) &&
-		    iph->saddr == dummy->remote.sin_addr.s_addr &&
-		    sk->inet_sport == dummy->dst_port) {
-			geneve = dummy;
-			break;
+	hlist_for_each_entry_rcu(geneve, vni_list_head, hlist) {
+		if (!memcmp(gnvh->vni, geneve->vni, sizeof(geneve->vni)) &&
+		    iph->saddr == geneve->remote.sin_addr.s_addr &&
+		    sk->inet_sport == geneve->dst_port) {
+			return geneve;
 		}
 	}
+	return NULL;
+}
+
+/* geneve receive/decap routine */
+static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
+{
+	struct genevehdr *gnvh = geneve_hdr(skb);
+	struct metadata_dst *tun_dst = NULL;
+	struct geneve_dev *geneve = NULL;
+	struct pcpu_sw_netstats *stats;
+	struct geneve_net *gn;
+	struct iphdr *iph;
+	int err;
+
+	iph = ip_hdr(skb); /* Still outer IP header... */
+	gn = gs->rcv_data;
+	geneve = geneve_lookup(gn, gs, iph, gnvh);
 	if (!geneve)
 		goto drop;
 
-	/* Drop packets w/ critical options,
-	 * since we don't support any...
-	 */
-	if (gnvh->critical)
-		goto drop;
+	if (ip_tunnel_collect_metadata() || geneve->collect_md) {
+		__be16 flags;
+		void *opts;
+
+		flags = TUNNEL_KEY | TUNNEL_GENEVE_OPT |
+			(gnvh->oam ? TUNNEL_OAM : 0) |
+			(gnvh->critical ? TUNNEL_CRIT_OPT : 0);
+
+		tun_dst = udp_tun_rx_dst(skb, AF_INET, flags,
+					 vni_to_tunnel_id(gnvh->vni),
+					 gnvh->opt_len * 4);
+		if (!tun_dst)
+			goto drop;
+
+		/* Update tunnel dst according to Geneve options. */
+		opts = ip_tunnel_info_opts(&tun_dst->u.tun_info,
+					   gnvh->opt_len * 4);
+		memcpy(opts, gnvh->options, gnvh->opt_len * 4);
+	} else {
+		/* Drop packets w/ critical options,
+		 * since we don't support any...
+		 */
+		if (gnvh->critical)
+			goto drop;
+	}
 
 	skb_reset_mac_header(skb);
 	skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
 	skb->protocol = eth_type_trans(skb, geneve->dev);
 	skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
 
+	if (tun_dst)
+		skb_dst_set(skb, &tun_dst->dst);
+
 	/* Ignore packet loops (and multicast echo) */
 	if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
 		goto drop;
@@ -131,7 +181,6 @@
 	u64_stats_update_end(&stats->syncp);
 
 	netif_rx(skb);
-
 	return;
 drop:
 	/* Consume bad packet */
@@ -144,7 +193,6 @@
 	dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
 	if (!dev->tstats)
 		return -ENOMEM;
-
 	return 0;
 }
 
@@ -180,69 +228,137 @@
 	return 0;
 }
 
+static struct rtable *geneve_get_rt(struct sk_buff *skb,
+				    struct net_device *dev,
+				    struct flowi4 *fl4,
+				    struct ip_tunnel_info *info)
+{
+	struct geneve_dev *geneve = netdev_priv(dev);
+	struct rtable *rt = NULL;
+	__u8 tos;
+
+	memset(fl4, 0, sizeof(*fl4));
+	fl4->flowi4_mark = skb->mark;
+	fl4->flowi4_proto = IPPROTO_UDP;
+
+	if (info) {
+		fl4->daddr = info->key.u.ipv4.dst;
+		fl4->saddr = info->key.u.ipv4.src;
+		fl4->flowi4_tos = RT_TOS(info->key.tos);
+	} else {
+		tos = geneve->tos;
+		if (tos == 1) {
+			const struct iphdr *iip = ip_hdr(skb);
+
+			tos = ip_tunnel_get_dsfield(iip, skb);
+		}
+
+		fl4->flowi4_tos = RT_TOS(tos);
+		fl4->daddr = geneve->remote.sin_addr.s_addr;
+	}
+
+	rt = ip_route_output_key(geneve->net, fl4);
+	if (IS_ERR(rt)) {
+		netdev_dbg(dev, "no route to %pI4\n", &fl4->daddr);
+		dev->stats.tx_carrier_errors++;
+		return rt;
+	}
+	if (rt->dst.dev == dev) { /* is this necessary? */
+		netdev_dbg(dev, "circular route to %pI4\n", &fl4->daddr);
+		dev->stats.collisions++;
+		ip_rt_put(rt);
+		return ERR_PTR(-EINVAL);
+	}
+
+	return rt;
+}
+
+/* Convert 64 bit tunnel ID to 24 bit VNI. */
+static void tunnel_id_to_vni(__be64 tun_id, __u8 *vni)
+{
+#ifdef __BIG_ENDIAN
+	vni[0] = (__force __u8)(tun_id >> 16);
+	vni[1] = (__force __u8)(tun_id >> 8);
+	vni[2] = (__force __u8)tun_id;
+#else
+	vni[0] = (__force __u8)((__force u64)tun_id >> 40);
+	vni[1] = (__force __u8)((__force u64)tun_id >> 48);
+	vni[2] = (__force __u8)((__force u64)tun_id >> 56);
+#endif
+}
+
 static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 	struct geneve_sock *gs = geneve->sock;
+	struct ip_tunnel_info *info = NULL;
 	struct rtable *rt = NULL;
 	const struct iphdr *iip; /* interior IP header */
 	struct flowi4 fl4;
-	int err;
-	__be16 sport;
 	__u8 tos, ttl;
+	__be16 sport;
+	bool xnet;
+	int err;
 
-	iip = ip_hdr(skb);
+	sport = udp_flow_src_port(geneve->net, skb, 1, USHRT_MAX, true);
 
-	skb_reset_mac_header(skb);
+	if (geneve->collect_md) {
+		info = skb_tunnel_info(skb);
+		if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) {
+			netdev_dbg(dev, "no tunnel metadata\n");
+			goto tx_error;
+		}
+	}
 
-	/* TODO: port min/max limits should be configurable */
-	sport = udp_flow_src_port(dev_net(dev), skb, 0, 0, true);
-
-	tos = geneve->tos;
-	if (tos == 1)
-		tos = ip_tunnel_get_dsfield(iip, skb);
-
-	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_tos = RT_TOS(tos);
-	fl4.daddr = geneve->remote.sin_addr.s_addr;
-	fl4.flowi4_mark = skb->mark;
-	fl4.flowi4_proto = IPPROTO_UDP;
-
-	rt = ip_route_output_key(geneve->net, &fl4);
+	rt = geneve_get_rt(skb, dev, &fl4, info);
 	if (IS_ERR(rt)) {
 		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
 		dev->stats.tx_carrier_errors++;
 		goto tx_error;
 	}
-	if (rt->dst.dev == dev) { /* is this necessary? */
-		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
-		dev->stats.collisions++;
-		goto rt_tx_error;
+	skb_reset_mac_header(skb);
+	xnet = !net_eq(geneve->net, dev_net(geneve->dev));
+
+	if (info) {
+		const struct ip_tunnel_key *key = &info->key;
+		bool udp_csum;
+		u8 *opts = NULL;
+		u8 vni[3];
+		__be16 df;
+
+		tunnel_id_to_vni(key->tun_id, vni);
+		df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
+		udp_csum = !!(key->tun_flags & TUNNEL_CSUM);
+
+		if (key->tun_flags & TUNNEL_GENEVE_OPT)
+			opts = ip_tunnel_info_opts(info, info->options_len);
+
+		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
+				      key->tos, key->ttl, df,
+				      sport, geneve->dst_port,
+				      key->tun_flags, vni,
+				      info->options_len, opts, udp_csum, xnet);
+	} else {
+		iip = ip_hdr(skb);
+		tos = ip_tunnel_ecn_encap(fl4.flowi4_tos, iip, skb);
+
+		ttl = geneve->ttl;
+		if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
+			ttl = 1;
+
+		ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
+
+		/* no need to handle local destination and encap bypass...yet... */
+		err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr, tos,
+				      ttl, 0, sport, geneve->dst_port, 0,
+				      geneve->vni, 0, NULL, false, xnet);
 	}
-
-	tos = ip_tunnel_ecn_encap(tos, iip, skb);
-
-	ttl = geneve->ttl;
-	if (!ttl && IN_MULTICAST(ntohl(fl4.daddr)))
-		ttl = 1;
-
-	ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
-
-	/* no need to handle local destination and encap bypass...yet... */
-
-	err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
-			      tos, ttl, 0, sport, geneve->dst_port, 0,
-	                      geneve->vni, 0, NULL, false,
-	                      !net_eq(geneve->net, dev_net(geneve->dev)));
 	if (err < 0)
 		ip_rt_put(rt);
 
 	iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
-
 	return NETDEV_TX_OK;
 
-rt_tx_error:
-	ip_rt_put(rt);
 tx_error:
 	dev->stats.tx_errors++;
 	dev_kfree_skb(skb);
@@ -312,6 +428,7 @@
 	[IFLA_GENEVE_TTL]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_TOS]		= { .type = NLA_U8 },
 	[IFLA_GENEVE_PORT]		= { .type = NLA_U16 },
+	[IFLA_GENEVE_COLLECT_METADATA]	= { .type = NLA_FLAG },
 };
 
 static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
@@ -337,71 +454,112 @@
 	return 0;
 }
 
-static int geneve_newlink(struct net *net, struct net_device *dev,
-			 struct nlattr *tb[], struct nlattr *data[])
+static int geneve_configure(struct net *net, struct net_device *dev,
+			    __be32 rem_addr, __u32 vni, __u8 ttl, __u8 tos,
+			    __u16 dst_port, bool metadata)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);
 	struct geneve_dev *dummy, *geneve = netdev_priv(dev);
 	struct hlist_head *vni_list_head;
 	struct sockaddr_in remote;	/* IPv4 address for link partner */
-	__u32 vni, hash;
-	__be16 dst_port;
+	__u32 hash;
 	int err;
 
-	if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
-		return -EINVAL;
+	if (metadata) {
+		if (rtnl_dereference(gn->collect_md_tun))
+			return -EEXIST;
+		if (!list_empty(&gn->geneve_list))
+			return -EPERM;
+	} else {
+		if (rtnl_dereference(gn->collect_md_tun))
+			return -EPERM;
+	}
 
 	geneve->net = net;
 	geneve->dev = dev;
 
-	vni = nla_get_u32(data[IFLA_GENEVE_ID]);
 	geneve->vni[0] = (vni & 0x00ff0000) >> 16;
 	geneve->vni[1] = (vni & 0x0000ff00) >> 8;
 	geneve->vni[2] =  vni & 0x000000ff;
 
-	geneve->remote.sin_addr.s_addr =
-		nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+	geneve->remote.sin_addr.s_addr = rem_addr;
 	if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
 		return -EINVAL;
 
-	if (data[IFLA_GENEVE_PORT])
-		dst_port = htons(nla_get_u16(data[IFLA_GENEVE_PORT]));
-	else
-		dst_port = htons(GENEVE_UDP_PORT);
-
 	remote = geneve->remote;
+	if (metadata) {
+		if (rem_addr || vni || tos || ttl)
+			return -EINVAL;
+	}
+
 	hash = geneve_net_vni_hash(geneve->vni);
 	vni_list_head = &gn->vni_list[hash];
 	hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
 		if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) &&
 		    !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)) &&
-		    dst_port == dummy->dst_port) {
+		    htons(dst_port) == dummy->dst_port) {
 			return -EBUSY;
 		}
 	}
 
+	geneve->ttl = ttl;
+	geneve->tos = tos;
+	geneve->dst_port = htons(dst_port);
+	geneve->collect_md = metadata;
+
 	err = register_netdevice(dev);
 	if (err)
 		return err;
 
-	if (data[IFLA_GENEVE_TTL])
-		geneve->ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
-
-	if (data[IFLA_GENEVE_TOS])
-		geneve->tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
-
-	geneve->dst_port = dst_port;
 	list_add(&geneve->next, &gn->geneve_list);
-
 	hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
 
+	if (geneve->collect_md)
+		rcu_assign_pointer(gn->collect_md_tun, geneve);
 	return 0;
 }
 
+static int geneve_newlink(struct net *net, struct net_device *dev,
+			  struct nlattr *tb[], struct nlattr *data[])
+{
+	__u16 dst_port = GENEVE_UDP_PORT;
+	__u8 ttl = 0, tos = 0;
+	bool metadata = false;
+	__be32 rem_addr;
+	__u32 vni;
+
+	if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
+		return -EINVAL;
+
+	vni = nla_get_u32(data[IFLA_GENEVE_ID]);
+	rem_addr = nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
+
+	if (data[IFLA_GENEVE_TTL])
+		ttl = nla_get_u8(data[IFLA_GENEVE_TTL]);
+
+	if (data[IFLA_GENEVE_TOS])
+		tos = nla_get_u8(data[IFLA_GENEVE_TOS]);
+
+	if (data[IFLA_GENEVE_PORT])
+		dst_port = nla_get_u16(data[IFLA_GENEVE_PORT]);
+
+	if (data[IFLA_GENEVE_COLLECT_METADATA])
+		metadata = true;
+
+	return geneve_configure(net, dev, rem_addr, vni,
+				ttl, tos, dst_port, metadata);
+}
+
 static void geneve_dellink(struct net_device *dev, struct list_head *head)
 {
 	struct geneve_dev *geneve = netdev_priv(dev);
 
+	if (geneve->collect_md) {
+		struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
+
+		rcu_assign_pointer(gn->collect_md_tun, NULL);
+	}
+
 	if (!hlist_unhashed(&geneve->hlist))
 		hlist_del_rcu(&geneve->hlist);
 
@@ -416,6 +574,7 @@
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TTL */
 		nla_total_size(sizeof(__u8)) +  /* IFLA_GENEVE_TOS */
 		nla_total_size(sizeof(__u16)) +  /* IFLA_GENEVE_PORT */
+		nla_total_size(0) +	 /* IFLA_GENEVE_COLLECT_METADATA */
 		0;
 }
 
@@ -439,6 +598,11 @@
 	if (nla_put_u16(skb, IFLA_GENEVE_PORT, ntohs(geneve->dst_port)))
 		goto nla_put_failure;
 
+	if (geneve->collect_md) {
+		if (nla_put_flag(skb, IFLA_GENEVE_COLLECT_METADATA))
+			goto nla_put_failure;
+	}
+
 	return 0;
 
 nla_put_failure:
@@ -458,6 +622,28 @@
 	.fill_info	= geneve_fill_info,
 };
 
+struct net_device *geneve_dev_create_fb(struct net *net, const char *name,
+					u8 name_assign_type, u16 dst_port)
+{
+	struct nlattr *tb[IFLA_MAX + 1];
+	struct net_device *dev;
+	int err;
+
+	memset(tb, 0, sizeof(tb));
+	dev = rtnl_create_link(net, name, name_assign_type,
+			       &geneve_link_ops, tb);
+	if (IS_ERR(dev))
+		return dev;
+
+	err = geneve_configure(net, dev, 0, 0, 0, 0, dst_port, true);
+	if (err) {
+		free_netdev(dev);
+		return ERR_PTR(err);
+	}
+	return dev;
+}
+EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
+
 static __net_init int geneve_init_net(struct net *net)
 {
 	struct geneve_net *gn = net_generic(net, geneve_net_id);