vxlan: support both IPv4 and IPv6 sockets in a single vxlan device

For metadata based vxlan interface, open both IPv4 and IPv6 socket. This is
much more user friendly: it's not necessary to create two vxlan interfaces
and pay attention to using the right one in routing rules.

Signed-off-by: Jiri Benc <jbenc@redhat.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index a866130..ce704df 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -993,19 +993,30 @@
 static bool vxlan_group_used(struct vxlan_net *vn, struct vxlan_dev *dev)
 {
 	struct vxlan_dev *vxlan;
+	unsigned short family = dev->default_dst.remote_ip.sa.sa_family;
 
 	/* The vxlan_sock is only used by dev, leaving group has
 	 * no effect on other vxlan devices.
 	 */
-	if (atomic_read(&dev->vn_sock->refcnt) == 1)
+	if (family == AF_INET && dev->vn4_sock &&
+	    atomic_read(&dev->vn4_sock->refcnt) == 1)
 		return false;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6 && dev->vn6_sock &&
+	    atomic_read(&dev->vn6_sock->refcnt) == 1)
+		return false;
+#endif
 
 	list_for_each_entry(vxlan, &vn->vxlan_list, next) {
 		if (!netif_running(vxlan->dev) || vxlan == dev)
 			continue;
 
-		if (vxlan->vn_sock != dev->vn_sock)
+		if (family == AF_INET && vxlan->vn4_sock != dev->vn4_sock)
 			continue;
+#if IS_ENABLED(CONFIG_IPV6)
+		if (family == AF_INET6 && vxlan->vn6_sock != dev->vn6_sock)
+			continue;
+#endif
 
 		if (!vxlan_addr_equal(&vxlan->default_dst.remote_ip,
 				      &dev->default_dst.remote_ip))
@@ -1021,16 +1032,16 @@
 	return false;
 }
 
-static void vxlan_sock_release(struct vxlan_dev *vxlan)
+static void __vxlan_sock_release(struct vxlan_sock *vs)
 {
-	struct vxlan_sock *vs = vxlan->vn_sock;
-	struct sock *sk = vs->sock->sk;
-	struct net *net = sock_net(sk);
-	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
+	struct vxlan_net *vn;
 
+	if (!vs)
+		return;
 	if (!atomic_dec_and_test(&vs->refcnt))
 		return;
 
+	vn = net_generic(sock_net(vs->sock->sk), vxlan_net_id);
 	spin_lock(&vn->sock_lock);
 	hlist_del_rcu(&vs->hlist);
 	vxlan_notify_del_rx_port(vs);
@@ -1039,32 +1050,43 @@
 	queue_work(vxlan_wq, &vs->del_work);
 }
 
+static void vxlan_sock_release(struct vxlan_dev *vxlan)
+{
+	__vxlan_sock_release(vxlan->vn4_sock);
+#if IS_ENABLED(CONFIG_IPV6)
+	__vxlan_sock_release(vxlan->vn6_sock);
+#endif
+}
+
 /* Update multicast group membership when first VNI on
  * multicast address is brought up
  */
 static int vxlan_igmp_join(struct vxlan_dev *vxlan)
 {
-	struct vxlan_sock *vs = vxlan->vn_sock;
-	struct sock *sk = vs->sock->sk;
+	struct sock *sk;
 	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
 	int ifindex = vxlan->default_dst.remote_ifindex;
 	int ret = -EINVAL;
 
-	lock_sock(sk);
 	if (ip->sa.sa_family == AF_INET) {
 		struct ip_mreqn mreq = {
 			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
 			.imr_ifindex		= ifindex,
 		};
 
+		sk = vxlan->vn4_sock->sock->sk;
+		lock_sock(sk);
 		ret = ip_mc_join_group(sk, &mreq);
+		release_sock(sk);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
+		sk = vxlan->vn6_sock->sock->sk;
+		lock_sock(sk);
 		ret = ipv6_stub->ipv6_sock_mc_join(sk, ifindex,
 						   &ip->sin6.sin6_addr);
+		release_sock(sk);
 #endif
 	}
-	release_sock(sk);
 
 	return ret;
 }
@@ -1072,27 +1094,30 @@
 /* Inverse of vxlan_igmp_join when last VNI is brought down */
 static int vxlan_igmp_leave(struct vxlan_dev *vxlan)
 {
-	struct vxlan_sock *vs = vxlan->vn_sock;
-	struct sock *sk = vs->sock->sk;
+	struct sock *sk;
 	union vxlan_addr *ip = &vxlan->default_dst.remote_ip;
 	int ifindex = vxlan->default_dst.remote_ifindex;
 	int ret = -EINVAL;
 
-	lock_sock(sk);
 	if (ip->sa.sa_family == AF_INET) {
 		struct ip_mreqn mreq = {
 			.imr_multiaddr.s_addr	= ip->sin.sin_addr.s_addr,
 			.imr_ifindex		= ifindex,
 		};
 
+		sk = vxlan->vn4_sock->sock->sk;
+		lock_sock(sk);
 		ret = ip_mc_leave_group(sk, &mreq);
+		release_sock(sk);
 #if IS_ENABLED(CONFIG_IPV6)
 	} else {
+		sk = vxlan->vn6_sock->sock->sk;
+		lock_sock(sk);
 		ret = ipv6_stub->ipv6_sock_mc_drop(sk, ifindex,
 						   &ip->sin6.sin6_addr);
+		release_sock(sk);
 #endif
 	}
-	release_sock(sk);
 
 	return ret;
 }
@@ -1873,8 +1898,7 @@
 {
 	struct ip_tunnel_info *info;
 	struct vxlan_dev *vxlan = netdev_priv(dev);
-	struct sock *sk = vxlan->vn_sock->sock->sk;
-	unsigned short family = vxlan_get_sk_family(vxlan->vn_sock);
+	struct sock *sk;
 	struct rtable *rt = NULL;
 	const struct iphdr *old_iph;
 	struct flowi4 fl4;
@@ -1901,13 +1925,10 @@
 				  dev->name);
 			goto drop;
 		}
-		if (family != ip_tunnel_info_af(info))
-			goto drop;
-
 		dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
 		vni = be64_to_cpu(info->key.tun_id);
-		remote_ip.sa.sa_family = family;
-		if (family == AF_INET)
+		remote_ip.sa.sa_family = ip_tunnel_info_af(info);
+		if (remote_ip.sa.sa_family == AF_INET)
 			remote_ip.sin.sin_addr.s_addr = info->key.u.ipv4.dst;
 		else
 			remote_ip.sin6.sin6_addr = info->key.u.ipv6.dst;
@@ -1952,6 +1973,10 @@
 	}
 
 	if (dst->sa.sa_family == AF_INET) {
+		if (!vxlan->vn4_sock)
+			goto drop;
+		sk = vxlan->vn4_sock->sock->sk;
+
 		if (info && (info->key.tun_flags & TUNNEL_DONT_FRAGMENT))
 			df = htons(IP_DF);
 
@@ -2013,6 +2038,10 @@
 		struct flowi6 fl6;
 		u32 rt6i_flags;
 
+		if (!vxlan->vn6_sock)
+			goto drop;
+		sk = vxlan->vn6_sock->sock->sk;
+
 		memset(&fl6, 0, sizeof(fl6));
 		fl6.flowi6_oif = rdst ? rdst->remote_ifindex : 0;
 		fl6.daddr = dst->sin6.sin6_addr;
@@ -2204,7 +2233,6 @@
 	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
 	__u32 vni = vxlan->default_dst.remote_vni;
 
-	vxlan->vn_sock = vs;
 	spin_lock(&vn->sock_lock);
 	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));
 	spin_unlock(&vn->sock_lock);
@@ -2535,14 +2563,13 @@
 }
 
 /* Create new listen socket if needed */
-static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port,
-					      u32 flags)
+static struct vxlan_sock *vxlan_socket_create(struct net *net, bool ipv6,
+					      __be16 port, u32 flags)
 {
 	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
 	struct vxlan_sock *vs;
 	struct socket *sock;
 	unsigned int h;
-	bool ipv6 = !!(flags & VXLAN_F_IPV6);
 	struct udp_tunnel_sock_cfg tunnel_cfg;
 
 	vs = kzalloc(sizeof(*vs), GFP_KERNEL);
@@ -2587,11 +2614,10 @@
 	return vs;
 }
 
-static int vxlan_sock_add(struct vxlan_dev *vxlan)
+static int __vxlan_sock_add(struct vxlan_dev *vxlan, bool ipv6)
 {
 	struct vxlan_net *vn = net_generic(vxlan->net, vxlan_net_id);
 	struct vxlan_sock *vs = NULL;
-	bool ipv6 = vxlan->flags & VXLAN_F_IPV6;
 
 	if (!vxlan->cfg.no_share) {
 		spin_lock(&vn->sock_lock);
@@ -2604,20 +2630,46 @@
 		spin_unlock(&vn->sock_lock);
 	}
 	if (!vs)
-		vs = vxlan_socket_create(vxlan->net, vxlan->cfg.dst_port,
-					 vxlan->flags);
+		vs = vxlan_socket_create(vxlan->net, ipv6,
+					 vxlan->cfg.dst_port, vxlan->flags);
 	if (IS_ERR(vs))
 		return PTR_ERR(vs);
+#if IS_ENABLED(CONFIG_IPV6)
+	if (ipv6)
+		vxlan->vn6_sock = vs;
+	else
+#endif
+		vxlan->vn4_sock = vs;
 	vxlan_vs_add_dev(vs, vxlan);
 	return 0;
 }
 
+static int vxlan_sock_add(struct vxlan_dev *vxlan)
+{
+	bool ipv6 = vxlan->flags & VXLAN_F_IPV6;
+	bool metadata = vxlan->flags & VXLAN_F_COLLECT_METADATA;
+	int ret = 0;
+
+	vxlan->vn4_sock = NULL;
+#if IS_ENABLED(CONFIG_IPV6)
+	vxlan->vn6_sock = NULL;
+	if (ipv6 || metadata)
+		ret = __vxlan_sock_add(vxlan, true);
+#endif
+	if (!ret && (!ipv6 || metadata))
+		ret = __vxlan_sock_add(vxlan, false);
+	if (ret < 0)
+		vxlan_sock_release(vxlan);
+	return ret;
+}
+
 static int vxlan_dev_configure(struct net *src_net, struct net_device *dev,
 			       struct vxlan_config *conf)
 {
 	struct vxlan_net *vn = net_generic(src_net, vxlan_net_id);
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct vxlan_rdst *dst = &vxlan->default_dst;
+	unsigned short needed_headroom = ETH_HLEN;
 	int err;
 	bool use_ipv6 = false;
 	__be16 default_port = vxlan->cfg.dst_port;
@@ -2637,6 +2689,7 @@
 		if (!IS_ENABLED(CONFIG_IPV6))
 			return -EPFNOSUPPORT;
 		use_ipv6 = true;
+		vxlan->flags |= VXLAN_F_IPV6;
 	}
 
 	if (conf->remote_ifindex) {
@@ -2657,22 +2710,21 @@
 				pr_info("IPv6 is disabled via sysctl\n");
 				return -EPERM;
 			}
-			vxlan->flags |= VXLAN_F_IPV6;
 		}
 #endif
 
 		if (!conf->mtu)
 			dev->mtu = lowerdev->mtu - (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
 
-		dev->needed_headroom = lowerdev->hard_header_len +
-				       (use_ipv6 ? VXLAN6_HEADROOM : VXLAN_HEADROOM);
-	} else if (use_ipv6) {
-		vxlan->flags |= VXLAN_F_IPV6;
-		dev->needed_headroom = ETH_HLEN + VXLAN6_HEADROOM;
-	} else {
-		dev->needed_headroom = ETH_HLEN + VXLAN_HEADROOM;
+		needed_headroom = lowerdev->hard_header_len;
 	}
 
+	if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
+		needed_headroom += VXLAN6_HEADROOM;
+	else
+		needed_headroom += VXLAN_HEADROOM;
+	dev->needed_headroom = needed_headroom;
+
 	memcpy(&vxlan->cfg, conf, sizeof(*conf));
 	if (!vxlan->cfg.dst_port)
 		vxlan->cfg.dst_port = default_port;
diff --git a/include/net/vxlan.h b/include/net/vxlan.h
index 480a319..c1c899c 100644
--- a/include/net/vxlan.h
+++ b/include/net/vxlan.h
@@ -152,7 +152,10 @@
 struct vxlan_dev {
 	struct hlist_node hlist;	/* vni hash table */
 	struct list_head  next;		/* vxlan's per namespace list */
-	struct vxlan_sock *vn_sock;	/* listening socket */
+	struct vxlan_sock *vn4_sock;	/* listening socket for IPv4 */
+#if IS_ENABLED(CONFIG_IPV6)
+	struct vxlan_sock *vn6_sock;	/* listening socket for IPv6 */
+#endif
 	struct net_device *dev;
 	struct net	  *net;		/* netns for packet i/o */
 	struct vxlan_rdst default_dst;	/* default destination */
@@ -195,9 +198,14 @@
 struct net_device *vxlan_dev_create(struct net *net, const char *name,
 				    u8 name_assign_type, struct vxlan_config *conf);
 
-static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan)
+static inline __be16 vxlan_dev_dst_port(struct vxlan_dev *vxlan,
+					unsigned short family)
 {
-	return inet_sk(vxlan->vn_sock->sock->sk)->inet_sport;
+#if IS_ENABLED(CONFIG_IPV6)
+	if (family == AF_INET6)
+		return inet_sk(vxlan->vn6_sock->sock->sk)->inet_sport;
+#endif
+	return inet_sk(vxlan->vn4_sock->sock->sk)->inet_sport;
 }
 
 static inline netdev_features_t vxlan_features_check(struct sk_buff *skb,
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index c11413d..fb3cdb8 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -151,7 +151,8 @@
 {
 	struct vxlan_dev *vxlan = netdev_priv(vport->dev);
 	struct net *net = ovs_dp_get_net(vport->dp);
-	__be16 dst_port = vxlan_dev_dst_port(vxlan);
+	unsigned short family = ip_tunnel_info_af(upcall->egress_tun_info);
+	__be16 dst_port = vxlan_dev_dst_port(vxlan, family);
 	__be16 src_port;
 	int port_min;
 	int port_max;