vxlan: generalize forwarding tables

This patch generalizes VXLAN forwarding table entries allowing an administrator
to:
	1) specify multiple destinations for a given MAC
	2) specify alternate vni's in the VXLAN header
	3) specify alternate destination UDP ports
	4) use multicast MAC addresses as fdb lookup keys
	5) specify multicast destinations
	6) specify the outgoing interface for forwarded packets

The combination allows configuration of more complex topologies using VXLAN
encapsulation.

Changes since v1: rebase to 3.9.0-rc2

Signed-Off-By: David L Stevens <dlstevens@us.ibm.com>

Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index db0df07..33427fd 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -81,13 +81,22 @@
 	struct hlist_head vni_list[VNI_HASH_SIZE];
 };
 
+struct vxlan_rdst {
+	struct rcu_head		 rcu;
+	__be32			 remote_ip;
+	__be16			 remote_port;
+	u32			 remote_vni;
+	u32			 remote_ifindex;
+	struct vxlan_rdst	*remote_next;
+};
+
 /* Forwarding table entry */
 struct vxlan_fdb {
 	struct hlist_node hlist;	/* linked list of entries */
 	struct rcu_head	  rcu;
 	unsigned long	  updated;	/* jiffies */
 	unsigned long	  used;
-	__be32		  remote_ip;
+	struct vxlan_rdst remote;
 	u16		  state;	/* see ndm_state */
 	u8		  eth_addr[ETH_ALEN];
 };
@@ -157,7 +166,8 @@
 /* Fill in neighbour message in skbuff. */
 static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
 			   const struct vxlan_fdb *fdb,
-			   u32 portid, u32 seq, int type, unsigned int flags)
+			   u32 portid, u32 seq, int type, unsigned int flags,
+			   const struct vxlan_rdst *rdst)
 {
 	unsigned long now = jiffies;
 	struct nda_cacheinfo ci;
@@ -176,7 +186,7 @@
 
 	if (type == RTM_GETNEIGH) {
 		ndm->ndm_family	= AF_INET;
-		send_ip = fdb->remote_ip != 0;
+		send_ip = rdst->remote_ip != htonl(INADDR_ANY);
 		send_eth = !is_zero_ether_addr(fdb->eth_addr);
 	} else
 		ndm->ndm_family	= AF_BRIDGE;
@@ -188,7 +198,17 @@
 	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
 		goto nla_put_failure;
 
-	if (send_ip && nla_put_be32(skb, NDA_DST, fdb->remote_ip))
+	if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
+		goto nla_put_failure;
+
+	if (rdst->remote_port && rdst->remote_port != vxlan_port &&
+	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
+		goto nla_put_failure;
+	if (rdst->remote_vni != vxlan->vni &&
+	    nla_put_be32(skb, NDA_VNI, rdst->remote_vni))
+		goto nla_put_failure;
+	if (rdst->remote_ifindex &&
+	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
 		goto nla_put_failure;
 
 	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
@@ -211,6 +231,9 @@
 	return NLMSG_ALIGN(sizeof(struct ndmsg))
 		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
 		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
+		+ nla_total_size(sizeof(__be32)) /* NDA_PORT */
+		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
+		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
 		+ nla_total_size(sizeof(struct nda_cacheinfo));
 }
 
@@ -225,7 +248,7 @@
 	if (skb == NULL)
 		goto errout;
 
-	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0);
+	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote);
 	if (err < 0) {
 		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
 		WARN_ON(err == -EMSGSIZE);
@@ -247,7 +270,8 @@
 
 	memset(&f, 0, sizeof f);
 	f.state = NUD_STALE;
-	f.remote_ip = ipa; /* goes to NDA_DST */
+	f.remote.remote_ip = ipa; /* goes to NDA_DST */
+	f.remote.remote_vni = VXLAN_N_VID;
 
 	vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
 }
@@ -300,10 +324,38 @@
 	return NULL;
 }
 
+/* Add/update destinations for multicast */
+static int vxlan_fdb_append(struct vxlan_fdb *f,
+			    __be32 ip, __u32 port, __u32 vni, __u32 ifindex)
+{
+	struct vxlan_rdst *rd_prev, *rd;
+
+	rd_prev = NULL;
+	for (rd = &f->remote; rd; rd = rd->remote_next) {
+		if (rd->remote_ip == ip &&
+		    rd->remote_port == port &&
+		    rd->remote_vni == vni &&
+		    rd->remote_ifindex == ifindex)
+			return 0;
+		rd_prev = rd;
+	}
+	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
+	if (rd == NULL)
+		return -ENOBUFS;
+	rd->remote_ip = ip;
+	rd->remote_port = port;
+	rd->remote_vni = vni;
+	rd->remote_ifindex = ifindex;
+	rd->remote_next = NULL;
+	rd_prev->remote_next = rd;
+	return 1;
+}
+
 /* Add new entry to forwarding table -- assumes lock held */
 static int vxlan_fdb_create(struct vxlan_dev *vxlan,
 			    const u8 *mac, __be32 ip,
-			    __u16 state, __u16 flags)
+			    __u16 state, __u16 flags,
+			    __u32 port, __u32 vni, __u32 ifindex)
 {
 	struct vxlan_fdb *f;
 	int notify = 0;
@@ -320,6 +372,14 @@
 			f->updated = jiffies;
 			notify = 1;
 		}
+		if ((flags & NLM_F_APPEND) &&
+		    is_multicast_ether_addr(f->eth_addr)) {
+			int rc = vxlan_fdb_append(f, ip, port, vni, ifindex);
+
+			if (rc < 0)
+				return rc;
+			notify |= rc;
+		}
 	} else {
 		if (!(flags & NLM_F_CREATE))
 			return -ENOENT;
@@ -333,7 +393,11 @@
 			return -ENOMEM;
 
 		notify = 1;
-		f->remote_ip = ip;
+		f->remote.remote_ip = ip;
+		f->remote.remote_port = port;
+		f->remote.remote_vni = vni;
+		f->remote.remote_ifindex = ifindex;
+		f->remote.remote_next = NULL;
 		f->state = state;
 		f->updated = f->used = jiffies;
 		memcpy(f->eth_addr, mac, ETH_ALEN);
@@ -349,6 +413,19 @@
 	return 0;
 }
 
+void vxlan_fdb_free(struct rcu_head *head)
+{
+	struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
+
+	while (f->remote.remote_next) {
+		struct vxlan_rdst *rd = f->remote.remote_next;
+
+		f->remote.remote_next = rd->remote_next;
+		kfree(rd);
+	}
+	kfree(f);
+}
+
 static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
 {
 	netdev_dbg(vxlan->dev,
@@ -358,7 +435,7 @@
 	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);
 
 	hlist_del_rcu(&f->hlist);
-	kfree_rcu(f, rcu);
+	call_rcu(&f->rcu, vxlan_fdb_free);
 }
 
 /* Add static entry (via netlink) */
@@ -367,7 +444,9 @@
 			 const unsigned char *addr, u16 flags)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct net *net = dev_net(vxlan->dev);
 	__be32 ip;
+	u32 port, vni, ifindex;
 	int err;
 
 	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
@@ -384,8 +463,36 @@
 
 	ip = nla_get_be32(tb[NDA_DST]);
 
+	if (tb[NDA_PORT]) {
+		if (nla_len(tb[NDA_PORT]) != sizeof(u32))
+			return -EINVAL;
+		port = nla_get_u32(tb[NDA_PORT]);
+	} else
+		port = vxlan_port;
+
+	if (tb[NDA_VNI]) {
+		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
+			return -EINVAL;
+		vni = nla_get_u32(tb[NDA_VNI]);
+	} else
+		vni = vxlan->vni;
+
+	if (tb[NDA_IFINDEX]) {
+		struct net_device *dev;
+
+		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
+			return -EINVAL;
+		ifindex = nla_get_u32(tb[NDA_IFINDEX]);
+		dev = dev_get_by_index(net, ifindex);
+		if (!dev)
+			return -EADDRNOTAVAIL;
+		dev_put(dev);
+	} else
+		ifindex = 0;
+
 	spin_lock_bh(&vxlan->hash_lock);
-	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags);
+	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags, port,
+		vni, ifindex);
 	spin_unlock_bh(&vxlan->hash_lock);
 
 	return err;
@@ -423,18 +530,21 @@
 		int err;
 
 		hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
-			if (idx < cb->args[0])
-				goto skip;
+			struct vxlan_rdst *rd;
+			for (rd = &f->remote; rd; rd = rd->remote_next) {
+				if (idx < cb->args[0])
+					goto skip;
 
-			err = vxlan_fdb_info(skb, vxlan, f,
-					     NETLINK_CB(cb->skb).portid,
-					     cb->nlh->nlmsg_seq,
-					     RTM_NEWNEIGH,
-					     NLM_F_MULTI);
-			if (err < 0)
-				break;
+				err = vxlan_fdb_info(skb, vxlan, f,
+						     NETLINK_CB(cb->skb).portid,
+						     cb->nlh->nlmsg_seq,
+						     RTM_NEWNEIGH,
+						     NLM_F_MULTI, rd);
+				if (err < 0)
+					break;
 skip:
-			++idx;
+				++idx;
+			}
 		}
 	}
 
@@ -454,22 +564,23 @@
 	f = vxlan_find_mac(vxlan, src_mac);
 	if (likely(f)) {
 		f->used = jiffies;
-		if (likely(f->remote_ip == src_ip))
+		if (likely(f->remote.remote_ip == src_ip))
 			return;
 
 		if (net_ratelimit())
 			netdev_info(dev,
 				    "%pM migrated from %pI4 to %pI4\n",
-				    src_mac, &f->remote_ip, &src_ip);
+				    src_mac, &f->remote.remote_ip, &src_ip);
 
-		f->remote_ip = src_ip;
+		f->remote.remote_ip = src_ip;
 		f->updated = jiffies;
 	} else {
 		/* learned new entry */
 		spin_lock(&vxlan->hash_lock);
 		err = vxlan_fdb_create(vxlan, src_mac, src_ip,
 				       NUD_REACHABLE,
-				       NLM_F_EXCL|NLM_F_CREATE);
+				       NLM_F_EXCL|NLM_F_CREATE,
+				       vxlan_port, vxlan->vni, 0);
 		spin_unlock(&vxlan->hash_lock);
 	}
 }
@@ -701,7 +812,7 @@
 		}
 
 		f = vxlan_find_mac(vxlan, n->ha);
-		if (f && f->remote_ip == 0) {
+		if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
 			/* bridge-local neighbor */
 			neigh_release(n);
 			goto out;
@@ -834,47 +945,26 @@
 	return 0;
 }
 
-/* Transmit local packets over Vxlan
- *
- * Outer IP header inherits ECN and DF from inner header.
- * Outer UDP destination is the VXLAN assigned port.
- *           source port is based on hash of flow
- */
-static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
+				  struct vxlan_rdst *rdst, bool did_rsc)
 {
 	struct vxlan_dev *vxlan = netdev_priv(dev);
 	struct rtable *rt;
 	const struct iphdr *old_iph;
-	struct ethhdr *eth;
 	struct iphdr *iph;
 	struct vxlanhdr *vxh;
 	struct udphdr *uh;
 	struct flowi4 fl4;
 	unsigned int pkt_len = skb->len;
 	__be32 dst;
-	__u16 src_port;
+	__u16 src_port, dst_port;
+        u32 vni;
 	__be16 df = 0;
 	__u8 tos, ttl;
-	bool did_rsc = false;
-	const struct vxlan_fdb *f;
 
-	skb_reset_mac_header(skb);
-	eth = eth_hdr(skb);
-
-	if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
-		return arp_reduce(dev, skb);
-	else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
-		did_rsc = route_shortcircuit(dev, skb);
-
-	f = vxlan_find_mac(vxlan, eth->h_dest);
-	if (f == NULL) {
-		did_rsc = false;
-		dst = vxlan->gaddr;
-		if (!dst && (vxlan->flags & VXLAN_F_L2MISS) &&
-		    !is_multicast_ether_addr(eth->h_dest))
-			vxlan_fdb_miss(vxlan, eth->h_dest);
-	} else
-		dst = f->remote_ip;
+	dst_port = rdst->remote_port ? rdst->remote_port : vxlan_port;
+	vni = rdst->remote_vni;
+	dst = rdst->remote_ip;
 
 	if (!dst) {
 		if (did_rsc) {
@@ -922,7 +1012,7 @@
 	src_port = vxlan_src_port(vxlan, skb);
 
 	memset(&fl4, 0, sizeof(fl4));
-	fl4.flowi4_oif = vxlan->link;
+	fl4.flowi4_oif = rdst->remote_ifindex;
 	fl4.flowi4_tos = RT_TOS(tos);
 	fl4.daddr = dst;
 	fl4.saddr = vxlan->saddr;
@@ -949,13 +1039,13 @@
 
 	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
 	vxh->vx_flags = htonl(VXLAN_FLAGS);
-	vxh->vx_vni = htonl(vxlan->vni << 8);
+	vxh->vx_vni = htonl(vni << 8);
 
 	__skb_push(skb, sizeof(*uh));
 	skb_reset_transport_header(skb);
 	uh = udp_hdr(skb);
 
-	uh->dest = htons(vxlan_port);
+	uh->dest = htons(dst_port);
 	uh->source = htons(src_port);
 
 	uh->len = htons(skb->len);
@@ -995,6 +1085,64 @@
 	return NETDEV_TX_OK;
 }
 
+/* Transmit local packets over Vxlan
+ *
+ * Outer IP header inherits ECN and DF from inner header.
+ * Outer UDP destination is the VXLAN assigned port.
+ *           source port is based on hash of flow
+ */
+static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
+{
+	struct vxlan_dev *vxlan = netdev_priv(dev);
+	struct ethhdr *eth;
+	bool did_rsc = false;
+	struct vxlan_rdst group, *rdst0, *rdst;
+	struct vxlan_fdb *f;
+	int rc1, rc;
+
+	skb_reset_mac_header(skb);
+	eth = eth_hdr(skb);
+
+	if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
+		return arp_reduce(dev, skb);
+	else if ((vxlan->flags&VXLAN_F_RSC) && ntohs(eth->h_proto) == ETH_P_IP)
+		did_rsc = route_shortcircuit(dev, skb);
+
+	f = vxlan_find_mac(vxlan, eth->h_dest);
+	if (f == NULL) {
+		did_rsc = false;
+		group.remote_port = vxlan_port;
+		group.remote_vni = vxlan->vni;
+		group.remote_ip = vxlan->gaddr;
+		group.remote_ifindex = vxlan->link;
+		group.remote_next = 0;
+		rdst0 = &group;
+
+		if (group.remote_ip == htonl(INADDR_ANY) &&
+		    (vxlan->flags & VXLAN_F_L2MISS) &&
+		    !is_multicast_ether_addr(eth->h_dest))
+			vxlan_fdb_miss(vxlan, eth->h_dest);
+	} else
+		rdst0 = &f->remote;
+
+	rc = NETDEV_TX_OK;
+
+	/* if there are multiple destinations, send copies */
+	for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
+		struct sk_buff *skb1;
+
+		skb1 = skb_clone(skb, GFP_ATOMIC);
+		rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
+		if (rc == NETDEV_TX_OK)
+			rc = rc1;
+	}
+
+	rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);
+	if (rc == NETDEV_TX_OK)
+		rc = rc1;
+	return rc;
+}
+
 /* Walk the forwarding table and purge stale entries */
 static void vxlan_cleanup(unsigned long arg)
 {
@@ -1558,6 +1706,7 @@
 {
 	rtnl_link_unregister(&vxlan_link_ops);
 	unregister_pernet_device(&vxlan_net_ops);
+	rcu_barrier();
 }
 module_exit(vxlan_cleanup_module);