blob: 8111565c35fceec89b3da0afcf0e8861a19eed40 [file] [log] [blame]
stephen hemmingerd3428942012-10-01 12:32:35 +00001/*
Rami Roseneb5ce432012-11-13 13:29:15 +00002 * VXLAN: Virtual eXtensible Local Area Network
stephen hemmingerd3428942012-10-01 12:32:35 +00003 *
stephen hemminger3b8df3c2013-04-27 11:31:52 +00004 * Copyright (c) 2012-2013 Vyatta Inc.
stephen hemmingerd3428942012-10-01 12:32:35 +00005 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License version 2 as
8 * published by the Free Software Foundation.
9 *
10 * TODO
stephen hemmingerd3428942012-10-01 12:32:35 +000011 * - IPv6 (not in RFC)
12 */
13
14#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
15
16#include <linux/kernel.h>
17#include <linux/types.h>
18#include <linux/module.h>
19#include <linux/errno.h>
20#include <linux/slab.h>
21#include <linux/skbuff.h>
22#include <linux/rculist.h>
23#include <linux/netdevice.h>
24#include <linux/in.h>
25#include <linux/ip.h>
26#include <linux/udp.h>
27#include <linux/igmp.h>
28#include <linux/etherdevice.h>
29#include <linux/if_ether.h>
stephen hemmingerd3428942012-10-01 12:32:35 +000030#include <linux/hash.h>
Yan Burman1b13c972013-01-29 23:43:07 +000031#include <linux/ethtool.h>
David Stevense4f67ad2012-11-20 02:50:14 +000032#include <net/arp.h>
33#include <net/ndisc.h>
stephen hemmingerd3428942012-10-01 12:32:35 +000034#include <net/ip.h>
Pravin B Shelarc5441932013-03-25 14:49:35 +000035#include <net/ip_tunnels.h>
stephen hemmingerd3428942012-10-01 12:32:35 +000036#include <net/icmp.h>
37#include <net/udp.h>
38#include <net/rtnetlink.h>
39#include <net/route.h>
40#include <net/dsfield.h>
41#include <net/inet_ecn.h>
42#include <net/net_namespace.h>
43#include <net/netns/generic.h>
44
#define VXLAN_VERSION	"0.1"

/* Hash-table geometry: sockets per namespace, VNIs per socket,
 * and forwarding-table buckets per device.
 */
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE  (1<<PORT_HASH_BITS)
#define VNI_HASH_BITS	10
#define VNI_HASH_SIZE	(1<<VNI_HASH_BITS)
#define FDB_HASH_BITS	8
#define FDB_HASH_SIZE	(1<<FDB_HASH_BITS)
#define FDB_AGE_DEFAULT 300 /* 5 min */
#define FDB_AGE_INTERVAL (10 * HZ)	/* rescan interval */

/* 24-bit VXLAN Network Identifier space */
#define VXLAN_N_VID	(1u << 24)
#define VXLAN_VID_MASK	(VXLAN_N_VID - 1)
/* IP header + UDP + VXLAN + Ethernet header */
#define VXLAN_HEADROOM (20 + 8 + 8 + 14)

#define VXLAN_FLAGS 0x08000000	/* struct vxlanhdr.vx_flags required value. */
62
/* VXLAN protocol header */
struct vxlanhdr {
	__be32 vx_flags;	/* must equal VXLAN_FLAGS; other bits reserved */
	__be32 vx_vni;		/* VNI in upper 24 bits; low byte reserved */
};
68
/* UDP port for VXLAN traffic.
 * The IANA assigned port is 4789, but the Linux default is 8472
 * for compatibility with early adopters.
 */
static unsigned int vxlan_port __read_mostly = 8472;
module_param_named(udp_port, vxlan_port, uint, 0444);
MODULE_PARM_DESC(udp_port, "Destination UDP port");

/* When set, log packets whose outer header carried a corrupted ECN mark */
static bool log_ecn_error = true;
module_param(log_ecn_error, bool, 0644);
MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
80
/* pernet generic-netns ID used to look up struct vxlan_net */
static unsigned int vxlan_net_id;

/* per UDP socket information */
struct vxlan_sock {
	struct hlist_node hlist;	/* entry in vxlan_net::sock_list */
	struct rcu_head	  rcu;
	struct work_struct del_work;	/* deferred teardown work item */
	unsigned int	  refcnt;	/* number of vxlan devices sharing it */
	struct socket	  *sock;	/* the bound UDP socket */
	struct hlist_head vni_list[VNI_HASH_SIZE];	/* devices keyed by VNI */
};
92
/* per-network namespace private data for this module */
struct vxlan_net {
	struct list_head  vxlan_list;	/* all vxlan devices in this netns */
	struct hlist_head sock_list[PORT_HASH_SIZE];	/* vxlan_sock by port */
};
98
/* One remote destination for an FDB entry; multicast entries may chain
 * several via remote_next (see vxlan_fdb_append()).
 */
struct vxlan_rdst {
	__be32		  remote_ip;	/* tunnel endpoint address */
	__be16		  remote_port;	/* UDP port; 0 means device default */
	u32		  remote_vni;
	u32		  remote_ifindex; /* egress ifindex; 0 means any */
	struct vxlan_rdst *remote_next;	/* singly-linked list of extra dsts */
};
106
/* Forwarding table entry */
struct vxlan_fdb {
	struct hlist_node hlist;	/* linked list of entries */
	struct rcu_head	  rcu;
	unsigned long	  updated;	/* jiffies of last state change */
	unsigned long	  used;		/* jiffies of last lookup hit */
	struct vxlan_rdst remote;	/* first destination, embedded */
	u16		  state;	/* see ndm_state */
	u8		  flags;	/* see ndm_flags */
	u8		  eth_addr[ETH_ALEN];
};
118
/* Pseudo network device */
struct vxlan_dev {
	struct hlist_node hlist;	/* vni hash table */
	struct list_head  next;		/* vxlan's per namespace list */
	struct vxlan_sock *vn_sock;	/* listening socket */
	struct net_device *dev;
	struct vxlan_rdst default_dst;	/* default destination */
	__be32		  saddr;	/* source address */
	__be16		  dst_port;	/* default destination UDP port */
	__u16		  port_min;	/* source port range */
	__u16		  port_max;
	__u8		  tos;		/* TOS override */
	__u8		  ttl;
	u32		  flags;	/* VXLAN_F_* below */

	unsigned long	  age_interval;	/* FDB aging scan period */
	struct timer_list age_timer;
	spinlock_t	  hash_lock;	/* protects fdb_head[] writers */
	unsigned int	  addrcnt;	/* current FDB entry count */
	unsigned int	  addrmax;	/* FDB limit; 0 means unlimited */

	struct hlist_head fdb_head[FDB_HASH_SIZE];
};

/* vxlan_dev::flags bits */
#define VXLAN_F_LEARN	0x01	/* snoop source MAC/IP of received frames */
#define VXLAN_F_PROXY	0x02	/* ARP proxy (used elsewhere in the file) */
#define VXLAN_F_RSC	0x04	/* route short-circuit (used elsewhere) */
#define VXLAN_F_L2MISS	0x08	/* notify userspace on unknown dest MAC */
#define VXLAN_F_L3MISS	0x10	/* notify userspace on unknown dest IP */
148
/* salt for hash table — presumably randomized at module init; the
 * initialization is not visible in this chunk. */
static u32 vxlan_salt __read_mostly;
151
stephen hemminger553675f2013-05-16 11:35:20 +0000152/* Virtual Network hash table head */
153static inline struct hlist_head *vni_head(struct vxlan_sock *vs, u32 id)
154{
155 return &vs->vni_list[hash_32(id, VNI_HASH_BITS)];
156}
157
158/* Socket hash table head */
159static inline struct hlist_head *vs_head(struct net *net, __be16 port)
stephen hemmingerd3428942012-10-01 12:32:35 +0000160{
161 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
162
stephen hemminger553675f2013-05-16 11:35:20 +0000163 return &vn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)];
164}
165
166/* Find VXLAN socket based on network namespace and UDP port */
167static struct vxlan_sock *vxlan_find_port(struct net *net, __be16 port)
168{
169 struct vxlan_sock *vs;
170
171 hlist_for_each_entry_rcu(vs, vs_head(net, port), hlist) {
172 if (inet_sk(vs->sock->sk)->inet_sport == port)
173 return vs;
174 }
175 return NULL;
stephen hemmingerd3428942012-10-01 12:32:35 +0000176}
177
178/* Look up VNI in a per net namespace table */
stephen hemminger553675f2013-05-16 11:35:20 +0000179static struct vxlan_dev *vxlan_find_vni(struct net *net, u32 id, __be16 port)
stephen hemmingerd3428942012-10-01 12:32:35 +0000180{
stephen hemminger553675f2013-05-16 11:35:20 +0000181 struct vxlan_sock *vs;
stephen hemmingerd3428942012-10-01 12:32:35 +0000182 struct vxlan_dev *vxlan;
stephen hemmingerd3428942012-10-01 12:32:35 +0000183
stephen hemminger553675f2013-05-16 11:35:20 +0000184 vs = vxlan_find_port(net, port);
185 if (!vs)
186 return NULL;
187
188 hlist_for_each_entry_rcu(vxlan, vni_head(vs, id), hlist) {
Atzm Watanabec7995c42013-04-16 02:50:52 +0000189 if (vxlan->default_dst.remote_vni == id)
stephen hemmingerd3428942012-10-01 12:32:35 +0000190 return vxlan;
191 }
192
193 return NULL;
194}
195
/* Fill in neighbour message in skbuff.
 * Emits one ndmsg + attributes describing (@fdb, @rdst) into @skb.
 * Returns the message length on success, -EMSGSIZE if @skb is full
 * (the partially written message is cancelled).
 */
static int vxlan_fdb_info(struct sk_buff *skb, struct vxlan_dev *vxlan,
			  const struct vxlan_fdb *fdb,
			  u32 portid, u32 seq, int type, unsigned int flags,
			  const struct vxlan_rdst *rdst)
{
	unsigned long now = jiffies;
	struct nda_cacheinfo ci;
	struct nlmsghdr *nlh;
	struct ndmsg *ndm;
	bool send_ip, send_eth;

	nlh = nlmsg_put(skb, portid, seq, type, sizeof(*ndm), flags);
	if (nlh == NULL)
		return -EMSGSIZE;

	ndm = nlmsg_data(nlh);
	memset(ndm, 0, sizeof(*ndm));

	send_eth = send_ip = true;

	if (type == RTM_GETNEIGH) {
		/* Miss notifications: suppress attributes that are unset
		 * (all-zero IP / all-zero MAC) in the stack-built fdb.
		 */
		ndm->ndm_family	= AF_INET;
		send_ip = rdst->remote_ip != htonl(INADDR_ANY);
		send_eth = !is_zero_ether_addr(fdb->eth_addr);
	} else
		ndm->ndm_family	= AF_BRIDGE;
	ndm->ndm_state = fdb->state;
	ndm->ndm_ifindex = vxlan->dev->ifindex;
	ndm->ndm_flags = fdb->flags;
	ndm->ndm_type = NDA_DST;

	if (send_eth && nla_put(skb, NDA_LLADDR, ETH_ALEN, &fdb->eth_addr))
		goto nla_put_failure;

	if (send_ip && nla_put_be32(skb, NDA_DST, rdst->remote_ip))
		goto nla_put_failure;

	/* Port/VNI/ifindex are only emitted when they differ from the
	 * device defaults (or are non-zero for ifindex).
	 */
	if (rdst->remote_port && rdst->remote_port != vxlan->dst_port &&
	    nla_put_be16(skb, NDA_PORT, rdst->remote_port))
		goto nla_put_failure;
	if (rdst->remote_vni != vxlan->default_dst.remote_vni &&
	    nla_put_be32(skb, NDA_VNI, rdst->remote_vni))
		goto nla_put_failure;
	if (rdst->remote_ifindex &&
	    nla_put_u32(skb, NDA_IFINDEX, rdst->remote_ifindex))
		goto nla_put_failure;

	ci.ndm_used	 = jiffies_to_clock_t(now - fdb->used);
	ci.ndm_confirmed = 0;
	ci.ndm_updated	 = jiffies_to_clock_t(now - fdb->updated);
	ci.ndm_refcnt	 = 0;

	if (nla_put(skb, NDA_CACHEINFO, sizeof(ci), &ci))
		goto nla_put_failure;

	return nlmsg_end(skb, nlh);

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
258
/* Worst-case netlink message size for one FDB notification; must stay in
 * sync with the attributes emitted by vxlan_fdb_info().
 */
static inline size_t vxlan_nlmsg_size(void)
{
	return NLMSG_ALIGN(sizeof(struct ndmsg))
		+ nla_total_size(ETH_ALEN) /* NDA_LLADDR */
		+ nla_total_size(sizeof(__be32)) /* NDA_DST */
		+ nla_total_size(sizeof(__be16)) /* NDA_PORT */
		+ nla_total_size(sizeof(__be32)) /* NDA_VNI */
		+ nla_total_size(sizeof(__u32)) /* NDA_IFINDEX */
		+ nla_total_size(sizeof(struct nda_cacheinfo));
}
269
/* Send an RTM_*NEIGH notification for @fdb to the RTNLGRP_NEIGH group.
 * Only the first (embedded) remote destination is reported.  On failure
 * the error is recorded on the netlink group via rtnl_set_sk_err().
 */
static void vxlan_fdb_notify(struct vxlan_dev *vxlan,
			     const struct vxlan_fdb *fdb, int type)
{
	struct net *net = dev_net(vxlan->dev);
	struct sk_buff *skb;
	int err = -ENOBUFS;

	skb = nlmsg_new(vxlan_nlmsg_size(), GFP_ATOMIC);
	if (skb == NULL)
		goto errout;

	err = vxlan_fdb_info(skb, vxlan, fdb, 0, 0, type, 0, &fdb->remote);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in vxlan_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}

	rtnl_notify(skb, net, 0, RTNLGRP_NEIGH, NULL, GFP_ATOMIC);
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_NEIGH, err);
}
295
David Stevense4f67ad2012-11-20 02:50:14 +0000296static void vxlan_ip_miss(struct net_device *dev, __be32 ipa)
297{
298 struct vxlan_dev *vxlan = netdev_priv(dev);
299 struct vxlan_fdb f;
300
301 memset(&f, 0, sizeof f);
302 f.state = NUD_STALE;
David Stevens66817122013-03-15 04:35:51 +0000303 f.remote.remote_ip = ipa; /* goes to NDA_DST */
304 f.remote.remote_vni = VXLAN_N_VID;
David Stevense4f67ad2012-11-20 02:50:14 +0000305
306 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
307}
308
309static void vxlan_fdb_miss(struct vxlan_dev *vxlan, const u8 eth_addr[ETH_ALEN])
310{
311 struct vxlan_fdb f;
312
313 memset(&f, 0, sizeof f);
314 f.state = NUD_STALE;
315 memcpy(f.eth_addr, eth_addr, ETH_ALEN);
316
317 vxlan_fdb_notify(vxlan, &f, RTM_GETNEIGH);
318}
319
/* Hash Ethernet address */
static u32 eth_hash(const unsigned char *addr)
{
	/* Loads 8 bytes starting at the MAC.  NOTE(review): this reads two
	 * bytes past the 6-byte address; callers pass eth_addr embedded in
	 * a larger struct so the read stays in bounds — confirm for all
	 * call sites.
	 */
	u64 value = get_unaligned((u64 *)addr);

	/* only want 6 bytes */
#ifdef __BIG_ENDIAN
	value >>= 16;
#else
	value <<= 16;
#endif
	return hash_64(value, FDB_HASH_BITS);
}
333
334/* Hash chain to use given mac address */
335static inline struct hlist_head *vxlan_fdb_head(struct vxlan_dev *vxlan,
336 const u8 *mac)
337{
338 return &vxlan->fdb_head[eth_hash(mac)];
339}
340
341/* Look up Ethernet address in forwarding table */
Sridhar Samudrala014be2c2013-05-17 06:39:07 +0000342static struct vxlan_fdb *__vxlan_find_mac(struct vxlan_dev *vxlan,
stephen hemmingerd3428942012-10-01 12:32:35 +0000343 const u8 *mac)
344
345{
346 struct hlist_head *head = vxlan_fdb_head(vxlan, mac);
347 struct vxlan_fdb *f;
stephen hemmingerd3428942012-10-01 12:32:35 +0000348
Sasha Levinb67bfe02013-02-27 17:06:00 -0800349 hlist_for_each_entry_rcu(f, head, hlist) {
stephen hemmingerd3428942012-10-01 12:32:35 +0000350 if (compare_ether_addr(mac, f->eth_addr) == 0)
351 return f;
352 }
353
354 return NULL;
355}
356
Sridhar Samudrala014be2c2013-05-17 06:39:07 +0000357static struct vxlan_fdb *vxlan_find_mac(struct vxlan_dev *vxlan,
358 const u8 *mac)
359{
360 struct vxlan_fdb *f;
361
362 f = __vxlan_find_mac(vxlan, mac);
363 if (f)
364 f->used = jiffies;
365
366 return f;
367}
368
/* Add/update destinations for multicast.
 * Returns 0 if the destination was already present, 1 if a new one was
 * appended, -ENOBUFS on allocation failure.
 */
static int vxlan_fdb_append(struct vxlan_fdb *f,
			    __be32 ip, __be16 port, __u32 vni, __u32 ifindex)
{
	struct vxlan_rdst *rd_prev, *rd;

	rd_prev = NULL;
	/* Scan for a duplicate; the loop always visits the embedded first
	 * destination, so rd_prev is non-NULL when we fall through.
	 */
	for (rd = &f->remote; rd; rd = rd->remote_next) {
		if (rd->remote_ip == ip &&
		    rd->remote_port == port &&
		    rd->remote_vni == vni &&
		    rd->remote_ifindex == ifindex)
			return 0;
		rd_prev = rd;
	}
	rd = kmalloc(sizeof(*rd), GFP_ATOMIC);
	if (rd == NULL)
		return -ENOBUFS;
	rd->remote_ip = ip;
	rd->remote_port = port;
	rd->remote_vni = vni;
	rd->remote_ifindex = ifindex;
	rd->remote_next = NULL;
	rd_prev->remote_next = rd;
	return 1;
}
395
/* Add new entry to forwarding table -- assumes lock held.
 * Creates or updates the entry for @mac according to the netlink @flags
 * (NLM_F_EXCL/NLM_F_CREATE/NLM_F_APPEND) and sends an RTM_NEWNEIGH
 * notification when anything changed.  Returns 0 or a negative errno.
 */
static int vxlan_fdb_create(struct vxlan_dev *vxlan,
			    const u8 *mac, __be32 ip,
			    __u16 state, __u16 flags,
			    __be16 port, __u32 vni, __u32 ifindex,
			    __u8 ndm_flags)
{
	struct vxlan_fdb *f;
	int notify = 0;

	f = __vxlan_find_mac(vxlan, mac);
	if (f) {
		if (flags & NLM_F_EXCL) {
			netdev_dbg(vxlan->dev,
				   "lost race to create %pM\n", mac);
			return -EEXIST;
		}
		if (f->state != state) {
			f->state = state;
			f->updated = jiffies;
			notify = 1;
		}
		if (f->flags != ndm_flags) {
			f->flags = ndm_flags;
			f->updated = jiffies;
			notify = 1;
		}
		/* Additional destinations may only be appended to
		 * multicast entries.
		 */
		if ((flags & NLM_F_APPEND) &&
		    is_multicast_ether_addr(f->eth_addr)) {
			int rc = vxlan_fdb_append(f, ip, port, vni, ifindex);

			if (rc < 0)
				return rc;
			notify |= rc;
		}
	} else {
		if (!(flags & NLM_F_CREATE))
			return -ENOENT;

		/* Enforce the per-device FDB size limit (0 = unlimited) */
		if (vxlan->addrmax && vxlan->addrcnt >= vxlan->addrmax)
			return -ENOSPC;

		netdev_dbg(vxlan->dev, "add %pM -> %pI4\n", mac, &ip);
		f = kmalloc(sizeof(*f), GFP_ATOMIC);
		if (!f)
			return -ENOMEM;

		notify = 1;
		f->remote.remote_ip = ip;
		f->remote.remote_port = port;
		f->remote.remote_vni = vni;
		f->remote.remote_ifindex = ifindex;
		f->remote.remote_next = NULL;
		f->state = state;
		f->flags = ndm_flags;
		f->updated = f->used = jiffies;
		memcpy(f->eth_addr, mac, ETH_ALEN);

		++vxlan->addrcnt;
		hlist_add_head_rcu(&f->hlist,
				   vxlan_fdb_head(vxlan, mac));
	}

	if (notify)
		vxlan_fdb_notify(vxlan, f, RTM_NEWNEIGH);

	return 0;
}
464
Wei Yongjun6706c822013-04-11 19:00:35 +0000465static void vxlan_fdb_free(struct rcu_head *head)
David Stevens66817122013-03-15 04:35:51 +0000466{
467 struct vxlan_fdb *f = container_of(head, struct vxlan_fdb, rcu);
468
469 while (f->remote.remote_next) {
470 struct vxlan_rdst *rd = f->remote.remote_next;
471
472 f->remote.remote_next = rd->remote_next;
473 kfree(rd);
474 }
475 kfree(f);
476}
477
/* Unlink @f, notify userspace, and free it after an RCU grace period.
 * Caller holds vxlan->hash_lock (see vxlan_fdb_delete()).
 */
static void vxlan_fdb_destroy(struct vxlan_dev *vxlan, struct vxlan_fdb *f)
{
	netdev_dbg(vxlan->dev,
		    "delete %pM\n", f->eth_addr);

	--vxlan->addrcnt;
	vxlan_fdb_notify(vxlan, f, RTM_DELNEIGH);

	hlist_del_rcu(&f->hlist);
	call_rcu(&f->rcu, vxlan_fdb_free);
}
489
/* Add static entry (via netlink).
 * Parses NDA_DST (required IPv4 address) plus optional NDA_PORT,
 * NDA_VNI and NDA_IFINDEX, filling in device defaults where absent,
 * then hands off to vxlan_fdb_create() under hash_lock.
 */
static int vxlan_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
			 struct net_device *dev,
			 const unsigned char *addr, u16 flags)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct net *net = dev_net(vxlan->dev);
	__be32 ip;
	__be16 port;
	u32 vni, ifindex;
	int err;

	if (!(ndm->ndm_state & (NUD_PERMANENT|NUD_REACHABLE))) {
		pr_info("RTM_NEWNEIGH with invalid state %#x\n",
			ndm->ndm_state);
		return -EINVAL;
	}

	if (tb[NDA_DST] == NULL)
		return -EINVAL;

	/* Only IPv4 destinations are supported */
	if (nla_len(tb[NDA_DST]) != sizeof(__be32))
		return -EAFNOSUPPORT;

	ip = nla_get_be32(tb[NDA_DST]);

	if (tb[NDA_PORT]) {
		if (nla_len(tb[NDA_PORT]) != sizeof(__be16))
			return -EINVAL;
		port = nla_get_be16(tb[NDA_PORT]);
	} else
		port = vxlan->dst_port;

	if (tb[NDA_VNI]) {
		if (nla_len(tb[NDA_VNI]) != sizeof(u32))
			return -EINVAL;
		vni = nla_get_u32(tb[NDA_VNI]);
	} else
		vni = vxlan->default_dst.remote_vni;

	if (tb[NDA_IFINDEX]) {
		struct net_device *tdev;

		if (nla_len(tb[NDA_IFINDEX]) != sizeof(u32))
			return -EINVAL;
		ifindex = nla_get_u32(tb[NDA_IFINDEX]);
		/* Validate that the interface exists; we only need the
		 * index, so drop the reference right away.
		 */
		tdev = dev_get_by_index(net, ifindex);
		if (!tdev)
			return -EADDRNOTAVAIL;
		dev_put(tdev);
	} else
		ifindex = 0;

	spin_lock_bh(&vxlan->hash_lock);
	err = vxlan_fdb_create(vxlan, addr, ip, ndm->ndm_state, flags,
			       port, vni, ifindex, ndm->ndm_flags);
	spin_unlock_bh(&vxlan->hash_lock);

	return err;
}
550
551/* Delete entry (via netlink) */
Vlad Yasevich1690be62013-02-13 12:00:18 +0000552static int vxlan_fdb_delete(struct ndmsg *ndm, struct nlattr *tb[],
553 struct net_device *dev,
stephen hemmingerd3428942012-10-01 12:32:35 +0000554 const unsigned char *addr)
555{
556 struct vxlan_dev *vxlan = netdev_priv(dev);
557 struct vxlan_fdb *f;
558 int err = -ENOENT;
559
560 spin_lock_bh(&vxlan->hash_lock);
561 f = vxlan_find_mac(vxlan, addr);
562 if (f) {
563 vxlan_fdb_destroy(vxlan, f);
564 err = 0;
565 }
566 spin_unlock_bh(&vxlan->hash_lock);
567
568 return err;
569}
570
571/* Dump forwarding table */
572static int vxlan_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
573 struct net_device *dev, int idx)
574{
575 struct vxlan_dev *vxlan = netdev_priv(dev);
576 unsigned int h;
577
578 for (h = 0; h < FDB_HASH_SIZE; ++h) {
579 struct vxlan_fdb *f;
stephen hemmingerd3428942012-10-01 12:32:35 +0000580 int err;
581
Sasha Levinb67bfe02013-02-27 17:06:00 -0800582 hlist_for_each_entry_rcu(f, &vxlan->fdb_head[h], hlist) {
David Stevens66817122013-03-15 04:35:51 +0000583 struct vxlan_rdst *rd;
584 for (rd = &f->remote; rd; rd = rd->remote_next) {
585 if (idx < cb->args[0])
586 goto skip;
stephen hemmingerd3428942012-10-01 12:32:35 +0000587
David Stevens66817122013-03-15 04:35:51 +0000588 err = vxlan_fdb_info(skb, vxlan, f,
589 NETLINK_CB(cb->skb).portid,
590 cb->nlh->nlmsg_seq,
591 RTM_NEWNEIGH,
592 NLM_F_MULTI, rd);
593 if (err < 0)
594 break;
stephen hemmingerd3428942012-10-01 12:32:35 +0000595skip:
David Stevens66817122013-03-15 04:35:51 +0000596 ++idx;
597 }
stephen hemmingerd3428942012-10-01 12:32:35 +0000598 }
599 }
600
601 return idx;
602}
603
604/* Watch incoming packets to learn mapping between Ethernet address
605 * and Tunnel endpoint.
606 */
607static void vxlan_snoop(struct net_device *dev,
608 __be32 src_ip, const u8 *src_mac)
609{
610 struct vxlan_dev *vxlan = netdev_priv(dev);
611 struct vxlan_fdb *f;
612 int err;
613
614 f = vxlan_find_mac(vxlan, src_mac);
615 if (likely(f)) {
David Stevens66817122013-03-15 04:35:51 +0000616 if (likely(f->remote.remote_ip == src_ip))
stephen hemmingerd3428942012-10-01 12:32:35 +0000617 return;
618
619 if (net_ratelimit())
620 netdev_info(dev,
621 "%pM migrated from %pI4 to %pI4\n",
David Stevens66817122013-03-15 04:35:51 +0000622 src_mac, &f->remote.remote_ip, &src_ip);
stephen hemmingerd3428942012-10-01 12:32:35 +0000623
David Stevens66817122013-03-15 04:35:51 +0000624 f->remote.remote_ip = src_ip;
stephen hemmingerd3428942012-10-01 12:32:35 +0000625 f->updated = jiffies;
626 } else {
627 /* learned new entry */
628 spin_lock(&vxlan->hash_lock);
629 err = vxlan_fdb_create(vxlan, src_mac, src_ip,
630 NUD_REACHABLE,
David Stevens66817122013-03-15 04:35:51 +0000631 NLM_F_EXCL|NLM_F_CREATE,
stephen hemminger823aa872013-04-27 11:31:57 +0000632 vxlan->dst_port,
David Stevensae884082013-04-19 00:36:26 +0000633 vxlan->default_dst.remote_vni,
634 0, NTF_SELF);
stephen hemmingerd3428942012-10-01 12:32:35 +0000635 spin_unlock(&vxlan->hash_lock);
636 }
637}
638
639
640/* See if multicast group is already in use by other ID */
641static bool vxlan_group_used(struct vxlan_net *vn,
642 const struct vxlan_dev *this)
643{
stephen hemminger553675f2013-05-16 11:35:20 +0000644 struct vxlan_dev *vxlan;
stephen hemmingerd3428942012-10-01 12:32:35 +0000645
stephen hemminger553675f2013-05-16 11:35:20 +0000646 list_for_each_entry(vxlan, &vn->vxlan_list, next) {
647 if (vxlan == this)
648 continue;
stephen hemmingerd3428942012-10-01 12:32:35 +0000649
stephen hemminger553675f2013-05-16 11:35:20 +0000650 if (!netif_running(vxlan->dev))
651 continue;
stephen hemmingerd3428942012-10-01 12:32:35 +0000652
stephen hemminger553675f2013-05-16 11:35:20 +0000653 if (vxlan->default_dst.remote_ip == this->default_dst.remote_ip)
654 return true;
655 }
stephen hemmingerd3428942012-10-01 12:32:35 +0000656
657 return false;
658}
659
/* kernel equivalent to IP_ADD_MEMBERSHIP.
 * Joins the device's default multicast group on the shared tunnel
 * socket, unless another running vxlan device already joined it.
 */
static int vxlan_join_group(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
	struct sock *sk = vxlan->vn_sock->sock->sk;
	struct ip_mreqn mreq = {
		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
	};
	int err;

	/* Already a member of group */
	if (vxlan_group_used(vn, vxlan))
		return 0;

	/* Need to drop RTNL to call multicast join.
	 * NOTE(review): device state may change while RTNL is released;
	 * callers are assumed to tolerate this — confirm.
	 */
	rtnl_unlock();
	lock_sock(sk);
	err = ip_mc_join_group(sk, &mreq);
	release_sock(sk);
	rtnl_lock();

	return err;
}
685
686
/* kernel equivalent to IP_DROP_MEMBERSHIP.
 * Leaves the default multicast group, but only if no other running
 * vxlan device in this namespace still uses the same group address.
 */
static int vxlan_leave_group(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_net *vn = net_generic(dev_net(dev), vxlan_net_id);
	int err = 0;
	struct sock *sk = vxlan->vn_sock->sock->sk;
	struct ip_mreqn mreq = {
		.imr_multiaddr.s_addr	= vxlan->default_dst.remote_ip,
		.imr_ifindex		= vxlan->default_dst.remote_ifindex,
	};

	/* Only leave group when last vxlan is done. */
	if (vxlan_group_used(vn, vxlan))
		return 0;

	/* Need to drop RTNL to call multicast leave */
	rtnl_unlock();
	lock_sock(sk);
	err = ip_mc_leave_group(sk, &mreq);
	release_sock(sk);
	rtnl_lock();

	return err;
}
712
/* Callback from net/ipv4/udp.c to receive packets.
 * Return 0 when the skb was consumed (delivered or dropped); return 1
 * with the UDP header restored to hand the packet back to the UDP stack.
 */
static int vxlan_udp_encap_recv(struct sock *sk, struct sk_buff *skb)
{
	struct iphdr *oip;
	struct vxlanhdr *vxh;
	struct vxlan_dev *vxlan;
	struct pcpu_tstats *stats;
	__be16 port;
	__u32 vni;
	int err;

	/* pop off outer UDP header */
	__skb_pull(skb, sizeof(struct udphdr));

	/* Need Vxlan and inner Ethernet header to be present */
	if (!pskb_may_pull(skb, sizeof(struct vxlanhdr)))
		goto error;

	/* Drop packets with reserved bits set */
	vxh = (struct vxlanhdr *) skb->data;
	if (vxh->vx_flags != htonl(VXLAN_FLAGS) ||
	    (vxh->vx_vni & htonl(0xff))) {
		netdev_dbg(skb->dev, "invalid vxlan flags=%#x vni=%#x\n",
			   ntohl(vxh->vx_flags), ntohl(vxh->vx_vni));
		goto error;
	}

	__skb_pull(skb, sizeof(struct vxlanhdr));

	/* Is this VNI defined?  Lookup is keyed on (netns, VNI, local port) */
	vni = ntohl(vxh->vx_vni) >> 8;
	port = inet_sk(sk)->inet_sport;
	vxlan = vxlan_find_vni(sock_net(sk), vni, port);
	if (!vxlan) {
		netdev_dbg(skb->dev, "unknown vni %d port %u\n",
			   vni, ntohs(port));
		goto drop;
	}

	if (!pskb_may_pull(skb, ETH_HLEN)) {
		vxlan->dev->stats.rx_length_errors++;
		vxlan->dev->stats.rx_errors++;
		goto drop;
	}

	skb_reset_mac_header(skb);

	/* Re-examine inner Ethernet packet */
	oip = ip_hdr(skb);
	skb->protocol = eth_type_trans(skb, vxlan->dev);

	/* Ignore packet loops (and multicast echo) */
	if (compare_ether_addr(eth_hdr(skb)->h_source,
			       vxlan->dev->dev_addr) == 0)
		goto drop;

	/* Learn the sender's MAC -> outer source IP mapping */
	if (vxlan->flags & VXLAN_F_LEARN)
		vxlan_snoop(skb->dev, oip->saddr, eth_hdr(skb)->h_source);

	__skb_tunnel_rx(skb, vxlan->dev);
	skb_reset_network_header(skb);

	/* If the NIC driver gave us an encapsulated packet with
	 * CHECKSUM_UNNECESSARY and Rx checksum feature is enabled,
	 * leave the CHECKSUM_UNNECESSARY, the device checksummed it
	 * for us. Otherwise force the upper layers to verify it.
	 */
	if (skb->ip_summed != CHECKSUM_UNNECESSARY || !skb->encapsulation ||
	    !(vxlan->dev->features & NETIF_F_RXCSUM))
		skb->ip_summed = CHECKSUM_NONE;

	skb->encapsulation = 0;

	/* Propagate ECN from the outer IP header into the inner packet */
	err = IP_ECN_decapsulate(oip, skb);
	if (unlikely(err)) {
		if (log_ecn_error)
			net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
					     &oip->saddr, oip->tos);
		if (err > 1) {
			++vxlan->dev->stats.rx_frame_errors;
			++vxlan->dev->stats.rx_errors;
			goto drop;
		}
	}

	stats = this_cpu_ptr(vxlan->dev->tstats);
	u64_stats_update_begin(&stats->syncp);
	stats->rx_packets++;
	stats->rx_bytes += skb->len;
	u64_stats_update_end(&stats->syncp);

	netif_rx(skb);

	return 0;
error:
	/* Put UDP header back */
	__skb_push(skb, sizeof(struct udphdr));

	return 1;
drop:
	/* Consume bad packet */
	kfree_skb(skb);
	return 0;
}
817
David Stevense4f67ad2012-11-20 02:50:14 +0000818static int arp_reduce(struct net_device *dev, struct sk_buff *skb)
819{
820 struct vxlan_dev *vxlan = netdev_priv(dev);
821 struct arphdr *parp;
822 u8 *arpptr, *sha;
823 __be32 sip, tip;
824 struct neighbour *n;
825
826 if (dev->flags & IFF_NOARP)
827 goto out;
828
829 if (!pskb_may_pull(skb, arp_hdr_len(dev))) {
830 dev->stats.tx_dropped++;
831 goto out;
832 }
833 parp = arp_hdr(skb);
834
835 if ((parp->ar_hrd != htons(ARPHRD_ETHER) &&
836 parp->ar_hrd != htons(ARPHRD_IEEE802)) ||
837 parp->ar_pro != htons(ETH_P_IP) ||
838 parp->ar_op != htons(ARPOP_REQUEST) ||
839 parp->ar_hln != dev->addr_len ||
840 parp->ar_pln != 4)
841 goto out;
842 arpptr = (u8 *)parp + sizeof(struct arphdr);
843 sha = arpptr;
844 arpptr += dev->addr_len; /* sha */
845 memcpy(&sip, arpptr, sizeof(sip));
846 arpptr += sizeof(sip);
847 arpptr += dev->addr_len; /* tha */
848 memcpy(&tip, arpptr, sizeof(tip));
849
850 if (ipv4_is_loopback(tip) ||
851 ipv4_is_multicast(tip))
852 goto out;
853
854 n = neigh_lookup(&arp_tbl, &tip, dev);
855
856 if (n) {
David Stevense4f67ad2012-11-20 02:50:14 +0000857 struct vxlan_fdb *f;
858 struct sk_buff *reply;
859
860 if (!(n->nud_state & NUD_CONNECTED)) {
861 neigh_release(n);
862 goto out;
863 }
864
865 f = vxlan_find_mac(vxlan, n->ha);
David Stevens66817122013-03-15 04:35:51 +0000866 if (f && f->remote.remote_ip == htonl(INADDR_ANY)) {
David Stevense4f67ad2012-11-20 02:50:14 +0000867 /* bridge-local neighbor */
868 neigh_release(n);
869 goto out;
870 }
871
872 reply = arp_create(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,
873 n->ha, sha);
874
875 neigh_release(n);
876
877 skb_reset_mac_header(reply);
878 __skb_pull(reply, skb_network_offset(reply));
879 reply->ip_summed = CHECKSUM_UNNECESSARY;
880 reply->pkt_type = PACKET_HOST;
881
882 if (netif_rx_ni(reply) == NET_RX_DROP)
883 dev->stats.rx_dropped++;
884 } else if (vxlan->flags & VXLAN_F_L3MISS)
885 vxlan_ip_miss(dev, tip);
886out:
887 consume_skb(skb);
888 return NETDEV_TX_OK;
889}
890
/* Route short-circuiting: rewrite the Ethernet header so a packet
 * destined for a router MAC goes directly to the resolved next hop.
 * Returns true when the destination MAC was rewritten (the old dst
 * becomes the new src, mimicking a routed hop), false otherwise.
 * Non-IPv4 and multicast frames are left untouched.
 */
static bool route_shortcircuit(struct net_device *dev, struct sk_buff *skb)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct neighbour *n;
	struct iphdr *pip;

	if (is_multicast_ether_addr(eth_hdr(skb)->h_dest))
		return false;

	n = NULL;
	switch (ntohs(eth_hdr(skb)->h_proto)) {
	case ETH_P_IP:
		/* Need the IPv4 header linear before reading daddr. */
		if (!pskb_may_pull(skb, sizeof(struct iphdr)))
			return false;
		pip = ip_hdr(skb);
		n = neigh_lookup(&arp_tbl, &pip->daddr, dev);
		break;
	default:
		return false;
	}

	if (n) {
		bool diff;

		diff = compare_ether_addr(eth_hdr(skb)->h_dest, n->ha) != 0;
		if (diff) {
			memcpy(eth_hdr(skb)->h_source, eth_hdr(skb)->h_dest,
				dev->addr_len);
			memcpy(eth_hdr(skb)->h_dest, n->ha, dev->addr_len);
		}
		neigh_release(n);
		return diff;
	} else if (vxlan->flags & VXLAN_F_L3MISS)
		/* pip is valid here: this branch is only reachable through
		 * the ETH_P_IP case above (default returns early).
		 */
		vxlan_ip_miss(dev, pip->daddr);
	return false;
}
927
stephen hemminger553675f2013-05-16 11:35:20 +0000928static void vxlan_sock_put(struct sk_buff *skb)
stephen hemminger1cad8712012-10-09 20:35:49 +0000929{
930 sock_put(skb->sk);
931}
932
933/* On transmit, associate with the tunnel socket */
934static void vxlan_set_owner(struct net_device *dev, struct sk_buff *skb)
935{
stephen hemminger553675f2013-05-16 11:35:20 +0000936 struct vxlan_dev *vxlan = netdev_priv(dev);
937 struct sock *sk = vxlan->vn_sock->sock->sk;
stephen hemminger1cad8712012-10-09 20:35:49 +0000938
939 skb_orphan(skb);
940 sock_hold(sk);
941 skb->sk = sk;
stephen hemminger553675f2013-05-16 11:35:20 +0000942 skb->destructor = vxlan_sock_put;
stephen hemminger1cad8712012-10-09 20:35:49 +0000943}
944
stephen hemminger05f47d62012-10-09 20:35:50 +0000945/* Compute source port for outgoing packet
946 * first choice to use L4 flow hash since it will spread
947 * better and maybe available from hardware
948 * secondary choice is to use jhash on the Ethernet header
949 */
stephen hemminger7d836a72013-04-27 11:31:56 +0000950static __be16 vxlan_src_port(const struct vxlan_dev *vxlan, struct sk_buff *skb)
stephen hemminger05f47d62012-10-09 20:35:50 +0000951{
952 unsigned int range = (vxlan->port_max - vxlan->port_min) + 1;
953 u32 hash;
954
955 hash = skb_get_rxhash(skb);
956 if (!hash)
957 hash = jhash(skb->data, 2 * ETH_ALEN,
958 (__force u32) skb->protocol);
959
stephen hemminger7d836a72013-04-27 11:31:56 +0000960 return htons((((u64) hash * range) >> 32) + vxlan->port_min);
stephen hemminger05f47d62012-10-09 20:35:50 +0000961}
962
Pravin B Shelar05c0db02013-03-07 13:22:36 +0000963static int handle_offloads(struct sk_buff *skb)
964{
965 if (skb_is_gso(skb)) {
966 int err = skb_unclone(skb, GFP_ATOMIC);
967 if (unlikely(err))
968 return err;
969
Dmitry Kravkovf6ace502013-04-28 08:16:01 +0000970 skb_shinfo(skb)->gso_type |= SKB_GSO_UDP_TUNNEL;
Pravin B Shelar05c0db02013-03-07 13:22:36 +0000971 } else if (skb->ip_summed != CHECKSUM_PARTIAL)
972 skb->ip_summed = CHECKSUM_NONE;
973
974 return 0;
975}
976
Sridhar Samudrala9dcc71e2013-04-02 12:31:52 +0000977/* Bypass encapsulation if the destination is local */
978static void vxlan_encap_bypass(struct sk_buff *skb, struct vxlan_dev *src_vxlan,
979 struct vxlan_dev *dst_vxlan)
980{
981 struct pcpu_tstats *tx_stats = this_cpu_ptr(src_vxlan->dev->tstats);
982 struct pcpu_tstats *rx_stats = this_cpu_ptr(dst_vxlan->dev->tstats);
983
984 skb->pkt_type = PACKET_HOST;
985 skb->encapsulation = 0;
986 skb->dev = dst_vxlan->dev;
987 __skb_pull(skb, skb_network_offset(skb));
988
989 if (dst_vxlan->flags & VXLAN_F_LEARN)
Mike Rapoport9d9f1632013-04-13 23:21:39 +0000990 vxlan_snoop(skb->dev, htonl(INADDR_LOOPBACK),
991 eth_hdr(skb)->h_source);
Sridhar Samudrala9dcc71e2013-04-02 12:31:52 +0000992
993 u64_stats_update_begin(&tx_stats->syncp);
994 tx_stats->tx_packets++;
995 tx_stats->tx_bytes += skb->len;
996 u64_stats_update_end(&tx_stats->syncp);
997
998 if (netif_rx(skb) == NET_RX_SUCCESS) {
999 u64_stats_update_begin(&rx_stats->syncp);
1000 rx_stats->rx_packets++;
1001 rx_stats->rx_bytes += skb->len;
1002 u64_stats_update_end(&rx_stats->syncp);
1003 } else {
1004 skb->dev->stats.rx_dropped++;
1005 }
1006}
1007
/* Encapsulate one skb and send it to a single remote destination.
 *
 * Builds VXLAN + UDP + outer IPv4 headers, routes the result and hands
 * it to iptunnel_xmit().  If the route is local (RTCF_LOCAL and not
 * broadcast/multicast) encapsulation is bypassed and the frame is
 * delivered directly to the target vxlan device.  Always returns
 * NETDEV_TX_OK; failures are accounted in dev->stats and the skb is
 * freed.
 */
static netdev_tx_t vxlan_xmit_one(struct sk_buff *skb, struct net_device *dev,
				  struct vxlan_rdst *rdst, bool did_rsc)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct rtable *rt;
	const struct iphdr *old_iph;
	struct iphdr *iph;
	struct vxlanhdr *vxh;
	struct udphdr *uh;
	struct flowi4 fl4;
	__be32 dst;
	__be16 src_port, dst_port;
	u32 vni;
	__be16 df = 0;
	__u8 tos, ttl;

	/* Per-destination port overrides the device default. */
	dst_port = rdst->remote_port ? rdst->remote_port : vxlan->dst_port;
	vni = rdst->remote_vni;
	dst = rdst->remote_ip;

	if (!dst) {
		if (did_rsc) {
			/* short-circuited back to local bridge */
			vxlan_encap_bypass(skb, vxlan, vxlan);
			return NETDEV_TX_OK;
		}
		goto drop;
	}

	if (!skb->encapsulation) {
		skb_reset_inner_headers(skb);
		skb->encapsulation = 1;
	}

	/* Need space for new headers (invalidates iph ptr) */
	if (skb_cow_head(skb, VXLAN_HEADROOM))
		goto drop;

	old_iph = ip_hdr(skb);

	/* Default TTL 1 for multicast destinations (stay on-link). */
	ttl = vxlan->ttl;
	if (!ttl && IN_MULTICAST(ntohl(dst)))
		ttl = 1;

	/* tos == 1 means "inherit" from the inner header. */
	tos = vxlan->tos;
	if (tos == 1)
		tos = ip_tunnel_get_dsfield(old_iph, skb);

	src_port = vxlan_src_port(vxlan, skb);

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_oif = rdst->remote_ifindex;
	fl4.flowi4_tos = RT_TOS(tos);
	fl4.daddr = dst;
	fl4.saddr = vxlan->saddr;

	rt = ip_route_output_key(dev_net(dev), &fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &dst);
		dev->stats.tx_carrier_errors++;
		goto tx_error;
	}

	/* A route back through ourselves would loop forever. */
	if (rt->dst.dev == dev) {
		netdev_dbg(dev, "circular route to %pI4\n", &dst);
		ip_rt_put(rt);
		dev->stats.collisions++;
		goto tx_error;
	}

	/* Bypass encapsulation if the destination is local */
	if (rt->rt_flags & RTCF_LOCAL &&
	    !(rt->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST))) {
		struct vxlan_dev *dst_vxlan;

		ip_rt_put(rt);
		dst_vxlan = vxlan_find_vni(dev_net(dev), vni, dst_port);
		if (!dst_vxlan)
			goto tx_error;
		vxlan_encap_bypass(skb, vxlan, dst_vxlan);
		return NETDEV_TX_OK;
	}

	memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
	IPCB(skb)->flags &= ~(IPSKB_XFRM_TUNNEL_SIZE | IPSKB_XFRM_TRANSFORMED |
			      IPSKB_REROUTED);
	skb_dst_drop(skb);
	skb_dst_set(skb, &rt->dst);

	/* VXLAN header: flags word + VNI in the upper 24 bits. */
	vxh = (struct vxlanhdr *) __skb_push(skb, sizeof(*vxh));
	vxh->vx_flags = htonl(VXLAN_FLAGS);
	vxh->vx_vni = htonl(vni << 8);

	__skb_push(skb, sizeof(*uh));
	skb_reset_transport_header(skb);
	uh = udp_hdr(skb);

	uh->dest = dst_port;
	uh->source = src_port;

	uh->len = htons(skb->len);
	uh->check = 0;	/* zero UDP checksum is permitted for tunnels */

	__skb_push(skb, sizeof(*iph));
	skb_reset_network_header(skb);
	iph = ip_hdr(skb);
	iph->version = 4;
	iph->ihl = sizeof(struct iphdr) >> 2;
	/* NOTE(review): df stays 0 here, so DF is never inherited from
	 * the inner header despite vxlan_xmit()'s comment — confirm.
	 */
	iph->frag_off = df;
	iph->protocol = IPPROTO_UDP;
	iph->tos = ip_tunnel_ecn_encap(tos, old_iph, skb);
	iph->daddr = dst;
	iph->saddr = fl4.saddr;
	iph->ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
	tunnel_ip_select_ident(skb, old_iph, &rt->dst);

	nf_reset(skb);

	/* Charge the skb to the shared tunnel socket. */
	vxlan_set_owner(dev, skb);

	if (handle_offloads(skb))
		goto drop;

	iptunnel_xmit(skb, dev);
	return NETDEV_TX_OK;

drop:
	dev->stats.tx_dropped++;
	goto tx_free;

tx_error:
	dev->stats.tx_errors++;
tx_free:
	dev_kfree_skb(skb);
	return NETDEV_TX_OK;
}
1144
David Stevens66817122013-03-15 04:35:51 +00001145/* Transmit local packets over Vxlan
1146 *
1147 * Outer IP header inherits ECN and DF from inner header.
1148 * Outer UDP destination is the VXLAN assigned port.
1149 * source port is based on hash of flow
1150 */
1151static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct net_device *dev)
1152{
1153 struct vxlan_dev *vxlan = netdev_priv(dev);
1154 struct ethhdr *eth;
1155 bool did_rsc = false;
Atzm Watanabec7995c42013-04-16 02:50:52 +00001156 struct vxlan_rdst *rdst0, *rdst;
David Stevens66817122013-03-15 04:35:51 +00001157 struct vxlan_fdb *f;
1158 int rc1, rc;
1159
1160 skb_reset_mac_header(skb);
1161 eth = eth_hdr(skb);
1162
1163 if ((vxlan->flags & VXLAN_F_PROXY) && ntohs(eth->h_proto) == ETH_P_ARP)
1164 return arp_reduce(dev, skb);
David Stevens66817122013-03-15 04:35:51 +00001165
1166 f = vxlan_find_mac(vxlan, eth->h_dest);
David Stevensae884082013-04-19 00:36:26 +00001167 did_rsc = false;
1168
1169 if (f && (f->flags & NTF_ROUTER) && (vxlan->flags & VXLAN_F_RSC) &&
1170 ntohs(eth->h_proto) == ETH_P_IP) {
1171 did_rsc = route_shortcircuit(dev, skb);
1172 if (did_rsc)
1173 f = vxlan_find_mac(vxlan, eth->h_dest);
1174 }
1175
David Stevens66817122013-03-15 04:35:51 +00001176 if (f == NULL) {
Atzm Watanabec7995c42013-04-16 02:50:52 +00001177 rdst0 = &vxlan->default_dst;
David Stevens66817122013-03-15 04:35:51 +00001178
Atzm Watanabec7995c42013-04-16 02:50:52 +00001179 if (rdst0->remote_ip == htonl(INADDR_ANY) &&
David Stevens66817122013-03-15 04:35:51 +00001180 (vxlan->flags & VXLAN_F_L2MISS) &&
1181 !is_multicast_ether_addr(eth->h_dest))
1182 vxlan_fdb_miss(vxlan, eth->h_dest);
1183 } else
1184 rdst0 = &f->remote;
1185
1186 rc = NETDEV_TX_OK;
1187
1188 /* if there are multiple destinations, send copies */
1189 for (rdst = rdst0->remote_next; rdst; rdst = rdst->remote_next) {
1190 struct sk_buff *skb1;
1191
1192 skb1 = skb_clone(skb, GFP_ATOMIC);
1193 rc1 = vxlan_xmit_one(skb1, dev, rdst, did_rsc);
1194 if (rc == NETDEV_TX_OK)
1195 rc = rc1;
1196 }
1197
1198 rc1 = vxlan_xmit_one(skb, dev, rdst0, did_rsc);
1199 if (rc == NETDEV_TX_OK)
1200 rc = rc1;
1201 return rc;
1202}
1203
/* Walk the forwarding table and purge stale entries.
 * Timer callback (vxlan->age_timer); re-arms itself for the earliest
 * future expiry so entries are collected close to their deadline.
 */
static void vxlan_cleanup(unsigned long arg)
{
	struct vxlan_dev *vxlan = (struct vxlan_dev *) arg;
	unsigned long next_timer = jiffies + FDB_AGE_INTERVAL;
	unsigned int h;

	/* The timer can fire while the device is going down; skip. */
	if (!netif_running(vxlan->dev))
		return;

	spin_lock_bh(&vxlan->hash_lock);
	for (h = 0; h < FDB_HASH_SIZE; ++h) {
		struct hlist_node *p, *n;
		hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
			struct vxlan_fdb *f
				= container_of(p, struct vxlan_fdb, hlist);
			unsigned long timeout;

			/* Static (permanent) entries never age out. */
			if (f->state & NUD_PERMANENT)
				continue;

			timeout = f->used + vxlan->age_interval * HZ;
			if (time_before_eq(timeout, jiffies)) {
				netdev_dbg(vxlan->dev,
					   "garbage collect %pM\n",
					   f->eth_addr);
				f->state = NUD_STALE;
				vxlan_fdb_destroy(vxlan, f);
			} else if (time_before(timeout, next_timer))
				/* Track the earliest future expiry. */
				next_timer = timeout;
		}
	}
	spin_unlock_bh(&vxlan->hash_lock);

	mod_timer(&vxlan->age_timer, next_timer);
}
1240
1241/* Setup stats when device is created */
1242static int vxlan_init(struct net_device *dev)
1243{
Pravin B Shelare8171042013-03-25 14:49:46 +00001244 dev->tstats = alloc_percpu(struct pcpu_tstats);
1245 if (!dev->tstats)
stephen hemmingerd3428942012-10-01 12:32:35 +00001246 return -ENOMEM;
1247
1248 return 0;
1249}
1250
1251/* Start ageing timer and join group when device is brought up */
1252static int vxlan_open(struct net_device *dev)
1253{
1254 struct vxlan_dev *vxlan = netdev_priv(dev);
1255 int err;
1256
Atzm Watanabec7995c42013-04-16 02:50:52 +00001257 if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip))) {
stephen hemmingerd3428942012-10-01 12:32:35 +00001258 err = vxlan_join_group(dev);
1259 if (err)
1260 return err;
1261 }
1262
1263 if (vxlan->age_interval)
1264 mod_timer(&vxlan->age_timer, jiffies + FDB_AGE_INTERVAL);
1265
1266 return 0;
1267}
1268
1269/* Purge the forwarding table */
1270static void vxlan_flush(struct vxlan_dev *vxlan)
1271{
Cong Wang31fec5a2013-05-27 22:35:52 +00001272 unsigned int h;
stephen hemmingerd3428942012-10-01 12:32:35 +00001273
1274 spin_lock_bh(&vxlan->hash_lock);
1275 for (h = 0; h < FDB_HASH_SIZE; ++h) {
1276 struct hlist_node *p, *n;
1277 hlist_for_each_safe(p, n, &vxlan->fdb_head[h]) {
1278 struct vxlan_fdb *f
1279 = container_of(p, struct vxlan_fdb, hlist);
1280 vxlan_fdb_destroy(vxlan, f);
1281 }
1282 }
1283 spin_unlock_bh(&vxlan->hash_lock);
1284}
1285
1286/* Cleanup timer and forwarding table on shutdown */
1287static int vxlan_stop(struct net_device *dev)
1288{
1289 struct vxlan_dev *vxlan = netdev_priv(dev);
1290
Atzm Watanabec7995c42013-04-16 02:50:52 +00001291 if (IN_MULTICAST(ntohl(vxlan->default_dst.remote_ip)))
stephen hemmingerd3428942012-10-01 12:32:35 +00001292 vxlan_leave_group(dev);
1293
1294 del_timer_sync(&vxlan->age_timer);
1295
1296 vxlan_flush(vxlan);
1297
1298 return 0;
1299}
1300
/* Stub, nothing needs to be done.
 * NOTE(review): presumably an intentional no-op — rx filtering has no
 * meaning for a tunnel endpoint; confirm before removing.
 */
static void vxlan_set_multicast_list(struct net_device *dev)
{
}
1305
/* Netdev operations: FDB add/del/dump hook into the bridge netlink
 * interface; stats come from the per-cpu counters set up in vxlan_init.
 */
static const struct net_device_ops vxlan_netdev_ops = {
	.ndo_init		= vxlan_init,
	.ndo_open		= vxlan_open,
	.ndo_stop		= vxlan_stop,
	.ndo_start_xmit		= vxlan_xmit,
	.ndo_get_stats64	= ip_tunnel_get_stats64,
	.ndo_set_rx_mode	= vxlan_set_multicast_list,
	.ndo_change_mtu		= eth_change_mtu,
	.ndo_validate_addr	= eth_validate_addr,
	.ndo_set_mac_address	= eth_mac_addr,
	.ndo_fdb_add		= vxlan_fdb_add,
	.ndo_fdb_del		= vxlan_fdb_delete,
	.ndo_fdb_dump		= vxlan_fdb_dump,
};
1320
/* Info for udev, that this is a virtual tunnel endpoint
 * (exposed as DEVTYPE in the uevent environment).
 */
static struct device_type vxlan_type = {
	.name = "vxlan",
};
1325
/* netdev destructor: release the per-cpu stats allocated in
 * vxlan_init() before freeing the netdev itself.
 */
static void vxlan_free(struct net_device *dev)
{
	free_percpu(dev->tstats);
	free_netdev(dev);
}
1331
/* Initialize the device structure.
 * rtnl_link setup callback: configures an Ethernet-style netdev with
 * tunnel headroom, offload features, the FDB hash table, the ageing
 * timer and the default source-port range.
 */
static void vxlan_setup(struct net_device *dev)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	unsigned int h;
	int low, high;

	eth_hw_addr_random(dev);
	ether_setup(dev);
	/* Reserve room for the outer Ethernet/IP/UDP/VXLAN headers. */
	dev->hard_header_len = ETH_HLEN + VXLAN_HEADROOM;

	dev->netdev_ops = &vxlan_netdev_ops;
	dev->destructor = vxlan_free;
	SET_NETDEV_DEVTYPE(dev, &vxlan_type);

	dev->tx_queue_len = 0;
	dev->features	|= NETIF_F_LLTX;
	dev->features	|= NETIF_F_NETNS_LOCAL;
	dev->features	|= NETIF_F_SG | NETIF_F_HW_CSUM;
	dev->features   |= NETIF_F_RXCSUM;
	dev->features   |= NETIF_F_GSO_SOFTWARE;

	dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
	dev->hw_features |= NETIF_F_GSO_SOFTWARE;
	dev->priv_flags	&= ~IFF_XMIT_DST_RELEASE;
	dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;

	INIT_LIST_HEAD(&vxlan->next);
	spin_lock_init(&vxlan->hash_lock);

	/* Deferrable: FDB ageing need not wake an idle CPU. */
	init_timer_deferrable(&vxlan->age_timer);
	vxlan->age_timer.function = vxlan_cleanup;
	vxlan->age_timer.data = (unsigned long) vxlan;

	/* Default UDP source-port range mirrors local ephemeral ports;
	 * may be overridden via IFLA_VXLAN_PORT_RANGE.
	 */
	inet_get_local_port_range(&low, &high);
	vxlan->port_min = low;
	vxlan->port_max = high;
	vxlan->dst_port = htons(vxlan_port);

	vxlan->dev = dev;

	for (h = 0; h < FDB_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vxlan->fdb_head[h]);
}
1376
/* Netlink policy: expected type/length of each IFLA_VXLAN_* attribute,
 * validated by the rtnetlink core before vxlan_validate()/vxlan_newlink().
 */
static const struct nla_policy vxlan_policy[IFLA_VXLAN_MAX + 1] = {
	[IFLA_VXLAN_ID]		= { .type = NLA_U32 },
	[IFLA_VXLAN_GROUP]	= { .len = FIELD_SIZEOF(struct iphdr, daddr) },
	[IFLA_VXLAN_LINK]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LOCAL]	= { .len = FIELD_SIZEOF(struct iphdr, saddr) },
	[IFLA_VXLAN_TOS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_TTL]	= { .type = NLA_U8 },
	[IFLA_VXLAN_LEARNING]	= { .type = NLA_U8 },
	[IFLA_VXLAN_AGEING]	= { .type = NLA_U32 },
	[IFLA_VXLAN_LIMIT]	= { .type = NLA_U32 },
	[IFLA_VXLAN_PORT_RANGE] = { .len  = sizeof(struct ifla_vxlan_port_range) },
	[IFLA_VXLAN_PROXY]	= { .type = NLA_U8 },
	[IFLA_VXLAN_RSC]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L2MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_L3MISS]	= { .type = NLA_U8 },
	[IFLA_VXLAN_PORT]	= { .type = NLA_U16 },
};
1394
1395static int vxlan_validate(struct nlattr *tb[], struct nlattr *data[])
1396{
1397 if (tb[IFLA_ADDRESS]) {
1398 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
1399 pr_debug("invalid link address (not ethernet)\n");
1400 return -EINVAL;
1401 }
1402
1403 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
1404 pr_debug("invalid all zero ethernet address\n");
1405 return -EADDRNOTAVAIL;
1406 }
1407 }
1408
1409 if (!data)
1410 return -EINVAL;
1411
1412 if (data[IFLA_VXLAN_ID]) {
1413 __u32 id = nla_get_u32(data[IFLA_VXLAN_ID]);
1414 if (id >= VXLAN_VID_MASK)
1415 return -ERANGE;
1416 }
1417
stephen hemminger05f47d62012-10-09 20:35:50 +00001418 if (data[IFLA_VXLAN_PORT_RANGE]) {
1419 const struct ifla_vxlan_port_range *p
1420 = nla_data(data[IFLA_VXLAN_PORT_RANGE]);
1421
1422 if (ntohs(p->high) < ntohs(p->low)) {
1423 pr_debug("port range %u .. %u not valid\n",
1424 ntohs(p->low), ntohs(p->high));
1425 return -EINVAL;
1426 }
1427 }
1428
stephen hemmingerd3428942012-10-01 12:32:35 +00001429 return 0;
1430}
1431
Yan Burman1b13c972013-01-29 23:43:07 +00001432static void vxlan_get_drvinfo(struct net_device *netdev,
1433 struct ethtool_drvinfo *drvinfo)
1434{
1435 strlcpy(drvinfo->version, VXLAN_VERSION, sizeof(drvinfo->version));
1436 strlcpy(drvinfo->driver, "vxlan", sizeof(drvinfo->driver));
1437}
1438
/* Minimal ethtool support: driver identification and link state. */
static const struct ethtool_ops vxlan_ethtool_ops = {
	.get_drvinfo	= vxlan_get_drvinfo,
	.get_link	= ethtool_op_get_link,
};
1443
/* Deferred teardown of a shared vxlan UDP socket, scheduled from
 * vxlan_dellink() once the last device drops its reference.
 * NOTE(review): deferred to a workqueue presumably so the socket is
 * not released in the dellink path (under RTNL) — confirm.
 */
static void vxlan_del_work(struct work_struct *work)
{
	struct vxlan_sock *vs = container_of(work, struct vxlan_sock, del_work);

	/* Release the socket first; the vxlan_sock itself waits for an
	 * RCU grace period before being freed.
	 */
	sk_release_kernel(vs->sock->sk);
	kfree_rcu(vs, rcu);
}
1451
/* Create new listen socket if needed.
 * Allocates a vxlan_sock, opens a kernel UDP socket bound to
 * INADDR_ANY:port in the given netns, and marks it as a VXLAN
 * encapsulation socket.  Returns the new vxlan_sock with refcnt 1, or
 * an ERR_PTR on failure.  Called with RTNL dropped (see vxlan_newlink).
 */
static struct vxlan_sock *vxlan_socket_create(struct net *net, __be16 port)
{
	struct vxlan_sock *vs;
	struct sock *sk;
	struct sockaddr_in vxlan_addr = {
		.sin_family = AF_INET,
		.sin_addr.s_addr = htonl(INADDR_ANY),
	};
	int rc;
	unsigned int h;

	vs = kmalloc(sizeof(*vs), GFP_KERNEL);
	if (!vs)
		return ERR_PTR(-ENOMEM);

	/* One VNI hash table per socket: devices sharing the port are
	 * looked up by VNI on receive.
	 */
	for (h = 0; h < VNI_HASH_SIZE; ++h)
		INIT_HLIST_HEAD(&vs->vni_list[h]);

	INIT_WORK(&vs->del_work, vxlan_del_work);

	/* Create UDP socket for encapsulation receive. */
	rc = sock_create_kern(AF_INET, SOCK_DGRAM, IPPROTO_UDP, &vs->sock);
	if (rc < 0) {
		pr_debug("UDP socket create failed\n");
		kfree(vs);
		return ERR_PTR(rc);
	}

	/* Put in proper namespace */
	sk = vs->sock->sk;
	sk_change_net(sk, net);

	vxlan_addr.sin_port = port;

	rc = kernel_bind(vs->sock, (struct sockaddr *) &vxlan_addr,
			 sizeof(vxlan_addr));
	if (rc < 0) {
		pr_debug("bind for UDP socket %pI4:%u (%d)\n",
			 &vxlan_addr.sin_addr, ntohs(vxlan_addr.sin_port), rc);
		sk_release_kernel(sk);
		kfree(vs);
		return ERR_PTR(rc);
	}

	/* Disable multicast loopback */
	inet_sk(sk)->mc_loop = 0;

	/* Mark socket as an encapsulation socket. */
	udp_sk(sk)->encap_type = 1;
	udp_sk(sk)->encap_rcv = vxlan_udp_encap_recv;
	udp_encap_enable();

	vs->refcnt = 1;
	return vs;
}
1508
/* Create a vxlan device from rtnl_link attributes.
 *
 * Parses the IFLA_VXLAN_* attributes into the private struct, enforces
 * (VNI, dst UDP port) uniqueness within the netns, finds or creates the
 * shared UDP encapsulation socket for the port, and registers the
 * netdev.  Returns 0 or a negative errno.
 *
 * NOTE(review): RTNL is dropped and re-acquired around socket
 * creation and the error-path release — confirm concurrent newlink
 * calls for the same port cannot race past the duplicate-VNI check.
 */
static int vxlan_newlink(struct net *net, struct net_device *dev,
			 struct nlattr *tb[], struct nlattr *data[])
{
	struct vxlan_net *vn = net_generic(net, vxlan_net_id);
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_rdst *dst = &vxlan->default_dst;
	struct vxlan_sock *vs;
	__u32 vni;
	int err;

	if (!data[IFLA_VXLAN_ID])
		return -EINVAL;

	vni = nla_get_u32(data[IFLA_VXLAN_ID]);
	dst->remote_vni = vni;

	if (data[IFLA_VXLAN_GROUP])
		dst->remote_ip = nla_get_be32(data[IFLA_VXLAN_GROUP]);

	if (data[IFLA_VXLAN_LOCAL])
		vxlan->saddr = nla_get_be32(data[IFLA_VXLAN_LOCAL]);

	if (data[IFLA_VXLAN_LINK] &&
	    (dst->remote_ifindex = nla_get_u32(data[IFLA_VXLAN_LINK]))) {
		struct net_device *lowerdev
			 = __dev_get_by_index(net, dst->remote_ifindex);

		if (!lowerdev) {
			pr_info("ifindex %d does not exist\n", dst->remote_ifindex);
			return -ENODEV;
		}

		/* Default MTU leaves room for the encapsulation headers. */
		if (!tb[IFLA_MTU])
			dev->mtu = lowerdev->mtu - VXLAN_HEADROOM;

		/* update header length based on lower device */
		dev->hard_header_len = lowerdev->hard_header_len +
				       VXLAN_HEADROOM;
	}

	if (data[IFLA_VXLAN_TOS])
		vxlan->tos  = nla_get_u8(data[IFLA_VXLAN_TOS]);

	if (data[IFLA_VXLAN_TTL])
		vxlan->ttl = nla_get_u8(data[IFLA_VXLAN_TTL]);

	/* Address learning defaults to on unless explicitly disabled. */
	if (!data[IFLA_VXLAN_LEARNING] || nla_get_u8(data[IFLA_VXLAN_LEARNING]))
		vxlan->flags |= VXLAN_F_LEARN;

	if (data[IFLA_VXLAN_AGEING])
		vxlan->age_interval = nla_get_u32(data[IFLA_VXLAN_AGEING]);
	else
		vxlan->age_interval = FDB_AGE_DEFAULT;

	if (data[IFLA_VXLAN_PROXY] && nla_get_u8(data[IFLA_VXLAN_PROXY]))
		vxlan->flags |= VXLAN_F_PROXY;

	if (data[IFLA_VXLAN_RSC] && nla_get_u8(data[IFLA_VXLAN_RSC]))
		vxlan->flags |= VXLAN_F_RSC;

	if (data[IFLA_VXLAN_L2MISS] && nla_get_u8(data[IFLA_VXLAN_L2MISS]))
		vxlan->flags |= VXLAN_F_L2MISS;

	if (data[IFLA_VXLAN_L3MISS] && nla_get_u8(data[IFLA_VXLAN_L3MISS]))
		vxlan->flags |= VXLAN_F_L3MISS;

	if (data[IFLA_VXLAN_LIMIT])
		vxlan->addrmax = nla_get_u32(data[IFLA_VXLAN_LIMIT]);

	if (data[IFLA_VXLAN_PORT_RANGE]) {
		const struct ifla_vxlan_port_range *p
			= nla_data(data[IFLA_VXLAN_PORT_RANGE]);
		vxlan->port_min = ntohs(p->low);
		vxlan->port_max = ntohs(p->high);
	}

	if (data[IFLA_VXLAN_PORT])
		vxlan->dst_port = nla_get_be16(data[IFLA_VXLAN_PORT]);

	/* VNI must be unique per destination UDP port. */
	if (vxlan_find_vni(net, vni, vxlan->dst_port)) {
		pr_info("duplicate VNI %u\n", vni);
		return -EEXIST;
	}

	/* Share one UDP socket among all devices using the same port. */
	vs = vxlan_find_port(net, vxlan->dst_port);
	if (vs)
		++vs->refcnt;
	else {
		/* Drop lock because socket create acquires RTNL lock */
		rtnl_unlock();
		vs = vxlan_socket_create(net, vxlan->dst_port);
		rtnl_lock();
		if (IS_ERR(vs))
			return PTR_ERR(vs);

		hlist_add_head_rcu(&vs->hlist, vs_head(net, vxlan->dst_port));
	}
	vxlan->vn_sock = vs;

	SET_ETHTOOL_OPS(dev, &vxlan_ethtool_ops);

	err = register_netdevice(dev);
	if (err) {
		/* Undo the socket reference; releasing the socket also
		 * requires RTNL dropped.
		 */
		if (--vs->refcnt == 0) {
			rtnl_unlock();
			sk_release_kernel(vs->sock->sk);
			kfree(vs);
			rtnl_lock();
		}
		return err;
	}

	list_add(&vxlan->next, &vn->vxlan_list);
	hlist_add_head_rcu(&vxlan->hlist, vni_head(vs, vni));

	return 0;
}
1626
/* Unregister the device and drop its reference on the shared UDP
 * socket; the last user schedules deferred socket teardown.
 */
static void vxlan_dellink(struct net_device *dev, struct list_head *head)
{
	struct vxlan_dev *vxlan = netdev_priv(dev);
	struct vxlan_sock *vs = vxlan->vn_sock;

	/* Remove from the per-socket VNI hash and the per-netns list. */
	hlist_del_rcu(&vxlan->hlist);
	list_del(&vxlan->next);
	unregister_netdevice_queue(dev, head);

	/* Last user: unhash the socket and tear it down from a work item. */
	if (--vs->refcnt == 0) {
		hlist_del_rcu(&vs->hlist);
		schedule_work(&vs->del_work);
	}
}
1641
/* Upper bound on the netlink attribute space vxlan_fill_info() needs. */
static size_t vxlan_get_size(const struct net_device *dev)
{

	return nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_ID */
		nla_total_size(sizeof(__be32)) +/* IFLA_VXLAN_GROUP */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LINK */
		nla_total_size(sizeof(__be32))+	/* IFLA_VXLAN_LOCAL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TTL */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_TOS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_LEARNING */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_PROXY */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_RSC */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L2MISS */
		nla_total_size(sizeof(__u8)) +	/* IFLA_VXLAN_L3MISS */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_AGEING */
		nla_total_size(sizeof(__u32)) +	/* IFLA_VXLAN_LIMIT */
		nla_total_size(sizeof(struct ifla_vxlan_port_range)) + /* IFLA_VXLAN_PORT_RANGE */
		nla_total_size(sizeof(__be16))+	/* IFLA_VXLAN_PORT */
		0;
}
1662
/* Dump device configuration back to user space — the mirror image of
 * the attribute parsing in vxlan_newlink().  Returns 0 or -EMSGSIZE.
 */
static int vxlan_fill_info(struct sk_buff *skb, const struct net_device *dev)
{
	const struct vxlan_dev *vxlan = netdev_priv(dev);
	const struct vxlan_rdst *dst = &vxlan->default_dst;
	struct ifla_vxlan_port_range ports = {
		.low =  htons(vxlan->port_min),
		.high = htons(vxlan->port_max),
	};

	if (nla_put_u32(skb, IFLA_VXLAN_ID, dst->remote_vni))
		goto nla_put_failure;

	/* Optional attributes are emitted only when set (non-zero). */
	if (dst->remote_ip && nla_put_be32(skb, IFLA_VXLAN_GROUP, dst->remote_ip))
		goto nla_put_failure;

	if (dst->remote_ifindex && nla_put_u32(skb, IFLA_VXLAN_LINK, dst->remote_ifindex))
		goto nla_put_failure;

	if (vxlan->saddr && nla_put_be32(skb, IFLA_VXLAN_LOCAL, vxlan->saddr))
		goto nla_put_failure;

	if (nla_put_u8(skb, IFLA_VXLAN_TTL, vxlan->ttl) ||
	    nla_put_u8(skb, IFLA_VXLAN_TOS, vxlan->tos) ||
	    nla_put_u8(skb, IFLA_VXLAN_LEARNING,
		       !!(vxlan->flags & VXLAN_F_LEARN)) ||
	    nla_put_u8(skb, IFLA_VXLAN_PROXY,
		       !!(vxlan->flags & VXLAN_F_PROXY)) ||
	    nla_put_u8(skb, IFLA_VXLAN_RSC, !!(vxlan->flags & VXLAN_F_RSC)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L2MISS,
		       !!(vxlan->flags & VXLAN_F_L2MISS)) ||
	    nla_put_u8(skb, IFLA_VXLAN_L3MISS,
		       !!(vxlan->flags & VXLAN_F_L3MISS)) ||
	    nla_put_u32(skb, IFLA_VXLAN_AGEING, vxlan->age_interval) ||
	    nla_put_u32(skb, IFLA_VXLAN_LIMIT, vxlan->addrmax) ||
	    nla_put_be16(skb, IFLA_VXLAN_PORT, vxlan->dst_port))
		goto nla_put_failure;

	if (nla_put(skb, IFLA_VXLAN_PORT_RANGE, sizeof(ports), &ports))
		goto nla_put_failure;

	return 0;

nla_put_failure:
	return -EMSGSIZE;
}
1708
1709static struct rtnl_link_ops vxlan_link_ops __read_mostly = {
1710 .kind = "vxlan",
1711 .maxtype = IFLA_VXLAN_MAX,
1712 .policy = vxlan_policy,
1713 .priv_size = sizeof(struct vxlan_dev),
1714 .setup = vxlan_setup,
1715 .validate = vxlan_validate,
1716 .newlink = vxlan_newlink,
1717 .dellink = vxlan_dellink,
1718 .get_size = vxlan_get_size,
1719 .fill_info = vxlan_fill_info,
1720};
1721
1722static __net_init int vxlan_init_net(struct net *net)
1723{
1724 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
Cong Wang31fec5a2013-05-27 22:35:52 +00001725 unsigned int h;
stephen hemmingerd3428942012-10-01 12:32:35 +00001726
stephen hemminger553675f2013-05-16 11:35:20 +00001727 INIT_LIST_HEAD(&vn->vxlan_list);
stephen hemmingerd3428942012-10-01 12:32:35 +00001728
stephen hemminger553675f2013-05-16 11:35:20 +00001729 for (h = 0; h < PORT_HASH_SIZE; ++h)
1730 INIT_HLIST_HEAD(&vn->sock_list[h]);
stephen hemmingerd3428942012-10-01 12:32:35 +00001731
1732 return 0;
1733}
1734
1735static __net_exit void vxlan_exit_net(struct net *net)
1736{
1737 struct vxlan_net *vn = net_generic(net, vxlan_net_id);
Zang MingJie9cb6cb72013-03-06 04:37:37 +00001738 struct vxlan_dev *vxlan;
Zang MingJie9cb6cb72013-03-06 04:37:37 +00001739
1740 rtnl_lock();
stephen hemminger553675f2013-05-16 11:35:20 +00001741 list_for_each_entry(vxlan, &vn->vxlan_list, next)
1742 dev_close(vxlan->dev);
Zang MingJie9cb6cb72013-03-06 04:37:37 +00001743 rtnl_unlock();
stephen hemmingerd3428942012-10-01 12:32:35 +00001744}
1745
1746static struct pernet_operations vxlan_net_ops = {
1747 .init = vxlan_init_net,
1748 .exit = vxlan_exit_net,
1749 .id = &vxlan_net_id,
1750 .size = sizeof(struct vxlan_net),
1751};
1752
1753static int __init vxlan_init_module(void)
1754{
1755 int rc;
1756
1757 get_random_bytes(&vxlan_salt, sizeof(vxlan_salt));
1758
1759 rc = register_pernet_device(&vxlan_net_ops);
1760 if (rc)
1761 goto out1;
1762
1763 rc = rtnl_link_register(&vxlan_link_ops);
1764 if (rc)
1765 goto out2;
1766
1767 return 0;
1768
1769out2:
1770 unregister_pernet_device(&vxlan_net_ops);
1771out1:
1772 return rc;
1773}
Cong Wang7332a132013-05-27 22:35:53 +00001774late_initcall(vxlan_init_module);
stephen hemmingerd3428942012-10-01 12:32:35 +00001775
/* Module exit: tear down in the reverse order of vxlan_init_module().
 * The link type must go before the pernet state so no new devices can
 * be created while per-namespace data is being torn down.
 */
static void __exit vxlan_cleanup_module(void)
{
	rtnl_link_unregister(&vxlan_link_ops);
	unregister_pernet_device(&vxlan_net_ops);
	/* Wait for in-flight RCU callbacks (e.g. deferred fdb frees)
	 * before the module text/data can be unloaded. */
	rcu_barrier();
}
module_exit(vxlan_cleanup_module);
1783
/* Module metadata; the RTNL alias lets "ip link add type vxlan"
 * autoload this module on demand. */
MODULE_LICENSE("GPL");
MODULE_VERSION(VXLAN_VERSION);
MODULE_AUTHOR("Stephen Hemminger <stephen@networkplumber.org>");
MODULE_ALIAS_RTNL_LINK("vxlan");