/*
 * GENEVE: Generic Network Virtualization Encapsulation
 *
 * Copyright (c) 2015 Red Hat, Inc.
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 */
10
11#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
12
13#include <linux/kernel.h>
14#include <linux/module.h>
15#include <linux/netdevice.h>
16#include <linux/etherdevice.h>
17#include <linux/hash.h>
18#include <net/rtnetlink.h>
19#include <net/geneve.h>
20
/* Version string reported through ethtool and MODULE_VERSION */
#define GENEVE_NETDEV_VER	"0.6"

/* UDP port used for both the listening socket and the destination port */
#define GENEVE_UDP_PORT		6081

/* The VNI is a 24-bit quantity */
#define GENEVE_N_VID		(1u << 24)
#define GENEVE_VID_MASK		(GENEVE_N_VID - 1)

/* Geometry of the per-namespace VNI hash table */
#define VNI_HASH_BITS		10
#define VNI_HASH_SIZE		(1 << VNI_HASH_BITS)
30
31static bool log_ecn_error = true;
32module_param(log_ecn_error, bool, 0644);
33MODULE_PARM_DESC(log_ecn_error, "Log packets received with corrupted ECN");
34
35/* per-network namespace private data for this module */
36struct geneve_net {
37 struct list_head geneve_list;
38 struct hlist_head vni_list[VNI_HASH_SIZE];
39};
40
41/* Pseudo network device */
42struct geneve_dev {
43 struct hlist_node hlist; /* vni hash table */
44 struct net *net; /* netns for packet i/o */
45 struct net_device *dev; /* netdev for geneve tunnel */
46 struct geneve_sock *sock; /* socket used for geneve tunnel */
47 u8 vni[3]; /* virtual network ID for tunnel */
48 struct sockaddr_in remote; /* IPv4 address for link partner */
49 struct list_head next; /* geneve's per namespace list */
50};
51
/* Key passed to net_generic() to find this module's struct geneve_net */
static int geneve_net_id;
53
54static inline __u32 geneve_net_vni_hash(u8 vni[3])
55{
56 __u32 vnid;
57
58 vnid = (vni[0] << 16) | (vni[1] << 8) | vni[2];
59 return hash_32(vnid, VNI_HASH_BITS);
60}
61
62/* geneve receive/decap routine */
63static void geneve_rx(struct geneve_sock *gs, struct sk_buff *skb)
64{
65 struct genevehdr *gnvh = geneve_hdr(skb);
66 struct geneve_dev *dummy, *geneve = NULL;
67 struct geneve_net *gn;
68 struct iphdr *iph = NULL;
69 struct pcpu_sw_netstats *stats;
70 struct hlist_head *vni_list_head;
71 int err = 0;
72 __u32 hash;
73
74 iph = ip_hdr(skb); /* Still outer IP header... */
75
76 gn = gs->rcv_data;
77
78 /* Find the device for this VNI */
79 hash = geneve_net_vni_hash(gnvh->vni);
80 vni_list_head = &gn->vni_list[hash];
81 hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
82 if (!memcmp(gnvh->vni, dummy->vni, sizeof(dummy->vni)) &&
83 iph->saddr == dummy->remote.sin_addr.s_addr) {
84 geneve = dummy;
85 break;
86 }
87 }
88 if (!geneve)
89 goto drop;
90
91 /* Drop packets w/ critical options,
92 * since we don't support any...
93 */
94 if (gnvh->critical)
95 goto drop;
96
97 skb_reset_mac_header(skb);
98 skb_scrub_packet(skb, !net_eq(geneve->net, dev_net(geneve->dev)));
99 skb->protocol = eth_type_trans(skb, geneve->dev);
100 skb_postpull_rcsum(skb, eth_hdr(skb), ETH_HLEN);
101
102 /* Ignore packet loops (and multicast echo) */
103 if (ether_addr_equal(eth_hdr(skb)->h_source, geneve->dev->dev_addr))
104 goto drop;
105
106 skb_reset_network_header(skb);
107
108 iph = ip_hdr(skb); /* Now inner IP header... */
109 err = IP_ECN_decapsulate(iph, skb);
110
111 if (unlikely(err)) {
112 if (log_ecn_error)
113 net_info_ratelimited("non-ECT from %pI4 with TOS=%#x\n",
114 &iph->saddr, iph->tos);
115 if (err > 1) {
116 ++geneve->dev->stats.rx_frame_errors;
117 ++geneve->dev->stats.rx_errors;
118 goto drop;
119 }
120 }
121
122 stats = this_cpu_ptr(geneve->dev->tstats);
123 u64_stats_update_begin(&stats->syncp);
124 stats->rx_packets++;
125 stats->rx_bytes += skb->len;
126 u64_stats_update_end(&stats->syncp);
127
128 netif_rx(skb);
129
130 return;
131drop:
132 /* Consume bad packet */
133 kfree_skb(skb);
134}
135
136/* Setup stats when device is created */
137static int geneve_init(struct net_device *dev)
138{
139 dev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats);
140 if (!dev->tstats)
141 return -ENOMEM;
142
143 return 0;
144}
145
146static void geneve_uninit(struct net_device *dev)
147{
148 free_percpu(dev->tstats);
149}
150
151static int geneve_open(struct net_device *dev)
152{
153 struct geneve_dev *geneve = netdev_priv(dev);
154 struct net *net = geneve->net;
155 struct geneve_net *gn = net_generic(geneve->net, geneve_net_id);
156 struct geneve_sock *gs;
157
158 gs = geneve_sock_add(net, htons(GENEVE_UDP_PORT), geneve_rx, gn,
159 false, false);
160 if (IS_ERR(gs))
161 return PTR_ERR(gs);
162
163 geneve->sock = gs;
164
165 return 0;
166}
167
168static int geneve_stop(struct net_device *dev)
169{
170 struct geneve_dev *geneve = netdev_priv(dev);
171 struct geneve_sock *gs = geneve->sock;
172
173 geneve_sock_release(gs);
174
175 return 0;
176}
177
178static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct net_device *dev)
179{
180 struct geneve_dev *geneve = netdev_priv(dev);
181 struct geneve_sock *gs = geneve->sock;
182 struct rtable *rt = NULL;
183 const struct iphdr *iip; /* interior IP header */
184 struct flowi4 fl4;
185 int err;
186 __be16 sport;
187 __u8 tos, ttl = 0;
188
189 iip = ip_hdr(skb);
190
191 skb_reset_mac_header(skb);
192
193 /* TODO: port min/max limits should be configurable */
194 sport = udp_flow_src_port(dev_net(dev), skb, 0, 0, true);
195
196 memset(&fl4, 0, sizeof(fl4));
197 fl4.daddr = geneve->remote.sin_addr.s_addr;
198 rt = ip_route_output_key(geneve->net, &fl4);
199 if (IS_ERR(rt)) {
200 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
201 dev->stats.tx_carrier_errors++;
202 goto tx_error;
203 }
204 if (rt->dst.dev == dev) { /* is this necessary? */
205 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
206 dev->stats.collisions++;
207 goto rt_tx_error;
208 }
209
210 /* TODO: tos and ttl should be configurable */
211
212 tos = ip_tunnel_ecn_encap(0, iip, skb);
213
214 if (IN_MULTICAST(ntohl(fl4.daddr)))
215 ttl = 1;
216
217 ttl = ttl ? : ip4_dst_hoplimit(&rt->dst);
218
219 /* no need to handle local destination and encap bypass...yet... */
220
221 err = geneve_xmit_skb(gs, rt, skb, fl4.saddr, fl4.daddr,
222 tos, ttl, 0, sport, htons(GENEVE_UDP_PORT), 0,
223 geneve->vni, 0, NULL, false,
224 !net_eq(geneve->net, dev_net(geneve->dev)));
225 if (err < 0)
226 ip_rt_put(rt);
227
228 iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
229
230 return NETDEV_TX_OK;
231
232rt_tx_error:
233 ip_rt_put(rt);
234tx_error:
235 dev->stats.tx_errors++;
236 dev_kfree_skb(skb);
237 return NETDEV_TX_OK;
238}
239
240static const struct net_device_ops geneve_netdev_ops = {
241 .ndo_init = geneve_init,
242 .ndo_uninit = geneve_uninit,
243 .ndo_open = geneve_open,
244 .ndo_stop = geneve_stop,
245 .ndo_start_xmit = geneve_xmit,
246 .ndo_get_stats64 = ip_tunnel_get_stats64,
247 .ndo_change_mtu = eth_change_mtu,
248 .ndo_validate_addr = eth_validate_addr,
249 .ndo_set_mac_address = eth_mac_addr,
250};
251
252static void geneve_get_drvinfo(struct net_device *dev,
253 struct ethtool_drvinfo *drvinfo)
254{
255 strlcpy(drvinfo->version, GENEVE_NETDEV_VER, sizeof(drvinfo->version));
256 strlcpy(drvinfo->driver, "geneve", sizeof(drvinfo->driver));
257}
258
259static const struct ethtool_ops geneve_ethtool_ops = {
260 .get_drvinfo = geneve_get_drvinfo,
261 .get_link = ethtool_op_get_link,
262};
263
264/* Info for udev, that this is a virtual tunnel endpoint */
265static struct device_type geneve_type = {
266 .name = "geneve",
267};
268
269/* Initialize the device structure. */
270static void geneve_setup(struct net_device *dev)
271{
272 ether_setup(dev);
273
274 dev->netdev_ops = &geneve_netdev_ops;
275 dev->ethtool_ops = &geneve_ethtool_ops;
276 dev->destructor = free_netdev;
277
278 SET_NETDEV_DEVTYPE(dev, &geneve_type);
279
280 dev->tx_queue_len = 0;
281 dev->features |= NETIF_F_LLTX;
282 dev->features |= NETIF_F_SG | NETIF_F_HW_CSUM;
283 dev->features |= NETIF_F_RXCSUM;
284 dev->features |= NETIF_F_GSO_SOFTWARE;
285
286 dev->vlan_features = dev->features;
287 dev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
288
289 dev->hw_features |= NETIF_F_SG | NETIF_F_HW_CSUM | NETIF_F_RXCSUM;
290 dev->hw_features |= NETIF_F_GSO_SOFTWARE;
291 dev->hw_features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
292
293 netif_keep_dst(dev);
294 dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
295}
296
297static const struct nla_policy geneve_policy[IFLA_GENEVE_MAX + 1] = {
298 [IFLA_GENEVE_ID] = { .type = NLA_U32 },
299 [IFLA_GENEVE_REMOTE] = { .len = FIELD_SIZEOF(struct iphdr, daddr) },
300};
301
302static int geneve_validate(struct nlattr *tb[], struct nlattr *data[])
303{
304 if (tb[IFLA_ADDRESS]) {
305 if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN)
306 return -EINVAL;
307
308 if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS])))
309 return -EADDRNOTAVAIL;
310 }
311
312 if (!data)
313 return -EINVAL;
314
315 if (data[IFLA_GENEVE_ID]) {
316 __u32 vni = nla_get_u32(data[IFLA_GENEVE_ID]);
317
318 if (vni >= GENEVE_VID_MASK)
319 return -ERANGE;
320 }
321
322 return 0;
323}
324
325static int geneve_newlink(struct net *net, struct net_device *dev,
326 struct nlattr *tb[], struct nlattr *data[])
327{
328 struct geneve_net *gn = net_generic(net, geneve_net_id);
329 struct geneve_dev *dummy, *geneve = netdev_priv(dev);
330 struct hlist_head *vni_list_head;
331 struct sockaddr_in remote; /* IPv4 address for link partner */
332 __u32 vni, hash;
333 int err;
334
335 if (!data[IFLA_GENEVE_ID] || !data[IFLA_GENEVE_REMOTE])
336 return -EINVAL;
337
338 geneve->net = net;
339 geneve->dev = dev;
340
341 vni = nla_get_u32(data[IFLA_GENEVE_ID]);
342 geneve->vni[0] = (vni & 0x00ff0000) >> 16;
343 geneve->vni[1] = (vni & 0x0000ff00) >> 8;
344 geneve->vni[2] = vni & 0x000000ff;
345
346 geneve->remote.sin_addr.s_addr =
347 nla_get_in_addr(data[IFLA_GENEVE_REMOTE]);
348 if (IN_MULTICAST(ntohl(geneve->remote.sin_addr.s_addr)))
349 return -EINVAL;
350
351 remote = geneve->remote;
352 hash = geneve_net_vni_hash(geneve->vni);
353 vni_list_head = &gn->vni_list[hash];
354 hlist_for_each_entry_rcu(dummy, vni_list_head, hlist) {
355 if (!memcmp(geneve->vni, dummy->vni, sizeof(dummy->vni)) &&
356 !memcmp(&remote, &dummy->remote, sizeof(dummy->remote)))
357 return -EBUSY;
358 }
359
360 if (tb[IFLA_ADDRESS] == NULL)
361 eth_hw_addr_random(dev);
362
363 err = register_netdevice(dev);
364 if (err)
365 return err;
366
367 list_add(&geneve->next, &gn->geneve_list);
368
369 hlist_add_head_rcu(&geneve->hlist, &gn->vni_list[hash]);
370
371 return 0;
372}
373
374static void geneve_dellink(struct net_device *dev, struct list_head *head)
375{
376 struct geneve_dev *geneve = netdev_priv(dev);
377
378 if (!hlist_unhashed(&geneve->hlist))
379 hlist_del_rcu(&geneve->hlist);
380
381 list_del(&geneve->next);
382 unregister_netdevice_queue(dev, head);
383}
384
385static size_t geneve_get_size(const struct net_device *dev)
386{
387 return nla_total_size(sizeof(__u32)) + /* IFLA_GENEVE_ID */
388 nla_total_size(sizeof(struct in_addr)) + /* IFLA_GENEVE_REMOTE */
389 0;
390}
391
392static int geneve_fill_info(struct sk_buff *skb, const struct net_device *dev)
393{
394 struct geneve_dev *geneve = netdev_priv(dev);
395 __u32 vni;
396
397 vni = (geneve->vni[0] << 16) | (geneve->vni[1] << 8) | geneve->vni[2];
398 if (nla_put_u32(skb, IFLA_GENEVE_ID, vni))
399 goto nla_put_failure;
400
401 if (nla_put_in_addr(skb, IFLA_GENEVE_REMOTE,
402 geneve->remote.sin_addr.s_addr))
403 goto nla_put_failure;
404
405 return 0;
406
407nla_put_failure:
408 return -EMSGSIZE;
409}
410
411static struct rtnl_link_ops geneve_link_ops __read_mostly = {
412 .kind = "geneve",
413 .maxtype = IFLA_GENEVE_MAX,
414 .policy = geneve_policy,
415 .priv_size = sizeof(struct geneve_dev),
416 .setup = geneve_setup,
417 .validate = geneve_validate,
418 .newlink = geneve_newlink,
419 .dellink = geneve_dellink,
420 .get_size = geneve_get_size,
421 .fill_info = geneve_fill_info,
422};
423
424static __net_init int geneve_init_net(struct net *net)
425{
426 struct geneve_net *gn = net_generic(net, geneve_net_id);
427 unsigned int h;
428
429 INIT_LIST_HEAD(&gn->geneve_list);
430
431 for (h = 0; h < VNI_HASH_SIZE; ++h)
432 INIT_HLIST_HEAD(&gn->vni_list[h]);
433
434 return 0;
435}
436
437static void __net_exit geneve_exit_net(struct net *net)
438{
439 struct geneve_net *gn = net_generic(net, geneve_net_id);
440 struct geneve_dev *geneve, *next;
441 struct net_device *dev, *aux;
442 LIST_HEAD(list);
443
444 rtnl_lock();
445
446 /* gather any geneve devices that were moved into this ns */
447 for_each_netdev_safe(net, dev, aux)
448 if (dev->rtnl_link_ops == &geneve_link_ops)
449 unregister_netdevice_queue(dev, &list);
450
451 /* now gather any other geneve devices that were created in this ns */
452 list_for_each_entry_safe(geneve, next, &gn->geneve_list, next) {
453 /* If geneve->dev is in the same netns, it was already added
454 * to the list by the previous loop.
455 */
456 if (!net_eq(dev_net(geneve->dev), net))
457 unregister_netdevice_queue(geneve->dev, &list);
458 }
459
460 /* unregister the devices gathered above */
461 unregister_netdevice_many(&list);
462 rtnl_unlock();
463}
464
465static struct pernet_operations geneve_net_ops = {
466 .init = geneve_init_net,
467 .exit = geneve_exit_net,
468 .id = &geneve_net_id,
469 .size = sizeof(struct geneve_net),
470};
471
472static int __init geneve_init_module(void)
473{
474 int rc;
475
476 rc = register_pernet_subsys(&geneve_net_ops);
477 if (rc)
478 goto out1;
479
480 rc = rtnl_link_register(&geneve_link_ops);
481 if (rc)
482 goto out2;
483
484 return 0;
485out2:
486 unregister_pernet_subsys(&geneve_net_ops);
487out1:
488 return rc;
489}
490late_initcall(geneve_init_module);
491
492static void __exit geneve_cleanup_module(void)
493{
494 rtnl_link_unregister(&geneve_link_ops);
495 unregister_pernet_subsys(&geneve_net_ops);
496}
497module_exit(geneve_cleanup_module);
498
499MODULE_LICENSE("GPL");
500MODULE_VERSION(GENEVE_NETDEV_VER);
501MODULE_AUTHOR("John W. Linville <linville@tuxdriver.com>");
502MODULE_DESCRIPTION("Interface driver for GENEVE encapsulated traffic");
503MODULE_ALIAS_RTNL_LINK("geneve");