Andy Zhou | 0b5e8b8 | 2014-10-03 15:35:28 -0700 | [diff] [blame] | 1 | /* |
| 2 | * Geneve: Generic Network Virtualization Encapsulation |
| 3 | * |
| 4 | * Copyright (c) 2014 Nicira, Inc. |
| 5 | * |
| 6 | * This program is free software; you can redistribute it and/or |
| 7 | * modify it under the terms of the GNU General Public License |
| 8 | * as published by the Free Software Foundation; either version |
| 9 | * 2 of the License, or (at your option) any later version. |
| 10 | */ |
| 11 | |
| 12 | #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt |
| 13 | |
| 14 | #include <linux/kernel.h> |
| 15 | #include <linux/types.h> |
| 16 | #include <linux/module.h> |
| 17 | #include <linux/errno.h> |
| 18 | #include <linux/slab.h> |
| 19 | #include <linux/skbuff.h> |
| 20 | #include <linux/rculist.h> |
| 21 | #include <linux/netdevice.h> |
| 22 | #include <linux/in.h> |
| 23 | #include <linux/ip.h> |
| 24 | #include <linux/udp.h> |
| 25 | #include <linux/igmp.h> |
| 26 | #include <linux/etherdevice.h> |
| 27 | #include <linux/if_ether.h> |
| 28 | #include <linux/if_vlan.h> |
| 29 | #include <linux/hash.h> |
| 30 | #include <linux/ethtool.h> |
| 31 | #include <net/arp.h> |
| 32 | #include <net/ndisc.h> |
| 33 | #include <net/ip.h> |
| 34 | #include <net/ip_tunnels.h> |
| 35 | #include <net/icmp.h> |
| 36 | #include <net/udp.h> |
| 37 | #include <net/rtnetlink.h> |
| 38 | #include <net/route.h> |
| 39 | #include <net/dsfield.h> |
| 40 | #include <net/inet_ecn.h> |
| 41 | #include <net/net_namespace.h> |
| 42 | #include <net/netns/generic.h> |
| 43 | #include <net/geneve.h> |
| 44 | #include <net/protocol.h> |
| 45 | #include <net/udp_tunnel.h> |
| 46 | #if IS_ENABLED(CONFIG_IPV6) |
| 47 | #include <net/ipv6.h> |
| 48 | #include <net/addrconf.h> |
| 49 | #include <net/ip6_tunnel.h> |
| 50 | #include <net/ip6_checksum.h> |
| 51 | #endif |
| 52 | |
/* The per-namespace socket table has 2^PORT_HASH_BITS buckets. */
#define PORT_HASH_BITS	8
#define PORT_HASH_SIZE	(1 << PORT_HASH_BITS)
| 55 | |
| 56 | /* per-network namespace private data for this module */ |
| 57 | struct geneve_net { |
| 58 | struct hlist_head sock_list[PORT_HASH_SIZE]; |
| 59 | spinlock_t sock_lock; /* Protects sock_list */ |
| 60 | }; |
| 61 | |
/* Id handed to net_generic() to fetch this module's struct geneve_net. */
static int geneve_net_id;

/* Workqueue that runs the deferred socket teardown (gs->del_work). */
static struct workqueue_struct *geneve_wq;
| 65 | |
| 66 | static inline struct genevehdr *geneve_hdr(const struct sk_buff *skb) |
| 67 | { |
| 68 | return (struct genevehdr *)(udp_hdr(skb) + 1); |
| 69 | } |
| 70 | |
| 71 | static struct hlist_head *gs_head(struct net *net, __be16 port) |
| 72 | { |
| 73 | struct geneve_net *gn = net_generic(net, geneve_net_id); |
| 74 | |
| 75 | return &gn->sock_list[hash_32(ntohs(port), PORT_HASH_BITS)]; |
| 76 | } |
| 77 | |
| 78 | /* Find geneve socket based on network namespace and UDP port */ |
| 79 | static struct geneve_sock *geneve_find_sock(struct net *net, __be16 port) |
| 80 | { |
| 81 | struct geneve_sock *gs; |
| 82 | |
| 83 | hlist_for_each_entry_rcu(gs, gs_head(net, port), hlist) { |
| 84 | if (inet_sk(gs->sock->sk)->inet_sport == port) |
| 85 | return gs; |
| 86 | } |
| 87 | |
| 88 | return NULL; |
| 89 | } |
| 90 | |
| 91 | static void geneve_build_header(struct genevehdr *geneveh, |
| 92 | __be16 tun_flags, u8 vni[3], |
| 93 | u8 options_len, u8 *options) |
| 94 | { |
| 95 | geneveh->ver = GENEVE_VER; |
| 96 | geneveh->opt_len = options_len / 4; |
| 97 | geneveh->oam = !!(tun_flags & TUNNEL_OAM); |
| 98 | geneveh->critical = !!(tun_flags & TUNNEL_CRIT_OPT); |
| 99 | geneveh->rsvd1 = 0; |
| 100 | memcpy(geneveh->vni, vni, 3); |
| 101 | geneveh->proto_type = htons(ETH_P_TEB); |
| 102 | geneveh->rsvd2 = 0; |
| 103 | |
| 104 | memcpy(geneveh->options, options, options_len); |
| 105 | } |
| 106 | |
| 107 | /* Transmit a fully formated Geneve frame. |
| 108 | * |
| 109 | * When calling this function. The skb->data should point |
| 110 | * to the geneve header which is fully formed. |
| 111 | * |
| 112 | * This function will add other UDP tunnel headers. |
| 113 | */ |
| 114 | int geneve_xmit_skb(struct geneve_sock *gs, struct rtable *rt, |
| 115 | struct sk_buff *skb, __be32 src, __be32 dst, __u8 tos, |
| 116 | __u8 ttl, __be16 df, __be16 src_port, __be16 dst_port, |
| 117 | __be16 tun_flags, u8 vni[3], u8 opt_len, u8 *opt, |
| 118 | bool xnet) |
| 119 | { |
| 120 | struct genevehdr *gnvh; |
| 121 | int min_headroom; |
| 122 | int err; |
| 123 | |
| 124 | skb = udp_tunnel_handle_offloads(skb, !gs->sock->sk->sk_no_check_tx); |
| 125 | |
| 126 | min_headroom = LL_RESERVED_SPACE(rt->dst.dev) + rt->dst.header_len |
| 127 | + GENEVE_BASE_HLEN + opt_len + sizeof(struct iphdr) |
| 128 | + (vlan_tx_tag_present(skb) ? VLAN_HLEN : 0); |
| 129 | |
| 130 | err = skb_cow_head(skb, min_headroom); |
| 131 | if (unlikely(err)) |
| 132 | return err; |
| 133 | |
| 134 | if (vlan_tx_tag_present(skb)) { |
| 135 | if (unlikely(!__vlan_put_tag(skb, |
| 136 | skb->vlan_proto, |
| 137 | vlan_tx_tag_get(skb)))) { |
| 138 | err = -ENOMEM; |
| 139 | return err; |
| 140 | } |
| 141 | skb->vlan_tci = 0; |
| 142 | } |
| 143 | |
| 144 | gnvh = (struct genevehdr *)__skb_push(skb, sizeof(*gnvh) + opt_len); |
| 145 | geneve_build_header(gnvh, tun_flags, vni, opt_len, opt); |
| 146 | |
| 147 | return udp_tunnel_xmit_skb(gs->sock, rt, skb, src, dst, |
| 148 | tos, ttl, df, src_port, dst_port, xnet); |
| 149 | } |
| 150 | EXPORT_SYMBOL_GPL(geneve_xmit_skb); |
| 151 | |
| 152 | static void geneve_notify_add_rx_port(struct geneve_sock *gs) |
| 153 | { |
| 154 | struct sock *sk = gs->sock->sk; |
| 155 | sa_family_t sa_family = sk->sk_family; |
| 156 | int err; |
| 157 | |
| 158 | if (sa_family == AF_INET) { |
| 159 | err = udp_add_offload(&gs->udp_offloads); |
| 160 | if (err) |
| 161 | pr_warn("geneve: udp_add_offload failed with status %d\n", |
| 162 | err); |
| 163 | } |
| 164 | } |
| 165 | |
| 166 | /* Callback from net/ipv4/udp.c to receive packets */ |
| 167 | static int geneve_udp_encap_recv(struct sock *sk, struct sk_buff *skb) |
| 168 | { |
| 169 | struct genevehdr *geneveh; |
| 170 | struct geneve_sock *gs; |
| 171 | int opts_len; |
| 172 | |
| 173 | /* Need Geneve and inner Ethernet header to be present */ |
| 174 | if (unlikely(!pskb_may_pull(skb, GENEVE_BASE_HLEN))) |
| 175 | goto error; |
| 176 | |
| 177 | /* Return packets with reserved bits set */ |
| 178 | geneveh = geneve_hdr(skb); |
| 179 | |
| 180 | if (unlikely(geneveh->ver != GENEVE_VER)) |
| 181 | goto error; |
| 182 | |
| 183 | if (unlikely(geneveh->proto_type != htons(ETH_P_TEB))) |
| 184 | goto error; |
| 185 | |
| 186 | opts_len = geneveh->opt_len * 4; |
| 187 | if (iptunnel_pull_header(skb, GENEVE_BASE_HLEN + opts_len, |
| 188 | htons(ETH_P_TEB))) |
| 189 | goto drop; |
| 190 | |
| 191 | gs = rcu_dereference_sk_user_data(sk); |
| 192 | if (!gs) |
| 193 | goto drop; |
| 194 | |
| 195 | gs->rcv(gs, skb); |
| 196 | return 0; |
| 197 | |
| 198 | drop: |
| 199 | /* Consume bad packet */ |
| 200 | kfree_skb(skb); |
| 201 | return 0; |
| 202 | |
| 203 | error: |
| 204 | /* Let the UDP layer deal with the skb */ |
| 205 | return 1; |
| 206 | } |
| 207 | |
| 208 | static void geneve_del_work(struct work_struct *work) |
| 209 | { |
| 210 | struct geneve_sock *gs = container_of(work, struct geneve_sock, |
| 211 | del_work); |
| 212 | |
| 213 | udp_tunnel_sock_release(gs->sock); |
| 214 | kfree_rcu(gs, rcu); |
| 215 | } |
| 216 | |
| 217 | static struct socket *geneve_create_sock(struct net *net, bool ipv6, |
| 218 | __be16 port) |
| 219 | { |
| 220 | struct socket *sock; |
| 221 | struct udp_port_cfg udp_conf; |
| 222 | int err; |
| 223 | |
| 224 | memset(&udp_conf, 0, sizeof(udp_conf)); |
| 225 | |
| 226 | if (ipv6) { |
| 227 | udp_conf.family = AF_INET6; |
| 228 | } else { |
| 229 | udp_conf.family = AF_INET; |
Andy Zhou | 42350dc | 2014-10-06 13:22:50 -0700 | [diff] [blame] | 230 | udp_conf.local_ip.s_addr = htonl(INADDR_ANY); |
Andy Zhou | 0b5e8b8 | 2014-10-03 15:35:28 -0700 | [diff] [blame] | 231 | } |
| 232 | |
| 233 | udp_conf.local_udp_port = port; |
| 234 | |
| 235 | /* Open UDP socket */ |
| 236 | err = udp_sock_create(net, &udp_conf, &sock); |
| 237 | if (err < 0) |
| 238 | return ERR_PTR(err); |
| 239 | |
| 240 | return sock; |
| 241 | } |
| 242 | |
| 243 | /* Create new listen socket if needed */ |
| 244 | static struct geneve_sock *geneve_socket_create(struct net *net, __be16 port, |
| 245 | geneve_rcv_t *rcv, void *data, |
| 246 | bool ipv6) |
| 247 | { |
| 248 | struct geneve_net *gn = net_generic(net, geneve_net_id); |
| 249 | struct geneve_sock *gs; |
| 250 | struct socket *sock; |
| 251 | struct udp_tunnel_sock_cfg tunnel_cfg; |
| 252 | |
| 253 | gs = kzalloc(sizeof(*gs), GFP_KERNEL); |
| 254 | if (!gs) |
| 255 | return ERR_PTR(-ENOMEM); |
| 256 | |
| 257 | INIT_WORK(&gs->del_work, geneve_del_work); |
| 258 | |
| 259 | sock = geneve_create_sock(net, ipv6, port); |
| 260 | if (IS_ERR(sock)) { |
| 261 | kfree(gs); |
| 262 | return ERR_CAST(sock); |
| 263 | } |
| 264 | |
| 265 | gs->sock = sock; |
| 266 | atomic_set(&gs->refcnt, 1); |
| 267 | gs->rcv = rcv; |
| 268 | gs->rcv_data = data; |
| 269 | |
| 270 | /* Initialize the geneve udp offloads structure */ |
| 271 | gs->udp_offloads.port = port; |
| 272 | gs->udp_offloads.callbacks.gro_receive = NULL; |
| 273 | gs->udp_offloads.callbacks.gro_complete = NULL; |
| 274 | |
| 275 | spin_lock(&gn->sock_lock); |
| 276 | hlist_add_head_rcu(&gs->hlist, gs_head(net, port)); |
| 277 | geneve_notify_add_rx_port(gs); |
| 278 | spin_unlock(&gn->sock_lock); |
| 279 | |
| 280 | /* Mark socket as an encapsulation socket */ |
| 281 | tunnel_cfg.sk_user_data = gs; |
| 282 | tunnel_cfg.encap_type = 1; |
| 283 | tunnel_cfg.encap_rcv = geneve_udp_encap_recv; |
| 284 | tunnel_cfg.encap_destroy = NULL; |
| 285 | setup_udp_tunnel_sock(net, sock, &tunnel_cfg); |
| 286 | |
| 287 | return gs; |
| 288 | } |
| 289 | |
| 290 | struct geneve_sock *geneve_sock_add(struct net *net, __be16 port, |
| 291 | geneve_rcv_t *rcv, void *data, |
| 292 | bool no_share, bool ipv6) |
| 293 | { |
| 294 | struct geneve_sock *gs; |
| 295 | |
| 296 | gs = geneve_socket_create(net, port, rcv, data, ipv6); |
| 297 | if (!IS_ERR(gs)) |
| 298 | return gs; |
| 299 | |
| 300 | if (no_share) /* Return error if sharing is not allowed. */ |
| 301 | return ERR_PTR(-EINVAL); |
| 302 | |
| 303 | gs = geneve_find_sock(net, port); |
| 304 | if (gs) { |
| 305 | if (gs->rcv == rcv) |
| 306 | atomic_inc(&gs->refcnt); |
| 307 | else |
| 308 | gs = ERR_PTR(-EBUSY); |
| 309 | } else { |
| 310 | gs = ERR_PTR(-EINVAL); |
| 311 | } |
| 312 | |
| 313 | return gs; |
| 314 | } |
| 315 | EXPORT_SYMBOL_GPL(geneve_sock_add); |
| 316 | |
| 317 | void geneve_sock_release(struct geneve_sock *gs) |
| 318 | { |
| 319 | if (!atomic_dec_and_test(&gs->refcnt)) |
| 320 | return; |
| 321 | |
| 322 | queue_work(geneve_wq, &gs->del_work); |
| 323 | } |
| 324 | EXPORT_SYMBOL_GPL(geneve_sock_release); |
| 325 | |
| 326 | static __net_init int geneve_init_net(struct net *net) |
| 327 | { |
| 328 | struct geneve_net *gn = net_generic(net, geneve_net_id); |
| 329 | unsigned int h; |
| 330 | |
| 331 | spin_lock_init(&gn->sock_lock); |
| 332 | |
| 333 | for (h = 0; h < PORT_HASH_SIZE; ++h) |
| 334 | INIT_HLIST_HEAD(&gn->sock_list[h]); |
| 335 | |
| 336 | return 0; |
| 337 | } |
| 338 | |
| 339 | static struct pernet_operations geneve_net_ops = { |
| 340 | .init = geneve_init_net, |
| 341 | .exit = NULL, |
| 342 | .id = &geneve_net_id, |
| 343 | .size = sizeof(struct geneve_net), |
| 344 | }; |
| 345 | |
| 346 | static int __init geneve_init_module(void) |
| 347 | { |
| 348 | int rc; |
| 349 | |
| 350 | geneve_wq = alloc_workqueue("geneve", 0, 0); |
| 351 | if (!geneve_wq) |
| 352 | return -ENOMEM; |
| 353 | |
| 354 | rc = register_pernet_subsys(&geneve_net_ops); |
| 355 | if (rc) |
| 356 | return rc; |
| 357 | |
| 358 | pr_info("Geneve driver\n"); |
| 359 | |
| 360 | return 0; |
| 361 | } |
| 362 | late_initcall(geneve_init_module); |
| 363 | |
| 364 | static void __exit geneve_cleanup_module(void) |
| 365 | { |
| 366 | destroy_workqueue(geneve_wq); |
| 367 | } |
| 368 | module_exit(geneve_cleanup_module); |
| 369 | |
| 370 | MODULE_LICENSE("GPL"); |
| 371 | MODULE_AUTHOR("Jesse Gross <jesse@nicira.com>"); |
| 372 | MODULE_DESCRIPTION("Driver for GENEVE encapsulated traffic"); |
| 373 | MODULE_ALIAS_RTNL_LINK("geneve"); |