net: do not create fallback tunnels for non-default namespaces
fallback tunnels (like tunl0, gre0, gretap0, erspan0, sit0,
ip6tnl0, ip6gre0) are automatically created when the corresponding
module is loaded.
These tunnels are also automatically created when a new network
namespace is created, at a great cost.
In many cases, netns are used for isolation purposes, and these
extra network devices are a waste of resources. We are using
thousands of netns per host, and hit the netns creation/delete
bottleneck a lot. (Many thanks to Kirill for recent work on this)
Add a new sysctl so that we can opt-out from this automatic creation.
Note that these tunnels are still created for the initial namespace,
to be the least intrusive for typical setups.
Tested:
lpk43:~# cat add_del_unshare.sh
for i in `seq 1 40`
do
(for j in `seq 1 100` ; do unshare -n /bin/true >/dev/null ; done) &
done
wait
lpk43:~# echo 0 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh
real 0m37.521s
user 0m0.886s
sys 7m7.084s
lpk43:~# echo 1 >/proc/sys/net/core/fb_tunnels_only_for_init_net
lpk43:~# time ./add_del_unshare.sh
real 0m4.761s
user 0m0.851s
sys 1m8.343s
lpk43:~#
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index d714f65..4f47f92 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -32,6 +32,9 @@
static int net_msg_warn; /* Unused, but still a sysctl */
+int sysctl_fb_tunnels_only_for_init_net __read_mostly = 0;
+EXPORT_SYMBOL(sysctl_fb_tunnels_only_for_init_net);
+
#ifdef CONFIG_RPS
static int rps_sock_flow_sysctl(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
@@ -513,6 +516,15 @@
.proc_handler = proc_dointvec_minmax,
.extra1 = &zero,
},
+ {
+ .procname = "fb_tunnels_only_for_init_net",
+ .data = &sysctl_fb_tunnels_only_for_init_net,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
{ }
};
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index 602597d..5fcb17c 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -347,8 +347,7 @@
struct net_device *dev;
int t_hlen;
- BUG_ON(!itn->fb_tunnel_dev);
- dev = __ip_tunnel_create(net, itn->fb_tunnel_dev->rtnl_link_ops, parms);
+ dev = __ip_tunnel_create(net, itn->rtnl_link_ops, parms);
if (IS_ERR(dev))
return ERR_CAST(dev);
@@ -822,7 +821,6 @@
struct net *net = t->net;
struct ip_tunnel_net *itn = net_generic(net, t->ip_tnl_net_id);
- BUG_ON(!itn->fb_tunnel_dev);
switch (cmd) {
case SIOCGETTUNNEL:
if (dev == itn->fb_tunnel_dev) {
@@ -847,7 +845,7 @@
p->o_key = 0;
}
- t = ip_tunnel_find(itn, p, itn->fb_tunnel_dev->type);
+ t = ip_tunnel_find(itn, p, itn->type);
if (cmd == SIOCADDTUNNEL) {
if (!t) {
@@ -991,10 +989,15 @@
struct ip_tunnel_parm parms;
unsigned int i;
+ itn->rtnl_link_ops = ops;
for (i = 0; i < IP_TNL_HASH_SIZE; i++)
INIT_HLIST_HEAD(&itn->tunnels[i]);
- if (!ops) {
+ if (!ops || !net_has_fallback_tunnels(net)) {
+ struct ip_tunnel_net *it_init_net;
+
+ it_init_net = net_generic(&init_net, ip_tnl_net_id);
+ itn->type = it_init_net->type;
itn->fb_tunnel_dev = NULL;
return 0;
}
@@ -1012,6 +1015,7 @@
itn->fb_tunnel_dev->features |= NETIF_F_NETNS_LOCAL;
itn->fb_tunnel_dev->mtu = ip_tunnel_bind_dev(itn->fb_tunnel_dev);
ip_tunnel_add(itn, netdev_priv(itn->fb_tunnel_dev));
+ itn->type = itn->fb_tunnel_dev->type;
}
rtnl_unlock();
@@ -1019,10 +1023,10 @@
}
EXPORT_SYMBOL_GPL(ip_tunnel_init_net);
-static void ip_tunnel_destroy(struct ip_tunnel_net *itn, struct list_head *head,
+static void ip_tunnel_destroy(struct net *net, struct ip_tunnel_net *itn,
+ struct list_head *head,
struct rtnl_link_ops *ops)
{
- struct net *net = dev_net(itn->fb_tunnel_dev);
struct net_device *dev, *aux;
int h;
@@ -1054,7 +1058,7 @@
rtnl_lock();
list_for_each_entry(net, net_list, exit_list) {
itn = net_generic(net, id);
- ip_tunnel_destroy(itn, &list, ops);
+ ip_tunnel_destroy(net, itn, &list, ops);
}
unregister_netdevice_many(&list);
rtnl_unlock();
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 18a3dfb..7d8775c 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -236,7 +236,7 @@
return t;
dev = ign->fb_tunnel_dev;
- if (dev->flags & IFF_UP)
+ if (dev && dev->flags & IFF_UP)
return netdev_priv(dev);
return NULL;
@@ -1472,6 +1472,8 @@
struct ip6gre_net *ign = net_generic(net, ip6gre_net_id);
int err;
+ if (!net_has_fallback_tunnels(net))
+ return 0;
ign->fb_tunnel_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6gre0",
NET_NAME_UNKNOWN,
ip6gre_tunnel_setup);
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index 56c4967..5c045fa 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -2205,6 +2205,8 @@
ip6n->tnls[0] = ip6n->tnls_wc;
ip6n->tnls[1] = ip6n->tnls_r_l;
+ if (!net_has_fallback_tunnels(net))
+ return 0;
err = -ENOMEM;
ip6n->fb_tnl_dev = alloc_netdev(sizeof(struct ip6_tnl), "ip6tnl0",
NET_NAME_UNKNOWN, ip6_tnl_dev_setup);
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index a9c4ac6..8a4f8fd 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -182,7 +182,7 @@
#ifdef CONFIG_IPV6_SIT_6RD
struct ip_tunnel *t = netdev_priv(dev);
- if (dev == sitn->fb_tunnel_dev) {
+ if (dev == sitn->fb_tunnel_dev || !sitn->fb_tunnel_dev) {
ipv6_addr_set(&t->ip6rd.prefix, htonl(0x20020000), 0, 0, 0);
t->ip6rd.relay_prefix = 0;
t->ip6rd.prefixlen = 16;
@@ -1835,6 +1835,9 @@
sitn->tunnels[2] = sitn->tunnels_r;
sitn->tunnels[3] = sitn->tunnels_r_l;
+ if (!net_has_fallback_tunnels(net))
+ return 0;
+
sitn->fb_tunnel_dev = alloc_netdev(sizeof(struct ip_tunnel), "sit0",
NET_NAME_UNKNOWN,
ipip6_tunnel_setup);