iproute2: Add processless network namespace support

The goal of this code change is to implement a mechanism such that it is
simple to work with a kernel that is using multiple network namespaces
at once.

This comes in handy for interacting with vpns where there may be rfc1918
address overlaps, and different policies default routes, name servers
and the like.

Configuration specific to a network namespace that would ordinarily be
stored under /etc/ is stored under /etc/netns/<name>.  For example if
the dns server configuration is different for your vpn you would create
a file /etc/netns/myvpn/resolv.conf.

File descriptors that can be used to manipulate a network namespace can
be created by opening /var/run/netns/<NAME>.

This adds the following commands to iproute.
ip netns add NAME
ip netns delete NAME
ip netns monitor
ip netns list
ip netns exec NAME cmd ....
ip link set DEV netns NAME

ip netns exec exists to cater the vast majority of programs that only
know how to operate in a single network namespace.  ip netns exec
changes the default network namespace, creates a new mount namespace,
remounts /sys and bind mounts netns specific configuration files to
their standard locations.

Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
diff --git a/ip/ipnetns.c b/ip/ipnetns.c
new file mode 100644
index 0000000..db7007c
--- /dev/null
+++ b/ip/ipnetns.c
@@ -0,0 +1,314 @@
+#define _ATFILE_SOURCE
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/wait.h>
+#include <sys/inotify.h>
+#include <sys/mount.h>
+#include <sys/param.h>
+#include <sys/syscall.h>
+#include <stdio.h>
+#include <string.h>
+#include <sched.h>
+#include <fcntl.h>
+#include <dirent.h>
+#include <errno.h>
+#include <unistd.h>
+
+#include "utils.h"
+#include "ip_common.h"
+
+#define NETNS_RUN_DIR "/var/run/netns"
+#define NETNS_ETC_DIR "/etc/netns"
+
+#ifndef CLONE_NEWNET
+#define CLONE_NEWNET 0x40000000	/* New network namespace (lo, device, names sockets, etc) */
+#endif
+
+#ifndef MNT_DETACH
+#define MNT_DETACH	0x00000002	/* Just detach from the tree */
+#endif /* MNT_DETACH */
+
+static int setns(int fd, int nstype)
+{
+#ifdef __NR_setns
+	return syscall(__NR_setns, fd, nstype);
+#else
+	errno = ENOSYS;
+	return -1;
+#endif
+}
+
+
+static int touch(const char *path, mode_t mode)
+{
+	int fd;
+	fd = open(path, O_RDONLY|O_CREAT, mode);
+	if (fd < 0)
+		return -1;
+	close(fd);
+	return 0;
+}
+
+static void usage(void) __attribute__((noreturn));
+
+static void usage(void)
+{
+	fprintf(stderr, "Usage: ip netns list\n");
+	fprintf(stderr, "       ip netns add NAME\n");
+	fprintf(stderr, "       ip netns delete NAME\n");
+	fprintf(stderr, "       ip netns exec NAME cmd ...\n");
+	fprintf(stderr, "       ip netns monitor\n");
+	exit(-1);
+}
+
+int get_netns_fd(const char *name)
+{
+	char pathbuf[MAXPATHLEN];
+	const char *path, *ptr;
+
+	path = name;
+	ptr = strchr(name, '/');
+	if (!ptr) {
+		snprintf(pathbuf, sizeof(pathbuf), "%s/%s",
+			NETNS_RUN_DIR, name );
+		path = pathbuf;
+	}
+	return open(path, O_RDONLY);
+}
+
+static int netns_list(int argc, char **argv)
+{
+	struct dirent *entry;
+	DIR *dir;
+
+	dir = opendir(NETNS_RUN_DIR);
+	if (!dir)
+		return 0;
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (strcmp(entry->d_name, ".") == 0)
+			continue;
+		if (strcmp(entry->d_name, "..") == 0)
+			continue;
+		printf("%s\n", entry->d_name);
+	}
+	closedir(dir);
+	return 0;
+}
+
+static void bind_etc(const char *name)
+{
+	char etc_netns_path[MAXPATHLEN];
+	char netns_name[MAXPATHLEN];
+	char etc_name[MAXPATHLEN];
+	struct dirent *entry;
+	DIR *dir;
+
+	snprintf(etc_netns_path, sizeof(etc_netns_path), "%s/%s", NETNS_ETC_DIR, name);
+	dir = opendir(etc_netns_path);
+	if (!dir)
+		return;
+
+	while ((entry = readdir(dir)) != NULL) {
+		if (strcmp(entry->d_name, ".") == 0)
+			continue;
+		if (strcmp(entry->d_name, "..") == 0)
+			continue;
+		snprintf(netns_name, sizeof(netns_name), "%s/%s", etc_netns_path, entry->d_name);
+		snprintf(etc_name, sizeof(etc_name), "/etc/%s", entry->d_name);
+		if (mount(netns_name, etc_name, "none", MS_BIND, NULL) < 0) {
+			fprintf(stderr, "Bind %s -> %s failed: %s\n",
+				netns_name, etc_name, strerror(errno));
+		}
+	}
+	closedir(dir);
+}
+
+static int netns_exec(int argc, char **argv)
+{
+	/* Setup the proper environment for apps that are not netns
+	 * aware, and execute a program in that environment.
+	 */
+	const char *name, *cmd;
+	char net_path[MAXPATHLEN];
+	int netns;
+
+	if (argc < 1) {
+		fprintf(stderr, "No netns name specified\n");
+		return -1;
+	}
+	if (argc < 2) {
+		fprintf(stderr, "No cmd specified\n");
+		return -1;
+	}
+	name = argv[0];
+	cmd = argv[1];
+	snprintf(net_path, sizeof(net_path), "%s/%s", NETNS_RUN_DIR, name);
+	netns = open(net_path, O_RDONLY);
+	if (netns < 0) {
+		fprintf(stderr, "Cannot open network namespace: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	if (setns(netns, CLONE_NEWNET) < 0) {
+		fprintf(stderr, "seting the network namespace failed: %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	if (unshare(CLONE_NEWNS) < 0) {
+		fprintf(stderr, "unshare failed: %s\n", strerror(errno));
+		return -1;
+	}
+	/* Mount a version of /sys that describes the network namespace */
+	if (umount2("/sys", MNT_DETACH) < 0) {
+		fprintf(stderr, "umount of /sys failed: %s\n", strerror(errno));
+		return -1;
+	}
+	if (mount(name, "/sys", "sysfs", 0, NULL) < 0) {
+		fprintf(stderr, "mount of /sys failed: %s\n",strerror(errno));
+		return -1;
+	}
+
+	/* Setup bind mounts for config files in /etc */
+	bind_etc(name);
+
+	if (execvp(cmd, argv + 1)  < 0)
+		fprintf(stderr, "exec of %s failed: %s\n",
+			cmd, strerror(errno));
+	exit(-1);
+}
+
+static int netns_delete(int argc, char **argv)
+{
+	const char *name;
+	char netns_path[MAXPATHLEN];
+
+	if (argc < 1) {
+		fprintf(stderr, "No netns name specified\n");
+		return -1;
+	}
+
+	name = argv[0];
+	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
+	umount2(netns_path, MNT_DETACH);
+	if (unlink(netns_path) < 0) {
+		fprintf(stderr, "Cannot remove %s: %s\n",
+			netns_path, strerror(errno));
+		return -1;
+	}
+	return 0;
+}
+
+static int netns_add(int argc, char **argv)
+{
+	/* This function creates a new network namespace and
+	 * a new mount namespace and bind them into a well known
+	 * location in the filesystem based on the name provided.
+	 *
+	 * The mount namespace is created so that any necessary
+	 * userspace tweaks like remounting /sys, or bind mounting
+	 * a new /etc/resolv.conf can be shared between uers.
+	 */
+	char netns_path[MAXPATHLEN];
+	const char *name;
+
+	if (argc < 1) {
+		fprintf(stderr, "No netns name specified\n");
+		return -1;
+	}
+	name = argv[0];
+
+	snprintf(netns_path, sizeof(netns_path), "%s/%s", NETNS_RUN_DIR, name);
+
+	/* Create the base netns directory if it doesn't exist */
+	mkdir(NETNS_RUN_DIR, S_IRWXU|S_IRGRP|S_IXGRP|S_IROTH|S_IXOTH);
+
+	/* Create the filesystem state */
+	if (touch(netns_path, 0) < 0) {
+		fprintf(stderr, "Could not create %s: %s\n",
+			netns_path, strerror(errno));
+		goto out_delete;
+	}
+	if (unshare(CLONE_NEWNET) < 0) {
+		fprintf(stderr, "Failed to create a new network namespace: %s\n",
+			strerror(errno));
+		goto out_delete;
+	}
+
+	/* Bind the netns last so I can watch for it */
+	if (mount("/proc/self/ns/net", netns_path, "none", MS_BIND, NULL) < 0) {
+		fprintf(stderr, "Bind /proc/self/ns/net -> %s failed: %s\n",
+			netns_path, strerror(errno));
+		goto out_delete;
+	}
+	return 0;
+out_delete:
+	netns_delete(argc, argv);
+	exit(-1);
+	return -1;
+}
+
+
+static int netns_monitor(int argc, char **argv)
+{
+	char buf[4096];
+	struct inotify_event *event;
+	int fd;
+	fd = inotify_init();
+	if (fd < 0) {
+		fprintf(stderr, "inotify_init failed: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	if (inotify_add_watch(fd, NETNS_RUN_DIR, IN_CREATE | IN_DELETE) < 0) {
+		fprintf(stderr, "inotify_add_watch failed: %s\n",
+			strerror(errno));
+		return -1;
+	}
+	for(;;) {
+		ssize_t len = read(fd, buf, sizeof(buf));
+		if (len < 0) {
+			fprintf(stderr, "read failed: %s\n",
+				strerror(errno));
+			return -1;
+		}
+		for (event = (struct inotify_event *)buf;
+		     (char *)event < &buf[len];
+		     event = (struct inotify_event *)((char *)event + sizeof(*event) + event->len)) {
+			if (event->mask & IN_CREATE)
+				printf("add %s\n", event->name);
+			if (event->mask & IN_DELETE)
+				printf("delete %s\n", event->name);
+		}
+	}
+	return 0;
+}
+
+int do_netns(int argc, char **argv)
+{
+	if (argc < 1)
+		return netns_list(0, NULL);
+
+	if ((matches(*argv, "list") == 0) || (matches(*argv, "show") == 0) ||
+	    (matches(*argv, "lst") == 0))
+		return netns_list(argc-1, argv+1);
+
+	if (matches(*argv, "help") == 0)
+		usage();
+
+	if (matches(*argv, "add") == 0)
+		return netns_add(argc-1, argv+1);
+
+	if (matches(*argv, "delete") == 0)
+		return netns_delete(argc-1, argv+1);
+
+	if (matches(*argv, "exec") == 0)
+		return netns_exec(argc-1, argv+1);
+
+	if (matches(*argv, "monitor") == 0)
+		return netns_monitor(argc-1, argv+1);
+
+	fprintf(stderr, "Command \"%s\" is unknown, try \"ip netns help\".\n", *argv);
+	exit(-1);
+}