Merge branch 'master' into net-next

commit: 93531fac419cf8450a6159feeb108071da980acb [log] [tgz]
author: Stephen Hemminger <shemming@brocade.com> Mon Apr 13 09:39:46 2015 -0700
committer: Stephen Hemminger <shemming@brocade.com> Mon Apr 13 09:39:46 2015 -0700
tree: 62b27d80451c6243a26781a846563bfa5ab5b5e0
parent: 6256f8c9e45f01187b297a576e148534a393c990 [diff]
parent: 672acc723859290b94b753b8437d8cc2f0810174 [diff]
diff --git a/Makefile b/Makefile
index 9dbb29f..ca6c2e1 100644
--- a/Makefile
+++ b/Makefile

@@ -26,6 +26,9 @@
 #options for ipx
 ADDLIB+=ipx_ntop.o ipx_pton.o
 
+#options for mpls
+ADDLIB+=mpls_ntop.o mpls_pton.o
+
 CC = gcc
 HOSTCC = gcc
 DEFINES += -D_GNU_SOURCE

diff --git a/README.iproute2+tc b/README.iproute2+tc
index 6aa5d18..2a5638d 100644
--- a/README.iproute2+tc
+++ b/README.iproute2+tc

@@ -72,12 +72,16 @@
 etc. The same thing can be made with rules.
 I still did not test ipchains, but they should work too.
 
+
+Setup and code example of BPF classifier and action can be found under
+examples/bpf/, which should explain everything for getting started.
+
+
 Setup of rsvp and u32 classifiers is more hairy.
 If you read RSVP specs, you will understand how rsvp classifier
 works easily. What's about u32... That's example:
 
 
-
 #! /bin/sh
 
 TC=/home/root/tc

diff --git a/configure b/configure
index 9470c1d..f1325df 100755
--- a/configure
+++ b/configure

@@ -249,6 +249,29 @@
     rm -f $TMPDIR/ipsettest.c $TMPDIR/ipsettest
 }
 
+check_elf()
+{
+    cat >$TMPDIR/elftest.c <<EOF
+#include <libelf.h>
+#include <gelf.h>
+int main(void)
+{
+	Elf_Scn *scn;
+	GElf_Shdr shdr;
+	return elf_version(EV_CURRENT);
+}
+EOF
+
+    if $CC -I$INCLUDE -o $TMPDIR/elftest $TMPDIR/elftest.c -lelf >/dev/null 2>&1
+    then
+	echo "TC_CONFIG_ELF:=y" >>Config
+	echo "yes"
+    else
+	echo "no"
+    fi
+    rm -f $TMPDIR/elftest.c $TMPDIR/elftest
+}
+
 check_selinux()
 # SELinux is a compile time option in the ss utility
 {
@@ -287,5 +310,8 @@
 echo -n "SELinux support: "
 check_selinux
 
+echo -n "ELF support: "
+check_elf
+
 echo -e "\nDocs"
 check_docs

diff --git a/doc/ip-cref.tex b/doc/ip-cref.tex
index e7a79a5..ea14795 100644
--- a/doc/ip-cref.tex
+++ b/doc/ip-cref.tex

@@ -1432,6 +1432,17 @@
 even if it does not match any interface prefix. One application of this
 option may be found in~\cite{IP-TUNNELS}.
 
+\item \verb|pref PREF|
+
+--- the IPv6 route preference.
+\verb|PREF| PREF is a string specifying the route preference as defined in
+RFC4191 for Router Discovery messages. Namely:
+\begin{itemize}
+\item \verb|low| --- the route has a lowest priority.
+\item \verb|medium| --- the route has a default priority.
+\item \verb|high| --- the route has a highest priority.
+\end{itemize}
+
 \end{itemize}
 
 

diff --git a/examples/bpf/bpf_agent.c b/examples/bpf/bpf_agent.c
new file mode 100644
index 0000000..0f481b1
--- /dev/null
+++ b/examples/bpf/bpf_agent.c

@@ -0,0 +1,223 @@
+/*
+ * eBPF user space agent part
+ *
+ * Simple, _self-contained_ user space agent for the eBPF kernel
+ * ebpf_prog.c program, which gets all map fds passed from tc via unix
+ * domain socket in one transaction and can thus keep referencing
+ * them from user space in order to read out (or possibly modify)
+ * map data. Here, just as a minimal example to display counters.
+ *
+ * The agent only uses the bpf(2) syscall API to read or possibly
+ * write to eBPF maps, it doesn't need to be aware of the low-level
+ * bytecode parts and/or ELF parsing bits.
+ *
+ * ! For more details, see header comment in bpf_prog.c !
+ *
+ * gcc bpf_agent.c -o bpf_agent -Wall -O2
+ *
+ * For example, a more complex user space agent could run on each
+ * host, reading and writing into eBPF maps used by tc classifier
+ * and actions. It would thus allow for implementing a distributed
+ * tc architecture, for example, which would push down central
+ * policies into eBPF maps, and thus altering run-time behaviour.
+ *
+ *   -- Happy eBPF hacking! ;)
+ */
+
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <errno.h>
+#include <unistd.h>
+#include <stdint.h>
+#include <assert.h>
+#include <sys/un.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/socket.h>
+
+/* Just some misc macros as min(), offsetof(), etc. */
+#include "../../include/utils.h"
+/* Common code from fd passing. */
+#include "../../include/bpf_scm.h"
+/* Common, shared definitions with ebpf_prog.c */
+#include "bpf_shared.h"
+/* Mini syscall wrapper */
+#include "bpf_sys.h"
+
+static void bpf_dump_drops(int fd)
+{
+	int cpu, max;
+
+	max = sysconf(_SC_NPROCESSORS_ONLN);
+
+	printf(" `- number of drops:");
+	for (cpu = 0; cpu < max; cpu++) {
+		long drops;
+
+		assert(bpf_lookup_elem(fd, &cpu, &drops) == 0);
+		printf("\tcpu%d: %5ld", cpu, drops);
+	}
+	printf("\n");
+}
+
+static void bpf_dump_queue(int fd)
+{
+	/* Just for the same of the example. */
+	int max_queue = 4, i;
+
+	printf("  | nic queues:");
+	for (i = 0; i < max_queue; i++) {
+		struct count_queue cq;
+		int ret;
+
+		memset(&cq, 0, sizeof(cq));
+		ret = bpf_lookup_elem(fd, &i, &cq);
+		assert(ret == 0 || (ret < 0 && errno == ENOENT));
+
+		printf("\tq%d:[pkts: %ld, mis: %ld]",
+		       i, cq.total, cq.mismatch);
+	}
+	printf("\n");
+}
+
+static void bpf_dump_proto(int fd)
+{
+	uint8_t protos[] = { IPPROTO_TCP, IPPROTO_UDP, IPPROTO_ICMP };
+	char *names[] = { "tcp", "udp", "icmp" };
+	int i;
+
+	printf("  ` protos:");
+	for (i = 0; i < ARRAY_SIZE(protos); i++) {
+		struct count_tuple ct;
+		int ret;
+
+		memset(&ct, 0, sizeof(ct));
+		ret = bpf_lookup_elem(fd, &protos[i], &ct);
+		assert(ret == 0 || (ret < 0 && errno == ENOENT));
+
+		printf("\t%s:[pkts: %ld, bytes: %ld]",
+		       names[i], ct.packets, ct.bytes);
+	}
+	printf("\n");
+}
+
+static void bpf_info_loop(int *fds, struct bpf_map_aux *aux)
+{
+	int i, tfd[BPF_MAP_ID_MAX];
+
+	printf("ver: %d\nobj: %s\ndev: %lu\nino: %lu\nmaps: %u\n",
+	       aux->uds_ver, aux->obj_name, aux->obj_st.st_dev,
+	       aux->obj_st.st_ino, aux->num_ent);
+
+	for (i = 0; i < aux->num_ent; i++) {
+		printf("map%d:\n", i);
+		printf(" `- fd: %u\n", fds[i]);
+		printf("  | serial: %u\n", aux->ent[i].id);
+		printf("  | type: %u\n", aux->ent[i].type);
+		printf("  | max elem: %u\n", aux->ent[i].max_elem);
+		printf("  | size key: %u\n", aux->ent[i].size_key);
+		printf("  ` size val: %u\n", aux->ent[i].size_value);
+
+		tfd[aux->ent[i].id] = fds[i];
+	}
+
+	for (i = 0; i < 30; i++) {
+		int period = 5;
+
+		printf("data, period: %dsec\n", period);
+
+		bpf_dump_drops(tfd[BPF_MAP_ID_DROPS]);
+		bpf_dump_queue(tfd[BPF_MAP_ID_QUEUE]);
+		bpf_dump_proto(tfd[BPF_MAP_ID_PROTO]);
+
+		sleep(period);
+	}
+}
+
+static int bpf_map_set_recv(int fd, int *fds,  struct bpf_map_aux *aux,
+			    unsigned int entries)
+{
+	struct bpf_map_set_msg msg;
+	int *cmsg_buf, min_fd, i;
+	char *amsg_buf, *mmsg_buf;
+
+	cmsg_buf = bpf_map_set_init(&msg, NULL, 0);
+	amsg_buf = (char *)msg.aux.ent;
+	mmsg_buf = (char *)&msg.aux;
+
+	for (i = 0; i < entries; i += min_fd) {
+		struct cmsghdr *cmsg;
+		int ret;
+
+		min_fd = min(BPF_SCM_MAX_FDS * 1U, entries - i);
+
+		bpf_map_set_init_single(&msg, min_fd);
+
+		ret = recvmsg(fd, &msg.hdr, 0);
+		if (ret <= 0)
+			return ret ? : -1;
+
+		cmsg = CMSG_FIRSTHDR(&msg.hdr);
+		if (!cmsg || cmsg->cmsg_type != SCM_RIGHTS)
+			return -EINVAL;
+		if (msg.hdr.msg_flags & MSG_CTRUNC)
+			return -EIO;
+
+		min_fd = (cmsg->cmsg_len - sizeof(*cmsg)) / sizeof(fd);
+		if (min_fd > entries || min_fd <= 0)
+			return -1;
+
+		memcpy(&fds[i], cmsg_buf, sizeof(fds[0]) * min_fd);
+		memcpy(&aux->ent[i], amsg_buf, sizeof(aux->ent[0]) * min_fd);
+		memcpy(aux, mmsg_buf, offsetof(struct bpf_map_aux, ent));
+
+		if (i + min_fd == aux->num_ent)
+			break;
+	}
+
+	return 0;
+}
+
+int main(int argc, char **argv)
+{
+	int fds[BPF_SCM_MAX_FDS];
+	struct bpf_map_aux aux;
+	struct sockaddr_un addr;
+	int fd, ret, i;
+
+	if (argc < 2) {
+		fprintf(stderr, "Usage: %s <path-uds>\n", argv[0]);
+		exit(1);
+	}
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		fprintf(stderr, "Cannot open socket: %s\n",
+			strerror(errno));
+		exit(1);
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, argv[argc - 1], sizeof(addr.sun_path));
+
+	ret = bind(fd, (struct sockaddr *)&addr, sizeof(addr));
+	if (ret < 0) {
+		fprintf(stderr, "Cannot bind to socket: %s\n",
+			strerror(errno));
+		exit(1);
+	}
+
+	memset(fds, 0, sizeof(fds));
+	memset(&aux, 0, sizeof(aux));
+
+	ret = bpf_map_set_recv(fd, fds, &aux, BPF_SCM_MAX_FDS);
+	if (ret >= 0)
+		bpf_info_loop(fds, &aux);
+
+	for (i = 0; i < aux.num_ent; i++)
+		close(fds[i]);
+	close(fd);
+	return 0;
+}

diff --git a/examples/bpf/bpf_funcs.h b/examples/bpf/bpf_funcs.h
new file mode 100644
index 0000000..1545fa9
--- /dev/null
+++ b/examples/bpf/bpf_funcs.h

@@ -0,0 +1,58 @@
+#ifndef __BPF_FUNCS__
+#define __BPF_FUNCS__
+
+/* Misc macros. */
+#ifndef __maybe_unused
+# define __maybe_unused		__attribute__ ((__unused__))
+#endif
+
+#ifndef __section
+# define __section(NAME)	__attribute__((section(NAME), used))
+#endif
+
+#ifndef offsetof
+# define offsetof		__builtin_offsetof
+#endif
+
+#ifndef htons
+# define htons(x)		__constant_htons((x))
+#endif
+
+#ifndef likely
+# define likely(x)		__builtin_expect(!!(x), 1)
+#endif
+
+#ifndef unlikely
+# define unlikely(x)		__builtin_expect(!!(x), 0)
+#endif
+
+/* The verifier will translate them to actual function calls. */
+static void *(*bpf_map_lookup_elem)(void *map, void *key) __maybe_unused =
+	(void *) BPF_FUNC_map_lookup_elem;
+
+static int (*bpf_map_update_elem)(void *map, void *key, void *value,
+				  unsigned long long flags) __maybe_unused =
+	(void *) BPF_FUNC_map_update_elem;
+
+static int (*bpf_map_delete_elem)(void *map, void *key) __maybe_unused =
+	(void *) BPF_FUNC_map_delete_elem;
+
+static unsigned int (*get_smp_processor_id)(void) __maybe_unused =
+	(void *) BPF_FUNC_get_smp_processor_id;
+
+static unsigned int (*get_prandom_u32)(void) __maybe_unused =
+	(void *) BPF_FUNC_get_prandom_u32;
+
+/* LLVM built-in functions that an eBPF C program may use to emit
+ * BPF_LD_ABS and BPF_LD_IND instructions.
+ */
+unsigned long long load_byte(void *skb, unsigned long long off)
+	asm ("llvm.bpf.load.byte");
+
+unsigned long long load_half(void *skb, unsigned long long off)
+	asm ("llvm.bpf.load.half");
+
+unsigned long long load_word(void *skb, unsigned long long off)
+	asm ("llvm.bpf.load.word");
+
+#endif /* __BPF_FUNCS__ */

diff --git a/examples/bpf/bpf_prog.c b/examples/bpf/bpf_prog.c
new file mode 100644
index 0000000..ca9b54f
--- /dev/null
+++ b/examples/bpf/bpf_prog.c

@@ -0,0 +1,463 @@
+/*
+ * eBPF kernel space program part
+ *
+ * Toy eBPF program for demonstration purposes, some parts derived from
+ * kernel tree's samples/bpf/sockex2_kern.c example.
+ *
+ * More background on eBPF, kernel tree: Documentation/networking/filter.txt
+ *
+ * Note, this file is rather large, and most classifier and actions are
+ * likely smaller to accomplish one specific use-case and are tailored
+ * for high performance. For performance reasons, you might also have the
+ * classifier and action already merged inside the classifier.
+ *
+ * In order to show various features it serves as a bigger programming
+ * example, which you should feel free to rip apart and experiment with.
+ *
+ * Compilation, configuration example:
+ *
+ *  Note: as long as the BPF backend in LLVM is still experimental,
+ *  you need to build LLVM with LLVM with --enable-experimental-targets=BPF
+ *  Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
+ *  and you have libelf.h and gelf.h headers and can link tc against -lelf.
+ *
+ *  In case you need to sync kernel headers, go to your kernel source tree:
+ *  # make headers_install INSTALL_HDR_PATH=/usr/
+ *
+ *  $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
+ *  $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
+ *  $ objdump -h bpf.o
+ *  [...]
+ *  3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
+ *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
+ *  4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
+ *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
+ *  5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
+ *                  CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
+ *  6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
+ *                  CONTENTS, ALLOC, LOAD, DATA
+ *  7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
+ *                  CONTENTS, ALLOC, LOAD, DATA
+ *  [...]
+ *  # echo 1 > /proc/sys/net/core/bpf_jit_enable
+ *  $ gcc bpf_agent.c -o bpf_agent -Wall -O2
+ *  # ./bpf_agent /tmp/bpf-uds      (e.g. on a different terminal)
+ *  # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
+ *                             action bpf obj bpf.o sec action-mark            \
+ *                             action bpf obj bpf.o sec action-rand ok
+ *  # tc filter show dev em1
+ *  filter parent 1: protocol all pref 49152 bpf
+ *  filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
+ *    action order 1: bpf bpf.o:[action-mark] default-action pipe
+ *    index 52 ref 1 bind 1
+ *
+ *    action order 2: bpf bpf.o:[action-rand] default-action pipe
+ *    index 53 ref 1 bind 1
+ *
+ *    action order 3: gact action pass
+ *    random type none pass val 0
+ *    index 38 ref 1 bind 1
+ *
+ * BPF agent example output:
+ *
+ * ver: 1
+ * obj: bpf.o
+ * dev: 64770
+ * ino: 6045133
+ * maps: 3
+ * map0:
+ *  `- fd: 4
+ *   | serial: 1
+ *   | type: 1
+ *   | max elem: 256
+ *   | size key: 1
+ *   ` size val: 16
+ * map1:
+ *  `- fd: 5
+ *   | serial: 2
+ *   | type: 1
+ *   | max elem: 1024
+ *   | size key: 4
+ *   ` size val: 16
+ * map2:
+ *  `- fd: 6
+ *   | serial: 3
+ *   | type: 2
+ *   | max elem: 64
+ *   | size key: 4
+ *   ` size val: 8
+ * data, period: 5sec
+ *  `- number of drops:	cpu0:     0	cpu1:     0	cpu2:     0	cpu3:     0
+ *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 0, mis: 0]	q3:[pkts: 0, mis: 0]
+ *   ` protos:	tcp:[pkts: 0, bytes: 0]	udp:[pkts: 0, bytes: 0]	icmp:[pkts: 0, bytes: 0]
+ * data, period: 5sec
+ *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     0	cpu3:     1
+ *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 24, mis: 14]	q3:[pkts: 0, mis: 0]
+ *   ` protos:	tcp:[pkts: 13, bytes: 1989]	udp:[pkts: 10, bytes: 710]	icmp:[pkts: 0, bytes: 0]
+ * data, period: 5sec
+ *  `- number of drops:	cpu0:     5	cpu1:     0	cpu2:     3	cpu3:     3
+ *   | nic queues:	q0:[pkts: 0, mis: 0]	q1:[pkts: 0, mis: 0]	q2:[pkts: 39, mis: 21]	q3:[pkts: 0, mis: 0]
+ *   ` protos:	tcp:[pkts: 20, bytes: 3549]	udp:[pkts: 18, bytes: 1278]	icmp:[pkts: 0, bytes: 0]
+ * [...]
+ *
+ * This now means, the below classifier and action pipeline has been loaded
+ * as eBPF bytecode into the kernel, the kernel has verified that the
+ * execution of the bytecode is "safe", and it has JITed the programs
+ * afterwards, so that upon invocation they're running on native speed. tc
+ * has transferred all map file descriptors to the bpf_agent via IPC and
+ * even after tc exits, the agent can read out or modify all map data.
+ *
+ * Note that the export to the uds is done only once in the classifier and
+ * not in the action. It's enough to export the (here) shared descriptors
+ * once.
+ *
+ * If you need to disassemble the generated JIT image (echo with 2), the
+ * kernel tree has under tools/net/ a small helper, you can invoke e.g.
+ * `bpf_jit_disasm -o`.
+ *
+ * Please find in the code below further comments.
+ *
+ *   -- Happy eBPF hacking! ;)
+ */
+#include <stdint.h>
+#include <stdbool.h>
+#include <sys/types.h>
+#include <sys/socket.h>
+#include <asm/types.h>
+#include <linux/in.h>
+#include <linux/if.h>
+#include <linux/if_ether.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_tunnel.h>
+#include <linux/bpf.h>
+
+/* Common, shared definitions with ebpf_agent.c. */
+#include "bpf_shared.h"
+/* Selection of BPF helper functions for our example. */
+#include "bpf_funcs.h"
+
+/* Could be defined here as well, or included from the header. */
+#define TC_ACT_UNSPEC		(-1)
+#define TC_ACT_OK		0
+#define TC_ACT_RECLASSIFY	1
+#define TC_ACT_SHOT		2
+#define TC_ACT_PIPE		3
+#define TC_ACT_STOLEN		4
+#define TC_ACT_QUEUED		5
+#define TC_ACT_REPEAT		6
+
+/* Other, misc stuff. */
+#define IP_MF			0x2000
+#define IP_OFFSET		0x1FFF
+
+/* eBPF map definitions, all placed in section "maps". */
+struct bpf_elf_map __section("maps") map_proto = {
+	.type		=	BPF_MAP_TYPE_HASH,
+	.id		=	BPF_MAP_ID_PROTO,
+	.size_key	=	sizeof(uint8_t),
+	.size_value	=	sizeof(struct count_tuple),
+	.max_elem	=	256,
+};
+
+struct bpf_elf_map __section("maps") map_queue = {
+	.type		=	BPF_MAP_TYPE_HASH,
+	.id		=	BPF_MAP_ID_QUEUE,
+	.size_key	=	sizeof(uint32_t),
+	.size_value	=	sizeof(struct count_queue),
+	.max_elem	=	1024,
+};
+
+struct bpf_elf_map __section("maps") map_drops = {
+	.type		=	BPF_MAP_TYPE_ARRAY,
+	.id		=	BPF_MAP_ID_DROPS,
+	.size_key	=	sizeof(uint32_t),
+	.size_value	=	sizeof(long),
+	.max_elem	=	64,
+};
+
+/* Helper functions and definitions for the flow dissector used by the
+ * example classifier. This resembles the kernel's flow dissector to
+ * some extend and is just used as an example to show what's possible
+ * with eBPF.
+ */
+struct sockaddr;
+
+struct vlan_hdr {
+	__be16 h_vlan_TCI;
+	__be16 h_vlan_encapsulated_proto;
+};
+
+struct flow_keys {
+	__u32 src;
+	__u32 dst;
+	union {
+		__u32 ports;
+		__u16 port16[2];
+	};
+	__u16 th_off;
+	__u8 ip_proto;
+};
+
+static inline int flow_ports_offset(__u8 ip_proto)
+{
+	switch (ip_proto) {
+	case IPPROTO_TCP:
+	case IPPROTO_UDP:
+	case IPPROTO_DCCP:
+	case IPPROTO_ESP:
+	case IPPROTO_SCTP:
+	case IPPROTO_UDPLITE:
+	default:
+		return 0;
+	case IPPROTO_AH:
+		return 4;
+	}
+}
+
+static inline bool flow_is_frag(struct __sk_buff *skb, __u32 nh_off)
+{
+	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
+		  (IP_MF | IP_OFFSET));
+}
+
+static inline __u32 flow_parse_ipv4(struct __sk_buff *skb, __u32 nh_off,
+				    __u8 *ip_proto, struct flow_keys *flow)
+{
+	__u8 ip_ver_len;
+
+	if (unlikely(flow_is_frag(skb, nh_off)))
+		*ip_proto = 0;
+	else
+		*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
+							     protocol));
+	if (*ip_proto != IPPROTO_GRE) {
+		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
+		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
+	}
+
+	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
+	if (likely(ip_ver_len == 0x45))
+		nh_off += 20;
+	else
+		nh_off += (ip_ver_len & 0xF) << 2;
+
+	return nh_off;
+}
+
+static inline __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, __u32 off)
+{
+	__u32 w0 = load_word(skb, off);
+	__u32 w1 = load_word(skb, off + sizeof(w0));
+	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
+	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);
+
+	return (__u32)(w0 ^ w1 ^ w2 ^ w3);
+}
+
+static inline __u32 flow_parse_ipv6(struct __sk_buff *skb, __u32 nh_off,
+				    __u8 *ip_proto, struct flow_keys *flow)
+{
+	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));
+
+	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
+	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));
+
+	return nh_off + sizeof(struct ipv6hdr);
+}
+
+static inline bool flow_dissector(struct __sk_buff *skb,
+				  struct flow_keys *flow)
+{
+	__be16 proto = skb->protocol;
+	__u32 nh_off = ETH_HLEN;
+	__u8 ip_proto;
+	int poff;
+
+	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
+	if (proto == htons(ETH_P_8021AD)) {
+		proto = load_half(skb, nh_off +
+				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
+		nh_off += sizeof(struct vlan_hdr);
+	}
+	if (proto == htons(ETH_P_8021Q)) {
+		proto = load_half(skb, nh_off +
+				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
+		nh_off += sizeof(struct vlan_hdr);
+	}
+
+	if (likely(proto == htons(ETH_P_IP)))
+		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
+	else if (proto == htons(ETH_P_IPV6))
+		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
+	else
+		return false;
+
+	switch (ip_proto) {
+	case IPPROTO_GRE: {
+		struct gre_hdr {
+			__be16 flags;
+			__be16 proto;
+		};
+
+		__u16 gre_flags = load_half(skb, nh_off +
+					    offsetof(struct gre_hdr, flags));
+		__u16 gre_proto = load_half(skb, nh_off +
+					    offsetof(struct gre_hdr, proto));
+
+		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
+			break;
+
+		nh_off += 4;
+		if (gre_flags & GRE_CSUM)
+			nh_off += 4;
+		if (gre_flags & GRE_KEY)
+			nh_off += 4;
+		if (gre_flags & GRE_SEQ)
+			nh_off += 4;
+
+		if (gre_proto == ETH_P_8021Q) {
+			gre_proto = load_half(skb, nh_off +
+					      offsetof(struct vlan_hdr,
+						       h_vlan_encapsulated_proto));
+			nh_off += sizeof(struct vlan_hdr);
+		}
+		if (gre_proto == ETH_P_IP)
+			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
+		else if (gre_proto == ETH_P_IPV6)
+			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
+		else
+			return false;
+		break;
+	}
+	case IPPROTO_IPIP:
+		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
+		break;
+	case IPPROTO_IPV6:
+		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
+	default:
+		break;
+	}
+
+	nh_off += flow_ports_offset(ip_proto);
+
+	flow->ports = load_word(skb, nh_off);
+	flow->th_off = (__u16)nh_off;
+	flow->ip_proto = ip_proto;
+
+	return true;
+}
+
+static inline void cls_update_proto_map(const struct __sk_buff *skb,
+					const struct flow_keys *flow)
+{
+	uint8_t proto = flow->ip_proto;
+	struct count_tuple *ct, _ct;
+
+	ct = bpf_map_lookup_elem(&map_proto, &proto);
+	if (likely(ct)) {
+		__sync_fetch_and_add(&ct->packets, 1);
+		__sync_fetch_and_add(&ct->bytes, skb->len);
+		return;
+	}
+
+	/* No hit yet, we need to create a new entry. */
+	_ct.packets = 1;
+	_ct.bytes = skb->len;
+
+	bpf_map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
+}
+
+static inline void cls_update_queue_map(const struct __sk_buff *skb)
+{
+	uint32_t queue = skb->queue_mapping;
+	struct count_queue *cq, _cq;
+	bool mismatch;
+
+	mismatch = skb->queue_mapping != get_smp_processor_id();
+
+	cq = bpf_map_lookup_elem(&map_queue, &queue);
+	if (likely(cq)) {
+		__sync_fetch_and_add(&cq->total, 1);
+		if (mismatch)
+			__sync_fetch_and_add(&cq->mismatch, 1);
+		return;
+	}
+
+	/* No hit yet, we need to create a new entry. */
+	_cq.total = 1;
+	_cq.mismatch = mismatch ? 1 : 0;
+
+	bpf_map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
+}
+
+/* eBPF program definitions, placed in various sections, which can
+ * have custom section names. If custom names are in use, it's
+ * required to point tc to the correct section, e.g.
+ *
+ *     tc filter add [...] bpf obj cls.o sec cls-tos [...]
+ *
+ * in case the program resides in __section("cls-tos").
+ *
+ * Default section for cls_bpf is: "classifier", for act_bpf is:
+ * "action". Naturally, if for example multiple actions are present
+ * in the same file, they need to have distinct section names.
+ *
+ * It is however not required to have multiple programs sharing
+ * a file.
+ */
+__section("classifier") int cls_main(struct __sk_buff *skb)
+{
+	struct flow_keys flow;
+
+	if (!flow_dissector(skb, &flow))
+		return 0; /* No match in cls_bpf. */
+
+	cls_update_proto_map(skb, &flow);
+	cls_update_queue_map(skb);
+
+	return flow.ip_proto;
+}
+
+static inline void act_update_drop_map(void)
+{
+	uint32_t *count, cpu = get_smp_processor_id();
+
+	count = bpf_map_lookup_elem(&map_drops, &cpu);
+	if (count)
+		/* Only this cpu is accessing this element. */
+		(*count)++;
+}
+
+__section("action-mark") int act_mark_main(struct __sk_buff *skb)
+{
+	/* You could also mangle skb data here with the helper function
+	 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
+	 * do that already in the classifier itself as a merged combination
+	 * of classifier'n'action model.
+	 */
+
+	if (skb->mark == 0xcafe) {
+		act_update_drop_map();
+		return TC_ACT_SHOT;
+	}
+
+	/* Default configured tc opcode. */
+	return TC_ACT_UNSPEC;
+}
+
+__section("action-rand") int act_rand_main(struct __sk_buff *skb)
+{
+	/* Sorry, we're near event horizon ... */
+	if ((get_prandom_u32() & 3) == 0) {
+		act_update_drop_map();
+		return TC_ACT_SHOT;
+	}
+
+	return TC_ACT_UNSPEC;
+}
+
+/* Last but not least, the file contains a license. Some future helper
+ * functions may only be available with a GPL license.
+ */
+char __license[] __section("license") = "GPL";

diff --git a/examples/bpf/bpf_shared.h b/examples/bpf/bpf_shared.h
new file mode 100644
index 0000000..46423ec
--- /dev/null
+++ b/examples/bpf/bpf_shared.h

@@ -0,0 +1,26 @@
+#ifndef __BPF_SHARED__
+#define __BPF_SHARED__
+
+#include <stdint.h>
+
+#include "../../include/bpf_elf.h"
+
+enum {
+	BPF_MAP_ID_PROTO,
+	BPF_MAP_ID_QUEUE,
+	BPF_MAP_ID_DROPS,
+	__BPF_MAP_ID_MAX,
+#define BPF_MAP_ID_MAX	__BPF_MAP_ID_MAX
+};
+
+struct count_tuple {
+	long packets; /* type long for __sync_fetch_and_add() */
+	long bytes;
+};
+
+struct count_queue {
+	long total;
+	long mismatch;
+};
+
+#endif /* __BPF_SHARED__ */

diff --git a/examples/bpf/bpf_sys.h b/examples/bpf/bpf_sys.h
new file mode 100644
index 0000000..6e4f09e
--- /dev/null
+++ b/examples/bpf/bpf_sys.h

@@ -0,0 +1,23 @@
+#ifndef __BPF_SYS__
+#define __BPF_SYS__
+
+#include <sys/syscall.h>
+#include <linux/bpf.h>
+
+static inline __u64 bpf_ptr_to_u64(const void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+static inline int bpf_lookup_elem(int fd, void *key, void *value)
+{
+	union bpf_attr attr = {
+		.map_fd		= fd,
+		.key		= bpf_ptr_to_u64(key),
+		.value		= bpf_ptr_to_u64(value),
+	};
+
+	return syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr));
+}
+
+#endif /* __BPF_SYS__ */

diff --git a/include/bpf_elf.h b/include/bpf_elf.h
new file mode 100644
index 0000000..4bd6bb0
--- /dev/null
+++ b/include/bpf_elf.h

@@ -0,0 +1,33 @@
+#ifndef __BPF_ELF__
+#define __BPF_ELF__
+
+#include <asm/types.h>
+
+/* Note:
+ *
+ * Below ELF section names and bpf_elf_map structure definition
+ * are not (!) kernel ABI. It's rather a "contract" between the
+ * application and the BPF loader in tc. For compatibility, the
+ * section names should stay as-is. Introduction of aliases, if
+ * needed, are a possibility, though.
+ */
+
+/* ELF section names, etc */
+#define ELF_SECTION_LICENSE	"license"
+#define ELF_SECTION_MAPS	"maps"
+#define ELF_SECTION_CLASSIFIER	"classifier"
+#define ELF_SECTION_ACTION	"action"
+
+#define ELF_MAX_MAPS		64
+#define ELF_MAX_LICENSE_LEN	128
+
+/* ELF map definition */
+struct bpf_elf_map {
+	__u32 type;
+	__u32 size_key;
+	__u32 size_value;
+	__u32 max_elem;
+	__u32 id;
+};
+
+#endif /* __BPF_ELF__ */

diff --git a/include/bpf_scm.h b/include/bpf_scm.h
new file mode 100644
index 0000000..35117d1
--- /dev/null
+++ b/include/bpf_scm.h

@@ -0,0 +1,75 @@
+#ifndef __BPF_SCM__
+#define __BPF_SCM__
+
+#include <sys/types.h>
+#include <sys/socket.h>
+
+#include "utils.h"
+#include "bpf_elf.h"
+
+#define BPF_SCM_AUX_VER		1
+#define BPF_SCM_MAX_FDS		ELF_MAX_MAPS
+#define BPF_SCM_MSG_SIZE	1024
+
+struct bpf_elf_st {
+	dev_t st_dev;
+	ino_t st_ino;
+};
+
+struct bpf_map_aux {
+	unsigned short uds_ver;
+	unsigned short num_ent;
+	char obj_name[64];
+	struct bpf_elf_st obj_st;
+	struct bpf_elf_map ent[BPF_SCM_MAX_FDS];
+};
+
+struct bpf_map_set_msg {
+	struct msghdr hdr;
+	struct iovec iov;
+	char msg_buf[BPF_SCM_MSG_SIZE];
+	struct bpf_map_aux aux;
+};
+
+static inline int *bpf_map_set_init(struct bpf_map_set_msg *msg,
+				    struct sockaddr_un *addr,
+				    unsigned int addr_len)
+{
+	const unsigned int cmsg_ctl_len = sizeof(int) * BPF_SCM_MAX_FDS;
+	struct cmsghdr *cmsg;
+
+	msg->iov.iov_base = &msg->aux;
+	msg->iov.iov_len = sizeof(msg->aux);
+
+	msg->hdr.msg_iov = &msg->iov;
+	msg->hdr.msg_iovlen = 1;
+
+	msg->hdr.msg_name = (struct sockaddr *)addr;
+	msg->hdr.msg_namelen = addr_len;
+
+	BUILD_BUG_ON(sizeof(msg->msg_buf) < cmsg_ctl_len);
+	msg->hdr.msg_control = &msg->msg_buf;
+	msg->hdr.msg_controllen	= cmsg_ctl_len;
+
+	cmsg = CMSG_FIRSTHDR(&msg->hdr);
+	cmsg->cmsg_len = msg->hdr.msg_controllen;
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type	= SCM_RIGHTS;
+
+	return (int *)CMSG_DATA(cmsg);
+}
+
+static inline void bpf_map_set_init_single(struct bpf_map_set_msg *msg,
+					   int num)
+{
+	struct cmsghdr *cmsg;
+
+	msg->hdr.msg_controllen = CMSG_LEN(sizeof(int) * num);
+	msg->iov.iov_len = offsetof(struct bpf_map_aux, ent) +
+			   sizeof(struct bpf_elf_map) * num;
+
+	cmsg = CMSG_FIRSTHDR(&msg->hdr);
+	cmsg->cmsg_len = msg->hdr.msg_controllen;
+}
+
+#endif /* __BPF_SCM__ */

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
new file mode 100644
index 0000000..08aab3a
--- /dev/null
+++ b/include/linux/bpf.h

@@ -0,0 +1,226 @@
+/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ */
+#ifndef __LINUX_BPF_H__
+#define __LINUX_BPF_H__
+
+#include <linux/types.h>
+#include <linux/bpf_common.h>
+
+/* Extended instruction set based on top of classic BPF */
+
+/* instruction classes */
+#define BPF_ALU64	0x07	/* alu mode in double word width */
+
+/* ld/ldx fields */
+#define BPF_DW		0x18	/* double word */
+#define BPF_XADD	0xc0	/* exclusive add */
+
+/* alu/jmp fields */
+#define BPF_MOV		0xb0	/* mov reg to reg */
+#define BPF_ARSH	0xc0	/* sign extending arithmetic shift right */
+
+/* change endianness of a register */
+#define BPF_END		0xd0	/* flags for endianness conversion: */
+#define BPF_TO_LE	0x00	/* convert to little-endian */
+#define BPF_TO_BE	0x08	/* convert to big-endian */
+#define BPF_FROM_LE	BPF_TO_LE
+#define BPF_FROM_BE	BPF_TO_BE
+
+#define BPF_JNE		0x50	/* jump != */
+#define BPF_JSGT	0x60	/* SGT is signed '>', GT in x86 */
+#define BPF_JSGE	0x70	/* SGE is signed '>=', GE in x86 */
+#define BPF_CALL	0x80	/* function call */
+#define BPF_EXIT	0x90	/* function return */
+
+/* Register numbers */
+enum {
+	BPF_REG_0 = 0,
+	BPF_REG_1,
+	BPF_REG_2,
+	BPF_REG_3,
+	BPF_REG_4,
+	BPF_REG_5,
+	BPF_REG_6,
+	BPF_REG_7,
+	BPF_REG_8,
+	BPF_REG_9,
+	BPF_REG_10,
+	__MAX_BPF_REG,
+};
+
+/* BPF has 10 general purpose 64-bit registers and stack frame. */
+#define MAX_BPF_REG	__MAX_BPF_REG
+
+struct bpf_insn {
+	__u8	code;		/* opcode */
+	__u8	dst_reg:4;	/* dest register */
+	__u8	src_reg:4;	/* source register */
+	__s16	off;		/* signed offset */
+	__s32	imm;		/* signed immediate constant */
+};
+
+/* BPF syscall commands */
+enum bpf_cmd {
+	/* create a map with given type and attributes
+	 * fd = bpf(BPF_MAP_CREATE, union bpf_attr *, u32 size)
+	 * returns fd or negative error
+	 * map is deleted when fd is closed
+	 */
+	BPF_MAP_CREATE,
+
+	/* lookup key in a given map
+	 * err = bpf(BPF_MAP_LOOKUP_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value
+	 * returns zero and stores found elem into value
+	 * or negative error
+	 */
+	BPF_MAP_LOOKUP_ELEM,
+
+	/* create or update key/value pair in a given map
+	 * err = bpf(BPF_MAP_UPDATE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->value, attr->flags
+	 * returns zero or negative error
+	 */
+	BPF_MAP_UPDATE_ELEM,
+
+	/* find and delete elem by key in a given map
+	 * err = bpf(BPF_MAP_DELETE_ELEM, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key
+	 * returns zero or negative error
+	 */
+	BPF_MAP_DELETE_ELEM,
+
+	/* lookup key in a given map and return next key
+	 * err = bpf(BPF_MAP_GET_NEXT_KEY, union bpf_attr *attr, u32 size)
+	 * Using attr->map_fd, attr->key, attr->next_key
+	 * returns zero and stores next key or negative error
+	 */
+	BPF_MAP_GET_NEXT_KEY,
+
+	/* verify and load eBPF program
+	 * prog_fd = bpf(BPF_PROG_LOAD, union bpf_attr *attr, u32 size)
+	 * Using attr->prog_type, attr->insns, attr->license
+	 * returns fd or negative error
+	 */
+	BPF_PROG_LOAD,
+};
+
+enum bpf_map_type {
+	BPF_MAP_TYPE_UNSPEC,
+	BPF_MAP_TYPE_HASH,
+	BPF_MAP_TYPE_ARRAY,
+};
+
+enum bpf_prog_type {
+	BPF_PROG_TYPE_UNSPEC,
+	BPF_PROG_TYPE_SOCKET_FILTER,
+	BPF_PROG_TYPE_SCHED_CLS,
+	BPF_PROG_TYPE_SCHED_ACT,
+};
+
+#define BPF_PSEUDO_MAP_FD	1
+
+/* flags for BPF_MAP_UPDATE_ELEM command */
+#define BPF_ANY		0 /* create new element or update existing */
+#define BPF_NOEXIST	1 /* create new element if it didn't exist */
+#define BPF_EXIST	2 /* update existing element */
+
+union bpf_attr {
+	struct { /* anonymous struct used by BPF_MAP_CREATE command */
+		__u32	map_type;	/* one of enum bpf_map_type */
+		__u32	key_size;	/* size of key in bytes */
+		__u32	value_size;	/* size of value in bytes */
+		__u32	max_entries;	/* max number of entries in a map */
+	};
+
+	struct { /* anonymous struct used by BPF_MAP_*_ELEM commands */
+		__u32		map_fd;
+		__aligned_u64	key;
+		union {
+			__aligned_u64 value;
+			__aligned_u64 next_key;
+		};
+		__u64		flags;
+	};
+
+	struct { /* anonymous struct used by BPF_PROG_LOAD command */
+		__u32		prog_type;	/* one of enum bpf_prog_type */
+		__u32		insn_cnt;
+		__aligned_u64	insns;
+		__aligned_u64	license;
+		__u32		log_level;	/* verbosity level of verifier */
+		__u32		log_size;	/* size of user buffer */
+		__aligned_u64	log_buf;	/* user supplied buffer */
+	};
+} __attribute__((aligned(8)));
+
+/* integer value in 'imm' field of BPF_CALL instruction selects which helper
+ * function eBPF program intends to call
+ */
+enum bpf_func_id {
+	BPF_FUNC_unspec,
+	BPF_FUNC_map_lookup_elem, /* void *map_lookup_elem(&map, &key) */
+	BPF_FUNC_map_update_elem, /* int map_update_elem(&map, &key, &value, flags) */
+	BPF_FUNC_map_delete_elem, /* int map_delete_elem(&map, &key) */
+	BPF_FUNC_get_prandom_u32, /* u32 prandom_u32(void) */
+	BPF_FUNC_get_smp_processor_id, /* u32 raw_smp_processor_id(void) */
+
+	/**
+	 * skb_store_bytes(skb, offset, from, len, flags) - store bytes into packet
+	 * @skb: pointer to skb
+	 * @offset: offset within packet from skb->data
+	 * @from: pointer where to copy bytes from
+	 * @len: number of bytes to store into packet
+	 * @flags: bit 0 - if true, recompute skb->csum
+	 *         other bits - reserved
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_skb_store_bytes,
+
+	/**
+	 * l3_csum_replace(skb, offset, from, to, flags) - recompute IP checksum
+	 * @skb: pointer to skb
+	 * @offset: offset within packet where IP checksum is located
+	 * @from: old value of header field
+	 * @to: new value of header field
+	 * @flags: bits 0-3 - size of header field
+	 *         other bits - reserved
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_l3_csum_replace,
+
+	/**
+	 * l4_csum_replace(skb, offset, from, to, flags) - recompute TCP/UDP checksum
+	 * @skb: pointer to skb
+	 * @offset: offset within packet where TCP/UDP checksum is located
+	 * @from: old value of header field
+	 * @to: new value of header field
+	 * @flags: bits 0-3 - size of header field
+	 *         bit 4 - is pseudo header
+	 *         other bits - reserved
+	 * Return: 0 on success
+	 */
+	BPF_FUNC_l4_csum_replace,
+	__BPF_FUNC_MAX_ID,
+};
+
+/* user accessible mirror of in-kernel sk_buff.
+ * new fields can only be added to the end of this structure
+ */
+struct __sk_buff {
+	__u32 len;
+	__u32 pkt_type;
+	__u32 mark;
+	__u32 queue_mapping;
+	__u32 protocol;
+	__u32 vlan_present;
+	__u32 vlan_tci;
+	__u32 vlan_proto;
+	__u32 priority;
+};
+
+#endif /* __LINUX_BPF_H__ */

diff --git a/include/linux/filter.h b/include/linux/filter.h
index 8688a98..344781d 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h

@@ -77,7 +77,8 @@
 #define SKF_AD_VLAN_TAG_PRESENT 48
 #define SKF_AD_PAY_OFFSET	52
 #define SKF_AD_RANDOM	56
-#define SKF_AD_MAX	60
+#define SKF_AD_VLAN_TPID	60
+#define SKF_AD_MAX	64
 #define SKF_NET_OFF   (-0x100000)
 #define SKF_LL_OFF    (-0x200000)
 

diff --git a/include/linux/if_addr.h b/include/linux/if_addr.h
index cc375e4..26f0ecf 100644
--- a/include/linux/if_addr.h
+++ b/include/linux/if_addr.h

@@ -50,6 +50,8 @@
 #define IFA_F_PERMANENT		0x80
 #define IFA_F_MANAGETEMPADDR	0x100
 #define IFA_F_NOPREFIXROUTE	0x200
+#define IFA_F_MCAUTOJOIN	0x400
+#define IFA_F_STABLE_PRIVACY	0x800
 
 struct ifa_cacheinfo {
 	__u32	ifa_prefered;

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 3450c3f..6689e8f 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h

@@ -147,6 +147,7 @@
 	IFLA_CARRIER_CHANGES,
 	IFLA_PHYS_SWITCH_ID,
 	IFLA_LINK_NETNSID,
+	IFLA_PHYS_PORT_NAME,
 	__IFLA_MAX
 };
 
@@ -213,6 +214,7 @@
 enum in6_addr_gen_mode {
 	IN6_ADDR_GEN_MODE_EUI64,
 	IN6_ADDR_GEN_MODE_NONE,
+	IN6_ADDR_GEN_MODE_STABLE_PRIVACY,
 };
 
 /* Bridge section */
@@ -222,6 +224,9 @@
 	IFLA_BR_FORWARD_DELAY,
 	IFLA_BR_HELLO_TIME,
 	IFLA_BR_MAX_AGE,
+	IFLA_BR_AGEING_TIME,
+	IFLA_BR_STP_STATE,
+	IFLA_BR_PRIORITY,
 	__IFLA_BR_MAX,
 };
 
@@ -245,6 +250,7 @@
 	IFLA_BRPORT_UNICAST_FLOOD, /* flood unicast traffic */
 	IFLA_BRPORT_PROXYARP,	/* proxy ARP */
 	IFLA_BRPORT_LEARNING_SYNC, /* mac learning sync from device */
+	IFLA_BRPORT_PROXYARP_WIFI, /* proxy ARP for Wi-Fi */
 	__IFLA_BRPORT_MAX
 };
 #define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)

diff --git a/include/linux/mpls.h b/include/linux/mpls.h
new file mode 100644
index 0000000..0893902
--- /dev/null
+++ b/include/linux/mpls.h

@@ -0,0 +1,34 @@
+#ifndef _MPLS_H
+#define _MPLS_H
+
+#include <linux/types.h>
+#include <asm/byteorder.h>
+
+/* Reference: RFC 5462, RFC 3032
+ *
+ *  0                   1                   2                   3
+ *  0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ * |                Label                  | TC  |S|       TTL     |
+ * +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *
+ *	Label:  Label Value, 20 bits
+ *	TC:     Traffic Class field, 3 bits
+ *	S:      Bottom of Stack, 1 bit
+ *	TTL:    Time to Live, 8 bits
+ */
+
+struct mpls_label {
+	__be32 entry;
+};
+
+#define MPLS_LS_LABEL_MASK      0xFFFFF000
+#define MPLS_LS_LABEL_SHIFT     12
+#define MPLS_LS_TC_MASK         0x00000E00
+#define MPLS_LS_TC_SHIFT        9
+#define MPLS_LS_S_MASK          0x00000100
+#define MPLS_LS_S_SHIFT         8
+#define MPLS_LS_TTL_MASK        0x000000FF
+#define MPLS_LS_TTL_SHIFT       0
+
+#endif /* _MPLS_H */

diff --git a/include/linux/neighbour.h b/include/linux/neighbour.h
index 3873a35..2e35c61 100644
--- a/include/linux/neighbour.h
+++ b/include/linux/neighbour.h

@@ -126,6 +126,7 @@
 	NDTPA_PROXY_QLEN,		/* u32 */
 	NDTPA_LOCKTIME,			/* u64, msecs */
 	NDTPA_QUEUE_LENBYTES,		/* u32 */
+	NDTPA_MCAST_REPROBES,		/* u32 */
 	__NDTPA_MAX
 };
 #define NDTPA_MAX (__NDTPA_MAX - 1)

diff --git a/include/linux/pkt_cls.h b/include/linux/pkt_cls.h
index 25731df..bf08e76 100644
--- a/include/linux/pkt_cls.h
+++ b/include/linux/pkt_cls.h

@@ -397,6 +397,8 @@
 	TCA_BPF_CLASSID,
 	TCA_BPF_OPS_LEN,
 	TCA_BPF_OPS,
+	TCA_BPF_FD,
+	TCA_BPF_NAME,
 	__TCA_BPF_MAX,
 };
 

diff --git a/include/linux/rtnetlink.h b/include/linux/rtnetlink.h
index 3eb7810..702b19b 100644
--- a/include/linux/rtnetlink.h
+++ b/include/linux/rtnetlink.h

@@ -134,6 +134,8 @@
 
 	RTM_NEWNSID = 88,
 #define RTM_NEWNSID RTM_NEWNSID
+	RTM_DELNSID = 89,
+#define RTM_DELNSID RTM_DELNSID
 	RTM_GETNSID = 90,
 #define RTM_GETNSID RTM_GETNSID
 
@@ -303,6 +305,9 @@
 	RTA_TABLE,
 	RTA_MARK,
 	RTA_MFC_STATS,
+	RTA_VIA,
+	RTA_NEWDST,
+	RTA_PREF,
 	__RTA_MAX
 };
 
@@ -332,6 +337,7 @@
 #define RTNH_F_DEAD		1	/* Nexthop is dead (used by multipath)	*/
 #define RTNH_F_PERVASIVE	2	/* Do recursive gateway lookup	*/
 #define RTNH_F_ONLINK		4	/* Gateway is forced on link	*/
+#define RTNH_F_EXTERNAL		8	/* Route installed externally	*/
 
 /* Macros to handle hexthops */
 
@@ -344,6 +350,12 @@
 #define RTNH_SPACE(len)	RTNH_ALIGN(RTNH_LENGTH(len))
 #define RTNH_DATA(rtnh)   ((struct rtattr*)(((char*)(rtnh)) + RTNH_LENGTH(0)))
 
+/* RTA_VIA */
+struct rtvia {
+	__kernel_sa_family_t	rtvia_family;
+	__u8			rtvia_addr[0];
+};
+
 /* RTM_CACHEINFO */
 
 struct rta_cacheinfo {
@@ -621,6 +633,10 @@
 #define RTNLGRP_IPV6_NETCONF	RTNLGRP_IPV6_NETCONF
 	RTNLGRP_MDB,
 #define RTNLGRP_MDB		RTNLGRP_MDB
+	RTNLGRP_MPLS_ROUTE,
+#define RTNLGRP_MPLS_ROUTE	RTNLGRP_MPLS_ROUTE
+	RTNLGRP_NSID,
+#define RTNLGRP_NSID		RTNLGRP_NSID
 	__RTNLGRP_MAX
 };
 #define RTNLGRP_MAX	(__RTNLGRP_MAX - 1)

diff --git a/include/linux/tc_act/tc_bpf.h b/include/linux/tc_act/tc_bpf.h
index 5288bd7..07f17cc 100644
--- a/include/linux/tc_act/tc_bpf.h
+++ b/include/linux/tc_act/tc_bpf.h

@@ -24,6 +24,8 @@
 	TCA_ACT_BPF_PARMS,
 	TCA_ACT_BPF_OPS_LEN,
 	TCA_ACT_BPF_OPS,
+	TCA_ACT_BPF_FD,
+	TCA_ACT_BPF_NAME,
 	__TCA_ACT_BPF_MAX,
 };
 #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)

diff --git a/include/linux/xfrm.h b/include/linux/xfrm.h
index 3a1fd32..b8f5451 100644
--- a/include/linux/xfrm.h
+++ b/include/linux/xfrm.h

@@ -1,6 +1,7 @@
 #ifndef _LINUX_XFRM_H
 #define _LINUX_XFRM_H
 
+#include <linux/in6.h>
 #include <linux/types.h>
 
 /* All of the structures in this file may not change size as they are
@@ -13,6 +14,7 @@
 typedef union {
 	__be32		a4;
 	__be32		a6[4];
+	struct in6_addr	in6;
 } xfrm_address_t;
 
 /* Ident of a specific xfrm_state. It is used on input to lookup

diff --git a/include/rt_names.h b/include/rt_names.h
index c0ea4f9..921be06 100644
--- a/include/rt_names.h
+++ b/include/rt_names.h

@@ -22,7 +22,7 @@
 
 
 const char * ll_type_n2a(int type, char *buf, int len);
-const char *ll_addr_n2a(unsigned char *addr, int alen,
+const char *ll_addr_n2a(const unsigned char *addr, int alen,
 			int type, char *buf, int blen);
 int ll_addr_a2n(char *lladdr, int len, const char *arg);
 

diff --git a/include/utils.h b/include/utils.h
index 9151c4f..2277b74 100644
--- a/include/utils.h
+++ b/include/utils.h

@@ -50,10 +50,11 @@
 
 typedef struct
 {
-	__u8 family;
-	__u8 bytelen;
+	__u16 flags;
+	__u16 bytelen;
 	__s16 bitlen;
-	__u32 flags;
+	/* These next two fields match rtvia */
+	__u16 family;
 	__u32 data[8];
 } inet_prefix;
 
@@ -77,6 +78,13 @@
 	u_int8_t  ipx_node[IPX_NODE_LEN];
 };
 
+#ifndef AF_MPLS
+# define AF_MPLS 28
+#endif
+
+/* Maximum number of labels the mpls helpers support */
+#define MPLS_MAX_LABELS 8
+
 extern __u32 get_addr32(const char *name);
 extern int get_addr_1(inet_prefix *dst, const char *arg, int family);
 extern int get_prefix_1(inet_prefix *dst, char *arg, int family);
@@ -106,9 +114,12 @@
 
 extern const char *format_host(int af, int len, const void *addr,
 			       char *buf, int buflen);
-extern const char *rt_addr_n2a(int af, const void *addr,
+extern const char *rt_addr_n2a(int af, int len, const void *addr,
 			       char *buf, int buflen);
 
+extern int read_family(const char *name);
+extern const char *family_name(int family);
+
 void missarg(const char *) __attribute__((noreturn));
 void invarg(const char *, const char *) __attribute__((noreturn));
 void duparg(const char *, const char *) __attribute__((noreturn));
@@ -122,6 +133,9 @@
 const char *ipx_ntop(int af, const void *addr, char *str, size_t len);
 int ipx_pton(int af, const char *src, void *addr);
 
+const char *mpls_ntop(int af, const void *addr, char *str, size_t len);
+int mpls_pton(int af, const char *src, void *addr);
+
 extern int __iproute2_hz_internal;
 extern int __get_hz(void);
 
@@ -157,6 +171,25 @@
 
 #define ARRAY_SIZE(x) (sizeof(x) / sizeof((x)[0]))
 
+#define BUILD_BUG_ON(cond) ((void)sizeof(char[1 - 2 * !!(cond)]))
+
+#ifndef offsetof
+# define offsetof(type, member) ((size_t) &((type *)0)->member)
+#endif
+
+#ifndef min
+# define min(x, y) ({			\
+	typeof(x) _min1 = (x);		\
+	typeof(y) _min2 = (y);		\
+	(void) (&_min1 == &_min2);	\
+	_min1 < _min2 ? _min1 : _min2; })
+#endif
+
+#ifndef __check_format_string
+# define __check_format_string(pos_str, pos_args) \
+	__attribute__ ((format (printf, (pos_str), (pos_args))))
+#endif
+
 extern int cmdlineno;
 extern ssize_t getcmdline(char **line, size_t *len, FILE *in);
 extern int makeargs(char *line, char *argv[], int maxargs);

diff --git a/ip/ip.c b/ip/ip.c
index da16b15..f7f214b 100644
--- a/ip/ip.c
+++ b/ip/ip.c

@@ -52,7 +52,7 @@
 "                   netns | l2tp | fou | tcp_metrics | token | netconf }\n"
 "       OPTIONS := { -V[ersion] | -s[tatistics] | -d[etails] | -r[esolve] |\n"
 "                    -h[uman-readable] | -iec |\n"
-"                    -f[amily] { inet | inet6 | ipx | dnet | bridge | link } |\n"
+"                    -f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | link } |\n"
 "                    -4 | -6 | -I | -D | -B | -0 |\n"
 "                    -l[oops] { maximum-addr-flush-attempts } |\n"
 "                    -o[neline] | -t[imestamp] | -ts[hort] | -b[atch] [filename] |\n"
@@ -190,21 +190,11 @@
 			argv++;
 			if (argc <= 1)
 				usage();
-			if (strcmp(argv[1], "inet") == 0)
-				preferred_family = AF_INET;
-			else if (strcmp(argv[1], "inet6") == 0)
-				preferred_family = AF_INET6;
-			else if (strcmp(argv[1], "dnet") == 0)
-				preferred_family = AF_DECnet;
-			else if (strcmp(argv[1], "link") == 0)
-				preferred_family = AF_PACKET;
-			else if (strcmp(argv[1], "ipx") == 0)
-				preferred_family = AF_IPX;
-			else if (strcmp(argv[1], "bridge") == 0)
-				preferred_family = AF_BRIDGE;
-			else if (strcmp(argv[1], "help") == 0)
+			if (strcmp(argv[1], "help") == 0)
 				usage();
 			else
+				preferred_family = read_family(argv[1]);
+			if (preferred_family == AF_UNSPEC)
 				invarg("invalid protocol family", argv[1]);
 		} else if (strcmp(opt, "-4") == 0) {
 			preferred_family = AF_INET;
@@ -216,6 +206,8 @@
 			preferred_family = AF_IPX;
 		} else if (strcmp(opt, "-D") == 0) {
 			preferred_family = AF_DECnet;
+		} else if (strcmp(opt, "-M") == 0) {
+			preferred_family = AF_MPLS;
 		} else if (strcmp(opt, "-B") == 0) {
 			preferred_family = AF_BRIDGE;
 		} else if (matches(opt, "-human") == 0 ||

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 99a6ab5..e582da0 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c

@@ -85,7 +85,7 @@
 	fprintf(stderr, "           [-]tentative | [-]deprecated | [-]dadfailed | temporary |\n");
 	fprintf(stderr, "           CONFFLAG-LIST ]\n");
 	fprintf(stderr, "CONFFLAG-LIST := [ CONFFLAG-LIST ] CONFFLAG\n");
-	fprintf(stderr, "CONFFLAG  := [ home | nodad | mngtmpaddr | noprefixroute ]\n");
+	fprintf(stderr, "CONFFLAG  := [ home | nodad | mngtmpaddr | noprefixroute | autojoin ]\n");
 	fprintf(stderr, "LIFETIME := [ valid_lft LFT ] [ preferred_lft LFT ]\n");
 	fprintf(stderr, "LFT := forever | SECONDS\n");
 
@@ -915,6 +915,10 @@
 		ifa_flags &= ~IFA_F_NOPREFIXROUTE;
 		fprintf(fp, "noprefixroute ");
 	}
+	if (ifa_flags & IFA_F_MCAUTOJOIN) {
+		ifa_flags &= ~IFA_F_MCAUTOJOIN;
+		fprintf(fp, "autojoin ");
+	}
 	if (!(ifa_flags & IFA_F_PERMANENT)) {
 		fprintf(fp, "dynamic ");
 	} else
@@ -1354,6 +1358,9 @@
 		} else if (strcmp(*argv, "noprefixroute") == 0) {
 			filter.flags |= IFA_F_NOPREFIXROUTE;
 			filter.flagmask |= IFA_F_NOPREFIXROUTE;
+		} else if (strcmp(*argv, "autojoin") == 0) {
+			filter.flags |= IFA_F_MCAUTOJOIN;
+			filter.flagmask |= IFA_F_MCAUTOJOIN;
 		} else if (strcmp(*argv, "dadfailed") == 0) {
 			filter.flags |= IFA_F_DADFAILED;
 			filter.flagmask |= IFA_F_DADFAILED;
@@ -1558,6 +1565,16 @@
 	return 0;
 }
 
+static bool ipaddr_is_multicast(inet_prefix *a)
+{
+	if (a->family == AF_INET)
+		return IN_MULTICAST(ntohl(a->data[0]));
+	else if (a->family == AF_INET6)
+		return IN6_IS_ADDR_MULTICAST(a->data);
+	else
+		return false;
+}
+
 static int ipaddr_modify(int cmd, int flags, int argc, char **argv)
 {
 	struct {
@@ -1665,6 +1682,8 @@
 			ifa_flags |= IFA_F_MANAGETEMPADDR;
 		} else if (strcmp(*argv, "noprefixroute") == 0) {
 			ifa_flags |= IFA_F_NOPREFIXROUTE;
+		} else if (strcmp(*argv, "autojoin") == 0) {
+			ifa_flags |= IFA_F_MCAUTOJOIN;
 		} else {
 			if (strcmp(*argv, "local") == 0) {
 				NEXT_ARG();
@@ -1755,6 +1774,11 @@
 			  sizeof(cinfo));
 	}
 
+	if ((ifa_flags & IFA_F_MCAUTOJOIN) && !ipaddr_is_multicast(&lcl)) {
+		fprintf(stderr, "autojoin needs multicast address\n");
+		return -1;
+	}
+
 	if (rtnl_talk(&rth, &req.n, 0, 0, NULL) < 0)
 		return -2;
 

diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c
index 3009ec9..a573f92 100644
--- a/ip/iplink_bond.c
+++ b/ip/iplink_bond.c

@@ -415,6 +415,7 @@
 			if (iptb[i])
 				fprintf(f, "%s",
 					rt_addr_n2a(AF_INET,
+						    RTA_PAYLOAD(iptb[i]),
 						    RTA_DATA(iptb[i]),
 						    buf,
 						    INET_ADDRSTRLEN));

diff --git a/ip/ipmonitor.c b/ip/ipmonitor.c
index 6b5e665..7833a26 100644
--- a/ip/ipmonitor.c
+++ b/ip/ipmonitor.c

@@ -158,6 +158,7 @@
 	groups |= nl_mgrp(RTNLGRP_IPV6_IFADDR);
 	groups |= nl_mgrp(RTNLGRP_IPV4_ROUTE);
 	groups |= nl_mgrp(RTNLGRP_IPV6_ROUTE);
+	groups |= nl_mgrp(RTNLGRP_MPLS_ROUTE);
 	groups |= nl_mgrp(RTNLGRP_IPV4_MROUTE);
 	groups |= nl_mgrp(RTNLGRP_IPV6_MROUTE);
 	groups |= nl_mgrp(RTNLGRP_IPV6_PREFIX);
@@ -235,6 +236,8 @@
 			groups |= nl_mgrp(RTNLGRP_IPV4_ROUTE);
 		if (!preferred_family || preferred_family == AF_INET6)
 			groups |= nl_mgrp(RTNLGRP_IPV6_ROUTE);
+		if (!preferred_family || preferred_family == AF_MPLS)
+			groups |= nl_mgrp(RTNLGRP_MPLS_ROUTE);
 	}
 	if (lmroute) {
 		if (!preferred_family || preferred_family == AF_INET)

diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index b4ed9f1..13ac892 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c

@@ -116,6 +116,7 @@
 	if (tb[RTA_SRC])
 		len = snprintf(obuf, sizeof(obuf),
 			       "(%s, ", rt_addr_n2a(family,
+						    RTA_PAYLOAD(tb[RTA_SRC]),
 						    RTA_DATA(tb[RTA_SRC]),
 						    abuf, sizeof(abuf)));
 	else
@@ -123,6 +124,7 @@
 	if (tb[RTA_DST])
 		snprintf(obuf + len, sizeof(obuf) - len,
 			 "%s)", rt_addr_n2a(family,
+					    RTA_PAYLOAD(tb[RTA_DST]),
 					    RTA_DATA(tb[RTA_DST]),
 					    abuf, sizeof(abuf)));
 	else

diff --git a/ip/ipprefix.c b/ip/ipprefix.c
index 02c0efc..26b5961 100644
--- a/ip/ipprefix.c
+++ b/ip/ipprefix.c

@@ -80,7 +80,9 @@
 		pfx = (struct in6_addr *)RTA_DATA(tb[PREFIX_ADDRESS]);
 
 		memset(abuf, '\0', sizeof(abuf));
-		fprintf(fp, "%s", rt_addr_n2a(family, pfx,
+		fprintf(fp, "%s", rt_addr_n2a(family,
+					      RTA_PAYLOAD(tb[PREFIX_ADDRESS]),
+					      pfx,
 					      abuf, sizeof(abuf)));
 	}
 	fprintf(fp, "/%u ", prefix->prefix_len);

diff --git a/ip/iproute.c b/ip/iproute.c
index 024d401..132a83a 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c

@@ -23,6 +23,7 @@
 #include <netinet/ip.h>
 #include <arpa/inet.h>
 #include <linux/in_route.h>
+#include <linux/icmpv6.h>
 #include <errno.h>
 
 #include "rt_names.h"
@@ -75,19 +76,22 @@
 	fprintf(stderr, "             [ table TABLE_ID ] [ proto RTPROTO ]\n");
 	fprintf(stderr, "             [ scope SCOPE ] [ metric METRIC ]\n");
 	fprintf(stderr, "INFO_SPEC := NH OPTIONS FLAGS [ nexthop NH ]...\n");
-	fprintf(stderr, "NH := [ via ADDRESS ] [ dev STRING ] [ weight NUMBER ] NHFLAGS\n");
-	fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ]\n");
+	fprintf(stderr, "NH := [ via [ FAMILY ] ADDRESS ] [ dev STRING ] [ weight NUMBER ] NHFLAGS\n");
+	fprintf(stderr, "FAMILY := [ inet | inet6 | ipx | dnet | mpls | bridge | link ]");
+	fprintf(stderr, "OPTIONS := FLAGS [ mtu NUMBER ] [ advmss NUMBER ] [ as [ to ] ADDRESS ]\n");
 	fprintf(stderr, "           [ rtt TIME ] [ rttvar TIME ] [ reordering NUMBER ]\n");
 	fprintf(stderr, "           [ window NUMBER] [ cwnd NUMBER ] [ initcwnd NUMBER ]\n");
 	fprintf(stderr, "           [ ssthresh NUMBER ] [ realms REALM ] [ src ADDRESS ]\n");
 	fprintf(stderr, "           [ rto_min TIME ] [ hoplimit NUMBER ] [ initrwnd NUMBER ]\n");
 	fprintf(stderr, "           [ features FEATURES ] [ quickack BOOL ] [ congctl NAME ]\n");
+	fprintf(stderr, "           [ pref PREF ]\n");
 	fprintf(stderr, "TYPE := [ unicast | local | broadcast | multicast | throw |\n");
 	fprintf(stderr, "          unreachable | prohibit | blackhole | nat ]\n");
 	fprintf(stderr, "TABLE_ID := [ local | main | default | all | NUMBER ]\n");
 	fprintf(stderr, "SCOPE := [ host | link | global | NUMBER ]\n");
 	fprintf(stderr, "NHFLAGS := [ onlink | pervasive ]\n");
 	fprintf(stderr, "RTPROTO := [ kernel | boot | static | NUMBER ]\n");
+	fprintf(stderr, "PREF := [ low | medium | high ]\n");
 	fprintf(stderr, "TIME := NUMBER[s|ms]\n");
 	fprintf(stderr, "BOOL := [1|0]\n");
 	fprintf(stderr, "FEATURES := ecn\n");
@@ -185,8 +189,15 @@
 	    (r->rtm_family != filter.msrc.family ||
 	     (filter.msrc.bitlen >= 0 && filter.msrc.bitlen < r->rtm_src_len)))
 		return 0;
-	if (filter.rvia.family && r->rtm_family != filter.rvia.family)
-		return 0;
+	if (filter.rvia.family) {
+		int family = r->rtm_family;
+		if (tb[RTA_VIA]) {
+			struct rtvia *via = RTA_DATA(tb[RTA_VIA]);
+			family = via->rtvia_family;
+		}
+		if (family != filter.rvia.family)
+			return 0;
+	}
 	if (filter.rprefsrc.family && r->rtm_family != filter.rprefsrc.family)
 		return 0;
 
@@ -205,6 +216,12 @@
 		via.family = r->rtm_family;
 		if (tb[RTA_GATEWAY])
 			memcpy(&via.data, RTA_DATA(tb[RTA_GATEWAY]), host_len/8);
+		if (tb[RTA_VIA]) {
+			size_t len = RTA_PAYLOAD(tb[RTA_VIA]) - 2;
+			struct rtvia *rtvia = RTA_DATA(tb[RTA_VIA]);
+			via.family = rtvia->rtvia_family;
+			memcpy(&via.data, rtvia->rtvia_addr, len);
+		}
 	}
 	if (filter.rprefsrc.bitlen>0) {
 		memset(&prefsrc, 0, sizeof(prefsrc));
@@ -339,8 +356,9 @@
 	if (tb[RTA_DST]) {
 		if (r->rtm_dst_len != host_len) {
 			fprintf(fp, "%s/%u ", rt_addr_n2a(r->rtm_family,
-							 RTA_DATA(tb[RTA_DST]),
-							 abuf, sizeof(abuf)),
+						       RTA_PAYLOAD(tb[RTA_DST]),
+						       RTA_DATA(tb[RTA_DST]),
+						       abuf, sizeof(abuf)),
 				r->rtm_dst_len
 				);
 		} else {
@@ -358,8 +376,9 @@
 	if (tb[RTA_SRC]) {
 		if (r->rtm_src_len != host_len) {
 			fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family,
-							 RTA_DATA(tb[RTA_SRC]),
-							 abuf, sizeof(abuf)),
+						       RTA_PAYLOAD(tb[RTA_SRC]),
+						       RTA_DATA(tb[RTA_SRC]),
+						       abuf, sizeof(abuf)),
 				r->rtm_src_len
 				);
 		} else {
@@ -372,6 +391,13 @@
 	} else if (r->rtm_src_len) {
 		fprintf(fp, "from 0/%u ", r->rtm_src_len);
 	}
+	if (tb[RTA_NEWDST]) {
+		fprintf(fp, "as to %s ", format_host(r->rtm_family,
+						  RTA_PAYLOAD(tb[RTA_NEWDST]),
+						  RTA_DATA(tb[RTA_NEWDST]),
+						  abuf, sizeof(abuf))
+			);
+	}
 	if (r->rtm_tos && filter.tosmask != -1) {
 		SPRINT_BUF(b1);
 		fprintf(fp, "tos %s ", rtnl_dsfield_n2a(r->rtm_tos, b1, sizeof(b1)));
@@ -384,6 +410,14 @@
 				    RTA_DATA(tb[RTA_GATEWAY]),
 				    abuf, sizeof(abuf)));
 	}
+	if (tb[RTA_VIA]) {
+		size_t len = RTA_PAYLOAD(tb[RTA_VIA]) - 2;
+		struct rtvia *via = RTA_DATA(tb[RTA_VIA]);
+		fprintf(fp, "via %s %s ",
+			family_name(via->rtvia_family),
+			format_host(via->rtvia_family, len, via->rtvia_addr,
+				    abuf, sizeof(abuf)));
+	}
 	if (tb[RTA_OIF] && filter.oifmask != -1)
 		fprintf(fp, "dev %s ", ll_index_to_name(*(int*)RTA_DATA(tb[RTA_OIF])));
 
@@ -401,6 +435,7 @@
 		 */
 		fprintf(fp, " src %s ",
 			rt_addr_n2a(r->rtm_family,
+				    RTA_PAYLOAD(tb[RTA_PREFSRC]),
 				    RTA_DATA(tb[RTA_PREFSRC]),
 				    abuf, sizeof(abuf)));
 	}
@@ -412,6 +447,8 @@
 		fprintf(fp, "onlink ");
 	if (r->rtm_flags & RTNH_F_PERVASIVE)
 		fprintf(fp, "pervasive ");
+	if (r->rtm_flags & RTNH_F_EXTERNAL)
+		fprintf(fp, "external ");
 	if (r->rtm_flags & RTM_F_NOTIFY)
 		fprintf(fp, "notify ");
 	if (tb[RTA_MARK]) {
@@ -598,6 +635,14 @@
 							    RTA_DATA(tb[RTA_GATEWAY]),
 							    abuf, sizeof(abuf)));
 				}
+				if (tb[RTA_VIA]) {
+					size_t len = RTA_PAYLOAD(tb[RTA_VIA]) - 2;
+					struct rtvia *via = RTA_DATA(tb[RTA_VIA]);
+					fprintf(fp, "via %s %s ",
+						family_name(via->rtvia_family),
+						format_host(via->rtvia_family, len, via->rtvia_addr,
+							    abuf, sizeof(abuf)));
+				}
 				if (tb[RTA_FLOW]) {
 					__u32 to = rta_getattr_u32(tb[RTA_FLOW]);
 					__u32 from = to>>16;
@@ -629,6 +674,24 @@
 			nh = RTNH_NEXT(nh);
 		}
 	}
+	if (tb[RTA_PREF]) {
+		unsigned int pref = rta_getattr_u8(tb[RTA_PREF]);
+		fprintf(fp, " pref ");
+
+		switch (pref) {
+		case ICMPV6_ROUTER_PREF_LOW:
+			fprintf(fp, "low");
+			break;
+		case ICMPV6_ROUTER_PREF_MEDIUM:
+			fprintf(fp, "medium");
+			break;
+		case ICMPV6_ROUTER_PREF_HIGH:
+			fprintf(fp, "high");
+			break;
+		default:
+			fprintf(fp, "%u", pref);
+		}
+	}
 	fprintf(fp, "\n");
 	fflush(fp);
 	return 0;
@@ -645,12 +708,23 @@
 	while (++argv, --argc > 0) {
 		if (strcmp(*argv, "via") == 0) {
 			inet_prefix addr;
+			int family;
 			NEXT_ARG();
-			get_addr(&addr, *argv, r->rtm_family);
+			family = read_family(*argv);
+			if (family == AF_UNSPEC)
+				family = r->rtm_family;
+			else
+				NEXT_ARG();
+			get_addr(&addr, *argv, family);
 			if (r->rtm_family == AF_UNSPEC)
 				r->rtm_family = addr.family;
-			rta_addattr_l(rta, 4096, RTA_GATEWAY, &addr.data, addr.bytelen);
-			rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
+			if (addr.family == r->rtm_family) {
+				rta_addattr_l(rta, 4096, RTA_GATEWAY, &addr.data, addr.bytelen);
+				rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen;
+			} else {
+				rta_addattr_l(rta, 4096, RTA_VIA, &addr.family, addr.bytelen+2);
+				rtnh->rtnh_len += sizeof(struct rtattr) + addr.bytelen+2;
+			}
 		} else if (strcmp(*argv, "dev") == 0) {
 			NEXT_ARG();
 			if ((rtnh->rtnh_ifindex = ll_name_to_index(*argv)) == 0) {
@@ -756,14 +830,33 @@
 			if (req.r.rtm_family == AF_UNSPEC)
 				req.r.rtm_family = addr.family;
 			addattr_l(&req.n, sizeof(req), RTA_PREFSRC, &addr.data, addr.bytelen);
-		} else if (strcmp(*argv, "via") == 0) {
+		} else if (strcmp(*argv, "as") == 0) {
 			inet_prefix addr;
-			gw_ok = 1;
 			NEXT_ARG();
+			if (strcmp(*argv, "to") == 0) {
+				NEXT_ARG();
+			}
 			get_addr(&addr, *argv, req.r.rtm_family);
 			if (req.r.rtm_family == AF_UNSPEC)
 				req.r.rtm_family = addr.family;
-			addattr_l(&req.n, sizeof(req), RTA_GATEWAY, &addr.data, addr.bytelen);
+			addattr_l(&req.n, sizeof(req), RTA_NEWDST, &addr.data, addr.bytelen);
+		} else if (strcmp(*argv, "via") == 0) {
+			inet_prefix addr;
+			int family;
+			gw_ok = 1;
+			NEXT_ARG();
+			family = read_family(*argv);
+			if (family == AF_UNSPEC)
+				family = req.r.rtm_family;
+			else
+				NEXT_ARG();
+			get_addr(&addr, *argv, family);
+			if (req.r.rtm_family == AF_UNSPEC)
+				req.r.rtm_family = addr.family;
+			if (addr.family == req.r.rtm_family)
+				addattr_l(&req.n, sizeof(req), RTA_GATEWAY, &addr.data, addr.bytelen);
+			else
+				addattr_l(&req.n, sizeof(req), RTA_VIA, &addr.family, addr.bytelen+2);
 		} else if (strcmp(*argv, "from") == 0) {
 			inet_prefix addr;
 			NEXT_ARG();
@@ -782,7 +875,7 @@
 			req.r.rtm_tos = tos;
 		} else if (matches(*argv, "metric") == 0 ||
 			   matches(*argv, "priority") == 0 ||
-			   matches(*argv, "preference") == 0) {
+			   strcmp(*argv, "preference") == 0) {
 			__u32 metric;
 			NEXT_ARG();
 			if (get_u32(&metric, *argv, 0))
@@ -979,6 +1072,18 @@
 			   strcmp(*argv, "oif") == 0) {
 			NEXT_ARG();
 			d = *argv;
+		} else if (matches(*argv, "pref") == 0) {
+			__u8 pref;
+			NEXT_ARG();
+			if (strcmp(*argv, "low") == 0)
+				pref = ICMPV6_ROUTER_PREF_LOW;
+			else if (strcmp(*argv, "medium") == 0)
+				pref = ICMPV6_ROUTER_PREF_MEDIUM;
+			else if (strcmp(*argv, "high") == 0)
+				pref = ICMPV6_ROUTER_PREF_HIGH;
+			else if (get_u8(&pref, *argv, 0))
+				invarg("\"pref\" value is invalid\n", *argv);
+			addattr8(&req.n, sizeof(req), RTA_PREF, pref);
 		} else {
 			int type;
 			inet_prefix dst;
@@ -1248,8 +1353,14 @@
 			get_unsigned(&mark, *argv, 0);
 			filter.markmask = -1;
 		} else if (strcmp(*argv, "via") == 0) {
+			int family;
 			NEXT_ARG();
-			get_prefix(&filter.rvia, *argv, do_ipv6);
+			family = read_family(*argv);
+			if (family == AF_UNSPEC)
+				family = do_ipv6;
+			else
+				NEXT_ARG();
+			get_prefix(&filter.rvia, *argv, family);
 		} else if (strcmp(*argv, "src") == 0) {
 			NEXT_ARG();
 			get_prefix(&filter.rprefsrc, *argv, do_ipv6);
@@ -1551,6 +1662,8 @@
 			tb[RTA_OIF]->rta_type = 0;
 		if (tb[RTA_GATEWAY])
 			tb[RTA_GATEWAY]->rta_type = 0;
+		if (tb[RTA_VIA])
+			tb[RTA_VIA]->rta_type = 0;
 		if (!idev && tb[RTA_IIF])
 			tb[RTA_IIF]->rta_type = 0;
 		req.n.nlmsg_flags = NLM_F_REQUEST;

diff --git a/ip/iprule.c b/ip/iprule.c
index 54ed753..967969c 100644
--- a/ip/iprule.c
+++ b/ip/iprule.c

@@ -82,8 +82,9 @@
 	if (tb[FRA_SRC]) {
 		if (r->rtm_src_len != host_len) {
 			fprintf(fp, "from %s/%u ", rt_addr_n2a(r->rtm_family,
-							 RTA_DATA(tb[FRA_SRC]),
-							 abuf, sizeof(abuf)),
+						       RTA_PAYLOAD(tb[FRA_SRC]),
+						       RTA_DATA(tb[FRA_SRC]),
+						       abuf, sizeof(abuf)),
 				r->rtm_src_len
 				);
 		} else {
@@ -102,8 +103,9 @@
 	if (tb[FRA_DST]) {
 		if (r->rtm_dst_len != host_len) {
 			fprintf(fp, "to %s/%u ", rt_addr_n2a(r->rtm_family,
-							 RTA_DATA(tb[FRA_DST]),
-							 abuf, sizeof(abuf)),
+						       RTA_PAYLOAD(tb[FRA_DST]),
+						       RTA_DATA(tb[FRA_DST]),
+						       abuf, sizeof(abuf)),
 				r->rtm_dst_len
 				);
 		} else {

diff --git a/ip/iptunnel.c b/ip/iptunnel.c
index caf8a28..be84b83 100644
--- a/ip/iptunnel.c
+++ b/ip/iptunnel.c

@@ -342,8 +342,8 @@
 	printf("%s: %s/ip  remote %s  local %s ",
 	       p->name,
 	       tnl_strproto(p->iph.protocol),
-	       p->iph.daddr ? format_host(AF_INET, 4, &p->iph.daddr, s1, sizeof(s1))  : "any",
-	       p->iph.saddr ? rt_addr_n2a(AF_INET, &p->iph.saddr, s2, sizeof(s2)) : "any");
+	       p->iph.daddr ? format_host(AF_INET, 4, &p->iph.daddr, s1, sizeof(s1)) : "any",
+	       p->iph.saddr ? rt_addr_n2a(AF_INET, 4, &p->iph.saddr, s2, sizeof(s2)) : "any");
 
 	if (p->iph.protocol == IPPROTO_IPV6 && (p->i_flags & SIT_ISATAP)) {
 		struct ip_tunnel_prl prl[16];

diff --git a/ip/ipxfrm.c b/ip/ipxfrm.c
index 95f91a5..9aaf58d 100644
--- a/ip/ipxfrm.c
+++ b/ip/ipxfrm.c

@@ -288,10 +288,10 @@
 		fputs(title, fp);
 
 	memset(abuf, '\0', sizeof(abuf));
-	fprintf(fp, "src %s ", rt_addr_n2a(family,
+	fprintf(fp, "src %s ", rt_addr_n2a(family, sizeof(*saddr),
 					   saddr, abuf, sizeof(abuf)));
 	memset(abuf, '\0', sizeof(abuf));
-	fprintf(fp, "dst %s", rt_addr_n2a(family,
+	fprintf(fp, "dst %s", rt_addr_n2a(family, sizeof(id->daddr),
 					  &id->daddr, abuf, sizeof(abuf)));
 	fprintf(fp, "%s", _SL_);
 
@@ -455,11 +455,15 @@
 		fputs(prefix, fp);
 
 	memset(abuf, '\0', sizeof(abuf));
-	fprintf(fp, "src %s/%u ", rt_addr_n2a(f, &sel->saddr, abuf, sizeof(abuf)),
+	fprintf(fp, "src %s/%u ",
+		rt_addr_n2a(f, sizeof(sel->saddr), &sel->saddr,
+			    abuf, sizeof(abuf)),
 		sel->prefixlen_s);
 
 	memset(abuf, '\0', sizeof(abuf));
-	fprintf(fp, "dst %s/%u ", rt_addr_n2a(f, &sel->daddr, abuf, sizeof(abuf)),
+	fprintf(fp, "dst %s/%u ",
+		rt_addr_n2a(f, sizeof(sel->daddr), &sel->daddr,
+			    abuf, sizeof(abuf)),
 		sel->prefixlen_d);
 
 	if (sel->proto)
@@ -755,7 +759,8 @@
 
 		memset(abuf, '\0', sizeof(abuf));
 		fprintf(fp, "addr %s",
-			rt_addr_n2a(family, &e->encap_oa, abuf, sizeof(abuf)));
+			rt_addr_n2a(family, sizeof(e->encap_oa), &e->encap_oa,
+				    abuf, sizeof(abuf)));
 		fprintf(fp, "%s", _SL_);
 	}
 
@@ -783,7 +788,7 @@
 
 		memset(abuf, '\0', sizeof(abuf));
 		fprintf(fp, "%s",
-			rt_addr_n2a(family, coa,
+			rt_addr_n2a(family, sizeof(*coa), coa,
 				    abuf, sizeof(abuf)));
 		fprintf(fp, "%s", _SL_);
 	}

diff --git a/ip/link_ip6tnl.c b/ip/link_ip6tnl.c
index 5ed3d5a..cf59a93 100644
--- a/ip/link_ip6tnl.c
+++ b/ip/link_ip6tnl.c

@@ -285,6 +285,7 @@
 	if (tb[IFLA_IPTUN_REMOTE]) {
 		fprintf(f, "remote %s ",
 			rt_addr_n2a(AF_INET6,
+				    RTA_PAYLOAD(tb[IFLA_IPTUN_REMOTE]),
 				    RTA_DATA(tb[IFLA_IPTUN_REMOTE]),
 				    s1, sizeof(s1)));
 	}
@@ -292,6 +293,7 @@
 	if (tb[IFLA_IPTUN_LOCAL]) {
 		fprintf(f, "local %s ",
 			rt_addr_n2a(AF_INET6,
+				    RTA_PAYLOAD(tb[IFLA_IPTUN_LOCAL]),
 				    RTA_DATA(tb[IFLA_IPTUN_LOCAL]),
 				    s1, sizeof(s1)));
 	}

diff --git a/ip/xfrm_monitor.c b/ip/xfrm_monitor.c
index 50116a7..58c7d7f 100644
--- a/ip/xfrm_monitor.c
+++ b/ip/xfrm_monitor.c

@@ -27,7 +27,9 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <netinet/in.h>
 #include <linux/xfrm.h>
+
 #include "utils.h"
 #include "xfrm.h"
 #include "ip_common.h"
@@ -227,7 +229,8 @@
 
 	buf[0] = 0;
 	fprintf(fp, "dst %s ",
-		rt_addr_n2a(sa_id->family, &sa_id->daddr, buf, sizeof(buf)));
+		rt_addr_n2a(sa_id->family, sizeof(sa_id->daddr), &sa_id->daddr,
+			    buf, sizeof(buf)));
 
 	fprintf(fp, " reqid 0x%x", reqid);
 
@@ -246,7 +249,8 @@
 	xfrm_ae_flags_print(id->flags, arg);
 	fprintf(fp,"\n\t");
 	memset(abuf, '\0', sizeof(abuf));
-	fprintf(fp, "src %s ", rt_addr_n2a(id->sa_id.family, &id->saddr,
+	fprintf(fp, "src %s ", rt_addr_n2a(id->sa_id.family,
+					   sizeof(id->saddr), &id->saddr,
 					   abuf, sizeof(abuf)));
 
 	xfrm_usersa_print(&id->sa_id, id->reqid, fp);
@@ -262,7 +266,7 @@
 	char buf[256];
 
 	buf[0] = 0;
-	fprintf(fp, "%s", rt_addr_n2a(family, a, buf, sizeof(buf)));
+	fprintf(fp, "%s", rt_addr_n2a(family, sizeof(*a), a, buf, sizeof(buf)));
 }
 
 static int xfrm_mapping_print(const struct sockaddr_nl *who,

diff --git a/ip/xfrm_policy.c b/ip/xfrm_policy.c
index 2337d35..7333dc5 100644
--- a/ip/xfrm_policy.c
+++ b/ip/xfrm_policy.c

@@ -63,7 +63,8 @@
 	fprintf(stderr, "        [ index INDEX ] [ ptype PTYPE ] [ action ACTION ] [ priority PRIORITY ]\n");
 	fprintf(stderr, "        [ flag FLAG-LIST ]\n");
 	fprintf(stderr, "Usage: ip xfrm policy flush [ ptype PTYPE ]\n");
-	fprintf(stderr, "Usage: ip xfrm count\n");
+	fprintf(stderr, "Usage: ip xfrm policy count\n");
+	fprintf(stderr, "Usage: ip xfrm policy set [ hthresh4 LBITS RBITS ] [ hthresh6 LBITS RBITS ]\n");
 	fprintf(stderr, "SELECTOR := [ src ADDR[/PLEN] ] [ dst ADDR[/PLEN] ] [ dev DEV ] [ UPSPEC ]\n");
 	fprintf(stderr, "UPSPEC := proto { { ");
 	fprintf(stderr, "%s | ", strxf_proto(IPPROTO_TCP));
@@ -934,7 +935,7 @@
 			fprintf(fp,")");
 		}
 
-		fprintf(fp,"\n");
+		fprintf(fp, "%s", _SL_);
 	}
 	if (show_stats > 1) {
 		struct xfrmu_spdhinfo *sh;
@@ -948,13 +949,109 @@
 			fprintf(fp,"\t SPD buckets:");
 			fprintf(fp," count %d", sh->spdhcnt);
 			fprintf(fp," Max %d", sh->spdhmcnt);
+			fprintf(fp, "%s", _SL_);
+		}
+		if (tb[XFRMA_SPD_IPV4_HTHRESH]) {
+			struct xfrmu_spdhthresh *th;
+			if (RTA_PAYLOAD(tb[XFRMA_SPD_IPV4_HTHRESH]) < sizeof(*th)) {
+				fprintf(stderr, "SPDinfo: Wrong len %d\n", len);
+				return -1;
+			}
+			th = RTA_DATA(tb[XFRMA_SPD_IPV4_HTHRESH]);
+			fprintf(fp,"\t SPD IPv4 thresholds:");
+			fprintf(fp," local %d", th->lbits);
+			fprintf(fp," remote %d", th->rbits);
+			fprintf(fp, "%s", _SL_);
+
+		}
+		if (tb[XFRMA_SPD_IPV6_HTHRESH]) {
+			struct xfrmu_spdhthresh *th;
+			if (RTA_PAYLOAD(tb[XFRMA_SPD_IPV6_HTHRESH]) < sizeof(*th)) {
+				fprintf(stderr, "SPDinfo: Wrong len %d\n", len);
+				return -1;
+			}
+			th = RTA_DATA(tb[XFRMA_SPD_IPV6_HTHRESH]);
+			fprintf(fp,"\t SPD IPv6 thresholds:");
+			fprintf(fp," local %d", th->lbits);
+			fprintf(fp," remote %d", th->rbits);
+			fprintf(fp, "%s", _SL_);
 		}
 	}
-	fprintf(fp,"\n");
+
+	if (oneline)
+		fprintf(fp, "\n");
 
         return 0;
 }
 
+static int xfrm_spd_setinfo(int argc, char **argv)
+{
+	struct rtnl_handle rth;
+	struct {
+		struct nlmsghdr			n;
+		__u32				flags;
+		char				buf[RTA_BUF_SIZE];
+	} req;
+
+	char *thr4 = NULL;
+	char *thr6 = NULL;
+
+	memset(&req, 0, sizeof(req));
+
+	req.n.nlmsg_len = NLMSG_LENGTH(sizeof(__u32));
+	req.n.nlmsg_flags = NLM_F_REQUEST;
+	req.n.nlmsg_type = XFRM_MSG_NEWSPDINFO;
+	req.flags = 0XFFFFFFFF;
+
+	while (argc > 0) {
+		if (strcmp(*argv, "hthresh4") == 0) {
+			struct xfrmu_spdhthresh thr;
+
+			if (thr4)
+				duparg("hthresh4", *argv);
+			thr4 = *argv;
+			NEXT_ARG();
+			if (get_u8(&thr.lbits, *argv, 0) || thr.lbits > 32)
+				invarg("hthresh4 LBITS value is invalid", *argv);
+			NEXT_ARG();
+			if (get_u8(&thr.rbits, *argv, 0) || thr.rbits > 32)
+				invarg("hthresh4 RBITS value is invalid", *argv);
+
+			addattr_l(&req.n, sizeof(req), XFRMA_SPD_IPV4_HTHRESH,
+				  (void *)&thr, sizeof(thr));
+		} else if (strcmp(*argv, "hthresh6") == 0) {
+			struct xfrmu_spdhthresh thr;
+
+			if (thr6)
+				duparg("hthresh6", *argv);
+			thr6 = *argv;
+			NEXT_ARG();
+			if (get_u8(&thr.lbits, *argv, 0) || thr.lbits > 128)
+				invarg("hthresh6 LBITS value is invalid", *argv);
+			NEXT_ARG();
+			if (get_u8(&thr.rbits, *argv, 0) || thr.rbits > 128)
+				invarg("hthresh6 RBITS value is invalid", *argv);
+
+			addattr_l(&req.n, sizeof(req), XFRMA_SPD_IPV6_HTHRESH,
+				  (void *)&thr, sizeof(thr));
+		} else {
+			invarg("unknown", *argv);
+		}
+
+		argc--; argv++;
+	}
+
+	if (rtnl_open_byproto(&rth, 0, NETLINK_XFRM) < 0)
+		exit(1);
+
+	if (rtnl_talk(&rth, &req.n, 0, 0, NULL) < 0)
+		exit(2);
+
+	rtnl_close(&rth);
+
+	return 0;
+}
+
 static int xfrm_spd_getinfo(int argc, char **argv)
 {
 	struct rtnl_handle rth;
@@ -1058,6 +1155,8 @@
 		return xfrm_policy_flush(argc-1, argv+1);
 	if (matches(*argv, "count") == 0)
 		return xfrm_spd_getinfo(argc, argv);
+	if (matches(*argv, "set") == 0)
+		return xfrm_spd_setinfo(argc-1, argv+1);
 	if (matches(*argv, "help") == 0)
 		usage();
 	fprintf(stderr, "Command \"%s\" is unknown, try \"ip xfrm policy help\".\n", *argv);

diff --git a/lib/ll_addr.c b/lib/ll_addr.c
index c12ab07..2ce9abf 100644
--- a/lib/ll_addr.c
+++ b/lib/ll_addr.c

@@ -29,7 +29,7 @@
 #include "utils.h"
 
 
-const char *ll_addr_n2a(unsigned char *addr, int alen, int type, char *buf, int blen)
+const char *ll_addr_n2a(const unsigned char *addr, int alen, int type, char *buf, int blen)
 {
 	int i;
 	int l;

diff --git a/lib/mpls_ntop.c b/lib/mpls_ntop.c
new file mode 100644
index 0000000..945d6d5
--- /dev/null
+++ b/lib/mpls_ntop.c

@@ -0,0 +1,48 @@
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <linux/mpls.h>
+
+#include "utils.h"
+
+static const char *mpls_ntop1(const struct mpls_label *addr, char *buf, size_t buflen)
+{
+	size_t destlen = buflen;
+	char *dest = buf;
+	int count;
+
+	for (count = 0; count < MPLS_MAX_LABELS; count++) {
+		uint32_t entry = ntohl(addr[count].entry);
+		uint32_t label = (entry & MPLS_LS_LABEL_MASK) >> MPLS_LS_LABEL_SHIFT;
+		int len = snprintf(dest, destlen, "%u", label);
+
+		/* Is this the end? */
+		if (entry & MPLS_LS_S_MASK)
+			return buf;
+
+
+		dest += len;
+		destlen -= len;
+		if (destlen) {
+			*dest = '/';
+			dest++;
+			destlen--;
+		}
+	}
+	errno = -E2BIG;
+	return NULL;
+}
+
+const char *mpls_ntop(int af, const void *addr, char *buf, size_t buflen)
+{
+	switch(af) {
+	case AF_MPLS:
+		errno = 0;
+		return mpls_ntop1((struct mpls_label *)addr, buf, buflen);
+	default:
+		errno = EAFNOSUPPORT;
+	}
+
+	return NULL;
+}

diff --git a/lib/mpls_pton.c b/lib/mpls_pton.c
new file mode 100644
index 0000000..bd448cf
--- /dev/null
+++ b/lib/mpls_pton.c

@@ -0,0 +1,58 @@
+#include <errno.h>
+#include <string.h>
+#include <sys/types.h>
+#include <netinet/in.h>
+#include <linux/mpls.h>
+
+#include "utils.h"
+
+
+static int mpls_pton1(const char *name, struct mpls_label *addr)
+{
+	char *endp;
+	unsigned count;
+
+	for (count = 0; count < MPLS_MAX_LABELS; count++) {
+		unsigned long label;
+
+		label = strtoul(name, &endp, 0);
+		/* Fail when the label value is out or range */
+		if (label >= (1 << 20))
+			return 0;
+
+		if (endp == name) /* no digits */
+			return 0;
+
+		addr->entry = htonl(label << MPLS_LS_LABEL_SHIFT);
+		if (*endp == '\0') {
+			addr->entry |= htonl(1 << MPLS_LS_S_SHIFT);
+			return 1;
+		}
+
+		/* Bad character in the address */
+		if (*endp != '/')
+			return 0;
+
+		name = endp + 1;
+		addr += 1;
+	}
+	/* The address was too long */
+	return 0;
+}
+
+int mpls_pton(int af, const char *src, void *addr)
+{
+	int err;
+
+	switch(af) {
+	case AF_MPLS:
+		errno = 0;
+		err = mpls_pton1(src, (struct mpls_label *)addr);
+		break;
+	default:
+		errno = EAFNOSUPPORT;
+		err = -1;
+	}
+
+	return err;
+}

diff --git a/lib/utils.c b/lib/utils.c
index 0d08a86..428ad8f 100644
--- a/lib/utils.c
+++ b/lib/utils.c

@@ -25,11 +25,13 @@
 #include <asm/types.h>
 #include <linux/pkt_sched.h>
 #include <linux/param.h>
+#include <linux/if_arp.h>
+#include <linux/mpls.h>
 #include <time.h>
 #include <sys/time.h>
 #include <errno.h>
 
-
+#include "rt_names.h"
 #include "utils.h"
 #include "namespace.h"
 
@@ -389,7 +391,7 @@
 	if (strcmp(name, "default") == 0 ||
 	    strcmp(name, "all") == 0 ||
 	    strcmp(name, "any") == 0) {
-		if (family == AF_DECnet)
+		if ((family == AF_DECnet) || (family == AF_MPLS))
 			return -1;
 		addr->family = family;
 		addr->bytelen = (family == AF_INET6 ? 16 : 4);
@@ -397,6 +399,18 @@
 		return 0;
 	}
 
+	if (family == AF_PACKET) {
+		int len;
+		len = ll_addr_a2n((char *)&addr->data, sizeof(addr->data), name);
+		if (len < 0)
+			return -1;
+
+		addr->family = AF_PACKET;
+		addr->bytelen = len;
+		addr->bitlen = len * 8;
+		return 0;
+	}
+
 	if (strchr(name, ':')) {
 		addr->family = AF_INET6;
 		if (family != AF_UNSPEC && family != AF_INET6)
@@ -419,6 +433,23 @@
 		return 0;
 	}
 
+	if (family == AF_MPLS) {
+		int i;
+		addr->family = AF_MPLS;
+		if (mpls_pton(AF_MPLS, name, addr->data) <= 0)
+			return -1;
+		addr->bytelen = 4;
+		addr->bitlen = 20;
+		/* How many bytes do I need? */
+		for (i = 0; i < 8; i++) {
+			if (ntohl(addr->data[i]) & MPLS_LS_S_MASK) {
+				addr->bytelen = (i + 1)*4;
+				break;
+			}
+		}
+		return 0;
+	}
+
 	addr->family = AF_INET;
 	if (family != AF_UNSPEC && family != AF_INET)
 		return -1;
@@ -442,6 +473,8 @@
 		return 16;
 	case AF_IPX:
 		return 80;
+	case AF_MPLS:
+		return 20;
 	}
 
 	return 0;
@@ -463,7 +496,7 @@
 	if (strcmp(arg, "default") == 0 ||
 	    strcmp(arg, "any") == 0 ||
 	    strcmp(arg, "all") == 0) {
-		if (family == AF_DECnet)
+		if ((family == AF_DECnet) || (family = AF_MPLS))
 			return -1;
 		dst->family = family;
 		dst->bytelen = 0;
@@ -497,10 +530,6 @@
 
 int get_addr(inet_prefix *dst, const char *arg, int family)
 {
-	if (family == AF_PACKET) {
-		fprintf(stderr, "Error: \"%s\" may be inet address, but it is not allowed in this context.\n", arg);
-		exit(1);
-	}
 	if (get_addr_1(dst, arg, family)) {
 		fprintf(stderr, "Error: an inet address is expected rather than \"%s\".\n", arg);
 		exit(1);
@@ -636,12 +665,14 @@
 	return sysconf(_SC_CLK_TCK);
 }
 
-const char *rt_addr_n2a(int af, const void *addr, char *buf, int buflen)
+const char *rt_addr_n2a(int af, int len, const void *addr, char *buf, int buflen)
 {
 	switch (af) {
 	case AF_INET:
 	case AF_INET6:
 		return inet_ntop(af, addr, buf, buflen);
+	case AF_MPLS:
+		return mpls_ntop(af, addr, buf, buflen);
 	case AF_IPX:
 		return ipx_ntop(af, addr, buf, buflen);
 	case AF_DECnet:
@@ -650,11 +681,52 @@
 		memcpy(dna.a_addr, addr, 2);
 		return dnet_ntop(af, &dna, buf, buflen);
 	}
+	case AF_PACKET:
+		return ll_addr_n2a(addr, len, ARPHRD_VOID, buf, buflen);
 	default:
 		return "???";
 	}
 }
 
+int read_family(const char *name)
+{
+	int family = AF_UNSPEC;
+	if (strcmp(name, "inet") == 0)
+		family = AF_INET;
+	else if (strcmp(name, "inet6") == 0)
+		family = AF_INET6;
+	else if (strcmp(name, "dnet") == 0)
+		family = AF_DECnet;
+	else if (strcmp(name, "link") == 0)
+		family = AF_PACKET;
+	else if (strcmp(name, "ipx") == 0)
+		family = AF_IPX;
+	else if (strcmp(name, "mpls") == 0)
+		family = AF_MPLS;
+	else if (strcmp(name, "bridge") == 0)
+		family = AF_BRIDGE;
+	return family;
+}
+
+const char *family_name(int family)
+{
+	if (family == AF_INET)
+		return "inet";
+	if (family == AF_INET6)
+		return "inet6";
+	if (family == AF_DECnet)
+		return "dnet";
+	if (family == AF_PACKET)
+		return "link";
+	if (family == AF_IPX)
+		return "ipx";
+	if (family == AF_MPLS)
+		return "mpls";
+	if (family == AF_BRIDGE)
+		return "bridge";
+	return "???";
+}
+
 #ifdef RESOLVE_HOSTNAMES
 struct namerec
 {
@@ -723,7 +795,7 @@
 			return n;
 	}
 #endif
-	return rt_addr_n2a(af, addr, buf, buflen);
+	return rt_addr_n2a(af, len, addr, buf, buflen);
 }
 
 

diff --git a/man/man8/ip-route.8.in b/man/man8/ip-route.8.in
index d53cc76..72d8d77 100644
--- a/man/man8/ip-route.8.in
+++ b/man/man8/ip-route.8.in

@@ -81,18 +81,28 @@
 .ti -8
 .IR NH " := [ "
 .B  via
-.IR ADDRESS " ] [ "
+[
+.IR FAMILY " ] " ADDRESS " ] [ "
 .B  dev
 .IR STRING " ] [ "
 .B  weight
 .IR NUMBER " ] " NHFLAGS
 
 .ti -8
+.IR FAMILY " := [ "
+.BR inet " | " inet6 " | " ipx " | " dnet " | " mpls " | " bridge " | " link " ]"
+
+.ti -8
 .IR OPTIONS " := " FLAGS " [ "
 .B  mtu
 .IR NUMBER " ] [ "
 .B  advmss
 .IR NUMBER " ] [ "
+.B  as
+[
+.B to
+]
+.IR ADDRESS " ]"
 .B  rtt
 .IR TIME " ] [ "
 .B  rttvar
@@ -119,6 +129,8 @@
 .IR BOOL " ] [ "
 .B  congctl
 .IR NAME " ]"
+.B  pref
+.IR PREF " ]"
 
 .ti -8
 .IR TYPE " := [ "
@@ -148,6 +160,10 @@
 .IR FEATURES " := [ "
 .BR ecn " | ]"
 
+.ti -8
+.IR PREF " := [ "
+.BR low " | " medium " | " high " ]"
+
 
 .SH DESCRIPTION
 .B ip route
@@ -333,9 +349,10 @@
 the output device name.
 
 .TP
-.BI via " ADDRESS"
-the address of the nexthop router. Actually, the sense of this field
-depends on the route type. For normal
+.BI via " [ FAMILY ] ADDRESS"
+the address of the nexthop router, in the address family FAMILY.
+Actually, the sense of this field depends on the route type.  For
+normal
 .B unicast
 routes it is either the true next hop router or, if it is a direct
 route installed in BSD compatibility mode, it can be a local address
@@ -472,7 +489,7 @@
 argument lists:
 
 .in +8
-.BI via " ADDRESS"
+.BI via " [ FAMILY ] ADDRESS"
 - is the nexthop router.
 .sp
 
@@ -551,6 +568,28 @@
 .B onlink
 pretend that the nexthop is directly attached to this link,
 even if it does not match any interface prefix.
+
+.TP
+.BI pref " PREF"
+the IPv6 route preference.
+.I PREF
+is a string specifying the route preference as defined in RFC4191 for Router
+Discovery messages. Namely:
+
+.in +8
+.B low
+- the route has a lowest priority
+.sp
+
+.B medium
+- the route has a default priority
+.sp
+
+.B high
+- the route has a highest priority
+.sp
+
+.in -8
 .RE
 
 .TP
@@ -669,7 +708,7 @@
 only list routes going via this device.
 
 .TP
-.BI via " PREFIX"
+.BI via " [ FAMILY ] PREFIX"
 only list routes going via the nexthop routers selected by
 .IR PREFIX "."
 

diff --git a/man/man8/ip-xfrm.8 b/man/man8/ip-xfrm.8
index c9d2a2e..29b397f 100644
--- a/man/man8/ip-xfrm.8
+++ b/man/man8/ip-xfrm.8

@@ -257,6 +257,13 @@
 .B "ip xfrm policy count"
 
 .ti -8
+.B "ip xfrm policy set"
+.RB "[ " hthresh4
+.IR LBITS " " RBITS " ]"
+.RB "[ " hthresh6
+.IR LBITS " " RBITS " ]"
+
+.ti -8
 .IR SELECTOR " :="
 .RB "[ " src
 .IR ADDR "[/" PLEN "] ]"
@@ -360,6 +367,13 @@
 .BR "ip xfrm monitor" " [ " all " |"
 .IR LISTofXFRM-OBJECTS " ]"
 
+.ti -8
+.IR LISTofXFRM-OBJECTS " := [ " LISTofXFRM-OBJECTS " ] " XFRM-OBJECT
+
+.ti -8
+.IR XFRM-OBJECT " := "
+.BR acquire " | " expire " | " SA " | " policy " | " aevent " | " report
+
 .in -8
 .ad b
 
@@ -385,7 +399,6 @@
 ip xfrm state list	print out the list of existing state in xfrm
 ip xfrm state flush	flush all state in xfrm
 ip xfrm state count	count all existing state in xfrm
-ip xfrm monitor 	state monitoring for xfrm objects
 .TE
 
 .TP
@@ -507,7 +520,9 @@
 .BR espinudp " or " espinudp-nonike ","
 .RI "using source port " SPORT ", destination port "  DPORT
 .RI ", and original address " OADDR "."
+
 .sp
+.PP
 .TS
 l l.
 ip xfrm policy add	add a new policy
@@ -517,7 +532,6 @@
 ip xfrm policy deleteall	delete all existing xfrm policies
 ip xfrm policy list	print out the list of xfrm policies
 ip xfrm policy flush	flush policies
-ip xfrm policy count	count existing policies
 .TE
 
 .TP
@@ -612,7 +626,50 @@
 can be
 .BR required " (default) or " use "."
 
+.sp
+.PP
+.TS
+l l.
+ip xfrm policy count	count existing policies
+.TE
+
+.PP
+Use one or more -s options to display more details, including policy hash table
+information.
+
+.sp
+.PP
+.TS
+l l.
+ip xfrm policy set	configure the policy hash table
+.TE
+
+.PP
+Security policies whose address prefix lengths are greater than or equal
+policy hash table thresholds are hashed. Others are stored in the
+policy_inexact chained list.
+
+.TP
+.I LBITS
+specifies the minimum local address prefix length of policies that are
+stored in the Security Policy Database hash table.
+
+.TP
+.I RBITS
+specifies the minimum remote address prefix length of policies that are
+stored in the Security Policy Database hash table.
+
+.sp
+.PP
+.TS
+l l.
+ip xfrm monitor 	state monitoring for xfrm objects
+.TE
+
+.PP
 The xfrm objects to monitor can be optionally specified.
 
 .SH AUTHOR
 Manpage revised by David Ward <david.ward@ll.mit.edu>
+.br
+Manpage revised by Christophe Gouault <christophe.gouault@6wind.com>

diff --git a/man/man8/ip.8 b/man/man8/ip.8
index 4cd71de..44d1ee6 100644
--- a/man/man8/ip.8
+++ b/man/man8/ip.8

@@ -73,7 +73,7 @@
 .TP
 .BR "\-f" , " \-family " <FAMILY>
 Specifies the protocol family to use. The protocol family identifier can be one of
-.BR "inet" , " inet6" , " bridge" , " ipx" , " dnet"
+.BR "inet" , " inet6" , " bridge" , " ipx" , " dnet" , " mpls"
 or
 .BR link .
 If this option is not present,
@@ -115,6 +115,11 @@
 .BR "\-family ipx" .
 
 .TP
+.B \-M
+shortcut for
+.BR "\-family mpls" .
+
+.TP
 .B \-0
 shortcut for
 .BR "\-family link" .

diff --git a/tc/Makefile b/tc/Makefile
index d831a15..2eff082 100644
--- a/tc/Makefile
+++ b/tc/Makefile

@@ -89,6 +89,11 @@
   endif
 endif
 
+ifeq ($(TC_CONFIG_ELF),y)
+  CFLAGS += -DHAVE_ELF
+  LDLIBS += -lelf
+endif
+
 TCOBJ += $(TCMODULES)
 LDLIBS += -L. -ltc -lm
 

diff --git a/tc/f_bpf.c b/tc/f_bpf.c
index e2af94e..8bdd602 100644
--- a/tc/f_bpf.c
+++ b/tc/f_bpf.c

@@ -14,6 +14,7 @@
 #include <unistd.h>
 #include <syslog.h>
 #include <fcntl.h>
+#include <libgen.h>
 #include <sys/socket.h>
 #include <netinet/in.h>
 #include <arpa/inet.h>
@@ -28,20 +29,36 @@
 #include "tc_util.h"
 #include "tc_bpf.h"
 
+static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_CLS;
+
 static void explain(void)
 {
 	fprintf(stderr, "Usage: ... bpf ...\n");
 	fprintf(stderr, "\n");
-	fprintf(stderr, " [inline]:     run bytecode BPF_BYTECODE\n");
-	fprintf(stderr, " [from file]:  run bytecode-file FILE\n");
+	fprintf(stderr, "BPF use case:\n");
+	fprintf(stderr, " bytecode BPF_BYTECODE\n");
+	fprintf(stderr, " bytecode-file FILE\n");
 	fprintf(stderr, "\n");
-	fprintf(stderr, "               [ action ACTION_SPEC ]\n");
-	fprintf(stderr, "               [ classid CLASSID ]\n");
+	fprintf(stderr, "eBPF use case:\n");
+	fprintf(stderr, " object-file FILE [ section CLS_NAME ] [ export UDS_FILE ]\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Common remaining options:\n");
+	fprintf(stderr, " [ action ACTION_SPEC ]\n");
+	fprintf(stderr, " [ classid CLASSID ]\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n");
-	fprintf(stderr, "      c,t,f,k and s are decimals; s denotes number of 4-tuples\n");
-	fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string\n");
-	fprintf(stderr, "\nACTION_SPEC := ... look at individual actions\n");
+	fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n");
+	fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where CLS_NAME refers to the section name containing the\n");
+	fprintf(stderr, "classifier (default \'%s\').\n", bpf_default_section(bpf_type));
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where UDS_FILE points to a unix domain socket file in order\n");
+	fprintf(stderr, "to hand off control of all created eBPF maps to an agent.\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "ACTION_SPEC := ... look at individual actions\n");
 	fprintf(stderr, "NOTE: CLASSID is parsed as hexadecimal input.\n");
 }
 
@@ -49,8 +66,13 @@
 			 int argc, char **argv, struct nlmsghdr *n)
 {
 	struct tcmsg *t = NLMSG_DATA(n);
+	const char *bpf_uds_name = NULL;
+	const char *bpf_sec_name = NULL;
+	char *bpf_obj = NULL;
 	struct rtattr *tail;
+	bool seen_run = false;
 	long h = 0;
+	int ret = 0;
 
 	if (argc == 0)
 		return 0;
@@ -66,39 +88,85 @@
 
 	t->tcm_handle = h;
 
-	tail = (struct rtattr*)(((void*)n)+NLMSG_ALIGN(n->nlmsg_len));
+	tail = (struct rtattr *)(((void *)n) + NLMSG_ALIGN(n->nlmsg_len));
 	addattr_l(n, MAX_MSG, TCA_OPTIONS, NULL, 0);
 
 	while (argc > 0) {
 		if (matches(*argv, "run") == 0) {
-			bool from_file;
 			struct sock_filter bpf_ops[BPF_MAXINSNS];
-			__u16 bpf_len;
+			bool from_file, ebpf;
 			int ret;
 
 			NEXT_ARG();
-			if (strcmp(*argv, "bytecode-file") == 0) {
+opt_bpf:
+			bpf_sec_name = bpf_default_section(bpf_type);
+			ebpf = false;
+			seen_run = true;
+
+			if (strcmp(*argv, "bytecode-file") == 0 ||
+			    strcmp(*argv, "bcf") == 0) {
 				from_file = true;
-			} else if (strcmp(*argv, "bytecode") == 0) {
+			} else if (strcmp(*argv, "bytecode") == 0 ||
+				   strcmp(*argv, "bc") == 0) {
 				from_file = false;
+			} else if (strcmp(*argv, "object-file") == 0 ||
+				   strcmp(*argv, "obj") == 0) {
+				ebpf = true;
 			} else {
 				fprintf(stderr, "What is \"%s\"?\n", *argv);
 				explain();
 				return -1;
 			}
+
 			NEXT_ARG();
-			ret = bpf_parse_ops(argc, argv, bpf_ops, from_file);
+			if (ebpf) {
+				bpf_obj = *argv;
+				NEXT_ARG();
+
+				if (strcmp(*argv, "section") == 0 ||
+				    strcmp(*argv, "sec") == 0) {
+					NEXT_ARG();
+					bpf_sec_name = *argv;
+					NEXT_ARG();
+				}
+				if (strcmp(*argv, "export") == 0 ||
+				    strcmp(*argv, "exp") == 0) {
+					NEXT_ARG();
+					bpf_uds_name = *argv;
+					NEXT_ARG();
+				}
+
+				PREV_ARG();
+			}
+
+			ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name) :
+				     bpf_parse_ops(argc, argv, bpf_ops, from_file);
 			if (ret < 0) {
-				fprintf(stderr, "Illegal \"bytecode\"\n");
+				fprintf(stderr, "%s\n", ebpf ?
+					"Could not load object" :
+					"Illegal \"bytecode\"");
 				return -1;
 			}
-			bpf_len = ret;
-			addattr16(n, MAX_MSG, TCA_BPF_OPS_LEN, bpf_len);
-			addattr_l(n, MAX_MSG, TCA_BPF_OPS, &bpf_ops,
-				  bpf_len * sizeof(struct sock_filter));
+
+			if (ebpf) {
+				char bpf_name[256];
+
+				bpf_obj = basename(bpf_obj);
+
+				snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]",
+					 bpf_obj, bpf_sec_name);
+
+				addattr32(n, MAX_MSG, TCA_BPF_FD, ret);
+				addattrstrz(n, MAX_MSG, TCA_BPF_NAME, bpf_name);
+			} else {
+				addattr16(n, MAX_MSG, TCA_BPF_OPS_LEN, ret);
+				addattr_l(n, MAX_MSG, TCA_BPF_OPS, &bpf_ops,
+					  ret * sizeof(struct sock_filter));
+			}
 		} else if (matches(*argv, "classid") == 0 ||
 			   strcmp(*argv, "flowid") == 0) {
-			unsigned handle;
+			unsigned int handle;
+
 			NEXT_ARG();
 			if (get_tc_classid(&handle, *argv)) {
 				fprintf(stderr, "Illegal \"classid\"\n");
@@ -123,15 +191,23 @@
 			explain();
 			return -1;
 		} else {
+			if (!seen_run)
+				goto opt_bpf;
+
 			fprintf(stderr, "What is \"%s\"?\n", *argv);
 			explain();
 			return -1;
 		}
-		argc--; argv++;
+		argc--;
+		argv++;
 	}
 
-	tail->rta_len = (((void*)n)+n->nlmsg_len) - (void*)tail;
-	return 0;
+	tail->rta_len = (((void *)n) + n->nlmsg_len) - (void *)tail;
+
+	if (bpf_uds_name)
+		ret = bpf_handoff_map_fds(bpf_uds_name, bpf_obj);
+
+	return ret;
 }
 
 static int bpf_print_opt(struct filter_util *qu, FILE *f,
@@ -153,9 +229,16 @@
 			sprint_tc_classid(rta_getattr_u32(tb[TCA_BPF_CLASSID]), b1));
 	}
 
-	if (tb[TCA_BPF_OPS] && tb[TCA_BPF_OPS_LEN])
+	if (tb[TCA_BPF_NAME])
+		fprintf(f, "%s ", rta_getattr_str(tb[TCA_BPF_NAME]));
+	else if (tb[TCA_BPF_FD])
+		fprintf(f, "pfd %u ", rta_getattr_u32(tb[TCA_BPF_FD]));
+
+	if (tb[TCA_BPF_OPS] && tb[TCA_BPF_OPS_LEN]) {
 		bpf_print_ops(f, tb[TCA_BPF_OPS],
 			      rta_getattr_u16(tb[TCA_BPF_OPS_LEN]));
+		fprintf(f, "\n");
+	}
 
 	if (tb[TCA_BPF_POLICE]) {
 		fprintf(f, "\n");

diff --git a/tc/m_bpf.c b/tc/m_bpf.c
index bc6cc47..c817579 100644
--- a/tc/m_bpf.c
+++ b/tc/m_bpf.c

@@ -7,6 +7,7 @@
  *              2 of the License, or (at your option) any later version.
  *
  * Authors:     Jiri Pirko <jiri@resnulli.us>
+ *              Daniel Borkmann <daniel@iogearbox.net>
  */
 
 #include <stdio.h>
@@ -14,6 +15,8 @@
 #include <unistd.h>
 #include <string.h>
 #include <stdbool.h>
+#include <libgen.h>
+#include <linux/bpf.h>
 #include <linux/tc_act/tc_bpf.h>
 
 #include "utils.h"
@@ -21,16 +24,30 @@
 #include "tc_util.h"
 #include "tc_bpf.h"
 
+static const enum bpf_prog_type bpf_type = BPF_PROG_TYPE_SCHED_ACT;
+
 static void explain(void)
 {
 	fprintf(stderr, "Usage: ... bpf ...\n");
 	fprintf(stderr, "\n");
-	fprintf(stderr, " [inline]:     run bytecode BPF_BYTECODE\n");
-	fprintf(stderr, " [from file]:  run bytecode-file FILE\n");
+	fprintf(stderr, "BPF use case:\n");
+	fprintf(stderr, " bytecode BPF_BYTECODE\n");
+	fprintf(stderr, " bytecode-file FILE\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "eBPF use case:\n");
+	fprintf(stderr, " object-file FILE [ section ACT_NAME ] [ export UDS_FILE ]\n");
 	fprintf(stderr, "\n");
 	fprintf(stderr, "Where BPF_BYTECODE := \'s,c t f k,c t f k,c t f k,...\'\n");
-	fprintf(stderr, "      c,t,f,k and s are decimals; s denotes number of 4-tuples\n");
-	fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string\n");
+	fprintf(stderr, "c,t,f,k and s are decimals; s denotes number of 4-tuples\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where FILE points to a file containing the BPF_BYTECODE string,\n");
+	fprintf(stderr, "an ELF file containing eBPF map definitions and bytecode.\n");
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where ACT_NAME refers to the section name containing the\n");
+	fprintf(stderr, "action (default \'%s\').\n", bpf_default_section(bpf_type));
+	fprintf(stderr, "\n");
+	fprintf(stderr, "Where UDS_FILE points to a unix domain socket file in order\n");
+	fprintf(stderr, "to hand off control of all created eBPF maps to an agent.\n");
 }
 
 static void usage(void)
@@ -42,12 +59,17 @@
 static int parse_bpf(struct action_util *a, int *argc_p, char ***argv_p,
 		     int tca_id, struct nlmsghdr *n)
 {
-	int argc = *argc_p;
-	char **argv = *argv_p;
+	char **argv = *argv_p, bpf_name[256];
 	struct rtattr *tail;
 	struct tc_act_bpf parm = { 0 };
 	struct sock_filter bpf_ops[BPF_MAXINSNS];
+	bool ebpf = false, seen_run = false;
+	const char *bpf_uds_name = NULL;
+	const char *bpf_sec_name = NULL;
+	char *bpf_obj = NULL;
+	int argc = *argc_p, ret = 0;
 	__u16 bpf_len = 0;
+	__u32 bpf_fd = 0;
 
 	if (matches(*argv, "bpf") != 0)
 		return -1;
@@ -60,25 +82,70 @@
 			int ret;
 
 			NEXT_ARG();
-			if (strcmp(*argv, "bytecode-file") == 0) {
+opt_bpf:
+			bpf_sec_name = bpf_default_section(bpf_type);
+			seen_run = true;
+
+			if (strcmp(*argv, "bytecode-file") == 0 ||
+			    strcmp(*argv, "bcf") == 0) {
 				from_file = true;
-			} else if (strcmp(*argv, "bytecode") == 0) {
+			} else if (strcmp(*argv, "bytecode") == 0 ||
+				   strcmp(*argv, "bc") == 0) {
 				from_file = false;
+			} else if (strcmp(*argv, "object-file") == 0 ||
+				   strcmp(*argv, "obj") == 0) {
+				ebpf = true;
 			} else {
 				fprintf(stderr, "unexpected \"%s\"\n", *argv);
 				explain();
 				return -1;
 			}
+
 			NEXT_ARG();
-			ret = bpf_parse_ops(argc, argv, bpf_ops, from_file);
+			if (ebpf) {
+				bpf_obj = *argv;
+				NEXT_ARG();
+
+				if (strcmp(*argv, "section") == 0 ||
+				    strcmp(*argv, "sec") == 0) {
+					NEXT_ARG();
+					bpf_sec_name = *argv;
+					NEXT_ARG();
+				}
+				if (strcmp(*argv, "export") == 0 ||
+				    strcmp(*argv, "exp") == 0) {
+					NEXT_ARG();
+					bpf_uds_name = *argv;
+					NEXT_ARG();
+				}
+
+				PREV_ARG();
+			}
+
+			ret = ebpf ? bpf_open_object(bpf_obj, bpf_type, bpf_sec_name) :
+				     bpf_parse_ops(argc, argv, bpf_ops, from_file);
 			if (ret < 0) {
-				fprintf(stderr, "Illegal \"bytecode\"\n");
+				fprintf(stderr, "%s\n", ebpf ?
+					"Could not load object" :
+					"Illegal \"bytecode\"");
 				return -1;
 			}
-			bpf_len = ret;
+
+			if (ebpf) {
+				bpf_obj = basename(bpf_obj);
+
+				snprintf(bpf_name, sizeof(bpf_name), "%s:[%s]",
+					 bpf_obj, bpf_sec_name);
+
+				bpf_fd = ret;
+			} else {
+				bpf_len = ret;
+			}
 		} else if (matches(*argv, "help") == 0) {
 			usage();
 		} else {
+			if (!seen_run)
+				goto opt_bpf;
 			break;
 		}
 		argc--;
@@ -123,29 +190,42 @@
 		}
 	}
 
-	if (!bpf_len) {
+	if ((!bpf_len && !ebpf) || (!bpf_fd && ebpf)) {
 		fprintf(stderr, "bpf: Bytecode needs to be passed\n");
 		explain();
 		return -1;
 	}
 
 	tail = NLMSG_TAIL(n);
+
 	addattr_l(n, MAX_MSG, tca_id, NULL, 0);
 	addattr_l(n, MAX_MSG, TCA_ACT_BPF_PARMS, &parm, sizeof(parm));
-	addattr16(n, MAX_MSG, TCA_ACT_BPF_OPS_LEN, bpf_len);
-	addattr_l(n, MAX_MSG, TCA_ACT_BPF_OPS, &bpf_ops,
-		  bpf_len * sizeof(struct sock_filter));
+
+	if (ebpf) {
+		addattr32(n, MAX_MSG, TCA_ACT_BPF_FD, bpf_fd);
+		addattrstrz(n, MAX_MSG, TCA_ACT_BPF_NAME, bpf_name);
+	} else {
+		addattr16(n, MAX_MSG, TCA_ACT_BPF_OPS_LEN, bpf_len);
+		addattr_l(n, MAX_MSG, TCA_ACT_BPF_OPS, &bpf_ops,
+			  bpf_len * sizeof(struct sock_filter));
+	}
+
 	tail->rta_len = (char *)NLMSG_TAIL(n) - (char *)tail;
 
 	*argc_p = argc;
 	*argv_p = argv;
-	return 0;
+
+	if (bpf_uds_name)
+		ret = bpf_handoff_map_fds(bpf_uds_name, bpf_obj);
+
+	return ret;
 }
 
 static int print_bpf(struct action_util *au, FILE *f, struct rtattr *arg)
 {
 	struct rtattr *tb[TCA_ACT_BPF_MAX + 1];
 	struct tc_act_bpf *parm;
+	SPRINT_BUF(action_buf);
 
 	if (arg == NULL)
 		return -1;
@@ -156,15 +236,25 @@
 		fprintf(f, "[NULL bpf parameters]");
 		return -1;
 	}
+
 	parm = RTA_DATA(tb[TCA_ACT_BPF_PARMS]);
 
-	fprintf(f, " bpf ");
+	fprintf(f, "bpf ");
 
-	if (tb[TCA_ACT_BPF_OPS] && tb[TCA_ACT_BPF_OPS_LEN])
+	if (tb[TCA_ACT_BPF_NAME])
+		fprintf(f, "%s ", rta_getattr_str(tb[TCA_ACT_BPF_NAME]));
+	else if (tb[TCA_ACT_BPF_FD])
+		fprintf(f, "pfd %u ", rta_getattr_u32(tb[TCA_ACT_BPF_FD]));
+
+	if (tb[TCA_ACT_BPF_OPS] && tb[TCA_ACT_BPF_OPS_LEN]) {
 		bpf_print_ops(f, tb[TCA_ACT_BPF_OPS],
 			      rta_getattr_u16(tb[TCA_ACT_BPF_OPS_LEN]));
+		fprintf(f, " ");
+	}
 
-	fprintf(f, "\n\tindex %d ref %d bind %d", parm->index, parm->refcnt,
+	fprintf(f, "default-action %s\n", action_n2a(parm->action, action_buf,
+		sizeof(action_buf)));
+	fprintf(f, "\tindex %d ref %d bind %d", parm->index, parm->refcnt,
 		parm->bindcnt);
 
 	if (show_stats) {

diff --git a/tc/tc_bpf.c b/tc/tc_bpf.c
index c6901d6..326d098 100644
--- a/tc/tc_bpf.c
+++ b/tc/tc_bpf.c

@@ -8,6 +8,7 @@
  *
  * Authors:	Daniel Borkmann <dborkman@redhat.com>
  *		Jiri Pirko <jiri@resnulli.us>
+ *		Alexei Starovoitov <ast@plumgrid.com>
  */
 
 #include <stdio.h>
@@ -16,11 +17,25 @@
 #include <string.h>
 #include <stdbool.h>
 #include <errno.h>
+#include <fcntl.h>
+#include <stdarg.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <sys/un.h>
 #include <linux/filter.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
 
+#ifdef HAVE_ELF
+#include <libelf.h>
+#include <gelf.h>
+#endif
+
 #include "utils.h"
+
+#include "bpf_elf.h"
+#include "bpf_scm.h"
+
 #include "tc_util.h"
 #include "tc_bpf.h"
 
@@ -141,6 +156,552 @@
 		fprintf(f, "%hu %hhu %hhu %u,", ops[i].code, ops[i].jt,
 			ops[i].jf, ops[i].k);
 
-	fprintf(f, "%hu %hhu %hhu %u\'\n", ops[i].code, ops[i].jt,
+	fprintf(f, "%hu %hhu %hhu %u\'", ops[i].code, ops[i].jt,
 		ops[i].jf, ops[i].k);
 }
+
+const char *bpf_default_section(const enum bpf_prog_type type)
+{
+	switch (type) {
+	case BPF_PROG_TYPE_SCHED_CLS:
+		return ELF_SECTION_CLASSIFIER;
+	case BPF_PROG_TYPE_SCHED_ACT:
+		return ELF_SECTION_ACTION;
+	default:
+		return NULL;
+	}
+}
+
+#ifdef HAVE_ELF
+struct bpf_elf_sec_data {
+	GElf_Shdr sec_hdr;
+	char *sec_name;
+	Elf_Data *sec_data;
+};
+
+struct bpf_map_data {
+	int *fds;
+	const char *obj;
+	struct bpf_elf_st *st;
+	struct bpf_elf_map *ent;
+};
+
+/* If we provide a small buffer with log level enabled, the kernel
+ * could fail program load as no buffer space is available for the
+ * log and thus verifier fails. In case something doesn't pass the
+ * verifier we still want to hand something descriptive to the user.
+ */
+static char bpf_log_buf[65536];
+
+static struct bpf_elf_st bpf_st;
+
+static int map_fds[ELF_MAX_MAPS];
+static struct bpf_elf_map map_ent[ELF_MAX_MAPS];
+
+static void bpf_dump_error(const char *format, ...)  __check_format_string(1, 2);
+static void bpf_dump_error(const char *format, ...)
+{
+	va_list vl;
+
+	va_start(vl, format);
+	vfprintf(stderr, format, vl);
+	va_end(vl);
+
+	fprintf(stderr, "%s\n", bpf_log_buf);
+	memset(bpf_log_buf, 0, sizeof(bpf_log_buf));
+}
+
+static void bpf_save_finfo(int file_fd)
+{
+	struct stat st;
+	int ret;
+
+	memset(&bpf_st, 0, sizeof(bpf_st));
+
+	ret = fstat(file_fd, &st);
+	if (ret < 0) {
+		fprintf(stderr, "Stat of elf file failed: %s\n",
+			strerror(errno));
+		return;
+	}
+
+	bpf_st.st_dev = st.st_dev;
+	bpf_st.st_ino = st.st_ino;
+}
+
+static void bpf_clear_finfo(void)
+{
+	memset(&bpf_st, 0, sizeof(bpf_st));
+}
+
+static bool bpf_may_skip_map_creation(int file_fd)
+{
+	struct stat st;
+	int ret;
+
+	ret = fstat(file_fd, &st);
+	if (ret < 0) {
+		fprintf(stderr, "Stat of elf file failed: %s\n",
+			strerror(errno));
+		return false;
+	}
+
+	return (bpf_st.st_dev == st.st_dev) &&
+	       (bpf_st.st_ino == st.st_ino);
+}
+
+static int bpf_create_map(enum bpf_map_type type, unsigned int size_key,
+			  unsigned int size_value, unsigned int max_elem)
+{
+	union bpf_attr attr = {
+		.map_type	= type,
+		.key_size	= size_key,
+		.value_size	= size_value,
+		.max_entries	= max_elem,
+	};
+
+	return bpf(BPF_MAP_CREATE, &attr, sizeof(attr));
+}
+
+static int bpf_prog_load(enum bpf_prog_type type, const struct bpf_insn *insns,
+			 unsigned int len, const char *license)
+{
+	union bpf_attr attr = {
+		.prog_type	= type,
+		.insns		= bpf_ptr_to_u64(insns),
+		.insn_cnt	= len / sizeof(struct bpf_insn),
+		.license	= bpf_ptr_to_u64(license),
+		.log_buf	= bpf_ptr_to_u64(bpf_log_buf),
+		.log_size	= sizeof(bpf_log_buf),
+		.log_level	= 1,
+	};
+
+	return bpf(BPF_PROG_LOAD, &attr, sizeof(attr));
+}
+
+static int bpf_prog_attach(enum bpf_prog_type type, const struct bpf_insn *insns,
+			   unsigned int size, const char *license)
+{
+	int prog_fd = bpf_prog_load(type, insns, size, license);
+
+	if (prog_fd < 0)
+		bpf_dump_error("BPF program rejected: %s\n", strerror(errno));
+
+	return prog_fd;
+}
+
+static int bpf_map_attach(enum bpf_map_type type, unsigned int size_key,
+			  unsigned int size_value, unsigned int max_elem)
+{
+	int map_fd = bpf_create_map(type, size_key, size_value, max_elem);
+
+	if (map_fd < 0)
+		bpf_dump_error("BPF map rejected: %s\n", strerror(errno));
+
+	return map_fd;
+}
+
+static void bpf_maps_init(void)
+{
+	int i;
+
+	memset(map_ent, 0, sizeof(map_ent));
+	for (i = 0; i < ARRAY_SIZE(map_fds); i++)
+		map_fds[i] = -1;
+}
+
+static int bpf_maps_count(void)
+{
+	int i, count = 0;
+
+	for (i = 0; i < ARRAY_SIZE(map_fds); i++) {
+		if (map_fds[i] < 0)
+			break;
+		count++;
+	}
+
+	return count;
+}
+
+static void bpf_maps_destroy(void)
+{
+	int i;
+
+	memset(map_ent, 0, sizeof(map_ent));
+	for (i = 0; i < ARRAY_SIZE(map_fds); i++) {
+		if (map_fds[i] >= 0)
+			close(map_fds[i]);
+	}
+}
+
+static int bpf_maps_attach(struct bpf_elf_map *maps, unsigned int num_maps)
+{
+	int i, ret;
+
+	for (i = 0; (i < num_maps) && (num_maps <= ARRAY_SIZE(map_fds)); i++) {
+		struct bpf_elf_map *map = &maps[i];
+
+		ret = bpf_map_attach(map->type, map->size_key,
+				     map->size_value, map->max_elem);
+		if (ret < 0)
+			goto err_unwind;
+
+		map_fds[i] = ret;
+	}
+
+	return 0;
+
+err_unwind:
+	bpf_maps_destroy();
+	return ret;
+}
+
+static int bpf_fill_section_data(Elf *elf_fd, GElf_Ehdr *elf_hdr, int sec_index,
+				 struct bpf_elf_sec_data *sec_data)
+{
+	GElf_Shdr sec_hdr;
+	Elf_Scn *sec_fd;
+	Elf_Data *sec_edata;
+	char *sec_name;
+
+	memset(sec_data, 0, sizeof(*sec_data));
+
+	sec_fd = elf_getscn(elf_fd, sec_index);
+	if (!sec_fd)
+		return -EINVAL;
+
+	if (gelf_getshdr(sec_fd, &sec_hdr) != &sec_hdr)
+		return -EIO;
+
+	sec_name = elf_strptr(elf_fd, elf_hdr->e_shstrndx,
+			      sec_hdr.sh_name);
+	if (!sec_name || !sec_hdr.sh_size)
+		return -ENOENT;
+
+	sec_edata = elf_getdata(sec_fd, NULL);
+	if (!sec_edata || elf_getdata(sec_fd, sec_edata))
+		return -EIO;
+
+	memcpy(&sec_data->sec_hdr, &sec_hdr, sizeof(sec_hdr));
+	sec_data->sec_name = sec_name;
+	sec_data->sec_data = sec_edata;
+
+	return 0;
+}
+
+static int bpf_apply_relo_data(struct bpf_elf_sec_data *data_relo,
+			       struct bpf_elf_sec_data *data_insn,
+			       Elf_Data *sym_tab)
+{
+	Elf_Data *idata = data_insn->sec_data;
+	GElf_Shdr *rhdr = &data_relo->sec_hdr;
+	int relo_ent, relo_num = rhdr->sh_size / rhdr->sh_entsize;
+	struct bpf_insn *insns = idata->d_buf;
+	unsigned int num_insns = idata->d_size / sizeof(*insns);
+
+	for (relo_ent = 0; relo_ent < relo_num; relo_ent++) {
+		unsigned int ioff, fnum;
+		GElf_Rel relo;
+		GElf_Sym sym;
+
+		if (gelf_getrel(data_relo->sec_data, relo_ent, &relo) != &relo)
+			return -EIO;
+
+		ioff = relo.r_offset / sizeof(struct bpf_insn);
+		if (ioff >= num_insns)
+			return -EINVAL;
+		if (insns[ioff].code != (BPF_LD | BPF_IMM | BPF_DW))
+			return -EINVAL;
+
+		if (gelf_getsym(sym_tab, GELF_R_SYM(relo.r_info), &sym) != &sym)
+			return -EIO;
+
+		fnum = sym.st_value / sizeof(struct bpf_elf_map);
+		if (fnum >= ARRAY_SIZE(map_fds))
+			return -EINVAL;
+		if (map_fds[fnum] < 0)
+			return -EINVAL;
+
+		insns[ioff].src_reg = BPF_PSEUDO_MAP_FD;
+		insns[ioff].imm = map_fds[fnum];
+	}
+
+	return 0;
+}
+
+static int bpf_fetch_ancillary(int file_fd, Elf *elf_fd, GElf_Ehdr *elf_hdr,
+			       bool *sec_seen, char *license, unsigned int lic_len,
+			       Elf_Data **sym_tab)
+{
+	int sec_index, ret = -1;
+
+	for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) {
+		struct bpf_elf_sec_data data_anc;
+
+		ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index,
+					    &data_anc);
+		if (ret < 0)
+			continue;
+
+		/* Extract and load eBPF map fds. */
+		if (!strcmp(data_anc.sec_name, ELF_SECTION_MAPS) &&
+		    !bpf_may_skip_map_creation(file_fd)) {
+			struct bpf_elf_map *maps;
+			unsigned int maps_num;
+
+			if (data_anc.sec_data->d_size % sizeof(*maps) != 0)
+				return -EINVAL;
+
+			maps = data_anc.sec_data->d_buf;
+			maps_num = data_anc.sec_data->d_size / sizeof(*maps);
+			memcpy(map_ent, maps, data_anc.sec_data->d_size);
+
+			sec_seen[sec_index] = true;
+			ret = bpf_maps_attach(maps, maps_num);
+			if (ret < 0)
+				return ret;
+		}
+		/* Extract eBPF license. */
+		else if (!strcmp(data_anc.sec_name, ELF_SECTION_LICENSE)) {
+			if (data_anc.sec_data->d_size > lic_len)
+				return -ENOMEM;
+
+			sec_seen[sec_index] = true;
+			memcpy(license, data_anc.sec_data->d_buf,
+			       data_anc.sec_data->d_size);
+		}
+		/* Extract symbol table for relocations (map fd fixups). */
+		else if (data_anc.sec_hdr.sh_type == SHT_SYMTAB) {
+			sec_seen[sec_index] = true;
+			*sym_tab = data_anc.sec_data;
+		}
+	}
+
+	return ret;
+}
+
+static int bpf_fetch_prog_relo(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_seen,
+			       enum bpf_prog_type type, const char *sec,
+			       const char *license, Elf_Data *sym_tab)
+{
+	int sec_index, prog_fd = -1;
+
+	for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) {
+		struct bpf_elf_sec_data data_relo, data_insn;
+		int ins_index, ret;
+
+		/* Attach eBPF programs with relocation data (maps). */
+		ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index,
+					    &data_relo);
+		if (ret < 0 || data_relo.sec_hdr.sh_type != SHT_REL)
+			continue;
+
+		ins_index = data_relo.sec_hdr.sh_info;
+
+		ret = bpf_fill_section_data(elf_fd, elf_hdr, ins_index,
+					    &data_insn);
+		if (ret < 0)
+			continue;
+		if (strcmp(data_insn.sec_name, sec))
+			continue;
+
+		sec_seen[sec_index] = true;
+		sec_seen[ins_index] = true;
+
+		ret = bpf_apply_relo_data(&data_relo, &data_insn, sym_tab);
+		if (ret < 0)
+			continue;
+
+		prog_fd = bpf_prog_attach(type, data_insn.sec_data->d_buf,
+					  data_insn.sec_data->d_size, license);
+		if (prog_fd < 0)
+			continue;
+
+		break;
+	}
+
+	return prog_fd;
+}
+
+static int bpf_fetch_prog(Elf *elf_fd, GElf_Ehdr *elf_hdr, bool *sec_seen,
+			  enum bpf_prog_type type, const char *sec,
+			  const char *license)
+{
+	int sec_index, prog_fd = -1;
+
+	for (sec_index = 1; sec_index < elf_hdr->e_shnum; sec_index++) {
+		struct bpf_elf_sec_data data_insn;
+		int ret;
+
+		/* Attach eBPF programs without relocation data. */
+		if (sec_seen[sec_index])
+			continue;
+
+		ret = bpf_fill_section_data(elf_fd, elf_hdr, sec_index,
+					    &data_insn);
+		if (ret < 0)
+			continue;
+		if (strcmp(data_insn.sec_name, sec))
+			continue;
+
+		prog_fd = bpf_prog_attach(type, data_insn.sec_data->d_buf,
+					  data_insn.sec_data->d_size, license);
+		if (prog_fd < 0)
+			continue;
+
+		break;
+	}
+
+	return prog_fd;
+}
+
+int bpf_open_object(const char *path, enum bpf_prog_type type, const char *sec)
+{
+	char license[ELF_MAX_LICENSE_LEN];
+	int file_fd, prog_fd = -1, ret;
+	Elf_Data *sym_tab = NULL;
+	GElf_Ehdr elf_hdr;
+	bool *sec_seen;
+	Elf *elf_fd;
+
+	if (elf_version(EV_CURRENT) == EV_NONE)
+		return -EINVAL;
+
+	file_fd = open(path, O_RDONLY, 0);
+	if (file_fd < 0)
+		return -errno;
+
+	elf_fd = elf_begin(file_fd, ELF_C_READ, NULL);
+	if (!elf_fd) {
+		ret = -EINVAL;
+		goto out;
+	}
+
+	if (gelf_getehdr(elf_fd, &elf_hdr) != &elf_hdr) {
+		ret = -EIO;
+		goto out_elf;
+	}
+
+	sec_seen = calloc(elf_hdr.e_shnum, sizeof(*sec_seen));
+	if (!sec_seen) {
+		ret = -ENOMEM;
+		goto out_elf;
+	}
+
+	memset(license, 0, sizeof(license));
+	if (!bpf_may_skip_map_creation(file_fd))
+		bpf_maps_init();
+
+	ret = bpf_fetch_ancillary(file_fd, elf_fd, &elf_hdr, sec_seen,
+				  license, sizeof(license), &sym_tab);
+	if (ret < 0)
+		goto out_maps;
+	if (sym_tab)
+		prog_fd = bpf_fetch_prog_relo(elf_fd, &elf_hdr, sec_seen, type,
+					      sec, license, sym_tab);
+	if (prog_fd < 0)
+		prog_fd = bpf_fetch_prog(elf_fd, &elf_hdr, sec_seen, type, sec,
+					 license);
+	if (prog_fd < 0)
+		goto out_maps;
+
+	bpf_save_finfo(file_fd);
+
+	free(sec_seen);
+
+	elf_end(elf_fd);
+	close(file_fd);
+
+	return prog_fd;
+
+out_maps:
+	bpf_maps_destroy();
+	free(sec_seen);
+out_elf:
+	elf_end(elf_fd);
+out:
+	close(file_fd);
+	bpf_clear_finfo();
+	return prog_fd;
+}
+
+static int
+bpf_map_set_xmit(int fd, struct sockaddr_un *addr, unsigned int addr_len,
+		 const struct bpf_map_data *aux, unsigned int ents)
+{
+	struct bpf_map_set_msg msg;
+	int *cmsg_buf, min_fd;
+	char *amsg_buf;
+	int i;
+
+	memset(&msg, 0, sizeof(msg));
+
+	msg.aux.uds_ver = BPF_SCM_AUX_VER;
+	msg.aux.num_ent = ents;
+
+	strncpy(msg.aux.obj_name, aux->obj, sizeof(msg.aux.obj_name));
+	memcpy(&msg.aux.obj_st, aux->st, sizeof(msg.aux.obj_st));
+
+	cmsg_buf = bpf_map_set_init(&msg, addr, addr_len);
+	amsg_buf = (char *)msg.aux.ent;
+
+	for (i = 0; i < ents; i += min_fd) {
+		int ret;
+
+		min_fd = min(BPF_SCM_MAX_FDS * 1U, ents - i);
+
+		bpf_map_set_init_single(&msg, min_fd);
+
+		memcpy(cmsg_buf, &aux->fds[i], sizeof(aux->fds[0]) * min_fd);
+		memcpy(amsg_buf, &aux->ent[i], sizeof(aux->ent[0]) * min_fd);
+
+		ret = sendmsg(fd, &msg.hdr, 0);
+		if (ret <= 0)
+			return ret ? : -1;
+	}
+
+	return 0;
+}
+
+int bpf_handoff_map_fds(const char *path, const char *obj)
+{
+	struct sockaddr_un addr;
+	struct bpf_map_data bpf_aux;
+	int fd, ret;
+
+	fd = socket(AF_UNIX, SOCK_DGRAM, 0);
+	if (fd < 0) {
+		fprintf(stderr, "Cannot open socket: %s\n",
+			strerror(errno));
+		return -1;
+	}
+
+	memset(&addr, 0, sizeof(addr));
+	addr.sun_family = AF_UNIX;
+	strncpy(addr.sun_path, path, sizeof(addr.sun_path));
+
+	ret = connect(fd, (struct sockaddr *)&addr, sizeof(addr));
+	if (ret < 0) {
+		fprintf(stderr, "Cannot connect to %s: %s\n",
+			path, strerror(errno));
+		return -1;
+	}
+
+	memset(&bpf_aux, 0, sizeof(bpf_aux));
+
+	bpf_aux.fds = map_fds;
+	bpf_aux.ent = map_ent;
+
+	bpf_aux.obj = obj;
+	bpf_aux.st = &bpf_st;
+
+	ret = bpf_map_set_xmit(fd, &addr, sizeof(addr), &bpf_aux,
+			       bpf_maps_count());
+	if (ret < 0)
+		fprintf(stderr, "Cannot xmit fds to %s: %s\n",
+			path, strerror(errno));
+
+	close(fd);
+	return ret;
+}
+#endif /* HAVE_ELF */

diff --git a/tc/tc_bpf.h b/tc/tc_bpf.h
index 08cca92..8b214b8 100644
--- a/tc/tc_bpf.h
+++ b/tc/tc_bpf.h

@@ -13,10 +13,16 @@
 #ifndef _TC_BPF_H_
 #define _TC_BPF_H_ 1
 
-#include <stdio.h>
 #include <linux/filter.h>
 #include <linux/netlink.h>
 #include <linux/rtnetlink.h>
+#include <linux/bpf.h>
+#include <sys/syscall.h>
+#include <errno.h>
+#include <stdio.h>
+#include <stdint.h>
+
+#include "utils.h"
 
 int bpf_parse_string(char *arg, bool from_file, __u16 *bpf_len,
 		     char **bpf_string, bool *need_release,
@@ -25,4 +31,40 @@
 		  bool from_file);
 void bpf_print_ops(FILE *f, struct rtattr *bpf_ops, __u16 len);
 
+const char *bpf_default_section(const enum bpf_prog_type type);
+
+#ifdef HAVE_ELF
+int bpf_open_object(const char *path, enum bpf_prog_type type,
+		    const char *sec);
+int bpf_handoff_map_fds(const char *path, const char *obj);
+
+static inline __u64 bpf_ptr_to_u64(const void *ptr)
+{
+	return (__u64) (unsigned long) ptr;
+}
+
+static inline int bpf(int cmd, union bpf_attr *attr, unsigned int size)
+{
+#ifdef __NR_bpf
+	return syscall(__NR_bpf, cmd, attr, size);
+#else
+	fprintf(stderr, "No bpf syscall, kernel headers too old?\n");
+	errno = ENOSYS;
+	return -1;
 #endif
+}
+#else
+static inline int bpf_open_object(const char *path, enum bpf_prog_type type,
+				  const char *sec)
+{
+	fprintf(stderr, "No ELF library support compiled in.\n");
+	errno = ENOSYS;
+	return -1;
+}
+
+static inline int bpf_handoff_map_fds(const char *path, const char *obj)
+{
+	return 0;
+}
+#endif /* HAVE_ELF */
+#endif /* _TC_BPF_H_ */
commit	93531fac419cf8450a6159feeb108071da980acb	[log] [tgz]
author	Stephen Hemminger <shemming@brocade.com>	Mon Apr 13 09:39:46 2015 -0700
committer	Stephen Hemminger <shemming@brocade.com>	Mon Apr 13 09:39:46 2015 -0700
tree	62b27d80451c6243a26781a846563bfa5ab5b5e0
parent	6256f8c9e45f01187b297a576e148534a393c990 [diff]
parent	672acc723859290b94b753b8437d8cc2f0810174 [diff]