blob: d6caf374b137e2082800f61fe78da229f1608eef [file] [log] [blame]
Daniel Borkmann6256f8c2015-04-01 17:57:44 +02001/*
2 * eBPF kernel space program part
3 *
4 * Toy eBPF program for demonstration purposes, some parts derived from
5 * kernel tree's samples/bpf/sockex2_kern.c example.
6 *
7 * More background on eBPF, kernel tree: Documentation/networking/filter.txt
8 *
9 * Note, this file is rather large, and most classifier and actions are
10 * likely smaller to accomplish one specific use-case and are tailored
11 * for high performance. For performance reasons, you might also have the
12 * classifier and action already merged inside the classifier.
13 *
14 * In order to show various features it serves as a bigger programming
15 * example, which you should feel free to rip apart and experiment with.
16 *
17 * Compilation, configuration example:
18 *
19 * Note: as long as the BPF backend in LLVM is still experimental,
20 * you need to build LLVM with --enable-experimental-targets=BPF
21 * Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
22 * and you have libelf.h and gelf.h headers and can link tc against -lelf.
23 *
24 * In case you need to sync kernel headers, go to your kernel source tree:
25 * # make headers_install INSTALL_HDR_PATH=/usr/
26 *
27 * $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
28 * $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
29 * $ objdump -h bpf.o
30 * [...]
31 * 3 classifier 000007f8 0000000000000000 0000000000000000 00000040 2**3
32 * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
33 * 4 action-mark 00000088 0000000000000000 0000000000000000 00000838 2**3
34 * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
35 * 5 action-rand 00000098 0000000000000000 0000000000000000 000008c0 2**3
36 * CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
37 * 6 maps 00000030 0000000000000000 0000000000000000 00000958 2**2
38 * CONTENTS, ALLOC, LOAD, DATA
39 * 7 license 00000004 0000000000000000 0000000000000000 00000988 2**0
40 * CONTENTS, ALLOC, LOAD, DATA
41 * [...]
42 * # echo 1 > /proc/sys/net/core/bpf_jit_enable
43 * $ gcc bpf_agent.c -o bpf_agent -Wall -O2
44 * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
45 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
46 * action bpf obj bpf.o sec action-mark \
47 * action bpf obj bpf.o sec action-rand ok
48 * # tc filter show dev em1
49 * filter parent 1: protocol all pref 49152 bpf
50 * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
51 * action order 1: bpf bpf.o:[action-mark] default-action pipe
52 * index 52 ref 1 bind 1
53 *
54 * action order 2: bpf bpf.o:[action-rand] default-action pipe
55 * index 53 ref 1 bind 1
56 *
57 * action order 3: gact action pass
58 * random type none pass val 0
59 * index 38 ref 1 bind 1
60 *
Daniel Borkmann279d6a82015-04-20 13:48:54 +020061 * The same program can also be installed on ingress side (as opposed to above
62 * egress configuration), e.g.:
63 *
64 * # tc qdisc add dev em1 handle ffff: ingress
65 * # tc filter add dev em1 parent ffff: bpf obj ...
66 *
Daniel Borkmann4bd62442015-04-16 21:20:06 +020067 * Notes on BPF agent:
68 *
69 * In the above example, the bpf_agent creates the unix domain socket
70 * natively. "tc exec" can also spawn a shell and hold the sockets there:
71 *
72 * # tc exec bpf imp /tmp/bpf-uds
73 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
74 * action bpf obj bpf.o sec action-mark \
75 * action bpf obj bpf.o sec action-rand ok
76 * sh-4.2# (shell spawned from tc exec)
77 * sh-4.2# bpf_agent
78 * [...]
79 *
80 * This will read out fds over environment and produce the same data dump
81 * as below. This has the advantage that the spawned shell owns the fds
82 * and thus if the agent is restarted, it can reattach to the same fds, also
83 * various programs can easily read/modify the data simultaneously from user
84 * space side.
85 *
86 * If the shell is unnecessary, the agent can also just be spawned directly
87 * via tc exec:
88 *
89 * # tc exec bpf imp /tmp/bpf-uds run bpf_agent
90 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
91 * action bpf obj bpf.o sec action-mark \
92 * action bpf obj bpf.o sec action-rand ok
93 *
Daniel Borkmann6256f8c2015-04-01 17:57:44 +020094 * BPF agent example output:
95 *
96 * ver: 1
97 * obj: bpf.o
98 * dev: 64770
99 * ino: 6045133
100 * maps: 3
101 * map0:
102 * `- fd: 4
103 * | serial: 1
104 * | type: 1
105 * | max elem: 256
106 * | size key: 1
107 * ` size val: 16
108 * map1:
109 * `- fd: 5
110 * | serial: 2
111 * | type: 1
112 * | max elem: 1024
113 * | size key: 4
114 * ` size val: 16
115 * map2:
116 * `- fd: 6
117 * | serial: 3
118 * | type: 2
119 * | max elem: 64
120 * | size key: 4
121 * ` size val: 8
122 * data, period: 5sec
123 * `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
124 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
125 * ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
126 * data, period: 5sec
127 * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
128 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
129 * ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
130 * data, period: 5sec
131 * `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
132 * | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
133 * ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
134 * [...]
135 *
136 * This now means, the below classifier and action pipeline has been loaded
137 * as eBPF bytecode into the kernel, the kernel has verified that the
138 * execution of the bytecode is "safe", and it has JITed the programs
139 * afterwards, so that upon invocation they're running on native speed. tc
140 * has transferred all map file descriptors to the bpf_agent via IPC and
141 * even after tc exits, the agent can read out or modify all map data.
142 *
143 * Note that the export to the uds is done only once in the classifier and
144 * not in the action. It's enough to export the (here) shared descriptors
145 * once.
146 *
147 * If you need to disassemble the generated JIT image (echo with 2), the
148 * kernel tree has under tools/net/ a small helper, you can invoke e.g.
149 * `bpf_jit_disasm -o`.
150 *
151 * Please find in the code below further comments.
152 *
153 * -- Happy eBPF hacking! ;)
154 */
155#include <stdint.h>
156#include <stdbool.h>
157#include <sys/types.h>
158#include <sys/socket.h>
159#include <asm/types.h>
160#include <linux/in.h>
161#include <linux/if.h>
162#include <linux/if_ether.h>
163#include <linux/ip.h>
164#include <linux/ipv6.h>
165#include <linux/if_tunnel.h>
Daniel Borkmann279d6a82015-04-20 13:48:54 +0200166#include <linux/filter.h>
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200167#include <linux/bpf.h>
168
169/* Common, shared definitions with ebpf_agent.c. */
170#include "bpf_shared.h"
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100171/* BPF helper functions for our example. */
172#include "../../include/bpf_api.h"
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200173
/* Could be defined here as well, or included from the header.
 * These mirror the kernel's TC action opcodes returned by act_bpf programs.
 */
#define TC_ACT_UNSPEC		(-1)
#define TC_ACT_OK		0
#define TC_ACT_RECLASSIFY	1
#define TC_ACT_SHOT		2
#define TC_ACT_PIPE		3
#define TC_ACT_STOLEN		4
#define TC_ACT_QUEUED		5
#define TC_ACT_REPEAT		6

/* Other, misc stuff. */
#define IP_MF			0x2000	/* "more fragments" flag */
#define IP_OFFSET		0x1FFF	/* fragment offset mask */
188/* eBPF map definitions, all placed in section "maps". */
189struct bpf_elf_map __section("maps") map_proto = {
190 .type = BPF_MAP_TYPE_HASH,
191 .id = BPF_MAP_ID_PROTO,
192 .size_key = sizeof(uint8_t),
193 .size_value = sizeof(struct count_tuple),
194 .max_elem = 256,
Daniel Borkmann4dd3f502016-04-09 00:32:05 +0200195 .flags = BPF_F_NO_PREALLOC,
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200196};
197
198struct bpf_elf_map __section("maps") map_queue = {
199 .type = BPF_MAP_TYPE_HASH,
200 .id = BPF_MAP_ID_QUEUE,
201 .size_key = sizeof(uint32_t),
202 .size_value = sizeof(struct count_queue),
203 .max_elem = 1024,
Daniel Borkmann4dd3f502016-04-09 00:32:05 +0200204 .flags = BPF_F_NO_PREALLOC,
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200205};
206
207struct bpf_elf_map __section("maps") map_drops = {
208 .type = BPF_MAP_TYPE_ARRAY,
209 .id = BPF_MAP_ID_DROPS,
210 .size_key = sizeof(uint32_t),
211 .size_value = sizeof(long),
212 .max_elem = 64,
213};
214
215/* Helper functions and definitions for the flow dissector used by the
216 * example classifier. This resembles the kernel's flow dissector to
217 * some extent and is just used as an example to show what's possible
218 * with eBPF.
219 */
struct sockaddr;

/* Minimal 802.1Q/802.1ad VLAN header used for offsetof() only. */
struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

/* Dissected flow tuple. For IPv6, src/dst hold 32 bit folds of the
 * 128 bit addresses (see flow_addr_hash_ipv6).
 */
struct flow_keys {
	__u32 src;
	__u32 dst;
	union {
		__u32 ports;
		__u16 port16[2];
	};
	__s32 th_off;		/* transport header offset */
	__u8 ip_proto;
};
237
/* Offset of the port numbers relative to the transport header start.
 * AH carries the SPI first, so ports sit 4 bytes in; for every other
 * (or unknown) protocol the ports start right at the header.
 */
static __inline__ int flow_ports_offset(__u8 ip_proto)
{
	if (ip_proto == IPPROTO_AH)
		return 4;

	return 0;
}
253
Daniel Borkmann92a36992016-02-07 02:11:50 +0100254static __inline__ bool flow_is_frag(struct __sk_buff *skb, int nh_off)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200255{
256 return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
257 (IP_MF | IP_OFFSET));
258}
259
Daniel Borkmann92a36992016-02-07 02:11:50 +0100260static __inline__ int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
261 __u8 *ip_proto, struct flow_keys *flow)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200262{
263 __u8 ip_ver_len;
264
265 if (unlikely(flow_is_frag(skb, nh_off)))
266 *ip_proto = 0;
267 else
268 *ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
269 protocol));
270 if (*ip_proto != IPPROTO_GRE) {
271 flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
272 flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
273 }
274
275 ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
276 if (likely(ip_ver_len == 0x45))
277 nh_off += 20;
278 else
279 nh_off += (ip_ver_len & 0xF) << 2;
280
281 return nh_off;
282}
283
/* Fold a 128 bit IPv6 address at 'off' into 32 bits by XORing its
 * four 32 bit words.
 */
static __inline__ __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 hash;

	hash  = load_word(skb, off);
	hash ^= load_word(skb, off + 4);
	hash ^= load_word(skb, off + 8);
	hash ^= load_word(skb, off + 12);

	return hash;
}
293
Daniel Borkmann92a36992016-02-07 02:11:50 +0100294static __inline__ int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
295 __u8 *ip_proto, struct flow_keys *flow)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200296{
297 *ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));
298
299 flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
300 flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));
301
302 return nh_off + sizeof(struct ipv6hdr);
303}
304
Daniel Borkmann92a36992016-02-07 02:11:50 +0100305static __inline__ bool flow_dissector(struct __sk_buff *skb,
306 struct flow_keys *flow)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200307{
Daniel Borkmann279d6a82015-04-20 13:48:54 +0200308 int poff, nh_off = BPF_LL_OFF + ETH_HLEN;
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200309 __be16 proto = skb->protocol;
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200310 __u8 ip_proto;
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200311
312 /* TODO: check for skb->vlan_tci, skb->vlan_proto first */
313 if (proto == htons(ETH_P_8021AD)) {
314 proto = load_half(skb, nh_off +
315 offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
316 nh_off += sizeof(struct vlan_hdr);
317 }
318 if (proto == htons(ETH_P_8021Q)) {
319 proto = load_half(skb, nh_off +
320 offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
321 nh_off += sizeof(struct vlan_hdr);
322 }
323
324 if (likely(proto == htons(ETH_P_IP)))
325 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
326 else if (proto == htons(ETH_P_IPV6))
327 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
328 else
329 return false;
330
331 switch (ip_proto) {
332 case IPPROTO_GRE: {
333 struct gre_hdr {
334 __be16 flags;
335 __be16 proto;
336 };
337
338 __u16 gre_flags = load_half(skb, nh_off +
339 offsetof(struct gre_hdr, flags));
340 __u16 gre_proto = load_half(skb, nh_off +
341 offsetof(struct gre_hdr, proto));
342
343 if (gre_flags & (GRE_VERSION | GRE_ROUTING))
344 break;
345
346 nh_off += 4;
347 if (gre_flags & GRE_CSUM)
348 nh_off += 4;
349 if (gre_flags & GRE_KEY)
350 nh_off += 4;
351 if (gre_flags & GRE_SEQ)
352 nh_off += 4;
353
354 if (gre_proto == ETH_P_8021Q) {
355 gre_proto = load_half(skb, nh_off +
356 offsetof(struct vlan_hdr,
357 h_vlan_encapsulated_proto));
358 nh_off += sizeof(struct vlan_hdr);
359 }
360 if (gre_proto == ETH_P_IP)
361 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
362 else if (gre_proto == ETH_P_IPV6)
363 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
364 else
365 return false;
366 break;
367 }
368 case IPPROTO_IPIP:
369 nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
370 break;
371 case IPPROTO_IPV6:
372 nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
373 default:
374 break;
375 }
376
377 nh_off += flow_ports_offset(ip_proto);
378
379 flow->ports = load_word(skb, nh_off);
Daniel Borkmann279d6a82015-04-20 13:48:54 +0200380 flow->th_off = nh_off;
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200381 flow->ip_proto = ip_proto;
382
383 return true;
384}
385
Daniel Borkmann92a36992016-02-07 02:11:50 +0100386static __inline__ void cls_update_proto_map(const struct __sk_buff *skb,
387 const struct flow_keys *flow)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200388{
389 uint8_t proto = flow->ip_proto;
390 struct count_tuple *ct, _ct;
391
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100392 ct = map_lookup_elem(&map_proto, &proto);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200393 if (likely(ct)) {
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100394 lock_xadd(&ct->packets, 1);
395 lock_xadd(&ct->bytes, skb->len);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200396 return;
397 }
398
399 /* No hit yet, we need to create a new entry. */
400 _ct.packets = 1;
401 _ct.bytes = skb->len;
402
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100403 map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200404}
405
Daniel Borkmann92a36992016-02-07 02:11:50 +0100406static __inline__ void cls_update_queue_map(const struct __sk_buff *skb)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200407{
408 uint32_t queue = skb->queue_mapping;
409 struct count_queue *cq, _cq;
410 bool mismatch;
411
412 mismatch = skb->queue_mapping != get_smp_processor_id();
413
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100414 cq = map_lookup_elem(&map_queue, &queue);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200415 if (likely(cq)) {
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100416 lock_xadd(&cq->total, 1);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200417 if (mismatch)
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100418 lock_xadd(&cq->mismatch, 1);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200419 return;
420 }
421
422 /* No hit yet, we need to create a new entry. */
423 _cq.total = 1;
424 _cq.mismatch = mismatch ? 1 : 0;
425
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100426 map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200427}
428
429/* eBPF program definitions, placed in various sections, which can
430 * have custom section names. If custom names are in use, it's
431 * required to point tc to the correct section, e.g.
432 *
433 * tc filter add [...] bpf obj cls.o sec cls-tos [...]
434 *
435 * in case the program resides in __section("cls-tos").
436 *
437 * Default section for cls_bpf is: "classifier", for act_bpf is:
438 * "action". Naturally, if for example multiple actions are present
439 * in the same file, they need to have distinct section names.
440 *
441 * It is however not required to have multiple programs sharing
442 * a file.
443 */
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100444__section("classifier")
445int cls_main(struct __sk_buff *skb)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200446{
447 struct flow_keys flow;
448
449 if (!flow_dissector(skb, &flow))
450 return 0; /* No match in cls_bpf. */
451
452 cls_update_proto_map(skb, &flow);
453 cls_update_queue_map(skb);
454
455 return flow.ip_proto;
456}
457
Daniel Borkmann92a36992016-02-07 02:11:50 +0100458static __inline__ void act_update_drop_map(void)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200459{
460 uint32_t *count, cpu = get_smp_processor_id();
461
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100462 count = map_lookup_elem(&map_drops, &cpu);
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200463 if (count)
464 /* Only this cpu is accessing this element. */
465 (*count)++;
466}
467
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100468__section("action-mark")
469int act_mark_main(struct __sk_buff *skb)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200470{
471 /* You could also mangle skb data here with the helper function
472 * BPF_FUNC_skb_store_bytes, etc. Or, alternatively you could
473 * do that already in the classifier itself as a merged combination
474 * of classifier'n'action model.
475 */
476
477 if (skb->mark == 0xcafe) {
478 act_update_drop_map();
479 return TC_ACT_SHOT;
480 }
481
482 /* Default configured tc opcode. */
483 return TC_ACT_UNSPEC;
484}
485
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100486__section("action-rand")
487int act_rand_main(struct __sk_buff *skb)
Daniel Borkmann6256f8c2015-04-01 17:57:44 +0200488{
489 /* Sorry, we're near event horizon ... */
490 if ((get_prandom_u32() & 3) == 0) {
491 act_update_drop_map();
492 return TC_ACT_SHOT;
493 }
494
495 return TC_ACT_UNSPEC;
496}
497
498/* Last but not least, the file contains a license. Some future helper
499 * functions may only be available with a GPL license.
500 */
Daniel Borkmann41d6e332015-12-02 00:25:36 +0100501BPF_LICENSE("GPL");