/*
 * eBPF kernel space program part
 *
 * Toy eBPF program for demonstration purposes, some parts derived from
 * kernel tree's samples/bpf/sockex2_kern.c example.
 *
 * More background on eBPF, kernel tree: Documentation/networking/filter.txt
 *
 * Note, this file is rather large; most classifiers and actions are
 * likely smaller, accomplish one specific use case, and are tailored
 * for high performance. For performance reasons, you might also merge
 * the action logic directly into the classifier.
 *
 * In order to show various features it serves as a bigger programming
 * example, which you should feel free to rip apart and experiment with.
 *
 * Compilation, configuration example:
 *
 * Note: as long as the BPF backend in LLVM is still experimental,
 * you need to build LLVM with --enable-experimental-targets=BPF
 * Also, make sure your 4.1+ kernel is compiled with CONFIG_BPF_SYSCALL=y,
 * and you have libelf.h and gelf.h headers and can link tc against -lelf.
 *
 * In case you need to sync kernel headers, go to your kernel source tree:
 * # make headers_install INSTALL_HDR_PATH=/usr/
 *
 * $ export PATH=/home/<...>/llvm/Debug+Asserts/bin/:$PATH
 * $ clang -O2 -emit-llvm -c bpf_prog.c -o - | llc -march=bpf -filetype=obj -o bpf.o
 * $ objdump -h bpf.o
 * [...]
 *   3 classifier    000007f8  0000000000000000  0000000000000000  00000040  2**3
 *                   CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *   4 action-mark   00000088  0000000000000000  0000000000000000  00000838  2**3
 *                   CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *   5 action-rand   00000098  0000000000000000  0000000000000000  000008c0  2**3
 *                   CONTENTS, ALLOC, LOAD, RELOC, READONLY, CODE
 *   6 maps          00000030  0000000000000000  0000000000000000  00000958  2**2
 *                   CONTENTS, ALLOC, LOAD, DATA
 *   7 license       00000004  0000000000000000  0000000000000000  00000988  2**0
 *                   CONTENTS, ALLOC, LOAD, DATA
 * [...]
 * # echo 1 > /proc/sys/net/core/bpf_jit_enable
 * $ gcc bpf_agent.c -o bpf_agent -Wall -O2
 * # ./bpf_agent /tmp/bpf-uds (e.g. on a different terminal)
 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *      action bpf obj bpf.o sec action-mark \
 *      action bpf obj bpf.o sec action-rand ok
 * # tc filter show dev em1
 * filter parent 1: protocol all pref 49152 bpf
 * filter parent 1: protocol all pref 49152 bpf handle 0x1 flowid 1:1 bpf.o:[classifier]
 *	action order 1: bpf bpf.o:[action-mark] default-action pipe
 *	index 52 ref 1 bind 1
 *
 *	action order 2: bpf bpf.o:[action-rand] default-action pipe
 *	index 53 ref 1 bind 1
 *
 *	action order 3: gact action pass
 *	 random type none pass val 0
 *	 index 38 ref 1 bind 1
 *
 * The same program can also be installed on the ingress side (as opposed
 * to the egress configuration above), e.g.:
 *
 * # tc qdisc add dev em1 handle ffff: ingress
 * # tc filter add dev em1 parent ffff: bpf obj ...
 *
 * Notes on the BPF agent:
 *
 * In the above example, the bpf_agent creates the unix domain socket
 * natively. "tc exec" can also spawn a shell and hold the sockets there:
 *
 * # tc exec bpf imp /tmp/bpf-uds
 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *      action bpf obj bpf.o sec action-mark \
 *      action bpf obj bpf.o sec action-rand ok
 * sh-4.2# (shell spawned from tc exec)
 * sh-4.2# bpf_agent
 * [...]
 *
 * This will read out the fds from the environment and produce the same
 * data dump as below. This has the advantage that the spawned shell owns
 * the fds, so if the agent is restarted, it can reattach to the same fds;
 * also, various programs can easily read/modify the data simultaneously
 * from the user space side.
 *
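 * For illustration, a restarted agent could pick the map fds back up from
 * the shell's environment roughly like this (minimal sketch; the exact
 * BPF_MAP%d variable naming is an assumption here, see bpf_agent.c for
 * the authoritative code):
 *
 *   char key[16], *val;
 *   int i;
 *
 *   for (i = 0; i < BPF_MAP_ID_MAX; i++) {
 *       snprintf(key, sizeof(key), "BPF_MAP%d", i);
 *       val = secure_getenv(key);
 *       if (val)
 *           fds[i] = atoi(val);
 *   }
 *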
 * If the shell is unnecessary, the agent can also just be spawned directly
 * via tc exec:
 *
 * # tc exec bpf imp /tmp/bpf-uds run bpf_agent
 * # tc filter add dev em1 parent 1: bpf obj bpf.o exp /tmp/bpf-uds flowid 1:1 \
 *      action bpf obj bpf.o sec action-mark \
 *      action bpf obj bpf.o sec action-rand ok
 *
 * BPF agent example output:
 *
 * ver: 1
 * obj: bpf.o
 * dev: 64770
 * ino: 6045133
 * maps: 3
 * map0:
 *  `- fd: 4
 *   | serial: 1
 *   | type: 1
 *   | max elem: 256
 *   | size key: 1
 *   ` size val: 16
 * map1:
 *  `- fd: 5
 *   | serial: 2
 *   | type: 1
 *   | max elem: 1024
 *   | size key: 4
 *   ` size val: 16
 * map2:
 *  `- fd: 6
 *   | serial: 3
 *   | type: 2
 *   | max elem: 64
 *   | size key: 4
 *   ` size val: 8
 * data, period: 5sec
 *  `- number of drops: cpu0: 0 cpu1: 0 cpu2: 0 cpu3: 0
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 0, mis: 0] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 0, bytes: 0] udp:[pkts: 0, bytes: 0] icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops: cpu0: 5 cpu1: 0 cpu2: 0 cpu3: 1
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 24, mis: 14] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 13, bytes: 1989] udp:[pkts: 10, bytes: 710] icmp:[pkts: 0, bytes: 0]
 * data, period: 5sec
 *  `- number of drops: cpu0: 5 cpu1: 0 cpu2: 3 cpu3: 3
 *   | nic queues: q0:[pkts: 0, mis: 0] q1:[pkts: 0, mis: 0] q2:[pkts: 39, mis: 21] q3:[pkts: 0, mis: 0]
 *   ` protos: tcp:[pkts: 20, bytes: 3549] udp:[pkts: 18, bytes: 1278] icmp:[pkts: 0, bytes: 0]
 * [...]
 *
 * This means the classifier and action pipeline below has been loaded
 * as eBPF bytecode into the kernel, the kernel has verified that the
 * execution of the bytecode is "safe", and it has JITed the programs
 * afterwards, so that upon invocation they run at native speed. tc has
 * transferred all map file descriptors to the bpf_agent via IPC, and
 * even after tc exits, the agent can read out or modify all map data.
 *
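 * For illustration, the agent could read one slot of the drops array map
 * through the bpf(2) syscall roughly as follows (minimal sketch, assuming
 * fd is the map descriptor received over the IPC channel; needs
 * <linux/bpf.h> and <sys/syscall.h>):
 *
 *   union bpf_attr attr = {};
 *   uint32_t cpu = 0;
 *   long drops;
 *
 *   attr.map_fd = fd;
 *   attr.key    = (uint64_t)(unsigned long)&cpu;
 *   attr.value  = (uint64_t)(unsigned long)&drops;
 *
 *   if (syscall(__NR_bpf, BPF_MAP_LOOKUP_ELEM, &attr, sizeof(attr)) == 0)
 *       printf("cpu%u drops: %ld\n", cpu, drops);
 *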
 * Note that the export to the unix domain socket is done only once, in
 * the classifier and not in the actions. It's enough to export the
 * (here) shared descriptors once.
 *
 * If you need to disassemble the generated JIT image (write 2 instead of
 * 1 into bpf_jit_enable above), the kernel tree has a small helper under
 * tools/net/, invoke e.g. `bpf_jit_disasm -o`.
 *
 * Please find further comments in the code below.
 *
 * -- Happy eBPF hacking! ;)
 */
#include <stdint.h>
#include <stdbool.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <asm/types.h>
#include <linux/in.h>
#include <linux/if.h>
#include <linux/if_ether.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_tunnel.h>
#include <linux/filter.h>
#include <linux/bpf.h>

/* Common, shared definitions with ebpf_agent.c. */
#include "bpf_shared.h"
/* BPF helper functions for our example. */
#include "../../include/bpf_api.h"
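
/* Not spelled out in this file: judging from their use below and the map
 * value sizes in the agent dump above (16 and 8 bytes), the shared structs
 * in bpf_shared.h are assumed to look roughly like:
 *
 *   struct count_tuple {
 *       long packets;
 *       long bytes;
 *   };
 *
 *   struct count_queue {
 *       long total;
 *       long mismatch;
 *   };
 */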

/* Could be defined here as well, or included from the header. */
#define TC_ACT_UNSPEC		(-1)
#define TC_ACT_OK		0
#define TC_ACT_RECLASSIFY	1
#define TC_ACT_SHOT		2
#define TC_ACT_PIPE		3
#define TC_ACT_STOLEN		4
#define TC_ACT_QUEUED		5
#define TC_ACT_REPEAT		6

/* Other, misc stuff. */
#define IP_MF			0x2000
#define IP_OFFSET		0x1FFF

/* eBPF map definitions, all placed in section "maps". */
struct bpf_elf_map __section("maps") map_proto = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_PROTO,
	.size_key	=	sizeof(uint8_t),
	.size_value	=	sizeof(struct count_tuple),
	.max_elem	=	256,
};

struct bpf_elf_map __section("maps") map_queue = {
	.type		=	BPF_MAP_TYPE_HASH,
	.id		=	BPF_MAP_ID_QUEUE,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(struct count_queue),
	.max_elem	=	1024,
};

struct bpf_elf_map __section("maps") map_drops = {
	.type		=	BPF_MAP_TYPE_ARRAY,
	.id		=	BPF_MAP_ID_DROPS,
	.size_key	=	sizeof(uint32_t),
	.size_value	=	sizeof(long),
	.max_elem	=	64,
};
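
/* Note on the section above: the "maps" ELF section is not kernel ABI.
 * tc's ELF loader parses these struct bpf_elf_map entries, creates the
 * actual kernel maps via the bpf(2) syscall and patches the resulting
 * map fds into the program instructions before loading. The .id field
 * tags each map so that user space (here: the agent) can tell the
 * transferred descriptors apart.
 */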

/* Helper functions and definitions for the flow dissector used by the
 * example classifier. This resembles the kernel's flow dissector to
 * some extent and is just used as an example to show what's possible
 * with eBPF.
 */
struct sockaddr;

struct vlan_hdr {
	__be16 h_vlan_TCI;
	__be16 h_vlan_encapsulated_proto;
};

struct flow_keys {
	__u32 src;
	__u32 dst;
	union {
		__u32 ports;
		__u16 port16[2];
	};
	__s32 th_off;
	__u8 ip_proto;
};

static __inline__ int flow_ports_offset(__u8 ip_proto)
{
	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE:
	default:
		return 0;
	case IPPROTO_AH:
		return 4;
	}
}

static __inline__ bool flow_is_frag(struct __sk_buff *skb, int nh_off)
{
	return !!(load_half(skb, nh_off + offsetof(struct iphdr, frag_off)) &
		  (IP_MF | IP_OFFSET));
}

static __inline__ int flow_parse_ipv4(struct __sk_buff *skb, int nh_off,
				      __u8 *ip_proto, struct flow_keys *flow)
{
	__u8 ip_ver_len;

	if (unlikely(flow_is_frag(skb, nh_off)))
		*ip_proto = 0;
	else
		*ip_proto = load_byte(skb, nh_off + offsetof(struct iphdr,
							     protocol));
	if (*ip_proto != IPPROTO_GRE) {
		flow->src = load_word(skb, nh_off + offsetof(struct iphdr, saddr));
		flow->dst = load_word(skb, nh_off + offsetof(struct iphdr, daddr));
	}

	ip_ver_len = load_byte(skb, nh_off + 0 /* offsetof(struct iphdr, ihl) */);
	if (likely(ip_ver_len == 0x45))
		nh_off += 20;
	else
		nh_off += (ip_ver_len & 0xF) << 2;

	return nh_off;
}

static __inline__ __u32 flow_addr_hash_ipv6(struct __sk_buff *skb, int off)
{
	__u32 w0 = load_word(skb, off);
	__u32 w1 = load_word(skb, off + sizeof(w0));
	__u32 w2 = load_word(skb, off + sizeof(w0) * 2);
	__u32 w3 = load_word(skb, off + sizeof(w0) * 3);

	return w0 ^ w1 ^ w2 ^ w3;
}

static __inline__ int flow_parse_ipv6(struct __sk_buff *skb, int nh_off,
				      __u8 *ip_proto, struct flow_keys *flow)
{
	*ip_proto = load_byte(skb, nh_off + offsetof(struct ipv6hdr, nexthdr));

	flow->src = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, saddr));
	flow->dst = flow_addr_hash_ipv6(skb, nh_off + offsetof(struct ipv6hdr, daddr));

	return nh_off + sizeof(struct ipv6hdr);
}
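
/* Note that for brevity the example takes nexthdr directly and does not
 * walk IPv6 extension headers; a full dissector would have to follow the
 * extension header chain to find the transport protocol.
 */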

static __inline__ bool flow_dissector(struct __sk_buff *skb,
				      struct flow_keys *flow)
{
	int nh_off = BPF_LL_OFF + ETH_HLEN;
	__be16 proto = skb->protocol;
	__u8 ip_proto;

	/* TODO: check for skb->vlan_tci, skb->vlan_proto first */
	if (proto == htons(ETH_P_8021AD)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}
	if (proto == htons(ETH_P_8021Q)) {
		proto = load_half(skb, nh_off +
				  offsetof(struct vlan_hdr, h_vlan_encapsulated_proto));
		nh_off += sizeof(struct vlan_hdr);
	}

	if (likely(proto == htons(ETH_P_IP)))
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
	else if (proto == htons(ETH_P_IPV6))
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
	else
		return false;

	switch (ip_proto) {
	case IPPROTO_GRE: {
		struct gre_hdr {
			__be16 flags;
			__be16 proto;
		};

		__u16 gre_flags = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, flags));
		__u16 gre_proto = load_half(skb, nh_off +
					    offsetof(struct gre_hdr, proto));

		if (gre_flags & (GRE_VERSION | GRE_ROUTING))
			break;

		nh_off += 4;
		if (gre_flags & GRE_CSUM)
			nh_off += 4;
		if (gre_flags & GRE_KEY)
			nh_off += 4;
		if (gre_flags & GRE_SEQ)
			nh_off += 4;

		if (gre_proto == ETH_P_8021Q) {
			gre_proto = load_half(skb, nh_off +
					      offsetof(struct vlan_hdr,
						       h_vlan_encapsulated_proto));
			nh_off += sizeof(struct vlan_hdr);
		}
		if (gre_proto == ETH_P_IP)
			nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		else if (gre_proto == ETH_P_IPV6)
			nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		else
			return false;
		break;
	}
	case IPPROTO_IPIP:
		nh_off = flow_parse_ipv4(skb, nh_off, &ip_proto, flow);
		break;
	case IPPROTO_IPV6:
		nh_off = flow_parse_ipv6(skb, nh_off, &ip_proto, flow);
		break;
	default:
		break;
	}

	nh_off += flow_ports_offset(ip_proto);

	flow->ports = load_word(skb, nh_off);
	flow->th_off = nh_off;
	flow->ip_proto = ip_proto;

	return true;
}

static __inline__ void cls_update_proto_map(const struct __sk_buff *skb,
					    const struct flow_keys *flow)
{
	uint8_t proto = flow->ip_proto;
	struct count_tuple *ct, _ct;

	ct = map_lookup_elem(&map_proto, &proto);
	if (likely(ct)) {
		lock_xadd(&ct->packets, 1);
		lock_xadd(&ct->bytes, skb->len);
		return;
	}

	/* No hit yet, we need to create a new entry. */
	_ct.packets = 1;
	_ct.bytes = skb->len;

	map_update_elem(&map_proto, &proto, &_ct, BPF_ANY);
}

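/* Note on the miss path above (and the analogous one below): two CPUs can
 * race to create the first entry for the same key, in which case the later
 * map_update_elem() with BPF_ANY simply overwrites the earlier one and a
 * packet may go uncounted. For this statistics example, that is acceptable.
 */
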
static __inline__ void cls_update_queue_map(const struct __sk_buff *skb)
{
	uint32_t queue = skb->queue_mapping;
	struct count_queue *cq, _cq;
	bool mismatch;

	mismatch = skb->queue_mapping != get_smp_processor_id();

	cq = map_lookup_elem(&map_queue, &queue);
	if (likely(cq)) {
		lock_xadd(&cq->total, 1);
		if (mismatch)
			lock_xadd(&cq->mismatch, 1);
		return;
	}

	/* No hit yet, we need to create a new entry. */
	_cq.total = 1;
	_cq.mismatch = mismatch ? 1 : 0;

	map_update_elem(&map_queue, &queue, &_cq, BPF_ANY);
}

/* eBPF program definitions, placed in various sections, which can
 * have custom section names. If custom names are in use, it's
 * required to point tc to the correct section, e.g.
 *
 *   tc filter add [...] bpf obj cls.o sec cls-tos [...]
 *
 * in case the program resides in __section("cls-tos").
 *
 * The default section for cls_bpf is "classifier", for act_bpf it is
 * "action". Naturally, if for example multiple actions are present
 * in the same file, they need to have distinct section names.
 *
 * It is, however, not required that multiple programs share one file.
 */
__section("classifier")
int cls_main(struct __sk_buff *skb)
{
	struct flow_keys flow;

	if (!flow_dissector(skb, &flow))
		return 0; /* No match in cls_bpf. */

	cls_update_proto_map(skb, &flow);
	cls_update_queue_map(skb);

	return flow.ip_proto;
}
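
/* A note on the cls_main() return value (cls_bpf semantics): returning 0
 * means mismatch, returning -1 selects the default classid configured via
 * tc (flowid 1:1 in the header example), and any other value overrides the
 * classid, i.e. here the IP protocol number ends up as the minor classid.
 */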

static __inline__ void act_update_drop_map(void)
{
	uint32_t *count, cpu = get_smp_processor_id();

	count = map_lookup_elem(&map_drops, &cpu);
	if (count)
		/* Only this cpu is accessing this element. */
		(*count)++;
}
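
/* A plain, non-atomic increment is fine here: map_drops is an array
 * indexed by cpu id and the program cannot be migrated while running,
 * so no other cpu writes to the same slot; the hash maps above need
 * lock_xadd() instead, since their entries are shared across cpus.
 */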

__section("action-mark")
int act_mark_main(struct __sk_buff *skb)
{
	/* You could also mangle skb data here with the helper function
	 * BPF_FUNC_skb_store_bytes, etc. Alternatively, you could do
	 * that already in the classifier itself, as a merged
	 * classifier'n'action combination.
	 */

	if (skb->mark == 0xcafe) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	/* Default configured tc opcode. */
	return TC_ACT_UNSPEC;
}

__section("action-rand")
int act_rand_main(struct __sk_buff *skb)
{
	/* Sorry, we're near event horizon ... */
	if ((get_prandom_u32() & 3) == 0) {
		act_update_drop_map();
		return TC_ACT_SHOT;
	}

	return TC_ACT_UNSPEC;
}

/* Last but not least, the file contains a license. Some future helper
 * functions may only be available with a GPL license.
 */
BPF_LICENSE("GPL");