Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 1 | /* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch> |
| 2 | * |
| 3 | * This program is free software; you can redistribute it and/or |
| 4 | * modify it under the terms of version 2 of the GNU General Public |
| 5 | * License as published by the Free Software Foundation. |
| 6 | * |
| 7 | * This program is distributed in the hope that it will be useful, but |
| 8 | * WITHOUT ANY WARRANTY; without even the implied warranty of |
| 9 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| 10 | * General Public License for more details. |
| 11 | */ |
| 12 | |
| 13 | #include <linux/kernel.h> |
| 14 | #include <linux/module.h> |
| 15 | #include <linux/skbuff.h> |
| 16 | #include <linux/types.h> |
| 17 | #include <linux/bpf.h> |
| 18 | #include <net/lwtunnel.h> |
| 19 | |
/* One BPF program slot of an LWT BPF encapsulation: the program
 * reference together with the name it was configured under.
 */
struct bpf_lwt_prog {
	/* Program reference stored by bpf_parse_prog(); hooks test it for
	 * NULL before running anything.
	 */
	struct bpf_prog *prog;
	/* Heap copy of the LWT_BPF_PROG_NAME attribute, released with
	 * kfree() in bpf_lwt_prog_destroy().
	 */
	char *name;
};
| 24 | |
| 25 | struct bpf_lwt { |
| 26 | struct bpf_lwt_prog in; |
| 27 | struct bpf_lwt_prog out; |
| 28 | struct bpf_lwt_prog xmit; |
| 29 | int family; |
| 30 | }; |
| 31 | |
/* Upper bound used for the LWT_BPF_PROG_NAME netlink policy length and
 * for the netlink size estimate in bpf_encap_nlsize().
 */
#define MAX_PROG_NAME 256
| 33 | |
| 34 | static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt) |
| 35 | { |
| 36 | return (struct bpf_lwt *)lwt->data; |
| 37 | } |
| 38 | |
/* Readable values for the can_redirect argument of run_lwt_bpf(). */
#define NO_REDIRECT	false
#define CAN_REDIRECT	true
| 41 | |
| 42 | static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt, |
| 43 | struct dst_entry *dst, bool can_redirect) |
| 44 | { |
| 45 | int ret; |
| 46 | |
| 47 | /* Preempt disable is needed to protect per-cpu redirect_info between |
| 48 | * BPF prog and skb_do_redirect(). The call_rcu in bpf_prog_put() and |
| 49 | * access to maps strictly require a rcu_read_lock() for protection, |
| 50 | * mixing with BH RCU lock doesn't work. |
| 51 | */ |
| 52 | preempt_disable(); |
Daniel Borkmann | 6aaae2b | 2017-09-25 02:25:50 +0200 | [diff] [blame] | 53 | bpf_compute_data_pointers(skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 54 | ret = bpf_prog_run_save_cb(lwt->prog, skb); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 55 | |
| 56 | switch (ret) { |
| 57 | case BPF_OK: |
| 58 | break; |
| 59 | |
| 60 | case BPF_REDIRECT: |
| 61 | if (unlikely(!can_redirect)) { |
| 62 | pr_warn_once("Illegal redirect return code in prog %s\n", |
| 63 | lwt->name ? : "<unknown>"); |
| 64 | ret = BPF_OK; |
| 65 | } else { |
| 66 | ret = skb_do_redirect(skb); |
| 67 | if (ret == 0) |
| 68 | ret = BPF_REDIRECT; |
| 69 | } |
| 70 | break; |
| 71 | |
| 72 | case BPF_DROP: |
| 73 | kfree_skb(skb); |
| 74 | ret = -EPERM; |
| 75 | break; |
| 76 | |
| 77 | default: |
| 78 | pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret); |
| 79 | kfree_skb(skb); |
| 80 | ret = -EINVAL; |
| 81 | break; |
| 82 | } |
| 83 | |
| 84 | preempt_enable(); |
| 85 | |
| 86 | return ret; |
| 87 | } |
| 88 | |
| 89 | static int bpf_input(struct sk_buff *skb) |
| 90 | { |
| 91 | struct dst_entry *dst = skb_dst(skb); |
| 92 | struct bpf_lwt *bpf; |
| 93 | int ret; |
| 94 | |
| 95 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 96 | if (bpf->in.prog) { |
| 97 | ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT); |
| 98 | if (ret < 0) |
| 99 | return ret; |
| 100 | } |
| 101 | |
| 102 | if (unlikely(!dst->lwtstate->orig_input)) { |
| 103 | pr_warn_once("orig_input not set on dst for prog %s\n", |
| 104 | bpf->out.name); |
| 105 | kfree_skb(skb); |
| 106 | return -EINVAL; |
| 107 | } |
| 108 | |
| 109 | return dst->lwtstate->orig_input(skb); |
| 110 | } |
| 111 | |
| 112 | static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb) |
| 113 | { |
| 114 | struct dst_entry *dst = skb_dst(skb); |
| 115 | struct bpf_lwt *bpf; |
| 116 | int ret; |
| 117 | |
| 118 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 119 | if (bpf->out.prog) { |
| 120 | ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT); |
| 121 | if (ret < 0) |
| 122 | return ret; |
| 123 | } |
| 124 | |
| 125 | if (unlikely(!dst->lwtstate->orig_output)) { |
| 126 | pr_warn_once("orig_output not set on dst for prog %s\n", |
| 127 | bpf->out.name); |
| 128 | kfree_skb(skb); |
| 129 | return -EINVAL; |
| 130 | } |
| 131 | |
| 132 | return dst->lwtstate->orig_output(net, sk, skb); |
| 133 | } |
| 134 | |
| 135 | static int xmit_check_hhlen(struct sk_buff *skb) |
| 136 | { |
| 137 | int hh_len = skb_dst(skb)->dev->hard_header_len; |
| 138 | |
| 139 | if (skb_headroom(skb) < hh_len) { |
| 140 | int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb)); |
| 141 | |
| 142 | if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC)) |
| 143 | return -ENOMEM; |
| 144 | } |
| 145 | |
| 146 | return 0; |
| 147 | } |
| 148 | |
| 149 | static int bpf_xmit(struct sk_buff *skb) |
| 150 | { |
| 151 | struct dst_entry *dst = skb_dst(skb); |
| 152 | struct bpf_lwt *bpf; |
| 153 | |
| 154 | bpf = bpf_lwt_lwtunnel(dst->lwtstate); |
| 155 | if (bpf->xmit.prog) { |
| 156 | int ret; |
| 157 | |
| 158 | ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT); |
| 159 | switch (ret) { |
| 160 | case BPF_OK: |
| 161 | /* If the header was expanded, headroom might be too |
| 162 | * small for L2 header to come, expand as needed. |
| 163 | */ |
| 164 | ret = xmit_check_hhlen(skb); |
| 165 | if (unlikely(ret)) |
| 166 | return ret; |
| 167 | |
| 168 | return LWTUNNEL_XMIT_CONTINUE; |
| 169 | case BPF_REDIRECT: |
| 170 | return LWTUNNEL_XMIT_DONE; |
| 171 | default: |
| 172 | return ret; |
| 173 | } |
| 174 | } |
| 175 | |
| 176 | return LWTUNNEL_XMIT_CONTINUE; |
| 177 | } |
| 178 | |
| 179 | static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog) |
| 180 | { |
| 181 | if (prog->prog) |
| 182 | bpf_prog_put(prog->prog); |
| 183 | |
| 184 | kfree(prog->name); |
| 185 | } |
| 186 | |
| 187 | static void bpf_destroy_state(struct lwtunnel_state *lwt) |
| 188 | { |
| 189 | struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); |
| 190 | |
| 191 | bpf_lwt_prog_destroy(&bpf->in); |
| 192 | bpf_lwt_prog_destroy(&bpf->out); |
| 193 | bpf_lwt_prog_destroy(&bpf->xmit); |
| 194 | } |
| 195 | |
| 196 | static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = { |
| 197 | [LWT_BPF_PROG_FD] = { .type = NLA_U32, }, |
| 198 | [LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING, |
| 199 | .len = MAX_PROG_NAME }, |
| 200 | }; |
| 201 | |
| 202 | static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog, |
| 203 | enum bpf_prog_type type) |
| 204 | { |
| 205 | struct nlattr *tb[LWT_BPF_PROG_MAX + 1]; |
| 206 | struct bpf_prog *p; |
| 207 | int ret; |
| 208 | u32 fd; |
| 209 | |
Johannes Berg | fceb643 | 2017-04-12 14:34:07 +0200 | [diff] [blame] | 210 | ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy, |
| 211 | NULL); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 212 | if (ret < 0) |
| 213 | return ret; |
| 214 | |
| 215 | if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME]) |
| 216 | return -EINVAL; |
| 217 | |
Taehee Yoo | 71eb525 | 2018-07-29 00:28:31 +0900 | [diff] [blame] | 218 | prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_ATOMIC); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 219 | if (!prog->name) |
| 220 | return -ENOMEM; |
| 221 | |
| 222 | fd = nla_get_u32(tb[LWT_BPF_PROG_FD]); |
| 223 | p = bpf_prog_get_type(fd, type); |
| 224 | if (IS_ERR(p)) |
| 225 | return PTR_ERR(p); |
| 226 | |
| 227 | prog->prog = p; |
| 228 | |
| 229 | return 0; |
| 230 | } |
| 231 | |
| 232 | static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = { |
| 233 | [LWT_BPF_IN] = { .type = NLA_NESTED, }, |
| 234 | [LWT_BPF_OUT] = { .type = NLA_NESTED, }, |
| 235 | [LWT_BPF_XMIT] = { .type = NLA_NESTED, }, |
| 236 | [LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 }, |
| 237 | }; |
| 238 | |
David Ahern | 30357d7 | 2017-01-30 12:07:37 -0800 | [diff] [blame] | 239 | static int bpf_build_state(struct nlattr *nla, |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 240 | unsigned int family, const void *cfg, |
David Ahern | 9ae2872 | 2017-05-27 16:19:28 -0600 | [diff] [blame] | 241 | struct lwtunnel_state **ts, |
| 242 | struct netlink_ext_ack *extack) |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 243 | { |
| 244 | struct nlattr *tb[LWT_BPF_MAX + 1]; |
| 245 | struct lwtunnel_state *newts; |
| 246 | struct bpf_lwt *bpf; |
| 247 | int ret; |
| 248 | |
| 249 | if (family != AF_INET && family != AF_INET6) |
| 250 | return -EAFNOSUPPORT; |
| 251 | |
David Ahern | 9ae2872 | 2017-05-27 16:19:28 -0600 | [diff] [blame] | 252 | ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack); |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 253 | if (ret < 0) |
| 254 | return ret; |
| 255 | |
| 256 | if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT]) |
| 257 | return -EINVAL; |
| 258 | |
| 259 | newts = lwtunnel_state_alloc(sizeof(*bpf)); |
| 260 | if (!newts) |
| 261 | return -ENOMEM; |
| 262 | |
| 263 | newts->type = LWTUNNEL_ENCAP_BPF; |
| 264 | bpf = bpf_lwt_lwtunnel(newts); |
| 265 | |
| 266 | if (tb[LWT_BPF_IN]) { |
| 267 | newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT; |
| 268 | ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in, |
| 269 | BPF_PROG_TYPE_LWT_IN); |
| 270 | if (ret < 0) |
| 271 | goto errout; |
| 272 | } |
| 273 | |
| 274 | if (tb[LWT_BPF_OUT]) { |
| 275 | newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT; |
| 276 | ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out, |
| 277 | BPF_PROG_TYPE_LWT_OUT); |
| 278 | if (ret < 0) |
| 279 | goto errout; |
| 280 | } |
| 281 | |
| 282 | if (tb[LWT_BPF_XMIT]) { |
| 283 | newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT; |
| 284 | ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit, |
| 285 | BPF_PROG_TYPE_LWT_XMIT); |
| 286 | if (ret < 0) |
| 287 | goto errout; |
| 288 | } |
| 289 | |
| 290 | if (tb[LWT_BPF_XMIT_HEADROOM]) { |
| 291 | u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]); |
| 292 | |
| 293 | if (headroom > LWT_BPF_MAX_HEADROOM) { |
| 294 | ret = -ERANGE; |
| 295 | goto errout; |
| 296 | } |
| 297 | |
| 298 | newts->headroom = headroom; |
| 299 | } |
| 300 | |
| 301 | bpf->family = family; |
| 302 | *ts = newts; |
| 303 | |
| 304 | return 0; |
| 305 | |
| 306 | errout: |
| 307 | bpf_destroy_state(newts); |
| 308 | kfree(newts); |
| 309 | return ret; |
| 310 | } |
| 311 | |
| 312 | static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr, |
| 313 | struct bpf_lwt_prog *prog) |
| 314 | { |
| 315 | struct nlattr *nest; |
| 316 | |
| 317 | if (!prog->prog) |
| 318 | return 0; |
| 319 | |
| 320 | nest = nla_nest_start(skb, attr); |
| 321 | if (!nest) |
| 322 | return -EMSGSIZE; |
| 323 | |
| 324 | if (prog->name && |
| 325 | nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name)) |
| 326 | return -EMSGSIZE; |
| 327 | |
| 328 | return nla_nest_end(skb, nest); |
| 329 | } |
| 330 | |
| 331 | static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt) |
| 332 | { |
| 333 | struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt); |
| 334 | |
| 335 | if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 || |
| 336 | bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 || |
| 337 | bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0) |
| 338 | return -EMSGSIZE; |
| 339 | |
| 340 | return 0; |
| 341 | } |
| 342 | |
| 343 | static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate) |
| 344 | { |
| 345 | int nest_len = nla_total_size(sizeof(struct nlattr)) + |
| 346 | nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */ |
| 347 | 0; |
| 348 | |
| 349 | return nest_len + /* LWT_BPF_IN */ |
| 350 | nest_len + /* LWT_BPF_OUT */ |
| 351 | nest_len + /* LWT_BPF_XMIT */ |
| 352 | 0; |
| 353 | } |
| 354 | |
Wei Yongjun | 79471b1 | 2017-01-12 14:39:28 +0000 | [diff] [blame] | 355 | static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b) |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 356 | { |
| 357 | /* FIXME: |
| 358 | * The LWT state is currently rebuilt for delete requests which |
| 359 | * results in a new bpf_prog instance. Comparing names for now. |
| 360 | */ |
| 361 | if (!a->name && !b->name) |
| 362 | return 0; |
| 363 | |
| 364 | if (!a->name || !b->name) |
| 365 | return 1; |
| 366 | |
| 367 | return strcmp(a->name, b->name); |
| 368 | } |
| 369 | |
| 370 | static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) |
| 371 | { |
| 372 | struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a); |
| 373 | struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b); |
| 374 | |
| 375 | return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) || |
| 376 | bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) || |
| 377 | bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit); |
| 378 | } |
| 379 | |
| 380 | static const struct lwtunnel_encap_ops bpf_encap_ops = { |
| 381 | .build_state = bpf_build_state, |
| 382 | .destroy_state = bpf_destroy_state, |
| 383 | .input = bpf_input, |
| 384 | .output = bpf_output, |
| 385 | .xmit = bpf_xmit, |
| 386 | .fill_encap = bpf_fill_encap_info, |
| 387 | .get_encap_size = bpf_encap_nlsize, |
| 388 | .cmp_encap = bpf_encap_cmp, |
Robert Shearman | 88ff733 | 2017-01-24 16:26:47 +0000 | [diff] [blame] | 389 | .owner = THIS_MODULE, |
Thomas Graf | 3a0af8f | 2016-11-30 17:10:10 +0100 | [diff] [blame] | 390 | }; |
| 391 | |
| 392 | static int __init bpf_lwt_init(void) |
| 393 | { |
| 394 | return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF); |
| 395 | } |
| 396 | |
| 397 | subsys_initcall(bpf_lwt_init) |