/* Copyright (c) 2016 Thomas Graf <tgraf@tgraf.ch>
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/skbuff.h>
#include <linux/types.h>
#include <linux/bpf.h>
#include <net/lwtunnel.h>

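/* Example setup, assuming an iproute2 build with LWT BPF support;
 * the object file and section names below are illustrative:
 *
 *   ip route add 10.0.0.0/24 encap bpf xmit obj prog.o section xmit dev eth0
 */

/* One BPF program attached to a light-weight tunnel hook, along with
 * the user-supplied name reported in dumps and used for comparison.
 */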
struct bpf_lwt_prog {
	struct bpf_prog *prog;
	char *name;
};

struct bpf_lwt {
	struct bpf_lwt_prog in;
	struct bpf_lwt_prog out;
	struct bpf_lwt_prog xmit;
	int family;
};

#define MAX_PROG_NAME 256

static inline struct bpf_lwt *bpf_lwt_lwtunnel(struct lwtunnel_state *lwt)
{
	return (struct bpf_lwt *)lwt->data;
}

#define NO_REDIRECT false
#define CAN_REDIRECT true

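/* Run the given program on skb and translate its verdict into a
 * kernel return code. BPF_REDIRECT is only honoured when can_redirect
 * is true, i.e. on the xmit hook; the skb is consumed on BPF_DROP and
 * on illegal return values.
 */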
static int run_lwt_bpf(struct sk_buff *skb, struct bpf_lwt_prog *lwt,
		       struct dst_entry *dst, bool can_redirect)
{
	int ret;

	/* Disabling preemption is needed to protect the per-CPU
	 * redirect_info between the BPF prog and skb_do_redirect().
	 * The call_rcu() in bpf_prog_put() and access to maps strictly
	 * require rcu_read_lock() for protection; mixing it with the
	 * BH RCU lock doesn't work.
	 */
	preempt_disable();
	rcu_read_lock();
	bpf_compute_data_end(skb);
	ret = bpf_prog_run_save_cb(lwt->prog, skb);
	rcu_read_unlock();

	switch (ret) {
	case BPF_OK:
		break;

	case BPF_REDIRECT:
		if (unlikely(!can_redirect)) {
			pr_warn_once("Illegal redirect return code in prog %s\n",
				     lwt->name ? : "<unknown>");
			ret = BPF_OK;
		} else {
			ret = skb_do_redirect(skb);
			if (ret == 0)
				ret = BPF_REDIRECT;
		}
		break;

	case BPF_DROP:
		kfree_skb(skb);
		ret = -EPERM;
		break;

	default:
		pr_warn_once("bpf-lwt: Illegal return value %u, expect packet loss\n", ret);
		kfree_skb(skb);
		ret = -EINVAL;
		break;
	}

	preempt_enable();

	return ret;
}

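/* dst input hook: run the LWT_IN program, then hand the skb to the
 * original protocol input handler saved in orig_input.
 */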
static int bpf_input(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->in.prog) {
		ret = run_lwt_bpf(skb, &bpf->in, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_input)) {
		pr_warn_once("orig_input not set on dst for prog %s\n",
			     bpf->in.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_input(skb);
}

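/* dst output hook: run the LWT_OUT program, then continue with the
 * original protocol output handler saved in orig_output.
 */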
static int bpf_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;
	int ret;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->out.prog) {
		ret = run_lwt_bpf(skb, &bpf->out, dst, NO_REDIRECT);
		if (ret < 0)
			return ret;
	}

	if (unlikely(!dst->lwtstate->orig_output)) {
		pr_warn_once("orig_output not set on dst for prog %s\n",
			     bpf->out.name);
		kfree_skb(skb);
		return -EINVAL;
	}

	return dst->lwtstate->orig_output(net, sk, skb);
}

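/* Ensure the headroom can still fit the hard header of the outgoing
 * device; reallocate the skb head if it cannot.
 */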
static int xmit_check_hhlen(struct sk_buff *skb)
{
	int hh_len = skb_dst(skb)->dev->hard_header_len;

	if (skb_headroom(skb) < hh_len) {
		int nhead = HH_DATA_ALIGN(hh_len - skb_headroom(skb));

		if (pskb_expand_head(skb, nhead, 0, GFP_ATOMIC))
			return -ENOMEM;
	}

	return 0;
}

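/* dst xmit hook: run the LWT_XMIT program with redirects allowed and
 * map its verdict onto the LWTUNNEL_XMIT_* return codes.
 */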
static int bpf_xmit(struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);
	struct bpf_lwt *bpf;

	bpf = bpf_lwt_lwtunnel(dst->lwtstate);
	if (bpf->xmit.prog) {
		int ret;

		ret = run_lwt_bpf(skb, &bpf->xmit, dst, CAN_REDIRECT);
		switch (ret) {
		case BPF_OK:
			/* If the program expanded the header, the
			 * headroom might now be too small for the L2
			 * header that is yet to be pushed; expand as
			 * needed.
			 */
			ret = xmit_check_hhlen(skb);
			if (unlikely(ret))
				return ret;

			return LWTUNNEL_XMIT_CONTINUE;
		case BPF_REDIRECT:
			return LWTUNNEL_XMIT_DONE;
		default:
			return ret;
		}
	}

	return LWTUNNEL_XMIT_CONTINUE;
}

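/* Drop the reference on the attached program and free its name. */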
static void bpf_lwt_prog_destroy(struct bpf_lwt_prog *prog)
{
	if (prog->prog)
		bpf_prog_put(prog->prog);

	kfree(prog->name);
}

static void bpf_destroy_state(struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	bpf_lwt_prog_destroy(&bpf->in);
	bpf_lwt_prog_destroy(&bpf->out);
	bpf_lwt_prog_destroy(&bpf->xmit);
}

static const struct nla_policy bpf_prog_policy[LWT_BPF_PROG_MAX + 1] = {
	[LWT_BPF_PROG_FD]   = { .type = NLA_U32, },
	[LWT_BPF_PROG_NAME] = { .type = NLA_NUL_STRING,
				.len = MAX_PROG_NAME },
};

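/* Parse one nested LWT_BPF_{IN,OUT,XMIT} attribute: duplicate the
 * program name and take a reference on the program file descriptor.
 */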
static int bpf_parse_prog(struct nlattr *attr, struct bpf_lwt_prog *prog,
			  enum bpf_prog_type type)
{
	struct nlattr *tb[LWT_BPF_PROG_MAX + 1];
	struct bpf_prog *p;
	int ret;
	u32 fd;

	ret = nla_parse_nested(tb, LWT_BPF_PROG_MAX, attr, bpf_prog_policy,
			       NULL);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_PROG_FD] || !tb[LWT_BPF_PROG_NAME])
		return -EINVAL;

	prog->name = nla_memdup(tb[LWT_BPF_PROG_NAME], GFP_KERNEL);
	if (!prog->name)
		return -ENOMEM;

	fd = nla_get_u32(tb[LWT_BPF_PROG_FD]);
	p = bpf_prog_get_type(fd, type);
	if (IS_ERR(p))
		return PTR_ERR(p);

	prog->prog = p;

	return 0;
}

static const struct nla_policy bpf_nl_policy[LWT_BPF_MAX + 1] = {
	[LWT_BPF_IN]            = { .type = NLA_NESTED, },
	[LWT_BPF_OUT]           = { .type = NLA_NESTED, },
	[LWT_BPF_XMIT]          = { .type = NLA_NESTED, },
	[LWT_BPF_XMIT_HEADROOM] = { .type = NLA_U32 },
};

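/* Netlink build_state handler: allocate a new lwtunnel state and
 * attach the requested programs plus the optional xmit headroom.
 */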
static int bpf_build_state(struct nlattr *nla,
			   unsigned int family, const void *cfg,
			   struct lwtunnel_state **ts,
			   struct netlink_ext_ack *extack)
{
	struct nlattr *tb[LWT_BPF_MAX + 1];
	struct lwtunnel_state *newts;
	struct bpf_lwt *bpf;
	int ret;

	if (family != AF_INET && family != AF_INET6)
		return -EAFNOSUPPORT;

	ret = nla_parse_nested(tb, LWT_BPF_MAX, nla, bpf_nl_policy, extack);
	if (ret < 0)
		return ret;

	if (!tb[LWT_BPF_IN] && !tb[LWT_BPF_OUT] && !tb[LWT_BPF_XMIT])
		return -EINVAL;

	newts = lwtunnel_state_alloc(sizeof(*bpf));
	if (!newts)
		return -ENOMEM;

	newts->type = LWTUNNEL_ENCAP_BPF;
	bpf = bpf_lwt_lwtunnel(newts);

	if (tb[LWT_BPF_IN]) {
		newts->flags |= LWTUNNEL_STATE_INPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_IN], &bpf->in,
				     BPF_PROG_TYPE_LWT_IN);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_OUT]) {
		newts->flags |= LWTUNNEL_STATE_OUTPUT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_OUT], &bpf->out,
				     BPF_PROG_TYPE_LWT_OUT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT]) {
		newts->flags |= LWTUNNEL_STATE_XMIT_REDIRECT;
		ret = bpf_parse_prog(tb[LWT_BPF_XMIT], &bpf->xmit,
				     BPF_PROG_TYPE_LWT_XMIT);
		if (ret < 0)
			goto errout;
	}

	if (tb[LWT_BPF_XMIT_HEADROOM]) {
		u32 headroom = nla_get_u32(tb[LWT_BPF_XMIT_HEADROOM]);

		if (headroom > LWT_BPF_MAX_HEADROOM) {
			ret = -ERANGE;
			goto errout;
		}

		newts->headroom = headroom;
	}

	bpf->family = family;
	*ts = newts;

	return 0;

errout:
	bpf_destroy_state(newts);
	kfree(newts);
	return ret;
}

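/* Dump one program as a nested attribute; only the name is emitted,
 * the program itself is not included in the dump.
 */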
static int bpf_fill_lwt_prog(struct sk_buff *skb, int attr,
			     struct bpf_lwt_prog *prog)
{
	struct nlattr *nest;

	if (!prog->prog)
		return 0;

	nest = nla_nest_start(skb, attr);
	if (!nest)
		return -EMSGSIZE;

	if (prog->name &&
	    nla_put_string(skb, LWT_BPF_PROG_NAME, prog->name))
		return -EMSGSIZE;

	return nla_nest_end(skb, nest);
}

static int bpf_fill_encap_info(struct sk_buff *skb, struct lwtunnel_state *lwt)
{
	struct bpf_lwt *bpf = bpf_lwt_lwtunnel(lwt);

	if (bpf_fill_lwt_prog(skb, LWT_BPF_IN, &bpf->in) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_OUT, &bpf->out) < 0 ||
	    bpf_fill_lwt_prog(skb, LWT_BPF_XMIT, &bpf->xmit) < 0)
		return -EMSGSIZE;

	return 0;
}

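/* Worst-case netlink size of the encap info: one nest carrying a
 * program name per hook.
 */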
static int bpf_encap_nlsize(struct lwtunnel_state *lwtstate)
{
	int nest_len = nla_total_size(sizeof(struct nlattr)) +
		       nla_total_size(MAX_PROG_NAME) + /* LWT_BPF_PROG_NAME */
		       0;

	return nest_len + /* LWT_BPF_IN */
	       nest_len + /* LWT_BPF_OUT */
	       nest_len + /* LWT_BPF_XMIT */
	       0;
}

static int bpf_lwt_prog_cmp(struct bpf_lwt_prog *a, struct bpf_lwt_prog *b)
{
	/* FIXME:
	 * The LWT state is currently rebuilt for delete requests which
	 * results in a new bpf_prog instance. Comparing names for now.
	 */
	if (!a->name && !b->name)
		return 0;

	if (!a->name || !b->name)
		return 1;

	return strcmp(a->name, b->name);
}

static int bpf_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b)
{
	struct bpf_lwt *a_bpf = bpf_lwt_lwtunnel(a);
	struct bpf_lwt *b_bpf = bpf_lwt_lwtunnel(b);

	return bpf_lwt_prog_cmp(&a_bpf->in, &b_bpf->in) ||
	       bpf_lwt_prog_cmp(&a_bpf->out, &b_bpf->out) ||
	       bpf_lwt_prog_cmp(&a_bpf->xmit, &b_bpf->xmit);
}

static const struct lwtunnel_encap_ops bpf_encap_ops = {
	.build_state	= bpf_build_state,
	.destroy_state	= bpf_destroy_state,
	.input		= bpf_input,
	.output		= bpf_output,
	.xmit		= bpf_xmit,
	.fill_encap	= bpf_fill_encap_info,
	.get_encap_size	= bpf_encap_nlsize,
	.cmp_encap	= bpf_encap_cmp,
	.owner		= THIS_MODULE,
};

static int __init bpf_lwt_init(void)
{
	return lwtunnel_encap_add_ops(&bpf_encap_ops, LWTUNNEL_ENCAP_BPF);
}

subsys_initcall(bpf_lwt_init)