/* Copyright (c) 2011-2014 PLUMgrid, http://plumgrid.com
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of version 2 of the GNU General Public
 * License as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful, but
 * WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 */
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/slab.h>
#include <linux/bpf.h>
#include <linux/filter.h>
#include <net/netlink.h>
#include <linux/file.h>
#include <linux/vmalloc.h>

/* bpf_check() is a static code analyzer that walks the eBPF program
 * instruction by instruction and updates register/stack state.
 * All paths of conditional branches are analyzed until the 'bpf_exit' insn.
 *
 * The first pass is a depth-first-search to check that the program is a DAG.
 * It rejects the following programs:
 * - larger than BPF_MAXINSNS insns
 * - if loop is present (detected via back-edge)
 * - unreachable insns exist (shouldn't be a forest. program = one function)
 * - out of bounds or malformed jumps
 * The second pass is all possible path descent from the 1st insn.
 * Since it's analyzing all paths through the program, the length of the
 * analysis is limited to 32k insn, which may be hit even when the total
 * number of insns is less than 4K if there are too many branches that
 * change stack/regs. The number of 'branches to be analyzed' is limited to 1k.
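 *
 * For example (an illustrative sketch, not taken from a real program), the
 * first pass rejects this trivial infinite loop via back-edge detection,
 * since 'goto pc-1' jumps back to the same instruction:
 * BPF_JMP_IMM(BPF_JA, 0, 0, -1),
 * BPF_EXIT_INSN(),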
 *
 * On entry to each instruction, each register has a type, and the instruction
 * changes the types of the registers depending on instruction semantics.
 * If the instruction is BPF_MOV64_REG(BPF_REG_1, BPF_REG_5), then the type
 * of R5 is copied to R1.
 *
 * All registers are 64-bit.
 * R0 - return register
 * R1-R5 argument passing registers
 * R6-R9 callee saved registers
 * R10 - frame pointer read-only
 *
 * At the start of the BPF program the register R1 contains a pointer to
 * bpf_context and has type PTR_TO_CTX.
 *
 * The verifier tracks arithmetic operations on pointers, e.g.:
 * BPF_MOV64_REG(BPF_REG_1, BPF_REG_10),
 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_1, -20),
 * The 1st insn copies R10 (which has FRAME_PTR type) into R1
 * and the 2nd arithmetic instruction is pattern matched to recognize
 * that it wants to construct a pointer to some element within stack.
 * So after the 2nd insn, the register R1 has type PTR_TO_STACK
 * (and the -20 constant is saved for further stack bounds checking),
 * meaning that this reg is a pointer to stack plus a known immediate constant.
 *
 * Most of the time the registers have UNKNOWN_VALUE type, which
 * means the register has some value, but it's not a valid pointer
 * (e.g. pointer plus pointer becomes UNKNOWN_VALUE type).
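 *
 * A hedged illustration of that rule (assumes R1 still holds its initial
 * PTR_TO_CTX value):
 * BPF_MOV64_REG(BPF_REG_3, BPF_REG_10),         // R3 type is FRAME_PTR
 * BPF_ALU64_REG(BPF_ADD, BPF_REG_3, BPF_REG_1), // pointer plus pointer
 * After the 2nd insn R3 has UNKNOWN_VALUE type and may no longer be
 * dereferenced.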
 *
 * When the verifier sees load or store instructions the type of the base
 * register can be: PTR_TO_MAP_VALUE, PTR_TO_CTX, FRAME_PTR. These are the
 * three pointer types recognized by the check_mem_access() function.
 *
 * PTR_TO_MAP_VALUE means that this register is pointing to 'map element value'
 * and the range of [ptr, ptr + map's value_size) is accessible.
 *
 * Registers used to pass values to function calls are checked against
 * function argument constraints.
 *
 * ARG_PTR_TO_MAP_KEY is one such argument constraint.
 * It means that the register type passed to this function must be
 * PTR_TO_STACK and it will be used inside the function as
 * 'pointer to map element key'.
 *
 * For example the argument constraints for bpf_map_lookup_elem():
 * .ret_type = RET_PTR_TO_MAP_VALUE_OR_NULL,
 * .arg1_type = ARG_CONST_MAP_PTR,
 * .arg2_type = ARG_PTR_TO_MAP_KEY,
 *
 * ret_type says that this function returns 'pointer to map elem value or null'.
 * The function expects the 1st argument to be a const pointer to 'struct
 * bpf_map' and the 2nd argument to be a pointer to stack, which will be used
 * inside the helper function as a pointer to map element key.
 *
 * On the kernel side the helper function looks like:
 * u64 bpf_map_lookup_elem(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
 * {
 *    struct bpf_map *map = (struct bpf_map *) (unsigned long) r1;
 *    void *key = (void *) (unsigned long) r2;
 *    void *value;
 *
 *    here kernel can access 'key' and 'map' pointers safely, knowing that
 *    [key, key + map->key_size) bytes are valid and were initialized on
 *    the stack of the eBPF program.
 * }
 *
 * The corresponding eBPF program may look like:
 * BPF_MOV64_REG(BPF_REG_2, BPF_REG_10),  // after this insn R2 type is FRAME_PTR
 * BPF_ALU64_IMM(BPF_ADD, BPF_REG_2, -4), // after this insn R2 type is PTR_TO_STACK
 * BPF_LD_MAP_FD(BPF_REG_1, map_fd),      // after this insn R1 type is CONST_PTR_TO_MAP
 * BPF_RAW_INSN(BPF_JMP | BPF_CALL, 0, 0, 0, BPF_FUNC_map_lookup_elem),
 * Here the verifier looks at the prototype of map_lookup_elem() and sees:
 * .arg1_type == ARG_CONST_MAP_PTR and R1->type == CONST_PTR_TO_MAP, which is ok,
 * so now the verifier knows that this map has a key of R1->map_ptr->key_size bytes.
 *
 * Then .arg2_type == ARG_PTR_TO_MAP_KEY and R2->type == PTR_TO_STACK, ok so far.
 * Now the verifier checks that [R2, R2 + map's key_size) are within stack limits
 * and were initialized prior to this call.
 * If that's ok, the verifier allows this BPF_CALL insn and looks at
 * .ret_type, which is RET_PTR_TO_MAP_VALUE_OR_NULL, so it sets
 * R0->type = PTR_TO_MAP_VALUE_OR_NULL, which means that bpf_map_lookup_elem()
 * returns either a pointer to map value or NULL.
 *
 * When the type PTR_TO_MAP_VALUE_OR_NULL passes through an 'if (reg != 0) goto +off'
 * insn, the register holding that pointer in the true branch changes state to
 * PTR_TO_MAP_VALUE and the same register changes state to CONST_IMM in the false
 * branch. See check_cond_jmp_op().
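 *
 * An illustrative continuation of the example above (a sketch; assumes the
 * map's value_size is at least 8 bytes and uses macros from linux/filter.h):
 * BPF_JMP_IMM(BPF_JEQ, BPF_REG_0, 0, 1), // if R0 == NULL, skip the store
 * BPF_ST_MEM(BPF_DW, BPF_REG_0, 0, 1),   // here R0 type is PTR_TO_MAP_VALUE
 * BPF_MOV64_IMM(BPF_REG_0, 0),
 * BPF_EXIT_INSN(),
 * The store is reached only on the fall-through (non-NULL) path, so the
 * verifier accepts it.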
 *
 * After the call, R0 is set to the return type of the function and registers
 * R1-R5 are set to NOT_INIT to indicate that they are no longer readable.
 */

/* single container for all structs
 * one verifier_env per bpf_check() call
 */
struct verifier_env {
};

/* verbose verifier prints what it's seeing
 * bpf_check() is called under lock, so no race to access these global vars
 */
static u32 log_level, log_size, log_len;
static char *log_buf;

static DEFINE_MUTEX(bpf_verifier_lock);

/* log_level controls the verbosity level of the eBPF verifier.
 * verbose() is used to dump the verification trace to the log, so the user
 * can figure out what's wrong with the program
 */
static void verbose(const char *fmt, ...)
{
        va_list args;

        if (log_level == 0 || log_len >= log_size - 1)
                return;

        va_start(args, fmt);
        log_len += vscnprintf(log_buf + log_len, log_size - log_len, fmt, args);
        va_end(args);
}

static const char *const bpf_class_string[] = {
        [BPF_LD]    = "ld",
        [BPF_LDX]   = "ldx",
        [BPF_ST]    = "st",
        [BPF_STX]   = "stx",
        [BPF_ALU]   = "alu",
        [BPF_JMP]   = "jmp",
        [BPF_RET]   = "BUG",
        [BPF_ALU64] = "alu64",
};

static const char *const bpf_alu_string[] = {
        [BPF_ADD >> 4]  = "+=",
        [BPF_SUB >> 4]  = "-=",
        [BPF_MUL >> 4]  = "*=",
        [BPF_DIV >> 4]  = "/=",
        [BPF_OR >> 4]   = "|=",
        [BPF_AND >> 4]  = "&=",
        [BPF_LSH >> 4]  = "<<=",
        [BPF_RSH >> 4]  = ">>=",
        [BPF_NEG >> 4]  = "neg",
        [BPF_MOD >> 4]  = "%=",
        [BPF_XOR >> 4]  = "^=",
        [BPF_MOV >> 4]  = "=",
        [BPF_ARSH >> 4] = "s>>=",
        [BPF_END >> 4]  = "endian",
};

static const char *const bpf_ldst_string[] = {
        [BPF_W >> 3]  = "u32",
        [BPF_H >> 3]  = "u16",
        [BPF_B >> 3]  = "u8",
        [BPF_DW >> 3] = "u64",
};

static const char *const bpf_jmp_string[] = {
        [BPF_JA >> 4]   = "jmp",
        [BPF_JEQ >> 4]  = "==",
        [BPF_JGT >> 4]  = ">",
        [BPF_JGE >> 4]  = ">=",
        [BPF_JSET >> 4] = "&",
        [BPF_JNE >> 4]  = "!=",
        [BPF_JSGT >> 4] = "s>",
        [BPF_JSGE >> 4] = "s>=",
        [BPF_CALL >> 4] = "call",
        [BPF_EXIT >> 4] = "exit",
};

static void print_bpf_insn(struct bpf_insn *insn)
{
        u8 class = BPF_CLASS(insn->code);

        if (class == BPF_ALU || class == BPF_ALU64) {
                if (BPF_SRC(insn->code) == BPF_X)
                        verbose("(%02x) %sr%d %s %sr%d\n",
                                insn->code, class == BPF_ALU ? "(u32) " : "",
                                insn->dst_reg,
                                bpf_alu_string[BPF_OP(insn->code) >> 4],
                                class == BPF_ALU ? "(u32) " : "",
                                insn->src_reg);
                else
                        verbose("(%02x) %sr%d %s %s%d\n",
                                insn->code, class == BPF_ALU ? "(u32) " : "",
                                insn->dst_reg,
                                bpf_alu_string[BPF_OP(insn->code) >> 4],
                                class == BPF_ALU ? "(u32) " : "",
                                insn->imm);
        } else if (class == BPF_STX) {
                if (BPF_MODE(insn->code) == BPF_MEM)
                        verbose("(%02x) *(%s *)(r%d %+d) = r%d\n",
                                insn->code,
                                bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                                insn->dst_reg,
                                insn->off, insn->src_reg);
                else if (BPF_MODE(insn->code) == BPF_XADD)
                        verbose("(%02x) lock *(%s *)(r%d %+d) += r%d\n",
                                insn->code,
                                bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                                insn->dst_reg, insn->off,
                                insn->src_reg);
                else
                        verbose("BUG_%02x\n", insn->code);
        } else if (class == BPF_ST) {
                if (BPF_MODE(insn->code) != BPF_MEM) {
                        verbose("BUG_st_%02x\n", insn->code);
                        return;
                }
                verbose("(%02x) *(%s *)(r%d %+d) = %d\n",
                        insn->code,
                        bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                        insn->dst_reg,
                        insn->off, insn->imm);
        } else if (class == BPF_LDX) {
                if (BPF_MODE(insn->code) != BPF_MEM) {
                        verbose("BUG_ldx_%02x\n", insn->code);
                        return;
                }
                verbose("(%02x) r%d = *(%s *)(r%d %+d)\n",
                        insn->code, insn->dst_reg,
                        bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                        insn->src_reg, insn->off);
        } else if (class == BPF_LD) {
                if (BPF_MODE(insn->code) == BPF_ABS) {
                        verbose("(%02x) r0 = *(%s *)skb[%d]\n",
                                insn->code,
                                bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                                insn->imm);
                } else if (BPF_MODE(insn->code) == BPF_IND) {
                        verbose("(%02x) r0 = *(%s *)skb[r%d + %d]\n",
                                insn->code,
                                bpf_ldst_string[BPF_SIZE(insn->code) >> 3],
                                insn->src_reg, insn->imm);
                } else if (BPF_MODE(insn->code) == BPF_IMM) {
                        verbose("(%02x) r%d = 0x%x\n",
                                insn->code, insn->dst_reg, insn->imm);
                } else {
                        verbose("BUG_ld_%02x\n", insn->code);
                        return;
                }
        } else if (class == BPF_JMP) {
                u8 opcode = BPF_OP(insn->code);

                if (opcode == BPF_CALL) {
                        verbose("(%02x) call %d\n", insn->code, insn->imm);
                } else if (insn->code == (BPF_JMP | BPF_JA)) {
                        verbose("(%02x) goto pc%+d\n",
                                insn->code, insn->off);
                } else if (insn->code == (BPF_JMP | BPF_EXIT)) {
                        verbose("(%02x) exit\n", insn->code);
                } else if (BPF_SRC(insn->code) == BPF_X) {
                        verbose("(%02x) if r%d %s r%d goto pc%+d\n",
                                insn->code, insn->dst_reg,
                                bpf_jmp_string[BPF_OP(insn->code) >> 4],
                                insn->src_reg, insn->off);
                } else {
                        verbose("(%02x) if r%d %s 0x%x goto pc%+d\n",
                                insn->code, insn->dst_reg,
                                bpf_jmp_string[BPF_OP(insn->code) >> 4],
                                insn->imm, insn->off);
                }
        } else {
                verbose("(%02x) %s\n", insn->code, bpf_class_string[class]);
        }
}

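/* As an illustration (sample output reconstructed by hand from the format
 * strings above): for the map lookup sequence in the comment at the top of
 * this file, print_bpf_insn() would emit roughly:
 *
 *   (bf) r2 = r10
 *   (07) r2 += -4
 *   (18) r1 = 0x...     // BPF_LD_MAP_FD is a BPF_LD | BPF_IMM | BPF_DW load
 *   (85) call 1         // BPF_FUNC_map_lookup_elem
 */
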
int bpf_check(struct bpf_prog *prog, union bpf_attr *attr)
{
        char __user *log_ubuf = NULL;
        struct verifier_env *env;
        int ret = -EINVAL;

        if (prog->len <= 0 || prog->len > BPF_MAXINSNS)
                return -E2BIG;

        /* 'struct verifier_env' can be global, but since it's not small,
         * allocate/free it every time bpf_check() is called
         */
        env = kzalloc(sizeof(struct verifier_env), GFP_KERNEL);
        if (!env)
                return -ENOMEM;

        /* grab the mutex to protect a few globals used by the verifier */
        mutex_lock(&bpf_verifier_lock);

        if (attr->log_level || attr->log_buf || attr->log_size) {
                /* user requested verbose verifier output
                 * and supplied a buffer to store the verification trace
                 */
                log_level = attr->log_level;
                log_ubuf = (char __user *) (unsigned long) attr->log_buf;
                log_size = attr->log_size;
                log_len = 0;

                ret = -EINVAL;
                /* log_* values have to be sane */
                if (log_size < 128 || log_size > UINT_MAX >> 8 ||
                    log_level == 0 || log_ubuf == NULL)
                        goto free_env;

                ret = -ENOMEM;
                log_buf = vmalloc(log_size);
                if (!log_buf)
                        goto free_env;
        } else {
                log_level = 0;
        }

        /* ret = do_check(env); */

        if (log_level && log_len >= log_size - 1) {
                BUG_ON(log_len >= log_size);
                /* verifier log exceeded user supplied buffer */
                ret = -ENOSPC;
                /* fall through to return what was recorded */
        }

        /* copy verifier log back to user space including the trailing zero */
        if (log_level && copy_to_user(log_ubuf, log_buf, log_len + 1) != 0) {
                ret = -EFAULT;
                goto free_log_buf;
        }

free_log_buf:
        if (log_level)
                vfree(log_buf);
free_env:
        kfree(env);
        mutex_unlock(&bpf_verifier_lock);
        return ret;
}
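
/* Userspace sketch (illustrative only, not part of this file): the log_*
 * attrs consumed above correspond to fields of union bpf_attr passed to
 * the bpf() syscall with the BPF_PROG_LOAD command, roughly:
 *
 *      union bpf_attr attr = {};
 *      static char log[65536];
 *
 *      attr.prog_type = ...;
 *      attr.insns = (__u64) (unsigned long) insns;
 *      attr.insn_cnt = insn_cnt;
 *      attr.license = (__u64) (unsigned long) "GPL";
 *      attr.log_buf = (__u64) (unsigned long) log;
 *      attr.log_size = sizeof(log);
 *      attr.log_level = 1;
 *      fd = syscall(__NR_bpf, BPF_PROG_LOAD, &attr, sizeof(attr));
 *
 * On failure, 'log' then holds the verification trace written by verbose().
 */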