blob: 66e42a098d70f2f3742c1a2941fa0353db3ce59c [file] [log] [blame]
Greg Kroah-Hartmanb2441312017-11-01 15:07:57 +01001// SPDX-License-Identifier: GPL-2.0
Kees Cook7de828d2016-04-18 09:42:14 -07002/*
3 * kaslr.c
4 *
5 * This contains the routines needed to generate a reasonable level of
6 * entropy to choose a randomized kernel base address offset in support
7 * of Kernel Address Space Layout Randomization (KASLR). Additionally
8 * handles walking the physical memory maps (and tracking memory regions
9 * to avoid) in order to select a physical memory location that can
10 * contain the entire properly aligned running kernel image.
11 *
12 */
Baoquan Hed52e7d52017-05-13 13:46:28 +080013
14/*
15 * isspace() in linux/ctype.h is expected by next_args() to filter
16 * out "space/lf/tab". While boot/ctype.h conflicts with linux/ctype.h,
17 * since isdigit() is implemented in both of them. Hence disable it
18 * here.
19 */
20#define BOOT_CTYPE_H
21
22/*
23 * _ctype[] in lib/ctype.c is needed by isspace() of linux/ctype.h.
24 * While both lib/ctype.c and lib/cmdline.c will bring EXPORT_SYMBOL
25 * which is meaningless and will cause compiling error in some cases.
26 * So do not include linux/export.h and define EXPORT_SYMBOL(sym)
27 * as empty.
28 */
29#define _LINUX_EXPORT_H
30#define EXPORT_SYMBOL(sym)
31
Kees Cook8ab38202013-10-10 17:18:14 -070032#include "misc.h"
Kees Cookdc425a62016-05-02 15:51:00 -070033#include "error.h"
Arnd Bergmann5b8b9cf2017-05-30 11:14:17 +020034#include "../string.h"
Kees Cook8ab38202013-10-10 17:18:14 -070035
Kees Cooka653f352013-11-11 14:28:39 -080036#include <generated/compile.h>
37#include <linux/module.h>
38#include <linux/uts.h>
39#include <linux/utsname.h>
Baoquan Hed52e7d52017-05-13 13:46:28 +080040#include <linux/ctype.h>
Baoquan Hec05cd792017-08-14 22:54:24 +080041#include <linux/efi.h>
Kees Cooka653f352013-11-11 14:28:39 -080042#include <generated/utsrelease.h>
Baoquan Hec05cd792017-08-14 22:54:24 +080043#include <asm/efi.h>
Kees Cooka653f352013-11-11 14:28:39 -080044
Baoquan Hed52e7d52017-05-13 13:46:28 +080045/* Macros used by the included decompressor code below. */
46#define STATIC
47#include <linux/decompress/mm.h>
48
Kirill A. Shutemove626e6b2018-02-14 14:16:51 +030049#ifdef CONFIG_X86_5LEVEL
Kirill A. Shutemov4c2b4052018-02-14 21:25:34 +030050unsigned int pgtable_l5_enabled __ro_after_init;
Kirill A. Shutemovb16e7702018-02-14 21:25:35 +030051unsigned int pgdir_shift __ro_after_init = 39;
52unsigned int ptrs_per_p4d __ro_after_init = 1;
Kirill A. Shutemove626e6b2018-02-14 14:16:51 +030053#endif
54
Baoquan Hed52e7d52017-05-13 13:46:28 +080055extern unsigned long get_cmd_line_ptr(void);
56
Kees Cooka653f352013-11-11 14:28:39 -080057/* Simplified build-specific string for starting entropy. */
Kees Cook327f7d72013-11-12 08:56:07 -080058static const char build_str[] = UTS_RELEASE " (" LINUX_COMPILE_BY "@"
Kees Cooka653f352013-11-11 14:28:39 -080059 LINUX_COMPILE_HOST ") (" LINUX_COMPILER ") " UTS_VERSION;
60
/* Fold an arbitrary memory area into @hash, one word at a time. */
static unsigned long rotate_xor(unsigned long hash, const void *area,
				size_t size)
{
	const unsigned long *words = area;
	size_t count = size / sizeof(hash);
	size_t i;

	for (i = 0; i < count; i++) {
		/* Rotate right by 7 (an odd count), then mix in the next word. */
		hash = (hash >> 7) | (hash << ((sizeof(hash) * 8) - 7));
		hash ^= words[i];
	}

	return hash;
}
75
76/* Attempt to create a simple but unpredictable starting entropy. */
Thomas Garnierd899a7d2016-06-21 17:46:58 -070077static unsigned long get_boot_seed(void)
Kees Cooka653f352013-11-11 14:28:39 -080078{
79 unsigned long hash = 0;
80
81 hash = rotate_xor(hash, build_str, sizeof(build_str));
Kees Cook6655e0a2016-04-18 09:42:12 -070082 hash = rotate_xor(hash, boot_params, sizeof(*boot_params));
Kees Cooka653f352013-11-11 14:28:39 -080083
84 return hash;
85}
86
Thomas Garnierd899a7d2016-06-21 17:46:58 -070087#define KASLR_COMPRESSED_BOOT
88#include "../../lib/kaslr.c"
Kees Cook8ab38202013-10-10 17:18:14 -070089
/* A simple physical memory span: [start, start + size). */
struct mem_vector {
	unsigned long long start;	/* base address in bytes */
	unsigned long long size;	/* length in bytes */
};
94
Dave Jiangf2844242017-01-11 16:20:01 -070095/* Only supporting at most 4 unusable memmap regions with kaslr */
96#define MAX_MEMMAP_REGIONS 4
97
98static bool memmap_too_large;
99
Baoquan Hed52e7d52017-05-13 13:46:28 +0800100
Baoquan He4cdba142017-05-13 13:46:29 +0800101/* Store memory limit specified by "mem=nn[KMG]" or "memmap=nn[KMG]" */
102unsigned long long mem_limit = ULLONG_MAX;
103
104
/* Indices into mem_avoid[] for the regions KASLR must not place the kernel over. */
enum mem_avoid_index {
	MEM_AVOID_ZO_RANGE = 0,	/* compressed kernel plus its run space */
	MEM_AVOID_INITRD,	/* initial ramdisk */
	MEM_AVOID_CMDLINE,	/* kernel command line string */
	MEM_AVOID_BOOTPARAMS,	/* boot_params (zero page) */
	MEM_AVOID_MEMMAP_BEGIN,	/* first user-specified "memmap=" region */
	MEM_AVOID_MEMMAP_END = MEM_AVOID_MEMMAP_BEGIN + MAX_MEMMAP_REGIONS - 1,
	MEM_AVOID_MAX,
};

static struct mem_vector mem_avoid[MEM_AVOID_MAX];
Kees Cook82fa9632013-10-10 17:18:16 -0700116
Kees Cook82fa9632013-10-10 17:18:16 -0700117static bool mem_overlaps(struct mem_vector *one, struct mem_vector *two)
118{
119 /* Item one is entirely before item two. */
120 if (one->start + one->size <= two->start)
121 return false;
122 /* Item one is entirely after item two. */
123 if (one->start >= two->start + two->size)
124 return false;
125 return true;
126}
127
/* Return a pointer to the first non-whitespace character in @str. */
char *skip_spaces(const char *str)
{
	const char *p = str;

	while (isspace(*p))
		p++;

	/* Cast away const for the caller, matching the kernel's lib/string API. */
	return (char *)p;
}
Baoquan Hed52e7d52017-05-13 13:46:28 +0800134#include "../../../../lib/ctype.c"
135#include "../../../../lib/cmdline.c"
Dave Jiangf2844242017-01-11 16:20:01 -0700136
/*
 * Parse one "memmap=" descriptor of the form nn[KMG][@#$!]ss[KMG].
 * On success returns 0 with the values written through @size/@start.
 * "nn@ss" (usable RAM) is reported with *size == 0, and a bare "nn"
 * (a memory limit) with *start == 0, so the caller can tell the
 * avoid-region cases apart from the limit/skip cases.
 */
static int
parse_memmap(char *p, unsigned long long *start, unsigned long long *size)
{
	char *oldp;

	if (!p)
		return -EINVAL;

	/* We don't care about this option here */
	if (!strncmp(p, "exactmap", 8))
		return -EINVAL;

	oldp = p;
	*size = memparse(p, &p);
	if (p == oldp)	/* no number was consumed: malformed option */
		return -EINVAL;

	switch (*p) {
	case '#':	/* see kernel-parameters.txt: marks ACPI data */
	case '$':	/* reserved region */
	case '!':	/* persistent-memory region */
		*start = memparse(p + 1, &p);
		return 0;
	case '@':
		/* memmap=nn@ss specifies usable region, should be skipped */
		*size = 0;
		/* Fall through */
	default:
		/*
		 * If w/o offset, only size specified, memmap=nn[KMG] has the
		 * same behaviour as mem=nn[KMG]. It limits the max address
		 * system can use. Region above the limit should be avoided.
		 */
		*start = 0;
		return 0;
	}

	/* Not reached: every switch arm returns. */
	return -EINVAL;
}
176
/*
 * Parse a comma-separated "memmap=" option string and record the regions
 * to avoid.  nn$ss / nn#ss / nn!ss entries land in mem_avoid[]; a bare
 * size acts as a memory limit (stored in mem_limit); nn@ss entries are
 * skipped (parse_memmap() reports them with size 0 and start 0).
 */
static void mem_avoid_memmap(char *str)
{
	static int i;	/* persists across calls: next free MEMMAP avoid slot */

	if (i >= MAX_MEMMAP_REGIONS)
		return;

	while (str && (i < MAX_MEMMAP_REGIONS)) {
		int rc;
		unsigned long long start, size;
		char *k = strchr(str, ',');

		/* Terminate this entry; k points at the next one (or NULL). */
		if (k)
			*k++ = 0;

		rc = parse_memmap(str, &start, &size);
		if (rc < 0)
			break;
		str = k;

		if (start == 0) {
			/* Store the specified memory limit if size > 0 */
			if (size > 0)
				mem_limit = size;

			continue;
		}

		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].start = start;
		mem_avoid[MEM_AVOID_MEMMAP_BEGIN + i].size = size;
		i++;
	}

	/* More than 4 memmaps, fail kaslr */
	if ((i >= MAX_MEMMAP_REGIONS) && str)
		memmap_too_large = true;
}
214
Baoquan Hed52e7d52017-05-13 13:46:28 +0800215static int handle_mem_memmap(void)
216{
217 char *args = (char *)get_cmd_line_ptr();
218 size_t len = strlen((char *)args);
219 char *tmp_cmdline;
220 char *param, *val;
Baoquan He4cdba142017-05-13 13:46:29 +0800221 u64 mem_size;
Baoquan Hed52e7d52017-05-13 13:46:28 +0800222
Baoquan He4cdba142017-05-13 13:46:29 +0800223 if (!strstr(args, "memmap=") && !strstr(args, "mem="))
Baoquan Hed52e7d52017-05-13 13:46:28 +0800224 return 0;
225
226 tmp_cmdline = malloc(len + 1);
Chao Fan69550d42017-11-23 17:08:47 +0800227 if (!tmp_cmdline)
Baoquan Hed52e7d52017-05-13 13:46:28 +0800228 error("Failed to allocate space for tmp_cmdline");
229
230 memcpy(tmp_cmdline, args, len);
231 tmp_cmdline[len] = 0;
232 args = tmp_cmdline;
233
234 /* Chew leading spaces */
235 args = skip_spaces(args);
236
237 while (*args) {
238 args = next_arg(args, &param, &val);
239 /* Stop at -- */
240 if (!val && strcmp(param, "--") == 0) {
241 warn("Only '--' specified in cmdline");
242 free(tmp_cmdline);
243 return -1;
244 }
245
Baoquan He4cdba142017-05-13 13:46:29 +0800246 if (!strcmp(param, "memmap")) {
Baoquan Hed52e7d52017-05-13 13:46:28 +0800247 mem_avoid_memmap(val);
Baoquan He4cdba142017-05-13 13:46:29 +0800248 } else if (!strcmp(param, "mem")) {
249 char *p = val;
250
251 if (!strcmp(p, "nopentium"))
252 continue;
253 mem_size = memparse(p, &p);
254 if (mem_size == 0) {
255 free(tmp_cmdline);
256 return -EINVAL;
257 }
258 mem_limit = mem_size;
259 }
Baoquan Hed52e7d52017-05-13 13:46:28 +0800260 }
261
262 free(tmp_cmdline);
263 return 0;
264}
265
Yinghai Lu9dc19692016-05-05 15:13:47 -0700266/*
Kees Cooked09acd2016-05-06 12:44:59 -0700267 * In theory, KASLR can put the kernel anywhere in the range of [16M, 64T).
268 * The mem_avoid array is used to store the ranges that need to be avoided
269 * when KASLR searches for an appropriate random address. We must avoid any
Yinghai Lu9dc19692016-05-05 15:13:47 -0700270 * regions that are unsafe to overlap with during decompression, and other
Kees Cooked09acd2016-05-06 12:44:59 -0700271 * things like the initrd, cmdline and boot_params. This comment seeks to
272 * explain mem_avoid as clearly as possible since incorrect mem_avoid
273 * memory ranges lead to really hard to debug boot failures.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700274 *
Kees Cooked09acd2016-05-06 12:44:59 -0700275 * The initrd, cmdline, and boot_params are trivial to identify for
Kees Cookcb18ef02016-05-09 13:22:05 -0700276 * avoiding. They are MEM_AVOID_INITRD, MEM_AVOID_CMDLINE, and
Kees Cooked09acd2016-05-06 12:44:59 -0700277 * MEM_AVOID_BOOTPARAMS respectively below.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700278 *
Kees Cooked09acd2016-05-06 12:44:59 -0700279 * What is not obvious how to avoid is the range of memory that is used
280 * during decompression (MEM_AVOID_ZO_RANGE below). This range must cover
281 * the compressed kernel (ZO) and its run space, which is used to extract
282 * the uncompressed kernel (VO) and relocs.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700283 *
Kees Cooked09acd2016-05-06 12:44:59 -0700284 * ZO's full run size sits against the end of the decompression buffer, so
285 * we can calculate where text, data, bss, etc of ZO are positioned more
286 * easily.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700287 *
Kees Cooked09acd2016-05-06 12:44:59 -0700288 * For additional background, the decompression calculations can be found
289 * in header.S, and the memory diagram is based on the one found in misc.c.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700290 *
Kees Cooked09acd2016-05-06 12:44:59 -0700291 * The following conditions are already enforced by the image layouts and
292 * associated code:
293 * - input + input_size >= output + output_size
294 * - kernel_total_size <= init_size
295 * - kernel_total_size <= output_size (see Note below)
296 * - output + init_size >= output + output_size
Yinghai Lu9dc19692016-05-05 15:13:47 -0700297 *
Kees Cooked09acd2016-05-06 12:44:59 -0700298 * (Note that kernel_total_size and output_size have no fundamental
299 * relationship, but output_size is passed to choose_random_location
300 * as a maximum of the two. The diagram is showing a case where
301 * kernel_total_size is larger than output_size, but this case is
302 * handled by bumping output_size.)
Yinghai Lu9dc19692016-05-05 15:13:47 -0700303 *
Kees Cooked09acd2016-05-06 12:44:59 -0700304 * The above conditions can be illustrated by a diagram:
Yinghai Lu9dc19692016-05-05 15:13:47 -0700305 *
Kees Cooked09acd2016-05-06 12:44:59 -0700306 * 0 output input input+input_size output+init_size
307 * | | | | |
308 * | | | | |
309 * |-----|--------|--------|--------------|-----------|--|-------------|
310 * | | |
311 * | | |
312 * output+init_size-ZO_INIT_SIZE output+output_size output+kernel_total_size
Yinghai Lu9dc19692016-05-05 15:13:47 -0700313 *
Kees Cooked09acd2016-05-06 12:44:59 -0700314 * [output, output+init_size) is the entire memory range used for
315 * extracting the compressed image.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700316 *
Kees Cooked09acd2016-05-06 12:44:59 -0700317 * [output, output+kernel_total_size) is the range needed for the
318 * uncompressed kernel (VO) and its run size (bss, brk, etc).
319 *
320 * [output, output+output_size) is VO plus relocs (i.e. the entire
321 * uncompressed payload contained by ZO). This is the area of the buffer
322 * written to during decompression.
323 *
324 * [output+init_size-ZO_INIT_SIZE, output+init_size) is the worst-case
325 * range of the copied ZO and decompression code. (i.e. the range
326 * covered backwards of size ZO_INIT_SIZE, starting from output+init_size.)
327 *
328 * [input, input+input_size) is the original copied compressed image (ZO)
329 * (i.e. it does not include its run size). This range must be avoided
330 * because it contains the data used for decompression.
331 *
332 * [input+input_size, output+init_size) is [_text, _end) for ZO. This
333 * range includes ZO's heap and stack, and must be avoided since it
334 * performs the decompression.
335 *
336 * Since the above two ranges need to be avoided and they are adjacent,
337 * they can be merged, resulting in: [input, output+init_size) which
338 * becomes the MEM_AVOID_ZO_RANGE below.
Yinghai Lu9dc19692016-05-05 15:13:47 -0700339 */
/*
 * Populate mem_avoid[] with the ranges KASLR must not place the kernel
 * over, and identity-map the ranges this boot code itself will touch.
 *
 * @input:      address of the copied compressed image (ZO)
 * @input_size: size of the compressed image
 * @output:     start of the decompression buffer
 */
static void mem_avoid_init(unsigned long input, unsigned long input_size,
			   unsigned long output)
{
	unsigned long init_size = boot_params->hdr.init_size;
	u64 initrd_start, initrd_size;
	u64 cmd_line, cmd_line_size;
	char *ptr;

	/*
	 * Avoid the region that is unsafe to overlap during
	 * decompression.
	 */
	mem_avoid[MEM_AVOID_ZO_RANGE].start = input;
	mem_avoid[MEM_AVOID_ZO_RANGE].size = (output + init_size) - input;
	add_identity_map(mem_avoid[MEM_AVOID_ZO_RANGE].start,
			 mem_avoid[MEM_AVOID_ZO_RANGE].size);

	/* Avoid initrd: 64-bit address assembled from ext_* high and hdr low parts. */
	initrd_start = (u64)boot_params->ext_ramdisk_image << 32;
	initrd_start |= boot_params->hdr.ramdisk_image;
	initrd_size = (u64)boot_params->ext_ramdisk_size << 32;
	initrd_size |= boot_params->hdr.ramdisk_size;
	mem_avoid[MEM_AVOID_INITRD].start = initrd_start;
	mem_avoid[MEM_AVOID_INITRD].size = initrd_size;
	/* No need to set mapping for initrd, it will be handled in VO. */

	/* Avoid kernel command line. */
	cmd_line = (u64)boot_params->ext_cmd_line_ptr << 32;
	cmd_line |= boot_params->hdr.cmd_line_ptr;
	/* Calculate size of cmd_line (strlen + 1, counting the NUL). */
	ptr = (char *)(unsigned long)cmd_line;
	for (cmd_line_size = 0; ptr[cmd_line_size++];)
		;
	mem_avoid[MEM_AVOID_CMDLINE].start = cmd_line;
	mem_avoid[MEM_AVOID_CMDLINE].size = cmd_line_size;
	add_identity_map(mem_avoid[MEM_AVOID_CMDLINE].start,
			 mem_avoid[MEM_AVOID_CMDLINE].size);

	/* Avoid boot parameters. */
	mem_avoid[MEM_AVOID_BOOTPARAMS].start = (unsigned long)boot_params;
	mem_avoid[MEM_AVOID_BOOTPARAMS].size = sizeof(*boot_params);
	add_identity_map(mem_avoid[MEM_AVOID_BOOTPARAMS].start,
			 mem_avoid[MEM_AVOID_BOOTPARAMS].size);

	/* We don't need to set a mapping for setup_data. */

	/* Mark the memmap regions we need to avoid */
	handle_mem_memmap();

#ifdef CONFIG_X86_VERBOSE_BOOTUP
	/* Make sure video RAM can be used. */
	add_identity_map(0, PMD_SIZE);
#endif
}
394
/*
 * Does this memory vector overlap a known avoided area? If so, record the
 * overlap region with the lowest address.
 */
static bool mem_avoid_overlap(struct mem_vector *img,
			      struct mem_vector *overlap)
{
	int i;
	struct setup_data *ptr;
	/* Lowest overlap start seen so far; initialized past the image end. */
	unsigned long earliest = img->start + img->size;
	bool is_overlapping = false;

	/* Check the fixed avoid regions first. */
	for (i = 0; i < MEM_AVOID_MAX; i++) {
		if (mem_overlaps(img, &mem_avoid[i]) &&
		    mem_avoid[i].start < earliest) {
			*overlap = mem_avoid[i];
			earliest = overlap->start;
			is_overlapping = true;
		}
	}

	/* Avoid all entries in the setup_data linked list. */
	ptr = (struct setup_data *)(unsigned long)boot_params->hdr.setup_data;
	while (ptr) {
		struct mem_vector avoid;

		/* Each node's header plus its payload must be avoided. */
		avoid.start = (unsigned long)ptr;
		avoid.size = sizeof(*ptr) + ptr->len;

		if (mem_overlaps(img, &avoid) && (avoid.start < earliest)) {
			*overlap = avoid;
			earliest = overlap->start;
			is_overlapping = true;
		}

		ptr = (struct setup_data *)(unsigned long)ptr->next;
	}

	return is_overlapping;
}
435
/* A contiguous run of equally spaced candidate kernel start addresses. */
struct slot_area {
	unsigned long addr;	/* first candidate address */
	int num;		/* count of CONFIG_PHYSICAL_ALIGN-spaced slots */
};

#define MAX_SLOT_AREA 100

static struct slot_area slot_areas[MAX_SLOT_AREA];

/* Total number of slots across all stored areas. */
static unsigned long slot_max;

/* Number of entries currently used in slot_areas[]. */
static unsigned long slot_area_index;
448
449static void store_slot_info(struct mem_vector *region, unsigned long image_size)
450{
451 struct slot_area slot_area;
452
453 if (slot_area_index == MAX_SLOT_AREA)
454 return;
455
456 slot_area.addr = region->start;
457 slot_area.num = (region->size - image_size) /
458 CONFIG_PHYSICAL_ALIGN + 1;
459
460 if (slot_area.num > 0) {
461 slot_areas[slot_area_index++] = slot_area;
462 slot_max += slot_area.num;
463 }
464}
465
Kees Cook82fa9632013-10-10 17:18:16 -0700466static unsigned long slots_fetch_random(void)
467{
Kees Cooked9f0072016-05-25 15:45:33 -0700468 unsigned long slot;
469 int i;
470
Kees Cook82fa9632013-10-10 17:18:16 -0700471 /* Handle case of no slots stored. */
472 if (slot_max == 0)
473 return 0;
474
Thomas Garnierd899a7d2016-06-21 17:46:58 -0700475 slot = kaslr_get_random_long("Physical") % slot_max;
Kees Cooked9f0072016-05-25 15:45:33 -0700476
477 for (i = 0; i < slot_area_index; i++) {
478 if (slot >= slot_areas[i].num) {
479 slot -= slot_areas[i].num;
480 continue;
481 }
482 return slot_areas[i].addr + slot * CONFIG_PHYSICAL_ALIGN;
483 }
484
485 if (i == slot_area_index)
486 debug_putstr("slots_fetch_random() failed!?\n");
487 return 0;
Kees Cook82fa9632013-10-10 17:18:16 -0700488}
489
/*
 * Consider one candidate memory region for the kernel image: clip it
 * against the minimum address, the "mem="/"memmap=" limit, and (on
 * 32-bit) KERNEL_IMAGE_SIZE, then repeatedly carve out mem_avoid
 * overlaps, storing each surviving sub-region that can still hold the
 * image as a slot area.
 */
static void process_mem_region(struct mem_vector *entry,
			       unsigned long minimum,
			       unsigned long image_size)
{
	struct mem_vector region, overlap;
	struct slot_area slot_area;
	unsigned long start_orig, end;
	struct mem_vector cur_entry;

	/* On 32-bit, ignore entries entirely above our maximum. */
	if (IS_ENABLED(CONFIG_X86_32) && entry->start >= KERNEL_IMAGE_SIZE)
		return;

	/* Ignore entries entirely below our minimum. */
	if (entry->start + entry->size < minimum)
		return;

	/* Ignore entries above memory limit */
	end = min(entry->size + entry->start, mem_limit);
	if (entry->start >= end)
		return;
	cur_entry.start = entry->start;
	cur_entry.size = end - entry->start;

	region.start = cur_entry.start;
	region.size = cur_entry.size;

	/* Give up if slot area array is full. */
	while (slot_area_index < MAX_SLOT_AREA) {
		start_orig = region.start;

		/* Potentially raise address to minimum location. */
		if (region.start < minimum)
			region.start = minimum;

		/* Potentially raise address to meet alignment needs. */
		region.start = ALIGN(region.start, CONFIG_PHYSICAL_ALIGN);

		/* Did we raise the address above the passed in memory entry? */
		if (region.start > cur_entry.start + cur_entry.size)
			return;

		/* Reduce size by any delta from the original address. */
		region.size -= region.start - start_orig;

		/* On 32-bit, reduce region size to fit within max size. */
		if (IS_ENABLED(CONFIG_X86_32) &&
		    region.start + region.size > KERNEL_IMAGE_SIZE)
			region.size = KERNEL_IMAGE_SIZE - region.start;

		/* Return if region can't contain decompressed kernel */
		if (region.size < image_size)
			return;

		/* If nothing overlaps, store the region and return. */
		if (!mem_avoid_overlap(&region, &overlap)) {
			store_slot_info(&region, image_size);
			return;
		}

		/* Store beginning of region if holds at least image_size. */
		if (overlap.start > region.start + image_size) {
			struct mem_vector beginning;

			beginning.start = region.start;
			beginning.size = overlap.start - region.start;
			store_slot_info(&beginning, image_size);
		}

		/* Return if overlap extends to or past end of region. */
		if (overlap.start + overlap.size >= region.start + region.size)
			return;

		/* Clip off the overlapping region and start over. */
		region.size -= overlap.start - region.start + overlap.size;
		region.start = overlap.start + overlap.size;
	}
}
568
#ifdef CONFIG_EFI
/*
 * Returns true if mirror region found (and must have been processed
 * for slots adding)
 */
static bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	struct efi_info *e = &boot_params->efi_info;
	bool efi_mirror_found = false;
	struct mem_vector region;
	efi_memory_desc_t *md;
	unsigned long pmap;
	char *signature;
	u32 nr_desc;
	int i;

	/* Only trust the memmap if an EFI loader signature is present. */
	signature = (char *)&e->efi_loader_signature;
	if (strncmp(signature, EFI32_LOADER_SIGNATURE, 4) &&
	    strncmp(signature, EFI64_LOADER_SIGNATURE, 4))
		return false;

#ifdef CONFIG_X86_32
	/* Can't handle data above 4GB at this time */
	if (e->efi_memmap_hi) {
		warn("EFI memmap is above 4GB, can't be handled now on x86_32. EFI should be disabled.\n");
		return false;
	}
	pmap = e->efi_memmap;
#else
	/* Assemble the 64-bit memmap address from its low/high halves. */
	pmap = (e->efi_memmap | ((__u64)e->efi_memmap_hi << 32));
#endif

	/* First pass: find out whether any mirrored (more reliable) RAM exists. */
	nr_desc = e->efi_memmap_size / e->efi_memdesc_size;
	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);
		if (md->attribute & EFI_MEMORY_MORE_RELIABLE) {
			efi_mirror_found = true;
			break;
		}
	}

	/* Second pass: feed eligible descriptors to the slot machinery. */
	for (i = 0; i < nr_desc; i++) {
		md = efi_early_memdesc_ptr(pmap, e->efi_memdesc_size, i);

		/*
		 * Here we are more conservative in picking free memory than
		 * the EFI spec allows:
		 *
		 * According to the spec, EFI_BOOT_SERVICES_{CODE|DATA} are also
		 * free memory and thus available to place the kernel image into,
		 * but in practice there's firmware where using that memory leads
		 * to crashes.
		 *
		 * Only EFI_CONVENTIONAL_MEMORY is guaranteed to be free.
		 */
		if (md->type != EFI_CONVENTIONAL_MEMORY)
			continue;

		/* When mirrored RAM exists, restrict KASLR to mirrored regions. */
		if (efi_mirror_found &&
		    !(md->attribute & EFI_MEMORY_MORE_RELIABLE))
			continue;

		region.start = md->phys_addr;
		region.size = md->num_pages << EFI_PAGE_SHIFT;
		process_mem_region(&region, minimum, image_size);
		if (slot_area_index == MAX_SLOT_AREA) {
			debug_putstr("Aborted EFI scan (slot_areas full)!\n");
			break;
		}
	}
	return true;
}
#else
/* Without CONFIG_EFI there is no EFI memmap; callers fall back to e820. */
static inline bool
process_efi_entries(unsigned long minimum, unsigned long image_size)
{
	return false;
}
#endif
649
Baoquan Hef62995c2017-07-09 20:37:39 +0800650static void process_e820_entries(unsigned long minimum,
651 unsigned long image_size)
652{
653 int i;
Baoquan He87891b02017-07-09 20:37:40 +0800654 struct mem_vector region;
Baoquan Hef62995c2017-07-09 20:37:39 +0800655 struct boot_e820_entry *entry;
656
657 /* Verify potential e820 positions, appending to slots list. */
658 for (i = 0; i < boot_params->e820_entries; i++) {
659 entry = &boot_params->e820_table[i];
660 /* Skip non-RAM entries. */
661 if (entry->type != E820_TYPE_RAM)
662 continue;
Baoquan He87891b02017-07-09 20:37:40 +0800663 region.start = entry->addr;
664 region.size = entry->size;
Baoquan He27aac202017-07-09 20:37:41 +0800665 process_mem_region(&region, minimum, image_size);
Baoquan Hef62995c2017-07-09 20:37:39 +0800666 if (slot_area_index == MAX_SLOT_AREA) {
667 debug_putstr("Aborted e820 scan (slot_areas full)!\n");
668 break;
669 }
670 }
671}
672
Baoquan He071a7492016-05-09 13:22:08 -0700673static unsigned long find_random_phys_addr(unsigned long minimum,
674 unsigned long image_size)
Kees Cook82fa9632013-10-10 17:18:16 -0700675{
Dave Jiangf2844242017-01-11 16:20:01 -0700676 /* Check if we had too many memmaps. */
677 if (memmap_too_large) {
Baoquan Hec05cd792017-08-14 22:54:24 +0800678 debug_putstr("Aborted memory entries scan (more than 4 memmap= args)!\n");
Dave Jiangf2844242017-01-11 16:20:01 -0700679 return 0;
680 }
681
Kees Cook82fa9632013-10-10 17:18:16 -0700682 /* Make sure minimum is aligned. */
683 minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
684
Baoquan Hec05cd792017-08-14 22:54:24 +0800685 if (process_efi_entries(minimum, image_size))
686 return slots_fetch_random();
687
Baoquan Hef62995c2017-07-09 20:37:39 +0800688 process_e820_entries(minimum, image_size);
Kees Cook82fa9632013-10-10 17:18:16 -0700689 return slots_fetch_random();
690}
691
Baoquan He071a7492016-05-09 13:22:08 -0700692static unsigned long find_random_virt_addr(unsigned long minimum,
693 unsigned long image_size)
694{
695 unsigned long slots, random_addr;
696
697 /* Make sure minimum is aligned. */
698 minimum = ALIGN(minimum, CONFIG_PHYSICAL_ALIGN);
699 /* Align image_size for easy slot calculations. */
700 image_size = ALIGN(image_size, CONFIG_PHYSICAL_ALIGN);
701
702 /*
703 * There are how many CONFIG_PHYSICAL_ALIGN-sized slots
704 * that can hold image_size within the range of minimum to
705 * KERNEL_IMAGE_SIZE?
706 */
707 slots = (KERNEL_IMAGE_SIZE - minimum - image_size) /
708 CONFIG_PHYSICAL_ALIGN + 1;
709
Thomas Garnierd899a7d2016-06-21 17:46:58 -0700710 random_addr = kaslr_get_random_long("Virtual") % slots;
Baoquan He071a7492016-05-09 13:22:08 -0700711
712 return random_addr * CONFIG_PHYSICAL_ALIGN + minimum;
713}
714
/*
 * Since this function examines addresses much more numerically,
 * it takes the input and output pointers as 'unsigned long'.
 *
 * KASLR entry point: choose a random physical load address (written
 * through @output) and, on x86_64, a random virtual address (written
 * through @virt_addr).  On failure or "nokaslr" the outputs are left
 * at their incoming values.
 */
void choose_random_location(unsigned long input,
			    unsigned long input_size,
			    unsigned long *output,
			    unsigned long output_size,
			    unsigned long *virt_addr)
{
	unsigned long random_addr, min_addr;

	/* KASLR can be disabled from the command line. */
	if (cmdline_find_option_bool("nokaslr")) {
		warn("KASLR disabled: 'nokaslr' on cmdline.");
		return;
	}

#ifdef CONFIG_X86_5LEVEL
	/* Detect 5-level paging from CR4.LA57 and adjust paging geometry. */
	if (__read_cr4() & X86_CR4_LA57) {
		pgtable_l5_enabled = 1;
		pgdir_shift = 48;
		ptrs_per_p4d = 512;
	}
#endif

	/* Tell the kernel proper that KASLR was applied. */
	boot_params->hdr.loadflags |= KASLR_FLAG;

	/* Prepare to add new identity pagetables on demand. */
	initialize_identity_maps();

	/* Record the various known unsafe memory ranges. */
	mem_avoid_init(input, input_size, *output);

	/*
	 * Low end of the randomization range should be the
	 * smaller of 512M or the initial kernel image
	 * location:
	 */
	min_addr = min(*output, 512UL << 20);

	/* Walk available memory entries to find a random address. */
	random_addr = find_random_phys_addr(min_addr, output_size);
	if (!random_addr) {
		warn("Physical KASLR disabled: no suitable memory region!");
	} else {
		/* Update the new physical address location. */
		if (*output != random_addr) {
			add_identity_map(random_addr, output_size);
			*output = random_addr;
		}

		/*
		 * This loads the identity mapping page table.
		 * This should only be done if a new physical address
		 * is found for the kernel, otherwise we should keep
		 * the old page table to make it be like the "nokaslr"
		 * case.
		 */
		finalize_identity_maps();
	}


	/* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */
	if (IS_ENABLED(CONFIG_X86_64))
		random_addr = find_random_virt_addr(LOAD_PHYSICAL_ADDR, output_size);
	*virt_addr = random_addr;
}