blob: 5a840a605a162be7657920277621d50486961a5d [file] [log] [blame]
Andrea Arcangelic47174f2015-09-04 15:47:23 -07001/*
2 * Stress userfaultfd syscall.
3 *
4 * Copyright (C) 2015 Red Hat, Inc.
5 *
6 * This work is licensed under the terms of the GNU GPL, version 2. See
7 * the COPYING file in the top-level directory.
8 *
9 * This test allocates two virtual areas and bounces the physical
10 * memory across the two virtual areas (from area_src to area_dst)
11 * using userfaultfd.
12 *
13 * There are three threads running per CPU:
14 *
15 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
16 * page of the area_dst (while the physical page may still be in
17 * area_src), and increments a per-page counter in the same page,
18 * and checks its value against a verification region.
19 *
20 * 2) another per-CPU thread handles the userfaults generated by
21 * thread 1 above. userfaultfd blocking reads or poll() modes are
22 * exercised interleaved.
23 *
24 * 3) one last per-CPU thread transfers the memory in the background
25 * at maximum bandwidth (if not already transferred by thread
26 * 2). Each cpu thread takes cares of transferring a portion of the
27 * area.
28 *
29 * When all threads of type 3 completed the transfer, one bounce is
30 * complete. area_src and area_dst are then swapped. All threads are
31 * respawned and so the bounce is immediately restarted in the
32 * opposite direction.
33 *
34 * per-CPU threads 1 by triggering userfaults inside
35 * pthread_mutex_lock will also verify the atomicity of the memory
36 * transfer (UFFDIO_COPY).
37 *
38 * The program takes two parameters: the amounts of physical memory in
39 * megabytes (MiB) of the area and the number of bounces to execute.
40 *
41 * # 100MiB 99999 bounces
42 * ./userfaultfd 100 99999
43 *
44 * # 1GiB 99 bounces
45 * ./userfaultfd 1000 99
46 *
47 * # 10MiB-~6GiB 999 bounces, continue forever unless an error triggers
48 * while ./userfaultfd $[RANDOM % 6000 + 10] 999; do true; done
49 */
50
51#define _GNU_SOURCE
52#include <stdio.h>
53#include <errno.h>
54#include <unistd.h>
55#include <stdlib.h>
56#include <sys/types.h>
57#include <sys/stat.h>
58#include <fcntl.h>
59#include <time.h>
60#include <signal.h>
61#include <poll.h>
62#include <string.h>
63#include <sys/mman.h>
64#include <sys/syscall.h>
65#include <sys/ioctl.h>
Mike Rapoportda5502c2017-02-22 15:44:06 -080066#include <sys/wait.h>
Andrea Arcangelic47174f2015-09-04 15:47:23 -070067#include <pthread.h>
Thierry Redingd0a87112015-09-22 14:58:52 -070068#include <linux/userfaultfd.h>
Andrea Arcangelic47174f2015-09-04 15:47:23 -070069
Michael Ellerman56ed8f12015-09-22 14:58:58 -070070#ifdef __NR_userfaultfd
Andrea Arcangelic47174f2015-09-04 15:47:23 -070071
/* Test geometry, computed in main() from the command line and sysconf() */
static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;

/*
 * Bounce mode bits: the low bits of the remaining bounce counter
 * select which behaviors are exercised on each pass.
 */
#define BOUNCE_RANDOM		(1<<0)
#define BOUNCE_RACINGFAULTS	(1<<1)
#define BOUNCE_VERIFY		(1<<2)
#define BOUNCE_POLL		(1<<3)
static int bounces;

#ifdef HUGETLB_TEST
static int huge_fd;		/* hugetlbfs backing file */
static char *huge_fd_off0;	/* mapping of huge_fd at file offset 0 (area_src) */
#endif
static unsigned long long *count_verify;	/* expected per-page counter values */
static int uffd, uffd_flags, finished, *pipefd;
static char *area_src, *area_dst;
static char *zeropage;		/* page-sized all-zero reference buffer */
pthread_attr_t attr;
89
/* pthread_mutex_t starts at page offset 0 */
#define area_mutex(___area, ___nr)					\
	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * count is placed in the page after pthread_mutex_t naturally aligned
 * to avoid non alignment faults on non-x86 archs.
 * (Round the address past the mutex up to the next 8-byte boundary;
 * the result is read through a volatile pointer because other threads
 * update it concurrently.)
 */
#define area_count(___area, ___nr)					\
	((volatile unsigned long long *) ((unsigned long)		\
				 ((___area) + (___nr)*page_size +	\
				  sizeof(pthread_mutex_t) +		\
				  sizeof(unsigned long long) - 1) &	\
				 ~(unsigned long)(sizeof(unsigned long long) \
						  -  1)))
104
#if !defined(HUGETLB_TEST) && !defined(SHMEM_TEST)

/* Anonymous memory */
/* anonymous ranges support the full ioctl set, including UFFDIO_ZEROPAGE */
#define EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
				 (1 << _UFFDIO_COPY) | \
				 (1 << _UFFDIO_ZEROPAGE))

112static int release_pages(char *rel_area)
113{
114 int ret = 0;
115
116 if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
117 perror("madvise");
118 ret = 1;
119 }
120
121 return ret;
122}
123
124static void allocate_area(void **alloc_area)
125{
126 if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
127 fprintf(stderr, "out of memory\n");
128 *alloc_area = NULL;
129 }
130}
131
#else /* HUGETLB_TEST or SHMEM_TEST */

/* hugetlbfs/shmem lack UFFDIO_ZEROPAGE: only the basic ioctl set is expected */
#define EXPECTED_IOCTLS		UFFD_API_RANGE_IOCTLS_BASIC

#ifdef HUGETLB_TEST

/* HugeTLB memory */
Mike Kravetz9903bd72017-02-22 15:43:07 -0800139static int release_pages(char *rel_area)
140{
141 int ret = 0;
142
143 if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
144 rel_area == huge_fd_off0 ? 0 :
145 nr_pages * page_size,
146 nr_pages * page_size)) {
147 perror("fallocate");
148 ret = 1;
149 }
150
151 return ret;
152}
153
154
155static void allocate_area(void **alloc_area)
156{
157 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
158 MAP_PRIVATE | MAP_HUGETLB, huge_fd,
159 *alloc_area == area_src ? 0 :
160 nr_pages * page_size);
161 if (*alloc_area == MAP_FAILED) {
162 fprintf(stderr, "mmap of hugetlbfs file failed\n");
163 *alloc_area = NULL;
164 }
165
166 if (*alloc_area == area_src)
167 huge_fd_off0 = *alloc_area;
168}
169
Mike Rapoport419624d2017-02-22 15:43:46 -0800170#elif defined(SHMEM_TEST)
171
172/* Shared memory */
173static int release_pages(char *rel_area)
174{
175 int ret = 0;
176
177 if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
178 perror("madvise");
179 ret = 1;
180 }
181
182 return ret;
183}
184
185static void allocate_area(void **alloc_area)
186{
187 *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
188 MAP_ANONYMOUS | MAP_SHARED, -1, 0);
189 if (*alloc_area == MAP_FAILED) {
190 fprintf(stderr, "shared memory mmap failed\n");
191 *alloc_area = NULL;
192 }
193}
194
195#else /* SHMEM_TEST */
196#error "Undefined test type"
Mike Kravetz9903bd72017-02-22 15:43:07 -0800197#endif /* HUGETLB_TEST */
198
Mike Rapoport419624d2017-02-22 15:43:46 -0800199#endif /* !defined(HUGETLB_TEST) && !defined(SHMEM_TEST) */
200
/*
 * Byte-wise comparison that, unlike bcmp()/memcmp(), reads every byte
 * exactly once and in order, so it can never report "equal"
 * spuriously while the buffer is concurrently changing.
 * Returns 0 if the first n bytes match, 1 otherwise.
 */
static int my_bcmp(char *str1, char *str2, size_t n)
{
	size_t idx = 0;

	while (idx < n) {
		if (str1[idx] != str2[idx])
			return 1;
		idx++;
	}
	return 0;
}
209
/*
 * Thread 1: pick pages of area_dst (randomly or sequentially), take
 * the per-page mutex, and increment the per-page counter while
 * cross-checking it against count_verify[].  Touching the page while
 * it may still be missing is what generates the userfaults; the
 * mutex/counter checks verify the atomicity of UFFDIO_COPY.
 */
static void *locking_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct random_data rand;
	/* self-assignment only silences the uninitialized-use warning */
	unsigned long page_nr = *(&(page_nr)); /* uninitialized warning */
	int32_t rand_nr;
	unsigned long long count;
	char randstate[64];
	unsigned int seed;
	time_t start;

	if (bounces & BOUNCE_RANDOM) {
		seed = (unsigned int) time(NULL) - bounces;
		/*
		 * Without RACINGFAULTS each CPU gets its own seed so
		 * the threads walk disjoint random sequences.
		 */
		if (!(bounces & BOUNCE_RACINGFAULTS))
			seed += cpu;
		bzero(&rand, sizeof(rand));
		bzero(&randstate, sizeof(randstate));
		if (initstate_r(seed, randstate, sizeof(randstate), &rand))
			fprintf(stderr, "srandom_r error\n"), exit(1);
	} else {
		/* sequential mode: start offset varies per bounce (and CPU) */
		page_nr = -bounces;
		if (!(bounces & BOUNCE_RACINGFAULTS))
			page_nr += cpu * nr_pages_per_cpu;
	}

	while (!finished) {
		if (bounces & BOUNCE_RANDOM) {
			if (random_r(&rand, &rand_nr))
				fprintf(stderr, "random_r 1 error\n"), exit(1);
			page_nr = rand_nr;
			/* on 64bit, draw a second 32bit word for the high half */
			if (sizeof(page_nr) > sizeof(rand_nr)) {
				if (random_r(&rand, &rand_nr))
					fprintf(stderr, "random_r 2 error\n"), exit(1);
				page_nr |= (((unsigned long) rand_nr) << 16) <<
					   16;
			}
		} else
			page_nr += 1;
		page_nr %= nr_pages;

		start = time(NULL);
		if (bounces & BOUNCE_VERIFY) {
			count = *area_count(area_dst, page_nr);
			if (!count)
				fprintf(stderr,
					"page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count,
					count_verify[page_nr]), exit(1);


			/*
			 * We can't use bcmp (or memcmp) because that
			 * returns 0 erroneously if the memory is
			 * changing under it (even if the end of the
			 * page is never changing and always
			 * different).
			 */
#if 1
			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size))
				fprintf(stderr,
					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
					page_nr, count,
					count_verify[page_nr]), exit(1);
#else
			/* debugging variant kept for reference: retry bcmp */
			unsigned long loops;

			loops = 0;
			/* uncomment the below line to test with mutex */
			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
			while (!bcmp(area_dst + page_nr * page_size, zeropage,
				     page_size)) {
				loops += 1;
				if (loops > 10)
					break;
			}
			/* uncomment below line to test with mutex */
			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
			if (loops) {
				fprintf(stderr,
					"page_nr %lu all zero thread %lu %p %lu\n",
					page_nr, cpu, area_dst + page_nr * page_size,
					loops);
				if (loops > 10)
					exit(1);
			}
#endif
		}

		/*
		 * Locking the mutex faults the page in (if still
		 * missing) and serializes the counter update with the
		 * other locking threads.
		 */
		pthread_mutex_lock(area_mutex(area_dst, page_nr));
		count = *area_count(area_dst, page_nr);
		if (count != count_verify[page_nr]) {
			fprintf(stderr,
				"page_nr %lu memory corruption %Lu %Lu\n",
				page_nr, count,
				count_verify[page_nr]), exit(1);
		}
		count++;
		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
		pthread_mutex_unlock(area_mutex(area_dst, page_nr));

		if (time(NULL) - start > 1)
			fprintf(stderr,
				"userfault too slow %ld "
				"possible false positive with overcommit\n",
				time(NULL) - start);
	}

	return NULL;
}
320
Mike Rapoportaa0d2722017-02-22 15:44:04 -0800321static int copy_page(int ufd, unsigned long offset)
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700322{
323 struct uffdio_copy uffdio_copy;
324
325 if (offset >= nr_pages * page_size)
326 fprintf(stderr, "unexpected offset %lu\n",
327 offset), exit(1);
328 uffdio_copy.dst = (unsigned long) area_dst + offset;
329 uffdio_copy.src = (unsigned long) area_src + offset;
330 uffdio_copy.len = page_size;
331 uffdio_copy.mode = 0;
332 uffdio_copy.copy = 0;
Mike Rapoportaa0d2722017-02-22 15:44:04 -0800333 if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700334 /* real retval in ufdio_copy.copy */
335 if (uffdio_copy.copy != -EEXIST)
336 fprintf(stderr, "UFFDIO_COPY error %Ld\n",
337 uffdio_copy.copy), exit(1);
338 } else if (uffdio_copy.copy != page_size) {
339 fprintf(stderr, "UFFDIO_COPY unexpected copy %Ld\n",
340 uffdio_copy.copy), exit(1);
341 } else
342 return 1;
343 return 0;
344}
345
/*
 * Thread 2 (poll mode): service userfault messages with poll() plus
 * nonblocking reads.  Also handles the non-cooperative events (fork,
 * remap, madvise) generated by userfaultfd_events_test().  Exits when
 * a byte arrives on its per-CPU pipe and returns the number of faults
 * it resolved, cast to void *.
 */
static void *uffd_poll_thread(void *arg)
{
	unsigned long cpu = (unsigned long) arg;
	struct pollfd pollfd[2];
	struct uffd_msg msg;
	struct uffdio_register uffd_reg;
	int ret;
	unsigned long offset;
	char tmp_chr;
	unsigned long userfaults = 0;

	pollfd[0].fd = uffd;
	pollfd[0].events = POLLIN;
	pollfd[1].fd = pipefd[cpu*2];
	pollfd[1].events = POLLIN;

	for (;;) {
		ret = poll(pollfd, 2, -1);
		if (!ret)
			fprintf(stderr, "poll error %d\n", ret), exit(1);
		if (ret < 0)
			perror("poll"), exit(1);
		/* a byte on the pipe is the shutdown signal from stress() */
		if (pollfd[1].revents & POLLIN) {
			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
				fprintf(stderr, "read pipefd error\n"),
					exit(1);
			break;
		}
		if (!(pollfd[0].revents & POLLIN))
			fprintf(stderr, "pollfd[0].revents %d\n",
				pollfd[0].revents), exit(1);
		ret = read(uffd, &msg, sizeof(msg));
		if (ret < 0) {
			/* the uffd is nonblocking in poll mode */
			if (errno == EAGAIN)
				continue;
			perror("nonblocking read error"), exit(1);
		}
		switch (msg.event) {
		default:
			fprintf(stderr, "unexpected msg event %u\n",
				msg.event), exit(1);
			break;
		case UFFD_EVENT_PAGEFAULT:
			/* missing faults are reported as reads, never writes */
			if (msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
				fprintf(stderr, "unexpected write fault\n"), exit(1);
			offset = (char *)(unsigned long)msg.arg.pagefault.address -
				 area_dst;
			/* round down to the start of the faulting page */
			offset &= ~(page_size-1);
			if (copy_page(uffd, offset))
				userfaults++;
			break;
		case UFFD_EVENT_FORK:
			/* switch to the child's uffd passed by the kernel */
			uffd = msg.arg.fork.ufd;
			pollfd[0].fd = uffd;
			break;
		case UFFD_EVENT_MADVDONTNEED:
			/* stop monitoring the madvised range */
			uffd_reg.range.start = msg.arg.madv_dn.start;
			uffd_reg.range.len = msg.arg.madv_dn.end -
				msg.arg.madv_dn.start;
			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
				fprintf(stderr, "madv_dn failure\n"), exit(1);
			break;
		case UFFD_EVENT_REMAP:
			/* the monitored area moved; track the new address */
			area_dst = (char *)(unsigned long)msg.arg.remap.to;
			break;
		}
	}
	return (void *)userfaults;
}
415
/* handshake: uffd_read_thread unlocks this once it is initialized */
pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;

/*
 * Thread 2 (blocking mode): resolve userfaults with blocking read()s.
 * Runs until cancelled by stress() — read() is a cancellation point.
 * The per-CPU fault counter is written through @arg.
 */
static void *uffd_read_thread(void *arg)
{
	unsigned long *this_cpu_userfaults;
	struct uffd_msg msg;
	unsigned long offset;
	int ret;

	this_cpu_userfaults = (unsigned long *) arg;
	*this_cpu_userfaults = 0;

	pthread_mutex_unlock(&uffd_read_mutex);
	/* from here cancellation is ok */

	for (;;) {
		ret = read(uffd, &msg, sizeof(msg));
		if (ret != sizeof(msg)) {
			if (ret < 0)
				perror("blocking read error"), exit(1);
			else
				fprintf(stderr, "short read\n"), exit(1);
		}
		if (msg.event != UFFD_EVENT_PAGEFAULT)
			fprintf(stderr, "unexpected msg event %u\n",
				msg.event), exit(1);
		if (bounces & BOUNCE_VERIFY &&
		    msg.arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
			fprintf(stderr, "unexpected write fault\n"), exit(1);
		offset = (char *)(unsigned long)msg.arg.pagefault.address -
			 area_dst;
		/* round down to the start of the faulting page */
		offset &= ~(page_size-1);
		if (copy_page(uffd, offset))
			(*this_cpu_userfaults)++;
	}
	return (void *)NULL;
}
453
454static void *background_thread(void *arg)
455{
456 unsigned long cpu = (unsigned long) arg;
457 unsigned long page_nr;
458
459 for (page_nr = cpu * nr_pages_per_cpu;
460 page_nr < (cpu+1) * nr_pages_per_cpu;
461 page_nr++)
Mike Rapoportaa0d2722017-02-22 15:44:04 -0800462 copy_page(uffd, page_nr * page_size);
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700463
464 return NULL;
465}
466
/*
 * Run one bounce: spawn the three per-CPU thread types, wait for the
 * background transfer to finish, zap area_src, stop the fault-handler
 * threads, then stop the locking threads.  The per-CPU userfault
 * counts are returned through @userfaults.  Returns 0 on success, 1
 * on any thread/pipe error.
 */
static int stress(unsigned long *userfaults)
{
	unsigned long cpu;
	pthread_t locking_threads[nr_cpus];
	pthread_t uffd_threads[nr_cpus];
	pthread_t background_threads[nr_cpus];
	void **_userfaults = (void **) userfaults;

	finished = 0;
	for (cpu = 0; cpu < nr_cpus; cpu++) {
		if (pthread_create(&locking_threads[cpu], &attr,
				   locking_thread, (void *)cpu))
			return 1;
		if (bounces & BOUNCE_POLL) {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_poll_thread, (void *)cpu))
				return 1;
		} else {
			if (pthread_create(&uffd_threads[cpu], &attr,
					   uffd_read_thread,
					   &_userfaults[cpu]))
				return 1;
			/* wait until the reader thread is ready */
			pthread_mutex_lock(&uffd_read_mutex);
		}
		if (pthread_create(&background_threads[cpu], &attr,
				   background_thread, (void *)cpu))
			return 1;
	}
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(background_threads[cpu], NULL))
			return 1;

	/*
	 * Be strict and immediately zap area_src, the whole area has
	 * been transferred already by the background threads. The
	 * area_src could then be faulted in in a racy way by still
	 * running uffdio_threads reading zeropages after we zapped
	 * area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
	if (release_pages(area_src))
		return 1;

	for (cpu = 0; cpu < nr_cpus; cpu++) {
		char c;
		if (bounces & BOUNCE_POLL) {
			/* poll threads exit when a byte shows up on the pipe */
			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
				fprintf(stderr, "pipefd write error\n");
				return 1;
			}
			if (pthread_join(uffd_threads[cpu], &_userfaults[cpu]))
				return 1;
		} else {
			/* blocking readers must be cancelled out of read() */
			if (pthread_cancel(uffd_threads[cpu]))
				return 1;
			if (pthread_join(uffd_threads[cpu], NULL))
				return 1;
		}
	}

	finished = 1;
	for (cpu = 0; cpu < nr_cpus; cpu++)
		if (pthread_join(locking_threads[cpu], NULL))
			return 1;

	return 0;
}
535
Mike Rapoportda5502c2017-02-22 15:44:06 -0800536static int userfaultfd_open(int features)
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700537{
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700538 struct uffdio_api uffdio_api;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700539
540 uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
541 if (uffd < 0) {
542 fprintf(stderr,
543 "userfaultfd syscall not available in this kernel\n");
544 return 1;
545 }
546 uffd_flags = fcntl(uffd, F_GETFD, NULL);
547
548 uffdio_api.api = UFFD_API;
Mike Rapoportda5502c2017-02-22 15:44:06 -0800549 uffdio_api.features = features;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700550 if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
551 fprintf(stderr, "UFFDIO_API\n");
552 return 1;
553 }
554 if (uffdio_api.api != UFFD_API) {
555 fprintf(stderr, "UFFDIO_API error %Lu\n", uffdio_api.api);
556 return 1;
557 }
558
Mike Rapoport6228b8f2017-02-22 15:44:01 -0800559 return 0;
560}
561
/*
 * For non-cooperative userfaultfd test we fork() a process that will
 * generate pagefaults, will mremap the area monitored by the
 * userfaultfd and at last this process will release the monitored
 * area.
 * For the anonymous and shared memory the area is divided into two
 * parts, the first part is accessed before mremap, and the second
 * part is accessed after mremap. Since hugetlbfs does not support
 * mremap, the entire monitored area is accessed in a single pass for
 * HUGETLB_TEST.
 * The release of the pages currently generates event only for
 * anonymous memory (UFFD_EVENT_MADVDONTNEED), hence it is not checked
 * for hugetlb and shmem.
 */
static int faulting_process(void)
{
	unsigned long nr;
	unsigned long long count;

#ifndef HUGETLB_TEST
	unsigned long split_nr_pages = (nr_pages + 1) / 2;
#else
	unsigned long split_nr_pages = nr_pages;
#endif

	/* fault in the first half (or all of it for hugetlb) */
	for (nr = 0; nr < split_nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]), exit(1);
		}
	}

#ifndef HUGETLB_TEST
	/* move the monitored area: triggers UFFD_EVENT_REMAP */
	area_dst = mremap(area_dst, nr_pages * page_size, nr_pages * page_size,
			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
	if (area_dst == MAP_FAILED)
		perror("mremap"), exit(1);

	/* fault in the second half at the new address */
	for (; nr < nr_pages; nr++) {
		count = *area_count(area_dst, nr);
		if (count != count_verify[nr]) {
			fprintf(stderr,
				"nr %lu memory corruption %Lu %Lu\n",
				nr, count,
				count_verify[nr]), exit(1);
		}
	}

#ifndef SHMEM_TEST
	/* zap the area: triggers UFFD_EVENT_MADVDONTNEED (anon only) */
	if (release_pages(area_dst))
		return 1;

	for (nr = 0; nr < nr_pages; nr++) {
		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size))
			fprintf(stderr, "nr %lu is not zero\n", nr), exit(1);
	}
#endif /* SHMEM_TEST */

#endif /* HUGETLB_TEST */

	return 0;
}
627
/*
 * Install a zero page at @offset in area_dst with UFFDIO_ZEROPAGE.
 * Returns 1 if the zeropage was installed, 0 otherwise.  Aborts if
 * the outcome is inconsistent with whether this memory type supports
 * the ioctl (has_zeropage).
 */
static int uffdio_zeropage(int ufd, unsigned long offset)
{
	struct uffdio_zeropage uffdio_zeropage;
	int ret;
	unsigned long has_zeropage = EXPECTED_IOCTLS & (1 << _UFFDIO_ZEROPAGE);

	if (offset >= nr_pages * page_size)
		fprintf(stderr, "unexpected offset %lu\n",
			offset), exit(1);
	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
	uffdio_zeropage.range.len = page_size;
	uffdio_zeropage.mode = 0;
	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
	if (ret) {
		/* real retval in ufdio_zeropage.zeropage */
		if (has_zeropage) {
			/* supported backend must not fail at all here */
			if (uffdio_zeropage.zeropage == -EEXIST)
				fprintf(stderr, "UFFDIO_ZEROPAGE -EEXIST\n"),
					exit(1);
			else
				fprintf(stderr, "UFFDIO_ZEROPAGE error %Ld\n",
					uffdio_zeropage.zeropage), exit(1);
		} else {
			/* unsupported backend must fail with -EINVAL */
			if (uffdio_zeropage.zeropage != -EINVAL)
				fprintf(stderr,
					"UFFDIO_ZEROPAGE not -EINVAL %Ld\n",
					uffdio_zeropage.zeropage), exit(1);
		}
	} else if (has_zeropage) {
		if (uffdio_zeropage.zeropage != page_size) {
			fprintf(stderr, "UFFDIO_ZEROPAGE unexpected %Ld\n",
				uffdio_zeropage.zeropage), exit(1);
		} else
			return 1;
	} else {
		/* success on an unsupported backend is a kernel bug */
		fprintf(stderr,
			"UFFDIO_ZEROPAGE succeeded %Ld\n",
			uffdio_zeropage.zeropage), exit(1);
	}

	return 0;
}
670
/* exercise UFFDIO_ZEROPAGE */
/*
 * Register area_dst, install a zeropage at offset 0 and verify it
 * reads back as zeroes.  Returns 0 on success, 1 on setup failure;
 * protocol violations abort via exit(1).
 */
static int userfaultfd_zeropage_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;

	printf("testing UFFDIO_ZEROPAGE: ");
	fflush(stdout);

	/* start from a fully zapped destination area */
	if (release_pages(area_dst))
		return 1;

	if (userfaultfd_open(0) < 0)
		return 1;
	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		fprintf(stderr, "register failure\n"), exit(1);

	expected_ioctls = EXPECTED_IOCTLS;
	if ((uffdio_register.ioctls & expected_ioctls) !=
	    expected_ioctls)
		fprintf(stderr,
			"unexpected missing ioctl for anon memory\n"),
			exit(1);

	/* if the zeropage was installed it must compare equal to zeroes */
	if (uffdio_zeropage(uffd, 0)) {
		if (my_bcmp(area_dst, zeropage, page_size))
			fprintf(stderr, "zeropage is not zero\n"), exit(1);
	}

	close(uffd);
	printf("done.\n");
	return 0;
}
707
/*
 * Exercise the non-cooperative events: register area_dst, fork() a
 * child that touches/mremaps/releases it (faulting_process), and let
 * uffd_poll_thread service the resulting UFFD_EVENT_* messages.
 * Returns 0 iff the child succeeded and every page faulted exactly
 * once.
 */
static int userfaultfd_events_test(void)
{
	struct uffdio_register uffdio_register;
	unsigned long expected_ioctls;
	unsigned long userfaults;
	pthread_t uffd_mon;
	int err, features;
	pid_t pid;
	char c;

	printf("testing events (fork, remap, madv_dn): ");
	fflush(stdout);

	if (release_pages(area_dst))
		return 1;

	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
		UFFD_FEATURE_EVENT_MADVDONTNEED;
	if (userfaultfd_open(features) < 0)
		return 1;
	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);

	uffdio_register.range.start = (unsigned long) area_dst;
	uffdio_register.range.len = nr_pages * page_size;
	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register))
		fprintf(stderr, "register failure\n"), exit(1);

	expected_ioctls = EXPECTED_IOCTLS;
	if ((uffdio_register.ioctls & expected_ioctls) !=
	    expected_ioctls)
		fprintf(stderr,
			"unexpected missing ioctl for anon memory\n"),
			exit(1);

	/* arg NULL -> cpu 0, so the monitor uses pipefd[0]/pipefd[1] */
	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, NULL))
		perror("uffd_poll_thread create"), exit(1);

	pid = fork();
	if (pid < 0)
		perror("fork"), exit(1);

	if (!pid)
		return faulting_process();

	waitpid(pid, &err, 0);
	if (err)
		fprintf(stderr, "faulting process failed\n"), exit(1);

	/* wake the monitor thread and collect its fault count */
	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c))
		perror("pipe write"), exit(1);
	if (pthread_join(uffd_mon, (void **)&userfaults))
		return 1;

	close(uffd);
	printf("userfaults: %ld\n", userfaults);

	return userfaults != nr_pages;
}
767
Mike Rapoport6228b8f2017-02-22 15:44:01 -0800768static int userfaultfd_stress(void)
769{
770 void *area;
771 char *tmp_area;
772 unsigned long nr;
773 struct uffdio_register uffdio_register;
774 unsigned long cpu;
775 int err;
776 unsigned long userfaults[nr_cpus];
777
778 allocate_area((void **)&area_src);
779 if (!area_src)
780 return 1;
781 allocate_area((void **)&area_dst);
782 if (!area_dst)
783 return 1;
784
Mike Rapoportda5502c2017-02-22 15:44:06 -0800785 if (userfaultfd_open(0) < 0)
Mike Rapoport6228b8f2017-02-22 15:44:01 -0800786 return 1;
787
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700788 count_verify = malloc(nr_pages * sizeof(unsigned long long));
789 if (!count_verify) {
790 perror("count_verify");
791 return 1;
792 }
793
794 for (nr = 0; nr < nr_pages; nr++) {
795 *area_mutex(area_src, nr) = (pthread_mutex_t)
796 PTHREAD_MUTEX_INITIALIZER;
797 count_verify[nr] = *area_count(area_src, nr) = 1;
Andrea Arcangeli1f5fee22015-09-22 14:59:00 -0700798 /*
799 * In the transition between 255 to 256, powerpc will
800 * read out of order in my_bcmp and see both bytes as
801 * zero, so leave a placeholder below always non-zero
802 * after the count, to avoid my_bcmp to trigger false
803 * positives.
804 */
805 *(area_count(area_src, nr) + 1) = 1;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700806 }
807
808 pipefd = malloc(sizeof(int) * nr_cpus * 2);
809 if (!pipefd) {
810 perror("pipefd");
811 return 1;
812 }
813 for (cpu = 0; cpu < nr_cpus; cpu++) {
814 if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
815 perror("pipe");
816 return 1;
817 }
818 }
819
820 if (posix_memalign(&area, page_size, page_size)) {
821 fprintf(stderr, "out of memory\n");
822 return 1;
823 }
824 zeropage = area;
825 bzero(zeropage, page_size);
826
827 pthread_mutex_lock(&uffd_read_mutex);
828
829 pthread_attr_init(&attr);
830 pthread_attr_setstacksize(&attr, 16*1024*1024);
831
Andrea Arcangelia5932bf2015-09-22 14:59:03 -0700832 err = 0;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700833 while (bounces--) {
834 unsigned long expected_ioctls;
835
836 printf("bounces: %d, mode:", bounces);
837 if (bounces & BOUNCE_RANDOM)
838 printf(" rnd");
839 if (bounces & BOUNCE_RACINGFAULTS)
840 printf(" racing");
841 if (bounces & BOUNCE_VERIFY)
842 printf(" ver");
843 if (bounces & BOUNCE_POLL)
844 printf(" poll");
845 printf(", ");
846 fflush(stdout);
847
848 if (bounces & BOUNCE_POLL)
849 fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
850 else
851 fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
852
853 /* register */
854 uffdio_register.range.start = (unsigned long) area_dst;
855 uffdio_register.range.len = nr_pages * page_size;
856 uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
857 if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
858 fprintf(stderr, "register failure\n");
859 return 1;
860 }
Mike Kravetz9903bd72017-02-22 15:43:07 -0800861 expected_ioctls = EXPECTED_IOCTLS;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700862 if ((uffdio_register.ioctls & expected_ioctls) !=
863 expected_ioctls) {
864 fprintf(stderr,
865 "unexpected missing ioctl for anon memory\n");
866 return 1;
867 }
868
869 /*
870 * The madvise done previously isn't enough: some
871 * uffd_thread could have read userfaults (one of
872 * those already resolved by the background thread)
873 * and it may be in the process of calling
874 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
875 * area_src and it would map a zero page in it (of
876 * course such a UFFDIO_COPY is perfectly safe as it'd
877 * return -EEXIST). The problem comes at the next
878 * bounce though: that racing UFFDIO_COPY would
879 * generate zeropages in the area_src, so invalidating
880 * the previous MADV_DONTNEED. Without this additional
881 * MADV_DONTNEED those zeropages leftovers in the
882 * area_src would lead to -EEXIST failure during the
883 * next bounce, effectively leaving a zeropage in the
884 * area_dst.
885 *
886 * Try to comment this out madvise to see the memory
887 * corruption being caught pretty quick.
888 *
889 * khugepaged is also inhibited to collapse THP after
890 * MADV_DONTNEED only after the UFFDIO_REGISTER, so it's
891 * required to MADV_DONTNEED here.
892 */
Mike Kravetz9903bd72017-02-22 15:43:07 -0800893 if (release_pages(area_dst))
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700894 return 1;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700895
896 /* bounce pass */
897 if (stress(userfaults))
898 return 1;
899
900 /* unregister */
901 if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
902 fprintf(stderr, "register failure\n");
903 return 1;
904 }
905
906 /* verification */
907 if (bounces & BOUNCE_VERIFY) {
908 for (nr = 0; nr < nr_pages; nr++) {
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700909 if (*area_count(area_dst, nr) != count_verify[nr]) {
910 fprintf(stderr,
911 "error area_count %Lu %Lu %lu\n",
912 *area_count(area_src, nr),
913 count_verify[nr],
914 nr);
Andrea Arcangelia5932bf2015-09-22 14:59:03 -0700915 err = 1;
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700916 bounces = 0;
917 }
918 }
919 }
920
921 /* prepare next bounce */
922 tmp_area = area_src;
923 area_src = area_dst;
924 area_dst = tmp_area;
925
926 printf("userfaults:");
927 for (cpu = 0; cpu < nr_cpus; cpu++)
928 printf(" %lu", userfaults[cpu]);
929 printf("\n");
930 }
931
Mike Rapoportda5502c2017-02-22 15:44:06 -0800932 if (err)
933 return err;
934
935 close(uffd);
Andrea Arcangeli7a0c4cf2017-02-22 15:44:10 -0800936 return userfaultfd_zeropage_test() || userfaultfd_events_test();
Andrea Arcangelic47174f2015-09-04 15:47:23 -0700937}
938
Mike Kravetz9903bd72017-02-22 15:43:07 -0800939#ifndef HUGETLB_TEST
940
/*
 * Entry point (anonymous/shmem build): parse <MiB> <bounces>, compute
 * the global geometry, then run the stress loop.
 */
int main(int argc, char **argv)
{
	if (argc < 3)
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = sysconf(_SC_PAGE_SIZE);
	/* the mutex plus two aligned counters must fit in a single page */
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}
Michael Ellerman56ed8f12015-09-22 14:58:58 -0700966
Mike Kravetz9903bd72017-02-22 15:43:07 -0800967#else /* HUGETLB_TEST */
968
/*
 * Copied from mlock2-tests.c
 *
 * Parse the default huge page size, in bytes, out of /proc/meminfo.
 * Returns 0 if it cannot be determined.
 */
unsigned long default_huge_page_size(void)
{
	unsigned long hps = 0;
	char *line = NULL;
	size_t linelen = 0;
	FILE *f = fopen("/proc/meminfo", "r");

	if (f) {
		while (getline(&line, &linelen, f) > 0) {
			if (sscanf(line, "Hugepagesize: %lu kB", &hps) != 1)
				continue;
			hps <<= 10;	/* reported in kB; convert to bytes */
			break;
		}
		free(line);
		fclose(f);
	}
	return hps;
}
992
/*
 * Entry point (hugetlb build): parse <MiB> <bounces> <hugetlbfs_file>,
 * use the huge page size as the test page size, open and truncate the
 * backing file, then run the stress loop.
 */
int main(int argc, char **argv)
{
	if (argc < 4)
		fprintf(stderr, "Usage: <MiB> <bounces> <hugetlbfs_file>\n"),
			exit(1);
	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
	page_size = default_huge_page_size();
	if (!page_size)
		fprintf(stderr, "Unable to determine huge page size\n"),
			exit(2);
	/* the mutex plus two aligned counters must fit in a single page */
	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
	    > page_size)
		fprintf(stderr, "Impossible to run this test\n"), exit(2);
	nr_pages_per_cpu = atol(argv[1]) * 1024*1024 / page_size /
		nr_cpus;
	if (!nr_pages_per_cpu) {
		fprintf(stderr, "invalid MiB\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	bounces = atoi(argv[2]);
	if (bounces <= 0) {
		fprintf(stderr, "invalid bounces\n");
		fprintf(stderr, "Usage: <MiB> <bounces>\n"), exit(1);
	}
	nr_pages = nr_pages_per_cpu * nr_cpus;
	huge_fd = open(argv[3], O_CREAT | O_RDWR, 0755);
	if (huge_fd < 0) {
		fprintf(stderr, "Open of %s failed", argv[3]);
		perror("open");
		exit(1);
	}
	/* start from an empty file; allocate_area() maps past EOF */
	if (ftruncate(huge_fd, 0)) {
		fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
		perror("ftruncate");
		exit(1);
	}
	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
	       nr_pages, nr_pages_per_cpu);
	return userfaultfd_stress();
}
1033
1034#endif
Michael Ellerman56ed8f12015-09-22 14:58:58 -07001035#else /* __NR_userfaultfd */
1036
1037#warning "missing __NR_userfaultfd definition"
1038
/* build-time fallback: this kernel's headers lack __NR_userfaultfd */
int main(void)
{
	fputs("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n",
	      stdout);
	return 0;
}
1044
1045#endif /* __NR_userfaultfd */