| /* |
| * Copyright © 2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * Authors: |
| * Tiago Vignatti |
| */ |
| |
| /** @file prime_mmap_coherency.c |
| * |
| * TODO: need to show the need for prime_sync_end(). |
| */ |
| |
| #include "igt.h" |
| |
| IGT_TEST_DESCRIPTION("Test dma-buf mmap, mostly on !llc platforms, and provoke"
| " coherency bugs so we know for sure where we need the sync ioctls.");
| |
| #define ROUNDS 20 |
| |
| int fd; |
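| /* Counts stale values found by the flush tests; reset per subtest in main(). */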
| int stale = 0; |
| static drm_intel_bufmgr *bufmgr; |
| struct intel_batchbuffer *batch; |
| static int width = 1024, height = 1024; |
| |
| /*
| * Exercises the need for a read flush:
| * 1. create a BO and write '0's to it, in the GTT domain.
| * 2. read the BO through the dma-buf CPU mmap (this dirties the CPU caches).
| * 3. write new data (0xc5 bytes) into the BO, in the GTT domain.
| * 4. read again through the mapped dma-buf and look for stale cachelines.
| */
| static void test_read_flush(bool expect_stale_cache) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr_gtt; |
| int dma_buf_fd, i; |
| |
| bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096); |
| |
| /* STEP #1: put BO 1 in the GTT domain. We use the blitter to copy the zeroed
| * BO 2 into BO 1, so commands are submitted and BO 1 most likely ends up in
| * the GTT domain. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_1, bo_2, width * height); |
| gem_sync(fd, bo_1->handle); |
| drm_intel_bo_unreference(bo_2); |
| |
| /* STEP #2: read BO 1 using the dma-buf CPU mmap. This dirties the CPU caches. */
| dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle);
| igt_skip_on(errno == EINVAL);
| 
| ptr_cpu = mmap(NULL, width * height, PROT_READ,
| MAP_SHARED, dma_buf_fd, 0);
| igt_assert(ptr_cpu != MAP_FAILED);
| 
| for (i = 0; i < (width * height) / 4; i++)
| igt_assert_eq(ptr_cpu[i], 0);
| 
| /* STEP #3: write 0xc5 bytes into BO 1. We fill a fresh BO 2 through a GTT
| * mapping and then blit it into BO 1, so BO 1 gets its new contents without
| * any CPU cache flush. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
| ptr_gtt = gem_mmap__gtt(fd, bo_2->handle, width * height, PROT_READ | PROT_WRITE);
| gem_set_domain(fd, bo_2->handle,
| I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
| memset(ptr_gtt, 0xc5, width * height);
| munmap(ptr_gtt, width * height);
| 
| intel_copy_bo(batch, bo_1, bo_2, width * height);
| gem_sync(fd, bo_1->handle);
| drm_intel_bo_unreference(bo_2);
| |
| /* STEP #4: read again using the CPU mmap. Doing #1 before #3 makes sure we
| * don't do a full CPU cache flush in step #3 again. That makes sure most of
| * the stale cachelines from step #2 survive (a few will still be evicted)
| * until we try to read them again in step #4. This behavior could be fixed
| * by flushing the CPU read domain right before accessing the CPU pointer. */
| if (!expect_stale_cache) |
| prime_sync_start(dma_buf_fd, false); |
| |
| for (i = 0; i < (width * height) / 4; i++) |
| if (ptr_cpu[i] != 0xc5c5c5c5) { |
| igt_warn_on_f(!expect_stale_cache, |
| "Found 0x%08x at offset 0x%08x\n", ptr_cpu[i], i); |
| stale++; |
| } |
| |
| drm_intel_bo_unreference(bo_1); |
| munmap(ptr_cpu, width * height); |
| } |
| |
| /*
| * Exercises the need for a write flush:
| * 1. create BO 1 and write '0's to it, in the GTT domain.
| * 2. write 0x11 bytes into BO 1 using the dma-buf CPU mmap.
| * 3. copy BO 1 into a new BO 2, in the GTT domain.
| * 4. read BO 2 through its dma-buf mmap and look for stale cachelines.
| */
| static void test_write_flush(bool expect_stale_cache) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr2_cpu; |
| int dma_buf_fd, dma_buf2_fd, i; |
| |
| bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096); |
| |
| /* STEP #1: put BO 1 in the GTT domain. We use the blitter to copy the zeroed
| * BO 2 into BO 1, so commands are submitted and BO 1 most likely ends up in
| * the GTT domain. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_1, bo_2, width * height); |
| gem_sync(fd, bo_1->handle); |
| drm_intel_bo_unreference(bo_2); |
| |
| /* STEP #2: Write 0x11 bytes into BO 1 using the dma-buf CPU mmap. */
| dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf_fd, 0); |
| igt_assert(ptr_cpu != MAP_FAILED); |
| |
| /* This is the main point of this test: !llc hw requires a cache write |
| * flush right here (explained in step #4). */ |
| if (!expect_stale_cache) |
| prime_sync_start(dma_buf_fd, true); |
| |
| memset(ptr_cpu, 0x11, width * height); |
| |
| /* STEP #3: Copy BO 1 into BO 2, using blitter. */ |
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_2, bo_1, width * height); |
| gem_sync(fd, bo_2->handle); |
| |
| /* STEP #4: compare BO 2 against the written BO 1. On !llc hardware, unless a
| * cache flush is done before the write in step #2, some cachelines won't have
| * been flushed out to memory and will still read back as 0. */
| dma_buf2_fd = prime_handle_to_fd_for_mmap(fd, bo_2->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf2_fd, 0); |
| igt_assert(ptr2_cpu != MAP_FAILED); |
| |
| for (i = 0; i < (width * height) / 4; i++) |
| if (ptr2_cpu[i] != 0x11111111) { |
| igt_warn_on_f(!expect_stale_cache, |
| "Found 0x%08x at offset 0x%08x\n", ptr2_cpu[i], i); |
| stale++; |
| } |
| |
| drm_intel_bo_unreference(bo_1); |
| drm_intel_bo_unreference(bo_2); |
| munmap(ptr_cpu, width * height); |
| } |
| |
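| /*
| * Fill BO 1 and BO 2 through their dma-buf CPU mmaps, blit BO 1 into BO 2 and
| * compare the result through the mmap again, with every CPU access bracketed
| * by prime_sync_start()/prime_sync_end(). Called in a loop from forked,
| * signal-interrupted children by test_ioctl_errors().
| */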
| static void blit_and_cmp(void) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr2_cpu; |
| int dma_buf_fd, dma_buf2_fd, i; |
| int local_fd; |
| drm_intel_bufmgr *local_bufmgr; |
| struct intel_batchbuffer *local_batch; |
| |
| /* Recreate process-local state: this runs in forked children (see
| * test_ioctl_errors()), so each call opens its own fd, bufmgr and batch. */
| local_fd = drm_open_driver(DRIVER_INTEL); |
| local_bufmgr = drm_intel_bufmgr_gem_init(local_fd, 4096); |
| igt_assert(local_bufmgr); |
| |
| local_batch = intel_batchbuffer_alloc(local_bufmgr, intel_get_drm_devid(local_fd)); |
| igt_assert(local_batch); |
| |
| bo_1 = drm_intel_bo_alloc(local_bufmgr, "BO 1", width * height * 4, 4096); |
| dma_buf_fd = prime_handle_to_fd_for_mmap(local_fd, bo_1->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf_fd, 0); |
| igt_assert(ptr_cpu != MAP_FAILED); |
| |
| bo_2 = drm_intel_bo_alloc(local_bufmgr, "BO 2", width * height * 4, 4096); |
| dma_buf2_fd = prime_handle_to_fd_for_mmap(local_fd, bo_2->handle); |
| |
| ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf2_fd, 0); |
| igt_assert(ptr2_cpu != MAP_FAILED); |
| |
| /* Fill up BO 1 with '1's and BO 2 with '0's */ |
| prime_sync_start(dma_buf_fd, true); |
| memset(ptr_cpu, 0x11, width * height); |
| prime_sync_end(dma_buf_fd, true); |
| |
| prime_sync_start(dma_buf2_fd, true); |
| memset(ptr2_cpu, 0x00, width * height); |
| prime_sync_end(dma_buf2_fd, true); |
| |
| /* Copy BO 1 into BO 2, using blitter. */ |
| intel_copy_bo(local_batch, bo_2, bo_1, width * height); |
| usleep(0); /* let someone else claim the mutex */ |
| |
| /* Compare BOs. If prime_sync_* were executed properly, the caches |
| * should be synced. */ |
| prime_sync_start(dma_buf2_fd, false); |
| for (i = 0; i < (width * height) / 4; i++) |
| igt_fail_on_f(ptr2_cpu[i] != 0x11111111, "Found 0x%08x at offset 0x%08x\n", ptr2_cpu[i], i); |
| prime_sync_end(dma_buf2_fd, false); |
| |
| drm_intel_bo_unreference(bo_1); |
| drm_intel_bo_unreference(bo_2); |
| munmap(ptr_cpu, width * height); |
| munmap(ptr2_cpu, width * height); |
| |
| close(dma_buf_fd); |
| close(dma_buf2_fd); |
| |
| intel_batchbuffer_free(local_batch); |
| drm_intel_bufmgr_destroy(local_bufmgr); |
| close(local_fd); |
| } |
| |
| /*
| * Constantly interrupt concurrent blits to stress out prime_sync_* and make
| * sure errors returned by these ioctls are handled accordingly.
| *
| * Note that in the failure case (e.g. an ioctl that is not retried after
| * returning an error) this test does not catch the problem with 100%
| * reliability.
| */
| static void test_ioctl_errors(void) |
| { |
| int ncpus = sysconf(_SC_NPROCESSORS_ONLN); |
| |
| /* Ensure we can do at least one child */ |
| intel_require_memory(2, width*height*4, CHECK_RAM); |
| |
| for (int num_children = 1; num_children <= 8 * ncpus; num_children <<= 1) {
| uint64_t required, total;
| 
| igt_info("Spawning %d interruptible children\n", num_children);
| if (!__intel_check_memory(2*num_children, |
| width*height*4, |
| CHECK_RAM, |
| &required, &total)) { |
| igt_debug("Estimated that we need %'lluMiB for test, but only have %'lluMiB\n", |
| (long long)(required >> 20), |
| (long long)(total >> 20)); |
| break; |
| } |
| |
| igt_fork(child, num_children) |
| igt_while_interruptible(true) blit_and_cmp(); |
| igt_waitchildren(); |
| } |
| } |
| |
| int main(int argc, char **argv) |
| { |
| int i; |
| igt_subtest_init(argc, argv); |
| |
| igt_fixture { |
| fd = drm_open_driver(DRIVER_INTEL); |
| igt_require_gem(fd); |
| |
| bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); |
| batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd)); |
| } |
| |
| /* Cache coherency and cache eviction are pretty much unpredictable, so
| * reproducing failures boils down to trial and error to hit different
| * scenarios. TODO: we may want to improve the tests a bit by picking random
| * subranges. */
| igt_info("%d rounds for each test\n", ROUNDS); |
| igt_subtest("read") { |
| stale = 0; |
| igt_info("exercising read flush\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_read_flush(false); |
| igt_fail_on_f(stale, "num of stale cache lines %d\n", stale); |
| } |
| |
| /* Only for !llc platforms */ |
| igt_subtest("read-and-fail") { |
| igt_require(!gem_has_llc(fd)); |
| stale = 0; |
| igt_info("exercising read flush and expect to fail on !llc\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_read_flush(true); |
| igt_fail_on_f(!stale, "couldn't find any stale cache lines\n"); |
| } |
| |
| igt_subtest("write") { |
| stale = 0; |
| igt_info("exercising write flush\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_write_flush(false); |
| igt_fail_on_f(stale, "num of stale cache lines %d\n", stale); |
| } |
| |
| /* Only for !llc platforms */ |
| igt_subtest("write-and-fail") { |
| igt_require(!gem_has_llc(fd)); |
| stale = 0; |
| igt_info("exercising write flush and expect to fail on !llc\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_write_flush(true); |
| igt_fail_on_f(!stale, "couldn't find any stale cache lines\n"); |
| } |
| |
| igt_subtest("ioctl-errors") { |
| igt_info("exercising concurrent blit to get ioctl errors\n"); |
| test_ioctl_errors(); |
| } |
| |
| igt_fixture { |
| intel_batchbuffer_free(batch); |
| drm_intel_bufmgr_destroy(bufmgr); |
| |
| close(fd); |
| } |
| |
| igt_exit(); |
| } |