| /* |
| * Copyright © 2015 Intel Corporation |
| * |
| * Permission is hereby granted, free of charge, to any person obtaining a |
| * copy of this software and associated documentation files (the "Software"), |
| * to deal in the Software without restriction, including without limitation |
| * the rights to use, copy, modify, merge, publish, distribute, sublicense, |
| * and/or sell copies of the Software, and to permit persons to whom the |
| * Software is furnished to do so, subject to the following conditions: |
| * |
| * The above copyright notice and this permission notice (including the next |
| * paragraph) shall be included in all copies or substantial portions of the |
| * Software. |
| * |
| * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
| * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
| * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL |
| * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
| * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING |
| * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS |
| * IN THE SOFTWARE. |
| * |
| * Authors: |
| * Tiago Vignatti |
| */ |
| |
| /** @file prime_mmap_coherency.c |
| * |
| * TODO: need to show the need for prime_sync_end(). |
| */ |
| |
| #include "igt.h" |
| |
| IGT_TEST_DESCRIPTION("Test dma-buf mmap, mostly on !llc platforms, and provoke"
| " coherency bugs so we know for sure where we need the sync ioctls.");
| |
| #define ROUNDS 20 |
| |
| int fd; |
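| /* Counts stale values found by the flush tests; reset per subtest in main(). */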
| int stale = 0; |
| static drm_intel_bufmgr *bufmgr; |
| struct intel_batchbuffer *batch; |
| static int width = 1024, height = 1024; |
| |
| /*
| * Exercises the need for a read flush:
| * 1. create a BO and write '0's to it, in the GTT domain.
| * 2. read the BO through the dma-buf CPU mmap (this dirties the CPU caches).
| * 3. write new data (0xc5 bytes) into the BO, in the GTT domain.
| * 4. read again through the mapped dma-buf and look for stale cachelines.
| */
| static void test_read_flush(bool expect_stale_cache) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr_gtt; |
| int dma_buf_fd, i; |
| |
| bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096); |
| |
| /* STEP #1: put BO 1 in the GTT domain. We use the blitter to copy the zeroed
| * BO 2 into BO 1, so commands are submitted and BO 1 most likely ends up in
| * the GTT domain. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_1, bo_2, width * height); |
| gem_sync(fd, bo_1->handle); |
| drm_intel_bo_unreference(bo_2); |
| |
| /* STEP #2: read BO 1 using the dma-buf CPU mmap. This dirties the CPU caches. */
| dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle);
| igt_skip_on(errno == EINVAL);
| 
| ptr_cpu = mmap(NULL, width * height, PROT_READ,
| MAP_SHARED, dma_buf_fd, 0);
| igt_assert(ptr_cpu != MAP_FAILED);
| 
| for (i = 0; i < (width * height) / 4; i++)
| igt_assert_eq(ptr_cpu[i], 0);
| 
| /* STEP #3: write 0xc5 bytes into BO 1. We fill a fresh BO 2 through a GTT
| * mapping and then blit it into BO 1, so BO 1 gets its new contents without
| * any CPU cache flush. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096);
| ptr_gtt = gem_mmap__gtt(fd, bo_2->handle, width * height, PROT_READ | PROT_WRITE);
| gem_set_domain(fd, bo_2->handle,
| I915_GEM_DOMAIN_GTT, I915_GEM_DOMAIN_GTT);
| memset(ptr_gtt, 0xc5, width * height);
| munmap(ptr_gtt, width * height);
| 
| intel_copy_bo(batch, bo_1, bo_2, width * height);
| gem_sync(fd, bo_1->handle);
| drm_intel_bo_unreference(bo_2);
| |
| /* STEP #4: read again using the CPU mmap. Doing #1 before #3 makes sure we
| * don't do a full CPU cache flush in step #3 again. That makes sure most of
| * the stale cachelines from step #2 survive (a few will still be evicted)
| * until we try to read them again in step #4. This behavior could be fixed
| * by flushing the CPU read domain right before accessing the CPU pointer. */
| if (!expect_stale_cache) |
| prime_sync_start(dma_buf_fd, false); |
| |
| for (i = 0; i < (width * height) / 4; i++) |
| if (ptr_cpu[i] != 0xc5c5c5c5) { |
| igt_warn_on_f(!expect_stale_cache, |
| "Found 0x%08x at offset 0x%08x\n", ptr_cpu[i], i); |
| stale++; |
| } |
| |
| drm_intel_bo_unreference(bo_1); |
| munmap(ptr_cpu, width * height); |
| } |
| |
| /*
| * Exercises the need for a write flush:
| * 1. create BO 1 and write '0's to it, in the GTT domain.
| * 2. write 0x11 bytes into BO 1 using the dma-buf CPU mmap.
| * 3. copy BO 1 into a new BO 2, in the GTT domain.
| * 4. read BO 2 through its dma-buf mmap and look for stale cachelines.
| */
| static void test_write_flush(bool expect_stale_cache) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr2_cpu; |
| int dma_buf_fd, dma_buf2_fd, i; |
| |
| bo_1 = drm_intel_bo_alloc(bufmgr, "BO 1", width * height * 4, 4096); |
| |
| /* STEP #1: put BO 1 in the GTT domain. We use the blitter to copy the zeroed
| * BO 2 into BO 1, so commands are submitted and BO 1 most likely ends up in
| * the GTT domain. */
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_1, bo_2, width * height); |
| gem_sync(fd, bo_1->handle); |
| drm_intel_bo_unreference(bo_2); |
| |
| /* STEP #2: Write 0x11 bytes into BO 1 using the dma-buf CPU mmap. */
| dma_buf_fd = prime_handle_to_fd_for_mmap(fd, bo_1->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf_fd, 0); |
| igt_assert(ptr_cpu != MAP_FAILED); |
| |
| /* This is the main point of this test: !llc hw requires a cache write |
| * flush right here (explained in step #4). */ |
| if (!expect_stale_cache) |
| prime_sync_start(dma_buf_fd, true); |
| |
| memset(ptr_cpu, 0x11, width * height); |
| |
| /* STEP #3: Copy BO 1 into BO 2, using blitter. */ |
| bo_2 = drm_intel_bo_alloc(bufmgr, "BO 2", width * height * 4, 4096); |
| intel_copy_bo(batch, bo_2, bo_1, width * height); |
| gem_sync(fd, bo_2->handle); |
| |
| /* STEP #4: compare BO 2 against the written BO 1. On !llc hardware, unless a
| * cache flush is done before the write in step #2, some cachelines won't have
| * been flushed out to memory and will still read back as 0. */
| dma_buf2_fd = prime_handle_to_fd_for_mmap(fd, bo_2->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf2_fd, 0); |
| igt_assert(ptr2_cpu != MAP_FAILED); |
| |
| for (i = 0; i < (width * height) / 4; i++) |
| if (ptr2_cpu[i] != 0x11111111) { |
| igt_warn_on_f(!expect_stale_cache, |
| "Found 0x%08x at offset 0x%08x\n", ptr2_cpu[i], i); |
| stale++; |
| } |
| |
| drm_intel_bo_unreference(bo_1); |
| drm_intel_bo_unreference(bo_2); |
| munmap(ptr_cpu, width * height); |
| } |
| |
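| /*
| * Fill BO 1 and BO 2 through their dma-buf CPU mmaps, blit BO 1 into BO 2 and
| * compare the result through the mmap again, with every CPU access bracketed
| * by prime_sync_start()/prime_sync_end(). Called in a loop from forked,
| * signal-interrupted children by test_ioctl_errors().
| */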
| static void blit_and_cmp(void) |
| { |
| drm_intel_bo *bo_1; |
| drm_intel_bo *bo_2; |
| uint32_t *ptr_cpu; |
| uint32_t *ptr2_cpu; |
| int dma_buf_fd, dma_buf2_fd, i; |
| int local_fd; |
| drm_intel_bufmgr *local_bufmgr; |
| struct intel_batchbuffer *local_batch; |
| |
| /* Recreate process-local state: this runs in forked children (see
| * test_ioctl_errors()), so each call opens its own fd, bufmgr and batch. */
| local_fd = drm_open_driver(DRIVER_INTEL); |
| local_bufmgr = drm_intel_bufmgr_gem_init(local_fd, 4096); |
| igt_assert(local_bufmgr); |
| |
| local_batch = intel_batchbuffer_alloc(local_bufmgr, intel_get_drm_devid(local_fd)); |
| igt_assert(local_batch); |
| |
| bo_1 = drm_intel_bo_alloc(local_bufmgr, "BO 1", width * height * 4, 4096); |
| dma_buf_fd = prime_handle_to_fd_for_mmap(local_fd, bo_1->handle); |
| igt_skip_on(errno == EINVAL); |
| |
| ptr_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf_fd, 0); |
| igt_assert(ptr_cpu != MAP_FAILED); |
| |
| bo_2 = drm_intel_bo_alloc(local_bufmgr, "BO 2", width * height * 4, 4096); |
| dma_buf2_fd = prime_handle_to_fd_for_mmap(local_fd, bo_2->handle); |
| |
| ptr2_cpu = mmap(NULL, width * height, PROT_READ | PROT_WRITE, |
| MAP_SHARED, dma_buf2_fd, 0); |
| igt_assert(ptr2_cpu != MAP_FAILED); |
| |
| /* Fill up BO 1 with '1's and BO 2 with '0's */ |
| prime_sync_start(dma_buf_fd, true); |
| memset(ptr_cpu, 0x11, width * height); |
| prime_sync_end(dma_buf_fd, true); |
| |
| prime_sync_start(dma_buf2_fd, true); |
| memset(ptr2_cpu, 0x00, width * height); |
| prime_sync_end(dma_buf2_fd, true); |
| |
| /* Copy BO 1 into BO 2, using blitter. */ |
| intel_copy_bo(local_batch, bo_2, bo_1, width * height); |
| usleep(0); /* let someone else claim the mutex */ |
| |
| /* Compare BOs. If prime_sync_* were executed properly, the caches |
| * should be synced. */ |
| prime_sync_start(dma_buf2_fd, false); |
| for (i = 0; i < (width * height) / 4; i++) |
| igt_fail_on_f(ptr2_cpu[i] != 0x11111111, "Found 0x%08x at offset 0x%08x\n", ptr2_cpu[i], i); |
| prime_sync_end(dma_buf2_fd, false); |
| |
| drm_intel_bo_unreference(bo_1); |
| drm_intel_bo_unreference(bo_2); |
| munmap(ptr_cpu, width * height); |
| munmap(ptr2_cpu, width * height); |
| |
| close(dma_buf_fd); |
| close(dma_buf2_fd); |
| |
| intel_batchbuffer_free(local_batch); |
| drm_intel_bufmgr_destroy(local_bufmgr); |
| close(local_fd); |
| } |
| |
| /*
| * Constantly interrupt concurrent blits to stress out prime_sync_* and make
| * sure errors returned by these ioctls are handled accordingly.
| *
| * Note that in the failure case (e.g. an ioctl that is not retried after
| * returning an error) this test does not catch the problem with 100%
| * reliability.
| */
| static void test_ioctl_errors(void) |
| { |
| int ncpus = sysconf(_SC_NPROCESSORS_ONLN); |
| |
| /* Ensure we can do at least one child */ |
| intel_require_memory(2, width*height*4, CHECK_RAM); |
| |
| for (int num_children = 1; num_children <= 8 * ncpus; num_children <<= 1) {
| uint64_t required, total;
| 
| igt_info("Spawning %d interruptible children\n", num_children);
| if (!__intel_check_memory(2*num_children, |
| width*height*4, |
| CHECK_RAM, |
| &required, &total)) { |
| igt_debug("Estimated that we need %'lluMiB for test, but only have %'lluMiB\n", |
| (long long)(required >> 20), |
| (long long)(total >> 20)); |
| break; |
| } |
| |
| igt_fork(child, num_children) |
| igt_while_interruptible(true) blit_and_cmp(); |
| igt_waitchildren(); |
| } |
| } |
| |
| int main(int argc, char **argv) |
| { |
| int i; |
| igt_subtest_init(argc, argv); |
| |
| igt_fixture { |
| fd = drm_open_driver(DRIVER_INTEL); |
| igt_require_gem(fd); |
| |
| bufmgr = drm_intel_bufmgr_gem_init(fd, 4096); |
| batch = intel_batchbuffer_alloc(bufmgr, intel_get_drm_devid(fd)); |
| } |
| |
| /* Cache coherency and cache eviction are pretty much unpredictable, so
| * reproducing failures boils down to trial and error to hit different
| * scenarios. TODO: we may want to improve the tests a bit by picking random
| * subranges. */
| igt_info("%d rounds for each test\n", ROUNDS); |
| igt_subtest("read") { |
| stale = 0; |
| igt_info("exercising read flush\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_read_flush(false); |
| igt_fail_on_f(stale, "num of stale cache lines %d\n", stale); |
| } |
| |
| /* Only for !llc platforms */ |
| igt_subtest("read-and-fail") { |
| igt_require(!gem_has_llc(fd)); |
| stale = 0; |
| igt_info("exercising read flush and expect to fail on !llc\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_read_flush(true); |
| igt_fail_on_f(!stale, "couldn't find any stale cache lines\n"); |
| } |
| |
| igt_subtest("write") { |
| stale = 0; |
| igt_info("exercising write flush\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_write_flush(false); |
| igt_fail_on_f(stale, "num of stale cache lines %d\n", stale); |
| } |
| |
| /* Only for !llc platforms */ |
| igt_subtest("write-and-fail") { |
| igt_require(!gem_has_llc(fd)); |
| stale = 0; |
| igt_info("exercising write flush and expect to fail on !llc\n"); |
| for (i = 0; i < ROUNDS; i++) |
| test_write_flush(true); |
| igt_fail_on_f(!stale, "couldn't find any stale cache lines\n"); |
| } |
| |
| igt_subtest("ioctl-errors") { |
| igt_info("exercising concurrent blit to get ioctl errors\n"); |
| test_ioctl_errors(); |
| } |
| |
| igt_fixture { |
| intel_batchbuffer_free(batch); |
| drm_intel_bufmgr_destroy(bufmgr); |
| |
| close(fd); |
| } |
| |
| igt_exit(); |
| } |