/*
 * Copyright © 2015 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 * Authors:
 *	Zhenyu Wang <zhenyuw@linux.intel.com>
 *	Dominik Zeromski <dominik.zeromski@intel.com>
 */

#include <intel_bufmgr.h>
#include <i915_drm.h>

#include "intel_reg.h"
#include "drmtest.h"
#include "intel_batchbuffer.h"
#include "gen7_media.h"
#include "gen8_media.h"
#include "gpgpu_fill.h"

/* shaders/gpgpu/gpgpu_fill.gxa */
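/*
 * Each row below is one 128-bit Gen EU instruction (four dwords). The three
 * arrays are the same fill program assembled for the gen7, gen8 and gen9
 * instruction encodings; as set up further down in this file, the kernel
 * reads the fill colour from the CURBE constant buffer and writes it to the
 * destination surface bound through the binding table.
 */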
static const uint32_t gen7_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20200231, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400c21, 0x00000004, 0x00000010 },
	{ 0x00000001, 0x20440021, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800021, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800021, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880061, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00021, 0x00000020, 0x00000000 },
	{ 0x05800031, 0x24001ca8, 0x00000080, 0x060a8000 },
	{ 0x00600001, 0x2e000021, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20001ca8, 0x00000e00, 0x82000010 },
};

static const uint32_t gen8_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20202288, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400208, 0x06000004, 0x00000010 },
	{ 0x00000001, 0x20440208, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800208, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800208, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880608, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00208, 0x00000020, 0x00000000 },
	{ 0x0c800031, 0x24000a40, 0x0e000080, 0x060a8000 },
	{ 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20000a40, 0x0e000e00, 0x82000010 },
};

static const uint32_t gen9_gpgpu_kernel[][4] = {
	{ 0x00400001, 0x20202288, 0x00000020, 0x00000000 },
	{ 0x00000041, 0x20400208, 0x06000004, 0x00000010 },
	{ 0x00000001, 0x20440208, 0x00000018, 0x00000000 },
	{ 0x00600001, 0x20800208, 0x008d0000, 0x00000000 },
	{ 0x00200001, 0x20800208, 0x00450040, 0x00000000 },
	{ 0x00000001, 0x20880608, 0x00000000, 0x0000000f },
	{ 0x00800001, 0x20a00208, 0x00000020, 0x00000000 },
	{ 0x0c800031, 0x24000a40, 0x06000080, 0x060a8000 },
	{ 0x00600001, 0x2e000208, 0x008d0000, 0x00000000 },
	{ 0x07800031, 0x20000a40, 0x06000e00, 0x82000010 },
};

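/*
 * Trivial bump suballocator over the 4096-byte batch buffer: batch_alloc()
 * carves out aligned, zeroed blocks by advancing batch->ptr, and the other
 * helpers convert between pointers into the batch and byte offsets from its
 * start, which is what the state commands emitted below consume.
 */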
static uint32_t
batch_used(struct intel_batchbuffer *batch)
{
	return batch->ptr - batch->buffer;
}

static uint32_t
batch_align(struct intel_batchbuffer *batch, uint32_t align)
{
	uint32_t offset = batch_used(batch);
	offset = ALIGN(offset, align);
	batch->ptr = batch->buffer + offset;
	return offset;
}

static void *
batch_alloc(struct intel_batchbuffer *batch, uint32_t size, uint32_t align)
{
	uint32_t offset = batch_align(batch, align);
	batch->ptr += size;
	return memset(batch->buffer + offset, 0, size);
}

static uint32_t
batch_offset(struct intel_batchbuffer *batch, void *ptr)
{
	return (uint8_t *)ptr - batch->buffer;
}

static uint32_t
batch_copy(struct intel_batchbuffer *batch, const void *ptr, uint32_t size,
	   uint32_t align)
{
	return batch_offset(batch, memcpy(batch_alloc(batch, size, align), ptr, size));
}

static void
gen7_render_flush(struct intel_batchbuffer *batch, uint32_t batch_end)
{
	int ret;

	ret = drm_intel_bo_subdata(batch->bo, 0, 4096, batch->buffer);
	if (ret == 0)
		ret = drm_intel_bo_mrb_exec(batch->bo, batch_end,
					    NULL, 0, 0, 0);
	igt_assert(ret == 0);
}

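/*
 * Allocate one 256-bit (8 dword) CURBE entry, 64-byte aligned, and stash the
 * fill colour in its first byte; MEDIA_CURBE_LOAD later points the hardware
 * at this offset so the kernel can read the colour as its constant data.
 */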
static uint32_t
gen7_fill_curbe_buffer_data(struct intel_batchbuffer *batch, uint8_t color)
{
	uint8_t *curbe_buffer;
	uint32_t offset;

	curbe_buffer = batch_alloc(batch, sizeof(uint32_t) * 8, 64);
	offset = batch_offset(batch, curbe_buffer);
	*curbe_buffer = color;

	return offset;
}

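/*
 * Build a 2D SURFACE_STATE for @buf in the state half of the batch. The
 * buffer's GTT address is not known yet, so a relocation is emitted against
 * the dword holding base_addr; the returned offset is what the binding table
 * entry points at.
 */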
static uint32_t
gen7_fill_surface_state(struct intel_batchbuffer *batch,
			struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen7_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = batch_alloc(batch, sizeof(*ss), 64);
	offset = batch_offset(batch, ss);

	ss->ss0.surface_type = GEN7_SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss1.base_addr = buf->bo->offset;
	ret = drm_intel_bo_emit_reloc(batch->bo,
				      batch_offset(batch, ss) + 4,
				      buf->bo, 0,
				      read_domain, write_domain);
	igt_assert(ret == 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;

	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

static uint32_t
gen8_fill_surface_state(struct intel_batchbuffer *batch,
			struct igt_buf *buf,
			uint32_t format,
			int is_dst)
{
	struct gen8_surface_state *ss;
	uint32_t write_domain, read_domain, offset;
	int ret;

	if (is_dst) {
		write_domain = read_domain = I915_GEM_DOMAIN_RENDER;
	} else {
		write_domain = 0;
		read_domain = I915_GEM_DOMAIN_SAMPLER;
	}

	ss = batch_alloc(batch, sizeof(*ss), 64);
	offset = batch_offset(batch, ss);

	ss->ss0.surface_type = GEN8_SURFACE_2D;
	ss->ss0.surface_format = format;
	ss->ss0.render_cache_read_write = 1;
	ss->ss0.vertical_alignment = 1; /* align 4 */
	ss->ss0.horizontal_alignment = 1; /* align 4 */

	if (buf->tiling == I915_TILING_X)
		ss->ss0.tiled_mode = 2;
	else if (buf->tiling == I915_TILING_Y)
		ss->ss0.tiled_mode = 3;

	ss->ss8.base_addr = buf->bo->offset;

	ret = drm_intel_bo_emit_reloc(batch->bo,
				      batch_offset(batch, ss) + 8 * 4,
				      buf->bo, 0,
				      read_domain, write_domain);
	igt_assert_eq(ret, 0);

	ss->ss2.height = igt_buf_height(buf) - 1;
	ss->ss2.width = igt_buf_width(buf) - 1;
	ss->ss3.pitch = buf->stride - 1;

	ss->ss7.shader_chanel_select_r = 4;
	ss->ss7.shader_chanel_select_g = 5;
	ss->ss7.shader_chanel_select_b = 6;
	ss->ss7.shader_chanel_select_a = 7;

	return offset;
}

static uint32_t
gen7_fill_binding_table(struct intel_batchbuffer *batch,
			struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = batch_alloc(batch, 32, 64);
	offset = batch_offset(batch, binding_table);

	binding_table[0] = gen7_fill_surface_state(batch, dst, GEN7_SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

static uint32_t
gen8_fill_binding_table(struct intel_batchbuffer *batch,
			struct igt_buf *dst)
{
	uint32_t *binding_table, offset;

	binding_table = batch_alloc(batch, 32, 64);
	offset = batch_offset(batch, binding_table);

	binding_table[0] = gen8_fill_surface_state(batch, dst, GEN8_SURFACEFORMAT_R8_UNORM, 1);

	return offset;
}

static uint32_t
gen7_fill_gpgpu_kernel(struct intel_batchbuffer *batch,
		       const uint32_t kernel[][4],
		       size_t size)
{
	uint32_t offset;

	offset = batch_copy(batch, kernel, size, 64);

	return offset;
}

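/*
 * The interface descriptor ties the kernel, binding table and CURBE layout
 * together; MEDIA_INTERFACE_DESCRIPTOR_LOAD later hands its offset to the
 * fixed-function hardware. kernel_start_pointer and binding_table_pointer
 * are in units of 64 and 32 bytes respectively, hence the shifts below.
 */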
static uint32_t
gen7_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
			       const uint32_t kernel[][4], size_t size)
{
	struct gen7_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen7_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);

	idd = batch_alloc(batch, sizeof(*idd), 64);
	offset = batch_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc1.single_program_flow = 1;
	idd->desc1.floating_point_mode = GEN7_FLOATING_POINT_IEEE_754;

	idd->desc2.sampler_count = 0; /* 0 samplers used */
	idd->desc2.sampler_state_pointer = 0;

	idd->desc3.binding_table_entry_count = 0;
	idd->desc3.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc4.constant_urb_entry_read_offset = 0;
	idd->desc4.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

static uint32_t
gen8_fill_interface_descriptor(struct intel_batchbuffer *batch, struct igt_buf *dst,
			       const uint32_t kernel[][4], size_t size)
{
	struct gen8_interface_descriptor_data *idd;
	uint32_t offset;
	uint32_t binding_table_offset, kernel_offset;

	binding_table_offset = gen8_fill_binding_table(batch, dst);
	kernel_offset = gen7_fill_gpgpu_kernel(batch, kernel, size);

	idd = batch_alloc(batch, sizeof(*idd), 64);
	offset = batch_offset(batch, idd);

	idd->desc0.kernel_start_pointer = (kernel_offset >> 6);

	idd->desc2.single_program_flow = 1;
	idd->desc2.floating_point_mode = GEN8_FLOATING_POINT_IEEE_754;

	idd->desc3.sampler_count = 0; /* 0 samplers used */
	idd->desc3.sampler_state_pointer = 0;

	idd->desc4.binding_table_entry_count = 0;
	idd->desc4.binding_table_pointer = (binding_table_offset >> 5);

	idd->desc5.constant_urb_entry_read_offset = 0;
	idd->desc5.constant_urb_entry_read_length = 1; /* grf 1 */

	return offset;
}

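/*
 * All of the offsets produced above are relative to the start of the batch
 * bo, so the per-gen STATE_BASE_ADDRESS variants below point the surface,
 * dynamic and instruction state bases at the batch bo via relocations.
 */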
static void
gen7_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_STATE_BASE_ADDRESS | (10 - 2));

	/* general */
	OUT_BATCH(0);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general/dynamic/indirect/instruction access upper bounds */
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
}

static void
gen8_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (16 - 2));

	/* general */
	OUT_BATCH(0 | (0x78 << 4) | (0 << 1) | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set, otherwise the GPU may hang */
	OUT_BATCH(1 << 12 | 1);
}

static void
gen9_emit_state_base_address(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_STATE_BASE_ADDRESS | (19 - 2));

	/* general */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);

	/* stateless data port */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);

	/* surface */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_SAMPLER, 0, BASE_ADDRESS_MODIFY);

	/* dynamic */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_RENDER | I915_GEM_DOMAIN_INSTRUCTION,
		  0, BASE_ADDRESS_MODIFY);

	/* indirect */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* instruction */
	OUT_RELOC(batch->bo, I915_GEM_DOMAIN_INSTRUCTION, 0, BASE_ADDRESS_MODIFY);

	/* general state buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* dynamic state buffer size */
	OUT_BATCH(1 << 12 | 1);
	/* indirect object buffer size */
	OUT_BATCH(0xfffff000 | 1);
	/* instruction buffer size; the modify enable bit must be set, otherwise the GPU may hang */
	OUT_BATCH(1 << 12 | 1);

	/* Bindless surface state base address */
	OUT_BATCH(0 | BASE_ADDRESS_MODIFY);
	OUT_BATCH(0);
	OUT_BATCH(0xfffff000);
}

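/*
 * MEDIA_VFE_STATE programs the thread dispatcher with the minimal
 * configuration used here: no scratch space, no URB entries and a single
 * 256-bit CURBE allocation; the gen7 variant additionally has to select
 * GPGPU mode explicitly.
 */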
static void
gen7_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN7_MEDIA_VFE_STATE | (8 - 2));

	/* scratch buffer */
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(1 << 16 | /* max num of threads */
		  0 << 8 |  /* num of URB entry */
		  1 << 2);  /* GPGPU mode */

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(0 << 16 | /* URB entry size in 256 bits unit */
		  1);	    /* CURBE entry size in 256 bits unit */

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen8_emit_vfe_state_gpgpu(struct intel_batchbuffer *batch)
{
	OUT_BATCH(GEN8_MEDIA_VFE_STATE | (9 - 2));

	/* scratch buffer */
	OUT_BATCH(0);
	OUT_BATCH(0);

	/* number of threads & urb entries */
	OUT_BATCH(1 << 16 | 1 << 8);

	OUT_BATCH(0);

	/* urb entry size & curbe size */
	OUT_BATCH(0 << 16 | 1);

	/* scoreboard */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(0);
}

static void
gen7_emit_curbe_load(struct intel_batchbuffer *batch, uint32_t curbe_buffer)
{
	OUT_BATCH(GEN7_MEDIA_CURBE_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* curbe total data length */
	OUT_BATCH(64);
	/* curbe data start address, relative to the dynamic state base address */
	OUT_BATCH(curbe_buffer);
}

static void
gen7_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
{
	OUT_BATCH(GEN7_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	OUT_BATCH(sizeof(struct gen7_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base address */
	OUT_BATCH(interface_descriptor);
}

static void
gen8_emit_interface_descriptor_load(struct intel_batchbuffer *batch, uint32_t interface_descriptor)
{
	OUT_BATCH(GEN8_MEDIA_INTERFACE_DESCRIPTOR_LOAD | (4 - 2));
	OUT_BATCH(0);
	/* interface descriptor data length */
	OUT_BATCH(sizeof(struct gen8_interface_descriptor_data));
	/* interface descriptor address, relative to the dynamic state base address */
	OUT_BATCH(interface_descriptor);
}

static void
gen7_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned x, unsigned y,
		     unsigned width, unsigned height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Use SIMD16 dispatch, so every thread uses all 16 SIMD channels.
	 *
	 * Pick a thread group size of 16x1, so each group contains exactly
	 * one SIMD16 thread and the thread width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 *      thread group Y = height
	 */
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;
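	/*
	 * Example: for width = 24, x_dim = (24 + 15) / 16 = 2 thread groups
	 * in X and right_mask = (1 << 8) - 1 = 0xff, so only the first 8
	 * SIMD channels execute in the right-most column of thread groups.
	 */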

	OUT_BATCH(GEN7_GPGPU_WALKER | 9);

	/* interface descriptor offset */
	OUT_BATCH(0);

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 |  /* height:1 */
		  0);	    /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

static void
gen8_emit_gpgpu_walk(struct intel_batchbuffer *batch,
		     unsigned x, unsigned y,
		     unsigned width, unsigned height)
{
	uint32_t x_dim, y_dim, tmp, right_mask;

	/*
	 * Use SIMD16 dispatch, so every thread uses all 16 SIMD channels.
	 *
	 * Pick a thread group size of 16x1, so each group contains exactly
	 * one SIMD16 thread and the thread width/height/depth are all 1.
	 *
	 * Then thread group X = width / 16 (rounded up)
	 *      thread group Y = height
	 */
	x_dim = (width + 15) / 16;
	y_dim = height;

	tmp = width & 15;
	if (tmp == 0)
		right_mask = (1 << 16) - 1;
	else
		right_mask = (1 << tmp) - 1;
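	/*
	 * e.g. width = 32: x_dim = 2 and width & 15 == 0, so right_mask is
	 * 0xffff and all 16 channels run in every thread group.
	 */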

	OUT_BATCH(GEN7_GPGPU_WALKER | 13);

	OUT_BATCH(0); /* kernel offset */
	OUT_BATCH(0); /* indirect data length */
	OUT_BATCH(0); /* indirect data offset */

	/* SIMD size, thread w/h/d */
	OUT_BATCH(1 << 30 | /* SIMD16 */
		  0 << 16 | /* depth:1 */
		  0 << 8 |  /* height:1 */
		  0);	    /* width:1 */

	/* thread group X */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(x_dim);

	/* thread group Y */
	OUT_BATCH(0);
	OUT_BATCH(0);
	OUT_BATCH(y_dim);

	/* thread group Z */
	OUT_BATCH(0);
	OUT_BATCH(1);

	/* right mask */
	OUT_BATCH(right_mask);

	/* bottom mask, height 1, always 0xffffffff */
	OUT_BATCH(0xffffffff);
}

/*
 * This sets up the gpgpu pipeline.
 *
 * +---------------+ <---- 4096
 * |       ^       |
 * |       |       |
 * |    various    |
 * |     state     |
 * |       |       |
 * |_______|_______| <---- 2048 + ?
 * |       ^       |
 * |       |       |
 * |     batch     |
 * |    commands   |
 * |       |       |
 * |       |       |
 * +---------------+ <---- 0 + ?
 *
 */

#define BATCH_STATE_SPLIT 2048
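/*
 * The fill functions below first build the indirect state (CURBE, surface
 * state, binding table, kernel, interface descriptor) above
 * BATCH_STATE_SPLIT, then rewind batch->ptr and emit the actual commands
 * from offset 0, so both halves live in the same 4096-byte batch bo.
 */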

void
gen7_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen7_fill_interface_descriptor(batch, dst,
							       gen7_gpgpu_kernel,
							       sizeof(gen7_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen7_emit_state_base_address(batch);
	gen7_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
	gen7_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}

void
gen8_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
							       gen8_gpgpu_kernel,
							       sizeof(gen8_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen8_emit_state_base_address(batch);
	gen8_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen8_emit_interface_descriptor_load(batch, interface_descriptor);
	gen8_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}

void
gen9_gpgpu_fillfunc(struct intel_batchbuffer *batch,
		    struct igt_buf *dst,
		    unsigned x, unsigned y,
		    unsigned width, unsigned height,
		    uint8_t color)
{
	uint32_t curbe_buffer, interface_descriptor;
	uint32_t batch_end;

	intel_batchbuffer_flush(batch);

	/* setup states */
	batch->ptr = &batch->buffer[BATCH_STATE_SPLIT];

	/*
	 * The constant buffer needs to be filled for every thread, but since
	 * there is only one thread per thread group here, a single CURBE
	 * entry is enough.
	 *
	 * Each thread then just uses its thread group ID as the buffer offset.
	 */
	curbe_buffer = gen7_fill_curbe_buffer_data(batch, color);

	interface_descriptor = gen8_fill_interface_descriptor(batch, dst,
							       gen9_gpgpu_kernel,
							       sizeof(gen9_gpgpu_kernel));
	igt_assert(batch->ptr < &batch->buffer[4095]);

	batch->ptr = batch->buffer;

	/* GPGPU pipeline */
	OUT_BATCH(GEN7_PIPELINE_SELECT | PIPELINE_SELECT_GPGPU);

	gen9_emit_state_base_address(batch);
	gen8_emit_vfe_state_gpgpu(batch);
	gen7_emit_curbe_load(batch, curbe_buffer);
	gen7_emit_interface_descriptor_load(batch, interface_descriptor);
	gen8_emit_gpgpu_walk(batch, x, y, width, height);

	OUT_BATCH(MI_BATCH_BUFFER_END);

	batch_end = batch_align(batch, 8);
	igt_assert(batch_end < BATCH_STATE_SPLIT);

	gen7_render_flush(batch, batch_end);
	intel_batchbuffer_reset(batch);
}
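
/*
 * Usage sketch (illustrative, not part of this file): callers normally do
 * not invoke these per-generation entry points directly but look the right
 * one up by device id, e.g. with igt_get_gpgpu_fillfunc() from
 * intel_batchbuffer.h, and then fill a rectangle of an igt_buf with a
 * single byte value:
 *
 *	igt_fillfunc_t fill = igt_get_gpgpu_fillfunc(devid);
 *
 *	if (fill)
 *		fill(batch, &dst, x, y, width, height, 0x55);
 */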