/*
 Copyright (C) Intel Corp.  2006.  All Rights Reserved.
 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
 develop this 3D driver.

 Permission is hereby granted, free of charge, to any person obtaining
 a copy of this software and associated documentation files (the
 "Software"), to deal in the Software without restriction, including
 without limitation the rights to use, copy, modify, merge, publish,
 distribute, sublicense, and/or sell copies of the Software, and to
 permit persons to whom the Software is furnished to do so, subject to
 the following conditions:

 The above copyright notice and this permission notice (including the
 next paragraph) shall be included in all copies or substantial
 portions of the Software.

 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.

 **********************************************************************/
/*
 * Authors:
 *   Keith Whitwell <keith@tungstengraphics.com>
 */

#include <string.h>

#include "brw_context.h"
#include "brw_defines.h"
#include "brw_eu.h"

#include "ralloc.h"

/***********************************************************************
 * Internal helper for constructing instructions
 */

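/* Infer an execution size from a register's width field.  This relies on
 * the BRW_WIDTH_* and BRW_EXECUTE_* enums sharing the same log2 encoding
 * (1, 2, 4, 8 and 16 map to 0..4), which is what the "definitions are
 * compatible" note below refers to.
 */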
static void guess_execution_size(struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 struct brw_reg reg)
{
   if (reg.width == BRW_WIDTH_8 && p->compressed)
      insn->header.execution_size = BRW_EXECUTE_16;
   else
      insn->header.execution_size = reg.width;   /* note - definitions are compatible */
}


/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case.  This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen < 6)
      return;

   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   *src = brw_message_reg(msg_reg_nr);
}
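
/* For reference, a typical call sequence before emitting a SEND looks
 * roughly like this (a sketch; the register numbers are made up):
 *
 *    struct brw_reg src = brw_vec8_grf(2, 0);
 *    gen6_resolve_implied_move(p, &src, 1);
 *
 * On gen6+, src then refers to m1 and the payload has been MOVed there;
 * on older parts the SEND instruction's implied move does the same job.
 */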

static void
gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
{
   /* From the BSpec / ISA Reference / send - [DevIVB+]:
    * "The send with EOT should use register space R112-R127 for <src>. This is
    * to enable loading of a new thread into the same slot while the message
    * with EOT for current thread is pending dispatch."
    *
    * Since we're pretending to have 16 MRFs anyway, we may as well use the
    * registers required for messages with EOT.
    */
   struct intel_context *intel = &p->brw->intel;
   if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
      reg->file = BRW_GENERAL_REGISTER_FILE;
      reg->nr += GEN7_MRF_HACK_START;
   }
}
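
/* Assuming GEN7_MRF_HACK_START is the first GRF reserved for this trick
 * (the R112-R127 range quoted above), the fake MRFs m0..m15 simply map
 * onto r(GEN7_MRF_HACK_START + 0) .. r(GEN7_MRF_HACK_START + 15).
 */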
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* Even though it is ignored in da16 mode, it still needs to be
          * set to '01'.
          */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* Even though it is ignored in ia16 mode, it still needs to be
          * set to '01'.
          */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* Set the execution size based on dest.width and the current
    * compression state:
    */
   guess_execution_size(p, insn, dest);
}

extern int reg_type_size[];

static void
validate_reg(struct brw_instruction *insn, struct brw_reg reg)
{
   int hstride_for_reg[] = {0, 1, 2, 4};
   int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
   int width_for_reg[] = {1, 2, 4, 8, 16};
   int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
   int width, hstride, vstride, execsize;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      /* 3.3.6: Region Parameters.  Restriction: Immediate vectors
       * mean the destination has to be 128-bit aligned and the
       * destination horiz stride has to be a word.
       */
      if (reg.type == BRW_REGISTER_TYPE_V) {
         assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
                reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
      }

      return;
   }

   if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       reg.nr == BRW_ARF_NULL)
      return;

   assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
   hstride = hstride_for_reg[reg.hstride];

   if (reg.vstride == 0xf) {
      vstride = -1;
   } else {
      assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
      vstride = vstride_for_reg[reg.vstride];
   }

   assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
   width = width_for_reg[reg.width];

   assert(insn->header.execution_size >= 0 &&
          insn->header.execution_size < Elements(execsize_for_reg));
   execsize = execsize_for_reg[insn->header.execution_size];

   /* Restrictions from 3.3.10: Register Region Restrictions. */
   /* 3. */
   assert(execsize >= width);

   /* 4. */
   if (execsize == width && hstride != 0) {
      assert(vstride == -1 || vstride == width * hstride);
   }

   /* 5. */
   if (execsize == width && hstride == 0) {
      /* no restriction on vstride. */
   }

   /* 6. */
   if (width == 1) {
      assert(hstride == 0);
   }

   /* 7. */
   if (execsize == 1 && width == 1) {
      assert(hstride == 0);
      assert(vstride == 0);
   }

   /* 8. */
   if (vstride == 0 && hstride == 0) {
      assert(width == 1);
   }

   /* 10. Check destination issues. */
}
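
/* Worked example: a region like r2.0<8;8,1>:f (vstride 8, width 8,
 * hstride 1) at execution size 8 passes the checks above: execsize (8)
 * >= width (8), and since execsize == width with a non-zero hstride,
 * vstride == width * hstride == 8.
 */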

void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
                           insn->header.opcode == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */
      insn->bits1.da1.src1_reg_file = 0; /* arf */
      insn->bits1.da1.src1_reg_type = reg.type;
   }
   else
   {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits2.da1.src0_horiz_stride = reg.hstride;
            insn->bits2.da1.src0_width = reg.width;
            insn->bits2.da1.src0_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}


void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* This is a hardware restriction, which may or may not be lifted
       * in the future:
       */
      assert (reg.address_mode == BRW_ADDRESS_DIRECT);
      /* assert (reg.file == BRW_GENERAL_REGISTER_FILE); */

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits3.da1.src1_subreg_nr = reg.subnr;
         insn->bits3.da1.src1_reg_nr = reg.nr;
      }
      else {
         insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
         insn->bits3.da16.src1_reg_nr = reg.nr;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
            insn->bits3.da1.src1_horiz_stride = reg.hstride;
            insn->bits3.da1.src1_width = reg.width;
            insn->bits3.da1.src1_vert_stride = reg.vstride;
         }
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}

/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}

static void brw_set_math_message( struct brw_compile *p,
                                  struct brw_instruction *insn,
                                  GLuint function,
                                  GLuint integer_type,
                                  bool low_precision,
                                  GLuint dataType )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned msg_length;
   unsigned response_length;

   /* Infer message length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_POW:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
   case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      msg_length = 2;
      break;
   default:
      msg_length = 1;
      break;
   }

   /* Infer response length from the function */
   switch (function) {
   case BRW_MATH_FUNCTION_SINCOS:
   case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
      response_length = 2;
      break;
   default:
      response_length = 1;
      break;
   }

   brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
                              msg_length, response_length, false, false);
   if (intel->gen == 5) {
      insn->bits3.math_gen5.function = function;
      insn->bits3.math_gen5.int_type = integer_type;
      insn->bits3.math_gen5.precision = low_precision;
      insn->bits3.math_gen5.saturate = insn->header.saturate;
      insn->bits3.math_gen5.data_type = dataType;
      insn->bits3.math_gen5.snapshot = 0;
   } else {
      insn->bits3.math.function = function;
      insn->bits3.math.int_type = integer_type;
      insn->bits3.math.precision = low_precision;
      insn->bits3.math.saturate = insn->header.saturate;
      insn->bits3.math.data_type = dataType;
   }
   insn->header.saturate = 0;
}


static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}

static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;  /* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;  /* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;  /* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;  /* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;  /* ? */
      insn->bits3.urb.complete = complete;
   }
}

void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control |
                                        last_render_target << 6;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control |
                                        last_render_target << 5;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}

void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        bool header_present,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;                 /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;                       /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;               /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;                 /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;                       /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;               /*14:15*/
   }
}

void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}


#define next_insn brw_next_insn
struct brw_instruction *
brw_next_insn(struct brw_compile *p, GLuint opcode)
{
   struct brw_instruction *insn;

   if (p->nr_insn + 1 > p->store_size) {
      if (0)
         printf("increasing the store size to %d\n", p->store_size << 1);
      p->store_size <<= 1;
      p->store = reralloc(p->mem_ctx, p->store,
                          struct brw_instruction, p->store_size);
      if (!p->store)
         assert(!"realloc eu store memory failed");
   }

   p->next_insn_offset += 16;
   insn = &p->store[p->nr_insn++];
   memcpy(insn, p->current, sizeof(*insn));

   /* Reset this one-shot flag:
    */

   if (p->current->header.destreg__conditionalmod) {
      p->current->header.destreg__conditionalmod = 0;
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
   }

   insn->header.opcode = opcode;
   return insn;
}
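
/* Because brw_next_insn() may double and reralloc() the store, any
 * struct brw_instruction pointer taken earlier can be left dangling by a
 * later call.  Code that emits instructions between obtaining and using
 * such a pointer should keep an index into p->store instead (see
 * brw_ENDIF and brw_land_fwd_jump below for the two styles).
 */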

static struct brw_instruction *brw_alu1( struct brw_compile *p,
                                         GLuint opcode,
                                         struct brw_reg dest,
                                         struct brw_reg src )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src);
   return insn;
}

static struct brw_instruction *brw_alu2(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1 )
{
   struct brw_instruction *insn = next_insn(p, opcode);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
   return insn;
}

static int
get_3src_subreg_nr(struct brw_reg reg)
{
   if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
      assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
      return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
   } else {
      return reg.subnr / 4;
   }
}

static struct brw_instruction *brw_alu3(struct brw_compile *p,
                                        GLuint opcode,
                                        struct brw_reg dest,
                                        struct brw_reg src0,
                                        struct brw_reg src1,
                                        struct brw_reg src2)
{
   struct brw_instruction *insn = next_insn(p, opcode);

   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   assert(dest.type == BRW_REGISTER_TYPE_F);
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);

   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   assert(src0.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;

   assert(src1.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.address_mode == BRW_ADDRESS_DIRECT);
   assert(src1.nr < 128);
   assert(src1.type == BRW_REGISTER_TYPE_F);
   insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
   insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
   insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
   insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src1_reg_nr = src1.nr;
   insn->bits1.da3src.src1_abs = src1.abs;
   insn->bits1.da3src.src1_negate = src1.negate;

   assert(src2.file == BRW_GENERAL_REGISTER_FILE);
   assert(src2.address_mode == BRW_ADDRESS_DIRECT);
   assert(src2.nr < 128);
   assert(src2.type == BRW_REGISTER_TYPE_F);
   insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
   insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
   insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
   insn->bits3.da3src.src2_reg_nr = src2.nr;
   insn->bits1.da3src.src2_abs = src2.abs;
   insn->bits1.da3src.src2_negate = src2.negate;

   return insn;
}


/***********************************************************************
 * Convenience routines.
 */
#define ALU1(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0)           \
{                                                               \
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);             \
}

#define ALU2(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1)           \
{                                                               \
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);       \
}

#define ALU3(OP)                                                \
struct brw_instruction *brw_##OP(struct brw_compile *p,         \
                                 struct brw_reg dest,           \
                                 struct brw_reg src0,           \
                                 struct brw_reg src1,           \
                                 struct brw_reg src2)           \
{                                                               \
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2); \
}

/* Rounding operations (other than RNDD) require two instructions - the first
 * stores a rounded value (possibly the wrong way) in the dest register, but
 * also sets a per-channel "increment bit" in the flag register.  A predicated
 * add of 1.0 fixes dest to contain the desired result.
 *
 * Sandybridge and later appear to round correctly without an ADD.
 */
#define ROUND(OP)                                               \
void brw_##OP(struct brw_compile *p,                            \
              struct brw_reg dest,                              \
              struct brw_reg src)                               \
{                                                               \
   struct brw_instruction *rnd, *add;                           \
   rnd = next_insn(p, BRW_OPCODE_##OP);                         \
   brw_set_dest(p, rnd, dest);                                  \
   brw_set_src0(p, rnd, src);                                   \
                                                                \
   if (p->brw->intel.gen < 6) {                                 \
      /* turn on round-increments */                            \
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;  \
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));            \
      add->header.predicate_control = BRW_PREDICATE_NORMAL;     \
   }                                                            \
}
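
/* So brw_RNDZ(p, dest, src), for instance, expands to a single RNDZ on
 * gen6 and later, and to RNDZ with round-increments enabled plus a
 * predicated ADD of 1.0 on older generations.
 */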


ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

ROUND(RNDZ)
ROUND(RNDE)


struct brw_instruction *brw_ADD(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.2.2: add */
   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
}

struct brw_instruction *brw_AVG(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   assert(dest.type == src0.type);
   assert(src0.type == src1.type);
   switch (src0.type) {
   case BRW_REGISTER_TYPE_B:
   case BRW_REGISTER_TYPE_UB:
   case BRW_REGISTER_TYPE_W:
   case BRW_REGISTER_TYPE_UW:
   case BRW_REGISTER_TYPE_D:
   case BRW_REGISTER_TYPE_UD:
      break;
   default:
      assert(!"Bad type for brw_AVG");
   }

   return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
}

struct brw_instruction *brw_MUL(struct brw_compile *p,
                                struct brw_reg dest,
                                struct brw_reg src0,
                                struct brw_reg src1)
{
   /* 6.32.38: mul */
   if (src0.type == BRW_REGISTER_TYPE_D ||
       src0.type == BRW_REGISTER_TYPE_UD ||
       src1.type == BRW_REGISTER_TYPE_D ||
       src1.type == BRW_REGISTER_TYPE_UD) {
      assert(dest.type != BRW_REGISTER_TYPE_F);
   }

   if (src0.type == BRW_REGISTER_TYPE_F ||
       (src0.file == BRW_IMMEDIATE_VALUE &&
        src0.type == BRW_REGISTER_TYPE_VF)) {
      assert(src1.type != BRW_REGISTER_TYPE_UD);
      assert(src1.type != BRW_REGISTER_TYPE_D);
   }

   if (src1.type == BRW_REGISTER_TYPE_F ||
       (src1.file == BRW_IMMEDIATE_VALUE &&
        src1.type == BRW_REGISTER_TYPE_VF)) {
      assert(src0.type != BRW_REGISTER_TYPE_UD);
      assert(src0.type != BRW_REGISTER_TYPE_D);
   }

   assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src0.nr != BRW_ARF_ACCUMULATOR);
   assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
          src1.nr != BRW_ARF_ACCUMULATOR);

   return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
}


void brw_NOP(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
   brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
   brw_set_src1(p, insn, brw_imm_ud(0x0));
}


/***********************************************************************
 * Comparisons, if/else/endif
 */

struct brw_instruction *brw_JMPI(struct brw_compile *p,
                                 struct brw_reg dest,
                                 struct brw_reg src0,
                                 struct brw_reg src1)
{
   struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);

   insn->header.execution_size = 1;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_DISABLE;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   return insn;
}
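
/* A typical forward jump records the JMPI's index and patches it once
 * the landing point is known (a sketch):
 *
 *    int jmp = p->nr_insn;
 *    brw_JMPI(p, brw_ip_reg(), brw_ip_reg(), brw_imm_d(0));
 *    ... emit the instructions to be skipped ...
 *    brw_land_fwd_jump(p, jmp);
 */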

static void
push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   p->if_stack[p->if_stack_depth] = inst - p->store;

   p->if_stack_depth++;
   if (p->if_stack_array_size <= p->if_stack_depth) {
      p->if_stack_array_size *= 2;
      p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
                             p->if_stack_array_size);
   }
}

static struct brw_instruction *
pop_if_stack(struct brw_compile *p)
{
   p->if_stack_depth--;
   return &p->store[p->if_stack[p->if_stack_depth]];
}

static void
push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
{
   /* Both loop_stack[loop_stack_depth] and if_depth_in_loop[loop_stack_depth
    * + 1] are written below, so grow the arrays while either index could
    * land outside them.
    */
   if (p->loop_stack_array_size <= p->loop_stack_depth + 1) {
      p->loop_stack_array_size *= 2;
      p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
                               p->loop_stack_array_size);
      p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
                                     p->loop_stack_array_size);
   }

   p->loop_stack[p->loop_stack_depth] = inst - p->store;
   p->loop_stack_depth++;
   p->if_depth_in_loop[p->loop_stack_depth] = 0;
}

static struct brw_instruction *
get_inner_do_insn(struct brw_compile *p)
{
   return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
}

/* The EU takes the value from the flag register and pushes it onto some
 * sort of a stack (presumably merging with any flag value already on
 * the stack).  Within an if block, the flags at the top of the stack
 * control execution on each channel of the unit, e.g. on each of the
 * 16 pixel values in our wm programs.
 *
 * When the matching 'else' instruction is reached (presumably by
 * countdown of the instruction count patched in by our ELSE/ENDIF
 * functions), the relevant flags are inverted.
 *
 * When the matching 'endif' instruction is reached, the flags are
 * popped off.  If the stack is now empty, normal execution resumes.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction:
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
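
/* A typical (non-SPF) if/else block emitted by the compiler backends
 * looks roughly like this sketch:
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_NZ, x, y);
 *    brw_IF(p, BRW_EXECUTE_8);
 *       ... "then" instructions ...
 *    brw_ELSE(p);
 *       ... "else" instructions ...
 *    brw_ENDIF(p);
 *
 * brw_CMP() with a null destination leaves predication enabled (see
 * brw_CMP below), and brw_ENDIF() patches the jump targets afterwards.
 */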

/* This function is only used for gen6-style IF instructions with an
 * embedded comparison (conditional modifier).  It is not used on gen7.
 */
struct brw_instruction *
gen6_IF(struct brw_compile *p, uint32_t conditional,
        struct brw_reg src0, struct brw_reg src1)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   brw_set_dest(p, insn, brw_imm_w(0));
   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   insn->bits1.branch_gen6.jump_count = 0;
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

   assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
   assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
   insn->header.destreg__conditionalmod = conditional;

   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
   return insn;
}

/**
 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}

/**
 * Patch IF and ELSE instructions with appropriate jump targets.
 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
         /* Turn it into an IFF, which means no mask stack operations for
          * all-false and jumping past the ENDIF.
          */
         if_inst->header.opcode = BRW_OPCODE_IFF;
         if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* As of gen6, there is no IFF and IF must point to the ENDIF. */
         if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
         if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
         if_inst->bits3.if_else.pop_count = 0;
         if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
         /* BRW_OPCODE_ELSE pre-gen6 should point just past the
          * matching ENDIF.
          */
         else_inst->bits3.if_else.jump_count = br * (endif_inst - else_inst + 1);
         else_inst->bits3.if_else.pop_count = 1;
         else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
         /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
         else_inst->bits1.branch_gen6.jump_count = br * (endif_inst - else_inst);
      } else {
         /* The IF instruction's JIP should point just past the ELSE */
         if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
         /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
         if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
         else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}

void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}

void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions.").  And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of the instruction
    * store (p->store), so call it first, before any pointers into the
    * store are computed from an index.
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}

struct brw_instruction *brw_BREAK(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_BREAK);
   if (intel->gen >= 6) {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
      insn->bits3.if_else.pad0 = 0;
      insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;

   return insn;
}

struct brw_instruction *gen6_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   return insn;
}

struct brw_instruction *brw_CONT(struct brw_compile *p)
{
   struct brw_instruction *insn;
   insn = next_insn(p, BRW_OPCODE_CONTINUE);
   brw_set_dest(p, insn, brw_ip_reg());
   brw_set_src0(p, insn, brw_ip_reg());
   brw_set_src1(p, insn, brw_imm_d(0x0));
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.execution_size = BRW_EXECUTE_8;
   /* insn->header.mask_control = BRW_MASK_DISABLE; */
   insn->bits3.if_else.pad0 = 0;
   insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
   return insn;
}

struct brw_instruction *gen6_HALT(struct brw_compile *p)
{
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_HALT);
   brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */

   if (p->compressed) {
      insn->header.execution_size = BRW_EXECUTE_16;
   } else {
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = BRW_EXECUTE_8;
   }
   return insn;
}

/* DO/WHILE loop:
 *
 * The DO/WHILE is just an unterminated loop -- break or continue are
 * used for control within the loop.  We have a few ways they can be
 * done.
 *
 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
 * jip and no DO instruction.
 *
 * For non-uniform control flow pre-gen6, there's a DO instruction to
 * push the mask, and a WHILE to jump back, and BREAK to get out and
 * pop the mask.
 *
 * For gen6, there's no more mask stack, so no need for DO.  WHILE
 * just points back to the first instruction of the loop.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
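
/* Loops follow the same emission pattern (a sketch):
 *
 *    brw_DO(p, BRW_EXECUTE_8);
 *       ... body, possibly containing brw_BREAK(p)/brw_CONT(p) ...
 *    brw_WHILE(p);
 *
 * On pre-gen6, brw_WHILE() fixes up any BREAK/CONT jump counts via
 * brw_patch_break_cont() below.
 */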

/**
 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
 * instruction here.
 *
 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
 * nesting, since it can always just point to the end of the block/current loop.
 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   int br = (intel->gen == 5) ? 2 : 1;

   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
          inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
                 inst->bits3.if_else.jump_count == 0) {
         inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}

struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
         insn = next_insn(p, BRW_OPCODE_ADD);
         do_insn = get_inner_do_insn(p);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
         insn->header.execution_size = BRW_EXECUTE_1;
      } else {
         insn = next_insn(p, BRW_OPCODE_WHILE);
         do_insn = get_inner_do_insn(p);

         assert(do_insn->header.opcode == BRW_OPCODE_DO);

         brw_set_dest(p, insn, brw_ip_reg());
         brw_set_src0(p, insn, brw_ip_reg());
         brw_set_src1(p, insn, brw_imm_d(0));

         insn->header.execution_size = do_insn->header.execution_size;
         insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
         insn->bits3.if_else.pop_count = 0;
         insn->bits3.if_else.pad0 = 0;

         brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}


/* FORWARD JUMPS:
 */
void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
   GLuint jmpi = 1;

   if (intel->gen >= 5)
      jmpi = 2;

   assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
   assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);

   jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
}
1636
1637
1638
1639/* To integrate with the above, it makes sense that the comparison
1640 * instruction should populate the flag register. It might be simpler
1641 * just to use the flag reg for most WM tasks?
1642 */
void brw_CMP(struct brw_compile *p,
             struct brw_reg dest,
             GLuint conditional,
             struct brw_reg src0,
             struct brw_reg src1)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);

   insn->header.destreg__conditionalmod = conditional;
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);

/* guess_execution_size(p, insn, src0); */

   /* Make it so that future instructions will use the computed flag
    * value until brw_set_predicate_control_flag_value() is called
    * again.
    */
   if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.nr == 0) {
      p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
      p->flag_value = 0xff;
   }
}
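
/* Example (hypothetical registers): writing the CMP result to the null
 * register enables predication for what follows, per the code above.
 *
 *    brw_CMP(p, brw_null_reg(), BRW_CONDITIONAL_GE, src_a, src_b);
 *    brw_MOV(p, dst, src_a);    (executes only where src_a >= src_b)
 */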

/* Issue a 'wait' instruction on notification register n1; the host can
 * program MMIO to wake the thread back up.
 */
void brw_WAIT(struct brw_compile *p)
{
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
   struct brw_reg src = brw_notification_1_reg();

   brw_set_dest(p, insn, src);
   brw_set_src0(p, insn, src);
   brw_set_src1(p, insn, brw_null_reg());
   insn->header.execution_size = 0; /* must */
   insn->header.predicate_control = 0;
   insn->header.compression_control = 0;
}


/***********************************************************************
 * Helpers for the various SEND message types:
 */

/** Extended math function, float[8].
 */
void brw_math(struct brw_compile *p,
              struct brw_reg dest,
              GLuint function,
              GLuint msg_reg_nr,
              struct brw_reg src,
              GLuint data_type,
              GLuint precision)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
         assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
         assert(!src.negate);
         assert(!src.abs);
      }

      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
          function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
          function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
         assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
         assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
                           insn,
                           function,
                           src.type == BRW_REGISTER_TYPE_D,
                           precision,
                           data_type);
   }
}
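
/* Usage sketch (hypothetical registers): a full-precision reciprocal over
 * one GRF of floats.  On gen6+ this becomes a native MATH instruction;
 * earlier parts go through the extended-math SEND path, with msg_reg_nr
 * selecting the implied message register.
 *
 *    brw_math(p, dst, BRW_MATH_FUNCTION_INV, 2, src,
 *             BRW_MATH_DATA_VECTOR, BRW_MATH_PRECISION_FULL);
 */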

/** Extended math function, float[8].
 */
void brw_math2(struct brw_compile *p,
               struct brw_reg dest,
               GLuint function,
               struct brw_reg src0,
               struct brw_reg src1)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

   assert(intel->gen >= 6);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src1.file == BRW_GENERAL_REGISTER_FILE);

   assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
   if (intel->gen == 6) {
      assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
      assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
   }

   if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
       function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
       function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
      assert(src0.type != BRW_REGISTER_TYPE_F);
      assert(src1.type != BRW_REGISTER_TYPE_F);
   } else {
      assert(src0.type == BRW_REGISTER_TYPE_F);
      assert(src1.type == BRW_REGISTER_TYPE_F);
   }

   /* Source modifiers are ignored for extended math instructions on Gen6. */
   if (intel->gen == 6) {
      assert(!src0.negate);
      assert(!src0.abs);
      assert(!src1.negate);
      assert(!src1.abs);
   }

   /* Math is the same ISA format as other opcodes, except that CondModifier
    * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
    */
   insn->header.destreg__conditionalmod = function;

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, src1);
}
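
/* Usage sketch (hypothetical registers): two-source math is gen6+ only,
 * e.g. a POW computed directly with no message registers involved.
 *
 *    brw_math2(p, dst, BRW_MATH_FUNCTION_POW, base, exponent);
 */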


/**
 * Write a block of OWORDs (half a GRF each) to the scratch buffer,
 * using a constant offset per channel.
 *
 * The offset must be aligned to oword size (16 bytes).  Used for
 * register spilling.
 */
void brw_oword_block_write_scratch(struct brw_compile *p,
                                   struct brw_reg mrf,
                                   int num_regs,
                                   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header.  This is g0, with g0.2 filled with
    * the offset.  We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
                                         BRW_REGISTER_TYPE_UW);

      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
         insn->header.compression_control = BRW_COMPRESSION_NONE;
         src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection.  Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
         dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
         send_commit_msg = 0;
      } else {
         dest = src_header;
         send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
         msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
         msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
                               insn,
                               255, /* binding table index (255=stateless) */
                               msg_control,
                               msg_type,
                               mlen,
                               true, /* header_present */
                               0, /* not a render target */
                               send_commit_msg, /* response_length */
                               0, /* eot */
                               send_commit_msg);
   }
}
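
/* Spill sketch (hypothetical MRF and offset): save two GRFs to the start
 * of the scratch space.  The byte offset must be 16-byte aligned, and
 * num_regs of 1 or 2 selects the 2- or 4-oword block message.
 *
 *    brw_oword_block_write_scratch(p, brw_message_reg(1), 2, 0);
 */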


/**
 * Read a block of owords (half a GRF each) from the scratch buffer
 * using a constant index per channel.
 *
 * Offset must be aligned to oword size (16 bytes).  Used for register
 * spilling.
 */
void
brw_oword_block_read_scratch(struct brw_compile *p,
                             struct brw_reg dest,
                             struct brw_reg mrf,
                             int num_regs,
                             GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control;
   int rlen;

   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
   dest = retype(dest, BRW_REGISTER_TYPE_UW);

   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      rlen = 1;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      rlen = 2;
   }

   {
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_set_mask_control(p, BRW_MASK_DISABLE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
              retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                  mrf.nr,
                                  2), BRW_REGISTER_TYPE_UD),
              brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      assert(insn->header.predicate_control == 0);
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.destreg__conditionalmod = mrf.nr;

      brw_set_dest(p, insn, dest); /* UW? */
      if (intel->gen >= 6) {
         brw_set_src0(p, insn, mrf);
      } else {
         brw_set_src0(p, insn, brw_null_reg());
      }

      brw_set_dp_read_message(p,
                              insn,
                              255, /* binding table index (255=stateless) */
                              msg_control,
                              BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
                              BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
                              1, /* msg_length */
                              true, /* header_present */
                              rlen);
   }
}
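
/* Matching unspill sketch (hypothetical registers and offset): read the
 * same two GRFs back from scratch into g4/g5.
 *
 *    brw_oword_block_read_scratch(p, brw_vec8_grf(4, 0),
 *                                 brw_message_reg(1), 2, 0);
 */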

/**
 * Read a float[4] vector from the data port Data Cache (const buffer).
 * Location (in buffer) should be a multiple of 16.
 * Used for fetching shader constants.
 */
void brw_oword_block_read(struct brw_compile *p,
                          struct brw_reg dest,
                          struct brw_reg mrf,
                          uint32_t offset,
                          uint32_t bind_table_index)
{
   struct intel_context *intel = &p->brw->intel;

   /* On newer hardware, offset is in units of owords. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   brw_push_insn_state(p);
   brw_set_predicate_control(p, BRW_PREDICATE_NONE);
   brw_set_compression_control(p, BRW_COMPRESSION_NONE);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

   /* set message header global offset field (reg 0, element 2) */
   brw_MOV(p,
           retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                               mrf.nr,
                               2), BRW_REGISTER_TYPE_UD),
           brw_imm_ud(offset));

   struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
   insn->header.destreg__conditionalmod = mrf.nr;

   /* cast dest to a uword[8] vector */
   dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);

   brw_set_dest(p, insn, dest);
   if (intel->gen >= 6) {
      brw_set_src0(p, insn, mrf);
   } else {
      brw_set_src0(p, insn, brw_null_reg());
   }

   brw_set_dp_read_message(p,
                           insn,
                           bind_table_index,
                           BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
                           BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
                           BRW_DATAPORT_READ_TARGET_DATA_CACHE,
                           1, /* msg_length */
                           true, /* header_present */
                           1); /* response_length (1 reg, 2 owords!) */

   brw_pop_insn_state(p);
}
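
/* Constant-fetch sketch (hypothetical GRF, offset, and binding-table
 * slot): pull one float[4] from a const buffer at byte offset 32.
 *
 *    brw_oword_block_read(p, brw_vec4_grf(6, 0), brw_message_reg(1),
 *                         32, bind_table_index);
 */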


void brw_fb_WRITE(struct brw_compile *p,
                  int dispatch_width,
                  GLuint msg_reg_nr,
                  struct brw_reg src0,
                  GLuint msg_control,
                  GLuint binding_table_index,
                  GLuint msg_length,
                  GLuint response_length,
                  bool eot,
                  bool header_present)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;
   GLuint msg_type;
   struct brw_reg dest;

   if (dispatch_width == 16)
      dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
   else
      dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);

   if (intel->gen >= 6) {
      insn = next_insn(p, BRW_OPCODE_SENDC);
   } else {
      insn = next_insn(p, BRW_OPCODE_SEND);
   }
   /* The execution mask is ignored for render target writes. */
   insn->header.predicate_control = 0;
   insn->header.compression_control = BRW_COMPRESSION_NONE;

   if (intel->gen >= 6) {
      /* headerless version, just submit color payload */
      src0 = brw_message_reg(msg_reg_nr);

      msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   } else {
      insn->header.destreg__conditionalmod = msg_reg_nr;

      msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
   }

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_dp_write_message(p,
                            insn,
                            binding_table_index,
                            msg_control,
                            msg_type,
                            msg_length,
                            header_present,
                            eot, /* last render target write */
                            response_length,
                            eot,
                            0 /* send_commit_msg */);
}
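
/* Render-target-write sketch (all counts hypothetical): a SIMD16 color
 * write with EOT from an 8-register payload starting at m2, targeting
 * binding-table entry 0, with no header.
 *
 *    brw_fb_WRITE(p, 16, 2, brw_message_reg(2),
 *                 BRW_DATAPORT_RENDER_TARGET_WRITE_SIMD16_SINGLE_SOURCE,
 *                 0, 8, 0, true, false);
 */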


/**
 * Texture sample instruction.
 * Note: the msg_type plus msg_length values determine exactly what kind
 * of sampling operation is performed.  See volume 4, page 161 of docs.
 */
void brw_SAMPLE(struct brw_compile *p,
                struct brw_reg dest,
                GLuint msg_reg_nr,
                struct brw_reg src0,
                GLuint binding_table_index,
                GLuint sampler,
                GLuint writemask,
                GLuint msg_type,
                GLuint response_length,
                GLuint msg_length,
                GLuint header_present,
                GLuint simd_mode,
                GLuint return_format)
{
   struct intel_context *intel = &p->brw->intel;
   bool need_stall = false;

   if (writemask == 0) {
      /* printf("%s: zero writemask??\n", __FUNCTION__); */
      return;
   }

   /* Hardware doesn't do destination dependency checking on send
    * instructions properly.  Add a workaround which generates the
    * dependency by other means.  In practice it seems like this bug
    * only crops up for texture samples, and only where registers are
    * written by the send and then written again later without being
    * read in between.  Luckily for us, we already track that
    * information and use it to modify the writemask for the
    * instruction, so that is a guide for whether a workaround is
    * needed.
    */
   if (writemask != BRW_WRITEMASK_XYZW) {
      GLuint dst_offset = 0;
      GLuint i, newmask = 0, len = 0;

      for (i = 0; i < 4; i++) {
         if (writemask & (1<<i))
            break;
         dst_offset += 2;
      }
      for (; i < 4; i++) {
         if (!(writemask & (1<<i)))
            break;
         newmask |= 1<<i;
         len++;
      }

      if (newmask != writemask) {
         need_stall = true;
         /* printf("need stall %x %x\n", newmask, writemask); */
      }
      else {
         bool dispatch_16 = false;

         struct brw_reg m1 = brw_message_reg(msg_reg_nr);

         guess_execution_size(p, p->current, dest);
         if (p->current->header.execution_size == BRW_EXECUTE_16)
            dispatch_16 = true;

         newmask = ~newmask & BRW_WRITEMASK_XYZW;

         brw_push_insn_state(p);

         brw_set_compression_control(p, BRW_COMPRESSION_NONE);
         brw_set_mask_control(p, BRW_MASK_DISABLE);

         brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
                 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
         brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));

         brw_pop_insn_state(p);

         src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
         dest = offset(dest, dst_offset);

         /* For 16-wide dispatch, masked channels are skipped in the
          * response.  For 8-wide, masked channels still take up slots,
          * and are just not written to.
          */
         if (dispatch_16)
            response_length = len * 2;
      }
   }

   {
      struct brw_instruction *insn;

      gen6_resolve_implied_move(p, &src0, msg_reg_nr);

      insn = next_insn(p, BRW_OPCODE_SEND);
      insn->header.predicate_control = 0; /* XXX */
      insn->header.compression_control = BRW_COMPRESSION_NONE;
      if (intel->gen < 6)
         insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src0);
      brw_set_sampler_message(p, insn,
                              binding_table_index,
                              sampler,
                              msg_type,
                              response_length,
                              msg_length,
                              header_present,
                              simd_mode,
                              return_format);
   }

   if (need_stall) {
      struct brw_reg reg = vec8(offset(dest, response_length-1));

      /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
       */
      brw_push_insn_state(p);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
              retype(reg, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
}
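
/* Sampler sketch (hypothetical surface, sampler, and payload numbers): a
 * SIMD8 4-channel sample into four destination registers, from surface 0
 * with sampler 0, using a 3-register payload with a header.
 *
 *    brw_SAMPLE(p, retype(brw_vec8_grf(14, 0), BRW_REGISTER_TYPE_UW),
 *               2, brw_message_reg(2), 0, 0, BRW_WRITEMASK_XYZW,
 *               BRW_SAMPLER_MESSAGE_SIMD8_SAMPLE, 4, 3, 1,
 *               BRW_SAMPLER_SIMD_MODE_SIMD8,
 *               BRW_SAMPLER_RETURN_FORMAT_FLOAT32);
 */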

/* All these variables are pretty confusing - we might be better off
 * using bitmasks and macros for this, in the old style.  Or perhaps
 * just having the caller instantiate the fields in dword3 itself.
 */
void brw_urb_WRITE(struct brw_compile *p,
                   struct brw_reg dest,
                   GLuint msg_reg_nr,
                   struct brw_reg src0,
                   bool allocate,
                   bool used,
                   GLuint msg_length,
                   GLuint response_length,
                   bool eot,
                   bool writes_complete,
                   GLuint offset,
                   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
                       BRW_REGISTER_TYPE_UD),
             retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
             brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
                       insn,
                       allocate,
                       used,
                       msg_length,
                       response_length,
                       eot,
                       writes_complete,
                       offset,
                       swizzle);
}
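
/* URB write sketch (hypothetical counts): emit four registers of vertex
 * data starting at m0 and terminate the thread.
 *
 *    brw_urb_WRITE(p, brw_null_reg(), 0, src, false, true,
 *                  4, 0, true, true, 0, BRW_URB_SWIZZLE_NONE);
 */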
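
/* Step over a single instruction, accounting for compaction: compacted
 * instructions occupy 8 bytes in the store, native ones 16.
 */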
static int
next_ip(struct brw_compile *p, int ip)
{
   struct brw_instruction *insn = (void *)p->store + ip;

   if (insn->header.cmpt_control)
      return ip + 8;
   else
      return ip + 16;
}

static int
brw_find_next_block_end(struct brw_compile *p, int start)
{
   int ip;
   void *store = p->store;

   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      switch (insn->header.opcode) {
      case BRW_OPCODE_ENDIF:
      case BRW_OPCODE_ELSE:
      case BRW_OPCODE_WHILE:
      case BRW_OPCODE_HALT:
         return ip;
      }
   }

   return 0;
}

/* There is no DO instruction on gen6, so to find the end of the loop
 * we have to see if the loop is jumping back before our start
 * instruction.
 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
         int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
                                   : insn->bits3.break_cont.jip;
         if (ip + jip * scale <= start)
            return ip;
      }
   }
   assert(!"not reached");
   return start;
}
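
/* Worked example (made-up offsets): a gen7 WHILE at byte offset 96 with
 * jip == -9 jumps back to 96 + (-9 * 8) == 24, so any loop-start at or
 * after 24 satisfies ip + jip * scale <= start and ends the search.
 */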

/* After program generation, go back and update the UIP and JIP of
 * BREAK, CONT, and HALT instructions to their correct locations.
 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;
   void *store = p->store;

   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
         /* Fixups for compacted BREAK/CONTINUE not supported yet. */
         assert(insn->header.opcode != BRW_OPCODE_BREAK &&
                insn->header.opcode != BRW_OPCODE_CONTINUE &&
                insn->header.opcode != BRW_OPCODE_HALT);
         continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         /* Gen7 UIP points to WHILE; Gen6 points just after it */
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip +
             (intel->gen == 6 ? 16 : 0)) / scale;
         break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
         insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;

      case BRW_OPCODE_ENDIF:
         if (block_end_ip == 0)
            insn->bits3.break_cont.jip = 2;
         else
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         break;

      case BRW_OPCODE_HALT:
         /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
          *
          *    "In case of the halt instruction not inside any conditional
          *     code block, the value of <JIP> and <UIP> should be the
          *     same.  In case of the halt instruction inside conditional code
          *     block, the <UIP> should be the end of the program, and the
          *     <JIP> should be end of the most inner conditional code block."
          *
          * The uip will have already been set by whoever set up the
          * instruction.
          */
         if (block_end_ip == 0) {
            insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
         } else {
            insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         }
         assert(insn->bits3.break_cont.uip != 0);
         assert(insn->bits3.break_cont.jip != 0);
         break;
      }
   }
}
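
/* Worked example (made-up offsets): a BREAK at byte offset 48 whose
 * enclosing block ends at 112 gets jip = (112 - 48) / 8 = 8.  If the
 * loop's WHILE sits at 160, gen7 sets uip = (160 - 48) / 8 = 14, while
 * gen6 points just past the WHILE: (160 + 16 - 48) / 8 = 16.
 */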

void brw_ff_sync(struct brw_compile *p,
                 struct brw_reg dest,
                 GLuint msg_reg_nr,
                 struct brw_reg src0,
                 bool allocate,
                 GLuint response_length,
                 bool eot)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_ff_sync_message(p,
                           insn,
                           allocate,
                           response_length,
                           eot);
}

/**
 * Emit the SEND instruction necessary to generate stream output data on Gen6
 * (for transform feedback).
 *
 * If send_commit_msg is true, this is the last piece of stream output data
 * from this thread, so send the data as a committed write.  According to the
 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
 *
 *    "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
 *     writes are complete by sending the final write as a committed write."
 */
void
brw_svb_write(struct brw_compile *p,
              struct brw_reg dest,
              GLuint msg_reg_nr,
              struct brw_reg src0,
              GLuint binding_table_index,
              bool send_commit_msg)
{
   struct brw_instruction *insn;

   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   insn = next_insn(p, BRW_OPCODE_SEND);
   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));
   brw_set_dp_write_message(p, insn,
                            binding_table_index,
                            0, /* msg_control: ignored */
                            GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
                            1, /* msg_length */
                            true, /* header_present */
                            0, /* last_render_target: ignored */
                            send_commit_msg, /* response_length */
                            0, /* end_of_thread */
                            send_commit_msg); /* send_commit_msg */
}
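
/* Stream-output sketch (hypothetical destination, MRF, and surface
 * index): write one vertex's worth of data, flagging the final write of
 * the thread as committed.
 *
 *    brw_svb_write(p, dest, 1, brw_message_reg(1), surf_index,
 *                  is_final_write);
 */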

/**
 * This instruction is generated as a single-channel align1 instruction by
 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
 *
 * We can't use the typed atomic op in the FS because that has the execution
 * mask ANDed with the pixel mask, but we just want to write the one dword
 * for all the pixels.
 *
 * We don't use the SIMD4x2 atomic ops in the VS because we want to just
 * write one u32.  So we use the same untyped atomic write message as the
 * pixel shader.
 *
 * The untyped atomic operation requires a BUFFER surface type with RAW
 * format, and is only accessible through the legacy DATA_CACHE dataport
 * messages.
 */
void brw_shader_time_add(struct brw_compile *p,
                         int base_mrf,
                         uint32_t surf_index)
{
   struct intel_context *intel = &p->brw->intel;
   assert(intel->gen >= 7);

   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                      base_mrf, 0));

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, eot);

   send->bits3.ud |= 6 << 14; /* untyped atomic op */
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8;
   send->bits3.ud |= surf_index << 0;
2549}