blob: ae570c7fe02bcd63cfa2e031ca0a03e1e542f23e [file] [log] [blame]
Damien Lespiau042e9352013-01-19 23:27:46 +00001/*
2 Copyright (C) Intel Corp. 2006. All Rights Reserved.
3 Intel funded Tungsten Graphics (http://www.tungstengraphics.com) to
4 develop this 3D driver.
5
6 Permission is hereby granted, free of charge, to any person obtaining
7 a copy of this software and associated documentation files (the
8 "Software"), to deal in the Software without restriction, including
9 without limitation the rights to use, copy, modify, merge, publish,
10 distribute, sublicense, and/or sell copies of the Software, and to
11 permit persons to whom the Software is furnished to do so, subject to
12 the following conditions:
13
14 The above copyright notice and this permission notice (including the
15 next paragraph) shall be included in all copies or substantial
16 portions of the Software.
17
18 THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
19 EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
20 MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
21 IN NO EVENT SHALL THE COPYRIGHT OWNER(S) AND/OR ITS SUPPLIERS BE
22 LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
23 OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
24 WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
25
26 **********************************************************************/
27 /*
28 * Authors:
29 * Keith Whitwell <keith@tungstengraphics.com>
30 */
31
32#include <string.h>
33
34#include "brw_context.h"
35#include "brw_defines.h"
36#include "brw_eu.h"
37
38#include "ralloc.h"
39
40/***********************************************************************
41 * Internal helper for constructing instructions
42 */
43
44static void guess_execution_size(struct brw_compile *p,
45 struct brw_instruction *insn,
46 struct brw_reg reg)
47{
48 if (reg.width == BRW_WIDTH_8 && p->compressed)
49 insn->header.execution_size = BRW_EXECUTE_16;
50 else
51 insn->header.execution_size = reg.width; /* note - definitions are compatible */
52}
53
54
/**
 * Prior to Sandybridge, the SEND instruction accepted non-MRF source
 * registers, implicitly moving the operand to a message register.
 *
 * On Sandybridge, this is no longer the case. This function performs the
 * explicit move; it should be called before emitting a SEND instruction.
 *
 * \param p           the compile context
 * \param src         in/out: intended SEND source; rewritten on return to
 *                    reference the message register the payload now lives in
 * \param msg_reg_nr  message register number to stage the payload in
 */
void
gen6_resolve_implied_move(struct brw_compile *p,
                          struct brw_reg *src,
                          GLuint msg_reg_nr)
{
   struct intel_context *intel = &p->brw->intel;
   /* Pre-gen6 hardware performs the implied move itself. */
   if (intel->gen < 6)
      return;

   /* Already a message register: nothing to move. */
   if (src->file == BRW_MESSAGE_REGISTER_FILE)
      return;

   if (src->file != BRW_ARCHITECTURE_REGISTER_FILE || src->nr != BRW_ARF_NULL) {
      /* Copy with mask disabled and compression off so the whole payload is
       * written regardless of the surrounding execution state.
       */
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);
      brw_MOV(p, retype(brw_message_reg(msg_reg_nr), BRW_REGISTER_TYPE_UD),
              retype(*src, BRW_REGISTER_TYPE_UD));
      brw_pop_insn_state(p);
   }
   /* Point the caller's source at the staged message register. */
   *src = brw_message_reg(msg_reg_nr);
}
84
85static void
86gen7_convert_mrf_to_grf(struct brw_compile *p, struct brw_reg *reg)
87{
88 /* From the BSpec / ISA Reference / send - [DevIVB+]:
89 * "The send with EOT should use register space R112-R127 for <src>. This is
90 * to enable loading of a new thread into the same slot while the message
91 * with EOT for current thread is pending dispatch."
92 *
93 * Since we're pretending to have 16 MRFs anyway, we may as well use the
94 * registers required for messages with EOT.
95 */
96 struct intel_context *intel = &p->brw->intel;
97 if (intel->gen == 7 && reg->file == BRW_MESSAGE_REGISTER_FILE) {
98 reg->file = BRW_GENERAL_REGISTER_FILE;
99 reg->nr += GEN7_MRF_HACK_START;
100 }
101}
102
103
/**
 * Encode \p dest as the destination operand of \p insn.
 *
 * Handles the four destination encodings (direct/indirect addressing
 * crossed with align1/align16 access mode) and finally derives the
 * instruction's execution size from the destination width.
 */
void
brw_set_dest(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg dest)
{
   /* ARF and MRF files use their own numbering; only check the 128-register
    * bound for the other files.
    */
   if (dest.file != BRW_ARCHITECTURE_REGISTER_FILE &&
       dest.file != BRW_MESSAGE_REGISTER_FILE)
      assert(dest.nr < 128);

   /* gen7 emulates MRFs with the top of the GRF file. */
   gen7_convert_mrf_to_grf(p, &dest);

   insn->bits1.da1.dest_reg_file = dest.file;
   insn->bits1.da1.dest_reg_type = dest.type;
   insn->bits1.da1.dest_address_mode = dest.address_mode;

   if (dest.address_mode == BRW_ADDRESS_DIRECT) {
      insn->bits1.da1.dest_reg_nr = dest.nr;

      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.da1.dest_subreg_nr = dest.subnr;
         /* Stride 0 is not a valid destination stride; promote to 1. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.da1.dest_horiz_stride = dest.hstride;
      }
      else {
         /* Align16: subreg counts 16-byte units; writes are channel-masked. */
         insn->bits1.da16.dest_subreg_nr = dest.subnr / 16;
         insn->bits1.da16.dest_writemask = dest.dw1.bits.writemask;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.da16.dest_horiz_stride = 1;
      }
   }
   else {
      insn->bits1.ia1.dest_subreg_nr = dest.subnr;

      /* These are different sizes in align1 vs align16:
       */
      if (insn->header.access_mode == BRW_ALIGN_1) {
         insn->bits1.ia1.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* Same stride-0 promotion as the direct align1 path. */
         if (dest.hstride == BRW_HORIZONTAL_STRIDE_0)
            dest.hstride = BRW_HORIZONTAL_STRIDE_1;
         insn->bits1.ia1.dest_horiz_stride = dest.hstride;
      }
      else {
         insn->bits1.ia16.dest_indirect_offset = dest.dw1.bits.indirect_offset;
         /* even ignored in da16, still need to set as '01' */
         insn->bits1.ia16.dest_horiz_stride = 1;
      }
   }

   /* NEW: Set the execution size based on dest.width and
    * insn->compression_control:
    */
   guess_execution_size(p, insn, dest);
}
157
158extern int reg_type_size[];
159
160static void
161validate_reg(struct brw_instruction *insn, struct brw_reg reg)
162{
163 int hstride_for_reg[] = {0, 1, 2, 4};
164 int vstride_for_reg[] = {0, 1, 2, 4, 8, 16, 32, 64, 128, 256};
165 int width_for_reg[] = {1, 2, 4, 8, 16};
Damien Lespiau6e83eb62013-01-25 15:12:12 +0000166 int execsize_for_reg[] = {1, 2, 4, 8, 16, 32};
Damien Lespiau042e9352013-01-19 23:27:46 +0000167 int width, hstride, vstride, execsize;
168
169 if (reg.file == BRW_IMMEDIATE_VALUE) {
170 /* 3.3.6: Region Parameters. Restriction: Immediate vectors
171 * mean the destination has to be 128-bit aligned and the
172 * destination horiz stride has to be a word.
173 */
174 if (reg.type == BRW_REGISTER_TYPE_V) {
175 assert(hstride_for_reg[insn->bits1.da1.dest_horiz_stride] *
176 reg_type_size[insn->bits1.da1.dest_reg_type] == 2);
177 }
178
179 return;
180 }
181
182 if (reg.file == BRW_ARCHITECTURE_REGISTER_FILE &&
183 reg.file == BRW_ARF_NULL)
184 return;
185
186 assert(reg.hstride >= 0 && reg.hstride < Elements(hstride_for_reg));
187 hstride = hstride_for_reg[reg.hstride];
188
189 if (reg.vstride == 0xf) {
190 vstride = -1;
191 } else {
192 assert(reg.vstride >= 0 && reg.vstride < Elements(vstride_for_reg));
193 vstride = vstride_for_reg[reg.vstride];
194 }
195
196 assert(reg.width >= 0 && reg.width < Elements(width_for_reg));
197 width = width_for_reg[reg.width];
198
199 assert(insn->header.execution_size >= 0 &&
200 insn->header.execution_size < Elements(execsize_for_reg));
201 execsize = execsize_for_reg[insn->header.execution_size];
202
203 /* Restrictions from 3.3.10: Register Region Restrictions. */
204 /* 3. */
205 assert(execsize >= width);
206
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000207 /* FIXME: the assembler has a lot of code written that triggers the
208 * assertions commented it below. Let's paper over it (for now!) until we
209 * can re-validate the shaders with those little inconsistencies fixed. */
210
Damien Lespiau042e9352013-01-19 23:27:46 +0000211 /* 4. */
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000212#if 0
Damien Lespiau042e9352013-01-19 23:27:46 +0000213 if (execsize == width && hstride != 0) {
214 assert(vstride == -1 || vstride == width * hstride);
215 }
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000216#endif
Damien Lespiau042e9352013-01-19 23:27:46 +0000217
218 /* 5. */
219 if (execsize == width && hstride == 0) {
220 /* no restriction on vstride. */
221 }
222
223 /* 6. */
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000224#if 0
Damien Lespiau042e9352013-01-19 23:27:46 +0000225 if (width == 1) {
226 assert(hstride == 0);
227 }
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000228#endif
Damien Lespiau042e9352013-01-19 23:27:46 +0000229
230 /* 7. */
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000231#if 0
Damien Lespiau042e9352013-01-19 23:27:46 +0000232 if (execsize == 1 && width == 1) {
233 assert(hstride == 0);
234 assert(vstride == 0);
235 }
Damien Lespiaue7cca1a2013-01-27 02:06:22 +0000236#endif
Damien Lespiau042e9352013-01-19 23:27:46 +0000237
238 /* 8. */
239 if (vstride == 0 && hstride == 0) {
240 assert(width == 1);
241 }
242
243 /* 10. Check destination issues. */
244}
245
/**
 * Encode \p reg as the first source operand of \p insn.
 *
 * Immediates are stored directly in bits3; register operands are encoded
 * per addressing mode (direct/indirect) and access mode (align1/align16).
 */
void
brw_set_src0(struct brw_compile *p, struct brw_instruction *insn,
             struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   /* ARFs use their own numbering; bound-check everything else. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   if (intel->gen >= 6 && (insn->header.opcode == BRW_OPCODE_SEND ||
                           insn->header.opcode == BRW_OPCODE_SENDC)) {
      /* Any source modifiers or regions will be ignored, since this just
       * identifies the MRF/GRF to start reading the message contents from.
       * Check for some likely failures.
       */
      assert(!reg.negate);
      assert(!reg.abs);
      assert(reg.address_mode == BRW_ADDRESS_DIRECT);
   }

   validate_reg(insn, reg);

   insn->bits1.da1.src0_reg_file = reg.file;
   insn->bits1.da1.src0_reg_type = reg.type;
   insn->bits2.da1.src0_abs = reg.abs;
   insn->bits2.da1.src0_negate = reg.negate;
   insn->bits2.da1.src0_address_mode = reg.address_mode;

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;

      /* Required to set some fields in src1 as well:
       */

      /* FIXME: This looks quite wrong, tempering with src1. I did not find
       * anything in the bspec that was hinting it woud be needed when setting
       * src0. before removing this one needs to run piglit.

      insn->bits1.da1.src1_reg_file = 0;
      insn->bits1.da1.src1_reg_type = reg.type;
      */
   }
   else
   {
      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.da1.src0_subreg_nr = reg.subnr;
            insn->bits2.da1.src0_reg_nr = reg.nr;
         }
         else {
            /* Align16: subreg counts 16-byte units. */
            insn->bits2.da16.src0_subreg_nr = reg.subnr / 16;
            insn->bits2.da16.src0_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits2.ia1.src0_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits2.ia1.src0_indirect_offset = reg.dw1.bits.indirect_offset;
         }
         else {
            /* NOTE(review): the indirect offset is stored into the
             * *subreg_nr* field here, while the src1 path stores it into
             * src1_indirect_offset (divided by 16).  Looks suspicious —
             * confirm against the bspec.
             */
            insn->bits2.ia16.src0_subreg_nr = reg.dw1.bits.indirect_offset;
         }
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {

         /* FIXME: While this is correct, if the assembler uses that code path
          * the opcode generated are different and thus needs a validation
          * pass.
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits2.da1.src0_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits2.da1.src0_width = BRW_WIDTH_1;
            insn->bits2.da1.src0_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else {
         */
         insn->bits2.da1.src0_horiz_stride = reg.hstride;
         insn->bits2.da1.src0_width = reg.width;
         insn->bits2.da1.src0_vert_stride = reg.vstride;
         /* } */
      }
      else {
         insn->bits2.da16.src0_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits2.da16.src0_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits2.da16.src0_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits2.da16.src0_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits2.da16.src0_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits2.da16.src0_vert_stride = reg.vstride;
      }
   }
}
348
349
/**
 * Encode \p reg as the second source operand of \p insn.
 *
 * src1 may be an immediate (the only operand slot that can be, in
 * two-argument instructions) but never a message register.
 */
void brw_set_src1(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg reg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   assert(reg.file != BRW_MESSAGE_REGISTER_FILE);

   /* ARFs use their own numbering; bound-check everything else. */
   if (reg.file != BRW_ARCHITECTURE_REGISTER_FILE)
      assert(reg.nr < 128);

   gen7_convert_mrf_to_grf(p, &reg);

   validate_reg(insn, reg);

   insn->bits1.da1.src1_reg_file = reg.file;
   insn->bits1.da1.src1_reg_type = reg.type;
   insn->bits3.da1.src1_abs = reg.abs;
   insn->bits3.da1.src1_negate = reg.negate;
   insn->bits3.da1.src1_address_mode = reg.address_mode;

   /* Only src1 can be immediate in two-argument instructions.
    */
   assert(insn->bits1.da1.src0_reg_file != BRW_IMMEDIATE_VALUE);

   if (reg.file == BRW_IMMEDIATE_VALUE) {
      insn->bits3.ud = reg.dw1.ud;
   }
   else {
      /* It's only BRW that does not support register-indirect addressing on
       * src1 */
      assert (intel->gen >= 4 || reg.address_mode == BRW_ADDRESS_DIRECT);

      if (reg.address_mode == BRW_ADDRESS_DIRECT) {
         if (insn->header.access_mode == BRW_ALIGN_1) {
            insn->bits3.da1.src1_subreg_nr = reg.subnr;
            insn->bits3.da1.src1_reg_nr = reg.nr;
         }
         else {
            /* Align16: subreg counts 16-byte units. */
            insn->bits3.da16.src1_subreg_nr = reg.subnr / 16;
            insn->bits3.da16.src1_reg_nr = reg.nr;
         }
      }
      else {
         insn->bits3.ia1.src1_subreg_nr = reg.subnr;

         if (insn->header.access_mode == BRW_ALIGN_1)
            insn->bits3.ia1.src1_indirect_offset = reg.dw1.bits.indirect_offset;
         else
            insn->bits3.ia16.src1_indirect_offset = reg.dw1.bits.indirect_offset / 16;
      }

      if (insn->header.access_mode == BRW_ALIGN_1) {
         /* FIXME: While this is correct, if the assembler uses that code path
          * the opcode generated are different and thus needs a validation
          * pass.
         if (reg.width == BRW_WIDTH_1 &&
             insn->header.execution_size == BRW_EXECUTE_1) {
            insn->bits3.da1.src1_horiz_stride = BRW_HORIZONTAL_STRIDE_0;
            insn->bits3.da1.src1_width = BRW_WIDTH_1;
            insn->bits3.da1.src1_vert_stride = BRW_VERTICAL_STRIDE_0;
         }
         else { */
         insn->bits3.da1.src1_horiz_stride = reg.hstride;
         insn->bits3.da1.src1_width = reg.width;
         insn->bits3.da1.src1_vert_stride = reg.vstride;
         /* } */
      }
      else {
         insn->bits3.da16.src1_swz_x = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_X);
         insn->bits3.da16.src1_swz_y = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Y);
         insn->bits3.da16.src1_swz_z = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_Z);
         insn->bits3.da16.src1_swz_w = BRW_GET_SWZ(reg.dw1.bits.swizzle, BRW_CHANNEL_W);

         /* This is an oddity of the fact we're using the same
          * descriptions for registers in align_16 as align_1:
          */
         if (reg.vstride == BRW_VERTICAL_STRIDE_8)
            insn->bits3.da16.src1_vert_stride = BRW_VERTICAL_STRIDE_4;
         else
            insn->bits3.da16.src1_vert_stride = reg.vstride;
      }
   }
}
435
/**
 * Set the Message Descriptor and Extended Message Descriptor fields
 * for SEND messages.
 *
 * \note This zeroes out the Function Control bits, so it must be called
 *       \b before filling out any message-specific data.  Callers can
 *       choose not to fill in irrelevant bits; they will be zero.
 */
static void
brw_set_message_descriptor(struct brw_compile *p,
                           struct brw_instruction *inst,
                           enum brw_message_target sfid,
                           unsigned msg_length,
                           unsigned response_length,
                           bool header_present,
                           bool end_of_thread)
{
   struct intel_context *intel = &p->brw->intel;

   /* Clear the descriptor dword (bits3) by encoding an immediate zero as
    * src1; message fields are then OR'd in below.
    */
   brw_set_src1(p, inst, brw_imm_d(0));

   if (intel->gen >= 5) {
      inst->bits3.generic_gen5.header_present = header_present;
      inst->bits3.generic_gen5.response_length = response_length;
      inst->bits3.generic_gen5.msg_length = msg_length;
      inst->bits3.generic_gen5.end_of_thread = end_of_thread;

      if (intel->gen >= 6) {
         /* On Gen6+ Message target/SFID goes in bits 27:24 of the header */
         inst->header.destreg__conditionalmod = sfid;
      } else {
         /* Set Extended Message Descriptor (ex_desc) */
         inst->bits2.send_gen5.sfid = sfid;
         inst->bits2.send_gen5.end_of_thread = end_of_thread;
      }
   } else {
      /* Pre-gen5 layout: target and lengths share the descriptor dword. */
      inst->bits3.generic.response_length = response_length;
      inst->bits3.generic.msg_length = msg_length;
      inst->bits3.generic.msg_target = sfid;
      inst->bits3.generic.end_of_thread = end_of_thread;
   }
}
478
479static void brw_set_math_message( struct brw_compile *p,
480 struct brw_instruction *insn,
481 GLuint function,
482 GLuint integer_type,
483 bool low_precision,
484 GLuint dataType )
485{
486 struct brw_context *brw = p->brw;
487 struct intel_context *intel = &brw->intel;
488 unsigned msg_length;
489 unsigned response_length;
490
491 /* Infer message length from the function */
492 switch (function) {
493 case BRW_MATH_FUNCTION_POW:
494 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT:
495 case BRW_MATH_FUNCTION_INT_DIV_REMAINDER:
496 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
497 msg_length = 2;
498 break;
499 default:
500 msg_length = 1;
501 break;
502 }
503
504 /* Infer response length from the function */
505 switch (function) {
506 case BRW_MATH_FUNCTION_SINCOS:
507 case BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER:
508 response_length = 2;
509 break;
510 default:
511 response_length = 1;
512 break;
513 }
514
515
516 brw_set_message_descriptor(p, insn, BRW_SFID_MATH,
517 msg_length, response_length, false, false);
518 if (intel->gen == 5) {
519 insn->bits3.math_gen5.function = function;
520 insn->bits3.math_gen5.int_type = integer_type;
521 insn->bits3.math_gen5.precision = low_precision;
522 insn->bits3.math_gen5.saturate = insn->header.saturate;
523 insn->bits3.math_gen5.data_type = dataType;
524 insn->bits3.math_gen5.snapshot = 0;
525 } else {
526 insn->bits3.math.function = function;
527 insn->bits3.math.int_type = integer_type;
528 insn->bits3.math.precision = low_precision;
529 insn->bits3.math.saturate = insn->header.saturate;
530 insn->bits3.math.data_type = dataType;
531 }
532 insn->header.saturate = 0;
533}
534
535
/**
 * Fill in the URB message descriptor for an FF_SYNC message.
 * Message length is always 1 and a header is always present; most
 * gen5 URB fields are unused by FF_SYNC and set to zero.
 */
static void brw_set_ff_sync_message(struct brw_compile *p,
                                    struct brw_instruction *insn,
                                    bool allocate,
                                    GLuint response_length,
                                    bool end_of_thread)
{
   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              1, response_length, true, end_of_thread);
   insn->bits3.urb_gen5.opcode = 1; /* FF_SYNC */
   insn->bits3.urb_gen5.offset = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.swizzle_control = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.allocate = allocate;
   insn->bits3.urb_gen5.used = 0; /* Not used by FF_SYNC */
   insn->bits3.urb_gen5.complete = 0; /* Not used by FF_SYNC */
}
551
/**
 * Fill in the descriptor for a URB write message, using the field layout
 * appropriate to the hardware generation (gen7 / gen5-6 / pre-gen5).
 */
static void brw_set_urb_message( struct brw_compile *p,
                                 struct brw_instruction *insn,
                                 bool allocate,
                                 bool used,
                                 GLuint msg_length,
                                 GLuint response_length,
                                 bool end_of_thread,
                                 bool complete,
                                 GLuint offset,
                                 GLuint swizzle_control )
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_URB,
                              msg_length, response_length, true, end_of_thread);
   if (intel->gen == 7) {
      insn->bits3.urb_gen7.opcode = 0;	/* URB_WRITE_HWORD */
      insn->bits3.urb_gen7.offset = offset;
      /* gen7 has no transpose swizzle mode. */
      assert(swizzle_control != BRW_URB_SWIZZLE_TRANSPOSE);
      insn->bits3.urb_gen7.swizzle_control = swizzle_control;
      /* per_slot_offset = 0 makes it ignore offsets in message header */
      insn->bits3.urb_gen7.per_slot_offset = 0;
      insn->bits3.urb_gen7.complete = complete;
   } else if (intel->gen >= 5) {
      insn->bits3.urb_gen5.opcode = 0;	/* URB_WRITE */
      insn->bits3.urb_gen5.offset = offset;
      insn->bits3.urb_gen5.swizzle_control = swizzle_control;
      insn->bits3.urb_gen5.allocate = allocate;
      insn->bits3.urb_gen5.used = used;	/* ? */
      insn->bits3.urb_gen5.complete = complete;
   } else {
      insn->bits3.urb.opcode = 0;	/* ? */
      insn->bits3.urb.offset = offset;
      insn->bits3.urb.swizzle_control = swizzle_control;
      insn->bits3.urb.allocate = allocate;
      insn->bits3.urb.used = used;	/* ? */
      insn->bits3.urb.complete = complete;
   }
}
592
/**
 * Fill in the descriptor for a data-port write message.
 *
 * Picks the shared function (SFID) appropriate to the generation — render
 * cache for render-target writes, data cache otherwise on gen7+ — then
 * writes the per-generation descriptor fields.
 */
void
brw_set_dp_write_message(struct brw_compile *p,
                         struct brw_instruction *insn,
                         GLuint binding_table_index,
                         GLuint msg_control,
                         GLuint msg_type,
                         GLuint msg_length,
                         bool header_present,
                         GLuint last_render_target,
                         GLuint response_length,
                         GLuint end_of_thread,
                         GLuint send_commit_msg)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      /* Use the Render Cache for RT writes; otherwise use the Data Cache */
      if (msg_type == GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      /* Use the render cache for all write messages. */
      sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_WRITE;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, end_of_thread);

   if (intel->gen >= 7) {
      /* last_render_target lives in bit 6 of msg_control on gen7. */
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control |
                                        last_render_target << 6;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      /* last_render_target lives in bit 5 of msg_control on gen6. */
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control |
                                        last_render_target << 5;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = send_commit_msg;
   } else if (intel->gen == 5) {
      insn->bits3.dp_write_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_write_gen5.msg_control = msg_control;
      insn->bits3.dp_write_gen5.last_render_target = last_render_target;
      insn->bits3.dp_write_gen5.msg_type = msg_type;
      insn->bits3.dp_write_gen5.send_commit_msg = send_commit_msg;
   } else {
      insn->bits3.dp_write.binding_table_index = binding_table_index;
      insn->bits3.dp_write.msg_control = msg_control;
      insn->bits3.dp_write.last_render_target = last_render_target;
      insn->bits3.dp_write.msg_type = msg_type;
      insn->bits3.dp_write.send_commit_msg = send_commit_msg;
   }
}
651
/**
 * Fill in the descriptor for a data-port read message.
 *
 * Selects the SFID by generation (and, on gen6, by target cache), then
 * writes the per-generation descriptor fields.
 */
void
brw_set_dp_read_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint msg_control,
                        GLuint msg_type,
                        GLuint target_cache,
                        GLuint msg_length,
                        bool header_present,
                        GLuint response_length)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;
   unsigned sfid;

   if (intel->gen >= 7) {
      sfid = GEN7_SFID_DATAPORT_DATA_CACHE;
   } else if (intel->gen == 6) {
      if (target_cache == BRW_DATAPORT_READ_TARGET_RENDER_CACHE)
         sfid = GEN6_SFID_DATAPORT_RENDER_CACHE;
      else
         sfid = GEN6_SFID_DATAPORT_SAMPLER_CACHE;
   } else {
      sfid = BRW_SFID_DATAPORT_READ;
   }

   brw_set_message_descriptor(p, insn, sfid, msg_length, response_length,
                              header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.gen7_dp.binding_table_index = binding_table_index;
      insn->bits3.gen7_dp.msg_control = msg_control;
      insn->bits3.gen7_dp.msg_type = msg_type;
   } else if (intel->gen == 6) {
      insn->bits3.gen6_dp.binding_table_index = binding_table_index;
      insn->bits3.gen6_dp.msg_control = msg_control;
      insn->bits3.gen6_dp.msg_type = msg_type;
      insn->bits3.gen6_dp.send_commit_msg = 0;
   } else if (intel->gen == 5) {
      insn->bits3.dp_read_gen5.binding_table_index = binding_table_index;
      insn->bits3.dp_read_gen5.msg_control = msg_control;
      insn->bits3.dp_read_gen5.msg_type = msg_type;
      insn->bits3.dp_read_gen5.target_cache = target_cache;
   } else if (intel->is_g4x) {
      insn->bits3.dp_read_g4x.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read_g4x.msg_control = msg_control;  /*8:10*/
      insn->bits3.dp_read_g4x.msg_type = msg_type;  /*11:13*/
      insn->bits3.dp_read_g4x.target_cache = target_cache;  /*14:15*/
   } else {
      insn->bits3.dp_read.binding_table_index = binding_table_index; /*0:7*/
      insn->bits3.dp_read.msg_control = msg_control;  /*8:11*/
      insn->bits3.dp_read.msg_type = msg_type;  /*12:13*/
      insn->bits3.dp_read.target_cache = target_cache;  /*14:15*/
   }
}
707
/**
 * Fill in the descriptor for a sampler message, using the field layout
 * for the current generation (gen7 / gen5-6 / g4x / original).
 * \p return_format is only encoded on the original (pre-g4x) layout.
 */
void
brw_set_sampler_message(struct brw_compile *p,
                        struct brw_instruction *insn,
                        GLuint binding_table_index,
                        GLuint sampler,
                        GLuint msg_type,
                        GLuint response_length,
                        GLuint msg_length,
                        GLuint header_present,
                        GLuint simd_mode,
                        GLuint return_format)
{
   struct brw_context *brw = p->brw;
   struct intel_context *intel = &brw->intel;

   brw_set_message_descriptor(p, insn, BRW_SFID_SAMPLER, msg_length,
                              response_length, header_present, false);

   if (intel->gen >= 7) {
      insn->bits3.sampler_gen7.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen7.sampler = sampler;
      insn->bits3.sampler_gen7.msg_type = msg_type;
      insn->bits3.sampler_gen7.simd_mode = simd_mode;
   } else if (intel->gen >= 5) {
      insn->bits3.sampler_gen5.binding_table_index = binding_table_index;
      insn->bits3.sampler_gen5.sampler = sampler;
      insn->bits3.sampler_gen5.msg_type = msg_type;
      insn->bits3.sampler_gen5.simd_mode = simd_mode;
   } else if (intel->is_g4x) {
      insn->bits3.sampler_g4x.binding_table_index = binding_table_index;
      insn->bits3.sampler_g4x.sampler = sampler;
      insn->bits3.sampler_g4x.msg_type = msg_type;
   } else {
      insn->bits3.sampler.binding_table_index = binding_table_index;
      insn->bits3.sampler.sampler = sampler;
      insn->bits3.sampler.msg_type = msg_type;
      insn->bits3.sampler.return_format = return_format;
   }
}
747
748
749#define next_insn brw_next_insn
750struct brw_instruction *
751brw_next_insn(struct brw_compile *p, GLuint opcode)
752{
753 struct brw_instruction *insn;
754
755 if (p->nr_insn + 1 > p->store_size) {
756 if (0)
757 printf("incresing the store size to %d\n", p->store_size << 1);
758 p->store_size <<= 1;
759 p->store = reralloc(p->mem_ctx, p->store,
760 struct brw_instruction, p->store_size);
761 if (!p->store)
762 assert(!"realloc eu store memeory failed");
763 }
764
765 p->next_insn_offset += 16;
766 insn = &p->store[p->nr_insn++];
767 memcpy(insn, p->current, sizeof(*insn));
768
769 /* Reset this one-shot flag:
770 */
771
772 if (p->current->header.destreg__conditionalmod) {
773 p->current->header.destreg__conditionalmod = 0;
774 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
775 }
776
777 insn->header.opcode = opcode;
778 return insn;
779}
780
781static struct brw_instruction *brw_alu1( struct brw_compile *p,
782 GLuint opcode,
783 struct brw_reg dest,
784 struct brw_reg src )
785{
786 struct brw_instruction *insn = next_insn(p, opcode);
787 brw_set_dest(p, insn, dest);
788 brw_set_src0(p, insn, src);
789 return insn;
790}
791
792static struct brw_instruction *brw_alu2(struct brw_compile *p,
793 GLuint opcode,
794 struct brw_reg dest,
795 struct brw_reg src0,
796 struct brw_reg src1 )
797{
798 struct brw_instruction *insn = next_insn(p, opcode);
799 brw_set_dest(p, insn, dest);
800 brw_set_src0(p, insn, src0);
801 brw_set_src1(p, insn, src1);
802 return insn;
803}
804
805static int
806get_3src_subreg_nr(struct brw_reg reg)
807{
808 if (reg.vstride == BRW_VERTICAL_STRIDE_0) {
809 assert(brw_is_single_value_swizzle(reg.dw1.bits.swizzle));
810 return reg.subnr / 4 + BRW_GET_SWZ(reg.dw1.bits.swizzle, 0);
811 } else {
812 return reg.subnr / 4;
813 }
814}
815
Damien Lespiau67f3f942013-01-31 01:27:36 +0000816static int get_3src_type(int type)
817{
818 assert(type == BRW_REGISTER_TYPE_F ||
819 type == BRW_REGISTER_TYPE_D ||
820 type == BRW_REGISTER_TYPE_UD);
821
822 switch(type) {
823 case BRW_REGISTER_TYPE_F: return BRW_REGISTER_3SRC_TYPE_F;
824 case BRW_REGISTER_TYPE_D: return BRW_REGISTER_3SRC_TYPE_D;
825 case BRW_REGISTER_TYPE_UD: return BRW_REGISTER_3SRC_TYPE_UD;
826 }
827
828 return BRW_REGISTER_3SRC_TYPE_F;
829}
830
/**
 * Encode \p dest as the destination of a 3-source (align16-only)
 * instruction and derive the execution size from its width.
 * The destination must be a GRF or MRF with direct addressing.
 */
void
brw_set_3src_dest(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg dest)
{
   /* gen7 emulates MRFs with the top of the GRF file. */
   gen7_convert_mrf_to_grf(p, &dest);

   assert(insn->header.access_mode == BRW_ALIGN_16);

   assert(dest.file == BRW_GENERAL_REGISTER_FILE ||
          dest.file == BRW_MESSAGE_REGISTER_FILE);
   assert(dest.nr < 128);
   assert(dest.address_mode == BRW_ADDRESS_DIRECT);
   insn->bits1.da3src.dest_reg_type = get_3src_type(dest.type);
   /* dest_reg_file is a single bit: 1 = MRF, 0 = GRF. */
   insn->bits1.da3src.dest_reg_file = (dest.file == BRW_MESSAGE_REGISTER_FILE);
   insn->bits1.da3src.dest_reg_nr = dest.nr;
   /* Subregister is expressed in 16-byte units here. */
   insn->bits1.da3src.dest_subreg_nr = dest.subnr / 16;
   insn->bits1.da3src.dest_writemask = dest.dw1.bits.writemask;
   guess_execution_size(p, insn, dest);
}
Damien Lespiau042e9352013-01-19 23:27:46 +0000851
/**
 * Encode \p src0 as the first source of a 3-source instruction.  src0
 * also establishes the shared src_reg_type used by all three sources.
 * Must be a directly-addressed GRF.
 */
void
brw_set_3src_src0(struct brw_compile *p,
                  struct brw_instruction *insn,
                  struct brw_reg src0)
{
   assert(src0.file == BRW_GENERAL_REGISTER_FILE);
   assert(src0.address_mode == BRW_ADDRESS_DIRECT);
   assert(src0.nr < 128);
   /* One type field is shared by all three sources. */
   insn->bits1.da3src.src_reg_type = get_3src_type(src0.type);
   insn->bits2.da3src.src0_swizzle = src0.dw1.bits.swizzle;
   insn->bits2.da3src.src0_subreg_nr = get_3src_subreg_nr(src0);
   insn->bits2.da3src.src0_reg_nr = src0.nr;
   insn->bits1.da3src.src0_abs = src0.abs;
   insn->bits1.da3src.src0_negate = src0.negate;
   /* rep_ctrl replicates a scalar (vstride 0) source. */
   insn->bits2.da3src.src0_rep_ctrl = src0.vstride == BRW_VERTICAL_STRIDE_0;
}
Damien Lespiau042e9352013-01-19 23:27:46 +0000868
Damien Lespiaua2a65832013-01-31 00:53:46 +0000869void
870brw_set_3src_src1(struct brw_compile *p,
871 struct brw_instruction *insn,
872 struct brw_reg src1)
873{
Damien Lespiau042e9352013-01-19 23:27:46 +0000874 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
875 assert(src1.address_mode == BRW_ADDRESS_DIRECT);
876 assert(src1.nr < 128);
Damien Lespiau67f3f942013-01-31 01:27:36 +0000877 assert(src1.type == insn->bits1.da3src.src_reg_type);
Damien Lespiau042e9352013-01-19 23:27:46 +0000878 insn->bits2.da3src.src1_swizzle = src1.dw1.bits.swizzle;
879 insn->bits2.da3src.src1_subreg_nr_low = get_3src_subreg_nr(src1) & 0x3;
880 insn->bits3.da3src.src1_subreg_nr_high = get_3src_subreg_nr(src1) >> 2;
881 insn->bits2.da3src.src1_rep_ctrl = src1.vstride == BRW_VERTICAL_STRIDE_0;
882 insn->bits3.da3src.src1_reg_nr = src1.nr;
883 insn->bits1.da3src.src1_abs = src1.abs;
884 insn->bits1.da3src.src1_negate = src1.negate;
Damien Lespiaua2a65832013-01-31 00:53:46 +0000885}
Damien Lespiau042e9352013-01-19 23:27:46 +0000886
Damien Lespiaua2a65832013-01-31 00:53:46 +0000887void
888brw_set_3src_src2(struct brw_compile *p,
889 struct brw_instruction *insn,
890 struct brw_reg src2)
891{
Damien Lespiau042e9352013-01-19 23:27:46 +0000892 assert(src2.file == BRW_GENERAL_REGISTER_FILE);
893 assert(src2.address_mode == BRW_ADDRESS_DIRECT);
894 assert(src2.nr < 128);
Damien Lespiau67f3f942013-01-31 01:27:36 +0000895 assert(src2.type == insn->bits1.da3src.src_reg_type);
Damien Lespiau042e9352013-01-19 23:27:46 +0000896 insn->bits3.da3src.src2_swizzle = src2.dw1.bits.swizzle;
897 insn->bits3.da3src.src2_subreg_nr = get_3src_subreg_nr(src2);
898 insn->bits3.da3src.src2_rep_ctrl = src2.vstride == BRW_VERTICAL_STRIDE_0;
899 insn->bits3.da3src.src2_reg_nr = src2.nr;
900 insn->bits1.da3src.src2_abs = src2.abs;
901 insn->bits1.da3src.src2_negate = src2.negate;
Damien Lespiaua2a65832013-01-31 00:53:46 +0000902}
Damien Lespiau042e9352013-01-19 23:27:46 +0000903
Damien Lespiaua2a65832013-01-31 00:53:46 +0000904static struct brw_instruction *brw_alu3(struct brw_compile *p,
905 GLuint opcode,
906 struct brw_reg dest,
907 struct brw_reg src0,
908 struct brw_reg src1,
909 struct brw_reg src2)
910{
911 struct brw_instruction *insn = next_insn(p, opcode);
912 brw_set_3src_dest(p, insn, dest);
913 brw_set_3src_src0(p, insn, src0);
914 brw_set_3src_src1(p, insn, src1);
915 brw_set_3src_src2(p, insn, src2);
Damien Lespiau042e9352013-01-19 23:27:46 +0000916 return insn;
917}
918
919
920/***********************************************************************
921 * Convenience routines.
922 */
/* Template defining brw_<OP>(p, dest, src0) for a one-source opcode. */
#define ALU1(OP)							\
struct brw_instruction *brw_##OP(struct brw_compile *p,			\
	      struct brw_reg dest,					\
	      struct brw_reg src0)					\
{									\
   return brw_alu1(p, BRW_OPCODE_##OP, dest, src0);			\
}
930
/* Template defining brw_<OP>(p, dest, src0, src1) for a two-source opcode. */
#define ALU2(OP)							\
struct brw_instruction *brw_##OP(struct brw_compile *p,			\
	      struct brw_reg dest,					\
	      struct brw_reg src0,					\
	      struct brw_reg src1)					\
{									\
   return brw_alu2(p, BRW_OPCODE_##OP, dest, src0, src1);		\
}
939
/* Template defining brw_<OP>(p, dest, src0, src1, src2) for a
 * three-source opcode.
 */
#define ALU3(OP)							\
struct brw_instruction *brw_##OP(struct brw_compile *p,			\
	      struct brw_reg dest,					\
	      struct brw_reg src0,					\
	      struct brw_reg src1,					\
	      struct brw_reg src2)					\
{									\
   return brw_alu3(p, BRW_OPCODE_##OP, dest, src0, src1, src2);		\
}
949
950/* Rounding operations (other than RNDD) require two instructions - the first
951 * stores a rounded value (possibly the wrong way) in the dest register, but
952 * also sets a per-channel "increment bit" in the flag register. A predicated
953 * add of 1.0 fixes dest to contain the desired result.
954 *
955 * Sandybridge and later appear to round correctly without an ADD.
956 */
/* Template for the rounding sequence described above: the round
 * instruction itself plus, pre-gen6, a predicated ADD of 1.0 driven by
 * the per-channel round-increment flag.
 */
#define ROUND(OP)							\
void brw_##OP(struct brw_compile *p,					\
	      struct brw_reg dest,					\
	      struct brw_reg src)					\
{									\
   struct brw_instruction *rnd, *add;					\
   rnd = next_insn(p, BRW_OPCODE_##OP);					\
   brw_set_dest(p, rnd, dest);						\
   brw_set_src0(p, rnd, src);						\
									\
   if (p->brw->intel.gen < 6) {						\
      /* turn on round-increments */					\
      rnd->header.destreg__conditionalmod = BRW_CONDITIONAL_R;		\
      add = brw_ADD(p, dest, dest, brw_imm_f(1.0f));			\
      add->header.predicate_control = BRW_PREDICATE_NORMAL;		\
   }									\
}
974
975
/* Instantiate the public emitters.  Each line expands to a complete
 * brw_<OP>() definition via the ALU1/ALU2/ALU3/ROUND templates above.
 */
ALU1(MOV)
ALU2(SEL)
ALU1(NOT)
ALU2(AND)
ALU2(OR)
ALU2(XOR)
ALU2(SHR)
ALU2(SHL)
ALU2(RSR)
ALU2(RSL)
ALU2(ASR)
ALU1(FRC)
ALU1(RNDD)
ALU2(MAC)
ALU2(MACH)
ALU1(LZD)
ALU2(DP4)
ALU2(DPH)
ALU2(DP3)
ALU2(DP2)
ALU2(LINE)
ALU2(PLN)
ALU3(MAD)

/* RNDZ/RNDE need the round-increment fixup pre-gen6 (see ROUND above). */
ROUND(RNDZ)
ROUND(RNDE)
1002
1003
1004struct brw_instruction *brw_ADD(struct brw_compile *p,
1005 struct brw_reg dest,
1006 struct brw_reg src0,
1007 struct brw_reg src1)
1008{
1009 /* 6.2.2: add */
1010 if (src0.type == BRW_REGISTER_TYPE_F ||
1011 (src0.file == BRW_IMMEDIATE_VALUE &&
1012 src0.type == BRW_REGISTER_TYPE_VF)) {
1013 assert(src1.type != BRW_REGISTER_TYPE_UD);
1014 assert(src1.type != BRW_REGISTER_TYPE_D);
1015 }
1016
1017 if (src1.type == BRW_REGISTER_TYPE_F ||
1018 (src1.file == BRW_IMMEDIATE_VALUE &&
1019 src1.type == BRW_REGISTER_TYPE_VF)) {
1020 assert(src0.type != BRW_REGISTER_TYPE_UD);
1021 assert(src0.type != BRW_REGISTER_TYPE_D);
1022 }
1023
1024 return brw_alu2(p, BRW_OPCODE_ADD, dest, src0, src1);
1025}
1026
1027struct brw_instruction *brw_AVG(struct brw_compile *p,
1028 struct brw_reg dest,
1029 struct brw_reg src0,
1030 struct brw_reg src1)
1031{
1032 assert(dest.type == src0.type);
1033 assert(src0.type == src1.type);
1034 switch (src0.type) {
1035 case BRW_REGISTER_TYPE_B:
1036 case BRW_REGISTER_TYPE_UB:
1037 case BRW_REGISTER_TYPE_W:
1038 case BRW_REGISTER_TYPE_UW:
1039 case BRW_REGISTER_TYPE_D:
1040 case BRW_REGISTER_TYPE_UD:
1041 break;
1042 default:
1043 assert(!"Bad type for brw_AVG");
1044 }
1045
1046 return brw_alu2(p, BRW_OPCODE_AVG, dest, src0, src1);
1047}
1048
1049struct brw_instruction *brw_MUL(struct brw_compile *p,
1050 struct brw_reg dest,
1051 struct brw_reg src0,
1052 struct brw_reg src1)
1053{
1054 /* 6.32.38: mul */
1055 if (src0.type == BRW_REGISTER_TYPE_D ||
1056 src0.type == BRW_REGISTER_TYPE_UD ||
1057 src1.type == BRW_REGISTER_TYPE_D ||
1058 src1.type == BRW_REGISTER_TYPE_UD) {
1059 assert(dest.type != BRW_REGISTER_TYPE_F);
1060 }
1061
1062 if (src0.type == BRW_REGISTER_TYPE_F ||
1063 (src0.file == BRW_IMMEDIATE_VALUE &&
1064 src0.type == BRW_REGISTER_TYPE_VF)) {
1065 assert(src1.type != BRW_REGISTER_TYPE_UD);
1066 assert(src1.type != BRW_REGISTER_TYPE_D);
1067 }
1068
1069 if (src1.type == BRW_REGISTER_TYPE_F ||
1070 (src1.file == BRW_IMMEDIATE_VALUE &&
1071 src1.type == BRW_REGISTER_TYPE_VF)) {
1072 assert(src0.type != BRW_REGISTER_TYPE_UD);
1073 assert(src0.type != BRW_REGISTER_TYPE_D);
1074 }
1075
1076 assert(src0.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1077 src0.nr != BRW_ARF_ACCUMULATOR);
1078 assert(src1.file != BRW_ARCHITECTURE_REGISTER_FILE ||
1079 src1.nr != BRW_ARF_ACCUMULATOR);
1080
1081 return brw_alu2(p, BRW_OPCODE_MUL, dest, src0, src1);
1082}
1083
1084
1085void brw_NOP(struct brw_compile *p)
1086{
1087 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_NOP);
1088 brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1089 brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
1090 brw_set_src1(p, insn, brw_imm_ud(0x0));
1091}
1092
1093
1094
1095
1096
1097/***********************************************************************
1098 * Comparisons, if/else/endif
1099 */
1100
1101struct brw_instruction *brw_JMPI(struct brw_compile *p,
1102 struct brw_reg dest,
1103 struct brw_reg src0,
1104 struct brw_reg src1)
1105{
1106 struct brw_instruction *insn = brw_alu2(p, BRW_OPCODE_JMPI, dest, src0, src1);
1107
1108 insn->header.execution_size = 1;
1109 insn->header.compression_control = BRW_COMPRESSION_NONE;
1110 insn->header.mask_control = BRW_MASK_DISABLE;
1111
1112 p->current->header.predicate_control = BRW_PREDICATE_NONE;
1113
1114 return insn;
1115}
1116
1117static void
1118push_if_stack(struct brw_compile *p, struct brw_instruction *inst)
1119{
1120 p->if_stack[p->if_stack_depth] = inst - p->store;
1121
1122 p->if_stack_depth++;
1123 if (p->if_stack_array_size <= p->if_stack_depth) {
1124 p->if_stack_array_size *= 2;
1125 p->if_stack = reralloc(p->mem_ctx, p->if_stack, int,
1126 p->if_stack_array_size);
1127 }
1128}
1129
1130static struct brw_instruction *
1131pop_if_stack(struct brw_compile *p)
1132{
1133 p->if_stack_depth--;
1134 return &p->store[p->if_stack[p->if_stack_depth]];
1135}
1136
1137static void
1138push_loop_stack(struct brw_compile *p, struct brw_instruction *inst)
1139{
1140 if (p->loop_stack_array_size < p->loop_stack_depth) {
1141 p->loop_stack_array_size *= 2;
1142 p->loop_stack = reralloc(p->mem_ctx, p->loop_stack, int,
1143 p->loop_stack_array_size);
1144 p->if_depth_in_loop = reralloc(p->mem_ctx, p->if_depth_in_loop, int,
1145 p->loop_stack_array_size);
1146 }
1147
1148 p->loop_stack[p->loop_stack_depth] = inst - p->store;
1149 p->loop_stack_depth++;
1150 p->if_depth_in_loop[p->loop_stack_depth] = 0;
1151}
1152
1153static struct brw_instruction *
1154get_inner_do_insn(struct brw_compile *p)
1155{
1156 return &p->store[p->loop_stack[p->loop_stack_depth - 1]];
1157}
1158
1159/* EU takes the value from the flag register and pushes it onto some
1160 * sort of a stack (presumably merging with any flag value already on
1161 * the stack). Within an if block, the flags at the top of the stack
1162 * control execution on each channel of the unit, eg. on each of the
1163 * 16 pixel values in our wm programs.
1164 *
1165 * When the matching 'else' instruction is reached (presumably by
1166 * countdown of the instruction count patched in by our ELSE/ENDIF
1167 * functions), the relevent flags are inverted.
1168 *
1169 * When the matching 'endif' instruction is reached, the flags are
1170 * popped off. If the stack is now empty, normal execution resumes.
1171 */
/* Emit an IF and push it on the if-stack for later patching by
 * brw_ELSE()/brw_ENDIF().  All jump fields are left zero here;
 * patch_IF_ELSE() fills them in once the targets are known.
 */
struct brw_instruction *
brw_IF(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_IF);

   /* Override the defaults for this instruction: the operand encoding
    * for IF changed at gen6 (immediate word dest) and again at gen7
    * (null dest, JIP/UIP in bits3).
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
   } else {
      brw_set_dest(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src0(p, insn, vec1(retype(brw_null_reg(), BRW_REGISTER_TYPE_D)));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.execution_size = execute_size;
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.predicate_control = BRW_PREDICATE_NORMAL;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   /* The IF consumed the current predicate; don't let it apply to what
    * follows.
    */
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   push_if_stack(p, insn);
   p->if_depth_in_loop[p->loop_stack_depth]++;
   return insn;
}
1212
1213/* This function is only used for gen6-style IF instructions with an
1214 * embedded comparison (conditional modifier). It is not used on gen7.
1215 */
1216struct brw_instruction *
1217gen6_IF(struct brw_compile *p, uint32_t conditional,
1218 struct brw_reg src0, struct brw_reg src1)
1219{
1220 struct brw_instruction *insn;
1221
1222 insn = next_insn(p, BRW_OPCODE_IF);
1223
1224 brw_set_dest(p, insn, brw_imm_w(0));
1225 if (p->compressed) {
1226 insn->header.execution_size = BRW_EXECUTE_16;
1227 } else {
1228 insn->header.execution_size = BRW_EXECUTE_8;
1229 }
1230 insn->bits1.branch_gen6.jump_count = 0;
1231 brw_set_src0(p, insn, src0);
1232 brw_set_src1(p, insn, src1);
1233
1234 assert(insn->header.compression_control == BRW_COMPRESSION_NONE);
1235 assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
1236 insn->header.destreg__conditionalmod = conditional;
1237
1238 if (!p->single_program_flow)
1239 insn->header.thread_control = BRW_THREAD_SWITCH;
1240
1241 push_if_stack(p, insn);
1242 return insn;
1243}
1244
1245/**
1246 * In single-program-flow (SPF) mode, convert IF and ELSE into ADDs.
1247 */
static void
convert_IF_ELSE_to_ADD(struct brw_compile *p,
                       struct brw_instruction *if_inst,
                       struct brw_instruction *else_inst)
{
   /* The next instruction (where the ENDIF would be, if it existed) */
   struct brw_instruction *next_inst = &p->store[p->nr_insn];

   assert(p->single_program_flow);
   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);
   assert(if_inst->header.execution_size == BRW_EXECUTE_1);

   /* Convert IF to an ADD instruction that moves the instruction pointer
    * to the first instruction of the ELSE block.  If there is no ELSE
    * block, point to where ENDIF would be.  Reverse the predicate.
    *
    * There's no need to execute an ENDIF since we don't need to do any
    * stack operations, and if we're currently executing, we just want to
    * continue normally.
    */
   if_inst->header.opcode = BRW_OPCODE_ADD;
   if_inst->header.predicate_inverse = 1;

   if (else_inst != NULL) {
      /* Convert ELSE to an ADD instruction that points where the ENDIF
       * would be.
       */
      else_inst->header.opcode = BRW_OPCODE_ADD;

      /* IP offsets are in bytes; each instruction is 16 bytes. */
      if_inst->bits3.ud = (else_inst - if_inst + 1) * 16;
      else_inst->bits3.ud = (next_inst - else_inst) * 16;
   } else {
      if_inst->bits3.ud = (next_inst - if_inst) * 16;
   }
}
1284
1285/**
1286 * Patch IF and ELSE instructions with appropriate jump targets.
1287 */
static void
patch_IF_ELSE(struct brw_compile *p,
              struct brw_instruction *if_inst,
              struct brw_instruction *else_inst,
              struct brw_instruction *endif_inst)
{
   struct intel_context *intel = &p->brw->intel;

   /* We shouldn't be patching IF and ELSE instructions in single program flow
    * mode when gen < 6, because in single program flow mode on those
    * platforms, we convert flow control instructions to conditional ADDs that
    * operate on IP (see brw_ENDIF).
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we do patch IF and ELSE
    * instructions in single program flow mode on those platforms.
    */
   if (intel->gen < 6)
      assert(!p->single_program_flow);

   assert(if_inst != NULL && if_inst->header.opcode == BRW_OPCODE_IF);
   assert(endif_inst != NULL);
   assert(else_inst == NULL || else_inst->header.opcode == BRW_OPCODE_ELSE);

   unsigned br = 1;
   /* Jump count is for 64bit data chunk each, so one 128bit instruction
    * requires 2 chunks.  All jump counts below are relative to the
    * instruction being patched.
    */
   if (intel->gen >= 5)
      br = 2;

   assert(endif_inst->header.opcode == BRW_OPCODE_ENDIF);
   endif_inst->header.execution_size = if_inst->header.execution_size;

   if (else_inst == NULL) {
      /* Patch IF -> ENDIF */
      if (intel->gen < 6) {
	 /* Turn it into an IFF, which means no mask stack operations for
	  * all-false and jumping past the ENDIF.
	  */
	 if_inst->header.opcode = BRW_OPCODE_IFF;
	 if_inst->bits3.if_else.jump_count = br * (endif_inst - if_inst + 1);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* As of gen6, there is no IFF and IF must point to the ENDIF. */
	 if_inst->bits1.branch_gen6.jump_count = br * (endif_inst - if_inst);
      } else {
	 /* gen7: both the jump (JIP) and the all-channels-dead target
	  * (UIP) land on the ENDIF.
	  */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 if_inst->bits3.break_cont.jip = br * (endif_inst - if_inst);
      }
   } else {
      else_inst->header.execution_size = if_inst->header.execution_size;

      /* Patch IF -> ELSE */
      if (intel->gen < 6) {
	 if_inst->bits3.if_else.jump_count = br * (else_inst - if_inst);
	 if_inst->bits3.if_else.pop_count = 0;
	 if_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 if_inst->bits1.branch_gen6.jump_count = br * (else_inst - if_inst + 1);
      }

      /* Patch ELSE -> ENDIF */
      if (intel->gen < 6) {
	 /* BRW_OPCODE_ELSE pre-gen6 should point just past the
	  * matching ENDIF.
	  */
	 else_inst->bits3.if_else.jump_count = br*(endif_inst - else_inst + 1);
	 else_inst->bits3.if_else.pop_count = 1;
	 else_inst->bits3.if_else.pad0 = 0;
      } else if (intel->gen == 6) {
	 /* BRW_OPCODE_ELSE on gen6 should point to the matching ENDIF. */
	 else_inst->bits1.branch_gen6.jump_count = br*(endif_inst - else_inst);
      } else {
	 /* The IF instruction's JIP should point just past the ELSE */
	 if_inst->bits3.break_cont.jip = br * (else_inst - if_inst + 1);
	 /* The IF instruction's UIP and ELSE's JIP should point to ENDIF */
	 if_inst->bits3.break_cont.uip = br * (endif_inst - if_inst);
	 else_inst->bits3.break_cont.jip = br * (endif_inst - else_inst);
      }
   }
}
1374
/* Emit an ELSE for the innermost open IF and push it on the if-stack.
 * Its jump fields are left zero here and patched by brw_ENDIF() via
 * patch_IF_ELSE().
 */
void
brw_ELSE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   insn = next_insn(p, BRW_OPCODE_ELSE);

   /* Operand encoding mirrors brw_IF(): IP-relative pre-gen6, immediate
    * word dest on gen6, null operands with JIP/UIP on gen7+.
    */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, brw_ip_reg());
      brw_set_src0(p, insn, brw_ip_reg());
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = 0;
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      insn->bits3.break_cont.jip = 0;
      insn->bits3.break_cont.uip = 0;
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   if (!p->single_program_flow)
      insn->header.thread_control = BRW_THREAD_SWITCH;

   push_if_stack(p, insn);
}
1407
/* Close the innermost IF/ELSE: emit an ENDIF (or, in SPF mode on
 * gen4/5, convert the IF/ELSE into ADDs on IP instead) and patch the
 * pending jump targets.
 */
void
brw_ENDIF(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn = NULL;
   struct brw_instruction *else_inst = NULL;
   struct brw_instruction *if_inst = NULL;
   struct brw_instruction *tmp;
   bool emit_endif = true;

   /* In single program flow mode, we can express IF and ELSE instructions
    * equivalently as ADD instructions that operate on IP.  On platforms prior
    * to Gen6, flow control instructions cause an implied thread switch, so
    * this is a significant savings.
    *
    * However, on Gen6, writing to IP doesn't work in single program flow mode
    * (see the SandyBridge PRM, Volume 4 part 2, p79: "When SPF is ON, IP may
    * not be updated by non-flow control instructions."). And on later
    * platforms, there is no significant benefit to converting control flow
    * instructions to conditional ADDs.  So we only do this trick on Gen4 and
    * Gen5.
    */
   if (intel->gen < 6 && p->single_program_flow)
      emit_endif = false;

   /*
    * A single next_insn() may change the base address of instruction store
    * memory (p->store), so call it first before referencing the instruction
    * store pointer from an index
    */
   if (emit_endif)
      insn = next_insn(p, BRW_OPCODE_ENDIF);

   /* Pop the IF and (optional) ELSE instructions from the stack */
   p->if_depth_in_loop[p->loop_stack_depth]--;
   tmp = pop_if_stack(p);
   if (tmp->header.opcode == BRW_OPCODE_ELSE) {
      else_inst = tmp;
      tmp = pop_if_stack(p);
   }
   if_inst = tmp;

   if (!emit_endif) {
      /* ENDIF is useless; don't bother emitting it. */
      convert_IF_ELSE_to_ADD(p, if_inst, else_inst);
      return;
   }

   /* Operand encoding, per-generation (cf. brw_IF/brw_ELSE). */
   if (intel->gen < 6) {
      brw_set_dest(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src0(p, insn, retype(brw_vec4_grf(0,0), BRW_REGISTER_TYPE_UD));
      brw_set_src1(p, insn, brw_imm_d(0x0));
   } else if (intel->gen == 6) {
      brw_set_dest(p, insn, brw_imm_w(0));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
   } else {
      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
   }

   insn->header.compression_control = BRW_COMPRESSION_NONE;
   insn->header.mask_control = BRW_MASK_ENABLE;
   insn->header.thread_control = BRW_THREAD_SWITCH;

   /* Also pop item off the stack in the endif instruction: */
   if (intel->gen < 6) {
      insn->bits3.if_else.jump_count = 0;
      insn->bits3.if_else.pop_count = 1;
      insn->bits3.if_else.pad0 = 0;
   } else if (intel->gen == 6) {
      insn->bits1.branch_gen6.jump_count = 2;
   } else {
      insn->bits3.break_cont.jip = 2;
   }
   patch_IF_ELSE(p, if_inst, else_inst, insn);
}
1486
1487struct brw_instruction *brw_BREAK(struct brw_compile *p)
1488{
1489 struct intel_context *intel = &p->brw->intel;
1490 struct brw_instruction *insn;
1491
1492 insn = next_insn(p, BRW_OPCODE_BREAK);
1493 if (intel->gen >= 6) {
1494 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1495 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1496 brw_set_src1(p, insn, brw_imm_d(0x0));
1497 } else {
1498 brw_set_dest(p, insn, brw_ip_reg());
1499 brw_set_src0(p, insn, brw_ip_reg());
1500 brw_set_src1(p, insn, brw_imm_d(0x0));
1501 insn->bits3.if_else.pad0 = 0;
1502 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1503 }
1504 insn->header.compression_control = BRW_COMPRESSION_NONE;
1505 insn->header.execution_size = BRW_EXECUTE_8;
1506
1507 return insn;
1508}
1509
1510struct brw_instruction *gen6_CONT(struct brw_compile *p)
1511{
1512 struct brw_instruction *insn;
1513
1514 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1515 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1516 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1517 brw_set_dest(p, insn, brw_ip_reg());
1518 brw_set_src0(p, insn, brw_ip_reg());
1519 brw_set_src1(p, insn, brw_imm_d(0x0));
1520
1521 insn->header.compression_control = BRW_COMPRESSION_NONE;
1522 insn->header.execution_size = BRW_EXECUTE_8;
1523 return insn;
1524}
1525
1526struct brw_instruction *brw_CONT(struct brw_compile *p)
1527{
1528 struct brw_instruction *insn;
1529 insn = next_insn(p, BRW_OPCODE_CONTINUE);
1530 brw_set_dest(p, insn, brw_ip_reg());
1531 brw_set_src0(p, insn, brw_ip_reg());
1532 brw_set_src1(p, insn, brw_imm_d(0x0));
1533 insn->header.compression_control = BRW_COMPRESSION_NONE;
1534 insn->header.execution_size = BRW_EXECUTE_8;
1535 /* insn->header.mask_control = BRW_MASK_DISABLE; */
1536 insn->bits3.if_else.pad0 = 0;
1537 insn->bits3.if_else.pop_count = p->if_depth_in_loop[p->loop_stack_depth];
1538 return insn;
1539}
1540
1541struct brw_instruction *gen6_HALT(struct brw_compile *p)
1542{
1543 struct brw_instruction *insn;
1544
1545 insn = next_insn(p, BRW_OPCODE_HALT);
1546 brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1547 brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
1548 brw_set_src1(p, insn, brw_imm_d(0x0)); /* UIP and JIP, updated later. */
1549
1550 if (p->compressed) {
1551 insn->header.execution_size = BRW_EXECUTE_16;
1552 } else {
1553 insn->header.compression_control = BRW_COMPRESSION_NONE;
1554 insn->header.execution_size = BRW_EXECUTE_8;
1555 }
1556 return insn;
1557}
1558
1559/* DO/WHILE loop:
1560 *
1561 * The DO/WHILE is just an unterminated loop -- break or continue are
1562 * used for control within the loop. We have a few ways they can be
1563 * done.
1564 *
1565 * For uniform control flow, the WHILE is just a jump, so ADD ip, ip,
1566 * jip and no DO instruction.
1567 *
1568 * For non-uniform control flow pre-gen6, there's a DO instruction to
1569 * push the mask, and a WHILE to jump back, and BREAK to get out and
1570 * pop the mask.
1571 *
1572 * For gen6, there's no more mask stack, so no need for DO. WHILE
1573 * just points back to the first instruction of the loop.
1574 */
/* Open a loop.  On gen6+ and in single-program-flow mode no DO
 * instruction is emitted: the loop start is simply the store slot of
 * the next instruction.  Pre-gen6, a real DO is emitted to push the
 * execution mask.
 */
struct brw_instruction *brw_DO(struct brw_compile *p, GLuint execute_size)
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6 || p->single_program_flow) {
      /* No instruction emitted; record (and return) the upcoming slot. */
      push_loop_stack(p, &p->store[p->nr_insn]);
      return &p->store[p->nr_insn];
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_DO);

      push_loop_stack(p, insn);

      /* Override the defaults for this instruction:
       */
      brw_set_dest(p, insn, brw_null_reg());
      brw_set_src0(p, insn, brw_null_reg());
      brw_set_src1(p, insn, brw_null_reg());

      insn->header.compression_control = BRW_COMPRESSION_NONE;
      insn->header.execution_size = execute_size;
      insn->header.predicate_control = BRW_PREDICATE_NONE;
      /* insn->header.mask_control = BRW_MASK_ENABLE; */
      /* insn->header.mask_control = BRW_MASK_DISABLE; */

      return insn;
   }
}
1602
1603/**
1604 * For pre-gen6, we patch BREAK/CONT instructions to point at the WHILE
1605 * instruction here.
1606 *
1607 * For gen6+, see brw_set_uip_jip(), which doesn't care so much about the loop
1608 * nesting, since it can always just point to the end of the block/current loop.
1609 */
static void
brw_patch_break_cont(struct brw_compile *p, struct brw_instruction *while_inst)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *do_inst = get_inner_do_insn(p);
   struct brw_instruction *inst;
   /* Jump counts are in 64-bit chunks: 2 per instruction on gen5. */
   int br = (intel->gen == 5) ? 2 : 1;

   /* Walk backwards from the WHILE to its matching DO. */
   for (inst = while_inst - 1; inst != do_inst; inst--) {
      /* If the jump count is != 0, that means that this instruction has already
       * been patched because it's part of a loop inside of the one we're
       * patching.
       */
      if (inst->header.opcode == BRW_OPCODE_BREAK &&
	  inst->bits3.if_else.jump_count == 0) {
	 /* BREAK jumps just past the WHILE. */
	 inst->bits3.if_else.jump_count = br * ((while_inst - inst) + 1);
      } else if (inst->header.opcode == BRW_OPCODE_CONTINUE &&
		 inst->bits3.if_else.jump_count == 0) {
	 /* CONTINUE jumps to the WHILE itself. */
	 inst->bits3.if_else.jump_count = br * (while_inst - inst);
      }
   }
}
1632
/* Close the innermost loop: emit the backward branch to its DO/loop
 * start, patch any pre-gen6 BREAK/CONT inside, and pop the loop stack.
 */
struct brw_instruction *brw_WHILE(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn, *do_insn;
   /* Jump offsets are in 64-bit chunks: 2 per instruction on gen5+. */
   GLuint br = 1;

   if (intel->gen >= 5)
      br = 2;

   if (intel->gen >= 7) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, brw_imm_ud(0));
      /* Backward branch: JIP is negative, pointing at the loop start. */
      insn->bits3.break_cont.jip = br * (do_insn - insn);

      insn->header.execution_size = BRW_EXECUTE_8;
   } else if (intel->gen == 6) {
      insn = next_insn(p, BRW_OPCODE_WHILE);
      do_insn = get_inner_do_insn(p);

      brw_set_dest(p, insn, brw_imm_w(0));
      insn->bits1.branch_gen6.jump_count = br * (do_insn - insn);
      brw_set_src0(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));
      brw_set_src1(p, insn, retype(brw_null_reg(), BRW_REGISTER_TYPE_D));

      insn->header.execution_size = BRW_EXECUTE_8;
   } else {
      if (p->single_program_flow) {
	 /* SPF mode: a plain scalar ADD on IP jumps back to the start
	  * (byte offset, 16 bytes per instruction).
	  */
	 insn = next_insn(p, BRW_OPCODE_ADD);
	 do_insn = get_inner_do_insn(p);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d((do_insn - insn) * 16));
	 insn->header.execution_size = BRW_EXECUTE_1;
      } else {
	 insn = next_insn(p, BRW_OPCODE_WHILE);
	 do_insn = get_inner_do_insn(p);

	 assert(do_insn->header.opcode == BRW_OPCODE_DO);

	 brw_set_dest(p, insn, brw_ip_reg());
	 brw_set_src0(p, insn, brw_ip_reg());
	 brw_set_src1(p, insn, brw_imm_d(0));

	 insn->header.execution_size = do_insn->header.execution_size;
	 /* Jump just past the DO so it isn't re-executed. */
	 insn->bits3.if_else.jump_count = br * (do_insn - insn + 1);
	 insn->bits3.if_else.pop_count = 0;
	 insn->bits3.if_else.pad0 = 0;

	 /* Now that the WHILE's position is known, fix up the BREAK and
	  * CONTINUE instructions inside the loop.
	  */
	 brw_patch_break_cont(p, insn);
      }
   }
   insn->header.compression_control = BRW_COMPRESSION_NONE;
   p->current->header.predicate_control = BRW_PREDICATE_NONE;

   p->loop_stack_depth--;

   return insn;
}
1696
1697
1698/* FORWARD JUMPS:
1699 */
1700void brw_land_fwd_jump(struct brw_compile *p, int jmp_insn_idx)
1701{
1702 struct intel_context *intel = &p->brw->intel;
1703 struct brw_instruction *jmp_insn = &p->store[jmp_insn_idx];
1704 GLuint jmpi = 1;
1705
1706 if (intel->gen >= 5)
1707 jmpi = 2;
1708
1709 assert(jmp_insn->header.opcode == BRW_OPCODE_JMPI);
1710 assert(jmp_insn->bits1.da1.src1_reg_file == BRW_IMMEDIATE_VALUE);
1711
1712 jmp_insn->bits3.ud = jmpi * (p->nr_insn - jmp_insn_idx - 1);
1713}
1714
1715
1716
1717/* To integrate with the above, it makes sense that the comparison
1718 * instruction should populate the flag register. It might be simpler
1719 * just to use the flag reg for most WM tasks?
1720 */
1721void brw_CMP(struct brw_compile *p,
1722 struct brw_reg dest,
1723 GLuint conditional,
1724 struct brw_reg src0,
1725 struct brw_reg src1)
1726{
1727 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_CMP);
1728
1729 insn->header.destreg__conditionalmod = conditional;
1730 brw_set_dest(p, insn, dest);
1731 brw_set_src0(p, insn, src0);
1732 brw_set_src1(p, insn, src1);
1733
1734/* guess_execution_size(insn, src0); */
1735
1736
1737 /* Make it so that future instructions will use the computed flag
1738 * value until brw_set_predicate_control_flag_value() is called
1739 * again.
1740 */
1741 if (dest.file == BRW_ARCHITECTURE_REGISTER_FILE &&
1742 dest.nr == 0) {
1743 p->current->header.predicate_control = BRW_PREDICATE_NORMAL;
1744 p->flag_value = 0xff;
1745 }
1746}
1747
1748/* Issue 'wait' instruction for n1, host could program MMIO
1749 to wake up thread. */
1750void brw_WAIT (struct brw_compile *p)
1751{
1752 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_WAIT);
1753 struct brw_reg src = brw_notification_1_reg();
1754
1755 brw_set_dest(p, insn, src);
1756 brw_set_src0(p, insn, src);
1757 brw_set_src1(p, insn, brw_null_reg());
1758 insn->header.execution_size = 0; /* must */
1759 insn->header.predicate_control = 0;
1760 insn->header.compression_control = 0;
1761}
1762
1763
1764/***********************************************************************
1765 * Helpers for the various SEND message types:
1766 */
1767
/** Extended math function, float[8].
 *
 * On gen6+ this is a native MATH instruction (same format as other
 * opcodes, with the function in the CondModifier/ThreadCtrl bits);
 * pre-gen6 it is a SEND to the shared math unit.
 *
 * \param function    BRW_MATH_FUNCTION_* selector
 * \param msg_reg_nr  message register for the pre-gen6 SEND form only
 * \param data_type   pre-gen6 message encoding only
 * \param precision   pre-gen6 message encoding only
 */
void brw_math( struct brw_compile *p,
	       struct brw_reg dest,
	       GLuint function,
	       GLuint msg_reg_nr,
	       struct brw_reg src,
	       GLuint data_type,
	       GLuint precision )
{
   struct intel_context *intel = &p->brw->intel;

   if (intel->gen >= 6) {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);

      assert(dest.file == BRW_GENERAL_REGISTER_FILE);
      assert(src.file == BRW_GENERAL_REGISTER_FILE);

      assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
      if (intel->gen == 6)
	 assert(src.hstride == BRW_HORIZONTAL_STRIDE_1);

      /* Source modifiers are ignored for extended math instructions on Gen6. */
      if (intel->gen == 6) {
	 assert(!src.negate);
	 assert(!src.abs);
      }

      /* The integer-divide functions take integer sources; all other
       * math functions take float.
       */
      if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
	  function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
	  function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
	 assert(src.type != BRW_REGISTER_TYPE_F);
      } else {
	 assert(src.type == BRW_REGISTER_TYPE_F);
      }

      /* Math is the same ISA format as other opcodes, except that CondModifier
       * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
       */
      insn->header.destreg__conditionalmod = function;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_src1(p, insn, brw_null_reg());
   } else {
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);

      /* Example code doesn't set predicate_control for send
       * instructions.
       */
      insn->header.predicate_control = 0;
      insn->header.destreg__conditionalmod = msg_reg_nr;

      brw_set_dest(p, insn, dest);
      brw_set_src0(p, insn, src);
      brw_set_math_message(p,
			   insn,
			   function,
			   src.type == BRW_REGISTER_TYPE_D,
			   precision,
			   data_type);
   }
}
1831
1832/** Extended math function, float[8].
1833 */
1834void brw_math2(struct brw_compile *p,
1835 struct brw_reg dest,
1836 GLuint function,
1837 struct brw_reg src0,
1838 struct brw_reg src1)
1839{
1840 struct intel_context *intel = &p->brw->intel;
1841 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_MATH);
1842
1843 assert(intel->gen >= 6);
1844 (void) intel;
1845
1846
1847 assert(dest.file == BRW_GENERAL_REGISTER_FILE);
1848 assert(src0.file == BRW_GENERAL_REGISTER_FILE);
1849 assert(src1.file == BRW_GENERAL_REGISTER_FILE);
1850
1851 assert(dest.hstride == BRW_HORIZONTAL_STRIDE_1);
1852 if (intel->gen == 6) {
1853 assert(src0.hstride == BRW_HORIZONTAL_STRIDE_1);
1854 assert(src1.hstride == BRW_HORIZONTAL_STRIDE_1);
1855 }
1856
1857 if (function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT ||
1858 function == BRW_MATH_FUNCTION_INT_DIV_REMAINDER ||
1859 function == BRW_MATH_FUNCTION_INT_DIV_QUOTIENT_AND_REMAINDER) {
1860 assert(src0.type != BRW_REGISTER_TYPE_F);
1861 assert(src1.type != BRW_REGISTER_TYPE_F);
1862 } else {
1863 assert(src0.type == BRW_REGISTER_TYPE_F);
1864 assert(src1.type == BRW_REGISTER_TYPE_F);
1865 }
1866
1867 /* Source modifiers are ignored for extended math instructions on Gen6. */
1868 if (intel->gen == 6) {
1869 assert(!src0.negate);
1870 assert(!src0.abs);
1871 assert(!src1.negate);
1872 assert(!src1.abs);
1873 }
1874
1875 /* Math is the same ISA format as other opcodes, except that CondModifier
1876 * becomes FC[3:0] and ThreadCtrl becomes FC[5:4].
1877 */
1878 insn->header.destreg__conditionalmod = function;
1879
1880 brw_set_dest(p, insn, dest);
1881 brw_set_src0(p, insn, src0);
1882 brw_set_src1(p, insn, src1);
1883}
1884
1885
1886/**
1887 * Write a block of OWORDs (half a GRF each) from the scratch buffer,
1888 * using a constant offset per channel.
1889 *
1890 * The offset must be aligned to oword size (16 bytes). Used for
1891 * register spilling.
1892 */
void brw_oword_block_write_scratch(struct brw_compile *p,
				   struct brw_reg mrf,
				   int num_regs,
				   GLuint offset)
{
   struct intel_context *intel = &p->brw->intel;
   uint32_t msg_control, msg_type;
   int mlen;

   /* Gen6+ takes the header offset in owords rather than bytes. */
   if (intel->gen >= 6)
      offset /= 16;

   mrf = retype(mrf, BRW_REGISTER_TYPE_UD);

   /* mlen counts the header MRF plus the data MRFs (one GRF = 2 owords). */
   if (num_regs == 1) {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
      mlen = 2;
   } else {
      msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
      mlen = 3;
   }

   /* Set up the message header. This is g0, with g0.2 filled with
    * the offset. We don't want to leave our offset around in g0 or
    * it'll screw up texture samples, so set it up inside the message
    * reg.
    */
   {
      brw_push_insn_state(p);
      brw_set_mask_control(p, BRW_MASK_DISABLE);
      brw_set_compression_control(p, BRW_COMPRESSION_NONE);

      brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));

      /* set message header global offset field (reg 0, element 2) */
      brw_MOV(p,
	      retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
				  mrf.nr,
				  2), BRW_REGISTER_TYPE_UD),
	      brw_imm_ud(offset));

      brw_pop_insn_state(p);
   }

   {
      struct brw_reg dest;
      struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
      int send_commit_msg;
      struct brw_reg src_header = retype(brw_vec8_grf(0, 0),
					 BRW_REGISTER_TYPE_UW);

      /* Force the send uncompressed; if the current state would have
       * compressed it, widen the header source to match.
       */
      if (insn->header.compression_control != BRW_COMPRESSION_NONE) {
	 insn->header.compression_control = BRW_COMPRESSION_NONE;
	 src_header = vec16(src_header);
      }
      assert(insn->header.predicate_control == BRW_PREDICATE_NONE);
      insn->header.destreg__conditionalmod = mrf.nr;

      /* Until gen6, writes followed by reads from the same location
       * are not guaranteed to be ordered unless write_commit is set.
       * If set, then a no-op write is issued to the destination
       * register to set a dependency, and a read from the destination
       * can be used to ensure the ordering.
       *
       * For gen6, only writes between different threads need ordering
       * protection. Our use of DP writes is all about register
       * spilling within a thread.
       */
      if (intel->gen >= 6) {
	 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
	 send_commit_msg = 0;
      } else {
	 dest = src_header;
	 send_commit_msg = 1;
      }

      brw_set_dest(p, insn, dest);
      /* Gen6+ sources the message payload from the MRF via src0. */
      if (intel->gen >= 6) {
	 brw_set_src0(p, insn, mrf);
      } else {
	 brw_set_src0(p, insn, brw_null_reg());
      }

      if (intel->gen >= 6)
	 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;
      else
	 msg_type = BRW_DATAPORT_WRITE_MESSAGE_OWORD_BLOCK_WRITE;

      brw_set_dp_write_message(p,
			       insn,
			       255, /* binding table index (255=stateless) */
			       msg_control,
			       msg_type,
			       mlen,
			       true, /* header_present */
			       0, /* not a render target */
			       send_commit_msg, /* response_length */
			       0, /* eot */
			       send_commit_msg);
   }
}
1994
1995
1996/**
1997 * Read a block of owords (half a GRF each) from the scratch buffer
1998 * using a constant index per channel.
1999 *
2000 * Offset must be aligned to oword size (16 bytes). Used for register
2001 * spilling.
2002 */
2003void
2004brw_oword_block_read_scratch(struct brw_compile *p,
2005 struct brw_reg dest,
2006 struct brw_reg mrf,
2007 int num_regs,
2008 GLuint offset)
2009{
2010 struct intel_context *intel = &p->brw->intel;
2011 uint32_t msg_control;
2012 int rlen;
2013
2014 if (intel->gen >= 6)
2015 offset /= 16;
2016
2017 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2018 dest = retype(dest, BRW_REGISTER_TYPE_UW);
2019
2020 if (num_regs == 1) {
2021 msg_control = BRW_DATAPORT_OWORD_BLOCK_2_OWORDS;
2022 rlen = 1;
2023 } else {
2024 msg_control = BRW_DATAPORT_OWORD_BLOCK_4_OWORDS;
2025 rlen = 2;
2026 }
2027
2028 {
2029 brw_push_insn_state(p);
2030 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2031 brw_set_mask_control(p, BRW_MASK_DISABLE);
2032
2033 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2034
2035 /* set message header global offset field (reg 0, element 2) */
2036 brw_MOV(p,
2037 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2038 mrf.nr,
2039 2), BRW_REGISTER_TYPE_UD),
2040 brw_imm_ud(offset));
2041
2042 brw_pop_insn_state(p);
2043 }
2044
2045 {
2046 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2047
2048 assert(insn->header.predicate_control == 0);
2049 insn->header.compression_control = BRW_COMPRESSION_NONE;
2050 insn->header.destreg__conditionalmod = mrf.nr;
2051
2052 brw_set_dest(p, insn, dest); /* UW? */
2053 if (intel->gen >= 6) {
2054 brw_set_src0(p, insn, mrf);
2055 } else {
2056 brw_set_src0(p, insn, brw_null_reg());
2057 }
2058
2059 brw_set_dp_read_message(p,
2060 insn,
2061 255, /* binding table index (255=stateless) */
2062 msg_control,
2063 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ, /* msg_type */
2064 BRW_DATAPORT_READ_TARGET_RENDER_CACHE,
2065 1, /* msg_length */
2066 true, /* header_present */
2067 rlen);
2068 }
2069}
2070
2071/**
2072 * Read a float[4] vector from the data port Data Cache (const buffer).
2073 * Location (in buffer) should be a multiple of 16.
2074 * Used for fetching shader constants.
2075 */
2076void brw_oword_block_read(struct brw_compile *p,
2077 struct brw_reg dest,
2078 struct brw_reg mrf,
2079 uint32_t offset,
2080 uint32_t bind_table_index)
2081{
2082 struct intel_context *intel = &p->brw->intel;
2083
2084 /* On newer hardware, offset is in units of owords. */
2085 if (intel->gen >= 6)
2086 offset /= 16;
2087
2088 mrf = retype(mrf, BRW_REGISTER_TYPE_UD);
2089
2090 brw_push_insn_state(p);
2091 brw_set_predicate_control(p, BRW_PREDICATE_NONE);
2092 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2093 brw_set_mask_control(p, BRW_MASK_DISABLE);
2094
2095 brw_MOV(p, mrf, retype(brw_vec8_grf(0, 0), BRW_REGISTER_TYPE_UD));
2096
2097 /* set message header global offset field (reg 0, element 2) */
2098 brw_MOV(p,
2099 retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
2100 mrf.nr,
2101 2), BRW_REGISTER_TYPE_UD),
2102 brw_imm_ud(offset));
2103
2104 struct brw_instruction *insn = next_insn(p, BRW_OPCODE_SEND);
2105 insn->header.destreg__conditionalmod = mrf.nr;
2106
2107 /* cast dest to a uword[8] vector */
2108 dest = retype(vec8(dest), BRW_REGISTER_TYPE_UW);
2109
2110 brw_set_dest(p, insn, dest);
2111 if (intel->gen >= 6) {
2112 brw_set_src0(p, insn, mrf);
2113 } else {
2114 brw_set_src0(p, insn, brw_null_reg());
2115 }
2116
2117 brw_set_dp_read_message(p,
2118 insn,
2119 bind_table_index,
2120 BRW_DATAPORT_OWORD_BLOCK_1_OWORDLOW,
2121 BRW_DATAPORT_READ_MESSAGE_OWORD_BLOCK_READ,
2122 BRW_DATAPORT_READ_TARGET_DATA_CACHE,
2123 1, /* msg_length */
2124 true, /* header_present */
2125 1); /* response_length (1 reg, 2 owords!) */
2126
2127 brw_pop_insn_state(p);
2128}
2129
2130
2131void brw_fb_WRITE(struct brw_compile *p,
2132 int dispatch_width,
2133 GLuint msg_reg_nr,
2134 struct brw_reg src0,
2135 GLuint msg_control,
2136 GLuint binding_table_index,
2137 GLuint msg_length,
2138 GLuint response_length,
2139 bool eot,
2140 bool header_present)
2141{
2142 struct intel_context *intel = &p->brw->intel;
2143 struct brw_instruction *insn;
2144 GLuint msg_type;
2145 struct brw_reg dest;
2146
2147 if (dispatch_width == 16)
2148 dest = retype(vec16(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2149 else
2150 dest = retype(vec8(brw_null_reg()), BRW_REGISTER_TYPE_UW);
2151
2152 if (intel->gen >= 6) {
2153 insn = next_insn(p, BRW_OPCODE_SENDC);
2154 } else {
2155 insn = next_insn(p, BRW_OPCODE_SEND);
2156 }
2157 /* The execution mask is ignored for render target writes. */
2158 insn->header.predicate_control = 0;
2159 insn->header.compression_control = BRW_COMPRESSION_NONE;
2160
2161 if (intel->gen >= 6) {
2162 /* headerless version, just submit color payload */
2163 src0 = brw_message_reg(msg_reg_nr);
2164
2165 msg_type = GEN6_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2166 } else {
2167 insn->header.destreg__conditionalmod = msg_reg_nr;
2168
2169 msg_type = BRW_DATAPORT_WRITE_MESSAGE_RENDER_TARGET_WRITE;
2170 }
2171
2172 brw_set_dest(p, insn, dest);
2173 brw_set_src0(p, insn, src0);
2174 brw_set_dp_write_message(p,
2175 insn,
2176 binding_table_index,
2177 msg_control,
2178 msg_type,
2179 msg_length,
2180 header_present,
2181 eot, /* last render target write */
2182 response_length,
2183 eot,
2184 0 /* send_commit_msg */);
2185}
2186
2187
2188/**
2189 * Texture sample instruction.
2190 * Note: the msg_type plus msg_length values determine exactly what kind
2191 * of sampling operation is performed. See volume 4, page 161 of docs.
2192 */
2193void brw_SAMPLE(struct brw_compile *p,
2194 struct brw_reg dest,
2195 GLuint msg_reg_nr,
2196 struct brw_reg src0,
2197 GLuint binding_table_index,
2198 GLuint sampler,
2199 GLuint writemask,
2200 GLuint msg_type,
2201 GLuint response_length,
2202 GLuint msg_length,
2203 GLuint header_present,
2204 GLuint simd_mode,
2205 GLuint return_format)
2206{
2207 struct intel_context *intel = &p->brw->intel;
2208 bool need_stall = 0;
2209
2210 if (writemask == 0) {
2211 /*printf("%s: zero writemask??\n", __FUNCTION__); */
2212 return;
2213 }
2214
2215 /* Hardware doesn't do destination dependency checking on send
2216 * instructions properly. Add a workaround which generates the
2217 * dependency by other means. In practice it seems like this bug
2218 * only crops up for texture samples, and only where registers are
2219 * written by the send and then written again later without being
2220 * read in between. Luckily for us, we already track that
2221 * information and use it to modify the writemask for the
2222 * instruction, so that is a guide for whether a workaround is
2223 * needed.
2224 */
2225 if (writemask != BRW_WRITEMASK_XYZW) {
2226 GLuint dst_offset = 0;
2227 GLuint i, newmask = 0, len = 0;
2228
2229 for (i = 0; i < 4; i++) {
2230 if (writemask & (1<<i))
2231 break;
2232 dst_offset += 2;
2233 }
2234 for (; i < 4; i++) {
2235 if (!(writemask & (1<<i)))
2236 break;
2237 newmask |= 1<<i;
2238 len++;
2239 }
2240
2241 if (newmask != writemask) {
2242 need_stall = 1;
2243 /* printf("need stall %x %x\n", newmask , writemask); */
2244 }
2245 else {
2246 bool dispatch_16 = false;
2247
2248 struct brw_reg m1 = brw_message_reg(msg_reg_nr);
2249
2250 guess_execution_size(p, p->current, dest);
2251 if (p->current->header.execution_size == BRW_EXECUTE_16)
2252 dispatch_16 = true;
2253
2254 newmask = ~newmask & BRW_WRITEMASK_XYZW;
2255
2256 brw_push_insn_state(p);
2257
2258 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2259 brw_set_mask_control(p, BRW_MASK_DISABLE);
2260
2261 brw_MOV(p, retype(m1, BRW_REGISTER_TYPE_UD),
2262 retype(brw_vec8_grf(0,0), BRW_REGISTER_TYPE_UD));
2263 brw_MOV(p, get_element_ud(m1, 2), brw_imm_ud(newmask << 12));
2264
2265 brw_pop_insn_state(p);
2266
2267 src0 = retype(brw_null_reg(), BRW_REGISTER_TYPE_UW);
2268 dest = offset(dest, dst_offset);
2269
2270 /* For 16-wide dispatch, masked channels are skipped in the
2271 * response. For 8-wide, masked channels still take up slots,
2272 * and are just not written to.
2273 */
2274 if (dispatch_16)
2275 response_length = len * 2;
2276 }
2277 }
2278
2279 {
2280 struct brw_instruction *insn;
2281
2282 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2283
2284 insn = next_insn(p, BRW_OPCODE_SEND);
2285 insn->header.predicate_control = 0; /* XXX */
2286 insn->header.compression_control = BRW_COMPRESSION_NONE;
2287 if (intel->gen < 6)
2288 insn->header.destreg__conditionalmod = msg_reg_nr;
2289
2290 brw_set_dest(p, insn, dest);
2291 brw_set_src0(p, insn, src0);
2292 brw_set_sampler_message(p, insn,
2293 binding_table_index,
2294 sampler,
2295 msg_type,
2296 response_length,
2297 msg_length,
2298 header_present,
2299 simd_mode,
2300 return_format);
2301 }
2302
2303 if (need_stall) {
2304 struct brw_reg reg = vec8(offset(dest, response_length-1));
2305
2306 /* mov (8) r9.0<1>:f r9.0<8;8,1>:f { Align1 }
2307 */
2308 brw_push_insn_state(p);
2309 brw_set_compression_control(p, BRW_COMPRESSION_NONE);
2310 brw_MOV(p, retype(reg, BRW_REGISTER_TYPE_UD),
2311 retype(reg, BRW_REGISTER_TYPE_UD));
2312 brw_pop_insn_state(p);
2313 }
2314
2315}
2316
2317/* All these variables are pretty confusing - we might be better off
2318 * using bitmasks and macros for this, in the old style. Or perhaps
2319 * just having the caller instantiate the fields in dword3 itself.
2320 */
void brw_urb_WRITE(struct brw_compile *p,
		   struct brw_reg dest,
		   GLuint msg_reg_nr,
		   struct brw_reg src0,
		   bool allocate,
		   bool used,
		   GLuint msg_length,
		   GLuint response_length,
		   bool eot,
		   bool writes_complete,
		   GLuint offset,
		   GLuint swizzle)
{
   struct intel_context *intel = &p->brw->intel;
   struct brw_instruction *insn;

   /* NOTE(review): presumably moves src0 into the implied message
    * register for Gen6+ SENDs — confirm against the helper's definition.
    */
   gen6_resolve_implied_move(p, &src0, msg_reg_nr);

   if (intel->gen == 7) {
      /* Enable Channel Masks in the URB_WRITE_HWORD message header */
      brw_push_insn_state(p);
      brw_set_access_mode(p, BRW_ALIGN_1);
      brw_OR(p, retype(brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE, msg_reg_nr, 5),
		       BRW_REGISTER_TYPE_UD),
	        retype(brw_vec1_grf(0, 5), BRW_REGISTER_TYPE_UD),
		brw_imm_ud(0xff00));
      brw_pop_insn_state(p);
   }

   insn = next_insn(p, BRW_OPCODE_SEND);

   assert(msg_length < BRW_MAX_MRF);

   brw_set_dest(p, insn, dest);
   brw_set_src0(p, insn, src0);
   brw_set_src1(p, insn, brw_imm_d(0));

   /* Pre-Gen6 SEND carries the message register number in the
    * destreg/conditionalmod field.
    */
   if (intel->gen < 6)
      insn->header.destreg__conditionalmod = msg_reg_nr;

   brw_set_urb_message(p,
		       insn,
		       allocate,
		       used,
		       msg_length,
		       response_length,
		       eot,
		       writes_complete,
		       offset,
		       swizzle);
}
2372
2373static int
2374next_ip(struct brw_compile *p, int ip)
2375{
2376 struct brw_instruction *insn = (void *)p->store + ip;
2377
2378 if (insn->header.cmpt_control)
2379 return ip + 8;
2380 else
2381 return ip + 16;
2382}
2383
2384static int
2385brw_find_next_block_end(struct brw_compile *p, int start)
2386{
2387 int ip;
2388 void *store = p->store;
2389
2390 for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
2391 struct brw_instruction *insn = store + ip;
2392
2393 switch (insn->header.opcode) {
2394 case BRW_OPCODE_ENDIF:
2395 case BRW_OPCODE_ELSE:
2396 case BRW_OPCODE_WHILE:
2397 case BRW_OPCODE_HALT:
2398 return ip;
2399 }
2400 }
2401
2402 return 0;
2403}
2404
2405/* There is no DO instruction on gen6, so to find the end of the loop
2406 * we have to see if the loop is jumping back before our start
2407 * instruction.
2408 */
static int
brw_find_loop_end(struct brw_compile *p, int start)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;   /* branch offsets are multiplied by 8 to get bytes */
   void *store = p->store;

   /* Always start after the instruction (such as a WHILE) we're trying to fix
    * up.
    */
   for (ip = next_ip(p, start); ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.opcode == BRW_OPCODE_WHILE) {
	 /* The backward-branch field differs by generation: Gen6 uses
	  * jump_count, Gen7+ uses JIP.
	  */
	 int jip = intel->gen == 6 ? insn->bits1.branch_gen6.jump_count
	 : insn->bits3.break_cont.jip;
	 /* A WHILE whose (negative) jump target lands at or before
	  * `start` is the one closing the loop that contains `start`.
	  */
	 if (ip + jip * scale <= start)
	    return ip;
      }
   }
   /* Callers only invoke this for instructions known to be inside a loop. */
   assert(!"not reached");
   return start;
}
2433
2434/* After program generation, go back and update the UIP and JIP of
2435 * BREAK, CONT, and HALT instructions to their correct locations.
2436 */
void
brw_set_uip_jip(struct brw_compile *p)
{
   struct intel_context *intel = &p->brw->intel;
   int ip;
   int scale = 8;   /* JIP/UIP values are in 8-byte units */
   void *store = p->store;

   /* Pre-Gen6 branch fixups are handled elsewhere; nothing to do. */
   if (intel->gen < 6)
      return;

   for (ip = 0; ip < p->next_insn_offset; ip = next_ip(p, ip)) {
      struct brw_instruction *insn = store + ip;

      if (insn->header.cmpt_control) {
	 /* Fixups for compacted BREAK/CONTINUE not supported yet. */
	 assert(insn->header.opcode != BRW_OPCODE_BREAK &&
		insn->header.opcode != BRW_OPCODE_CONTINUE &&
		insn->header.opcode != BRW_OPCODE_HALT);
	 continue;
      }

      int block_end_ip = brw_find_next_block_end(p, ip);
      switch (insn->header.opcode) {
      case BRW_OPCODE_BREAK:
         assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 /* Gen7 UIP points to WHILE; Gen6 points just after it */
	 insn->bits3.break_cont.uip =
	    (brw_find_loop_end(p, ip) - ip +
	     (intel->gen == 6 ? 16 : 0)) / scale;
	 break;
      case BRW_OPCODE_CONTINUE:
         assert(block_end_ip != 0);
	 insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
         insn->bits3.break_cont.uip =
            (brw_find_loop_end(p, ip) - ip) / scale;

	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;

      case BRW_OPCODE_ENDIF:
	 /* No terminator after this ENDIF: jump to the next instruction
	  * (2 * scale = 16 bytes, one full-size instruction).
	  */
	 if (block_end_ip == 0)
	    insn->bits3.break_cont.jip = 2;
	 else
	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 break;

      case BRW_OPCODE_HALT:
	 /* From the Sandy Bridge PRM (volume 4, part 2, section 8.3.19):
	  *
	  *    "In case of the halt instruction not inside any conditional
	  *     code block, the value of <JIP> and <UIP> should be the
	  *     same. In case of the halt instruction inside conditional code
	  *     block, the <UIP> should be the end of the program, and the
	  *     <JIP> should be end of the most inner conditional code block."
	  *
	  * The uip will have already been set by whoever set up the
	  * instruction.
	  */
	 if (block_end_ip == 0) {
	    insn->bits3.break_cont.jip = insn->bits3.break_cont.uip;
	 } else {
	    insn->bits3.break_cont.jip = (block_end_ip - ip) / scale;
	 }
	 assert(insn->bits3.break_cont.uip != 0);
	 assert(insn->bits3.break_cont.jip != 0);
	 break;
      }
   }
}
2509
2510void brw_ff_sync(struct brw_compile *p,
2511 struct brw_reg dest,
2512 GLuint msg_reg_nr,
2513 struct brw_reg src0,
2514 bool allocate,
2515 GLuint response_length,
2516 bool eot)
2517{
2518 struct intel_context *intel = &p->brw->intel;
2519 struct brw_instruction *insn;
2520
2521 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2522
2523 insn = next_insn(p, BRW_OPCODE_SEND);
2524 brw_set_dest(p, insn, dest);
2525 brw_set_src0(p, insn, src0);
2526 brw_set_src1(p, insn, brw_imm_d(0));
2527
2528 if (intel->gen < 6)
2529 insn->header.destreg__conditionalmod = msg_reg_nr;
2530
2531 brw_set_ff_sync_message(p,
2532 insn,
2533 allocate,
2534 response_length,
2535 eot);
2536}
2537
2538/**
2539 * Emit the SEND instruction necessary to generate stream output data on Gen6
2540 * (for transform feedback).
2541 *
2542 * If send_commit_msg is true, this is the last piece of stream output data
2543 * from this thread, so send the data as a committed write. According to the
2544 * Sandy Bridge PRM (volume 2 part 1, section 4.5.1):
2545 *
2546 * "Prior to End of Thread with a URB_WRITE, the kernel must ensure all
2547 * writes are complete by sending the final write as a committed write."
2548 */
2549void
2550brw_svb_write(struct brw_compile *p,
2551 struct brw_reg dest,
2552 GLuint msg_reg_nr,
2553 struct brw_reg src0,
2554 GLuint binding_table_index,
2555 bool send_commit_msg)
2556{
2557 struct brw_instruction *insn;
2558
2559 gen6_resolve_implied_move(p, &src0, msg_reg_nr);
2560
2561 insn = next_insn(p, BRW_OPCODE_SEND);
2562 brw_set_dest(p, insn, dest);
2563 brw_set_src0(p, insn, src0);
2564 brw_set_src1(p, insn, brw_imm_d(0));
2565 brw_set_dp_write_message(p, insn,
2566 binding_table_index,
2567 0, /* msg_control: ignored */
2568 GEN6_DATAPORT_WRITE_MESSAGE_STREAMED_VB_WRITE,
2569 1, /* msg_length */
2570 true, /* header_present */
2571 0, /* last_render_target: ignored */
2572 send_commit_msg, /* response_length */
2573 0, /* end_of_thread */
2574 send_commit_msg); /* send_commit_msg */
2575}
2576
2577/**
2578 * This instruction is generated as a single-channel align1 instruction by
2579 * both the VS and FS stages when using INTEL_DEBUG=shader_time.
2580 *
2581 * We can't use the typed atomic op in the FS because that has the execution
2582 * mask ANDed with the pixel mask, but we just want to write the one dword for
2583 * all the pixels.
2584 *
2585 * We don't use the SIMD4x2 atomic ops in the VS because want to just write
2586 * one u32. So we use the same untyped atomic write message as the pixel
2587 * shader.
2588 *
2589 * The untyped atomic operation requires a BUFFER surface type with RAW
2590 * format, and is only accessible through the legacy DATA_CACHE dataport
2591 * messages.
2592 */
void brw_shader_time_add(struct brw_compile *p,
                         int base_mrf,
                         uint32_t surf_index)
{
   struct intel_context *intel = &p->brw->intel;
   assert(intel->gen >= 7);

   /* Emit the SEND with align1/unmasked state, then restore the caller's
    * instruction state.
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   struct brw_instruction *send = brw_next_insn(p, BRW_OPCODE_SEND);
   brw_pop_insn_state(p);

   /* We use brw_vec1_reg and unmasked because we want to increment the given
    * offset only once.
    */
   brw_set_dest(p, send, brw_vec1_reg(BRW_ARCHITECTURE_REGISTER_FILE,
                                      BRW_ARF_NULL, 0));
   brw_set_src0(p, send, brw_vec1_reg(BRW_MESSAGE_REGISTER_FILE,
                                      base_mrf, 0));

   bool header_present = false;
   bool eot = false;
   uint32_t mlen = 2; /* offset, value */
   uint32_t rlen = 0;
   brw_set_message_descriptor(p, send,
                              GEN7_SFID_DATAPORT_DATA_CACHE,
                              mlen, rlen, header_present, eot);

   /* Hand-assembled untyped-atomic message descriptor bits.
    * NOTE(review): field positions assumed to match the Gen7 data-cache
    * untyped atomic message layout — confirm against the PRM.
    */
   send->bits3.ud |= 6 << 14; /* untyped atomic op */
   send->bits3.ud |= 0 << 13; /* no return data */
   send->bits3.ud |= 1 << 12; /* SIMD8 mode */
   send->bits3.ud |= BRW_AOP_ADD << 8;
   send->bits3.ud |= surf_index << 0;
}