Blame - src/mesa/shader/prog_execute.c - platform/external/mesa3d

blob: f466cc7affbb9b7642647dad7937ec4b24cb7a39 [file] [log] [blame]

Brian	13e3b21	2007-02-22 16:09:40 -0700	[diff] [blame^]	1	/*
				2	* Mesa 3-D graphics library
				3	* Version: 6.5.3
				4	*
				5	* Copyright (C) 1999-2007 Brian Paul All Rights Reserved.
				6	*
				7	* Permission is hereby granted, free of charge, to any person obtaining a
				8	* copy of this software and associated documentation files (the "Software"),
				9	* to deal in the Software without restriction, including without limitation
				10	* the rights to use, copy, modify, merge, publish, distribute, sublicense,
				11	* and/or sell copies of the Software, and to permit persons to whom the
				12	* Software is furnished to do so, subject to the following conditions:
				13	*
				14	* The above copyright notice and this permission notice shall be included
				15	* in all copies or substantial portions of the Software.
				16	*
				17	* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
				18	* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
				19	* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
				20	* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
				21	* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
				22	* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
				23	*/
				24
				25	/**
				26	* \file prog_execute.c
				27	* Software interpreter for vertex/fragment programs.
				28	* \author Brian Paul
				29	*/
				30
				31	/*
				32	* NOTE: we do everything in single-precision floating point; we don't
				33	* currently observe the single/half/fixed-precision qualifiers.
				34	*
				35	*/
				36
				37
				38	#include "glheader.h"
				39	#include "colormac.h"
				40	#include "context.h"
				41	#include "program.h"
				42	#include "prog_execute.h"
				43	#include "prog_instruction.h"
				44	#include "prog_parameter.h"
				45	#include "prog_print.h"
				46	#include "slang_library_noise.h"
				47
				48
				49	/* See comments below for info about this */
				50	#define LAMBDA_ZERO 1
				51
				52	/* debug predicate */
				53	#define DEBUG_PROG 0
				54
				55
				56	#if FEATURE_MESA_program_debug
				57	static struct gl_program_machine *CurrentMachine = NULL;
				58
				59	/**
				60	* For GL_MESA_program_debug.
				61	* Return current value (4*GLfloat) of a program register.
				62	* Called via ctx->Driver.GetFragmentProgramRegister().
				63	*/
				64	void
				65	_mesa_get_program_register(GLcontext *ctx, enum register_file file,
				66	GLuint index, GLfloat val[4])
				67	{
				68	if (CurrentMachine) {
				69	switch (file) {
				70	case PROGRAM_INPUT:
				71	if (CurrentMachine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
				72	COPY_4V(val, CurrentMachine->VertAttribs[index]);
				73	}
				74	else {
				75	COPY_4V(val,
				76	CurrentMachine->Attribs[index][CurrentMachine->CurElement]);
				77	}
				78	break;
				79	case PROGRAM_OUTPUT:
				80	COPY_4V(val, CurrentMachine->Outputs[index]);
				81	break;
				82	case PROGRAM_TEMPORARY:
				83	COPY_4V(val, CurrentMachine->Temporaries[index]);
				84	break;
				85	default:
				86	_mesa_problem(NULL,
				87	"bad register file in _swrast_get_program_register");
				88	}
				89	}
				90	}
				91	#endif /* FEATURE_MESA_program_debug */
				92
				93
				94
				95	/**
				96	* Return a pointer to the 4-element float vector specified by the given
				97	* source register.
				98	*/
				99	static INLINE const GLfloat *
				100	get_register_pointer( GLcontext *ctx,
				101	const struct prog_src_register *source,
				102	const struct gl_program_machine *machine)
				103	{
				104	/* XXX relative addressing... */
				105	switch (source->File) {
				106	case PROGRAM_TEMPORARY:
				107	ASSERT(source->Index < MAX_PROGRAM_TEMPS);
				108	return machine->Temporaries[source->Index];
				109
				110	case PROGRAM_INPUT:
				111	if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB) {
				112	ASSERT(source->Index < VERT_ATTRIB_MAX);
				113	return machine->VertAttribs[source->Index];
				114	}
				115	else {
				116	ASSERT(source->Index < FRAG_ATTRIB_MAX);
				117	return machine->Attribs[source->Index][machine->CurElement];
				118	}
				119
				120	case PROGRAM_OUTPUT:
				121	/* This is only for PRINT */
				122	ASSERT(source->Index < FRAG_RESULT_MAX);
				123	return machine->Outputs[source->Index];
				124
				125	case PROGRAM_LOCAL_PARAM:
				126	ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
				127	return machine->CurProgram->LocalParams[source->Index];
				128
				129	case PROGRAM_ENV_PARAM:
				130	ASSERT(source->Index < MAX_PROGRAM_ENV_PARAMS);
				131	if (machine->CurProgram->Target == GL_VERTEX_PROGRAM_ARB)
				132	return ctx->VertexProgram.Parameters[source->Index];
				133	else
				134	return ctx->FragmentProgram.Parameters[source->Index];
				135
				136	case PROGRAM_STATE_VAR:
				137	/* Fallthrough */
				138	case PROGRAM_CONSTANT:
				139	/* Fallthrough */
				140	case PROGRAM_UNIFORM:
				141	/* Fallthrough */
				142	case PROGRAM_NAMED_PARAM:
				143	ASSERT(source->Index <
				144	(GLint) machine->CurProgram->Parameters->NumParameters);
				145	return machine->CurProgram->Parameters->ParameterValues[source->Index];
				146
				147	default:
				148	_mesa_problem(ctx,
				149	"Invalid input register file %d in get_register_pointer()",
				150	source->File);
				151	return NULL;
				152	}
				153	}
				154
				155
				156	/**
				157	* Fetch a 4-element float vector from the given source register.
				158	* Apply swizzling and negating as needed.
				159	*/
				160	static void
				161	fetch_vector4( GLcontext *ctx,
				162	const struct prog_src_register *source,
				163	const struct gl_program_machine *machine,
				164	const struct gl_program *program,
				165	GLfloat result[4] )
				166	{
				167	const GLfloat *src = get_register_pointer(ctx, source, machine);
				168	ASSERT(src);
				169
				170	if (source->Swizzle == SWIZZLE_NOOP) {
				171	/* no swizzling */
				172	COPY_4V(result, src);
				173	}
				174	else {
				175	ASSERT(GET_SWZ(source->Swizzle, 0) <= 3);
				176	ASSERT(GET_SWZ(source->Swizzle, 1) <= 3);
				177	ASSERT(GET_SWZ(source->Swizzle, 2) <= 3);
				178	ASSERT(GET_SWZ(source->Swizzle, 3) <= 3);
				179	result[0] = src[GET_SWZ(source->Swizzle, 0)];
				180	result[1] = src[GET_SWZ(source->Swizzle, 1)];
				181	result[2] = src[GET_SWZ(source->Swizzle, 2)];
				182	result[3] = src[GET_SWZ(source->Swizzle, 3)];
				183	}
				184
				185	if (source->NegateBase) {
				186	result[0] = -result[0];
				187	result[1] = -result[1];
				188	result[2] = -result[2];
				189	result[3] = -result[3];
				190	}
				191	if (source->Abs) {
				192	result[0] = FABSF(result[0]);
				193	result[1] = FABSF(result[1]);
				194	result[2] = FABSF(result[2]);
				195	result[3] = FABSF(result[3]);
				196	}
				197	if (source->NegateAbs) {
				198	result[0] = -result[0];
				199	result[1] = -result[1];
				200	result[2] = -result[2];
				201	result[3] = -result[3];
				202	}
				203	}
				204
				205	#if 0
				206	/**
				207	* Fetch the derivative with respect to X or Y for the given register.
					* NOTE: this function is compiled out by the enclosing "#if 0".
					*
					* The derivative is taken from the span's per-attribute step arrays
					* (attrStepX when xOrY is 'X', attrStepY when it is 'Y'), selected by
					* source->Index.  The source register's swizzle, base negation,
					* absolute value and post-abs negation are then applied.
					*
					* \param ctx     rendering context; read for DrawBuffer->_DepthMaxF
					* \param source  source register; Index picks the fragment attribute
					* \param span    span providing attrStart / attrStepX / attrStepY
					* \param xOrY    'X' or 'Y', the derivative direction (asserted)
					* \param column  used only for the X derivative of the default
					*                (texcoord or varying) case
					* \param result  receives the four derivative components
				208	* \return GL_TRUE if it was easily computed or GL_FALSE if we
				209	* need to execute another instance of the program (ugh)!
					* As written, every path returns GL_TRUE.
				210	*/
				211	static GLboolean
				212	fetch_vector4_deriv( GLcontext *ctx,
				213	const struct prog_src_register *source,
				214	const SWspan *span,
				215	char xOrY, GLint column, GLfloat result[4] )
				216	{
				217	GLfloat src[4];
				218
				219	ASSERT(xOrY == 'X' \|\| xOrY == 'Y');
				220
					/* NOTE(review): source->File is never checked; Index is used
					 * directly as a FRAG_ATTRIB_* value - confirm with callers.
					 */
				221	switch (source->Index) {
				222	case FRAG_ATTRIB_WPOS:
					/* x and y step by exactly one pixel in their own direction;
					 * the z step is divided by the draw buffer's _DepthMaxF
					 */
				223	if (xOrY == 'X') {
				224	src[0] = 1.0;
				225	src[1] = 0.0;
				226	src[2] = span->attrStepX[FRAG_ATTRIB_WPOS][2]
				227	/ ctx->DrawBuffer->_DepthMaxF;
				228	src[3] = span->attrStepX[FRAG_ATTRIB_WPOS][3];
				229	}
				230	else {
				231	src[0] = 0.0;
				232	src[1] = 1.0;
				233	src[2] = span->attrStepY[FRAG_ATTRIB_WPOS][2]
				234	/ ctx->DrawBuffer->_DepthMaxF;
				235	src[3] = span->attrStepY[FRAG_ATTRIB_WPOS][3];
				236	}
				237	break;
				238	case FRAG_ATTRIB_COL0:
				239	case FRAG_ATTRIB_COL1:
					/* color steps are scaled by 1/CHAN_MAXF */
				240	if (xOrY == 'X') {
				241	src[0] = span->attrStepX[source->Index][0] * (1.0F / CHAN_MAXF);
				242	src[1] = span->attrStepX[source->Index][1] * (1.0F / CHAN_MAXF);
				243	src[2] = span->attrStepX[source->Index][2] * (1.0F / CHAN_MAXF);
				244	src[3] = span->attrStepX[source->Index][3] * (1.0F / CHAN_MAXF);
				245	}
				246	else {
				247	src[0] = span->attrStepY[source->Index][0] * (1.0F / CHAN_MAXF);
				248	src[1] = span->attrStepY[source->Index][1] * (1.0F / CHAN_MAXF);
				249	src[2] = span->attrStepY[source->Index][2] * (1.0F / CHAN_MAXF);
				250	src[3] = span->attrStepY[source->Index][3] * (1.0F / CHAN_MAXF);
				251	}
				252	break;
				253	case FRAG_ATTRIB_FOGC:
					/* only the first component carries a fog step; the rest are 0 */
				254	if (xOrY == 'X') {
				255	src[0] = span->attrStepX[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
				256	src[1] = 0.0;
				257	src[2] = 0.0;
				258	src[3] = 0.0;
				259	}
				260	else {
				261	src[0] = span->attrStepY[FRAG_ATTRIB_FOGC][0] * (1.0F / CHAN_MAXF);
				262	src[1] = 0.0;
				263	src[2] = 0.0;
				264	src[3] = 0.0;
				265	}
				266	break;
				267	default:
				268	assert(source->Index < FRAG_ATTRIB_MAX);
				269	/* texcoord or varying */
				270	if (xOrY == 'X') {
				271	/* this is a little tricky - I think I've got it right */
					/* steps are scaled by 1/q, with q evaluated at this column */
				272	const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
				273	+ span->attrStepX[source->Index][3] * column);
				274	src[0] = span->attrStepX[source->Index][0] * invQ;
				275	src[1] = span->attrStepX[source->Index][1] * invQ;
				276	src[2] = span->attrStepX[source->Index][2] * invQ;
				277	src[3] = span->attrStepX[source->Index][3] * invQ;
				278	}
				279	else {
				280	/* Tricky, as above, but in Y direction */
					/* NOTE(review): unlike the X branch, q here does not depend
					 * on column - confirm this is intended.
					 */
				281	const GLfloat invQ = 1.0f / (span->attrStart[source->Index][3]
				282	+ span->attrStepY[source->Index][3]);
				283	src[0] = span->attrStepY[source->Index][0] * invQ;
				284	src[1] = span->attrStepY[source->Index][1] * invQ;
				285	src[2] = span->attrStepY[source->Index][2] * invQ;
				286	src[3] = span->attrStepY[source->Index][3] * invQ;
				287	}
				288	break;
				289	}
				290
					/* apply the operand's swizzle (always, even for the identity) */
				291	result[0] = src[GET_SWZ(source->Swizzle, 0)];
				292	result[1] = src[GET_SWZ(source->Swizzle, 1)];
				293	result[2] = src[GET_SWZ(source->Swizzle, 2)];
				294	result[3] = src[GET_SWZ(source->Swizzle, 3)];
				295
					/* modifiers, in order: negate, absolute value, negate again */
				296	if (source->NegateBase) {
				297	result[0] = -result[0];
				298	result[1] = -result[1];
				299	result[2] = -result[2];
				300	result[3] = -result[3];
				301	}
				302	if (source->Abs) {
				303	result[0] = FABSF(result[0]);
				304	result[1] = FABSF(result[1]);
				305	result[2] = FABSF(result[2]);
				306	result[3] = FABSF(result[3]);
				307	}
				308	if (source->NegateAbs) {
				309	result[0] = -result[0];
				310	result[1] = -result[1];
				311	result[2] = -result[2];
				312	result[3] = -result[3];
				313	}
				314	return GL_TRUE;
				315	}
				316	#endif
				317
				318
				319	/**
				320	* As above, but only return result[0] element.
				321	*/
				322	static void
				323	fetch_vector1( GLcontext *ctx,
				324	const struct prog_src_register *source,
				325	const struct gl_program_machine *machine,
				326	const struct gl_program *program,
				327	GLfloat result[4] )
				328	{
				329	const GLfloat *src = get_register_pointer(ctx, source, machine);
				330	ASSERT(src);
				331
				332	result[0] = src[GET_SWZ(source->Swizzle, 0)];
				333
				334	if (source->NegateBase) {
				335	result[0] = -result[0];
				336	}
				337	if (source->Abs) {
				338	result[0] = FABSF(result[0]);
				339	}
				340	if (source->NegateAbs) {
				341	result[0] = -result[0];
				342	}
				343	}
				344
				345
				346	/**
				347	* Test value against zero and return GT, LT, EQ or UN if NaN.
				348	*/
				349	static INLINE GLuint
				350	generate_cc( float value )
				351	{
				352	if (value != value)
				353	return COND_UN; /* NaN */
				354	if (value > 0.0F)
				355	return COND_GT;
				356	if (value < 0.0F)
				357	return COND_LT;
				358	return COND_EQ;
				359	}
				360
				361
				362	/**
				363	* Test if the ccMaskRule is satisfied by the given condition code.
				364	* Used to mask destination writes according to the current condition code.
				365	*/
				366	static INLINE GLboolean
				367	test_cc(GLuint condCode, GLuint ccMaskRule)
				368	{
				369	switch (ccMaskRule) {
				370	case COND_EQ: return (condCode == COND_EQ);
				371	case COND_NE: return (condCode != COND_EQ);
				372	case COND_LT: return (condCode == COND_LT);
				373	case COND_GE: return (condCode == COND_GT \|\| condCode == COND_EQ);
				374	case COND_LE: return (condCode == COND_LT \|\| condCode == COND_EQ);
				375	case COND_GT: return (condCode == COND_GT);
				376	case COND_TR: return GL_TRUE;
				377	case COND_FL: return GL_FALSE;
				378	default: return GL_TRUE;
				379	}
				380	}
				381
				382
				383	/**
				384	* Evaluate the 4 condition codes against a predicate and return GL_TRUE
				385	* or GL_FALSE to indicate result.
				386	*/
				387	static INLINE GLboolean
				388	eval_condition(const struct gl_program_machine *machine,
				389	const struct prog_instruction *inst)
				390	{
				391	const GLuint swizzle = inst->DstReg.CondSwizzle;
				392	const GLuint condMask = inst->DstReg.CondMask;
				393	if (test_cc(machine->CondCodes[GET_SWZ(swizzle, 0)], condMask) \|\|
				394	test_cc(machine->CondCodes[GET_SWZ(swizzle, 1)], condMask) \|\|
				395	test_cc(machine->CondCodes[GET_SWZ(swizzle, 2)], condMask) \|\|
				396	test_cc(machine->CondCodes[GET_SWZ(swizzle, 3)], condMask)) {
				397	return GL_TRUE;
				398	}
				399	else {
				400	return GL_FALSE;
				401	}
				402	}
				403
				404
				405
				406	/**
				407	* Store 4 floats into a register. Observe the instructions saturate and
				408	* set-condition-code flags.
				409	*/
				410	static void
				411	store_vector4( const struct prog_instruction *inst,
				412	struct gl_program_machine *machine,
				413	const GLfloat value[4] )
				414	{
				415	const struct prog_dst_register *dest = &(inst->DstReg);
				416	const GLboolean clamp = inst->SaturateMode == SATURATE_ZERO_ONE;
				417	GLfloat *dstReg;
				418	GLfloat dummyReg[4];
				419	GLfloat clampedValue[4];
				420	GLuint writeMask = dest->WriteMask;
				421
				422	switch (dest->File) {
				423	case PROGRAM_OUTPUT:
				424	dstReg = machine->Outputs[dest->Index];
				425	break;
				426	case PROGRAM_TEMPORARY:
				427	dstReg = machine->Temporaries[dest->Index];
				428	break;
				429	case PROGRAM_WRITE_ONLY:
				430	dstReg = dummyReg;
				431	return;
				432	default:
				433	_mesa_problem(NULL, "bad register file in store_vector4(fp)");
				434	return;
				435	}
				436
				437	#if 0
				438	if (value[0] > 1.0e10 \|\|
				439	IS_INF_OR_NAN(value[0]) \|\|
				440	IS_INF_OR_NAN(value[1]) \|\|
				441	IS_INF_OR_NAN(value[2]) \|\|
				442	IS_INF_OR_NAN(value[3]) )
				443	printf("store %g %g %g %g\n", value[0], value[1], value[2], value[3]);
				444	#endif
				445
				446	if (clamp) {
				447	clampedValue[0] = CLAMP(value[0], 0.0F, 1.0F);
				448	clampedValue[1] = CLAMP(value[1], 0.0F, 1.0F);
				449	clampedValue[2] = CLAMP(value[2], 0.0F, 1.0F);
				450	clampedValue[3] = CLAMP(value[3], 0.0F, 1.0F);
				451	value = clampedValue;
				452	}
				453
				454	if (dest->CondMask != COND_TR) {
				455	/* condition codes may turn off some writes */
				456	if (writeMask & WRITEMASK_X) {
				457	if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 0)],
				458	dest->CondMask))
				459	writeMask &= ~WRITEMASK_X;
				460	}
				461	if (writeMask & WRITEMASK_Y) {
				462	if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 1)],
				463	dest->CondMask))
				464	writeMask &= ~WRITEMASK_Y;
				465	}
				466	if (writeMask & WRITEMASK_Z) {
				467	if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 2)],
				468	dest->CondMask))
				469	writeMask &= ~WRITEMASK_Z;
				470	}
				471	if (writeMask & WRITEMASK_W) {
				472	if (!test_cc(machine->CondCodes[GET_SWZ(dest->CondSwizzle, 3)],
				473	dest->CondMask))
				474	writeMask &= ~WRITEMASK_W;
				475	}
				476	}
				477
				478	if (writeMask & WRITEMASK_X)
				479	dstReg[0] = value[0];
				480	if (writeMask & WRITEMASK_Y)
				481	dstReg[1] = value[1];
				482	if (writeMask & WRITEMASK_Z)
				483	dstReg[2] = value[2];
				484	if (writeMask & WRITEMASK_W)
				485	dstReg[3] = value[3];
				486
				487	if (inst->CondUpdate) {
				488	if (writeMask & WRITEMASK_X)
				489	machine->CondCodes[0] = generate_cc(value[0]);
				490	if (writeMask & WRITEMASK_Y)
				491	machine->CondCodes[1] = generate_cc(value[1]);
				492	if (writeMask & WRITEMASK_Z)
				493	machine->CondCodes[2] = generate_cc(value[2]);
				494	if (writeMask & WRITEMASK_W)
				495	machine->CondCodes[3] = generate_cc(value[3]);
				496	}
				497	}
				498
				499
				500	#if 0
				501	/**
				502	* Initialize a new machine state instance from an existing one, adding
				503	* the partial derivatives onto the input registers.
				504	* Used to implement DDX and DDY instructions in non-trivial cases.
					* NOTE: this function is compiled out by the enclosing "#if 0".
					*
					* \param ctx       rendering context (not referenced in the body)
					* \param machine   machine state to copy from
					* \param program   fragment program; Base.Target and
					*                  Base.InputsRead are consulted
					* \param span      span providing attrStepX / attrStepY
					* \param xOrY      'X' or 'Y', the derivative direction (asserted)
					* \param dMachine  receives the copy; its condition codes are
					*                  reset to COND_EQ
					*
					* NOTE(review): apart from the initial copy and the final
					* condition-code reset, every write below goes through `machine`
					* (declared const), not `dMachine`.  That looks unintended -
					* confirm before re-enabling this code.
				505	*/
				506	static void
				507	init_machine_deriv( GLcontext *ctx,
				508	const struct gl_program_machine *machine,
				509	const struct gl_fragment_program *program,
				510	const SWspan *span, char xOrY,
				511	struct gl_program_machine *dMachine )
				512	{
				513	GLuint attr;
				514
				515	ASSERT(xOrY == 'X' \|\| xOrY == 'Y');
				516
				517	/* copy existing machine */
				518	_mesa_memcpy(dMachine, machine, sizeof(struct gl_program_machine));
				519
				520	if (program->Base.Target == GL_FRAGMENT_PROGRAM_NV) {
				521	/* XXX also need to do this when using valgrind */
				522	/* Clear temporary registers (undefined for ARB_f_p) */
					/* NOTE(review): this zeroes machine->Temporaries (the source),
					 * not dMachine->Temporaries.
					 */
				523	_mesa_bzero( (void*) machine->Temporaries,
				524	MAX_PROGRAM_TEMPS * 4 * sizeof(GLfloat));
				525	}
				526
				527	/* Add derivatives */
					/* window position: the chosen axis advances by one pixel, z and w
					 * advance by the span step
					 */
				528	if (program->Base.InputsRead & FRAG_BIT_WPOS) {
				529	GLfloat *wpos = machine->Attribs[FRAG_ATTRIB_WPOS][machine->CurElement];
				530	if (xOrY == 'X') {
				531	wpos[0] += 1.0F;
				532	wpos[1] += 0.0F;
				533	wpos[2] += span->attrStepX[FRAG_ATTRIB_WPOS][2];
				534	wpos[3] += span->attrStepX[FRAG_ATTRIB_WPOS][3];
				535	}
				536	else {
				537	wpos[0] += 0.0F;
				538	wpos[1] += 1.0F;
				539	wpos[2] += span->attrStepY[FRAG_ATTRIB_WPOS][2];
				540	wpos[3] += span->attrStepY[FRAG_ATTRIB_WPOS][3];
				541	}
				542	}
				543
				544	/* primary, secondary colors */
					/* color steps are scaled by 1/CHAN_MAXF before being added */
				545	for (attr = FRAG_ATTRIB_COL0; attr <= FRAG_ATTRIB_COL1; attr++) {
				546	if (program->Base.InputsRead & (1 << attr)) {
				547	GLfloat *col = machine->Attribs[attr][machine->CurElement];
				548	if (xOrY == 'X') {
				549	col[0] += span->attrStepX[attr][0] * (1.0F / CHAN_MAXF);
				550	col[1] += span->attrStepX[attr][1] * (1.0F / CHAN_MAXF);
				551	col[2] += span->attrStepX[attr][2] * (1.0F / CHAN_MAXF);
				552	col[3] += span->attrStepX[attr][3] * (1.0F / CHAN_MAXF);
				553	}
				554	else {
				555	col[0] += span->attrStepY[attr][0] * (1.0F / CHAN_MAXF);
				556	col[1] += span->attrStepY[attr][1] * (1.0F / CHAN_MAXF);
				557	col[2] += span->attrStepY[attr][2] * (1.0F / CHAN_MAXF);
				558	col[3] += span->attrStepY[attr][3] * (1.0F / CHAN_MAXF);
				559	}
				560	}
				561	}
					/* fog coordinate: only the first component is stepped */
				562	if (program->Base.InputsRead & FRAG_BIT_FOGC) {
				563	GLfloat *fogc = machine->Attribs[FRAG_ATTRIB_FOGC][machine->CurElement];
				564	if (xOrY == 'X') {
				565	fogc[0] += span->attrStepX[FRAG_ATTRIB_FOGC][0];
				566	}
				567	else {
				568	fogc[0] += span->attrStepY[FRAG_ATTRIB_FOGC][0];
				569	}
				570	}
				571	/* texcoord and varying vars */
				572	for (attr = FRAG_ATTRIB_TEX0; attr < FRAG_ATTRIB_MAX; attr++) {
				573	if (program->Base.InputsRead & (1 << attr)) {
				574	GLfloat *val = machine->Attribs[attr][machine->CurElement];
				575	/* XXX perspective-correct interpolation */
				576	if (xOrY == 'X') {
				577	val[0] += span->attrStepX[attr][0];
				578	val[1] += span->attrStepX[attr][1];
				579	val[2] += span->attrStepX[attr][2];
				580	val[3] += span->attrStepX[attr][3];
				581	}
				582	else {
				583	val[0] += span->attrStepY[attr][0];
				584	val[1] += span->attrStepY[attr][1];
				585	val[2] += span->attrStepY[attr][2];
				586	val[3] += span->attrStepY[attr][3];
				587	}
				588	}
				589	}
				590
				591	/* init condition codes */
				592	dMachine->CondCodes[0] = COND_EQ;
				593	dMachine->CondCodes[1] = COND_EQ;
				594	dMachine->CondCodes[2] = COND_EQ;
				595	dMachine->CondCodes[3] = COND_EQ;
				596	}
				597	#endif
				598
				599
				600	/**
				601	* Execute the given vertex/fragment program.
				602	*
				603	* \param ctx - rendering context
				604	* \param program - the fragment program to execute
				605	* \param machine - machine state (register file)
				606	* \param maxInst - max number of instructions to execute
				607	* \return GL_TRUE if program completed or GL_FALSE if program executed KIL.
				608	*/
				609	GLboolean
				610	_mesa_execute_program(GLcontext *ctx,
				611	const struct gl_program *program, GLuint maxInst,
				612	struct gl_program_machine *machine, GLuint element)
				613	{
				614	const GLuint MAX_EXEC = 10000;
				615	GLint pc, total = 0;
				616
				617	machine->CurProgram = program;
				618
				619	if (DEBUG_PROG) {
				620	printf("execute program %u --------------------\n", program->Id);
				621	}
				622
				623	#if FEATURE_MESA_program_debug
				624	CurrentMachine = machine;
				625	#endif
				626
				627	for (pc = 0; pc < maxInst; pc++) {
				628	const struct prog_instruction *inst = program->Instructions + pc;
				629
				630	#if FEATURE_MESA_program_debug
				631	if (ctx->FragmentProgram.CallbackEnabled &&
				632	ctx->FragmentProgram.Callback) {
				633	ctx->FragmentProgram.CurrentPosition = inst->StringPos;
				634	ctx->FragmentProgram.Callback(program->Target,
				635	ctx->FragmentProgram.CallbackData);
				636	}
				637	#endif
				638
				639	if (DEBUG_PROG) {
				640	_mesa_print_instruction(inst);
				641	}
				642
				643	switch (inst->Opcode) {
				644	case OPCODE_ABS:
				645	{
				646	GLfloat a[4], result[4];
				647	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				648	result[0] = FABSF(a[0]);
				649	result[1] = FABSF(a[1]);
				650	result[2] = FABSF(a[2]);
				651	result[3] = FABSF(a[3]);
				652	store_vector4( inst, machine, result );
				653	}
				654	break;
				655	case OPCODE_ADD:
				656	{
				657	GLfloat a[4], b[4], result[4];
				658	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				659	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				660	result[0] = a[0] + b[0];
				661	result[1] = a[1] + b[1];
				662	result[2] = a[2] + b[2];
				663	result[3] = a[3] + b[3];
				664	store_vector4( inst, machine, result );
				665	if (DEBUG_PROG) {
				666	printf("ADD (%g %g %g %g) = (%g %g %g %g) + (%g %g %g %g)\n",
				667	result[0], result[1], result[2], result[3],
				668	a[0], a[1], a[2], a[3],
				669	b[0], b[1], b[2], b[3]);
				670	}
				671	}
				672	break;
				673	case OPCODE_BGNLOOP:
				674	/* no-op */
				675	break;
				676	case OPCODE_ENDLOOP:
				677	/* subtract 1 here since pc is incremented by for(pc) loop */
				678	pc = inst->BranchTarget - 1; /* go to matching BNGLOOP */
				679	break;
				680	case OPCODE_BGNSUB: /* begin subroutine */
				681	break;
				682	case OPCODE_ENDSUB: /* end subroutine */
				683	break;
				684	case OPCODE_BRA: /* branch (conditional) */
				685	/* fall-through */
				686	case OPCODE_BRK: /* break out of loop (conditional) */
				687	/* fall-through */
				688	case OPCODE_CONT: /* continue loop (conditional) */
				689	if (eval_condition(machine, inst)) {
				690	/* take branch */
				691	/* Subtract 1 here since we'll do pc++ at end of for-loop */
				692	pc = inst->BranchTarget - 1;
				693	}
				694	break;
				695	case OPCODE_CAL: /* Call subroutine (conditional) */
				696	if (eval_condition(machine, inst)) {
				697	/* call the subroutine */
				698	if (machine->StackDepth >= MAX_PROGRAM_CALL_DEPTH) {
				699	return GL_TRUE; /* Per GL_NV_vertex_program2 spec */
				700	}
				701	machine->CallStack[machine->StackDepth++] = pc + 1;
				702	pc = inst->BranchTarget; /* XXX - 1 ??? */
				703	}
				704	break;
				705	case OPCODE_CMP:
				706	{
				707	GLfloat a[4], b[4], c[4], result[4];
				708	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				709	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				710	fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
				711	result[0] = a[0] < 0.0F ? b[0] : c[0];
				712	result[1] = a[1] < 0.0F ? b[1] : c[1];
				713	result[2] = a[2] < 0.0F ? b[2] : c[2];
				714	result[3] = a[3] < 0.0F ? b[3] : c[3];
				715	store_vector4( inst, machine, result );
				716	}
				717	break;
				718	case OPCODE_COS:
				719	{
				720	GLfloat a[4], result[4];
				721	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				722	result[0] = result[1] = result[2] = result[3]
				723	= (GLfloat) _mesa_cos(a[0]);
				724	store_vector4( inst, machine, result );
				725	}
				726	break;
				727	case OPCODE_DDX: /* Partial derivative with respect to X */
				728	{
				729	#if 0
				730	GLfloat a[4], aNext[4], result[4];
				731	struct gl_program_machine dMachine;
				732	if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'X',
				733	column, result)) {
				734	/* This is tricky. Make a copy of the current machine state,
				735	* increment the input registers by the dx or dy partial
				736	* derivatives, then re-execute the program up to the
				737	* preceding instruction, then fetch the source register.
				738	* Finally, find the difference in the register values for
				739	* the original and derivative runs.
				740	*/
				741	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
				742	init_machine_deriv(ctx, machine, program, span,
				743	'X', &dMachine);
				744	execute_program(ctx, program, pc, &dMachine, span, column);
				745	fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
				746	result[0] = aNext[0] - a[0];
				747	result[1] = aNext[1] - a[1];
				748	result[2] = aNext[2] - a[2];
				749	result[3] = aNext[3] - a[3];
				750	}
				751	store_vector4( inst, machine, result );
				752	#else
				753	static const GLfloat result[4] = { 0, 0, 0, 0 };
				754	store_vector4( inst, machine, result );
				755	#endif
				756	}
				757	break;
				758	case OPCODE_DDY: /* Partial derivative with respect to Y */
				759	{
				760	#if 0
				761	GLfloat a[4], aNext[4], result[4];
				762	struct gl_program_machine dMachine;
				763	if (!fetch_vector4_deriv(ctx, &inst->SrcReg[0], span, 'Y',
				764	column, result)) {
				765	init_machine_deriv(ctx, machine, program, span,
				766	'Y', &dMachine);
				767	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
				768	execute_program(ctx, program, pc, &dMachine, span, column);
				769	fetch_vector4( ctx, &inst->SrcReg[0], &dMachine, program, aNext );
				770	result[0] = aNext[0] - a[0];
				771	result[1] = aNext[1] - a[1];
				772	result[2] = aNext[2] - a[2];
				773	result[3] = aNext[3] - a[3];
				774	}
				775	store_vector4( inst, machine, result );
				776	#else
				777	static const GLfloat result[4] = { 0, 0, 0, 0 };
				778	store_vector4( inst, machine, result );
				779	#endif
				780	}
				781	break;
				782	case OPCODE_DP3:
				783	{
				784	GLfloat a[4], b[4], result[4];
				785	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				786	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				787	result[0] = result[1] = result[2] = result[3] = DOT3(a, b);
				788	store_vector4( inst, machine, result );
				789	if (DEBUG_PROG) {
				790	printf("DP3 %g = (%g %g %g) . (%g %g %g)\n",
				791	result[0], a[0], a[1], a[2], b[0], b[1], b[2]);
				792	}
				793	}
				794	break;
				795	case OPCODE_DP4:
				796	{
				797	GLfloat a[4], b[4], result[4];
				798	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				799	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				800	result[0] = result[1] = result[2] = result[3] = DOT4(a,b);
				801	store_vector4( inst, machine, result );
				802	if (DEBUG_PROG) {
				803	printf("DP4 %g = (%g, %g %g %g) . (%g, %g %g %g)\n",
				804	result[0], a[0], a[1], a[2], a[3],
				805	b[0], b[1], b[2], b[3]);
				806	}
				807	}
				808	break;
				809	case OPCODE_DPH:
				810	{
				811	GLfloat a[4], b[4], result[4];
				812	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				813	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				814	result[0] = result[1] = result[2] = result[3] =
				815	a[0] * b[0] + a[1] * b[1] + a[2] * b[2] + b[3];
				816	store_vector4( inst, machine, result );
				817	}
				818	break;
				819	case OPCODE_DST: /* Distance vector */
				820	{
				821	GLfloat a[4], b[4], result[4];
				822	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				823	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				824	result[0] = 1.0F;
				825	result[1] = a[1] * b[1];
				826	result[2] = a[2];
				827	result[3] = b[3];
				828	store_vector4( inst, machine, result );
				829	}
				830	break;
				831	case OPCODE_EX2: /* Exponential base 2 */
				832	{
				833	GLfloat a[4], result[4];
				834	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				835	result[0] = result[1] = result[2] = result[3] =
				836	(GLfloat) _mesa_pow(2.0, a[0]);
				837	store_vector4( inst, machine, result );
				838	}
				839	break;
				840	case OPCODE_FLR:
				841	{
				842	GLfloat a[4], result[4];
				843	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				844	result[0] = FLOORF(a[0]);
				845	result[1] = FLOORF(a[1]);
				846	result[2] = FLOORF(a[2]);
				847	result[3] = FLOORF(a[3]);
				848	store_vector4( inst, machine, result );
				849	}
				850	break;
				851	case OPCODE_FRC:
				852	{
				853	GLfloat a[4], result[4];
				854	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				855	result[0] = a[0] - FLOORF(a[0]);
				856	result[1] = a[1] - FLOORF(a[1]);
				857	result[2] = a[2] - FLOORF(a[2]);
				858	result[3] = a[3] - FLOORF(a[3]);
				859	store_vector4( inst, machine, result );
				860	}
				861	break;
				862	case OPCODE_IF:
				863	if (eval_condition(machine, inst)) {
				864	/* do if-clause (just continue execution) */
				865	}
				866	else {
				867	/* go to the instruction after ELSE or ENDIF */
				868	assert(inst->BranchTarget >= 0);
				869	pc = inst->BranchTarget - 1;
				870	}
				871	break;
				872	case OPCODE_ELSE:
				873	/* goto ENDIF */
				874	assert(inst->BranchTarget >= 0);
				875	pc = inst->BranchTarget - 1;
				876	break;
				877	case OPCODE_ENDIF:
				878	/* nothing */
				879	break;
				880	case OPCODE_INT: /* float to int */
				881	{
				882	GLfloat a[4], result[4];
				883	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				884	result[0] = (GLfloat) (GLint) a[0];
				885	result[1] = (GLfloat) (GLint) a[1];
				886	result[2] = (GLfloat) (GLint) a[2];
				887	result[3] = (GLfloat) (GLint) a[3];
				888	store_vector4( inst, machine, result );
				889	}
				890	break;
				891	case OPCODE_KIL_NV: /* NV_f_p only (conditional) */
				892	if (eval_condition(machine, inst)) {
				893	return GL_FALSE;
				894	}
				895	break;
				896	case OPCODE_KIL: /* ARB_f_p only */
				897	{
				898	GLfloat a[4];
				899	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				900	if (a[0] < 0.0F \|\| a[1] < 0.0F \|\| a[2] < 0.0F \|\| a[3] < 0.0F) {
				901	return GL_FALSE;
				902	}
				903	}
				904	break;
				905	case OPCODE_LG2: /* log base 2 */
				906	{
				907	GLfloat a[4], result[4];
				908	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				909	result[0] = result[1] = result[2] = result[3] = LOG2(a[0]);
				910	store_vector4( inst, machine, result );
				911	}
				912	break;
				913	case OPCODE_LIT:
				914	{
				915	const GLfloat epsilon = 1.0F / 256.0F; /* from NV VP spec */
				916	GLfloat a[4], result[4];
				917	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				918	a[0] = MAX2(a[0], 0.0F);
				919	a[1] = MAX2(a[1], 0.0F);
				920	/* XXX ARB version clamps a[3], NV version doesn't */
				921	a[3] = CLAMP(a[3], -(128.0F - epsilon), (128.0F - epsilon));
				922	result[0] = 1.0F;
				923	result[1] = a[0];
				924	/* XXX we could probably just use pow() here */
				925	if (a[0] > 0.0F) {
				926	if (a[1] == 0.0 && a[3] == 0.0)
				927	result[2] = 1.0;
				928	else
				929	result[2] = EXPF(a[3] * LOGF(a[1]));
				930	}
				931	else {
				932	result[2] = 0.0;
				933	}
				934	result[3] = 1.0F;
				935	store_vector4( inst, machine, result );
				936	if (DEBUG_PROG) {
				937	printf("LIT (%g %g %g %g) : (%g %g %g %g)\n",
				938	result[0], result[1], result[2], result[3],
				939	a[0], a[1], a[2], a[3]);
				940	}
				941	}
				942	break;
				943	case OPCODE_LRP:
				944	{
				945	GLfloat a[4], b[4], c[4], result[4];
				946	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				947	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				948	fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
				949	result[0] = a[0] * b[0] + (1.0F - a[0]) * c[0];
				950	result[1] = a[1] * b[1] + (1.0F - a[1]) * c[1];
				951	result[2] = a[2] * b[2] + (1.0F - a[2]) * c[2];
				952	result[3] = a[3] * b[3] + (1.0F - a[3]) * c[3];
				953	store_vector4( inst, machine, result );
				954	if (DEBUG_PROG) {
				955	printf("LRP (%g %g %g %g) = (%g %g %g %g), "
				956	"(%g %g %g %g), (%g %g %g %g)\n",
				957	result[0], result[1], result[2], result[3],
				958	a[0], a[1], a[2], a[3],
				959	b[0], b[1], b[2], b[3],
				960	c[0], c[1], c[2], c[3]);
				961	}
				962	}
				963	break;
				964	case OPCODE_MAD:
				965	{
				966	GLfloat a[4], b[4], c[4], result[4];
				967	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				968	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				969	fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
				970	result[0] = a[0] * b[0] + c[0];
				971	result[1] = a[1] * b[1] + c[1];
				972	result[2] = a[2] * b[2] + c[2];
				973	result[3] = a[3] * b[3] + c[3];
				974	store_vector4( inst, machine, result );
				975	if (DEBUG_PROG) {
				976	printf("MAD (%g %g %g %g) = (%g %g %g %g) * "
				977	"(%g %g %g %g) + (%g %g %g %g)\n",
				978	result[0], result[1], result[2], result[3],
				979	a[0], a[1], a[2], a[3],
				980	b[0], b[1], b[2], b[3],
				981	c[0], c[1], c[2], c[3]);
				982	}
				983	}
				984	break;
				985	case OPCODE_MAX:
				986	{
				987	GLfloat a[4], b[4], result[4];
				988	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				989	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				990	result[0] = MAX2(a[0], b[0]);
				991	result[1] = MAX2(a[1], b[1]);
				992	result[2] = MAX2(a[2], b[2]);
				993	result[3] = MAX2(a[3], b[3]);
				994	store_vector4( inst, machine, result );
				995	if (DEBUG_PROG) {
				996	printf("MAX (%g %g %g %g) = (%g %g %g %g), (%g %g %g %g)\n",
				997	result[0], result[1], result[2], result[3],
				998	a[0], a[1], a[2], a[3],
				999	b[0], b[1], b[2], b[3]);
				1000	}
				1001	}
				1002	break;
				1003	case OPCODE_MIN:
				1004	{
				1005	GLfloat a[4], b[4], result[4];
				1006	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1007	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1008	result[0] = MIN2(a[0], b[0]);
				1009	result[1] = MIN2(a[1], b[1]);
				1010	result[2] = MIN2(a[2], b[2]);
				1011	result[3] = MIN2(a[3], b[3]);
				1012	store_vector4( inst, machine, result );
				1013	}
				1014	break;
				1015	case OPCODE_MOV:
				1016	{
				1017	GLfloat result[4];
				1018	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, result );
				1019	store_vector4( inst, machine, result );
				1020	if (DEBUG_PROG) {
				1021	printf("MOV (%g %g %g %g)\n",
				1022	result[0], result[1], result[2], result[3]);
				1023	}
				1024	}
				1025	break;
				1026	case OPCODE_MUL:
				1027	{
				1028	GLfloat a[4], b[4], result[4];
				1029	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1030	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1031	result[0] = a[0] * b[0];
				1032	result[1] = a[1] * b[1];
				1033	result[2] = a[2] * b[2];
				1034	result[3] = a[3] * b[3];
				1035	store_vector4( inst, machine, result );
				1036	if (DEBUG_PROG) {
				1037	printf("MUL (%g %g %g %g) = (%g %g %g %g) * (%g %g %g %g)\n",
				1038	result[0], result[1], result[2], result[3],
				1039	a[0], a[1], a[2], a[3],
				1040	b[0], b[1], b[2], b[3]);
				1041	}
				1042	}
				1043	break;
				1044	case OPCODE_NOISE1:
				1045	{
				1046	GLfloat a[4], result[4];
				1047	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1048	result[0] =
				1049	result[1] =
				1050	result[2] =
				1051	result[3] = _slang_library_noise1(a[0]);
				1052	store_vector4( inst, machine, result );
				1053	}
				1054	break;
				1055	case OPCODE_NOISE2:
				1056	{
				1057	GLfloat a[4], result[4];
				1058	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1059	result[0] =
				1060	result[1] =
				1061	result[2] =
				1062	result[3] = _slang_library_noise2(a[0], a[1]);
				1063	store_vector4( inst, machine, result );
				1064	}
				1065	break;
				1066	case OPCODE_NOISE3:
				1067	{
				1068	GLfloat a[4], result[4];
				1069	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1070	result[0] =
				1071	result[1] =
				1072	result[2] =
				1073	result[3] = _slang_library_noise3(a[0], a[1], a[2]);
				1074	store_vector4( inst, machine, result );
				1075	}
				1076	break;
				1077	case OPCODE_NOISE4:
				1078	{
				1079	GLfloat a[4], result[4];
				1080	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1081	result[0] =
				1082	result[1] =
				1083	result[2] =
				1084	result[3] = _slang_library_noise4(a[0], a[1], a[2], a[3]);
				1085	store_vector4( inst, machine, result );
				1086	}
				1087	break;
				1088	case OPCODE_NOP:
				1089	break;
				1090	case OPCODE_PK2H: /* pack two 16-bit floats in one 32-bit float */
				1091	{
				1092	GLfloat a[4], result[4];
				1093	GLhalfNV hx, hy;
				1094	GLuint rawResult = (GLuint ) result;
				1095	GLuint twoHalves;
				1096	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1097	hx = _mesa_float_to_half(a[0]);
				1098	hy = _mesa_float_to_half(a[1]);
				1099	twoHalves = hx \| (hy << 16);
				1100	rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
				1101	= twoHalves;
				1102	store_vector4( inst, machine, result );
				1103	}
				1104	break;
				1105	case OPCODE_PK2US: /* pack two GLushorts into one 32-bit float */
				1106	{
				1107	GLfloat a[4], result[4];
				1108	GLuint usx, usy, rawResult = (GLuint ) result;
				1109	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1110	a[0] = CLAMP(a[0], 0.0F, 1.0F);
				1111	a[1] = CLAMP(a[1], 0.0F, 1.0F);
				1112	usx = IROUND(a[0] * 65535.0F);
				1113	usy = IROUND(a[1] * 65535.0F);
				1114	rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
				1115	= usx \| (usy << 16);
				1116	store_vector4( inst, machine, result );
				1117	}
				1118	break;
				1119	case OPCODE_PK4B: /* pack four GLbytes into one 32-bit float */
				1120	{
				1121	GLfloat a[4], result[4];
				1122	GLuint ubx, uby, ubz, ubw, rawResult = (GLuint ) result;
				1123	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1124	a[0] = CLAMP(a[0], -128.0F / 127.0F, 1.0F);
				1125	a[1] = CLAMP(a[1], -128.0F / 127.0F, 1.0F);
				1126	a[2] = CLAMP(a[2], -128.0F / 127.0F, 1.0F);
				1127	a[3] = CLAMP(a[3], -128.0F / 127.0F, 1.0F);
				1128	ubx = IROUND(127.0F * a[0] + 128.0F);
				1129	uby = IROUND(127.0F * a[1] + 128.0F);
				1130	ubz = IROUND(127.0F * a[2] + 128.0F);
				1131	ubw = IROUND(127.0F * a[3] + 128.0F);
				1132	rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
				1133	= ubx \| (uby << 8) \| (ubz << 16) \| (ubw << 24);
				1134	store_vector4( inst, machine, result );
				1135	}
				1136	break;
				1137	case OPCODE_PK4UB: /* pack four GLubytes into one 32-bit float */
				1138	{
				1139	GLfloat a[4], result[4];
				1140	GLuint ubx, uby, ubz, ubw, rawResult = (GLuint ) result;
				1141	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1142	a[0] = CLAMP(a[0], 0.0F, 1.0F);
				1143	a[1] = CLAMP(a[1], 0.0F, 1.0F);
				1144	a[2] = CLAMP(a[2], 0.0F, 1.0F);
				1145	a[3] = CLAMP(a[3], 0.0F, 1.0F);
				1146	ubx = IROUND(255.0F * a[0]);
				1147	uby = IROUND(255.0F * a[1]);
				1148	ubz = IROUND(255.0F * a[2]);
				1149	ubw = IROUND(255.0F * a[3]);
				1150	rawResult[0] = rawResult[1] = rawResult[2] = rawResult[3]
				1151	= ubx \| (uby << 8) \| (ubz << 16) \| (ubw << 24);
				1152	store_vector4( inst, machine, result );
				1153	}
				1154	break;
				1155	case OPCODE_POW:
				1156	{
				1157	GLfloat a[4], b[4], result[4];
				1158	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1159	fetch_vector1( ctx, &inst->SrcReg[1], machine, program, b );
				1160	result[0] = result[1] = result[2] = result[3]
				1161	= (GLfloat)_mesa_pow(a[0], b[0]);
				1162	store_vector4( inst, machine, result );
				1163	}
				1164	break;
				1165	case OPCODE_RCP:
				1166	{
				1167	GLfloat a[4], result[4];
				1168	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1169	if (DEBUG_PROG) {
				1170	if (a[0] == 0)
				1171	printf("RCP(0)\n");
				1172	else if (IS_INF_OR_NAN(a[0]))
				1173	printf("RCP(inf)\n");
				1174	}
				1175	result[0] = result[1] = result[2] = result[3] = 1.0F / a[0];
				1176	store_vector4( inst, machine, result );
				1177	}
				1178	break;
				1179	case OPCODE_RET: /* return from subroutine (conditional) */
				1180	if (eval_condition(machine, inst)) {
				1181	if (machine->StackDepth == 0) {
				1182	return GL_TRUE; /* Per GL_NV_vertex_program2 spec */
				1183	}
				1184	pc = machine->CallStack[--machine->StackDepth];
				1185	}
				1186	break;
				1187	case OPCODE_RFL: /* reflection vector */
				1188	{
				1189	GLfloat axis[4], dir[4], result[4], tmpX, tmpW;
				1190	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, axis );
				1191	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dir );
				1192	tmpW = DOT3(axis, axis);
				1193	tmpX = (2.0F * DOT3(axis, dir)) / tmpW;
				1194	result[0] = tmpX * axis[0] - dir[0];
				1195	result[1] = tmpX * axis[1] - dir[1];
				1196	result[2] = tmpX * axis[2] - dir[2];
				1197	/* result[3] is never written! XXX enforce in parser! */
				1198	store_vector4( inst, machine, result );
				1199	}
				1200	break;
				1201	case OPCODE_RSQ: /* 1 / sqrt() */
				1202	{
				1203	GLfloat a[4], result[4];
				1204	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1205	a[0] = FABSF(a[0]);
				1206	result[0] = result[1] = result[2] = result[3] = INV_SQRTF(a[0]);
				1207	store_vector4( inst, machine, result );
				1208	if (DEBUG_PROG) {
				1209	printf("RSQ %g = 1/sqrt(\|%g\|)\n", result[0], a[0]);
				1210	}
				1211	}
				1212	break;
				1213	case OPCODE_SCS: /* sine and cos */
				1214	{
				1215	GLfloat a[4], result[4];
				1216	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1217	result[0] = (GLfloat) _mesa_cos(a[0]);
				1218	result[1] = (GLfloat) _mesa_sin(a[0]);
				1219	result[2] = 0.0; /* undefined! */
				1220	result[3] = 0.0; /* undefined! */
				1221	store_vector4( inst, machine, result );
				1222	}
				1223	break;
				1224	case OPCODE_SEQ: /* set on equal */
				1225	{
				1226	GLfloat a[4], b[4], result[4];
				1227	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1228	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1229	result[0] = (a[0] == b[0]) ? 1.0F : 0.0F;
				1230	result[1] = (a[1] == b[1]) ? 1.0F : 0.0F;
				1231	result[2] = (a[2] == b[2]) ? 1.0F : 0.0F;
				1232	result[3] = (a[3] == b[3]) ? 1.0F : 0.0F;
				1233	store_vector4( inst, machine, result );
				1234	}
				1235	break;
				1236	case OPCODE_SFL: /* set false, operands ignored */
				1237	{
				1238	static const GLfloat result[4] = { 0.0F, 0.0F, 0.0F, 0.0F };
				1239	store_vector4( inst, machine, result );
				1240	}
				1241	break;
				1242	case OPCODE_SGE: /* set on greater or equal */
				1243	{
				1244	GLfloat a[4], b[4], result[4];
				1245	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1246	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1247	result[0] = (a[0] >= b[0]) ? 1.0F : 0.0F;
				1248	result[1] = (a[1] >= b[1]) ? 1.0F : 0.0F;
				1249	result[2] = (a[2] >= b[2]) ? 1.0F : 0.0F;
				1250	result[3] = (a[3] >= b[3]) ? 1.0F : 0.0F;
				1251	store_vector4( inst, machine, result );
				1252	}
				1253	break;
				1254	case OPCODE_SGT: /* set on greater */
				1255	{
				1256	GLfloat a[4], b[4], result[4];
				1257	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1258	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1259	result[0] = (a[0] > b[0]) ? 1.0F : 0.0F;
				1260	result[1] = (a[1] > b[1]) ? 1.0F : 0.0F;
				1261	result[2] = (a[2] > b[2]) ? 1.0F : 0.0F;
				1262	result[3] = (a[3] > b[3]) ? 1.0F : 0.0F;
				1263	store_vector4( inst, machine, result );
				1264	if (DEBUG_PROG) {
				1265	printf("SGT %g %g %g %g\n",
				1266	result[0], result[1], result[2], result[3]);
				1267	}
				1268	}
				1269	break;
				1270	case OPCODE_SIN:
				1271	{
				1272	GLfloat a[4], result[4];
				1273	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1274	result[0] = result[1] = result[2] = result[3]
				1275	= (GLfloat) _mesa_sin(a[0]);
				1276	store_vector4( inst, machine, result );
				1277	}
				1278	break;
				1279	case OPCODE_SLE: /* set on less or equal */
				1280	{
				1281	GLfloat a[4], b[4], result[4];
				1282	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1283	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1284	result[0] = (a[0] <= b[0]) ? 1.0F : 0.0F;
				1285	result[1] = (a[1] <= b[1]) ? 1.0F : 0.0F;
				1286	result[2] = (a[2] <= b[2]) ? 1.0F : 0.0F;
				1287	result[3] = (a[3] <= b[3]) ? 1.0F : 0.0F;
				1288	store_vector4( inst, machine, result );
				1289	}
				1290	break;
				1291	case OPCODE_SLT: /* set on less */
				1292	{
				1293	GLfloat a[4], b[4], result[4];
				1294	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1295	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1296	result[0] = (a[0] < b[0]) ? 1.0F : 0.0F;
				1297	result[1] = (a[1] < b[1]) ? 1.0F : 0.0F;
				1298	result[2] = (a[2] < b[2]) ? 1.0F : 0.0F;
				1299	result[3] = (a[3] < b[3]) ? 1.0F : 0.0F;
				1300	store_vector4( inst, machine, result );
				1301	}
				1302	break;
				1303	case OPCODE_SNE: /* set on not equal */
				1304	{
				1305	GLfloat a[4], b[4], result[4];
				1306	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1307	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1308	result[0] = (a[0] != b[0]) ? 1.0F : 0.0F;
				1309	result[1] = (a[1] != b[1]) ? 1.0F : 0.0F;
				1310	result[2] = (a[2] != b[2]) ? 1.0F : 0.0F;
				1311	result[3] = (a[3] != b[3]) ? 1.0F : 0.0F;
				1312	store_vector4( inst, machine, result );
				1313	}
				1314	break;
				1315	case OPCODE_STR: /* set true, operands ignored */
				1316	{
				1317	static const GLfloat result[4] = { 1.0F, 1.0F, 1.0F, 1.0F };
				1318	store_vector4( inst, machine, result );
				1319	}
				1320	break;
				1321	case OPCODE_SUB:
				1322	{
				1323	GLfloat a[4], b[4], result[4];
				1324	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1325	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1326	result[0] = a[0] - b[0];
				1327	result[1] = a[1] - b[1];
				1328	result[2] = a[2] - b[2];
				1329	result[3] = a[3] - b[3];
				1330	store_vector4( inst, machine, result );
				1331	if (DEBUG_PROG) {
				1332	printf("SUB (%g %g %g %g) = (%g %g %g %g) - (%g %g %g %g)\n",
				1333	result[0], result[1], result[2], result[3],
				1334	a[0], a[1], a[2], a[3], b[0], b[1], b[2], b[3]);
				1335	}
				1336	}
				1337	break;
				1338	case OPCODE_SWZ: /* extended swizzle */
				1339	{
				1340	const struct prog_src_register *source = &inst->SrcReg[0];
				1341	const GLfloat *src = get_register_pointer(ctx, source, machine);
				1342	GLfloat result[4];
				1343	GLuint i;
				1344	for (i = 0; i < 4; i++) {
				1345	const GLuint swz = GET_SWZ(source->Swizzle, i);
				1346	if (swz == SWIZZLE_ZERO)
				1347	result[i] = 0.0;
				1348	else if (swz == SWIZZLE_ONE)
				1349	result[i] = 1.0;
				1350	else {
				1351	ASSERT(swz >= 0);
				1352	ASSERT(swz <= 3);
				1353	result[i] = src[swz];
				1354	}
				1355	if (source->NegateBase & (1 << i))
				1356	result[i] = -result[i];
				1357	}
				1358	store_vector4( inst, machine, result );
				1359	}
				1360	break;
				1361	case OPCODE_TEX: /* Both ARB and NV frag prog */
				1362	/* Texel lookup */
				1363	{
				1364	/* Note: only use the precomputed lambda value when we're
				1365	* sampling texture unit [K] with texcoord[K].
				1366	* Otherwise, the lambda value may have no relation to the
				1367	* instruction's texcoord or texture image. Using the wrong
				1368	* lambda is usually bad news.
				1369	* The rest of the time, just use zero (until we get a more
				1370	* sophisticated way of computing lambda).
				1371	*/
				1372	GLfloat coord[4], color[4], lambda;
				1373	#if 0
				1374	if (inst->SrcReg[0].File == PROGRAM_INPUT &&
				1375	inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
				1376	lambda = span->array->lambda[inst->TexSrcUnit][column];
				1377	else
				1378	#endif
				1379	lambda = 0.0;
				1380	fetch_vector4(ctx, &inst->SrcReg[0], machine, program, coord);
				1381	machine->FetchTexelLod(ctx, coord, lambda, inst->TexSrcUnit, color);
				1382	if (DEBUG_PROG) {
				1383	printf("TEX (%g, %g, %g, %g) = texture[%d][%g, %g, %g, %g], "
				1384	"lod %f\n",
				1385	color[0], color[1], color[2], color[3],
				1386	inst->TexSrcUnit,
				1387	coord[0], coord[1], coord[2], coord[3], lambda);
				1388	}
				1389	store_vector4( inst, machine, color );
				1390	}
				1391	break;
				1392	case OPCODE_TXB: /* GL_ARB_fragment_program only */
				1393	/* Texel lookup with LOD bias */
				1394	{
				1395	const struct gl_texture_unit *texUnit
				1396	= &ctx->Texture.Unit[inst->TexSrcUnit];
				1397	GLfloat coord[4], color[4], lambda, bias;
				1398	#if 0
				1399	if (inst->SrcReg[0].File == PROGRAM_INPUT &&
				1400	inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
				1401	lambda = span->array->lambda[inst->TexSrcUnit][column];
				1402	else
				1403	#endif
				1404	lambda = 0.0;
				1405	fetch_vector4(ctx, &inst->SrcReg[0], machine, program, coord);
				1406	/* coord[3] is the bias to add to lambda */
				1407	bias = texUnit->LodBias + coord[3];
				1408	if (texUnit->_Current)
				1409	bias += texUnit->_Current->LodBias;
				1410	machine->FetchTexelLod(ctx, coord, lambda + bias,
				1411	inst->TexSrcUnit, color);
				1412	store_vector4( inst, machine, color );
				1413	}
				1414	break;
				1415	case OPCODE_TXD: /* GL_NV_fragment_program only */
				1416	/* Texture lookup w/ partial derivatives for LOD */
				1417	{
				1418	GLfloat texcoord[4], dtdx[4], dtdy[4], color[4];
				1419	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, texcoord );
				1420	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, dtdx );
				1421	fetch_vector4( ctx, &inst->SrcReg[2], machine, program, dtdy );
				1422	machine->FetchTexelDeriv(ctx, texcoord, dtdx, dtdy,
				1423	inst->TexSrcUnit, color );
				1424	store_vector4( inst, machine, color );
				1425	}
				1426	break;
				1427	case OPCODE_TXP: /* GL_ARB_fragment_program only */
				1428	/* Texture lookup w/ projective divide */
				1429	{
				1430	GLfloat texcoord[4], color[4], lambda;
				1431	#if 0
				1432	if (inst->SrcReg[0].File == PROGRAM_INPUT &&
				1433	inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
				1434	lambda = span->array->lambda[inst->TexSrcUnit][column];
				1435	else
				1436	#endif
				1437	lambda = 0.0;
				1438	fetch_vector4(ctx, &inst->SrcReg[0], machine, program,texcoord);
				1439	/* Not so sure about this test - if texcoord[3] is
				1440	* zero, we'd probably be fine except for an ASSERT in
				1441	* IROUND_POS() which gets triggered by the inf values created.
				1442	*/
				1443	if (texcoord[3] != 0.0) {
				1444	texcoord[0] /= texcoord[3];
				1445	texcoord[1] /= texcoord[3];
				1446	texcoord[2] /= texcoord[3];
				1447	}
				1448	machine->FetchTexelLod(ctx, texcoord, lambda,
				1449	inst->TexSrcUnit, color);
				1450	store_vector4( inst, machine, color );
				1451	}
				1452	break;
				1453	case OPCODE_TXP_NV: /* GL_NV_fragment_program only */
				1454	/* Texture lookup w/ projective divide */
				1455	{
				1456	GLfloat texcoord[4], color[4], lambda;
				1457	#if 0
				1458	if (inst->SrcReg[0].File == PROGRAM_INPUT &&
				1459	inst->SrcReg[0].Index == FRAG_ATTRIB_TEX0+inst->TexSrcUnit)
				1460	lambda = span->array->lambda[inst->TexSrcUnit][column];
				1461	else
				1462	#endif
				1463	lambda = 0.0;
				1464	fetch_vector4(ctx, &inst->SrcReg[0], machine, program,texcoord);
				1465	if (inst->TexSrcTarget != TEXTURE_CUBE_INDEX &&
				1466	texcoord[3] != 0.0) {
				1467	texcoord[0] /= texcoord[3];
				1468	texcoord[1] /= texcoord[3];
				1469	texcoord[2] /= texcoord[3];
				1470	}
				1471	machine->FetchTexelLod(ctx, texcoord, lambda,
				1472	inst->TexSrcUnit, color);
				1473	store_vector4( inst, machine, color );
				1474	}
				1475	break;
				1476	case OPCODE_UP2H: /* unpack two 16-bit floats */
				1477	{
				1478	GLfloat a[4], result[4];
				1479	const GLuint rawBits = (const GLuint ) a;
				1480	GLhalfNV hx, hy;
				1481	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1482	hx = rawBits[0] & 0xffff;
				1483	hy = rawBits[0] >> 16;
				1484	result[0] = result[2] = _mesa_half_to_float(hx);
				1485	result[1] = result[3] = _mesa_half_to_float(hy);
				1486	store_vector4( inst, machine, result );
				1487	}
				1488	break;
				1489	case OPCODE_UP2US: /* unpack two GLushorts */
				1490	{
				1491	GLfloat a[4], result[4];
				1492	const GLuint rawBits = (const GLuint ) a;
				1493	GLushort usx, usy;
				1494	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1495	usx = rawBits[0] & 0xffff;
				1496	usy = rawBits[0] >> 16;
				1497	result[0] = result[2] = usx * (1.0f / 65535.0f);
				1498	result[1] = result[3] = usy * (1.0f / 65535.0f);
				1499	store_vector4( inst, machine, result );
				1500	}
				1501	break;
				1502	case OPCODE_UP4B: /* unpack four GLbytes */
				1503	{
				1504	GLfloat a[4], result[4];
				1505	const GLuint rawBits = (const GLuint ) a;
				1506	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1507	result[0] = (((rawBits[0] >> 0) & 0xff) - 128) / 127.0F;
				1508	result[1] = (((rawBits[0] >> 8) & 0xff) - 128) / 127.0F;
				1509	result[2] = (((rawBits[0] >> 16) & 0xff) - 128) / 127.0F;
				1510	result[3] = (((rawBits[0] >> 24) & 0xff) - 128) / 127.0F;
				1511	store_vector4( inst, machine, result );
				1512	}
				1513	break;
				1514	case OPCODE_UP4UB: /* unpack four GLubytes */
				1515	{
				1516	GLfloat a[4], result[4];
				1517	const GLuint rawBits = (const GLuint ) a;
				1518	fetch_vector1( ctx, &inst->SrcReg[0], machine, program, a );
				1519	result[0] = ((rawBits[0] >> 0) & 0xff) / 255.0F;
				1520	result[1] = ((rawBits[0] >> 8) & 0xff) / 255.0F;
				1521	result[2] = ((rawBits[0] >> 16) & 0xff) / 255.0F;
				1522	result[3] = ((rawBits[0] >> 24) & 0xff) / 255.0F;
				1523	store_vector4( inst, machine, result );
				1524	}
				1525	break;
				1526	case OPCODE_XPD: /* cross product */
				1527	{
				1528	GLfloat a[4], b[4], result[4];
				1529	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1530	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1531	result[0] = a[1] * b[2] - a[2] * b[1];
				1532	result[1] = a[2] * b[0] - a[0] * b[2];
				1533	result[2] = a[0] * b[1] - a[1] * b[0];
				1534	result[3] = 1.0;
				1535	store_vector4( inst, machine, result );
				1536	}
				1537	break;
				1538	case OPCODE_X2D: /* 2-D matrix transform */
				1539	{
				1540	GLfloat a[4], b[4], c[4], result[4];
				1541	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a );
				1542	fetch_vector4( ctx, &inst->SrcReg[1], machine, program, b );
				1543	fetch_vector4( ctx, &inst->SrcReg[2], machine, program, c );
				1544	result[0] = a[0] + b[0] * c[0] + b[1] * c[1];
				1545	result[1] = a[1] + b[0] * c[2] + b[1] * c[3];
				1546	result[2] = a[2] + b[0] * c[0] + b[1] * c[1];
				1547	result[3] = a[3] + b[0] * c[2] + b[1] * c[3];
				1548	store_vector4( inst, machine, result );
				1549	}
				1550	break;
				1551	case OPCODE_PRINT:
				1552	{
				1553	if (inst->SrcReg[0].File != -1) {
				1554	GLfloat a[4];
				1555	fetch_vector4( ctx, &inst->SrcReg[0], machine, program, a);
				1556	_mesa_printf("%s%g, %g, %g, %g\n", (const char *) inst->Data,
				1557	a[0], a[1], a[2], a[3]);
				1558	}
				1559	else {
				1560	_mesa_printf("%s\n", (const char *) inst->Data);
				1561	}
				1562	}
				1563	break;
				1564	case OPCODE_END:
				1565	return GL_TRUE;
				1566	default:
				1567	_mesa_problem(ctx, "Bad opcode %d in _mesa_exec_fragment_program",
				1568	inst->Opcode);
				1569	return GL_TRUE; /* return value doesn't matter */
				1570
				1571	}
				1572	total++;
				1573	if (total > MAX_EXEC) {
				1574	_mesa_problem(ctx, "Infinite loop detected in fragment program");
				1575	return GL_TRUE;
				1576	abort();
				1577	}
				1578	}
				1579
				1580	#if FEATURE_MESA_program_debug
				1581	CurrentMachine = NULL;
				1582	#endif
				1583
				1584	return GL_TRUE;
				1585	}