blob: d86d8e09a5159d1aac40e285460cb553405e4af9 [file] [log] [blame]
/**************************************************************************
*
* Copyright 2007 Tungsten Graphics, Inc., Cedar Park, Texas.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the
* "Software"), to deal in the Software without restriction, including
* without limitation the rights to use, copy, modify, merge, publish,
* distribute, sub license, and/or sell copies of the Software, and to
* permit persons to whom the Software is furnished to do so, subject to
* the following conditions:
*
* The above copyright notice and this permission notice (including the
* next paragraph) shall be included in all copies or substantial portions
* of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT.
* IN NO EVENT SHALL TUNGSTEN GRAPHICS AND/OR ITS SUPPLIERS BE LIABLE FOR
* ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
* TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
* SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*
**************************************************************************/
/**
* TGSI interpretor/executor.
*
* Flow control information:
*
* Since we operate on 'quads' (4 pixels or 4 vertices in parallel)
* flow control statements (IF/ELSE/ENDIF, LOOP/ENDLOOP) require special
* care since a condition may be true for some quad components but false
* for other components.
*
* We basically execute all statements (even if they're in the part of
* an IF/ELSE clause that's "not taken") and use a special mask to
* control writing to destination registers. This is the ExecMask.
* See store_dest().
*
* The ExecMask is computed from three other masks (CondMask, LoopMask and
* ContMask) which are controlled by the flow control instructions (namely:
* (IF/ELSE/ENDIF, LOOP/ENDLOOP and CONT).
*
*
* Authors:
* Michal Krol
* Brian Paul
*/
#include <transpose_matrix4x4.h>
#include <simdmath/ceilf4.h>
#include <simdmath/cosf4.h>
#include <simdmath/divf4.h>
#include <simdmath/floorf4.h>
#include <simdmath/log2f4.h>
#include <simdmath/powf4.h>
#include <simdmath/sinf4.h>
#include <simdmath/sqrtf4.h>
#include <simdmath/truncf4.h>
#include "pipe/p_compiler.h"
#include "pipe/p_state.h"
#include "pipe/p_shader_tokens.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "spu_exec.h"
#include "spu_main.h"
#include "spu_vertex_shader.h"
#include "spu_dcache.h"
#include "cell/common.h"
#define TILE_TOP_LEFT 0
#define TILE_TOP_RIGHT 1
#define TILE_BOTTOM_LEFT 2
#define TILE_BOTTOM_RIGHT 3
/*
* Shorthand locations of various utility registers (_I = Index, _C = Channel)
*/
#define TEMP_0_I TGSI_EXEC_TEMP_00000000_I
#define TEMP_0_C TGSI_EXEC_TEMP_00000000_C
#define TEMP_7F_I TGSI_EXEC_TEMP_7FFFFFFF_I
#define TEMP_7F_C TGSI_EXEC_TEMP_7FFFFFFF_C
#define TEMP_80_I TGSI_EXEC_TEMP_80000000_I
#define TEMP_80_C TGSI_EXEC_TEMP_80000000_C
#define TEMP_FF_I TGSI_EXEC_TEMP_FFFFFFFF_I
#define TEMP_FF_C TGSI_EXEC_TEMP_FFFFFFFF_C
#define TEMP_1_I TGSI_EXEC_TEMP_ONE_I
#define TEMP_1_C TGSI_EXEC_TEMP_ONE_C
#define TEMP_2_I TGSI_EXEC_TEMP_TWO_I
#define TEMP_2_C TGSI_EXEC_TEMP_TWO_C
#define TEMP_128_I TGSI_EXEC_TEMP_128_I
#define TEMP_128_C TGSI_EXEC_TEMP_128_C
#define TEMP_M128_I TGSI_EXEC_TEMP_MINUS_128_I
#define TEMP_M128_C TGSI_EXEC_TEMP_MINUS_128_C
#define TEMP_KILMASK_I TGSI_EXEC_TEMP_KILMASK_I
#define TEMP_KILMASK_C TGSI_EXEC_TEMP_KILMASK_C
#define TEMP_OUTPUT_I TGSI_EXEC_TEMP_OUTPUT_I
#define TEMP_OUTPUT_C TGSI_EXEC_TEMP_OUTPUT_C
#define TEMP_PRIMITIVE_I TGSI_EXEC_TEMP_PRIMITIVE_I
#define TEMP_PRIMITIVE_C TGSI_EXEC_TEMP_PRIMITIVE_C
#define TEMP_R0 TGSI_EXEC_TEMP_R0
#define FOR_EACH_CHANNEL(CHAN)\
for (CHAN = 0; CHAN < 4; CHAN++)
#define IS_CHANNEL_ENABLED(INST, CHAN)\
((INST).Dst[0].Register.WriteMask & (1 << (CHAN)))
#define IS_CHANNEL_ENABLED2(INST, CHAN)\
((INST).Dst[1].Register.WriteMask & (1 << (CHAN)))
#define FOR_EACH_ENABLED_CHANNEL(INST, CHAN)\
FOR_EACH_CHANNEL( CHAN )\
if (IS_CHANNEL_ENABLED( INST, CHAN ))
#define FOR_EACH_ENABLED_CHANNEL2(INST, CHAN)\
FOR_EACH_CHANNEL( CHAN )\
if (IS_CHANNEL_ENABLED2( INST, CHAN ))
/** The execution mask depends on the conditional mask and the loop mask */
#define UPDATE_EXEC_MASK(MACH) \
MACH->ExecMask = MACH->CondMask & MACH->LoopMask & MACH->ContMask & MACH->FuncMask
#define CHAN_X 0
#define CHAN_Y 1
#define CHAN_Z 2
#define CHAN_W 3
/**
* Initialize machine state by expanding tokens to full instructions,
* allocating temporary storage, setting up constants, etc.
* After this, we can call spu_exec_machine_run() many times.
*/
void
spu_exec_machine_init(struct spu_exec_machine *mach,
uint numSamplers,
struct spu_sampler *samplers,
unsigned processor)
{
const qword zero = si_il(0);
const qword not_zero = si_il(~0);
(void) numSamplers;
mach->Samplers = samplers;
mach->Processor = processor;
mach->Addrs = &mach->Temps[TGSI_EXEC_NUM_TEMPS];
/* Setup constants. */
mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q = zero;
mach->Temps[TEMP_FF_I].xyzw[TEMP_FF_C].q = not_zero;
mach->Temps[TEMP_7F_I].xyzw[TEMP_7F_C].q = si_shli(not_zero, -1);
mach->Temps[TEMP_80_I].xyzw[TEMP_80_C].q = si_shli(not_zero, 31);
mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q = (qword) spu_splats(1.0f);
mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q = (qword) spu_splats(2.0f);
mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q = (qword) spu_splats(128.0f);
mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q = (qword) spu_splats(-128.0f);
}
static INLINE qword
micro_abs(qword src)
{
return si_rotmi(si_shli(src, 1), -1);
}
static INLINE qword
micro_ceil(qword src)
{
return (qword) _ceilf4((vec_float4) src);
}
static INLINE qword
micro_cos(qword src)
{
return (qword) _cosf4((vec_float4) src);
}
static const qword br_shuf = {
TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
TILE_BOTTOM_RIGHT + 0, TILE_BOTTOM_RIGHT + 1,
TILE_BOTTOM_RIGHT + 2, TILE_BOTTOM_RIGHT + 3,
};
static const qword bl_shuf = {
TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
TILE_BOTTOM_LEFT + 0, TILE_BOTTOM_LEFT + 1,
TILE_BOTTOM_LEFT + 2, TILE_BOTTOM_LEFT + 3,
};
static const qword tl_shuf = {
TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
TILE_TOP_LEFT + 0, TILE_TOP_LEFT + 1,
TILE_TOP_LEFT + 2, TILE_TOP_LEFT + 3,
};
static qword
micro_ddx(qword src)
{
qword bottom_right = si_shufb(src, src, br_shuf);
qword bottom_left = si_shufb(src, src, bl_shuf);
return si_fs(bottom_right, bottom_left);
}
static qword
micro_ddy(qword src)
{
qword top_left = si_shufb(src, src, tl_shuf);
qword bottom_left = si_shufb(src, src, bl_shuf);
return si_fs(top_left, bottom_left);
}
static INLINE qword
micro_div(qword src0, qword src1)
{
return (qword) _divf4((vec_float4) src0, (vec_float4) src1);
}
static qword
micro_flr(qword src)
{
return (qword) _floorf4((vec_float4) src);
}
static qword
micro_frc(qword src)
{
return si_fs(src, (qword) _floorf4((vec_float4) src));
}
static INLINE qword
micro_ge(qword src0, qword src1)
{
return si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
}
static qword
micro_lg2(qword src)
{
return (qword) _log2f4((vec_float4) src);
}
static INLINE qword
micro_lt(qword src0, qword src1)
{
const qword tmp = si_or(si_fceq(src0, src1), si_fcgt(src0, src1));
return si_xori(tmp, 0xff);
}
static INLINE qword
micro_max(qword src0, qword src1)
{
return si_selb(src1, src0, si_fcgt(src0, src1));
}
static INLINE qword
micro_min(qword src0, qword src1)
{
return si_selb(src0, src1, si_fcgt(src0, src1));
}
static qword
micro_neg(qword src)
{
return si_xor(src, (qword) spu_splats(0x80000000));
}
static qword
micro_set_sign(qword src)
{
return si_or(src, (qword) spu_splats(0x80000000));
}
static qword
micro_pow(qword src0, qword src1)
{
return (qword) _powf4((vec_float4) src0, (vec_float4) src1);
}
static qword
micro_rnd(qword src)
{
const qword half = (qword) spu_splats(0.5f);
/* May be able to use _roundf4. There may be some difference, though.
*/
return (qword) _floorf4((vec_float4) si_fa(src, half));
}
static INLINE qword
micro_ishr(qword src0, qword src1)
{
return si_rotma(src0, si_sfi(src1, 0));
}
static qword
micro_trunc(qword src)
{
return (qword) _truncf4((vec_float4) src);
}
static qword
micro_sin(qword src)
{
return (qword) _sinf4((vec_float4) src);
}
static INLINE qword
micro_sqrt(qword src)
{
return (qword) _sqrtf4((vec_float4) src);
}
static void
fetch_src_file_channel(
const struct spu_exec_machine *mach,
const uint file,
const uint swizzle,
const union spu_exec_channel *index,
union spu_exec_channel *chan )
{
switch( swizzle ) {
case TGSI_SWIZZLE_X:
case TGSI_SWIZZLE_Y:
case TGSI_SWIZZLE_Z:
case TGSI_SWIZZLE_W:
switch( file ) {
case TGSI_FILE_CONSTANT: {
unsigned i;
for (i = 0; i < 4; i++) {
const float *ptr = mach->Consts[index->i[i]];
float tmp[4];
spu_dcache_fetch_unaligned((qword *) tmp,
(uintptr_t)(ptr + swizzle),
sizeof(float));
chan->f[i] = tmp[0];
}
break;
}
case TGSI_FILE_INPUT:
chan->u[0] = mach->Inputs[index->i[0]].xyzw[swizzle].u[0];
chan->u[1] = mach->Inputs[index->i[1]].xyzw[swizzle].u[1];
chan->u[2] = mach->Inputs[index->i[2]].xyzw[swizzle].u[2];
chan->u[3] = mach->Inputs[index->i[3]].xyzw[swizzle].u[3];
break;
case TGSI_FILE_TEMPORARY:
chan->u[0] = mach->Temps[index->i[0]].xyzw[swizzle].u[0];
chan->u[1] = mach->Temps[index->i[1]].xyzw[swizzle].u[1];
chan->u[2] = mach->Temps[index->i[2]].xyzw[swizzle].u[2];
chan->u[3] = mach->Temps[index->i[3]].xyzw[swizzle].u[3];
break;
case TGSI_FILE_IMMEDIATE:
ASSERT( index->i[0] < (int) mach->ImmLimit );
ASSERT( index->i[1] < (int) mach->ImmLimit );
ASSERT( index->i[2] < (int) mach->ImmLimit );
ASSERT( index->i[3] < (int) mach->ImmLimit );
chan->f[0] = mach->Imms[index->i[0]][swizzle];
chan->f[1] = mach->Imms[index->i[1]][swizzle];
chan->f[2] = mach->Imms[index->i[2]][swizzle];
chan->f[3] = mach->Imms[index->i[3]][swizzle];
break;
case TGSI_FILE_ADDRESS:
chan->u[0] = mach->Addrs[index->i[0]].xyzw[swizzle].u[0];
chan->u[1] = mach->Addrs[index->i[1]].xyzw[swizzle].u[1];
chan->u[2] = mach->Addrs[index->i[2]].xyzw[swizzle].u[2];
chan->u[3] = mach->Addrs[index->i[3]].xyzw[swizzle].u[3];
break;
case TGSI_FILE_OUTPUT:
/* vertex/fragment output vars can be read too */
chan->u[0] = mach->Outputs[index->i[0]].xyzw[swizzle].u[0];
chan->u[1] = mach->Outputs[index->i[1]].xyzw[swizzle].u[1];
chan->u[2] = mach->Outputs[index->i[2]].xyzw[swizzle].u[2];
chan->u[3] = mach->Outputs[index->i[3]].xyzw[swizzle].u[3];
break;
default:
ASSERT( 0 );
}
break;
default:
ASSERT( 0 );
}
}
static void
fetch_source(
const struct spu_exec_machine *mach,
union spu_exec_channel *chan,
const struct tgsi_full_src_register *reg,
const uint chan_index )
{
union spu_exec_channel index;
uint swizzle;
index.i[0] =
index.i[1] =
index.i[2] =
index.i[3] = reg->Register.Index;
if (reg->Register.Indirect) {
union spu_exec_channel index2;
union spu_exec_channel indir_index;
index2.i[0] =
index2.i[1] =
index2.i[2] =
index2.i[3] = reg->Indirect.Index;
swizzle = tgsi_util_get_src_register_swizzle(&reg->Indirect,
CHAN_X);
fetch_src_file_channel(
mach,
reg->Indirect.File,
swizzle,
&index2,
&indir_index );
index.q = si_a(index.q, indir_index.q);
}
if( reg->Register.Dimension ) {
switch( reg->Register.File ) {
case TGSI_FILE_INPUT:
index.q = si_mpyi(index.q, 17);
break;
case TGSI_FILE_CONSTANT:
index.q = si_shli(index.q, 12);
break;
default:
ASSERT( 0 );
}
index.i[0] += reg->Dimension.Index;
index.i[1] += reg->Dimension.Index;
index.i[2] += reg->Dimension.Index;
index.i[3] += reg->Dimension.Index;
if (reg->Dimension.Indirect) {
union spu_exec_channel index2;
union spu_exec_channel indir_index;
index2.i[0] =
index2.i[1] =
index2.i[2] =
index2.i[3] = reg->DimIndirect.Index;
swizzle = tgsi_util_get_src_register_swizzle( &reg->DimIndirect, CHAN_X );
fetch_src_file_channel(
mach,
reg->DimIndirect.File,
swizzle,
&index2,
&indir_index );
index.q = si_a(index.q, indir_index.q);
}
}
swizzle = tgsi_util_get_full_src_register_swizzle( reg, chan_index );
fetch_src_file_channel(
mach,
reg->Register.File,
swizzle,
&index,
chan );
switch (tgsi_util_get_full_src_register_sign_mode( reg, chan_index )) {
case TGSI_UTIL_SIGN_CLEAR:
chan->q = micro_abs(chan->q);
break;
case TGSI_UTIL_SIGN_SET:
chan->q = micro_set_sign(chan->q);
break;
case TGSI_UTIL_SIGN_TOGGLE:
chan->q = micro_neg(chan->q);
break;
case TGSI_UTIL_SIGN_KEEP:
break;
}
if (reg->RegisterExtMod.Complement) {
chan->q = si_fs(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, chan->q);
}
}
static void
store_dest(
struct spu_exec_machine *mach,
const union spu_exec_channel *chan,
const struct tgsi_full_dst_register *reg,
const struct tgsi_full_instruction *inst,
uint chan_index )
{
union spu_exec_channel *dst;
switch( reg->Register.File ) {
case TGSI_FILE_NULL:
return;
case TGSI_FILE_OUTPUT:
dst = &mach->Outputs[mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0]
+ reg->Register.Index].xyzw[chan_index];
break;
case TGSI_FILE_TEMPORARY:
dst = &mach->Temps[reg->Register.Index].xyzw[chan_index];
break;
case TGSI_FILE_ADDRESS:
dst = &mach->Addrs[reg->Register.Index].xyzw[chan_index];
break;
default:
ASSERT( 0 );
return;
}
switch (inst->Instruction.Saturate)
{
case TGSI_SAT_NONE:
if (mach->ExecMask & 0x1)
dst->i[0] = chan->i[0];
if (mach->ExecMask & 0x2)
dst->i[1] = chan->i[1];
if (mach->ExecMask & 0x4)
dst->i[2] = chan->i[2];
if (mach->ExecMask & 0x8)
dst->i[3] = chan->i[3];
break;
case TGSI_SAT_ZERO_ONE:
/* XXX need to obey ExecMask here */
dst->q = micro_max(chan->q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
dst->q = micro_min(dst->q, mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q);
break;
case TGSI_SAT_MINUS_PLUS_ONE:
ASSERT( 0 );
break;
default:
ASSERT( 0 );
}
}
#define FETCH(VAL,INDEX,CHAN)\
fetch_source (mach, VAL, &inst->Src[INDEX], CHAN)
#define STORE(VAL,INDEX,CHAN)\
store_dest (mach, VAL, &inst->Dst[INDEX], inst, CHAN )
/**
* Execute ARB-style KIL which is predicated by a src register.
* Kill fragment if any of the four values is less than zero.
*/
static void
exec_kil(struct spu_exec_machine *mach,
const struct tgsi_full_instruction *inst)
{
uint uniquemask;
uint chan_index;
uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
union spu_exec_channel r[1];
/* This mask stores component bits that were already tested. */
uniquemask = 0;
for (chan_index = 0; chan_index < 4; chan_index++)
{
uint swizzle;
uint i;
/* unswizzle channel */
swizzle = tgsi_util_get_full_src_register_swizzle (
&inst->Src[0],
chan_index);
/* check if the component has not been already tested */
if (uniquemask & (1 << swizzle))
continue;
uniquemask |= 1 << swizzle;
FETCH(&r[0], 0, chan_index);
for (i = 0; i < 4; i++)
if (r[0].f[i] < 0.0f)
kilmask |= 1 << i;
}
mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
}
/**
* Execute NVIDIA-style KIL which is predicated by a condition code.
* Kill fragment if the condition code is TRUE.
*/
static void
exec_kilp(struct tgsi_exec_machine *mach,
const struct tgsi_full_instruction *inst)
{
uint kilmask = 0; /* bit 0 = pixel 0, bit 1 = pixel 1, etc */
/* TODO: build kilmask from CC mask */
mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] |= kilmask;
}
/*
* Fetch a texel using STR texture coordinates.
*/
static void
fetch_texel( struct spu_sampler *sampler,
const union spu_exec_channel *s,
const union spu_exec_channel *t,
const union spu_exec_channel *p,
float lodbias, /* XXX should be float[4] */
union spu_exec_channel *r,
union spu_exec_channel *g,
union spu_exec_channel *b,
union spu_exec_channel *a )
{
qword rgba[4];
qword out[4];
sampler->get_samples(sampler, s->f, t->f, p->f, lodbias,
(float (*)[4]) rgba);
_transpose_matrix4x4((vec_float4 *) out, (vec_float4 *) rgba);
r->q = out[0];
g->q = out[1];
b->q = out[2];
a->q = out[3];
}
static void
exec_tex(struct spu_exec_machine *mach,
const struct tgsi_full_instruction *inst,
boolean biasLod, boolean projected)
{
const uint unit = inst->Src[1].Register.Index;
union spu_exec_channel r[8];
uint chan_index;
float lodBias;
/* printf("Sampler %u unit %u\n", sampler, unit); */
switch (inst->InstructionExtTexture.Texture) {
case TGSI_TEXTURE_1D:
FETCH(&r[0], 0, CHAN_X);
if (projected) {
FETCH(&r[1], 0, CHAN_W);
r[0].q = micro_div(r[0].q, r[1].q);
}
if (biasLod) {
FETCH(&r[1], 0, CHAN_W);
lodBias = r[2].f[0];
}
else
lodBias = 0.0;
fetch_texel(&mach->Samplers[unit],
&r[0], NULL, NULL, lodBias, /* S, T, P, BIAS */
&r[0], &r[1], &r[2], &r[3]); /* R, G, B, A */
break;
case TGSI_TEXTURE_2D:
case TGSI_TEXTURE_RECT:
FETCH(&r[0], 0, CHAN_X);
FETCH(&r[1], 0, CHAN_Y);
FETCH(&r[2], 0, CHAN_Z);
if (projected) {
FETCH(&r[3], 0, CHAN_W);
r[0].q = micro_div(r[0].q, r[3].q);
r[1].q = micro_div(r[1].q, r[3].q);
r[2].q = micro_div(r[2].q, r[3].q);
}
if (biasLod) {
FETCH(&r[3], 0, CHAN_W);
lodBias = r[3].f[0];
}
else
lodBias = 0.0;
fetch_texel(&mach->Samplers[unit],
&r[0], &r[1], &r[2], lodBias, /* inputs */
&r[0], &r[1], &r[2], &r[3]); /* outputs */
break;
case TGSI_TEXTURE_3D:
case TGSI_TEXTURE_CUBE:
FETCH(&r[0], 0, CHAN_X);
FETCH(&r[1], 0, CHAN_Y);
FETCH(&r[2], 0, CHAN_Z);
if (projected) {
FETCH(&r[3], 0, CHAN_W);
r[0].q = micro_div(r[0].q, r[3].q);
r[1].q = micro_div(r[1].q, r[3].q);
r[2].q = micro_div(r[2].q, r[3].q);
}
if (biasLod) {
FETCH(&r[3], 0, CHAN_W);
lodBias = r[3].f[0];
}
else
lodBias = 0.0;
fetch_texel(&mach->Samplers[unit],
&r[0], &r[1], &r[2], lodBias,
&r[0], &r[1], &r[2], &r[3]);
break;
default:
ASSERT (0);
}
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[chan_index], 0, chan_index );
}
}
static void
constant_interpolation(
struct spu_exec_machine *mach,
unsigned attrib,
unsigned chan )
{
unsigned i;
for( i = 0; i < QUAD_SIZE; i++ ) {
mach->Inputs[attrib].xyzw[chan].f[i] = mach->InterpCoefs[attrib].a0[chan];
}
}
static void
linear_interpolation(
struct spu_exec_machine *mach,
unsigned attrib,
unsigned chan )
{
const float x = mach->QuadPos.xyzw[0].f[0];
const float y = mach->QuadPos.xyzw[1].f[0];
const float dadx = mach->InterpCoefs[attrib].dadx[chan];
const float dady = mach->InterpCoefs[attrib].dady[chan];
const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
mach->Inputs[attrib].xyzw[chan].f[0] = a0;
mach->Inputs[attrib].xyzw[chan].f[1] = a0 + dadx;
mach->Inputs[attrib].xyzw[chan].f[2] = a0 + dady;
mach->Inputs[attrib].xyzw[chan].f[3] = a0 + dadx + dady;
}
static void
perspective_interpolation(
struct spu_exec_machine *mach,
unsigned attrib,
unsigned chan )
{
const float x = mach->QuadPos.xyzw[0].f[0];
const float y = mach->QuadPos.xyzw[1].f[0];
const float dadx = mach->InterpCoefs[attrib].dadx[chan];
const float dady = mach->InterpCoefs[attrib].dady[chan];
const float a0 = mach->InterpCoefs[attrib].a0[chan] + dadx * x + dady * y;
const float *w = mach->QuadPos.xyzw[3].f;
/* divide by W here */
mach->Inputs[attrib].xyzw[chan].f[0] = a0 / w[0];
mach->Inputs[attrib].xyzw[chan].f[1] = (a0 + dadx) / w[1];
mach->Inputs[attrib].xyzw[chan].f[2] = (a0 + dady) / w[2];
mach->Inputs[attrib].xyzw[chan].f[3] = (a0 + dadx + dady) / w[3];
}
typedef void (* interpolation_func)(
struct spu_exec_machine *mach,
unsigned attrib,
unsigned chan );
static void
exec_declaration(struct spu_exec_machine *mach,
const struct tgsi_full_declaration *decl)
{
if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
if( decl->Declaration.File == TGSI_FILE_INPUT ) {
unsigned first, last, mask;
interpolation_func interp;
first = decl->Range.First;
last = decl->Range.Last;
mask = decl->Declaration.UsageMask;
switch( decl->Declaration.Interpolate ) {
case TGSI_INTERPOLATE_CONSTANT:
interp = constant_interpolation;
break;
case TGSI_INTERPOLATE_LINEAR:
interp = linear_interpolation;
break;
case TGSI_INTERPOLATE_PERSPECTIVE:
interp = perspective_interpolation;
break;
default:
ASSERT( 0 );
}
if( mask == TGSI_WRITEMASK_XYZW ) {
unsigned i, j;
for( i = first; i <= last; i++ ) {
for( j = 0; j < NUM_CHANNELS; j++ ) {
interp( mach, i, j );
}
}
}
else {
unsigned i, j;
for( j = 0; j < NUM_CHANNELS; j++ ) {
if( mask & (1 << j) ) {
for( i = first; i <= last; i++ ) {
interp( mach, i, j );
}
}
}
}
}
}
}
static void
exec_instruction(
struct spu_exec_machine *mach,
const struct tgsi_full_instruction *inst,
int *pc )
{
uint chan_index;
union spu_exec_channel r[8];
(*pc)++;
switch (inst->Instruction.Opcode) {
case TGSI_OPCODE_ARL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = si_cflts(r[0].q, 0);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_MOV:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_LIT:
if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y ) || IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
FETCH( &r[0], 0, CHAN_X );
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
r[0].q = micro_max(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
STORE( &r[0], 0, CHAN_Y );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
FETCH( &r[1], 0, CHAN_Y );
r[1].q = micro_max(r[1].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
FETCH( &r[2], 0, CHAN_W );
r[2].q = micro_min(r[2].q, mach->Temps[TEMP_128_I].xyzw[TEMP_128_C].q);
r[2].q = micro_max(r[2].q, mach->Temps[TEMP_M128_I].xyzw[TEMP_M128_C].q);
r[1].q = micro_pow(r[1].q, r[2].q);
/* r0 = (r0 > 0.0) ? r1 : 0.0
*/
r[0].q = si_fcgt(r[0].q, mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q);
r[0].q = si_selb(mach->Temps[TEMP_0_I].xyzw[TEMP_0_C].q, r[1].q,
r[0].q);
STORE( &r[0], 0, CHAN_Z );
}
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
break;
case TGSI_OPCODE_RCP:
FETCH( &r[0], 0, CHAN_X );
r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_RSQ:
FETCH( &r[0], 0, CHAN_X );
r[0].q = micro_sqrt(r[0].q);
r[0].q = micro_div(mach->Temps[TEMP_1_I].xyzw[TEMP_1_C].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_EXP:
ASSERT (0);
break;
case TGSI_OPCODE_LOG:
ASSERT (0);
break;
case TGSI_OPCODE_MUL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index )
{
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
r[0].q = si_fm(r[0].q, r[1].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_ADD:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_fa(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_DP3:
/* TGSI_OPCODE_DOT3 */
FETCH( &r[0], 0, CHAN_X );
FETCH( &r[1], 1, CHAN_X );
r[0].q = si_fm(r[0].q, r[1].q);
FETCH( &r[1], 0, CHAN_Y );
FETCH( &r[2], 1, CHAN_Y );
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FETCH( &r[1], 0, CHAN_Z );
FETCH( &r[2], 1, CHAN_Z );
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_DP4:
/* TGSI_OPCODE_DOT4 */
FETCH(&r[0], 0, CHAN_X);
FETCH(&r[1], 1, CHAN_X);
r[0].q = si_fm(r[0].q, r[1].q);
FETCH(&r[1], 0, CHAN_Y);
FETCH(&r[2], 1, CHAN_Y);
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FETCH(&r[1], 0, CHAN_Z);
FETCH(&r[2], 1, CHAN_Z);
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FETCH(&r[1], 0, CHAN_W);
FETCH(&r[2], 1, CHAN_W);
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_DST:
if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_X );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
FETCH( &r[0], 0, CHAN_Y );
FETCH( &r[1], 1, CHAN_Y);
r[0].q = si_fm(r[0].q, r[1].q);
STORE( &r[0], 0, CHAN_Y );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
FETCH( &r[0], 0, CHAN_Z );
STORE( &r[0], 0, CHAN_Z );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
FETCH( &r[0], 1, CHAN_W );
STORE( &r[0], 0, CHAN_W );
}
break;
case TGSI_OPCODE_MIN:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
r[0].q = micro_min(r[0].q, r[1].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_MAX:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
r[0].q = micro_max(r[0].q, r[1].q);
STORE(&r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SLT:
/* TGSI_OPCODE_SETLT */
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = micro_ge(r[0].q, r[1].q);
r[0].q = si_xori(r[0].q, 0xff);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SGE:
/* TGSI_OPCODE_SETGE */
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = micro_ge(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_MAD:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
FETCH( &r[2], 2, chan_index );
r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SUB:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
r[0].q = si_fs(r[0].q, r[1].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_LRP:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
FETCH(&r[2], 2, chan_index);
r[1].q = si_fs(r[1].q, r[2].q);
r[0].q = si_fma(r[0].q, r[1].q, r[2].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_CND:
ASSERT (0);
break;
case TGSI_OPCODE_DP2A:
ASSERT (0);
break;
case TGSI_OPCODE_FRC:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_frc(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_CLAMP:
ASSERT (0);
break;
case TGSI_OPCODE_FLR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_flr(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_ROUND:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_rnd(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_EX2:
FETCH(&r[0], 0, CHAN_X);
r[0].q = micro_pow(mach->Temps[TEMP_2_I].xyzw[TEMP_2_C].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_LG2:
FETCH( &r[0], 0, CHAN_X );
r[0].q = micro_lg2(r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_POW:
FETCH(&r[0], 0, CHAN_X);
FETCH(&r[1], 1, CHAN_X);
r[0].q = micro_pow(r[0].q, r[1].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_XPD:
/* TGSI_OPCODE_XPD */
FETCH(&r[0], 0, CHAN_Y);
FETCH(&r[1], 1, CHAN_Z);
FETCH(&r[3], 0, CHAN_Z);
FETCH(&r[4], 1, CHAN_Y);
/* r2 = (r0 * r1) - (r3 * r5)
*/
r[2].q = si_fm(r[3].q, r[5].q);
r[2].q = si_fms(r[0].q, r[1].q, r[2].q);
if (IS_CHANNEL_ENABLED( *inst, CHAN_X )) {
STORE( &r[2], 0, CHAN_X );
}
FETCH(&r[2], 1, CHAN_X);
FETCH(&r[5], 0, CHAN_X);
/* r3 = (r3 * r2) - (r1 * r5)
*/
r[1].q = si_fm(r[1].q, r[5].q);
r[3].q = si_fms(r[3].q, r[2].q, r[1].q);
if (IS_CHANNEL_ENABLED( *inst, CHAN_Y )) {
STORE( &r[3], 0, CHAN_Y );
}
/* r5 = (r5 * r4) - (r0 * r2)
*/
r[0].q = si_fm(r[0].q, r[2].q);
r[5].q = si_fms(r[5].q, r[4].q, r[0].q);
if (IS_CHANNEL_ENABLED( *inst, CHAN_Z )) {
STORE( &r[5], 0, CHAN_Z );
}
if (IS_CHANNEL_ENABLED( *inst, CHAN_W )) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
break;
case TGSI_OPCODE_ABS:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
r[0].q = micro_abs(r[0].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_RCC:
ASSERT (0);
break;
case TGSI_OPCODE_DPH:
FETCH(&r[0], 0, CHAN_X);
FETCH(&r[1], 1, CHAN_X);
r[0].q = si_fm(r[0].q, r[1].q);
FETCH(&r[1], 0, CHAN_Y);
FETCH(&r[2], 1, CHAN_Y);
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FETCH(&r[1], 0, CHAN_Z);
FETCH(&r[2], 1, CHAN_Z);
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FETCH(&r[1], 1, CHAN_W);
r[0].q = si_fa(r[0].q, r[1].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_COS:
FETCH(&r[0], 0, CHAN_X);
r[0].q = micro_cos(r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_DDX:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_ddx(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_DDY:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_ddy(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_KILP:
exec_kilp (mach, inst);
break;
case TGSI_OPCODE_KIL:
exec_kil (mach, inst);
break;
case TGSI_OPCODE_PK2H:
ASSERT (0);
break;
case TGSI_OPCODE_PK2US:
ASSERT (0);
break;
case TGSI_OPCODE_PK4B:
ASSERT (0);
break;
case TGSI_OPCODE_PK4UB:
ASSERT (0);
break;
case TGSI_OPCODE_RFL:
ASSERT (0);
break;
case TGSI_OPCODE_SEQ:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_fceq(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SFL:
ASSERT (0);
break;
case TGSI_OPCODE_SGT:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_fcgt(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SIN:
FETCH( &r[0], 0, CHAN_X );
r[0].q = micro_sin(r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SLE:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_fcgt(r[0].q, r[1].q);
r[0].q = si_xori(r[0].q, 0xff);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SNE:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_fceq(r[0].q, r[1].q);
r[0].q = si_xori(r[0].q, 0xff);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_STR:
ASSERT (0);
break;
case TGSI_OPCODE_TEX:
/* simple texture lookup */
/* src[0] = texcoord */
/* src[1] = sampler unit */
exec_tex(mach, inst, FALSE, FALSE);
break;
case TGSI_OPCODE_TXB:
/* Texture lookup with lod bias */
/* src[0] = texcoord (src[0].w = load bias) */
/* src[1] = sampler unit */
exec_tex(mach, inst, TRUE, FALSE);
break;
case TGSI_OPCODE_TXD:
/* Texture lookup with explict partial derivatives */
/* src[0] = texcoord */
/* src[1] = d[strq]/dx */
/* src[2] = d[strq]/dy */
/* src[3] = sampler unit */
ASSERT (0);
break;
case TGSI_OPCODE_TXL:
/* Texture lookup with explit LOD */
/* src[0] = texcoord (src[0].w = load bias) */
/* src[1] = sampler unit */
exec_tex(mach, inst, TRUE, FALSE);
break;
case TGSI_OPCODE_TXP:
/* Texture lookup with projection */
/* src[0] = texcoord (src[0].w = projection) */
/* src[1] = sampler unit */
exec_tex(mach, inst, TRUE, TRUE);
break;
case TGSI_OPCODE_UP2H:
ASSERT (0);
break;
case TGSI_OPCODE_UP2US:
ASSERT (0);
break;
case TGSI_OPCODE_UP4B:
ASSERT (0);
break;
case TGSI_OPCODE_UP4UB:
ASSERT (0);
break;
case TGSI_OPCODE_X2D:
ASSERT (0);
break;
case TGSI_OPCODE_ARA:
ASSERT (0);
break;
case TGSI_OPCODE_ARR:
ASSERT (0);
break;
case TGSI_OPCODE_BRA:
ASSERT (0);
break;
case TGSI_OPCODE_CAL:
/* skip the call if no execution channels are enabled */
if (mach->ExecMask) {
/* do the call */
/* push the Cond, Loop, Cont stacks */
ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
mach->CondStack[mach->CondStackTop++] = mach->CondMask;
ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
mach->ContStack[mach->ContStackTop++] = mach->ContMask;
ASSERT(mach->FuncStackTop < TGSI_EXEC_MAX_CALL_NESTING);
mach->FuncStack[mach->FuncStackTop++] = mach->FuncMask;
/* note that PC was already incremented above */
mach->CallStack[mach->CallStackTop++] = *pc;
*pc = inst->InstructionExtLabel.Label;
}
break;
case TGSI_OPCODE_RET:
mach->FuncMask &= ~mach->ExecMask;
UPDATE_EXEC_MASK(mach);
if (mach->ExecMask == 0x0) {
/* really return now (otherwise, keep executing */
if (mach->CallStackTop == 0) {
/* returning from main() */
*pc = -1;
return;
}
*pc = mach->CallStack[--mach->CallStackTop];
/* pop the Cond, Loop, Cont stacks */
ASSERT(mach->CondStackTop > 0);
mach->CondMask = mach->CondStack[--mach->CondStackTop];
ASSERT(mach->LoopStackTop > 0);
mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
ASSERT(mach->ContStackTop > 0);
mach->ContMask = mach->ContStack[--mach->ContStackTop];
ASSERT(mach->FuncStackTop > 0);
mach->FuncMask = mach->FuncStack[--mach->FuncStackTop];
UPDATE_EXEC_MASK(mach);
}
break;
case TGSI_OPCODE_SSG:
ASSERT (0);
break;
case TGSI_OPCODE_CMP:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH(&r[0], 0, chan_index);
FETCH(&r[1], 1, chan_index);
FETCH(&r[2], 2, chan_index);
/* r0 = (r0 < 0.0) ? r1 : r2
*/
r[3].q = si_xor(r[3].q, r[3].q);
r[0].q = micro_lt(r[0].q, r[3].q);
r[0].q = si_selb(r[1].q, r[2].q, r[0].q);
STORE(&r[0], 0, chan_index);
}
break;
case TGSI_OPCODE_SCS:
if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) || IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
FETCH( &r[0], 0, CHAN_X );
}
if( IS_CHANNEL_ENABLED( *inst, CHAN_X ) ) {
r[1].q = micro_cos(r[0].q);
STORE( &r[1], 0, CHAN_X );
}
if( IS_CHANNEL_ENABLED( *inst, CHAN_Y ) ) {
r[1].q = micro_sin(r[0].q);
STORE( &r[1], 0, CHAN_Y );
}
if( IS_CHANNEL_ENABLED( *inst, CHAN_Z ) ) {
STORE( &mach->Temps[TEMP_0_I].xyzw[TEMP_0_C], 0, CHAN_Z );
}
if( IS_CHANNEL_ENABLED( *inst, CHAN_W ) ) {
STORE( &mach->Temps[TEMP_1_I].xyzw[TEMP_1_C], 0, CHAN_W );
}
break;
case TGSI_OPCODE_NRM:
ASSERT (0);
break;
case TGSI_OPCODE_DIV:
ASSERT( 0 );
break;
case TGSI_OPCODE_DP2:
FETCH( &r[0], 0, CHAN_X );
FETCH( &r[1], 1, CHAN_X );
r[0].q = si_fm(r[0].q, r[1].q);
FETCH( &r[1], 0, CHAN_Y );
FETCH( &r[2], 1, CHAN_Y );
r[0].q = si_fma(r[1].q, r[2].q, r[0].q);
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_IF:
/* push CondMask */
ASSERT(mach->CondStackTop < TGSI_EXEC_MAX_COND_NESTING);
mach->CondStack[mach->CondStackTop++] = mach->CondMask;
FETCH( &r[0], 0, CHAN_X );
/* update CondMask */
if( ! r[0].u[0] ) {
mach->CondMask &= ~0x1;
}
if( ! r[0].u[1] ) {
mach->CondMask &= ~0x2;
}
if( ! r[0].u[2] ) {
mach->CondMask &= ~0x4;
}
if( ! r[0].u[3] ) {
mach->CondMask &= ~0x8;
}
UPDATE_EXEC_MASK(mach);
/* Todo: If CondMask==0, jump to ELSE */
break;
case TGSI_OPCODE_ELSE:
/* invert CondMask wrt previous mask */
{
uint prevMask;
ASSERT(mach->CondStackTop > 0);
prevMask = mach->CondStack[mach->CondStackTop - 1];
mach->CondMask = ~mach->CondMask & prevMask;
UPDATE_EXEC_MASK(mach);
/* Todo: If CondMask==0, jump to ENDIF */
}
break;
case TGSI_OPCODE_ENDIF:
/* pop CondMask */
ASSERT(mach->CondStackTop > 0);
mach->CondMask = mach->CondStack[--mach->CondStackTop];
UPDATE_EXEC_MASK(mach);
break;
case TGSI_OPCODE_END:
/* halt execution */
*pc = -1;
break;
case TGSI_OPCODE_REP:
ASSERT (0);
break;
case TGSI_OPCODE_ENDREP:
ASSERT (0);
break;
case TGSI_OPCODE_PUSHA:
ASSERT (0);
break;
case TGSI_OPCODE_POPA:
ASSERT (0);
break;
case TGSI_OPCODE_CEIL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_ceil(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_I2F:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = si_csflt(r[0].q, 0);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_NOT:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = si_xorbi(r[0].q, 0xff);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_TRUNC:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
r[0].q = micro_trunc(r[0].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SHL:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_shl(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_ISHR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = micro_ishr(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_AND:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_and(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_OR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_or(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_MOD:
ASSERT (0);
break;
case TGSI_OPCODE_XOR:
FOR_EACH_ENABLED_CHANNEL( *inst, chan_index ) {
FETCH( &r[0], 0, chan_index );
FETCH( &r[1], 1, chan_index );
r[0].q = si_xor(r[0].q, r[1].q);
STORE( &r[0], 0, chan_index );
}
break;
case TGSI_OPCODE_SAD:
ASSERT (0);
break;
case TGSI_OPCODE_TXF:
ASSERT (0);
break;
case TGSI_OPCODE_TXQ:
ASSERT (0);
break;
case TGSI_OPCODE_EMIT:
mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] += 16;
mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]]++;
break;
case TGSI_OPCODE_ENDPRIM:
mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]++;
mach->Primitives[mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0]] = 0;
break;
case TGSI_OPCODE_BGNFOR:
/* fall-through (for now) */
case TGSI_OPCODE_BGNLOOP:
/* push LoopMask and ContMasks */
ASSERT(mach->LoopStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
mach->LoopStack[mach->LoopStackTop++] = mach->LoopMask;
ASSERT(mach->ContStackTop < TGSI_EXEC_MAX_LOOP_NESTING);
mach->ContStack[mach->ContStackTop++] = mach->ContMask;
break;
case TGSI_OPCODE_ENDFOR:
/* fall-through (for now at least) */
case TGSI_OPCODE_ENDLOOP:
/* Restore ContMask, but don't pop */
ASSERT(mach->ContStackTop > 0);
mach->ContMask = mach->ContStack[mach->ContStackTop - 1];
if (mach->LoopMask) {
/* repeat loop: jump to instruction just past BGNLOOP */
*pc = inst->InstructionExtLabel.Label + 1;
}
else {
/* exit loop: pop LoopMask */
ASSERT(mach->LoopStackTop > 0);
mach->LoopMask = mach->LoopStack[--mach->LoopStackTop];
/* pop ContMask */
ASSERT(mach->ContStackTop > 0);
mach->ContMask = mach->ContStack[--mach->ContStackTop];
}
UPDATE_EXEC_MASK(mach);
break;
case TGSI_OPCODE_BRK:
/* turn off loop channels for each enabled exec channel */
mach->LoopMask &= ~mach->ExecMask;
/* Todo: if mach->LoopMask == 0, jump to end of loop */
UPDATE_EXEC_MASK(mach);
break;
case TGSI_OPCODE_CONT:
/* turn off cont channels for each enabled exec channel */
mach->ContMask &= ~mach->ExecMask;
/* Todo: if mach->LoopMask == 0, jump to end of loop */
UPDATE_EXEC_MASK(mach);
break;
case TGSI_OPCODE_BGNSUB:
/* no-op */
break;
case TGSI_OPCODE_ENDSUB:
/* no-op */
break;
case TGSI_OPCODE_NOP:
break;
default:
ASSERT( 0 );
}
}
/**
* Run TGSI interpreter.
* \return bitmask of "alive" quad components
*/
uint
spu_exec_machine_run( struct spu_exec_machine *mach )
{
uint i;
int pc = 0;
mach->CondMask = 0xf;
mach->LoopMask = 0xf;
mach->ContMask = 0xf;
mach->FuncMask = 0xf;
mach->ExecMask = 0xf;
mach->CondStackTop = 0; /* temporarily subvert this ASSERTion */
ASSERT(mach->CondStackTop == 0);
ASSERT(mach->LoopStackTop == 0);
ASSERT(mach->ContStackTop == 0);
ASSERT(mach->CallStackTop == 0);
mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0] = 0;
mach->Temps[TEMP_OUTPUT_I].xyzw[TEMP_OUTPUT_C].u[0] = 0;
if( mach->Processor == TGSI_PROCESSOR_GEOMETRY ) {
mach->Temps[TEMP_PRIMITIVE_I].xyzw[TEMP_PRIMITIVE_C].u[0] = 0;
mach->Primitives[0] = 0;
}
/* execute declarations (interpolants) */
if( mach->Processor == TGSI_PROCESSOR_FRAGMENT ) {
for (i = 0; i < mach->NumDeclarations; i++) {
union {
struct tgsi_full_declaration decl;
qword buffer[ROUNDUP16(sizeof(struct tgsi_full_declaration)) / 16];
} d ALIGN16_ATTRIB;
unsigned ea = (unsigned) (mach->Declarations + pc);
spu_dcache_fetch_unaligned(d.buffer, ea, sizeof(d.decl));
exec_declaration( mach, &d.decl );
}
}
/* execute instructions, until pc is set to -1 */
while (pc != -1) {
union {
struct tgsi_full_instruction inst;
qword buffer[ROUNDUP16(sizeof(struct tgsi_full_instruction)) / 16];
} i ALIGN16_ATTRIB;
unsigned ea = (unsigned) (mach->Instructions + pc);
spu_dcache_fetch_unaligned(i.buffer, ea, sizeof(i.inst));
exec_instruction( mach, & i.inst, &pc );
}
#if 0
/* we scale from floats in [0,1] to Zbuffer ints in sp_quad_depth_test.c */
if (mach->Processor == TGSI_PROCESSOR_FRAGMENT) {
/*
* Scale back depth component.
*/
for (i = 0; i < 4; i++)
mach->Outputs[0].xyzw[2].f[i] *= ctx->DrawBuffer->_DepthMaxF;
}
#endif
return ~mach->Temps[TEMP_KILMASK_I].xyzw[TEMP_KILMASK_C].u[0];
}