/*
* Mesa 3-D graphics library
* Version: 6.3
*
* Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Translate TGSI vertex programs to x86/x87/SSE/SSE2 machine code
* using the rtasm runtime assembler. Based on the old
* t_vb_arb_program_sse.c
*/
#include "util/u_memory.h"
#include "util/u_math.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_debug.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_dump.h"
#include "draw_vs.h"
#include "draw_vs_aos.h"
#include "rtasm/rtasm_x86sse.h"
#ifdef PIPE_ARCH_X86
#define DISASSEM 0
#define FAST_MATH 1
static const char *files[] =
{
"NULL",
"CONST",
"IN",
"OUT",
"TEMP",
"SAMP",
"ADDR",
"IMM",
"INTERNAL",
};
static INLINE boolean eq( struct x86_reg a,
struct x86_reg b )
{
return (a.file == b.file &&
a.idx == b.idx &&
a.mod == b.mod &&
a.disp == b.disp);
}
struct x86_reg aos_get_x86( struct aos_compilation *cp,
unsigned which_reg, /* quick hack */
unsigned value )
{
struct x86_reg reg;
if (which_reg == 0)
reg = cp->temp_EBP;
else
reg = cp->tmp_EAX;
if (cp->x86_reg[which_reg] != value) {
unsigned offset;
switch (value) {
case X86_IMMEDIATES:
assert(which_reg == 0);
offset = Offset(struct aos_machine, immediates);
break;
case X86_CONSTANTS:
assert(which_reg == 1);
offset = Offset(struct aos_machine, constants);
break;
case X86_BUFFERS:
assert(which_reg == 0);
offset = Offset(struct aos_machine, buffer);
break;
default:
assert(0);
offset = 0;
}
x86_mov(cp->func, reg,
x86_make_disp(cp->machine_EDX, offset));
cp->x86_reg[which_reg] = value;
}
return reg;
}
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
struct x86_reg ptr = cp->machine_EDX;
switch (file) {
case TGSI_FILE_INPUT:
assert(idx < MAX_INPUTS);
return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
case TGSI_FILE_OUTPUT:
return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
case TGSI_FILE_TEMPORARY:
assert(idx < MAX_TEMPS);
return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
case AOS_FILE_INTERNAL:
assert(idx < MAX_INTERNALS);
return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
case TGSI_FILE_IMMEDIATE:
assert(idx < MAX_IMMEDIATES); /* just a sanity check */
return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
case TGSI_FILE_CONSTANT:
assert(idx < MAX_CONSTANTS); /* just a sanity check */
return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
default:
AOS_ERROR(cp, "unknown reg file");
return x86_make_reg(0,0);
}
}
#define X87_CW_EXCEPTION_INV_OP (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
#define X87_CW_EXCEPTION_PRECISION (1<<5)
#define X87_CW_PRECISION_SINGLE (0<<8)
#define X87_CW_PRECISION_RESERVED (1<<8)
#define X87_CW_PRECISION_DOUBLE (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
#define X87_CW_PRECISION_MASK (3<<8)
#define X87_CW_ROUND_NEAREST (0<<10)
#define X87_CW_ROUND_DOWN (1<<10)
#define X87_CW_ROUND_UP (2<<10)
#define X87_CW_ROUND_ZERO (3<<10)
#define X87_CW_ROUND_MASK (3<<10)
#define X87_CW_INFINITY (1<<12)
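/* For illustration only: a control word that masks all exceptions,
* keeps single precision and rounds toward -inf (as FLR/FRC below
* require) would be composed as
*
*    X87_CW_EXCEPTION_INV_OP | X87_CW_EXCEPTION_DENORM_OP |
*    X87_CW_EXCEPTION_ZERO_DIVIDE | X87_CW_EXCEPTION_OVERFLOW |
*    X87_CW_EXCEPTION_UNDERFLOW | X87_CW_EXCEPTION_PRECISION |
*    X87_CW_PRECISION_SINGLE | X87_CW_ROUND_DOWN
*
* The control words actually loaded below (fpu_rnd_nearest,
* fpu_rnd_neg_inf) are precomputed fields of struct aos_machine.
*/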
static void spill( struct aos_compilation *cp, unsigned idx )
{
if (!cp->xmm[idx].dirty ||
(cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
AOS_ERROR(cp, "invalid spill");
return;
}
else {
struct x86_reg oldval = get_reg_ptr(cp,
cp->xmm[idx].file,
cp->xmm[idx].idx);
if (0) debug_printf("\nspill %s[%d]",
files[cp->xmm[idx].file],
cp->xmm[idx].idx);
assert(cp->xmm[idx].dirty);
sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
cp->xmm[idx].dirty = 0;
}
}
void aos_spill_all( struct aos_compilation *cp )
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
}
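/* Return an xmm reg the caller may safely overwrite: the reg itself if
* it is already an xmm temporary not adopted by any shader reg,
* otherwise a fresh xmm copy of the value.
*/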
static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file != file_XMM ||
cp->xmm[reg.idx].file != TGSI_FILE_NULL)
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, reg);
reg = tmp;
}
cp->xmm[reg.idx].last_used = cp->insn_counter;
return reg;
}
static struct x86_reg get_xmm( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file != file_XMM)
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, reg);
reg = tmp;
}
cp->xmm[reg.idx].last_used = cp->insn_counter;
return reg;
}
/* Allocate an empty xmm register, either as a temporary or later to
* "adopt" as a shader reg.
*/
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
unsigned i;
unsigned oldest = 0;
boolean found = FALSE;
for (i = 0; i < 8; i++)
if (cp->xmm[i].last_used != cp->insn_counter &&
cp->xmm[i].file == TGSI_FILE_NULL) {
oldest = i;
found = TRUE;
}
if (!found) {
for (i = 0; i < 8; i++)
if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
oldest = i;
}
/* Need to write out the old value?
*/
if (cp->xmm[oldest].dirty)
spill(cp, oldest);
assert(cp->xmm[oldest].last_used != cp->insn_counter);
cp->xmm[oldest].file = TGSI_FILE_NULL;
cp->xmm[oldest].idx = 0;
cp->xmm[oldest].dirty = 0;
cp->xmm[oldest].last_used = cp->insn_counter;
return x86_make_reg(file_XMM, oldest);
}
void aos_release_xmm_reg( struct aos_compilation *cp,
unsigned idx )
{
cp->xmm[idx].file = TGSI_FILE_NULL;
cp->xmm[idx].idx = 0;
cp->xmm[idx].dirty = 0;
cp->xmm[idx].last_used = 0;
}
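/* Unlike a full release, just backdate the reg's last_used stamp so
* the allocator no longer considers it in use by the current
* instruction and may reallocate it, without forgetting what it holds.
*/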
static void aos_soft_release_xmm( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file == file_XMM) {
assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
}
}
/* Mark an xmm reg as holding the current copy of a shader reg.
*/
void aos_adopt_xmm_reg( struct aos_compilation *cp,
struct x86_reg reg,
unsigned file,
unsigned idx,
unsigned dirty )
{
unsigned i;
if (reg.file != file_XMM) {
assert(0);
return;
}
/* If any xmm reg thinks it holds this shader reg, break the
* illusion.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
/* If an xmm reg is already holding this shader reg, take into account its
* dirty flag...
*/
dirty |= cp->xmm[i].dirty;
aos_release_xmm_reg(cp, i);
}
}
cp->xmm[reg.idx].file = file;
cp->xmm[reg.idx].idx = idx;
cp->xmm[reg.idx].dirty = dirty;
cp->xmm[reg.idx].last_used = cp->insn_counter;
}
/* Return a pointer to the in-memory copy of the reg, making sure it is up-to-date.
*/
static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
unsigned i;
/* Ensure the in-memory copy of this reg is up-to-date
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx &&
cp->xmm[i].dirty) {
spill(cp, i);
}
}
return get_reg_ptr( cp, file, idx );
}
/* As above, but also invalidate any xmm copies of the reg. Note -
* the returned pointer may alias those returned for the source
* arguments.
*/
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
const struct tgsi_full_dst_register *dst )
{
unsigned file = dst->Register.File;
unsigned idx = dst->Register.Index;
unsigned i;
/* Ensure in-memory copy of this reg is up-to-date and invalidate
* any xmm copies.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
}
return get_reg_ptr( cp, file, idx );
}
/* Return an XMM reg if the argument is resident, otherwise return a
* base+offset pointer to the saved value.
*/
struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
cp->xmm[i].last_used = cp->insn_counter;
return x86_make_reg(file_XMM, i);
}
}
/* If not found in the XMM register file, return an indirect
* reference to the in-memory copy:
*/
return get_reg_ptr( cp, file, idx );
}
static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
struct x86_reg reg = get_xmm( cp,
aos_get_shader_reg( cp, file, idx ) );
aos_adopt_xmm_reg( cp,
reg,
file,
idx,
FALSE );
return reg;
}
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
unsigned imm )
{
return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
struct x86_reg aos_get_internal( struct aos_compilation *cp,
unsigned imm )
{
return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
/* Emulate pshufd insn in regular SSE, if necessary:
*/
static void emit_pshufd( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg arg0,
ubyte shuf )
{
if (cp->have_sse2) {
sse2_pshufd(cp->func, dst, arg0, shuf);
}
else {
if (!eq(dst, arg0))
sse_movaps(cp->func, dst, arg0);
sse_shufps(cp->func, dst, dst, shuf);
}
}
/* load masks (pack into negs??)
* pshufd - build a 0 / ~0 channel mask from the writemask
* and - dst, mask (keeps the unwritten channels of dst)
* andn - mask, result (selects the written channels of result)
* or - dst, result
*/
static boolean mask_write( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg result,
unsigned mask )
{
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
struct x86_reg tmp = aos_get_xmm_reg(cp);
emit_pshufd(cp, tmp, imm_swz,
SHUF((mask & 1) ? 2 : 3,
(mask & 2) ? 2 : 3,
(mask & 4) ? 2 : 3,
(mask & 8) ? 2 : 3));
sse_andps(cp->func, dst, tmp);
sse_andnps(cp->func, tmp, result);
sse_orps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
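#if 0
/* Reference semantics of mask_write() above, as plain C (an
 * illustrative sketch only -- not compiled or used):
 */
static void mask_write_ref( float dst[4],
                            const float result[4],
                            unsigned mask )
{
   unsigned i;
   for (i = 0; i < 4; i++)
      if (mask & (1 << i))
         dst[i] = result[i];
}
#endif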
/* Helper for writemask:
*/
static boolean emit_shuf_copy2( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg arg0,
struct x86_reg arg1,
ubyte shuf )
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
emit_pshufd(cp, dst, arg1, shuf);
emit_pshufd(cp, tmp, arg0, shuf);
sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
emit_pshufd(cp, dst, dst, shuf);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
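/* Swizzles pack one 2-bit source-channel selector per destination
* channel, so e.g. SHUF(W, Z, Y, X) == (3<<0)|(2<<2)|(1<<4)|(0<<6)
* reverses the four channels.
*/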
/* Locate a source register and apply any required swizzle, negation
* or absolute-value modifiers.
*
* Just fail on the one unsupported combination (partial abs) at this
* point.
*/
static struct x86_reg fetch_src( struct aos_compilation *cp,
const struct tgsi_full_src_register *src )
{
struct x86_reg arg0 = aos_get_shader_reg(cp,
src->Register.File,
src->Register.Index);
unsigned i;
ubyte swz = 0;
unsigned negs = 0;
unsigned abs = 0;
for (i = 0; i < 4; i++) {
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
swz |= (swizzle & 0x3) << (i * 2);
switch (neg) {
case TGSI_UTIL_SIGN_TOGGLE:
negs |= (1<<i);
break;
case TGSI_UTIL_SIGN_KEEP:
break;
case TGSI_UTIL_SIGN_CLEAR:
abs |= (1<<i);
break;
default:
AOS_ERROR(cp, "unsupported sign-mode");
break;
}
}
if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
struct x86_reg dst = aos_get_xmm_reg(cp);
if (swz != SSE_SWIZZLE_NOOP)
emit_pshufd(cp, dst, arg0, swz);
else
sse_movaps(cp->func, dst, arg0);
if (negs && negs != 0xf) {
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
struct x86_reg tmp = aos_get_xmm_reg(cp);
/* Load 1,-1,0,0
* Use neg as arg to pshufd
* Multiply
*/
emit_pshufd(cp, tmp, imm_swz,
SHUF((negs & 1) ? 1 : 0,
(negs & 2) ? 1 : 0,
(negs & 4) ? 1 : 0,
(negs & 8) ? 1 : 0));
sse_mulps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
aos_soft_release_xmm(cp, imm_swz);
}
else if (negs) {
struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
sse_mulps(cp->func, dst, imm_negs);
aos_soft_release_xmm(cp, imm_negs);
}
if (abs && abs != 0xf) {
AOS_ERROR(cp, "unsupported partial abs");
}
else if (abs) {
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, dst);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
aos_soft_release_xmm(cp, neg);
}
aos_soft_release_xmm(cp, arg0);
return dst;
}
return arg0;
}
static void x87_fld_src( struct aos_compilation *cp,
const struct tgsi_full_src_register *src,
unsigned channel )
{
struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
src->Register.File,
src->Register.Index);
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
switch (neg) {
case TGSI_UTIL_SIGN_TOGGLE:
/* Flip the sign:
*/
x87_fchs( cp->func );
break;
case TGSI_UTIL_SIGN_KEEP:
break;
case TGSI_UTIL_SIGN_CLEAR:
x87_fabs( cp->func );
break;
case TGSI_UTIL_SIGN_SET:
x87_fabs( cp->func );
x87_fchs( cp->func );
break;
default:
AOS_ERROR(cp, "unsupported sign-mode");
break;
}
}
/* Used to implement write masking. This and most of the other instructions
* here would be easier to implement if there had been a translation
* to a 2 argument format (dst/arg0, arg1) at the shader level before
* attempting to translate to x86/sse code.
*/
static void store_dest( struct aos_compilation *cp,
const struct tgsi_full_dst_register *reg,
struct x86_reg result )
{
struct x86_reg dst;
switch (reg->Register.WriteMask) {
case 0:
return;
case TGSI_WRITEMASK_XYZW:
aos_adopt_xmm_reg(cp,
get_xmm_writable(cp, result),
reg->Register.File,
reg->Register.Index,
TRUE);
return;
default:
break;
}
dst = aos_get_shader_reg_xmm(cp,
reg->Register.File,
reg->Register.Index);
switch (reg->Register.WriteMask) {
case TGSI_WRITEMASK_X:
sse_movss(cp->func, dst, get_xmm(cp, result));
break;
case TGSI_WRITEMASK_ZW:
/* shufps takes its low two channels from dst, high two from result: */
sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
break;
case TGSI_WRITEMASK_XY:
/* As above, operands swapped: merge result's new xy with dst's old zw: */
result = get_xmm_writable(cp, result);
sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
dst = result;
break;
case TGSI_WRITEMASK_YZW:
/* movss writes only the low channel, preserving result's yzw: */
result = get_xmm_writable(cp, result);
sse_movss(cp->func, result, dst);
dst = result;
break;
default:
mask_write(cp, dst, result, reg->Register.WriteMask);
break;
}
aos_adopt_xmm_reg(cp,
dst,
reg->Register.File,
reg->Register.Index,
TRUE);
}
static void inject_scalar( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg result,
ubyte swizzle )
{
sse_shufps(cp->func, dst, dst, swizzle);
sse_movss(cp->func, dst, result);
sse_shufps(cp->func, dst, dst, swizzle);
}
static void store_scalar_dest( struct aos_compilation *cp,
const struct tgsi_full_dst_register *reg,
struct x86_reg result )
{
unsigned writemask = reg->Register.WriteMask;
struct x86_reg dst;
if (writemask != TGSI_WRITEMASK_X &&
writemask != TGSI_WRITEMASK_Y &&
writemask != TGSI_WRITEMASK_Z &&
writemask != TGSI_WRITEMASK_W &&
writemask != 0)
{
result = get_xmm_writable(cp, result); /* already true, right? */
sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
store_dest(cp, reg, result);
return;
}
result = get_xmm(cp, result);
dst = aos_get_shader_reg_xmm(cp,
reg->Register.File,
reg->Register.Index);
switch (reg->Register.WriteMask) {
case TGSI_WRITEMASK_X:
sse_movss(cp->func, dst, result);
break;
case TGSI_WRITEMASK_Y:
inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
break;
case TGSI_WRITEMASK_Z:
inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
break;
case TGSI_WRITEMASK_W:
inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
break;
default:
break;
}
aos_adopt_xmm_reg(cp,
dst,
reg->Register.File,
reg->Register.Index,
TRUE);
}
static void x87_fst_or_nop( struct x86_function *func,
unsigned writemask,
unsigned channel,
struct x86_reg ptr )
{
assert(ptr.file == file_REG32);
if (writemask & (1<<channel))
x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
}
static void x87_fstp_or_pop( struct x86_function *func,
unsigned writemask,
unsigned channel,
struct x86_reg ptr )
{
assert(ptr.file == file_REG32);
if (writemask & (1<<channel))
x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
else
x87_fstp( func, x86_make_reg( file_x87, 0 )); /* fstp st(0) just pops */
}
/* Store the scalar at the top of the x87 stack into each channel
* selected by the writemask, popping the stack on the last store.
*/
static void x87_fstp_dest4( struct aos_compilation *cp,
const struct tgsi_full_dst_register *dst )
{
struct x86_reg ptr = get_dst_ptr(cp, dst);
unsigned writemask = dst->Register.WriteMask;
x87_fst_or_nop(cp->func, writemask, 0, ptr);
x87_fst_or_nop(cp->func, writemask, 1, ptr);
x87_fst_or_nop(cp->func, writemask, 2, ptr);
x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}
/* Save current x87 state and put it into single precision mode.
*/
static void save_fpu_state( struct aos_compilation *cp )
{
x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_restore)));
}
static void restore_fpu_state( struct aos_compilation *cp )
{
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_restore)));
}
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
{
if (cp->fpucntl != FPU_RND_NEG) {
cp->fpucntl = FPU_RND_NEG;
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_rnd_neg_inf)));
}
}
static void set_fpu_round_nearest( struct aos_compilation *cp )
{
if (cp->fpucntl != FPU_RND_NEAREST) {
cp->fpucntl = FPU_RND_NEAREST;
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_rnd_nearest)));
}
}
#if 0
static void x87_emit_ex2( struct aos_compilation *cp )
{
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
int stack = cp->func->x87_stack;
/* set_fpu_round_neg_inf( cp ); */
x87_fld(cp->func, st0); /* a a */
x87_fprndint( cp->func ); /* int(a) a*/
x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
x87_fxch(cp->func, st1); /* frc(a) int(a) */
x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
/* 2^a int(a) */
x87_fstp(cp->func, st1); /* 2^a */
assert( stack == cp->func->x87_stack);
}
#endif
#if 0
static void PIPE_CDECL print_reg( const char *msg,
const float *reg )
{
debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#if 0
static void emit_print( struct aos_compilation *cp,
const char *message, /* must point to a static string! */
unsigned file,
unsigned idx )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
unsigned i;
/* There shouldn't be anything on the x87 stack. Can add this
* capacity later if need be.
*/
assert(cp->func->x87_stack == 0);
/* For absolute correctness, need to spill/invalidate all XMM regs
* too. We're obviously not concerned about performance on this
* debug path, so here goes:
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
/* Push the arguments:
*/
x86_lea( cp->func, ecx, arg );
x86_push( cp->func, ecx );
x86_push_imm32( cp->func, (int)message );
/* Call the helper. Could call debug_printf directly, but
* print_reg is a nice place to put a breakpoint if need be.
*/
x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
x86_call( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
/* Pop caller-save regs
*/
x86_cdecl_caller_pop_regs( cp->func );
/* Done...
*/
}
#endif
/**
* The traditional instructions. All operate on internal registers
* and ignore write masks and swizzling issues.
*/
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, arg0);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, tmp, arg0);
store_dest(cp, &op->Dst[0], tmp);
return TRUE;
}
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_addps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_fcos(cp->func);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
/* The dot-product instructions don't really do that well in SSE: the
* partial products have to be summed across the register with a
* series of shuffles and adds.
* XXX: produces wrong results -- disabled.
*/
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the first 3 values:
*/
sse_movhlps(cp->func, tmp, dst);
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the values:
*/
sse_movhlps(cp->func, tmp, dst);
sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the values (from DP3):
*/
sse_movhlps(cp->func, tmp, dst);
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = aos_get_xmm_reg(cp);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
/* dst[0] = 1.0 * 1.0F; */
/* dst[1] = arg0[1] * arg1[1]; */
/* dst[2] = arg0[2] * 1.0; */
/* dst[3] = 1.0 * arg1[3]; */
emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
sse_mulps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld1(cp->func); /* 1 */
x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
x87_fyl2x(cp->func); /* log2(a0) */
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
#if 0
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_emit_ex2(cp);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
#endif
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_neg_inf( cp );
/* Load all sources first to avoid aliasing
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fprndint( cp->func );
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_nearest( cp );
/* Load all sources first to avoid aliasing
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fprndint( cp->func );
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_neg_inf( cp );
/* suck all the source values onto the stack before writing out any
* dst, which may alias...
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fld(cp->func, st0); /* a a */
x87_fprndint( cp->func ); /* flr(a) a */
x87_fsubp(cp->func, st1); /* frc(a) */
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
unsigned writemask = op->Dst[0].Register.WriteMask;
unsigned lit_count = cp->lit_count++;
struct x86_reg result, arg0;
unsigned i;
#if 1
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
#endif
if (writemask != TGSI_WRITEMASK_XYZW)
result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
else
result = get_dst_ptr(cp, &op->Dst[0]);
arg0 = fetch_src( cp, &op->Src[0] );
if (arg0.file == file_XMM) {
struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, tmp[1]));
sse_movaps( cp->func, tmp, arg0 );
arg0 = tmp;
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
/* Push the arguments:
*/
x86_push_imm32( cp->func, lit_count );
x86_lea( cp->func, ecx, arg0 );
x86_push( cp->func, ecx );
x86_lea( cp->func, ecx, result );
x86_push( cp->func, ecx );
x86_push( cp->func, cp->machine_EDX );
if (lit_count < MAX_LIT_INFO) {
x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
Offset(struct aos_machine, lit_info) +
lit_count * sizeof(struct lit_info) +
Offset(struct lit_info, func)));
}
else {
x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
}
x86_call( cp->func, ecx );
x86_pop( cp->func, ecx ); /* fixme... */
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_cdecl_caller_pop_regs( cp->func );
if (writemask != TGSI_WRITEMASK_XYZW) {
store_dest( cp,
&op->Dst[0],
get_xmm_writable( cp, result ) );
}
return TRUE;
}
#if 0
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
if (writemask & TGSI_WRITEMASK_YZ) {
struct x86_reg st1 = x86_make_reg(file_x87, 1);
struct x86_reg st2 = x86_make_reg(file_x87, 2);
/* a1' = a1 <= 0 ? 1 : a1;
*/
x87_fldz(cp->func); /* 0 */
#if 1
x87_fld1(cp->func); /* 1 0 */
#else
/* Correct but slow due to fp exceptions generated in fyl2x - fix me.
*/
x87_fldz(cp->func); /* 0 0 */
#endif
x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */
x87_fcomi(cp->func, st2); /* a1 1 0 */
x87_fcmovb(cp->func, st1); /* a1' 1 0 */
x87_fstp(cp->func, st1); /* a1' 0 */
x87_fstp(cp->func, st1); /* a1' */
x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */
x87_fxch(cp->func, st1); /* a1' a3 */
/* Compute pow(a1, a3)
*/
x87_fyl2x(cp->func); /* a3*log2(a1) */
x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
/* a0' = max2(a0, 0):
*/
x87_fldz(cp->func); /* 0 r2 */
x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
x87_fcomi(cp->func, st1);
x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
x87_fcomi(cp->func, st1); /* a0' 0 r2 */
x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
x87_fpop(cp->func); /* r2 */
x87_fpop(cp->func);
}
if (writemask & TGSI_WRITEMASK_XW) {
x87_fld1(cp->func);
x87_fst_or_nop(cp->func, writemask, 0, dst);
x87_fstp_or_pop(cp->func, writemask, 3, dst);
}
return TRUE;
}
#endif
static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_maxps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_minps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
/* potentially nothing to do */
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
/* If we can't clobber old contents of arg0, get a temporary & copy
* it there, then clobber it...
*/
arg0 = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, arg0, arg1);
sse_addps(cp->func, arg0, arg2);
store_dest(cp, &op->Dst[0], arg0);
return TRUE;
}
/* A wrapper for powf().
* Makes sure it is cdecl and operates on floats.
*/
static float PIPE_CDECL _powerf( float x, float y )
{
#if FAST_MATH
return util_fast_pow(x, y);
#else
return powf( x, y );
#endif
}
#if FAST_MATH
static float PIPE_CDECL _exp2(float x)
{
return util_fast_exp2(x);
}
#endif
/* Really not sufficient -- need to check for conditions that could
* generate inf/nan values, which will slow things down hugely.
*/
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
x87_fld_src(cp, &op->Src[1], 0); /* a1.x */
x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */
x87_fyl2x(cp->func); /* a1*log2(a0) */
x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
x87_fstp_dest4(cp, &op->Dst[0]);
#else
uint i;
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
x87_fld_src( cp, &op->Src[1], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
x87_fld_src( cp, &op->Src[0], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
/* tmp_EAX has been pushed & will be restored below */
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
x86_call( cp->func, cp->tmp_EAX );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
x86_cdecl_caller_pop_regs( cp->func );
/* Note retval on x87 stack:
*/
cp->func->x87_stack++;
x87_fstp_dest4( cp, &op->Dst[0] );
#endif
return TRUE;
}
#if FAST_MATH
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
uint i;
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
x87_fld_src( cp, &op->Src[0], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
/* tmp_EAX has been pushed & will be restored below */
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
x86_call( cp->func, cp->tmp_EAX );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
x86_cdecl_caller_pop_regs( cp->func );
/* Note retval on x87 stack:
*/
cp->func->x87_stack++;
x87_fstp_dest4( cp, &op->Dst[0] );
return TRUE;
}
#endif
static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg dst = aos_get_xmm_reg(cp);
if (cp->have_sse2) {
sse2_rcpss(cp->func, dst, arg0);
/* extend precision here...
*/
}
else {
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
sse_movss(cp->func, dst, ones);
sse_divss(cp->func, dst, arg0);
}
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
* implementations, it is possible to improve their precision at
* fairly low cost, using a Newton-Raphson step, as below:
*
* x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
* x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
* or:
* x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
*
*
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
*/
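/* In scalar terms, the emit_RSQ() sequence below computes (with a
* hypothetical rsqrt_estimate() standing in for the ~12-bit rsqrtss
* result):
*
*    float x = fabsf(a);                    // movaps/mulps/maxps
*    float r = rsqrt_estimate(x);           // rsqrtss
*    r = r * (1.5f + (-0.5f * x) * r * r);  // one Newton-Raphson step
*/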
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
if (0) {
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg r = aos_get_xmm_reg(cp);
sse_rsqrtss(cp->func, r, arg0);
store_scalar_dest(cp, &op->Dst[0], r);
return TRUE;
}
else {
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg r = aos_get_xmm_reg(cp);
struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
struct x86_reg src = get_xmm_writable( cp, arg0 );
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, src);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, tmp, src);
sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
store_scalar_dest(cp, &op->Dst[0], r);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
}
static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
sse_andps(cp->func, dst, ones);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_fsin(cp->func);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_LessThan);
sse_andps(cp->func, dst, ones);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_subps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
sse2_cvttps2dq(cp->func, tmp0, arg0);
sse2_cvtdq2ps(cp->func, tmp0, tmp0);
store_dest(cp, &op->Dst[0], tmp0);
return TRUE;
}
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
struct x86_reg tmp1 = aos_get_xmm_reg(cp);
emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
sse_mulps(cp->func, tmp1, arg0);
emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
sse_mulps(cp->func, tmp0, arg1);
sse_subps(cp->func, tmp1, tmp0);
sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
/* dst[3] is undef */
aos_release_xmm_reg(cp, tmp0.idx);
store_dest(cp, &op->Dst[0], tmp1);
return TRUE;
}
static boolean
emit_instruction( struct aos_compilation *cp,
struct tgsi_full_instruction *inst )
{
x87_assert_stack_empty(cp->func);
switch( inst->Instruction.Opcode ) {
case TGSI_OPCODE_MOV:
return emit_MOV( cp, inst );
case TGSI_OPCODE_LIT:
return emit_LIT(cp, inst);
case TGSI_OPCODE_RCP:
return emit_RCP(cp, inst);
case TGSI_OPCODE_RSQ:
return emit_RSQ(cp, inst);
case TGSI_OPCODE_EXP:
/*return emit_EXP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_LOG:
/*return emit_LOG(cp, inst);*/
return FALSE;
case TGSI_OPCODE_MUL:
return emit_MUL(cp, inst);
case TGSI_OPCODE_ADD:
return emit_ADD(cp, inst);
case TGSI_OPCODE_DP3:
return emit_DP3(cp, inst);
case TGSI_OPCODE_DP4:
return emit_DP4(cp, inst);
case TGSI_OPCODE_DST:
return emit_DST(cp, inst);
case TGSI_OPCODE_MIN:
return emit_MIN(cp, inst);
case TGSI_OPCODE_MAX:
return emit_MAX(cp, inst);
case TGSI_OPCODE_SLT:
return emit_SLT(cp, inst);
case TGSI_OPCODE_SGE:
return emit_SGE(cp, inst);
case TGSI_OPCODE_MAD:
return emit_MAD(cp, inst);
case TGSI_OPCODE_SUB:
return emit_SUB(cp, inst);
case TGSI_OPCODE_LRP:
/*return emit_LERP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_FRC:
return emit_FRC(cp, inst);
case TGSI_OPCODE_CLAMP:
/*return emit_CLAMP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_FLR:
return emit_FLR(cp, inst);
case TGSI_OPCODE_ROUND:
return emit_RND(cp, inst);
case TGSI_OPCODE_EX2:
#if FAST_MATH
return emit_EXPBASE2(cp, inst);
#elif 0
/* this seems to fail for "larger" exponents.
* See glean tvertProg1's EX2 test.
*/
return emit_EX2(cp, inst);
#else
return FALSE;
#endif
case TGSI_OPCODE_LG2:
return emit_LG2(cp, inst);
case TGSI_OPCODE_POW:
return emit_POW(cp, inst);
case TGSI_OPCODE_XPD:
return emit_XPD(cp, inst);
case TGSI_OPCODE_ABS:
return emit_ABS(cp, inst);
case TGSI_OPCODE_DPH:
return emit_DPH(cp, inst);
case TGSI_OPCODE_COS:
return emit_COS(cp, inst);
case TGSI_OPCODE_SIN:
return emit_SIN(cp, inst);
case TGSI_OPCODE_TRUNC:
return emit_TRUNC(cp, inst);
case TGSI_OPCODE_END:
return TRUE;
default:
return FALSE;
}
}
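/* Apply the viewport transform to the position output:
* pos = pos * scale + translate.
*/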
static boolean emit_viewport( struct aos_compilation *cp )
{
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output );
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, translate));
sse_mulps(cp->func, pos, scale);
sse_addps(cp->func, pos, translate);
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
/* This is useful for being able to see the results on softpipe. It
* doesn't do proper clipping, just assumes the backend can do it
* during rasterization -- for debug only...
*/
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output);
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, translate));
emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
sse2_rcpss(cp->func, tmp, tmp);
sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
sse_mulps(cp->func, pos, scale);
sse_mulps(cp->func, pos, tmp);
sse_addps(cp->func, pos, translate);
/* Set pos[3] = 1/w
*/
mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
#if 0
static boolean note_immediate( struct aos_compilation *cp,
struct tgsi_full_immediate *imm )
{
unsigned pos = cp->num_immediates++;
unsigned j;
assert( imm->Immediate.NrTokens <= 4 + 1 );
for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
}
return TRUE;
}
#endif
static void find_last_write_outputs( struct aos_compilation *cp )
{
struct tgsi_parse_context parse;
unsigned this_instruction = 0;
unsigned i;
tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
while (!tgsi_parse_end_of_tokens( &parse )) {
tgsi_parse_token( &parse );
if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
continue;
for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
TGSI_FILE_OUTPUT)
{
unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
cp->output_last_write[idx] = this_instruction;
}
}
this_instruction++;
}
tgsi_parse_free( &parse );
}
#define ARG_MACHINE 1
#define ARG_START_ELTS 2
#define ARG_COUNT 3
#define ARG_OUTBUF 4
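/* Argument slots of the generated function, which is called as
* gen_run_{linear,elts}( machine, start_or_elts, count, output_buffer )
* -- see vaos_run_linear() and vaos_run_elts() below.
*/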
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
boolean linear )
{
struct tgsi_parse_context parse;
struct aos_compilation cp;
unsigned fixup, label;
util_init_math();
tgsi_parse_init( &parse, varient->base.vs->state.tokens );
memset(&cp, 0, sizeof(cp));
cp.insn_counter = 1;
cp.vaos = varient;
cp.have_sse2 = 1;
cp.func = &varient->func[ linear ? 0 : 1 ];
cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
x86_init_func(cp.func);
find_last_write_outputs(&cp);
x86_push(cp.func, cp.idx_EBX);
x86_push(cp.func, cp.count_ESI);
x86_push(cp.func, cp.temp_EBP);
/* Load arguments into regs:
*/
x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
/* Compare count to zero and possibly bail.
*/
x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
fixup = x86_jcc_forward(cp.func, cc_E);
save_fpu_state( &cp );
set_fpu_round_nearest( &cp );
aos_init_inputs( &cp, linear );
cp.x86_reg[0] = 0;
cp.x86_reg[1] = 0;
/* Note address for loop jump
*/
label = x86_get_label(cp.func);
{
/* Fetch inputs... TODO: fetch lazily...
*/
if (!aos_fetch_inputs( &cp, linear ))
goto fail;
/* Emit the shader:
*/
while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
{
tgsi_parse_token( &parse );
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
goto fail;
#endif
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
if (DISASSEM)
tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
goto fail;
break;
}
x87_assert_stack_empty(cp.func);
cp.insn_counter++;
if (DISASSEM)
debug_printf("\n");
}
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
cp.xmm[i].file = TGSI_FILE_NULL;
cp.xmm[i].dirty = 0;
}
}
}
if (cp.error)
goto fail;
if (cp.vaos->base.key.clip) {
/* not really handling clipping, just do the rhw so we can
* see the results...
*/
emit_rhw_viewport(&cp);
}
else if (cp.vaos->base.key.viewport) {
emit_viewport(&cp);
}
/* Emit output... TODO: do this eagerly after the last write to a
* given output.
*/
if (!aos_emit_outputs( &cp ))
goto fail;
/* Next vertex:
*/
x86_lea(cp.func,
cp.outbuf_ECX,
x86_make_disp(cp.outbuf_ECX,
cp.vaos->base.key.output_stride));
/* Incr index
*/
aos_incr_inputs( &cp, linear );
}
/* decr count, loop if not zero
*/
x86_dec(cp.func, cp.count_ESI);
x86_jcc(cp.func, cc_NZ, label);
restore_fpu_state(&cp);
/* Land forward jump here:
*/
x86_fixup_fwd_jump(cp.func, fixup);
/* Exit mmx state?
*/
if (cp.func->need_emms)
mmx_emms(cp.func);
x86_pop(cp.func, cp.temp_EBP);
x86_pop(cp.func, cp.count_ESI);
x86_pop(cp.func, cp.idx_EBX);
x87_assert_stack_empty(cp.func);
x86_ret(cp.func);
tgsi_parse_free( &parse );
return !cp.error;
fail:
tgsi_parse_free( &parse );
return FALSE;
}
static void vaos_set_buffer( struct draw_vs_varient *varient,
unsigned buf,
const void *ptr,
unsigned stride )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
if (buf < vaos->nr_vb) {
vaos->buffer[buf].base_ptr = (char *)ptr;
vaos->buffer[buf].stride = stride;
}
if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
}
static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
const unsigned *elts,
unsigned count,
void *output_buffer )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
if (0) debug_printf("%s %d\n", __FUNCTION__, count);
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
machine->buffer = vaos->buffer;
vaos->gen_run_elts( machine,
elts,
count,
output_buffer );
}
static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
unsigned start,
unsigned count,
void *output_buffer )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
vaos->base.key.const_vbuffers);
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
machine->buffer = vaos->buffer;
vaos->gen_run_linear( machine,
start,
count,
output_buffer );
/* Sanity spot checks to make sure we didn't trash our constants */
assert(machine->internal[IMM_ONES][0] == 1.0f);
assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
assert(machine->internal[IMM_NEGS][0] == -1.0f);
}
static void vaos_destroy( struct draw_vs_varient *varient )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
FREE( vaos->buffer );
x86_release_func( &vaos->func[0] );
x86_release_func( &vaos->func[1] );
FREE(vaos);
}
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
unsigned i;
struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
if (!vaos)
goto fail;
vaos->base.key = *key;
vaos->base.vs = vs;
vaos->base.set_buffer = vaos_set_buffer;
vaos->base.destroy = vaos_destroy;
vaos->base.run_linear = vaos_run_linear;
vaos->base.run_elts = vaos_run_elts;
vaos->draw = vs->draw;
for (i = 0; i < key->nr_inputs; i++)
vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
if (!vaos->buffer)
goto fail;
if (0)
debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
#if 0
tgsi_dump(vs->state.tokens, 0);
#endif
if (!build_vertex_program( vaos, TRUE ))
goto fail;
if (!build_vertex_program( vaos, FALSE ))
goto fail;
vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
if (!vaos->gen_run_linear)
goto fail;
vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
if (!vaos->gen_run_elts)
goto fail;
return &vaos->base;
fail:
if (vaos) {
FREE(vaos->buffer);
x86_release_func( &vaos->func[0] );
x86_release_func( &vaos->func[1] );
FREE(vaos);
}
return NULL;
}
struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
struct draw_vs_varient *varient = varient_aos_sse( vs, key );
if (varient == NULL) {
varient = draw_vs_varient_generic( vs, key );
}
return varient;
}
#endif /* PIPE_ARCH_X86 */