/*
* Mesa 3-D graphics library
* Version: 6.3
*
* Copyright (C) 1999-2004 Brian Paul All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* the rights to use, copy, modify, merge, publish, distribute, sublicense,
* and/or sell copies of the Software, and to permit persons to whom the
* Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included
* in all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
* OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
* BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
* AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
*/
/**
* Translate TGSI vertex programs to x86/x87/SSE/SSE2 machine code
* using the rtasm runtime assembler. Based on the old
* t_vb_arb_program_sse.c
*/
#include "util/u_memory.h"
#include "util/u_math.h"
#include "pipe/p_shader_tokens.h"
#include "util/u_debug.h"
#include "tgsi/tgsi_parse.h"
#include "tgsi/tgsi_util.h"
#include "tgsi/tgsi_exec.h"
#include "tgsi/tgsi_dump.h"
#include "draw_vs.h"
#include "draw_vs_aos.h"
#include "rtasm/rtasm_x86sse.h"
#ifdef PIPE_ARCH_X86
#define DISASSEM 0
#define FAST_MATH 1
static const char *files[] =
{
"NULL",
"CONST",
"IN",
"OUT",
"TEMP",
"SAMP",
"ADDR",
"IMM",
"INTERNAL",
};
static INLINE boolean eq( struct x86_reg a,
struct x86_reg b )
{
return (a.file == b.file &&
a.idx == b.idx &&
a.mod == b.mod &&
a.disp == b.disp);
}
struct x86_reg aos_get_x86( struct aos_compilation *cp,
unsigned which_reg, /* quick hack */
unsigned value )
{
struct x86_reg reg;
if (which_reg == 0)
reg = cp->temp_EBP;
else
reg = cp->tmp_EAX;
if (cp->x86_reg[which_reg] != value) {
unsigned offset;
switch (value) {
case X86_IMMEDIATES:
assert(which_reg == 0);
offset = Offset(struct aos_machine, immediates);
break;
case X86_CONSTANTS:
assert(which_reg == 1);
offset = Offset(struct aos_machine, constants);
break;
case X86_BUFFERS:
assert(which_reg == 0);
offset = Offset(struct aos_machine, buffer);
break;
default:
assert(0);
offset = 0;
}
x86_mov(cp->func, reg,
x86_make_disp(cp->machine_EDX, offset));
cp->x86_reg[which_reg] = value;
}
return reg;
}
static struct x86_reg get_reg_ptr(struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
struct x86_reg ptr = cp->machine_EDX;
switch (file) {
case TGSI_FILE_INPUT:
assert(idx < MAX_INPUTS);
return x86_make_disp(ptr, Offset(struct aos_machine, input[idx]));
case TGSI_FILE_OUTPUT:
return x86_make_disp(ptr, Offset(struct aos_machine, output[idx]));
case TGSI_FILE_TEMPORARY:
assert(idx < MAX_TEMPS);
return x86_make_disp(ptr, Offset(struct aos_machine, temp[idx]));
case AOS_FILE_INTERNAL:
assert(idx < MAX_INTERNALS);
return x86_make_disp(ptr, Offset(struct aos_machine, internal[idx]));
case TGSI_FILE_IMMEDIATE:
assert(idx < MAX_IMMEDIATES); /* just a sanity check */
return x86_make_disp(aos_get_x86(cp, 0, X86_IMMEDIATES), idx * 4 * sizeof(float));
case TGSI_FILE_CONSTANT:
assert(idx < MAX_CONSTANTS); /* just a sanity check */
return x86_make_disp(aos_get_x86(cp, 1, X86_CONSTANTS), idx * 4 * sizeof(float));
default:
AOS_ERROR(cp, "unknown reg file");
return x86_make_reg(0,0);
}
}
#define X87_CW_EXCEPTION_INV_OP (1<<0)
#define X87_CW_EXCEPTION_DENORM_OP (1<<1)
#define X87_CW_EXCEPTION_ZERO_DIVIDE (1<<2)
#define X87_CW_EXCEPTION_OVERFLOW (1<<3)
#define X87_CW_EXCEPTION_UNDERFLOW (1<<4)
#define X87_CW_EXCEPTION_PRECISION (1<<5)
#define X87_CW_PRECISION_SINGLE (0<<8)
#define X87_CW_PRECISION_RESERVED (1<<8)
#define X87_CW_PRECISION_DOUBLE (2<<8)
#define X87_CW_PRECISION_DOUBLE_EXT (3<<8)
#define X87_CW_PRECISION_MASK (3<<8)
#define X87_CW_ROUND_NEAREST (0<<10)
#define X87_CW_ROUND_DOWN (1<<10)
#define X87_CW_ROUND_UP (2<<10)
#define X87_CW_ROUND_ZERO (3<<10)
#define X87_CW_ROUND_MASK (3<<10)
#define X87_CW_INFINITY (1<<12)
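/* For illustration only: a control word that masks all exceptions,
* keeps single precision and rounds toward -inf (as FLR/FRC below
* require) would be composed as
*
*    X87_CW_EXCEPTION_INV_OP | X87_CW_EXCEPTION_DENORM_OP |
*    X87_CW_EXCEPTION_ZERO_DIVIDE | X87_CW_EXCEPTION_OVERFLOW |
*    X87_CW_EXCEPTION_UNDERFLOW | X87_CW_EXCEPTION_PRECISION |
*    X87_CW_PRECISION_SINGLE | X87_CW_ROUND_DOWN
*
* The control words actually loaded below (fpu_rnd_nearest,
* fpu_rnd_neg_inf) are precomputed fields of struct aos_machine.
*/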
static void spill( struct aos_compilation *cp, unsigned idx )
{
if (!cp->xmm[idx].dirty ||
(cp->xmm[idx].file != TGSI_FILE_INPUT && /* inputs are fetched into xmm & set dirty */
cp->xmm[idx].file != TGSI_FILE_OUTPUT &&
cp->xmm[idx].file != TGSI_FILE_TEMPORARY)) {
AOS_ERROR(cp, "invalid spill");
return;
}
else {
struct x86_reg oldval = get_reg_ptr(cp,
cp->xmm[idx].file,
cp->xmm[idx].idx);
if (0) debug_printf("\nspill %s[%d]",
files[cp->xmm[idx].file],
cp->xmm[idx].idx);
assert(cp->xmm[idx].dirty);
sse_movaps(cp->func, oldval, x86_make_reg(file_XMM, idx));
cp->xmm[idx].dirty = 0;
}
}
void aos_spill_all( struct aos_compilation *cp )
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
}
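/* Return an xmm reg the caller may safely overwrite: the reg itself if
* it is already an xmm temporary not adopted by any shader reg,
* otherwise a fresh xmm copy of the value.
*/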
static struct x86_reg get_xmm_writable( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file != file_XMM ||
cp->xmm[reg.idx].file != TGSI_FILE_NULL)
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, reg);
reg = tmp;
}
cp->xmm[reg.idx].last_used = cp->insn_counter;
return reg;
}
static struct x86_reg get_xmm( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file != file_XMM)
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, reg);
reg = tmp;
}
cp->xmm[reg.idx].last_used = cp->insn_counter;
return reg;
}
/* Allocate an empty xmm register, either as a temporary or later to
* "adopt" as a shader reg.
*/
struct x86_reg aos_get_xmm_reg( struct aos_compilation *cp )
{
unsigned i;
unsigned oldest = 0;
boolean found = FALSE;
for (i = 0; i < 8; i++)
if (cp->xmm[i].last_used != cp->insn_counter &&
cp->xmm[i].file == TGSI_FILE_NULL) {
oldest = i;
found = TRUE;
}
if (!found) {
for (i = 0; i < 8; i++)
if (cp->xmm[i].last_used < cp->xmm[oldest].last_used)
oldest = i;
}
/* Need to write out the old value?
*/
if (cp->xmm[oldest].dirty)
spill(cp, oldest);
assert(cp->xmm[oldest].last_used != cp->insn_counter);
cp->xmm[oldest].file = TGSI_FILE_NULL;
cp->xmm[oldest].idx = 0;
cp->xmm[oldest].dirty = 0;
cp->xmm[oldest].last_used = cp->insn_counter;
return x86_make_reg(file_XMM, oldest);
}
void aos_release_xmm_reg( struct aos_compilation *cp,
unsigned idx )
{
cp->xmm[idx].file = TGSI_FILE_NULL;
cp->xmm[idx].idx = 0;
cp->xmm[idx].dirty = 0;
cp->xmm[idx].last_used = 0;
}
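/* Unlike a full release, just backdate the reg's last_used stamp so
* the allocator no longer considers it in use by the current
* instruction and may reallocate it, without forgetting what it holds.
*/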
static void aos_soft_release_xmm( struct aos_compilation *cp,
struct x86_reg reg )
{
if (reg.file == file_XMM) {
assert(cp->xmm[reg.idx].last_used == cp->insn_counter);
cp->xmm[reg.idx].last_used = cp->insn_counter - 1;
}
}
/* Mark an xmm reg as holding the current copy of a shader reg.
*/
void aos_adopt_xmm_reg( struct aos_compilation *cp,
struct x86_reg reg,
unsigned file,
unsigned idx,
unsigned dirty )
{
unsigned i;
if (reg.file != file_XMM) {
assert(0);
return;
}
/* If any xmm reg thinks it holds this shader reg, break the
* illusion.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
/* If an xmm reg is already holding this shader reg, take into account its
* dirty flag...
*/
dirty |= cp->xmm[i].dirty;
aos_release_xmm_reg(cp, i);
}
}
cp->xmm[reg.idx].file = file;
cp->xmm[reg.idx].idx = idx;
cp->xmm[reg.idx].dirty = dirty;
cp->xmm[reg.idx].last_used = cp->insn_counter;
}
/* Return a pointer to the in-memory copy of the reg, making sure it is up-to-date.
*/
static struct x86_reg aos_get_shader_reg_ptr( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
unsigned i;
/* Ensure the in-memory copy of this reg is up-to-date
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx &&
cp->xmm[i].dirty) {
spill(cp, i);
}
}
return get_reg_ptr( cp, file, idx );
}
/* As above, but also invalidate any xmm copies of the reg. Note -
* the returned pointer may alias those returned for the source
* arguments.
*/
static struct x86_reg get_dst_ptr( struct aos_compilation *cp,
const struct tgsi_full_dst_register *dst )
{
unsigned file = dst->Register.File;
unsigned idx = dst->Register.Index;
unsigned i;
/* Ensure in-memory copy of this reg is up-to-date and invalidate
* any xmm copies.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
}
return get_reg_ptr( cp, file, idx );
}
/* Return an XMM reg if the argument is resident, otherwise return a
* base+offset pointer to the saved value.
*/
struct x86_reg aos_get_shader_reg( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp->xmm[i].file == file &&
cp->xmm[i].idx == idx)
{
cp->xmm[i].last_used = cp->insn_counter;
return x86_make_reg(file_XMM, i);
}
}
/* If not found in the XMM register file, return an indirect
* reference to the in-memory copy:
*/
return get_reg_ptr( cp, file, idx );
}
static struct x86_reg aos_get_shader_reg_xmm( struct aos_compilation *cp,
unsigned file,
unsigned idx )
{
struct x86_reg reg = get_xmm( cp,
aos_get_shader_reg( cp, file, idx ) );
aos_adopt_xmm_reg( cp,
reg,
file,
idx,
FALSE );
return reg;
}
struct x86_reg aos_get_internal_xmm( struct aos_compilation *cp,
unsigned imm )
{
return aos_get_shader_reg_xmm( cp, AOS_FILE_INTERNAL, imm );
}
struct x86_reg aos_get_internal( struct aos_compilation *cp,
unsigned imm )
{
return aos_get_shader_reg( cp, AOS_FILE_INTERNAL, imm );
}
/* Emulate pshufd insn in regular SSE, if necessary:
*/
static void emit_pshufd( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg arg0,
ubyte shuf )
{
if (cp->have_sse2) {
sse2_pshufd(cp->func, dst, arg0, shuf);
}
else {
if (!eq(dst, arg0))
sse_movaps(cp->func, dst, arg0);
sse_shufps(cp->func, dst, dst, shuf);
}
}
/* load masks (pack into negs??)
* pshufd - build a 0 / ~0 channel mask from the writemask
* and - dst, mask (keeps the unwritten channels of dst)
* andn - mask, result (selects the written channels of result)
* or - dst, result
*/
static boolean mask_write( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg result,
unsigned mask )
{
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
struct x86_reg tmp = aos_get_xmm_reg(cp);
emit_pshufd(cp, tmp, imm_swz,
SHUF((mask & 1) ? 2 : 3,
(mask & 2) ? 2 : 3,
(mask & 4) ? 2 : 3,
(mask & 8) ? 2 : 3));
sse_andps(cp->func, dst, tmp);
sse_andnps(cp->func, tmp, result);
sse_orps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
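#if 0
/* Reference semantics of mask_write() above, as plain C (an
 * illustrative sketch only -- not compiled or used):
 */
static void mask_write_ref( float dst[4],
                            const float result[4],
                            unsigned mask )
{
   unsigned i;
   for (i = 0; i < 4; i++)
      if (mask & (1 << i))
         dst[i] = result[i];
}
#endif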
/* Helper for writemask:
*/
static boolean emit_shuf_copy2( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg arg0,
struct x86_reg arg1,
ubyte shuf )
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
emit_pshufd(cp, dst, arg1, shuf);
emit_pshufd(cp, tmp, arg0, shuf);
sse_shufps(cp->func, dst, tmp, SHUF(X, Y, Z, W));
emit_pshufd(cp, dst, dst, shuf);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
#define SSE_SWIZZLE_NOOP ((0<<0) | (1<<2) | (2<<4) | (3<<6))
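/* Swizzles pack one 2-bit source-channel selector per destination
* channel, so e.g. SHUF(W, Z, Y, X) == (3<<0)|(2<<2)|(1<<4)|(0<<6)
* reverses the four channels.
*/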
/* Locate a source register and apply any required swizzle, negation
* or absolute-value modifiers.
*
* Just fail on the one unsupported combination (partial abs) at this
* point.
*/
static struct x86_reg fetch_src( struct aos_compilation *cp,
const struct tgsi_full_src_register *src )
{
struct x86_reg arg0 = aos_get_shader_reg(cp,
src->Register.File,
src->Register.Index);
unsigned i;
ubyte swz = 0;
unsigned negs = 0;
unsigned abs = 0;
for (i = 0; i < 4; i++) {
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, i );
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, i );
swz |= (swizzle & 0x3) << (i * 2);
switch (neg) {
case TGSI_UTIL_SIGN_TOGGLE:
negs |= (1<<i);
break;
case TGSI_UTIL_SIGN_KEEP:
break;
case TGSI_UTIL_SIGN_CLEAR:
abs |= (1<<i);
break;
default:
AOS_ERROR(cp, "unsupported sign-mode");
break;
}
}
if (swz != SSE_SWIZZLE_NOOP || negs != 0 || abs != 0) {
struct x86_reg dst = aos_get_xmm_reg(cp);
if (swz != SSE_SWIZZLE_NOOP)
emit_pshufd(cp, dst, arg0, swz);
else
sse_movaps(cp->func, dst, arg0);
if (negs && negs != 0xf) {
struct x86_reg imm_swz = aos_get_internal_xmm(cp, IMM_SWZ);
struct x86_reg tmp = aos_get_xmm_reg(cp);
/* Load 1,-1,0,0
* Use neg as arg to pshufd
* Multiply
*/
emit_pshufd(cp, tmp, imm_swz,
SHUF((negs & 1) ? 1 : 0,
(negs & 2) ? 1 : 0,
(negs & 4) ? 1 : 0,
(negs & 8) ? 1 : 0));
sse_mulps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
aos_soft_release_xmm(cp, imm_swz);
}
else if (negs) {
struct x86_reg imm_negs = aos_get_internal_xmm(cp, IMM_NEGS);
sse_mulps(cp->func, dst, imm_negs);
aos_soft_release_xmm(cp, imm_negs);
}
if (abs && abs != 0xf) {
AOS_ERROR(cp, "unsupported partial abs");
}
else if (abs) {
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, dst);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
aos_soft_release_xmm(cp, neg);
}
aos_soft_release_xmm(cp, arg0);
return dst;
}
return arg0;
}
static void x87_fld_src( struct aos_compilation *cp,
const struct tgsi_full_src_register *src,
unsigned channel )
{
struct x86_reg arg0 = aos_get_shader_reg_ptr(cp,
src->Register.File,
src->Register.Index);
unsigned swizzle = tgsi_util_get_full_src_register_swizzle( src, channel );
unsigned neg = tgsi_util_get_full_src_register_sign_mode( src, channel );
x87_fld( cp->func, x86_make_disp(arg0, (swizzle & 3) * sizeof(float)) );
switch (neg) {
case TGSI_UTIL_SIGN_TOGGLE:
/* Flip the sign:
*/
x87_fchs( cp->func );
break;
case TGSI_UTIL_SIGN_KEEP:
break;
case TGSI_UTIL_SIGN_CLEAR:
x87_fabs( cp->func );
break;
case TGSI_UTIL_SIGN_SET:
x87_fabs( cp->func );
x87_fchs( cp->func );
break;
default:
AOS_ERROR(cp, "unsupported sign-mode");
break;
}
}
/* Used to implement write masking. This and most of the other instructions
* here would be easier to implement if there had been a translation
* to a 2 argument format (dst/arg0, arg1) at the shader level before
* attempting to translate to x86/sse code.
*/
static void store_dest( struct aos_compilation *cp,
const struct tgsi_full_dst_register *reg,
struct x86_reg result )
{
struct x86_reg dst;
switch (reg->Register.WriteMask) {
case 0:
return;
case TGSI_WRITEMASK_XYZW:
aos_adopt_xmm_reg(cp,
get_xmm_writable(cp, result),
reg->Register.File,
reg->Register.Index,
TRUE);
return;
default:
break;
}
dst = aos_get_shader_reg_xmm(cp,
reg->Register.File,
reg->Register.Index);
switch (reg->Register.WriteMask) {
case TGSI_WRITEMASK_X:
sse_movss(cp->func, dst, get_xmm(cp, result));
break;
case TGSI_WRITEMASK_ZW:
/* shufps takes its low two channels from dst, high two from result: */
sse_shufps(cp->func, dst, get_xmm(cp, result), SHUF(X, Y, Z, W));
break;
case TGSI_WRITEMASK_XY:
/* As above, operands swapped: merge result's new xy with dst's old zw: */
result = get_xmm_writable(cp, result);
sse_shufps(cp->func, result, dst, SHUF(X, Y, Z, W));
dst = result;
break;
case TGSI_WRITEMASK_YZW:
/* movss writes only the low channel, preserving result's yzw: */
result = get_xmm_writable(cp, result);
sse_movss(cp->func, result, dst);
dst = result;
break;
default:
mask_write(cp, dst, result, reg->Register.WriteMask);
break;
}
aos_adopt_xmm_reg(cp,
dst,
reg->Register.File,
reg->Register.Index,
TRUE);
}
static void inject_scalar( struct aos_compilation *cp,
struct x86_reg dst,
struct x86_reg result,
ubyte swizzle )
{
sse_shufps(cp->func, dst, dst, swizzle);
sse_movss(cp->func, dst, result);
sse_shufps(cp->func, dst, dst, swizzle);
}
static void store_scalar_dest( struct aos_compilation *cp,
const struct tgsi_full_dst_register *reg,
struct x86_reg result )
{
unsigned writemask = reg->Register.WriteMask;
struct x86_reg dst;
if (writemask != TGSI_WRITEMASK_X &&
writemask != TGSI_WRITEMASK_Y &&
writemask != TGSI_WRITEMASK_Z &&
writemask != TGSI_WRITEMASK_W &&
writemask != 0)
{
result = get_xmm_writable(cp, result); /* already true, right? */
sse_shufps(cp->func, result, result, SHUF(X,X,X,X));
store_dest(cp, reg, result);
return;
}
result = get_xmm(cp, result);
dst = aos_get_shader_reg_xmm(cp,
reg->Register.File,
reg->Register.Index);
switch (reg->Register.WriteMask) {
case TGSI_WRITEMASK_X:
sse_movss(cp->func, dst, result);
break;
case TGSI_WRITEMASK_Y:
inject_scalar(cp, dst, result, SHUF(Y, X, Z, W));
break;
case TGSI_WRITEMASK_Z:
inject_scalar(cp, dst, result, SHUF(Z, Y, X, W));
break;
case TGSI_WRITEMASK_W:
inject_scalar(cp, dst, result, SHUF(W, Y, Z, X));
break;
default:
break;
}
aos_adopt_xmm_reg(cp,
dst,
reg->Register.File,
reg->Register.Index,
TRUE);
}
static void x87_fst_or_nop( struct x86_function *func,
unsigned writemask,
unsigned channel,
struct x86_reg ptr )
{
assert(ptr.file == file_REG32);
if (writemask & (1<<channel))
x87_fst( func, x86_make_disp(ptr, channel * sizeof(float)) );
}
static void x87_fstp_or_pop( struct x86_function *func,
unsigned writemask,
unsigned channel,
struct x86_reg ptr )
{
assert(ptr.file == file_REG32);
if (writemask & (1<<channel))
x87_fstp( func, x86_make_disp(ptr, channel * sizeof(float)) );
else
x87_fstp( func, x86_make_reg( file_x87, 0 )); /* fstp st(0) just pops */
}
/* Store the scalar at the top of the x87 stack into each channel
* selected by the writemask, popping the stack on the last store.
*/
static void x87_fstp_dest4( struct aos_compilation *cp,
const struct tgsi_full_dst_register *dst )
{
struct x86_reg ptr = get_dst_ptr(cp, dst);
unsigned writemask = dst->Register.WriteMask;
x87_fst_or_nop(cp->func, writemask, 0, ptr);
x87_fst_or_nop(cp->func, writemask, 1, ptr);
x87_fst_or_nop(cp->func, writemask, 2, ptr);
x87_fstp_or_pop(cp->func, writemask, 3, ptr);
}
/* Save current x87 state and put it into single precision mode.
*/
static void save_fpu_state( struct aos_compilation *cp )
{
x87_fnstcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_restore)));
}
static void restore_fpu_state( struct aos_compilation *cp )
{
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_restore)));
}
static void set_fpu_round_neg_inf( struct aos_compilation *cp )
{
if (cp->fpucntl != FPU_RND_NEG) {
cp->fpucntl = FPU_RND_NEG;
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_rnd_neg_inf)));
}
}
static void set_fpu_round_nearest( struct aos_compilation *cp )
{
if (cp->fpucntl != FPU_RND_NEAREST) {
cp->fpucntl = FPU_RND_NEAREST;
x87_fnclex(cp->func);
x87_fldcw( cp->func, x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, fpu_rnd_nearest)));
}
}
#if 0
static void x87_emit_ex2( struct aos_compilation *cp )
{
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
int stack = cp->func->x87_stack;
/* set_fpu_round_neg_inf( cp ); */
x87_fld(cp->func, st0); /* a a */
x87_fprndint( cp->func ); /* int(a) a*/
x87_fsubr(cp->func, st1, st0); /* int(a) frc(a) */
x87_fxch(cp->func, st1); /* frc(a) int(a) */
x87_f2xm1(cp->func); /* (2^frc(a))-1 int(a) */
x87_fld1(cp->func); /* 1 (2^frc(a))-1 int(a) */
x87_faddp(cp->func, st1); /* 2^frac(a) int(a) */
x87_fscale(cp->func); /* (2^frac(a)*2^int(int(a))) int(a) */
/* 2^a int(a) */
x87_fstp(cp->func, st1); /* 2^a */
assert( stack == cp->func->x87_stack);
}
#endif
#if 0
static void PIPE_CDECL print_reg( const char *msg,
const float *reg )
{
debug_printf("%s: %f %f %f %f\n", msg, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#if 0
static void emit_print( struct aos_compilation *cp,
const char *message, /* must point to a static string! */
unsigned file,
unsigned idx )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
struct x86_reg arg = aos_get_shader_reg_ptr( cp, file, idx );
unsigned i;
/* There shouldn't be anything on the x87 stack. Can add this
* capacity later if need be.
*/
assert(cp->func->x87_stack == 0);
/* For absolute correctness, need to spill/invalidate all XMM regs
* too. We're obviously not concerned about performance on this
* debug path, so here goes:
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
/* Push the arguments:
*/
x86_lea( cp->func, ecx, arg );
x86_push( cp->func, ecx );
x86_push_imm32( cp->func, (int)message );
/* Call the helper. Could call debug_printf directly, but
* print_reg is a nice place to put a breakpoint if need be.
*/
x86_mov_reg_imm( cp->func, ecx, (int)print_reg );
x86_call( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
/* Pop caller-save regs
*/
x86_cdecl_caller_pop_regs( cp->func );
/* Done...
*/
}
#endif
/**
* The traditional instructions. All operate on internal registers
* and ignore write masks and swizzling issues.
*/
static boolean emit_ABS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, arg0);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, tmp, arg0);
store_dest(cp, &op->Dst[0], tmp);
return TRUE;
}
static boolean emit_ADD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_addps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_COS( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_fcos(cp->func);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
/* The dot-product instructions don't really do that well in SSE: the
* partial products have to be summed across the register with a
* series of shuffles and adds.
* XXX: produces wrong results -- disabled.
*/
static boolean emit_DP3( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the first 3 values:
*/
sse_movhlps(cp->func, tmp, dst);
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DP4( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the values:
*/
sse_movhlps(cp->func, tmp, dst);
sse_addps(cp->func, dst, tmp); /* a*x+c*z, b*y+d*w, a*x+c*z, b*y+d*w */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DPH( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
/* Now the hard bit: sum the values (from DP3):
*/
sse_movhlps(cp->func, tmp, dst);
sse_addss(cp->func, dst, tmp); /* a*x+c*z, b*y, ?, ? */
emit_pshufd(cp, tmp, dst, SHUF(Y,X,W,Z));
sse_addss(cp->func, dst, tmp);
emit_pshufd(cp, tmp, arg1, SHUF(W,W,W,W));
sse_addss(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_DST( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = aos_get_xmm_reg(cp);
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
/* dst[0] = 1.0 * 1.0F; */
/* dst[1] = arg0[1] * arg1[1]; */
/* dst[2] = arg0[2] * 1.0; */
/* dst[3] = 1.0 * arg1[3]; */
emit_shuf_copy2(cp, dst, arg0, ones, SHUF(X,W,Z,Y));
emit_shuf_copy2(cp, tmp, arg1, ones, SHUF(X,Z,Y,W));
sse_mulps(cp->func, dst, tmp);
aos_release_xmm_reg(cp, tmp.idx);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_LG2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld1(cp->func); /* 1 */
x87_fld_src(cp, &op->Src[0], 0); /* a0 1 */
x87_fyl2x(cp->func); /* log2(a0) */
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
#if 0
static boolean emit_EX2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_emit_ex2(cp);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
#endif
static boolean emit_FLR( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_neg_inf( cp );
/* Load all sources first to avoid aliasing
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fprndint( cp->func );
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_RND( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_nearest( cp );
/* Load all sources first to avoid aliasing
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fprndint( cp->func );
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_FRC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
struct x86_reg st0 = x86_make_reg(file_x87, 0);
struct x86_reg st1 = x86_make_reg(file_x87, 1);
unsigned writemask = op->Dst[0].Register.WriteMask;
int i;
set_fpu_round_neg_inf( cp );
/* suck all the source values onto the stack before writing out any
* dst, which may alias...
*/
for (i = 3; i >= 0; i--) {
if (writemask & (1<<i)) {
x87_fld_src(cp, &op->Src[0], i);
}
}
for (i = 0; i < 4; i++) {
if (writemask & (1<<i)) {
x87_fld(cp->func, st0); /* a a */
x87_fprndint( cp->func ); /* flr(a) a */
x87_fsubp(cp->func, st1); /* frc(a) */
x87_fstp(cp->func, x86_make_disp(dst, i*4));
}
}
return TRUE;
}
static boolean emit_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg ecx = x86_make_reg( file_REG32, reg_CX );
unsigned writemask = op->Dst[0].Register.WriteMask;
unsigned lit_count = cp->lit_count++;
struct x86_reg result, arg0;
unsigned i;
#if 1
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
#endif
if (writemask != TGSI_WRITEMASK_XYZW)
result = x86_make_disp(cp->machine_EDX, Offset(struct aos_machine, tmp[0]));
else
result = get_dst_ptr(cp, &op->Dst[0]);
arg0 = fetch_src( cp, &op->Src[0] );
if (arg0.file == file_XMM) {
struct x86_reg tmp = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, tmp[1]));
sse_movaps( cp->func, tmp, arg0 );
arg0 = tmp;
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
/* Push the arguments:
*/
x86_push_imm32( cp->func, lit_count );
x86_lea( cp->func, ecx, arg0 );
x86_push( cp->func, ecx );
x86_lea( cp->func, ecx, result );
x86_push( cp->func, ecx );
x86_push( cp->func, cp->machine_EDX );
if (lit_count < MAX_LIT_INFO) {
x86_mov( cp->func, ecx, x86_make_disp( cp->machine_EDX,
Offset(struct aos_machine, lit_info) +
lit_count * sizeof(struct lit_info) +
Offset(struct lit_info, func)));
}
else {
x86_mov_reg_imm( cp->func, ecx, (int)aos_do_lit );
}
x86_call( cp->func, ecx );
x86_pop( cp->func, ecx ); /* fixme... */
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_pop( cp->func, ecx );
x86_cdecl_caller_pop_regs( cp->func );
if (writemask != TGSI_WRITEMASK_XYZW) {
store_dest( cp,
&op->Dst[0],
get_xmm_writable( cp, result ) );
}
return TRUE;
}
#if 0
static boolean emit_inline_LIT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg dst = get_dst_ptr(cp, &op->Dst[0]);
unsigned writemask = op->Dst[0].Register.WriteMask;
if (writemask & TGSI_WRITEMASK_YZ) {
struct x86_reg st1 = x86_make_reg(file_x87, 1);
struct x86_reg st2 = x86_make_reg(file_x87, 2);
/* a1' = a1 <= 0 ? 1 : a1;
*/
x87_fldz(cp->func); /* 0 */
#if 1
x87_fld1(cp->func); /* 1 0 */
#else
/* Correct but slow due to fp exceptions generated in fyl2x - fix me.
*/
x87_fldz(cp->func); /* 0 0 */
#endif
x87_fld_src(cp, &op->Src[0], 1); /* a1 1 0 */
x87_fcomi(cp->func, st2); /* a1 1 0 */
x87_fcmovb(cp->func, st1); /* a1' 1 0 */
x87_fstp(cp->func, st1); /* a1' 0 */
x87_fstp(cp->func, st1); /* a1' */
x87_fld_src(cp, &op->Src[0], 3); /* a3 a1' */
x87_fxch(cp->func, st1); /* a1' a3 */
/* Compute pow(a1, a3)
*/
x87_fyl2x(cp->func); /* a3*log2(a1) */
x87_emit_ex2( cp ); /* 2^(a3*log2(a1)) */
/* a0' = max2(a0, 0):
*/
x87_fldz(cp->func); /* 0 r2 */
x87_fld_src(cp, &op->Src[0], 0); /* a0 0 r2 */
x87_fcomi(cp->func, st1);
x87_fcmovb(cp->func, st1); /* a0' 0 r2 */
x87_fst_or_nop(cp->func, writemask, 1, dst); /* result[1] = a0' */
x87_fcomi(cp->func, st1); /* a0' 0 r2 */
x87_fcmovnbe(cp->func, st2); /* r2' 0' r2 */
x87_fstp_or_pop(cp->func, writemask, 2, dst); /* 0 r2 */
x87_fpop(cp->func); /* r2 */
x87_fpop(cp->func);
}
if (writemask & TGSI_WRITEMASK_XW) {
x87_fld1(cp->func);
x87_fst_or_nop(cp->func, writemask, 0, dst);
x87_fstp_or_pop(cp->func, writemask, 3, dst);
}
return TRUE;
}
#endif
static boolean emit_MAX( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_maxps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_minps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MOV( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
/* potentially nothing to do */
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MUL( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_MAD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg arg2 = fetch_src(cp, &op->Src[2]);
/* If we can't clobber old contents of arg0, get a temporary & copy
* it there, then clobber it...
*/
arg0 = get_xmm_writable(cp, arg0);
sse_mulps(cp->func, arg0, arg1);
sse_addps(cp->func, arg0, arg2);
store_dest(cp, &op->Dst[0], arg0);
return TRUE;
}
/* A wrapper for powf().
* Makes sure it is cdecl and operates on floats.
*/
static float PIPE_CDECL _powerf( float x, float y )
{
#if FAST_MATH
return util_fast_pow(x, y);
#else
return powf( x, y );
#endif
}
#if FAST_MATH
static float PIPE_CDECL _exp2(float x)
{
return util_fast_exp2(x);
}
#endif
/* Really not sufficient -- need to check for conditions that could
* generate inf/nan values, which will slow things down hugely.
*/
static boolean emit_POW( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
#if 0
x87_fld_src(cp, &op->Src[1], 0); /* a1.x */
x87_fld_src(cp, &op->Src[0], 0); /* a0.x a1.x */
x87_fyl2x(cp->func); /* a1*log2(a0) */
x87_emit_ex2( cp ); /* 2^(a1*log2(a0)) */
x87_fstp_dest4(cp, &op->Dst[0]);
#else
uint i;
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -8) );
x87_fld_src( cp, &op->Src[1], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 4 ) );
x87_fld_src( cp, &op->Src[0], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
/* tmp_EAX has been pushed & will be restored below */
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _powerf );
x86_call( cp->func, cp->tmp_EAX );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 8) );
x86_cdecl_caller_pop_regs( cp->func );
/* Note retval on x87 stack:
*/
cp->func->x87_stack++;
x87_fstp_dest4( cp, &op->Dst[0] );
#endif
return TRUE;
}
#if FAST_MATH
static boolean emit_EXPBASE2( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
uint i;
/* For absolute correctness, need to spill/invalidate all XMM regs
* too.
*/
for (i = 0; i < 8; i++) {
if (cp->xmm[i].dirty)
spill(cp, i);
aos_release_xmm_reg(cp, i);
}
/* Push caller-save (i.e. scratch) regs.
*/
x86_cdecl_caller_push_regs( cp->func );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, -4) );
x87_fld_src( cp, &op->Src[0], 0 );
x87_fstp( cp->func, x86_make_disp( cp->stack_ESP, 0 ) );
/* tmp_EAX has been pushed & will be restored below */
x86_mov_reg_imm( cp->func, cp->tmp_EAX, (unsigned long) _exp2 );
x86_call( cp->func, cp->tmp_EAX );
x86_lea( cp->func, cp->stack_ESP, x86_make_disp(cp->stack_ESP, 4) );
x86_cdecl_caller_pop_regs( cp->func );
/* Note retval on x87 stack:
*/
cp->func->x87_stack++;
x87_fstp_dest4( cp, &op->Dst[0] );
return TRUE;
}
#endif
static boolean emit_RCP( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg dst = aos_get_xmm_reg(cp);
if (cp->have_sse2) {
sse2_rcpss(cp->func, dst, arg0);
/* extend precision here...
*/
}
else {
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
sse_movss(cp->func, dst, ones);
sse_divss(cp->func, dst, arg0);
}
store_scalar_dest(cp, &op->Dst[0], dst);
return TRUE;
}
/* Although rsqrtps() and rcpps() are low precision on some/all SSE
* implementations, it is possible to improve their precision at
* fairly low cost, using a Newton-Raphson step, as below:
*
* x1 = 2 * rcpps(a) - a * rcpps(a) * rcpps(a)
* x1 = 0.5 * rsqrtps(a) * [3.0 - (a * rsqrtps(a))* rsqrtps(a)]
* or:
* x1 = rsqrtps(a) * [1.5 - .5 * a * rsqrtps(a) * rsqrtps(a)]
*
*
* See: http://softwarecommunity.intel.com/articles/eng/1818.htm
*/
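/* In scalar terms, the emit_RSQ() sequence below computes (with a
* hypothetical rsqrt_estimate() standing in for the ~12-bit rsqrtss
* result):
*
*    float x = fabsf(a);                    // movaps/mulps/maxps
*    float r = rsqrt_estimate(x);           // rsqrtss
*    r = r * (1.5f + (-0.5f * x) * r * r);  // one Newton-Raphson step
*/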
static boolean emit_RSQ( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
if (0) {
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg r = aos_get_xmm_reg(cp);
sse_rsqrtss(cp->func, r, arg0);
store_scalar_dest(cp, &op->Dst[0], r);
return TRUE;
}
else {
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg r = aos_get_xmm_reg(cp);
struct x86_reg neg_half = get_reg_ptr( cp, AOS_FILE_INTERNAL, IMM_RSQ );
struct x86_reg one_point_five = x86_make_disp( neg_half, 4 );
struct x86_reg src = get_xmm_writable( cp, arg0 );
struct x86_reg neg = aos_get_internal(cp, IMM_NEGS);
struct x86_reg tmp = aos_get_xmm_reg(cp);
sse_movaps(cp->func, tmp, src);
sse_mulps(cp->func, tmp, neg);
sse_maxps(cp->func, tmp, src);
sse_rsqrtss( cp->func, r, tmp ); /* rsqrtss(a) */
sse_mulss( cp->func, tmp, neg_half ); /* -.5 * a */
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r */
sse_mulss( cp->func, tmp, r ); /* -.5 * a * r * r */
sse_addss( cp->func, tmp, one_point_five ); /* 1.5 - .5 * a * r * r */
sse_mulss( cp->func, r, tmp ); /* r * (1.5 - .5 * a * r * r) */
store_scalar_dest(cp, &op->Dst[0], r);
aos_release_xmm_reg(cp, tmp.idx);
return TRUE;
}
}
static boolean emit_SGE( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_NotLessThan);
sse_andps(cp->func, dst, ones);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_SIN( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
x87_fld_src(cp, &op->Src[0], 0);
x87_fsin(cp->func);
x87_fstp_dest4(cp, &op->Dst[0]);
return TRUE;
}
static boolean emit_SLT( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg ones = aos_get_internal(cp, IMM_ONES);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_cmpps(cp->func, dst, arg1, cc_LessThan);
sse_andps(cp->func, dst, ones);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_SUB( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg dst = get_xmm_writable(cp, arg0);
sse_subps(cp->func, dst, arg1);
store_dest(cp, &op->Dst[0], dst);
return TRUE;
}
static boolean emit_TRUNC( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
sse2_cvttps2dq(cp->func, tmp0, arg0);
sse2_cvtdq2ps(cp->func, tmp0, tmp0);
store_dest(cp, &op->Dst[0], tmp0);
return TRUE;
}
static boolean emit_XPD( struct aos_compilation *cp, const struct tgsi_full_instruction *op )
{
struct x86_reg arg0 = fetch_src(cp, &op->Src[0]);
struct x86_reg arg1 = fetch_src(cp, &op->Src[1]);
struct x86_reg tmp0 = aos_get_xmm_reg(cp);
struct x86_reg tmp1 = aos_get_xmm_reg(cp);
emit_pshufd(cp, tmp1, arg1, SHUF(Y, Z, X, W));
sse_mulps(cp->func, tmp1, arg0);
emit_pshufd(cp, tmp0, arg0, SHUF(Y, Z, X, W));
sse_mulps(cp->func, tmp0, arg1);
sse_subps(cp->func, tmp1, tmp0);
sse_shufps(cp->func, tmp1, tmp1, SHUF(Y, Z, X, W));
/* dst[2] = arg0[0] * arg1[1] - arg0[1] * arg1[0]; */
/* dst[0] = arg0[1] * arg1[2] - arg0[2] * arg1[1]; */
/* dst[1] = arg0[2] * arg1[0] - arg0[0] * arg1[2]; */
/* dst[3] is undef */
aos_release_xmm_reg(cp, tmp0.idx);
store_dest(cp, &op->Dst[0], tmp1);
return TRUE;
}
static boolean
emit_instruction( struct aos_compilation *cp,
struct tgsi_full_instruction *inst )
{
x87_assert_stack_empty(cp->func);
switch( inst->Instruction.Opcode ) {
case TGSI_OPCODE_MOV:
return emit_MOV( cp, inst );
case TGSI_OPCODE_LIT:
return emit_LIT(cp, inst);
case TGSI_OPCODE_RCP:
return emit_RCP(cp, inst);
case TGSI_OPCODE_RSQ:
return emit_RSQ(cp, inst);
case TGSI_OPCODE_EXP:
/*return emit_EXP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_LOG:
/*return emit_LOG(cp, inst);*/
return FALSE;
case TGSI_OPCODE_MUL:
return emit_MUL(cp, inst);
case TGSI_OPCODE_ADD:
return emit_ADD(cp, inst);
case TGSI_OPCODE_DP3:
return emit_DP3(cp, inst);
case TGSI_OPCODE_DP4:
return emit_DP4(cp, inst);
case TGSI_OPCODE_DST:
return emit_DST(cp, inst);
case TGSI_OPCODE_MIN:
return emit_MIN(cp, inst);
case TGSI_OPCODE_MAX:
return emit_MAX(cp, inst);
case TGSI_OPCODE_SLT:
return emit_SLT(cp, inst);
case TGSI_OPCODE_SGE:
return emit_SGE(cp, inst);
case TGSI_OPCODE_MAD:
return emit_MAD(cp, inst);
case TGSI_OPCODE_SUB:
return emit_SUB(cp, inst);
case TGSI_OPCODE_LRP:
/*return emit_LERP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_FRC:
return emit_FRC(cp, inst);
case TGSI_OPCODE_CLAMP:
/*return emit_CLAMP(cp, inst);*/
return FALSE;
case TGSI_OPCODE_FLR:
return emit_FLR(cp, inst);
case TGSI_OPCODE_ROUND:
return emit_RND(cp, inst);
case TGSI_OPCODE_EX2:
#if FAST_MATH
return emit_EXPBASE2(cp, inst);
#elif 0
/* this seems to fail for "larger" exponents.
* See glean tvertProg1's EX2 test.
*/
return emit_EX2(cp, inst);
#else
return FALSE;
#endif
case TGSI_OPCODE_LG2:
return emit_LG2(cp, inst);
case TGSI_OPCODE_POW:
return emit_POW(cp, inst);
case TGSI_OPCODE_XPD:
return emit_XPD(cp, inst);
case TGSI_OPCODE_ABS:
return emit_ABS(cp, inst);
case TGSI_OPCODE_DPH:
return emit_DPH(cp, inst);
case TGSI_OPCODE_COS:
return emit_COS(cp, inst);
case TGSI_OPCODE_SIN:
return emit_SIN(cp, inst);
case TGSI_OPCODE_TRUNC:
return emit_TRUNC(cp, inst);
case TGSI_OPCODE_END:
return TRUE;
default:
return FALSE;
}
}
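/* Apply the viewport transform to the position output:
* pos = pos * scale + translate.
*/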
static boolean emit_viewport( struct aos_compilation *cp )
{
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output );
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, translate));
sse_mulps(cp->func, pos, scale);
sse_addps(cp->func, pos, translate);
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
/* This is useful for being able to see the results on softpipe. It
* doesn't do proper clipping, just assumes the backend can do it
* during rasterization -- for debug only...
*/
static boolean emit_rhw_viewport( struct aos_compilation *cp )
{
struct x86_reg tmp = aos_get_xmm_reg(cp);
struct x86_reg pos = aos_get_shader_reg_xmm(cp,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output);
struct x86_reg scale = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, scale));
struct x86_reg translate = x86_make_disp(cp->machine_EDX,
Offset(struct aos_machine, translate));
emit_pshufd(cp, tmp, pos, SHUF(W, W, W, W));
sse2_rcpss(cp->func, tmp, tmp);
sse_shufps(cp->func, tmp, tmp, SHUF(X, X, X, X));
sse_mulps(cp->func, pos, scale);
sse_mulps(cp->func, pos, tmp);
sse_addps(cp->func, pos, translate);
/* Set pos[3] = 1/w
*/
mask_write(cp, pos, tmp, TGSI_WRITEMASK_W);
aos_adopt_xmm_reg( cp,
pos,
TGSI_FILE_OUTPUT,
cp->vaos->draw->vs.position_output,
TRUE );
return TRUE;
}
#if 0
static boolean note_immediate( struct aos_compilation *cp,
struct tgsi_full_immediate *imm )
{
unsigned pos = cp->num_immediates++;
unsigned j;
assert( imm->Immediate.NrTokens <= 4 + 1 );
for (j = 0; j < imm->Immediate.NrTokens - 1; j++) {
cp->vaos->machine->immediate[pos][j] = imm->u[j].Float;
}
return TRUE;
}
#endif
static void find_last_write_outputs( struct aos_compilation *cp )
{
struct tgsi_parse_context parse;
unsigned this_instruction = 0;
unsigned i;
tgsi_parse_init( &parse, cp->vaos->base.vs->state.tokens );
while (!tgsi_parse_end_of_tokens( &parse )) {
tgsi_parse_token( &parse );
if (parse.FullToken.Token.Type != TGSI_TOKEN_TYPE_INSTRUCTION)
continue;
for (i = 0; i < TGSI_FULL_MAX_DST_REGISTERS; i++) {
if (parse.FullToken.FullInstruction.Dst[i].Register.File ==
TGSI_FILE_OUTPUT)
{
unsigned idx = parse.FullToken.FullInstruction.Dst[i].Register.Index;
cp->output_last_write[idx] = this_instruction;
}
}
this_instruction++;
}
tgsi_parse_free( &parse );
}
#define ARG_MACHINE 1
#define ARG_START_ELTS 2
#define ARG_COUNT 3
#define ARG_OUTBUF 4
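/* Argument slots of the generated function, which is called as
* gen_run_{linear,elts}( machine, start_or_elts, count, output_buffer )
* -- see vaos_run_linear() and vaos_run_elts() below.
*/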
static boolean build_vertex_program( struct draw_vs_varient_aos_sse *varient,
boolean linear )
{
struct tgsi_parse_context parse;
struct aos_compilation cp;
unsigned fixup, label;
util_init_math();
tgsi_parse_init( &parse, varient->base.vs->state.tokens );
memset(&cp, 0, sizeof(cp));
cp.insn_counter = 1;
cp.vaos = varient;
cp.have_sse2 = 1;
cp.func = &varient->func[ linear ? 0 : 1 ];
cp.tmp_EAX = x86_make_reg(file_REG32, reg_AX);
cp.idx_EBX = x86_make_reg(file_REG32, reg_BX);
cp.outbuf_ECX = x86_make_reg(file_REG32, reg_CX);
cp.machine_EDX = x86_make_reg(file_REG32, reg_DX);
cp.count_ESI = x86_make_reg(file_REG32, reg_SI);
cp.temp_EBP = x86_make_reg(file_REG32, reg_BP);
cp.stack_ESP = x86_make_reg( file_REG32, reg_SP );
x86_init_func(cp.func);
find_last_write_outputs(&cp);
x86_push(cp.func, cp.idx_EBX);
x86_push(cp.func, cp.count_ESI);
x86_push(cp.func, cp.temp_EBP);
/* Load arguments into regs:
*/
x86_mov(cp.func, cp.machine_EDX, x86_fn_arg(cp.func, ARG_MACHINE));
x86_mov(cp.func, cp.idx_EBX, x86_fn_arg(cp.func, ARG_START_ELTS));
x86_mov(cp.func, cp.count_ESI, x86_fn_arg(cp.func, ARG_COUNT));
x86_mov(cp.func, cp.outbuf_ECX, x86_fn_arg(cp.func, ARG_OUTBUF));
/* Compare count to zero and possibly bail.
*/
x86_xor(cp.func, cp.tmp_EAX, cp.tmp_EAX);
x86_cmp(cp.func, cp.count_ESI, cp.tmp_EAX);
fixup = x86_jcc_forward(cp.func, cc_E);
save_fpu_state( &cp );
set_fpu_round_nearest( &cp );
aos_init_inputs( &cp, linear );
cp.x86_reg[0] = 0;
cp.x86_reg[1] = 0;
/* Note address for loop jump
*/
label = x86_get_label(cp.func);
{
/* Fetch inputs... TODO: fetch lazily...
*/
if (!aos_fetch_inputs( &cp, linear ))
goto fail;
/* Emit the shader:
*/
while( !tgsi_parse_end_of_tokens( &parse ) && !cp.error )
{
tgsi_parse_token( &parse );
switch (parse.FullToken.Token.Type) {
case TGSI_TOKEN_TYPE_IMMEDIATE:
#if 0
if (!note_immediate( &cp, &parse.FullToken.FullImmediate ))
goto fail;
#endif
break;
case TGSI_TOKEN_TYPE_INSTRUCTION:
if (DISASSEM)
tgsi_dump_instruction( &parse.FullToken.FullInstruction, cp.insn_counter );
if (!emit_instruction( &cp, &parse.FullToken.FullInstruction ))
goto fail;
break;
}
x87_assert_stack_empty(cp.func);
cp.insn_counter++;
if (DISASSEM)
debug_printf("\n");
}
{
unsigned i;
for (i = 0; i < 8; i++) {
if (cp.xmm[i].file != TGSI_FILE_OUTPUT) {
cp.xmm[i].file = TGSI_FILE_NULL;
cp.xmm[i].dirty = 0;
}
}
}
if (cp.error)
goto fail;
if (cp.vaos->base.key.clip) {
/* not really handling clipping, just do the rhw so we can
* see the results...
*/
emit_rhw_viewport(&cp);
}
else if (cp.vaos->base.key.viewport) {
emit_viewport(&cp);
}
/* Emit output... TODO: do this eagerly after the last write to a
* given output.
*/
if (!aos_emit_outputs( &cp ))
goto fail;
/* Next vertex:
*/
x86_lea(cp.func,
cp.outbuf_ECX,
x86_make_disp(cp.outbuf_ECX,
cp.vaos->base.key.output_stride));
/* Incr index
*/
aos_incr_inputs( &cp, linear );
}
/* decr count, loop if not zero
*/
x86_dec(cp.func, cp.count_ESI);
x86_jcc(cp.func, cc_NZ, label);
restore_fpu_state(&cp);
/* Land forward jump here:
*/
x86_fixup_fwd_jump(cp.func, fixup);
/* Exit mmx state?
*/
if (cp.func->need_emms)
mmx_emms(cp.func);
x86_pop(cp.func, cp.temp_EBP);
x86_pop(cp.func, cp.count_ESI);
x86_pop(cp.func, cp.idx_EBX);
x87_assert_stack_empty(cp.func);
x86_ret(cp.func);
tgsi_parse_free( &parse );
return !cp.error;
fail:
tgsi_parse_free( &parse );
return FALSE;
}
static void vaos_set_buffer( struct draw_vs_varient *varient,
unsigned buf,
const void *ptr,
unsigned stride )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
if (buf < vaos->nr_vb) {
vaos->buffer[buf].base_ptr = (char *)ptr;
vaos->buffer[buf].stride = stride;
}
if (0) debug_printf("%s %d/%d: %p %d\n", __FUNCTION__, buf, vaos->nr_vb, ptr, stride);
}
static void PIPE_CDECL vaos_run_elts( struct draw_vs_varient *varient,
const unsigned *elts,
unsigned count,
void *output_buffer )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
if (0) debug_printf("%s %d\n", __FUNCTION__, count);
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
machine->buffer = vaos->buffer;
vaos->gen_run_elts( machine,
elts,
count,
output_buffer );
}
static void PIPE_CDECL vaos_run_linear( struct draw_vs_varient *varient,
unsigned start,
unsigned count,
void *output_buffer )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
struct aos_machine *machine = vaos->draw->vs.aos_machine;
if (0) debug_printf("%s %d %d const: %x\n", __FUNCTION__, start, count,
vaos->base.key.const_vbuffers);
machine->internal[IMM_PSIZE][0] = vaos->draw->rasterizer->point_size;
machine->constants = vaos->draw->vs.aligned_constants;
machine->immediates = vaos->base.vs->immediates;
machine->buffer = vaos->buffer;
vaos->gen_run_linear( machine,
start,
count,
output_buffer );
/* Sanity spot checks to make sure we didn't trash our constants */
assert(machine->internal[IMM_ONES][0] == 1.0f);
assert(machine->internal[IMM_IDENTITY][0] == 0.0f);
assert(machine->internal[IMM_NEGS][0] == -1.0f);
}
static void vaos_destroy( struct draw_vs_varient *varient )
{
struct draw_vs_varient_aos_sse *vaos = (struct draw_vs_varient_aos_sse *)varient;
FREE( vaos->buffer );
x86_release_func( &vaos->func[0] );
x86_release_func( &vaos->func[1] );
FREE(vaos);
}
static struct draw_vs_varient *varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
unsigned i;
struct draw_vs_varient_aos_sse *vaos = CALLOC_STRUCT(draw_vs_varient_aos_sse);
if (!vaos)
goto fail;
vaos->base.key = *key;
vaos->base.vs = vs;
vaos->base.set_buffer = vaos_set_buffer;
vaos->base.destroy = vaos_destroy;
vaos->base.run_linear = vaos_run_linear;
vaos->base.run_elts = vaos_run_elts;
vaos->draw = vs->draw;
for (i = 0; i < key->nr_inputs; i++)
vaos->nr_vb = MAX2( vaos->nr_vb, key->element[i].in.buffer + 1 );
vaos->buffer = MALLOC( vaos->nr_vb * sizeof(vaos->buffer[0]) );
if (!vaos->buffer)
goto fail;
if (0)
debug_printf("nr_vb: %d const: %x\n", vaos->nr_vb, vaos->base.key.const_vbuffers);
#if 0
tgsi_dump(vs->state.tokens, 0);
#endif
if (!build_vertex_program( vaos, TRUE ))
goto fail;
if (!build_vertex_program( vaos, FALSE ))
goto fail;
vaos->gen_run_linear = (vaos_run_linear_func)x86_get_func(&vaos->func[0]);
if (!vaos->gen_run_linear)
goto fail;
vaos->gen_run_elts = (vaos_run_elts_func)x86_get_func(&vaos->func[1]);
if (!vaos->gen_run_elts)
goto fail;
return &vaos->base;
fail:
if (vaos) {
FREE(vaos->buffer);
x86_release_func( &vaos->func[0] );
x86_release_func( &vaos->func[1] );
FREE(vaos);
}
return NULL;
}
struct draw_vs_varient *draw_vs_varient_aos_sse( struct draw_vertex_shader *vs,
const struct draw_vs_varient_key *key )
{
struct draw_vs_varient *varient = varient_aos_sse( vs, key );
if (varient == NULL) {
varient = draw_vs_varient_generic( vs, key );
}
return varient;
}
#endif /* PIPE_ARCH_X86 */