/*
* Copyright 2003 Tungsten Graphics, inc.
* All Rights Reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a
* copy of this software and associated documentation files (the "Software"),
* to deal in the Software without restriction, including without limitation
* on the rights to use, copy, modify, merge, publish, distribute, sub
* license, and/or sell copies of the Software, and to permit persons to whom
* the Software is furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice (including the next
* paragraph) shall be included in all copies or substantial portions of the
* Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
* TUNGSTEN GRAPHICS AND/OR THEIR SUPPLIERS BE LIABLE FOR ANY CLAIM,
* DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
* OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
* USE OR OTHER DEALINGS IN THE SOFTWARE.
*
* Authors:
* Keith Whitwell <keithw@tungstengraphics.com>
*/
#include "main/glheader.h"
#include "main/colormac.h"
#include "main/simple_list.h"
#include "main/enums.h"
#include "vf/vf.h"
#if defined(USE_SSE_ASM)
#include "x86/rtasm/x86sse.h"
#include "x86/common_x86_asm.h"
#define X 0
#define Y 1
#define Z 2
#define W 3
struct x86_program {
struct x86_function func;
struct vertex_fetch *vf;
GLboolean inputs_safe;
GLboolean outputs_safe;
GLboolean have_sse2;
struct x86_reg identity;
struct x86_reg chan0;
};
static struct x86_reg get_identity( struct x86_program *p )
{
return p->identity;
}
static void emit_load4f_4( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movups(&p->func, dest, arg0);
}
static void emit_load4f_3( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
/* Have to jump through some hoops:
*
* c 0 0 0
* c 0 0 1
* 0 0 c 1
* a b c 1
*/
sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
sse_shufps(&p->func, dest, dest, SHUF(Y,Z,X,W) );
sse_movlps(&p->func, dest, arg0);
}
static void emit_load4f_2( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
/* Initialize from identity, then pull in low two words:
*/
sse_movups(&p->func, dest, get_identity(p));
sse_movlps(&p->func, dest, arg0);
}
static void emit_load4f_1( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
/* Pull in low word, then swizzle in identity */
sse_movss(&p->func, dest, arg0);
sse_shufps(&p->func, dest, get_identity(p), SHUF(X,Y,Z,W) );
}
static void emit_load3f_3( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
/* Over-reads by 1 dword - potential SEGV if input is a vertex
* array.
*/
if (p->inputs_safe) {
sse_movups(&p->func, dest, arg0);
}
else {
/* c 0 0 0
* c c c c
* a b c c
*/
sse_movss(&p->func, dest, x86_make_disp(arg0, 8));
sse_shufps(&p->func, dest, dest, SHUF(X,X,X,X));
sse_movlps(&p->func, dest, arg0);
}
}
static void emit_load3f_2( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
emit_load4f_2(p, dest, arg0);
}
static void emit_load3f_1( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
emit_load4f_1(p, dest, arg0);
}
static void emit_load2f_2( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movlps(&p->func, dest, arg0);
}
static void emit_load2f_1( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
emit_load4f_1(p, dest, arg0);
}
static void emit_load1f_1( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movss(&p->func, dest, arg0);
}
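/* Dispatch table for the load helpers above, indexed by
* [destination size - 1][source size - 1]. Where the source has fewer
* components than the destination, the missing components are filled
* from the identity vector (0, 0, 0, 1).
*/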
static void (*load[4][4])( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 ) = {
{ emit_load1f_1,
emit_load1f_1,
emit_load1f_1,
emit_load1f_1 },
{ emit_load2f_1,
emit_load2f_2,
emit_load2f_2,
emit_load2f_2 },
{ emit_load3f_1,
emit_load3f_2,
emit_load3f_3,
emit_load3f_3 },
{ emit_load4f_1,
emit_load4f_2,
emit_load4f_3,
emit_load4f_4 }
};
static void emit_load( struct x86_program *p,
struct x86_reg dest,
GLuint sz,
struct x86_reg src,
GLuint src_sz)
{
load[sz-1][src_sz-1](p, dest, src);
}
static void emit_store4f( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movups(&p->func, dest, arg0);
}
static void emit_store3f( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
if (p->outputs_safe) {
/* Emit the extra dword anyway. This may hurt write combining and
* may cause other problems.
*/
sse_movups(&p->func, dest, arg0);
}
else {
/* Alternate strategy - emit two, shuffle, emit one.
*/
sse_movlps(&p->func, dest, arg0);
sse_shufps(&p->func, arg0, arg0, SHUF(Z,Z,Z,Z) ); /* NOTE! destructive */
sse_movss(&p->func, x86_make_disp(dest,8), arg0);
}
}
static void emit_store2f( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movlps(&p->func, dest, arg0);
}
static void emit_store1f( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 )
{
sse_movss(&p->func, dest, arg0);
}
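/* Dispatch table for the store helpers above, indexed by
* [destination size - 1].
*/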
static void (*store[4])( struct x86_program *p,
struct x86_reg dest,
struct x86_reg arg0 ) =
{
emit_store1f,
emit_store2f,
emit_store3f,
emit_store4f
};
static void emit_store( struct x86_program *p,
struct x86_reg dest,
GLuint sz,
struct x86_reg temp )
{
store[sz-1](p, dest, temp);
}
static void emit_pack_store_4ub( struct x86_program *p,
struct x86_reg dest,
struct x86_reg temp )
{
/* Scale by 255.0
*/
sse_mulps(&p->func, temp, p->chan0);
if (p->have_sse2) {
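/* Convert the four floats to dwords, pack down to signed words and
* then to unsigned bytes, and store the low 32 bits.
*/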
sse2_cvtps2dq(&p->func, temp, temp);
sse2_packssdw(&p->func, temp, temp);
sse2_packuswb(&p->func, temp, temp);
sse_movss(&p->func, dest, temp);
}
else {
struct x86_reg mmx0 = x86_make_reg(file_MMX, 0);
struct x86_reg mmx1 = x86_make_reg(file_MMX, 1);
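/* No SSE2: convert two floats at a time through the MMX registers,
* then pack down to bytes and store with movd.
*/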
sse_cvtps2pi(&p->func, mmx0, temp);
sse_movhlps(&p->func, temp, temp);
sse_cvtps2pi(&p->func, mmx1, temp);
mmx_packssdw(&p->func, mmx0, mmx1);
mmx_packuswb(&p->func, mmx0, mmx0);
mmx_movd(&p->func, dest, mmx0);
}
}
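/* Byte offset of member 'b' within the struct pointed to by 'a'.
*/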
static GLint get_offset( const void *a, const void *b )
{
return (const char *)b - (const char *)a;
}
/* Not much happens here. Eventually use this function to try to
* avoid saving/reloading the source pointers for each vertex (if some
* of them can fit in registers).
*/
static void get_src_ptr( struct x86_program *p,
struct x86_reg srcREG,
struct x86_reg vfREG,
struct vf_attr *a )
{
struct vertex_fetch *vf = p->vf;
struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
/* Load current a[j].inputptr
*/
x86_mov(&p->func, srcREG, ptr_to_src);
}
static void update_src_ptr( struct x86_program *p,
struct x86_reg srcREG,
struct x86_reg vfREG,
struct vf_attr *a )
{
if (a->inputstride) {
struct vertex_fetch *vf = p->vf;
struct x86_reg ptr_to_src = x86_make_disp(vfREG, get_offset(vf, &a->inputptr));
/* add a[j].inputstride (hardcoded value - could just as easily
* pull the stride value from memory each time).
*/
x86_lea(&p->func, srcREG, x86_make_disp(srcREG, a->inputstride));
/* save new value of a[j].inputptr
*/
x86_mov(&p->func, ptr_to_src, srcREG);
}
}
/* Lots of hardcoding
*
* EAX -- pointer to the current output vertex
* ECX -- pointer to the current attribute's input data
* EBP -- remaining vertex count
* ESI -- pointer to the vertex_fetch struct
*
*/
static GLboolean build_vertex_emit( struct x86_program *p )
{
struct vertex_fetch *vf = p->vf;
GLuint j = 0;
struct x86_reg vertexEAX = x86_make_reg(file_REG32, reg_AX);
struct x86_reg srcECX = x86_make_reg(file_REG32, reg_CX);
struct x86_reg countEBP = x86_make_reg(file_REG32, reg_BP);
struct x86_reg vfESI = x86_make_reg(file_REG32, reg_SI);
struct x86_reg temp = x86_make_reg(file_XMM, 0);
struct x86_reg vp0 = x86_make_reg(file_XMM, 1);
struct x86_reg vp1 = x86_make_reg(file_XMM, 2);
GLubyte *fixup, *label;
/* Save the callee-saved registers we are about to clobber:
*/
x86_push(&p->func, countEBP);
x86_push(&p->func, vfESI);
/* Get vertex count, compare to zero
*/
x86_xor(&p->func, srcECX, srcECX);
x86_mov(&p->func, countEBP, x86_fn_arg(&p->func, 2));
x86_cmp(&p->func, countEBP, srcECX);
fixup = x86_jcc_forward(&p->func, cc_E);
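/* If the count is zero, jump straight to the epilogue (the target is
* patched below with x86_fixup_fwd_jump).
*/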
/* Initialize destination register.
*/
x86_mov(&p->func, vertexEAX, x86_fn_arg(&p->func, 3));
/* Move argument 1 (vf) into a reg:
*/
x86_mov(&p->func, vfESI, x86_fn_arg(&p->func, 1));
/* Possibly load vp0, vp1 for viewport calcs:
*/
if (vf->allow_viewport_emits) {
sse_movups(&p->func, vp0, x86_make_disp(vfESI, get_offset(vf, &vf->vp[0])));
sse_movups(&p->func, vp1, x86_make_disp(vfESI, get_offset(vf, &vf->vp[4])));
}
/* Always load these, whether needed or not:
*/
sse_movups(&p->func, p->chan0, x86_make_disp(vfESI, get_offset(vf, &vf->chan_scale[0])));
sse_movups(&p->func, p->identity, x86_make_disp(vfESI, get_offset(vf, &vf->identity[0])));
/* Note address for loop jump */
label = x86_get_label(&p->func);
/* Emit code for each of the attributes. Currently routes
* everything through SSE registers, even when it might be more
* efficient to stick with regular old x86. No optimization or
* other tricks - enough new ground to cover here just getting
* things working.
*/
while (j < vf->attr_count) {
struct vf_attr *a = &vf->attr[j];
struct x86_reg dest = x86_make_disp(vertexEAX, a->vertoffset);
/* Now, load an XMM reg from src, perhaps transform, then save.
* Could be short-circuited in specific cases:
*/
switch (a->format) {
case EMIT_1F:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 1, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_2F:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 2, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_3F:
/* Potentially the worst case - hardcode 2+1 copying:
*/
if (0) {
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 3, temp);
update_src_ptr(p, srcECX, vfESI, a);
}
else {
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 2, temp);
if (a->inputsize > 2) {
emit_load(p, temp, 1, x86_make_disp(srcECX, 8), 1);
emit_store(p, x86_make_disp(dest,8), 1, temp);
}
else {
sse_movss(&p->func, x86_make_disp(dest,8), get_identity(p));
}
update_src_ptr(p, srcECX, vfESI, a);
}
break;
case EMIT_4F:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 4, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_2F_VIEWPORT:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 2, x86_deref(srcECX), a->inputsize);
sse_mulps(&p->func, temp, vp0);
sse_addps(&p->func, temp, vp1);
emit_store(p, dest, 2, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_3F_VIEWPORT:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
sse_mulps(&p->func, temp, vp0);
sse_addps(&p->func, temp, vp1);
emit_store(p, dest, 3, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_4F_VIEWPORT:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
sse_mulps(&p->func, temp, vp0);
sse_addps(&p->func, temp, vp1);
emit_store(p, dest, 4, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_3F_XYW:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
sse_shufps(&p->func, temp, temp, SHUF(X,Y,W,Z));
emit_store(p, dest, 3, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_1UB_1F:
/* Test for PAD3 + 1UB:
*/
if (j > 0 &&
a[-1].vertoffset + a[-1].vertattrsize <= a->vertoffset - 3)
{
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 1, x86_deref(srcECX), a->inputsize);
sse_shufps(&p->func, temp, temp, SHUF(X,X,X,X));
emit_pack_store_4ub(p, x86_make_disp(dest, -3), temp); /* overkill! */
update_src_ptr(p, srcECX, vfESI, a);
}
else {
printf("Can't emit 1ub %x %x %d\n", a->vertoffset, a[-1].vertoffset, a[-1].vertattrsize );
return GL_FALSE;
}
break;
case EMIT_3UB_3F_RGB:
case EMIT_3UB_3F_BGR:
/* Test for 3UB + PAD1:
*/
if (j == vf->attr_count - 1 ||
a[1].vertoffset >= a->vertoffset + 4) {
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
if (a->format == EMIT_3UB_3F_BGR)
sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
}
/* Test for 3UB + 1UB:
*/
else if (j < vf->attr_count - 1 &&
a[1].format == EMIT_1UB_1F &&
a[1].vertoffset == a->vertoffset + 3) {
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 3, x86_deref(srcECX), a->inputsize);
update_src_ptr(p, srcECX, vfESI, a);
/* Make room for incoming value:
*/
sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
get_src_ptr(p, srcECX, vfESI, &a[1]);
emit_load(p, temp, 1, x86_deref(srcECX), a[1].inputsize);
update_src_ptr(p, srcECX, vfESI, &a[1]);
/* Rearrange and possibly do BGR conversion:
*/
if (a->format == EMIT_3UB_3F_BGR)
sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
else
sse_shufps(&p->func, temp, temp, SHUF(Y,Z,W,X));
emit_pack_store_4ub(p, dest, temp);
j++; /* NOTE: two attrs consumed */
}
else {
printf("Can't emit 3ub\n");
}
return GL_FALSE; /* this path is disabled for now - add this later */
break;
case EMIT_4UB_4F_RGBA:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_4UB_4F_BGRA:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
sse_shufps(&p->func, temp, temp, SHUF(Z,Y,X,W));
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_4UB_4F_ARGB:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
sse_shufps(&p->func, temp, temp, SHUF(W,X,Y,Z));
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_4UB_4F_ABGR:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
sse_shufps(&p->func, temp, temp, SHUF(W,Z,Y,X));
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case EMIT_4CHAN_4F_RGBA:
switch (CHAN_TYPE) {
case GL_UNSIGNED_BYTE:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
emit_pack_store_4ub(p, dest, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case GL_FLOAT:
get_src_ptr(p, srcECX, vfESI, a);
emit_load(p, temp, 4, x86_deref(srcECX), a->inputsize);
emit_store(p, dest, 4, temp);
update_src_ptr(p, srcECX, vfESI, a);
break;
case GL_UNSIGNED_SHORT:
default:
printf("unknown CHAN_TYPE %s\n", _mesa_lookup_enum_by_nr(CHAN_TYPE));
return GL_FALSE;
}
break;
default:
printf("unknown a[%d].format %d\n", j, a->format);
return GL_FALSE; /* catch any new opcodes */
}
/* Increment j by at least 1 - may have been incremented above also:
*/
j++;
}
/* Next vertex:
*/
x86_lea(&p->func, vertexEAX, x86_make_disp(vertexEAX, vf->vertex_stride));
/* decr count, loop if not zero
*/
x86_dec(&p->func, countEBP);
x86_test(&p->func, countEBP, countEBP);
x86_jcc(&p->func, cc_NZ, label);
/* Exit MMX state if any MMX instructions were emitted:
*/
if (p->func.need_emms)
mmx_emms(&p->func);
/* Land forward jump here:
*/
x86_fixup_fwd_jump(&p->func, fixup);
/* Pop regs and return
*/
x86_pop(&p->func, x86_get_base_reg(vfESI));
x86_pop(&p->func, countEBP);
x86_ret(&p->func);
vf->emit = (vf_emit_func)x86_get_func(&p->func);
return GL_TRUE;
}
void vf_generate_sse_emit( struct vertex_fetch *vf )
{
struct x86_program p;
if (!cpu_has_xmm) {
vf->codegen_emit = NULL;
return;
}
memset(&p, 0, sizeof(p));
p.vf = vf;
p.inputs_safe = 0; /* for now */
p.outputs_safe = 0; /* for now */
p.have_sse2 = cpu_has_xmm2;
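/* xmm6 and xmm7 hold the identity and chan_scale constants for the
* lifetime of the generated function.
*/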
p.identity = x86_make_reg(file_XMM, 6);
p.chan0 = x86_make_reg(file_XMM, 7);
x86_init_func(&p.func);
if (build_vertex_emit(&p)) {
vf_register_fastpath( vf, GL_TRUE );
}
else {
/* Note the failure so that we don't keep trying to codegen an
* impossible state:
*/
vf_register_fastpath( vf, GL_FALSE );
x86_release_func(&p.func);
}
}
#else
void vf_generate_sse_emit( struct vertex_fetch *vf )
{
/* Dummy version for builds where USE_SSE_ASM is not defined. */
}
#endif