diff --git a/src/mesa/main/mtypes.h b/src/mesa/main/mtypes.h
index 5a326ff..ffae6ff 100644
--- a/src/mesa/main/mtypes.h
+++ b/src/mesa/main/mtypes.h
@@ -1914,20 +1914,6 @@
 
 
 /**
- * Virtual vertex program machine state.
- * Only used during program execution (may be moved someday):
- */
-struct gl_vertex_program_machine
-{
-   GLfloat Temporaries[MAX_NV_VERTEX_PROGRAM_TEMPS][4];
-   GLfloat Inputs[MAX_NV_VERTEX_PROGRAM_INPUTS][4];
-   GLuint InputsSize[MAX_NV_VERTEX_PROGRAM_INPUTS];
-   GLfloat Outputs[MAX_NV_VERTEX_PROGRAM_OUTPUTS][4];
-   GLint AddressReg[4];
-};
-
-
-/**
  * Context state for vertex programs.
  */
 struct gl_vertex_program_state
@@ -1943,8 +1929,6 @@
 
    GLfloat Parameters[MAX_NV_VERTEX_PROGRAM_PARAMS][4]; /**< Env params */
 
-   struct gl_vertex_program_machine Machine;
-
    /* For GL_NV_vertex_program only: */
    GLenum TrackMatrix[MAX_NV_VERTEX_PROGRAM_PARAMS / 4];
    GLenum TrackMatrixTransform[MAX_NV_VERTEX_PROGRAM_PARAMS / 4];
diff --git a/src/mesa/shader/nvvertexec.c b/src/mesa/shader/nvvertexec.c
index a4c5728..f00f1e8 100644
--- a/src/mesa/shader/nvvertexec.c
+++ b/src/mesa/shader/nvvertexec.c
@@ -47,23 +47,23 @@
  * per-vertex.
  */
 void
-_mesa_init_vp_per_vertex_registers(GLcontext *ctx)
+_mesa_init_vp_per_vertex_registers(GLcontext *ctx, struct vp_machine *machine)
 {
    /* Input registers get initialized from the current vertex attribs */
-   MEMCPY(ctx->VertexProgram.Machine.Inputs, ctx->Current.Attrib,
+   MEMCPY(machine->Inputs, ctx->Current.Attrib,
           MAX_VERTEX_PROGRAM_ATTRIBS * 4 * sizeof(GLfloat));
 
    if (ctx->VertexProgram.Current->IsNVProgram) {
       GLuint i;
       /* Output/result regs are initialized to [0,0,0,1] */
       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
-         ASSIGN_4V(ctx->VertexProgram.Machine.Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
+         ASSIGN_4V(machine->Outputs[i], 0.0F, 0.0F, 0.0F, 1.0F);
       }
       /* Temp regs are initialized to [0,0,0,0] */
       for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
-         ASSIGN_4V(ctx->VertexProgram.Machine.Temporaries[i], 0.0F, 0.0F, 0.0F, 0.0F);
+         ASSIGN_4V(machine->Temporaries[i], 0.0F, 0.0F, 0.0F, 0.0F);
       }
-      ASSIGN_4V(ctx->VertexProgram.Machine.AddressReg, 0, 0, 0, 0);
+      ASSIGN_4V(machine->AddressReg, 0, 0, 0, 0); /* non-NV: untouched */
    }
 }
 
@@ -139,7 +139,7 @@
             continue;
          }
 
-         /* load the matrix */
+         /* load the matrix values into sequential registers */
          if (ctx->VertexProgram.TrackMatrixTransform[i] == GL_IDENTITY_NV) {
             load_matrix(ctx->VertexProgram.Parameters, i*4, mat->m);
          }
@@ -176,36 +176,37 @@
  * For debugging.  Dump the current vertex program machine registers.
  */
 void
-_mesa_dump_vp_state( const struct gl_vertex_program_state *state )
+_mesa_dump_vp_state( const struct gl_vertex_program_state *state,
+                     const struct vp_machine *machine)
 {
    int i;
    _mesa_printf("VertexIn:\n");
    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_INPUTS; i++) {
       _mesa_printf("%d: %f %f %f %f   ", i,
-                   state->Machine.Inputs[i][0],
-                   state->Machine.Inputs[i][1],
-                   state->Machine.Inputs[i][2],
-                   state->Machine.Inputs[i][3]);
+                   machine->Inputs[i][0],
+                   machine->Inputs[i][1],
+                   machine->Inputs[i][2],
+                   machine->Inputs[i][3]);
    }
    _mesa_printf("\n");
 
    _mesa_printf("VertexOut:\n");
    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_OUTPUTS; i++) {
       _mesa_printf("%d: %f %f %f %f   ", i,
-                  state->Machine.Outputs[i][0],
-                  state->Machine.Outputs[i][1],
-                  state->Machine.Outputs[i][2],
-                  state->Machine.Outputs[i][3]);
+                  machine->Outputs[i][0],
+                  machine->Outputs[i][1],
+                  machine->Outputs[i][2],
+                  machine->Outputs[i][3]);
    }
    _mesa_printf("\n");
 
    _mesa_printf("Registers:\n");
    for (i = 0; i < MAX_NV_VERTEX_PROGRAM_TEMPS; i++) {
       _mesa_printf("%d: %f %f %f %f   ", i,
-                  state->Machine.Temporaries[i][0],
-                  state->Machine.Temporaries[i][1],
-                  state->Machine.Temporaries[i][2],
-                  state->Machine.Temporaries[i][3]);
+                  machine->Temporaries[i][0],
+                  machine->Temporaries[i][1],
+                  machine->Temporaries[i][2],
+                  machine->Temporaries[i][3]);
    }
    _mesa_printf("\n");
 
@@ -227,41 +228,45 @@
  * source register.
  */
 static INLINE const GLfloat *
-get_register_pointer( const struct prog_src_register *source,
-                      const struct gl_vertex_program_state *state )
+get_register_pointer( GLcontext *ctx,
+                      const struct prog_src_register *source,
+                      struct vp_machine *machine,
+                      const struct gl_vertex_program *program )
 {
    if (source->RelAddr) {
-      const GLint reg = source->Index + state->Machine.AddressReg[0];
+      const GLint reg = source->Index + machine->AddressReg[0];
       ASSERT( (source->File == PROGRAM_ENV_PARAM) || 
         (source->File == PROGRAM_STATE_VAR) );
       if (reg < 0 || reg > MAX_NV_VERTEX_PROGRAM_PARAMS)
          return ZeroVec;
       else if (source->File == PROGRAM_ENV_PARAM)
-         return state->Parameters[reg];
-      else
-         return state->Current->Base.Parameters->ParameterValues[reg];
+         return ctx->VertexProgram.Parameters[reg];
+      else {
+         ASSERT(source->File == PROGRAM_LOCAL_PARAM);
+         return program->Base.Parameters->ParameterValues[reg];
+      }
    }
    else {
       switch (source->File) {
          case PROGRAM_TEMPORARY:
             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_TEMPS);
-            return state->Machine.Temporaries[source->Index];
+            return machine->Temporaries[source->Index];
          case PROGRAM_INPUT:
             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_INPUTS);
-            return state->Machine.Inputs[source->Index];
+            return machine->Inputs[source->Index];
          case PROGRAM_OUTPUT:
             /* This is only needed for the PRINT instruction */
             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_OUTPUTS);
-            return state->Machine.Outputs[source->Index];
+            return machine->Outputs[source->Index];
          case PROGRAM_LOCAL_PARAM:
             ASSERT(source->Index < MAX_PROGRAM_LOCAL_PARAMS);
-            return state->Current->Base.LocalParams[source->Index];
+            return program->Base.LocalParams[source->Index];
          case PROGRAM_ENV_PARAM:
             ASSERT(source->Index < MAX_NV_VERTEX_PROGRAM_PARAMS);
-            return state->Parameters[source->Index];
+            return ctx->VertexProgram.Parameters[source->Index];
          case PROGRAM_STATE_VAR:
-            ASSERT(source->Index < state->Current->Base.Parameters->NumParameters);
-            return state->Current->Base.Parameters->ParameterValues[source->Index];
+            ASSERT(source->Index < program->Base.Parameters->NumParameters);
+            return program->Base.Parameters->ParameterValues[source->Index];
          default:
             _mesa_problem(NULL,
                           "Bad source register file in get_register_pointer");
@@ -277,11 +282,13 @@
  * Apply swizzling and negating as needed.
  */
 static INLINE void
-fetch_vector4( const struct prog_src_register *source,
-               const struct gl_vertex_program_state *state,
+fetch_vector4( GLcontext *ctx, 
+               const struct prog_src_register *source,
+               struct vp_machine *machine,
+               const struct gl_vertex_program *program,
                GLfloat result[4] )
 {
-   const GLfloat *src = get_register_pointer(source, state);
+   const GLfloat *src = get_register_pointer(ctx, source, machine, program);
 
    if (source->NegateBase) {
       result[0] = -src[GET_SWZ(source->Swizzle, 0)];
@@ -303,12 +310,13 @@
  * As above, but only return result[0] element.
  */
 static INLINE void
-fetch_vector1( const struct prog_src_register *source,
-               const struct gl_vertex_program_state *state,
+fetch_vector1( GLcontext *ctx,
+               const struct prog_src_register *source,
+               struct vp_machine *machine,
+               const struct gl_vertex_program *program,
                GLfloat result[4] )
 {
-   const GLfloat *src = get_register_pointer(source, state);
-
+   const GLfloat *src = get_register_pointer(ctx, source, machine, program);
    if (source->NegateBase) {
       result[0] = -src[GET_SWZ(source->Swizzle, 0)];
    }
@@ -322,17 +330,18 @@
  * Store 4 floats into a register.
  */
 static void
-store_vector4( const struct prog_dst_register *dest,
-               struct gl_vertex_program_state *state,
+store_vector4( const struct prog_instruction *inst,
+               struct vp_machine *machine,
                const GLfloat value[4] )
 {
+   const struct prog_dst_register *dest = &(inst->DstReg);
    GLfloat *dst;
    switch (dest->File) {
       case PROGRAM_TEMPORARY:
-         dst = state->Machine.Temporaries[dest->Index];
+         dst = machine->Temporaries[dest->Index];
          break;
       case PROGRAM_OUTPUT:
-         dst = state->Machine.Outputs[dest->Index];
+         dst = machine->Outputs[dest->Index];
          break;
       case PROGRAM_ENV_PARAM:
          {
@@ -379,9 +388,10 @@
  * Execute the given vertex program
  */
 void
-_mesa_exec_vertex_program(GLcontext *ctx, const struct gl_vertex_program *program)
+_mesa_exec_vertex_program(GLcontext *ctx,
+                          struct vp_machine *machine,
+                          const struct gl_vertex_program *program)
 {
-   struct gl_vertex_program_state *state = &ctx->VertexProgram;
    const struct prog_instruction *inst;
 
    ctx->_CurrentProgram = GL_VERTEX_PROGRAM_ARB; /* or NV, doesn't matter */
@@ -390,9 +400,9 @@
     * by the MVP matrix and store in the vertex position result register.
     */
    if (ctx->VertexProgram.Current->IsPositionInvariant) {
-      TRANSFORM_POINT( ctx->VertexProgram.Machine.Outputs[VERT_RESULT_HPOS], 
+      TRANSFORM_POINT( machine->Outputs[VERT_RESULT_HPOS], 
                        ctx->_ModelProjectMatrix.m, 
-                       ctx->VertexProgram.Machine.Inputs[VERT_ATTRIB_POS]);
+                       machine->Inputs[VERT_ATTRIB_POS]);
 
       /* XXX: This could go elsewhere */
       ctx->VertexProgram.Current->Base.OutputsWritten |= VERT_BIT_POS;
@@ -411,15 +421,15 @@
          case OPCODE_MOV:
             {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               store_vector4( &inst->DstReg, state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_LIT:
             {
                const GLfloat epsilon = 1.0F / 256.0F; /* per NV spec */
                GLfloat t[4], lit[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = MAX2(t[0], 0.0F);
                t[1] = MAX2(t[1], 0.0F);
                t[3] = CLAMP(t[3], -(128.0F - epsilon), (128.0F - epsilon));
@@ -427,32 +437,32 @@
                lit[1] = t[0];
                lit[2] = (t[0] > 0.0) ? (GLfloat) _mesa_pow(t[1], t[3]) : 0.0F;
                lit[3] = 1.0;
-               store_vector4( &inst->DstReg, state, lit );
+               store_vector4( inst, machine, lit );
             }
             break;
          case OPCODE_RCP:
             {
                GLfloat t[4];
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                if (t[0] != 1.0F)
                   t[0] = 1.0F / t[0];  /* div by zero is infinity! */
                t[1] = t[2] = t[3] = t[0];
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_RSQ:
             {
                GLfloat t[4];
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = INV_SQRTF(FABSF(t[0]));
                t[1] = t[2] = t[3] = t[0];
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_EXP:
             {
                GLfloat t[4], q[4], floor_t0;
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                floor_t0 = FLOORF(t[0]);
                if (floor_t0 > FLT_MAX_EXP) {
                   SET_POS_INFINITY(q[0]);
@@ -475,13 +485,13 @@
                }
                q[1] = t[0] - floor_t0;
                q[3] = 1.0F;
-               store_vector4( &inst->DstReg, state, q );
+               store_vector4( inst, machine, q );
             }
             break;
          case OPCODE_LOG:
             {
                GLfloat t[4], q[4], abs_t0;
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                abs_t0 = FABSF(t[0]);
                if (abs_t0 != 0.0F) {
                   /* Since we really can't handle infinite values on VMS
@@ -512,147 +522,147 @@
                   SET_NEG_INFINITY(q[2]);
                }
                q[3] = 1.0;
-               store_vector4( &inst->DstReg, state, q );
+               store_vector4( inst, machine, q );
             }
             break;
          case OPCODE_MUL:
             {
                GLfloat t[4], u[4], prod[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                prod[0] = t[0] * u[0];
                prod[1] = t[1] * u[1];
                prod[2] = t[2] * u[2];
                prod[3] = t[3] * u[3];
-               store_vector4( &inst->DstReg, state, prod );
+               store_vector4( inst, machine, prod );
             }
             break;
          case OPCODE_ADD:
             {
                GLfloat t[4], u[4], sum[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                sum[0] = t[0] + u[0];
                sum[1] = t[1] + u[1];
                sum[2] = t[2] + u[2];
                sum[3] = t[3] + u[3];
-               store_vector4( &inst->DstReg, state, sum );
+               store_vector4( inst, machine, sum );
             }
             break;
          case OPCODE_DP3:
             {
                GLfloat t[4], u[4], dot[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2];
                dot[1] = dot[2] = dot[3] = dot[0];
-               store_vector4( &inst->DstReg, state, dot );
+               store_vector4( inst, machine, dot );
             }
             break;
          case OPCODE_DP4:
             {
                GLfloat t[4], u[4], dot[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + t[3] * u[3];
                dot[1] = dot[2] = dot[3] = dot[0];
-               store_vector4( &inst->DstReg, state, dot );
+               store_vector4( inst, machine, dot );
             }
             break;
          case OPCODE_DST:
             {
                GLfloat t[4], u[4], dst[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                dst[0] = 1.0F;
                dst[1] = t[1] * u[1];
                dst[2] = t[2];
                dst[3] = u[3];
-               store_vector4( &inst->DstReg, state, dst );
+               store_vector4( inst, machine, dst );
             }
             break;
          case OPCODE_MIN:
             {
                GLfloat t[4], u[4], min[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                min[0] = (t[0] < u[0]) ? t[0] : u[0];
                min[1] = (t[1] < u[1]) ? t[1] : u[1];
                min[2] = (t[2] < u[2]) ? t[2] : u[2];
                min[3] = (t[3] < u[3]) ? t[3] : u[3];
-               store_vector4( &inst->DstReg, state, min );
+               store_vector4( inst, machine, min );
             }
             break;
          case OPCODE_MAX:
             {
                GLfloat t[4], u[4], max[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                max[0] = (t[0] > u[0]) ? t[0] : u[0];
                max[1] = (t[1] > u[1]) ? t[1] : u[1];
                max[2] = (t[2] > u[2]) ? t[2] : u[2];
                max[3] = (t[3] > u[3]) ? t[3] : u[3];
-               store_vector4( &inst->DstReg, state, max );
+               store_vector4( inst, machine, max );
             }
             break;
          case OPCODE_SLT:
             {
                GLfloat t[4], u[4], slt[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                slt[0] = (t[0] < u[0]) ? 1.0F : 0.0F;
                slt[1] = (t[1] < u[1]) ? 1.0F : 0.0F;
                slt[2] = (t[2] < u[2]) ? 1.0F : 0.0F;
                slt[3] = (t[3] < u[3]) ? 1.0F : 0.0F;
-               store_vector4( &inst->DstReg, state, slt );
+               store_vector4( inst, machine, slt );
             }
             break;
          case OPCODE_SGE:
             {
                GLfloat t[4], u[4], sge[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                sge[0] = (t[0] >= u[0]) ? 1.0F : 0.0F;
                sge[1] = (t[1] >= u[1]) ? 1.0F : 0.0F;
                sge[2] = (t[2] >= u[2]) ? 1.0F : 0.0F;
                sge[3] = (t[3] >= u[3]) ? 1.0F : 0.0F;
-               store_vector4( &inst->DstReg, state, sge );
+               store_vector4( inst, machine, sge );
             }
             break;
          case OPCODE_MAD:
             {
                GLfloat t[4], u[4], v[4], sum[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
-               fetch_vector4( &inst->SrcReg[2], state, v );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
+               fetch_vector4( ctx, &inst->SrcReg[2], machine, program, v );
                sum[0] = t[0] * u[0] + v[0];
                sum[1] = t[1] * u[1] + v[1];
                sum[2] = t[2] * u[2] + v[2];
                sum[3] = t[3] * u[3] + v[3];
-               store_vector4( &inst->DstReg, state, sum );
+               store_vector4( inst, machine, sum );
             }
             break;
          case OPCODE_ARL:
             {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               state->Machine.AddressReg[0] = (GLint) FLOORF(t[0]);
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               machine->AddressReg[0] = (GLint) FLOORF(t[0]);
             }
             break;
          case OPCODE_DPH:
             {
                GLfloat t[4], u[4], dot[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                dot[0] = t[0] * u[0] + t[1] * u[1] + t[2] * u[2] + u[3];
                dot[1] = dot[2] = dot[3] = dot[0];
-               store_vector4( &inst->DstReg, state, dot );
+               store_vector4( inst, machine, dot );
             }
             break;
          case OPCODE_RCC:
             {
                GLfloat t[4], u;
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                if (t[0] == 1.0F)
                   u = 1.0F;
                else
@@ -674,115 +684,120 @@
                   }
                }
                t[0] = t[1] = t[2] = t[3] = u;
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_SUB: /* GL_NV_vertex_program1_1 */
             {
                GLfloat t[4], u[4], sum[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                sum[0] = t[0] - u[0];
                sum[1] = t[1] - u[1];
                sum[2] = t[2] - u[2];
                sum[3] = t[3] - u[3];
-               store_vector4( &inst->DstReg, state, sum );
+               store_vector4( inst, machine, sum );
             }
             break;
          case OPCODE_ABS: /* GL_NV_vertex_program1_1 */
             {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
                if (t[0] < 0.0)  t[0] = -t[0];
                if (t[1] < 0.0)  t[1] = -t[1];
                if (t[2] < 0.0)  t[2] = -t[2];
                if (t[3] < 0.0)  t[3] = -t[3];
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_FLR: /* GL_ARB_vertex_program */
             {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = FLOORF(t[0]);
                t[1] = FLOORF(t[1]);
                t[2] = FLOORF(t[2]);
                t[3] = FLOORF(t[3]);
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_FRC: /* GL_ARB_vertex_program */
             {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = t[0] - FLOORF(t[0]);
                t[1] = t[1] - FLOORF(t[1]);
                t[2] = t[2] - FLOORF(t[2]);
                t[3] = t[3] - FLOORF(t[3]);
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_EX2: /* GL_ARB_vertex_program */
             {
                GLfloat t[4];
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(2.0, t[0]);
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_LG2: /* GL_ARB_vertex_program */
             {
                GLfloat t[4];
-               fetch_vector1( &inst->SrcReg[0], state, t );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
                t[0] = t[1] = t[2] = t[3] = LOG2(t[0]);
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_POW: /* GL_ARB_vertex_program */
             {
                GLfloat t[4], u[4];
-               fetch_vector1( &inst->SrcReg[0], state, t );
-               fetch_vector1( &inst->SrcReg[1], state, u );
+               fetch_vector1( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector1( ctx, &inst->SrcReg[1], machine, program, u );
                t[0] = t[1] = t[2] = t[3] = (GLfloat)_mesa_pow(t[0], u[0]);
-               store_vector4( &inst->DstReg, state, t );
+               store_vector4( inst, machine, t );
             }
             break;
          case OPCODE_XPD: /* GL_ARB_vertex_program */
             {
                GLfloat t[4], u[4], cross[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
-               fetch_vector4( &inst->SrcReg[1], state, u );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
+               fetch_vector4( ctx, &inst->SrcReg[1], machine, program, u );
                cross[0] = t[1] * u[2] - t[2] * u[1];
                cross[1] = t[2] * u[0] - t[0] * u[2];
                cross[2] = t[0] * u[1] - t[1] * u[0];
-               store_vector4( &inst->DstReg, state, cross );
+               store_vector4( inst, machine, cross );
             }
             break;
          case OPCODE_SWZ: /* GL_ARB_vertex_program */
             {
                const struct prog_src_register *source = &inst->SrcReg[0];
-               const GLfloat *src = get_register_pointer(source, state);
+               const GLfloat *src = get_register_pointer(ctx, source,
+                                                         machine, program);
                GLfloat result[4];
                GLuint i;
 
                /* do extended swizzling here */
                for (i = 0; i < 4; i++) {
-                  if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ZERO)
+                  const GLuint swz = GET_SWZ(source->Swizzle, i);
+                  if (swz == SWIZZLE_ZERO)
                      result[i] = 0.0;
-                  else if (GET_SWZ(source->Swizzle, i) == SWIZZLE_ONE)
+                  else if (swz == SWIZZLE_ONE)
                      result[i] = 1.0;
-                  else
-                     result[i] = src[GET_SWZ(source->Swizzle, i)];
+                  else {
+                     /* swz is unsigned: only the upper bound can fail */
+                     ASSERT(swz <= 3);
+                     result[i] = src[swz];
+                  }
                   if (source->NegateBase & (1 << i))
                      result[i] = -result[i];
                }
-               store_vector4( &inst->DstReg, state, result );
+               store_vector4( inst, machine, result );
             }
             break;
          case OPCODE_PRINT:
             if (inst->SrcReg[0].File) {
                GLfloat t[4];
-               fetch_vector4( &inst->SrcReg[0], state, t );
+               fetch_vector4( ctx, &inst->SrcReg[0], machine, program, t );
                _mesa_printf("%s%g, %g, %g, %g\n",
                             (char *) inst->Data, t[0], t[1], t[2], t[3]);
             }
@@ -814,10 +829,11 @@
                                 struct gl_vertex_program *vprog,
                                 const GLfloat *params)
 {
-   _mesa_init_vp_per_vertex_registers(ctx);
+   struct vp_machine machine;
+   _mesa_init_vp_per_vertex_registers(ctx, &machine);
    _mesa_init_vp_per_primitive_registers(ctx);
-   COPY_4V(ctx->VertexProgram.Machine.Inputs[VERT_ATTRIB_POS], params);
-   _mesa_exec_vertex_program(ctx, vprog);
+   COPY_4V(machine.Inputs[VERT_ATTRIB_POS], params);
+   _mesa_exec_vertex_program(ctx, &machine, vprog);
 }
 
 
diff --git a/src/mesa/shader/nvvertexec.h b/src/mesa/shader/nvvertexec.h
index e7c5be0..b1cf31b 100644
--- a/src/mesa/shader/nvvertexec.h
+++ b/src/mesa/shader/nvvertexec.h
@@ -28,14 +28,31 @@
 #ifndef NVVERTEXEC_H
 #define NVVERTEXEC_H
 
+
+/**
+ * Register file for one run of the vertex program interpreter.
+ * Callers provide the storage and pass it to the init/exec/dump functions.
+ */
+struct vp_machine
+{
+   GLfloat Temporaries[MAX_NV_VERTEX_PROGRAM_TEMPS][4]; /**< temp regs */
+   GLfloat Inputs[MAX_NV_VERTEX_PROGRAM_INPUTS][4]; /**< vertex attribs in */
+   GLuint InputsSize[MAX_NV_VERTEX_PROGRAM_INPUTS]; /**< unused here? confirm */
+   GLfloat Outputs[MAX_NV_VERTEX_PROGRAM_OUTPUTS][4]; /**< result regs */
+   GLint AddressReg[4]; /**< [0] offsets RelAddr reads; set by ARL */
+};
+
+
+
 extern void
-_mesa_init_vp_per_vertex_registers(GLcontext *ctx);
+_mesa_init_vp_per_vertex_registers(GLcontext *ctx, struct vp_machine *machine);
 
 extern void
 _mesa_init_vp_per_primitive_registers(GLcontext *ctx);
 
 extern void
 _mesa_exec_vertex_program(GLcontext *ctx,
+                          struct vp_machine *machine,
                           const struct gl_vertex_program *program);
 
 extern void
@@ -44,6 +61,7 @@
                                 const GLfloat *params);
 
 extern void
-_mesa_dump_vp_state( const struct gl_vertex_program_state *state );
+_mesa_dump_vp_state( const struct gl_vertex_program_state *state,
+                     const struct vp_machine *machine);
 
 #endif
diff --git a/src/mesa/shader/program.c b/src/mesa/shader/program.c
index 96c1388..52476ed 100644
--- a/src/mesa/shader/program.c
+++ b/src/mesa/shader/program.c
@@ -1,6 +1,6 @@
 /*
  * Mesa 3-D graphics library
- * Version:  6.5.1
+ * Version:  6.5.2
  *
  * Copyright (C) 1999-2006  Brian Paul   All Rights Reserved.
  *
@@ -2108,7 +2108,9 @@
                            "glGetProgramRegisterfvMESA(registerName)");
                return;
             }
-            COPY_4V(v, ctx->VertexProgram.Machine.Temporaries[i]);
+#if 0 /* FIX ME: regs no longer live in ctx; nothing is copied into v here */
+            ctx->Driver.GetVertexProgramRegister(ctx, PROGRAM_TEMPORARY, i, v);
+#endif
          }
          else if (reg[0] == 'v' && reg[1] == '[') {
             /* Vertex Input attribute */
@@ -2119,7 +2121,10 @@
                _mesa_sprintf(number, "%d", i);
                if (_mesa_strncmp(reg + 2, name, 4) == 0 ||
                    _mesa_strncmp(reg + 2, number, _mesa_strlen(number)) == 0) {
-                  COPY_4V(v, ctx->VertexProgram.Machine.Inputs[i]);
+#if 0 /* FIX ME: regs no longer live in ctx; returns without writing v */
+                  ctx->Driver.GetVertexProgramRegister(ctx, PROGRAM_INPUT,
+                                                       i, v);
+#endif
                   return;
                }
             }
diff --git a/src/mesa/tnl/t_vb_program.c b/src/mesa/tnl/t_vb_program.c
index b8828ec..f11ac61 100644
--- a/src/mesa/tnl/t_vb_program.c
+++ b/src/mesa/tnl/t_vb_program.c
@@ -76,6 +76,7 @@
    struct vp_stage_data *store = VP_STAGE_DATA(stage);
    struct vertex_buffer *VB = &tnl->vb;
    struct gl_vertex_program *program = ctx->VertexProgram.Current;
+   struct vp_machine machine;
    GLuint i;
 
    if (ctx->ShaderObjects._VertexShaderPresent)
@@ -91,7 +92,7 @@
    for (i = 0; i < VB->Count; i++) {
       GLuint attr;
 
-      _mesa_init_vp_per_vertex_registers(ctx);
+      _mesa_init_vp_per_vertex_registers(ctx, &machine);
 
 #if 0
       printf("Input  %d: %f, %f, %f, %f\n", i,
@@ -118,30 +119,29 @@
 	    const GLuint size = VB->AttribPtr[attr]->size;
 	    const GLuint stride = VB->AttribPtr[attr]->stride;
 	    const GLfloat *data = (GLfloat *) (ptr + stride * i);
-	    COPY_CLEAN_4V(ctx->VertexProgram.Machine.Inputs[attr], size, data);
+	    COPY_CLEAN_4V(machine.Inputs[attr], size, data);
 	 }
       }
 
       /* execute the program */
       ASSERT(program);
-      _mesa_exec_vertex_program(ctx, program);
+      _mesa_exec_vertex_program(ctx, &machine, program);
 
       /* Fixup fog an point size results if needed */
       if (ctx->Fog.Enabled &&
           (program->Base.OutputsWritten & (1 << VERT_RESULT_FOGC)) == 0) {
-         ctx->VertexProgram.Machine.Outputs[VERT_RESULT_FOGC][0] = 1.0;
+         machine.Outputs[VERT_RESULT_FOGC][0] = 1.0;
       }
 
       if (ctx->VertexProgram.PointSizeEnabled &&
           (program->Base.OutputsWritten & (1 << VERT_RESULT_PSIZ)) == 0) {
-         ctx->VertexProgram.Machine.Outputs[VERT_RESULT_PSIZ][0] = ctx->Point.Size;
+         machine.Outputs[VERT_RESULT_PSIZ][0] = ctx->Point.Size;
       }
 
       /* copy the output registers into the VB->attribs arrays */
       /* XXX (optimize) could use a conditional and smaller loop limit here */
       for (attr = 0; attr < 15; attr++) {
-         COPY_4V(store->attribs[attr].data[i],
-                 ctx->VertexProgram.Machine.Outputs[attr]);
+         COPY_4V(store->attribs[attr].data[i], machine.Outputs[attr]);
       }
    }
 
