Major rework of tnl module
New array_cache module
Support 8 texture units in core mesa (now support 8 everywhere)
Rework core mesa statechange operations to avoid flushing on many
noop statechanges.
diff --git a/src/mesa/tnl/t_pipeline.c b/src/mesa/tnl/t_pipeline.c
index 60e375d..457b160 100644
--- a/src/mesa/tnl/t_pipeline.c
+++ b/src/mesa/tnl/t_pipeline.c
@@ -1,4 +1,4 @@
-/* $Id: t_pipeline.c,v 1.6 2000/11/27 09:05:52 joukj Exp $ */
+/* $Id: t_pipeline.c,v 1.7 2000/12/26 05:09:33 keithw Exp $ */
 
 /*
  * Mesa 3-D graphics library
@@ -22,10 +22,9 @@
  * BRIAN PAUL BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN
  * AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
  * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
- */
-
-/* Dynamic pipelines, support for CVA.
- * Copyright (C) 1999 Keith Whitwell.
+ *
+ * Author:
+ *     Keith Whitwell <keithw@valinux.com>
  */
 
 #include "glheader.h"
@@ -38,435 +37,165 @@
 #include "math/m_translate.h"
 #include "math/m_xform.h"
 
-#include "t_bbox.h"
-#include "t_clip.h"
-#include "t_cva.h"
-#include "t_debug.h"
-#include "t_fog.h"
-#include "t_light.h"
+#include "t_context.h"
 #include "t_pipeline.h"
-#include "t_shade.h"
-#include "t_stages.h"
-#include "t_vbcull.h"
-#include "t_vbindirect.h"
-#include "t_vbrender.h"
-#include "t_vbxform.h"
 
 
-
-
-
-void _tnl_print_pipe_ops( const char *msg, GLuint flags )
+void _tnl_install_pipeline( GLcontext *ctx, 
+			    const struct gl_pipeline_stage **stages )
 {
-   fprintf(stderr,
-	   "%s: (0x%x) %s%s%s%s%s%s%s%s%s\n",
-	   msg,
-	   flags,
-	   (flags & PIPE_OP_CVA_PREPARE)   ? "cva-prepare, " : "",
-	   (flags & PIPE_OP_VERT_XFORM)    ? "vert-xform, " : "",
-	   (flags & PIPE_OP_NORM_XFORM)    ? "norm-xform, " : "",
-	   (flags & PIPE_OP_LIGHT)         ? "light, " : "",
-	   (flags & PIPE_OP_FOG)           ? "fog, " : "",
-	   (flags & PIPE_OP_TEX)           ? "tex-gen/tex-mat, " : "",
-	   (flags & PIPE_OP_RAST_SETUP_0)  ? "rast-0, " : "",
-	   (flags & PIPE_OP_RAST_SETUP_1)  ? "rast-1, " : "",
-	   (flags & PIPE_OP_RENDER)        ? "render, " : "");
-
-}
-
-
-
-/* Have to reset only those parts of the vb which are being recalculated.
- */
-void _tnl_reset_cva_vb( struct vertex_buffer *VB, GLuint stages )
-{
-   GLcontext *ctx = VB->ctx;
    TNLcontext *tnl = TNL_CONTEXT(ctx);
-
-   if (MESA_VERBOSE&VERBOSE_PIPELINE)
-      _tnl_print_pipe_ops( "reset cva vb", stages ); 
-
-   if (stages & PIPE_OP_VERT_XFORM)
-   {
-      if (VB->ClipOrMask & CLIP_USER_BIT)
-	 MEMSET(VB->UserClipMask, 0, VB->Count);
-
-      VB->ClipOrMask = 0;
-      VB->ClipAndMask = CLIP_ALL_BITS;
-      VB->CullMode = 0;
-      VB->CullFlag[0] = VB->CullFlag[1] = 0;
-      VB->Culled = 0;
-   }
-
-   if (stages & PIPE_OP_NORM_XFORM) {
-      VB->NormalPtr = &tnl->CVA.v.Normal;
-   }
-
-   if (stages & PIPE_OP_LIGHT)
-   {
-      VB->ColorPtr = VB->Color[0] = VB->Color[1] = &tnl->CVA.v.Color;
-      VB->IndexPtr = VB->Index[0] = VB->Index[1] = &tnl->CVA.v.Index;
-   }
-   else if (stages & PIPE_OP_FOG)
-   {
-      if (ctx->Light.Enabled) {
-	 VB->Color[0] = VB->LitColor[0];
-	 VB->Color[1] = VB->LitColor[1];
-	 VB->Index[0] = VB->LitIndex[0];
-	 VB->Index[1] = VB->LitIndex[1];
-      } else {
-	 VB->Color[0] = VB->Color[1] = &tnl->CVA.v.Color;
-	 VB->Index[0] = VB->Index[1] = &tnl->CVA.v.Index;
-      }
-      VB->ColorPtr = VB->Color[0];
-      VB->IndexPtr = VB->Index[0];
-   }
-}
-
-
-
-
-
-
-static void pipeline_ctr( struct gl_pipeline *p, GLcontext *ctx, GLuint type )
-{
+   struct gl_pipeline *pipe = &tnl->pipeline;
    GLuint i;
-   (void) ctx;
 
-   p->state_change = 0;
-   p->cva_state_change = 0;
-   p->inputs = 0;
-   p->outputs = 0;
-   p->type = type;
-   p->ops = 0;
+   ASSERT(pipe->nr_stages == 0);
 
-   for (i = 0 ; i < _tnl_default_nr_stages ; i++) 
-      p->state_change |= _tnl_default_pipeline[i].state_change;
-}
+   pipe->run_state_changes = ~0;
+   pipe->run_input_changes = ~0;
+   pipe->build_state_changes = ~0;
+   pipe->build_state_trigger = 0;
+   pipe->inputs = 0;
 
-
-void _tnl_pipeline_init( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-
-   MEMCPY( tnl->PipelineStage, 
-	   _tnl_default_pipeline, 
-	   sizeof(*_tnl_default_pipeline) * _tnl_default_nr_stages );
-   
-   tnl->NrPipelineStages = _tnl_default_nr_stages;
-      
-   pipeline_ctr( &tnl->CVA.elt, ctx, PIPE_IMMEDIATE);
-   pipeline_ctr( &tnl->CVA.pre, ctx, PIPE_PRECALC );
-}
-
-
-
-#define MINIMAL_VERT_DATA (VERT_DATA & ~VERT_EVAL_ANY)
-
-#define VERT_CURRENT_DATA (VERT_TEX_ANY | \
-                           VERT_RGBA | \
-                           VERT_SPEC_RGB | \
-                           VERT_FOG_COORD | \
-			   VERT_INDEX | \
-                           VERT_EDGE | \
-                           VERT_NORM | \
-	                   VERT_MATERIAL)
-
-/* Called prior to every recomputation of the CVA precalc data, except where
- * the driver is able to calculate the pipeline unassisted.
- */
-static void build_full_precalc_pipeline( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline_stage *pipeline = tnl->PipelineStage;
-   struct gl_cva *cva = &tnl->CVA;
-   struct gl_pipeline *pre = &cva->pre;
-   struct gl_pipeline_stage **stages = pre->stages;
-   GLuint i;
-   GLuint newstate = pre->new_state;
-   GLuint changed_ops = 0;
-   GLuint oldoutputs = pre->outputs;
-   GLuint oldinputs = pre->inputs;
-   GLuint fallback = (VERT_CURRENT_DATA &
-		      ~tnl->_ArraySummary);
-   GLuint changed_outputs = (tnl->_ArrayNewState |
-			     (fallback & cva->orflag));
-   GLuint available = fallback | tnl->_ArrayFlags;
-
-   pre->cva_state_change = 0;
-   pre->ops = 0;
-   pre->outputs = 0;
-   pre->inputs = 0;
-   pre->forbidden_inputs = 0;
-   pre->fallback = 0;
-
-   /* KW: Disable data reuse during Mesa reorg.  Make this more readable...
+   /* Create a writeable copy of each stage.
     */
-   newstate = ~0;
-
-   if (tnl->_ArraySummary & VERT_ELT)
-      cva->orflag &= VERT_MATERIAL;
-
-   cva->orflag &= ~(tnl->_ArraySummary & ~VERT_OBJ_ANY);
-   available &= ~cva->orflag;
-
-   pre->outputs = available;
-   pre->inputs = available;
-
-   if (MESA_VERBOSE & VERBOSE_PIPELINE) {
-      fprintf(stderr, ": Rebuild pipeline\n");
-      _tnl_print_vert_flags("orflag", cva->orflag);
+   for (i = 0 ; i < MAX_PIPELINE_STAGES && stages[i] ; i++) {
+      MEMCPY( &pipe->stages[i], stages[i], sizeof( **stages ));
+      pipe->build_state_trigger |= pipe->stages[i].check_state;
    }
 
+   pipe->nr_stages = i;
+}
 
+void _tnl_destroy_pipeline( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   GLuint i;
+
+   for (i = 0 ; i < tnl->pipeline.nr_stages ; i++) 
+      tnl->pipeline.stages[i].destroy( &tnl->pipeline.stages[i] );
+
+   tnl->pipeline.nr_stages = 0;
+}
+
+
+
+void _tnl_validate_pipeline( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct gl_pipeline *pipe = &tnl->pipeline;
+   struct gl_pipeline_stage *stage = pipe->stages;
+   GLuint newstate = pipe->build_state_changes;
+   GLuint generated = 0;
+   GLuint i;
+
+   pipe->inputs = 0;
+   pipe->build_state_changes = 0;
+
+   for (i = 0 ; i < pipe->nr_stages ; i++) {
+      if (stage[i].check_state & newstate) {
+	 stage[i].check(ctx, &stage[i]);
+      }
+
+      if (stage[i].active) {
+	 pipe->inputs |= stage[i].inputs & ~generated;
+	 generated |= stage[i].outputs;
+      } 
+   }
+}
+
+
+
+
+void _tnl_run_pipeline( GLcontext *ctx )
+{
+   TNLcontext *tnl = TNL_CONTEXT(ctx);
+   struct gl_pipeline *pipe = &tnl->pipeline;
+   struct gl_pipeline_stage *stage = pipe->stages;
+   GLuint changed_state = pipe->run_state_changes;
+   GLuint changed_inputs = pipe->run_input_changes;
+   GLboolean running = GL_TRUE;
+   GLuint i;
+
+   unsigned short __tmp;
+
+   /* Done elsewhere.
+    */
+   ASSERT(pipe->build_state_changes == 0);
+
+   START_FAST_MATH(__tmp);
 
    /* If something changes in the pipeline, tag all subsequent stages
-    * using this value for recalcuation.  Also used to build the full
-    * pipeline by setting newstate and newinputs to ~0.
+    * using this value for recalculation.  
     *
-    * Because all intermediate values are buffered, the new inputs
-    * are enough to fully specify what needs to be calculated, and a
-    * single pass identifies all stages requiring recalculation.
+    * Even inactive stages have their state and inputs examined to try
+    * to keep cached data alive over state-changes. 
     */
-   for (i = 0 ; i < tnl->NrPipelineStages ; i++)
-   {
-      pipeline[i].check(ctx, &pipeline[i]);
+   for (i = 0 ; i < pipe->nr_stages ; i++) {
+      
+      stage[i].changed_inputs |= stage[i].inputs & changed_inputs;
 
-      if (pipeline[i].type & PIPE_PRECALC)
-      {
-	 if ((newstate & pipeline[i].cva_state_change) ||
-	     (changed_outputs & pipeline[i].inputs) ||
-	     !pipeline[i].inputs)
-	 {	
-	    changed_ops |= pipeline[i].ops;
-	    changed_outputs |= pipeline[i].outputs;
-	    pipeline[i].active &= ~PIPE_PRECALC;
-
-	    if ((pipeline[i].inputs & ~available) == 0 &&
-		(pipeline[i].ops & pre->ops) == 0)
-	    {
-	       pipeline[i].active |= PIPE_PRECALC;
-	       *stages++ = &pipeline[i];
-	    }
-	 }
-
-	 /* Incompatible with multiple stages structs implementing
-	  * the same stage.
-	  */
-	 available &= ~pipeline[i].outputs;
-	 pre->outputs &= ~pipeline[i].outputs;
-
-	 if (pipeline[i].active & PIPE_PRECALC) {
-	    pre->ops |= pipeline[i].ops;
-	    pre->outputs |= pipeline[i].outputs;
-	    available |= pipeline[i].outputs;
-	    pre->forbidden_inputs |= pipeline[i].pre_forbidden_inputs;
-	 }
-      }
-      else if (pipeline[i].active & PIPE_PRECALC)
-      {
-	 pipeline[i].active &= ~PIPE_PRECALC;
-	 changed_outputs |= pipeline[i].outputs;
-	 changed_ops |= pipeline[i].ops;
-      }
-   }
-
-   *stages = 0;
-
-   pre->new_outputs = pre->outputs & (changed_outputs | ~oldoutputs);
-   pre->new_inputs = pre->inputs & ~oldinputs;
-   pre->fallback = pre->inputs & fallback;
-   pre->forbidden_inputs |= pre->inputs & fallback;
-
-   pre->changed_ops = changed_ops;
-}
-
-void _tnl_build_precalc_pipeline( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline *pre = &tnl->CVA.pre;
-   struct gl_pipeline *elt = &tnl->CVA.elt;
-
-   if (!ctx->Driver.BuildPrecalcPipeline ||
-       !ctx->Driver.BuildPrecalcPipeline( ctx ))
-      build_full_precalc_pipeline( ctx );
-
-   pre->data_valid = 0;
-   pre->pipeline_valid = 1;
-   elt->pipeline_valid = 0;
-
-   tnl->CVA.orflag = 0;
-
-   if (MESA_VERBOSE&VERBOSE_PIPELINE)
-      _tnl_print_pipeline( ctx, pre ); 
-}
-
-
-static void build_full_immediate_pipeline( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline_stage *pipeline = tnl->PipelineStage;
-   struct gl_cva *cva = &tnl->CVA;
-   struct gl_pipeline *pre = &cva->pre;
-   struct gl_pipeline *elt = &cva->elt;
-   struct gl_pipeline_stage **stages = elt->stages;
-   GLuint i;
-   GLuint newstate = elt->new_state;
-   GLuint active_ops = 0;
-   GLuint available = cva->orflag | MINIMAL_VERT_DATA;
-   GLuint generated = 0;
-   GLuint is_elt = 0;
-
-   if (pre->data_valid && tnl->CompileCVAFlag) {
-      is_elt = 1;
-      active_ops = cva->pre.ops;
-      available |= pre->outputs | VERT_PRECALC_DATA;
-   }
-
-
-   elt->outputs = 0;		/* not used */
-   elt->inputs = 0;
-
-   for (i = 0 ; i < tnl->NrPipelineStages ; i++) {
-      pipeline[i].active &= ~PIPE_IMMEDIATE;
-
-      if ((pipeline[i].state_change & newstate) ||
-  	  (pipeline[i].elt_forbidden_inputs & available))
-      {
-	 pipeline[i].check(ctx, &pipeline[i]);
+      if (stage[i].run_state & changed_state) {
+	 stage[i].changed_inputs = stage[i].inputs;
       }
 
-      if ((pipeline[i].type & PIPE_IMMEDIATE) &&
-	  (pipeline[i].ops & active_ops) == 0 &&
-	  (pipeline[i].elt_forbidden_inputs & available) == 0
-	 )
-      {
-	 if (pipeline[i].inputs & ~available)
-	    elt->forbidden_inputs |= pipeline[i].inputs & ~available;
-	 else
-	 {
-	    elt->inputs |= pipeline[i].inputs & ~generated;
-	    elt->forbidden_inputs |= pipeline[i].elt_forbidden_inputs;
-	    pipeline[i].active |= PIPE_IMMEDIATE;
-	    *stages++ = &pipeline[i];
-	    generated |= pipeline[i].outputs;
-	    available |= pipeline[i].outputs;
-	    active_ops |= pipeline[i].ops;
+      if (stage[i].active) {
+	 if (stage[i].changed_inputs)
+	    changed_inputs |= stage[i].outputs;
+
+	 if (running) {
+	    running = stage[i].run( ctx, &stage[i] );
 	 }
       }
    }
+   END_FAST_MATH(__tmp);
 
-   *stages = 0;
-
-   elt->copy_transformed_data = 1;
-   elt->replay_copied_vertices = 0;
-
-   if (is_elt) {
-      cva->merge = elt->inputs & pre->outputs;
-      elt->ops = active_ops & ~pre->ops;
-   }
+   pipe->run_state_changes = 0;
+   pipe->run_input_changes = 0;
 }
 
 
 
-void _tnl_build_immediate_pipeline( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   struct gl_pipeline *elt = &tnl->CVA.elt;
-
-   if (!ctx->Driver.BuildEltPipeline ||
-       !ctx->Driver.BuildEltPipeline( ctx )) {
-      build_full_immediate_pipeline( ctx );
-   }
-
-   elt->pipeline_valid = 1;
-   tnl->CVA.orflag = 0;
-
-   if (MESA_VERBOSE&VERBOSE_PIPELINE)
-      _tnl_print_pipeline( ctx, elt ); 
-}
-
-#define INTERESTED ~0
-
-void _tnl_update_pipelines( GLcontext *ctx )
-{
-   TNLcontext *tnl = TNL_CONTEXT(ctx);
-   GLuint newstate = ctx->NewState;
-   struct gl_cva *cva = &tnl->CVA;
-
-   newstate &= INTERESTED;
-
-   if (MESA_VERBOSE & (VERBOSE_API|VERBOSE_STATE))
-      gl_print_enable_flags("enabled", ctx->_Enabled);
-
-   if (newstate ||
-       cva->lock_changed ||
-       cva->orflag != cva->last_orflag ||
-       tnl->_ArrayFlags != cva->last_array_flags)
-   {
-      GLuint j;
-      GLuint flags = VERT_WIN;
-
-      if (ctx->Visual.RGBAflag) {
-	 flags |= VERT_RGBA;
-	 if (ctx->_TriangleCaps && DD_SEPERATE_SPECULAR)
- 	    flags |= VERT_SPEC_RGB;
-      } else
-	 flags |= VERT_INDEX;
-
-      for (j = 0 ; j < ctx->Const.MaxTextureUnits ; j++) {
-	 if (ctx->Texture.Unit[j]._ReallyEnabled)
-	    flags |= VERT_TEX(j);
-      }
-
-      if (ctx->Polygon._Unfilled)
-	 flags |= VERT_EDGE;
-
-      if (ctx->Fog.FogCoordinateSource == GL_FOG_COORDINATE_EXT)
- 	 flags |= VERT_FOG_COORD;
-
-      if (ctx->RenderMode==GL_FEEDBACK) {
-	 flags = (VERT_WIN | VERT_RGBA | VERT_INDEX | VERT_NORM | 
-		  VERT_EDGE | VERT_TEX_ANY);
-      }
-
-      tnl->_RenderFlags = flags;
-
-      cva->elt.new_state |= newstate;
-      cva->elt.pipeline_valid = 0;
-
-      cva->pre.new_state |= newstate;
-      cva->pre.forbidden_inputs = 0;
-      cva->pre.pipeline_valid = 0;
-      cva->lock_changed = 0;
-   }
-
-   if (tnl->_ArrayNewState != cva->last_array_new_state)
-      cva->pre.pipeline_valid = 0;
-
-   cva->pre.data_valid = 0;
-   cva->last_array_new_state = tnl->_ArrayNewState;
-   cva->last_orflag = cva->orflag;
-   cva->last_array_flags = tnl->_ArrayFlags;
-}
-
-void _tnl_run_pipeline( struct vertex_buffer *VB )
-{
-   struct gl_pipeline *pipe = VB->pipeline;
-   struct gl_pipeline_stage **stages = pipe->stages;
-   unsigned short x;
-
-   pipe->data_valid = 1;	/* optimized stages might want to reset this. */
-
-   if (0) _tnl_print_pipeline( VB->ctx, pipe );
-   
-   START_FAST_MATH(x);
-
-   for ( VB->Culled = 0; *stages && !VB->Culled ; stages++ )
-      (*stages)->run( VB );
-
-   END_FAST_MATH(x);
-
-   pipe->new_state = 0;
-}
+/* The default pipeline.  This is useful for software rasterizers, and
+ * simple hardware rasterizers.  For customization, I don't recommend
+ * tampering with the internals of these stages in the way that
+ * drivers did in Mesa 3.4.  These stages are basically black boxes,
+ * and should be left intact.  
+ *
+ * To customize the pipeline, consider: 
+ *
+ * - removing redundant stages (making sure that the software rasterizer
+ *   can cope with this on fallback paths).  An example is fog
+ *   coordinate generation, which is not required in the FX driver.
+ *
+ * - replacing general-purpose machine-independent stages with
+ *   general-purpose machine-specific stages.  There is no example of
+ *   this to date, though it must be borne in mind that all subsequent
+ *   stages that reference the output of the new stage must cope with
+ *   any machine-specific data introduced.  This may not be easy
+ *   unless there are no such stages (ie the new stage is the last in
+ *   the pipe).
+ *
+ * - inserting optimized (but specialized) stages ahead of the
+ *   general-purpose fallback implementation.  For example, the old
+ *   fastpath mechanism, which only works when the VERT_ELT input is
+ *   available, can be duplicated by placing the fastpath stage at the
+ *   head of this pipeline.  Such specialized stages are currently
+ *   constrained to have no outputs (ie. they must either finish the *
+ *   pipeline by returning GL_FALSE from run(), or do nothing).
+ *
+ * Some work can be done to lift some of the restrictions in the final
+ * case, if it becomes necessary to do so. 
+ */
+const struct gl_pipeline_stage *_tnl_default_pipeline[] = {
+   &_tnl_update_material_stage, 
+   &_tnl_vertex_transform_stage, 
+   &_tnl_normal_transform_stage, 
+   &_tnl_lighting_stage, 
+   &_tnl_fog_coordinate_stage, 
+   &_tnl_texgen_stage, 
+   &_tnl_texture_transform_stage, 
+   &_tnl_point_attenuation_stage, 
+   &_tnl_render_stage,
+   0
+};