merge the (rest of) texmem branch
diff --git a/progs/demos/Makefile b/progs/demos/Makefile
index 43d0f17..feb2abd 100644
--- a/progs/demos/Makefile
+++ b/progs/demos/Makefile
@@ -47,6 +47,7 @@
 	renormal \
 	shadowtex \
 	singlebuffer \
+	streaming_rect \
 	spectex \
 	spriteblast \
 	stex3d \
diff --git a/progs/demos/streaming_rect.c b/progs/demos/streaming_rect.c
new file mode 100644
index 0000000..86e0080
--- /dev/null
+++ b/progs/demos/streaming_rect.c
@@ -0,0 +1,322 @@
+
+/*
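+ * Streaming rectangle-texture demo: each frame the whole image is refilled
+ * and re-uploaded, through a pixel buffer object or from client memory.
+ * Command line options:
+ *    -info      print GL implementation information
+ *    -w N -h N  texture and window size in pixels (default 1024 x 512)
+ *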
+ * Brian Paul  November 1998  This program is in the public domain.
+ * Modified on 12 Feb 2002 for > 2 texture units.
+ */
+
+#define GL_GLEXT_PROTOTYPES
+
+#include <math.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <GL/glut.h>
+
+#include "readtex.h"
+
+
+#define ANIMATE 10   /* menu entry id: toggle Animate */
+#define PBO 11   /* menu entry id: toggle use_pbo */
+#define QUIT 100   /* menu entry id: exit the program */
+
+static GLboolean Animate = GL_TRUE;   /* when set, Idle() advances Drift and requests redraws */
+static GLboolean use_pbo = 1;   /* nonzero: DrawObject() takes the pixel-unpack-buffer path */
+static GLboolean whole_rect = 1;   /* nonzero: quad covers Width x Height, else at most 10 x 10 */
+
+static GLfloat Drift = 0.0;   /* animation phase; (Drift * 255) is the byte the image is filled with */
+static GLfloat drift_increment = 1/255.0;   /* per-frame step of Drift; recomputed in Display() */
+static GLfloat Xrot = 20.0, Yrot = 30.0;   /* adjusted by the arrow keys in SpecialKey() */
+
+static GLuint Width = 1024;   /* texture and initial window width, overridden by -w */
+static GLuint Height = 512;   /* texture and initial window height, overridden by -h */
+
+
+/* GLUT idle callback: while animating, step Drift and request a redraw. */
+static void Idle( void )
+{
+   if (!Animate)
+      return;
+
+   Drift += drift_increment;
+   if (Drift >= 1.0)
+      Drift = 0.0;   /* wrap back to the start of the cycle */
+   glutPostRedisplay();
+}
+
+static int max( int a, int b ) { if (a > b) return a; return b; }  /* larger of a, b */
+static int min( int a, int b ) { if (a < b) return a; return b; }  /* smaller of a, b */
+
+/* Fill a Width x Height BGRA image with one byte value derived from Drift,
+ * upload it to the rectangle texture (through the bound pixel unpack buffer
+ * when use_pbo is set, from client memory otherwise) and draw a textured quad.
+ */
+static void DrawObject(void)
+{
+   const GLint size = Width * Height * 4;
+   const int fill = (unsigned char)(Drift * 255);
+
+   if (use_pbo) {
+      char *image;
+
+      /* XXX: This is extremely important - semantically makes the buffer
+       * contents undefined, but in practice means that the driver can
+       * release the old copy of the texture and allocate a new one
+       * without waiting for outstanding rendering to complete.
+       */
+      glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_EXT, size, NULL, GL_STREAM_DRAW_ARB);
+
+      image = glMapBufferARB(GL_PIXEL_UNPACK_BUFFER_EXT, GL_WRITE_ONLY_ARB);
+      printf("char %d\n", fill);
+
+      /* Mapping can fail and return NULL; skip the fill in that case. */
+      if (image) {
+	 memset(image, fill, size);
+	 glUnmapBufferARB(GL_PIXEL_UNPACK_BUFFER_EXT);
+      }
+
+      /* BGRA is required for most hardware paths: */
+      glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA, Width, Height, 0,
+		   GL_BGRA, GL_UNSIGNED_BYTE, NULL);
+   }
+   else {
+      static char *image = NULL;
+      GLint pbo = 0;
+
+      if (image == NULL)
+	 image = malloc(size);
+      if (image == NULL) {
+	 printf("Error, out of memory\n");
+	 exit(1);
+      }
+      memset(image, fill, size);
+
+      /* While a buffer is bound to the unpack target, the pointer passed to
+       * glTexImage2D is read as an offset into that buffer, so unbind it
+       * around the client-memory upload and restore it afterwards.
+       */
+      glGetIntegerv(GL_PIXEL_UNPACK_BUFFER_BINDING_EXT, &pbo);
+      glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_EXT, 0);
+
+      /* BGRA should be the fast path for regular uploads as well. */
+      glTexImage2D(GL_TEXTURE_RECTANGLE_ARB, 0, GL_RGBA, Width, Height, 0,
+		   GL_BGRA, GL_UNSIGNED_BYTE, image);
+      glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_EXT, pbo);
+   }
+
+   {
+      const int x = 0, y = 0;
+      const int w = whole_rect ? (int) Width : min(10, Width);
+      const int h = whole_rect ? (int) Height : min(10, Height);
+
+      /* NOTE(review): the extra .5 on two of the texture coordinates below
+       * is kept from the original; it looks unintentional - confirm.
+       */
+      glBegin(GL_QUADS);
+      glTexCoord2f( x, y);               glVertex2f( x, y );
+      glTexCoord2f( x, y + h);           glVertex2f( x, y + h);
+      glTexCoord2f( x + w + .5, y + h);  glVertex2f( x + w, y + h );
+      glTexCoord2f( x + w, y + .5);      glVertex2f( x + w, y );
+      glEnd();
+   }
+}
+
+
+
+/* GLUT display callback: redraw, swap, and about once a second print the
+ * frame rate and rescale drift_increment from the measured time per frame.
+ */
+static void Display( void )
+{
+   static GLint T0 = 0;
+   static GLint Frames = 0;
+   GLint now, elapsed;
+
+   glClear( GL_COLOR_BUFFER_BIT );
+   glPushMatrix();
+   DrawObject();
+   glPopMatrix();
+   glutSwapBuffers();
+   Frames++;
+   now = glutGet(GLUT_ELAPSED_TIME);
+   elapsed = now - T0;
+   if (elapsed < 1000)
+      return;
+   {
+      const GLfloat seconds = elapsed / 1000.0;
+      const GLfloat fps = Frames / seconds;
+      printf("%d frames in %6.3f seconds = %6.3f FPS\n", Frames, seconds, fps);
+      drift_increment = 2.2 * seconds / Frames;
+      T0 = now;
+      Frames = 0;
+   }
+}
+
+
+static void Reshape( int width, int height )   /* GLUT reshape callback */
+{
+   glViewport( 0, 0, width, height );
+   glMatrixMode( GL_PROJECTION );
+   glLoadIdentity();
+   /* One GL unit per window pixel; bottom=height, top=0 puts y=0 at the top. */
+   gluOrtho2D( 0, width, height, 0 );
+   glMatrixMode( GL_MODELVIEW );
+   glLoadIdentity();
+   glTranslatef(0.375, 0.375, 0);   /* presumably the usual offset for exact 2D rasterization - confirm */
+}
+
+
+/* Right-button menu callback: apply the selected entry, then request a
+ * redraw.  Entries other than ANIMATE, PBO and QUIT change nothing.
+ */
+static void ModeMenu(int entry)
+{
+   switch (entry) {
+      case ANIMATE:  Animate = !Animate;  break;
+      case PBO:      use_pbo = !use_pbo;  break;
+      case QUIT:     exit(0);             break;
+      default:                            break;
+   }
+
+   glutPostRedisplay();
+}
+
+
+/* Keyboard callback: Escape quits; any other key just requests a redraw. */
+static void Key( unsigned char key, int x, int y )
+{
+   (void) x;
+   (void) y;
+
+   if (key == 27)   /* ASCII Escape */
+      exit(0);
+
+   glutPostRedisplay();
+}
+
+
+/* Arrow-key callback: up/down adjust Xrot and left/right adjust Yrot by a
+ * fixed step, then a redraw is requested.  Nothing in this file reads Xrot
+ * or Yrot when drawing.
+ */
+static void SpecialKey( int key, int x, int y )
+{
+   const float step = 3.0;
+
+   (void) x;
+   (void) y;
+
+   if (key == GLUT_KEY_UP)
+      Xrot += step;
+   else if (key == GLUT_KEY_DOWN)
+      Xrot -= step;
+   else if (key == GLUT_KEY_LEFT)
+      Yrot += step;
+   else if (key == GLUT_KEY_RIGHT)
+      Yrot -= step;
+
+   glutPostRedisplay();
+}
+
+
+/* One-time GL setup: check GL_ARB_multitexture, create the rectangle texture
+ * and the pixel unpack buffer, and print the GL strings on -info. */
+static void Init( int argc, char *argv[] )
+{
+   const char *exten = (const char *) glGetString(GL_EXTENSIONS);
+   GLuint texObj, DrawPBO;
+   GLint maxSize;
+   int i;
+
+   /* glGetString() returns NULL on error; never pass that to strstr(). */
+   if (!exten || !strstr(exten, "GL_ARB_multitexture")) {
+      printf("Sorry, GL_ARB_multitexture not supported by this renderer.\n");
+      exit(1);
+   }
+
+   glGetIntegerv(GL_MAX_TEXTURE_SIZE, &maxSize);
+   printf("%d x %d max texture size\n", maxSize, maxSize);
+   glPixelStorei(GL_UNPACK_ALIGNMENT, 1);
+
+   /* A single rectangle texture on unit 0 with nearest filtering. */
+   glGenTextures(1, &texObj);
+   glActiveTextureARB(GL_TEXTURE0_ARB);
+   glBindTexture(GL_TEXTURE_RECTANGLE_ARB, texObj);
+   glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MIN_FILTER, GL_NEAREST);
+   glTexParameteri(GL_TEXTURE_RECTANGLE_ARB, GL_TEXTURE_MAG_FILTER, GL_NEAREST);
+
+   /* The unpack buffer is left bound when Init() returns. */
+   glGenBuffersARB(1, &DrawPBO);
+   glBindBufferARB(GL_PIXEL_UNPACK_BUFFER_EXT, DrawPBO);
+   glBufferDataARB(GL_PIXEL_UNPACK_BUFFER_EXT,
+		   Width * Height * 4, NULL, GL_STREAM_DRAW);
+   glTexEnvi(GL_TEXTURE_ENV, GL_TEXTURE_ENV_MODE, GL_REPLACE);
+   glEnable(GL_TEXTURE_RECTANGLE_ARB);
+   glShadeModel(GL_SMOOTH);
+   glClearColor(0.3, 0.3, 0.4, 1.0);
+
+   /* -info is honoured at any position, like -w and -h in main(). */
+   for (i = 1; i < argc; i++) {
+      if (strcmp(argv[i], "-info") == 0) {
+         printf("GL_RENDERER   = %s\n", (char *) glGetString(GL_RENDERER));
+         printf("GL_VERSION    = %s\n", (char *) glGetString(GL_VERSION));
+         printf("GL_VENDOR     = %s\n", (char *) glGetString(GL_VENDOR));
+         printf("GL_EXTENSIONS = %s\n", (char *) glGetString(GL_EXTENSIONS));
+         break;
+      }
+   }
+}
+
+
+/* Parse -w/-h, create the window, run Init() and enter the GLUT main loop. */
+int main( int argc, char *argv[] )
+{
+   GLint i;
+
+   glutInit( &argc, argv );
+   /* A missing value is reported like a bad one: argv[argc] is NULL and must
+    * not reach atoi().  The value is checked as a signed int because Width
+    * and Height are unsigned and would hide a negative number. */
+   for (i = 1; i < argc; i++) {
+      if (strcmp(argv[i], "-w") == 0) {
+         const int w = (i + 1 < argc) ? atoi(argv[++i]) : 0;
+         if (w <= 0) {
+            printf("Error, bad width\n");
+            exit(1);
+         }
+         Width = w;
+      }
+      else if (strcmp(argv[i], "-h") == 0) {
+         const int h = (i + 1 < argc) ? atoi(argv[++i]) : 0;
+         if (h <= 0) {
+            printf("Error, bad height\n");
+            exit(1);
+         }
+         Height = h;
+      }
+   }
+
+   glutInitWindowSize( Width, Height );
+   glutInitWindowPosition( 0, 0 );
+   glutInitDisplayMode( GLUT_RGB | GLUT_DOUBLE );
+   glutCreateWindow(argv[0] );
+   Init( argc, argv );
+
+   glutReshapeFunc( Reshape );
+   glutKeyboardFunc( Key );
+   glutSpecialFunc( SpecialKey );
+   glutDisplayFunc( Display );
+   glutIdleFunc( Idle );
+   glutCreateMenu(ModeMenu);
+   glutAddMenuEntry("Toggle Animation", ANIMATE);
+   glutAddMenuEntry("Toggle PBO", PBO);
+   glutAddMenuEntry("Quit", QUIT);
+   glutAttachMenu(GLUT_RIGHT_BUTTON);
+   glutMainLoop();
+   return 0;
+}
diff --git a/progs/demos/texdown.c b/progs/demos/texdown.c
index 79525a0..fc98fdd 100644
--- a/progs/demos/texdown.c
+++ b/progs/demos/texdown.c
@@ -38,8 +38,8 @@
 #include <GL/glut.h>
 
 
-static GLsizei MaxSize = 1024;
-static GLsizei TexWidth = 256, TexHeight = 256, TexBorder = 0;
+static GLsizei MaxSize = 2048;
+static GLsizei TexWidth = 1024, TexHeight = 1024, TexBorder = 0;
 static GLboolean ScaleAndBias = GL_FALSE;
 static GLboolean SubImage = GL_FALSE;
 static GLdouble DownloadRate = 0.0;  /* texels/sec */
@@ -47,6 +47,32 @@
 static GLuint Mode = 0;
 
 
+/* Try to avoid L2 cache effects by cycling through a small number of
+ * textures.
+ *
+ * At the initial size of 1024x1024x4 == 4 MB per image, 8 textures (32 MB
+ * total) would keep us out of most caches; NR_TEXOBJ below is 4 (16 MB).
+ *
+ * This turns into a fairly interesting question of what exactly you
+ * expect to be in cache in normal usage, and what you think should be
+ * outside.  There are no rules for this, and no reason to favour one
+ * usage over another except what the application you care about happens
+ * to resemble most closely.
+ *
+ * - Should the client texture image be in L2 cache?  Has it just been
+ *   generated or read from disk?
+ * - Does the application really use >1 texture, or is it constantly 
+ *   updating one image in-place?
+ *
+ * Different answers will favour different texture upload mechanisms.
+ * To upload an image that is purely outside of cache, a DMA-based
+ * upload will probably win, whereas for small, in-cache textures,
+ * copying looks good.
+ */
+#define NR_TEXOBJ 4
+static GLuint TexObj[NR_TEXOBJ];
+
+
 struct FormatRec {
    GLenum Format;
    GLenum Type;
@@ -116,25 +142,57 @@
    }
 }
 
+/* On x86, there is a performance cliff for memcpy to texture memory
+ * for sources below 64 byte alignment.  We do our best with this in
+ * the driver, but it is better if the images are correctly aligned to
+ * start with:
+ */
+#define ALIGN (1<<12)
+
+/* Round value up to the next multiple of a; a must be a power of two. */
+static unsigned align(unsigned value, unsigned a)
+{
+   const unsigned mask = a - 1;
+   return (value + mask) & ~mask;
+}
+
+/* Return the smaller of a and b. */
+static int MIN2(int a, int b) { return (b > a) ? a : b; }
 
 static void
 MeasureDownloadRate(void)
 {
    const int w = TexWidth + 2 * TexBorder;
    const int h = TexHeight + 2 * TexBorder;
-   const int bytes = w * h * BytesPerTexel(Format);
+   const int image_bytes = align(w * h * BytesPerTexel(Format), ALIGN);
+   const int bytes = image_bytes * NR_TEXOBJ;
+   GLubyte *orig_texImage, *orig_getImage;
    GLubyte *texImage, *getImage;
    GLdouble t0, t1, time;
    int count;
    int i;
+   int offset = 0;
+   GLdouble total = 0;		/* ints will tend to overflow */
 
-   texImage = (GLubyte *) malloc(bytes);
-   getImage = (GLubyte *) malloc(bytes);
-   if (!texImage || !getImage) {
+   printf("allocating %d bytes for %d %dx%d images\n",
+	  bytes, NR_TEXOBJ, w, h);
+
+   orig_texImage = (GLubyte *) malloc(bytes + ALIGN);
+   orig_getImage = (GLubyte *) malloc(image_bytes + ALIGN);
+   if (!orig_texImage || !orig_getImage) {
       DownloadRate = 0.0;
       return;
    }
 
+   printf("alloc %p %p\n", orig_texImage, orig_getImage);
+
+   texImage = (GLubyte *)align((unsigned)orig_texImage, ALIGN);
+   getImage = (GLubyte *)align((unsigned)orig_getImage, ALIGN);   
+
+   for (i = 1; !(((unsigned)texImage) & i); i<<=1)
+      ;
+   printf("texture image alignment: %d bytes (%p)\n", i, texImage);
+      
    for (i = 0; i < bytes; i++) {
       texImage[i] = i & 0xff;
    }
@@ -166,16 +224,50 @@
    count = 0;
    t0 = glutGet(GLUT_ELAPSED_TIME) * 0.001;
    do {
+      int img = count%NR_TEXOBJ;
+      GLubyte *img_ptr = texImage + img * image_bytes;
+
+      glBindTexture(GL_TEXTURE_2D, TexObj[img]);
+
       if (SubImage && count > 0) {
-         glTexSubImage2D(GL_TEXTURE_2D, 0, -TexBorder, -TexBorder, w, h,
+	 /* Only update a portion of the image each iteration.  This
+	  * is presumably why you'd want to use texsubimage, otherwise
+	  * you may as well just call teximage again.
+	  *
+	  * A bigger question is whether to use a pointer that moves
+	  * with each call, ie does the incoming data come from L2
+	  * cache under normal circumstances, or is it pulled from
+	  * uncached memory?  
+	  * 
+	  * There's a good argument to say L2 cache, ie you'd expect
+	  * the data to have been recently generated.  It's possible
+	  * that it could have come from a file read, which may or may
+	  * not have gone through the cpu.
+	  */
+         glTexSubImage2D(GL_TEXTURE_2D, 0, 
+			 -TexBorder, 
+			 -TexBorder + offset * h/8, 
+			 w, 
+			 h/8,
                          FormatTable[Format].Format,
-                         FormatTable[Format].Type, texImage);
+                         FormatTable[Format].Type, 
+#if 1
+			 texImage /* likely in L2$ */
+#else
+			 img_ptr + offset * bytes/8 /* unlikely in L2$ */
+#endif
+	    );
+	 offset += 1;
+	 offset %= 8;
+	 total += w * h / 8;
       }
       else {
          glTexImage2D(GL_TEXTURE_2D, 0,
                       FormatTable[Format].IntFormat, w, h, TexBorder,
                       FormatTable[Format].Format,
-                      FormatTable[Format].Type, texImage);
+                      FormatTable[Format].Type, 
+		      img_ptr);
+	 total += w*h;
       }
 
       /* draw a tiny polygon to force texture into texram */
@@ -192,25 +284,12 @@
 
    glDisable(GL_TEXTURE_2D);
 
-   printf("w*h=%d  count=%d  time=%f\n", w*h, count, time);
-   DownloadRate = w * h * count / time;
+   printf("total texels=%f  time=%f\n", total, time);
+   DownloadRate = total / time;
 
-#if 0
-   if (!ScaleAndBias) {
-      /* verify texture readback */
-      glGetTexImage(GL_TEXTURE_2D, 0,
-                    FormatTable[Format].Format,
-                    FormatTable[Format].Type, getImage);
-      for (i = 0; i < w * h; i++) {
-         if (texImage[i] != getImage[i]) {
-            printf("[%d] %d != %d\n", i, texImage[i], getImage[i]);
-         }
-      }
-   }
-#endif
 
-   free(texImage);
-   free(getImage);
+   free(orig_texImage); 
+   free(orig_getImage); 
 
    {
       GLint err = glGetError();